39 #include "../iterator/cache_modified_input_iterator.cuh"
40 #include "../util_ptx.cuh"
41 #include "../util_macro.cuh"
42 #include "../util_type.cuh"
43 #include "../util_namespace.cuh"
75 typename InputIteratorT>
78 InputIteratorT block_itr,
79 InputT (&items)[ITEMS_PER_THREAD])
81 InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
85 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
87 items[ITEM] = thread_itr[ITEM];
103 int ITEMS_PER_THREAD,
104 typename InputIteratorT>
107 InputIteratorT block_itr,
108 InputT (&items)[ITEMS_PER_THREAD],
111 InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
114 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
116 if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
118 items[ITEM] = thread_itr[ITEM];
136 int ITEMS_PER_THREAD,
137 typename InputIteratorT>
140 InputIteratorT block_itr,
141 InputT (&items)[ITEMS_PER_THREAD],
143 DefaultT oob_default)
146 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
147 items[ITEM] = oob_default;
153 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
161 int ITEMS_PER_THREAD>
165 T (&items)[ITEMS_PER_THREAD])
172 TOTAL_WORDS =
sizeof(items) /
sizeof(DeviceWord),
174 VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
176 (TOTAL_WORDS % 2 == 0) ?
180 VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
187 Vector vec_items[VECTORS_PER_THREAD];
190 Vector* vec_ptr =
reinterpret_cast<Vector*
>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
194 for (
int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
196 vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
201 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
203 items[ITEM] = *(
reinterpret_cast<T*
>(vec_items) + ITEM);
207 #endif // DOXYGEN_SHOULD_SKIP_THIS
226 int ITEMS_PER_THREAD>
230 T (&items)[ITEMS_PER_THREAD])
232 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
256 int ITEMS_PER_THREAD,
257 typename InputIteratorT>
260 InputIteratorT block_itr,
261 InputT (&items)[ITEMS_PER_THREAD])
263 InputIteratorT thread_itr = block_itr + linear_tid;
266 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
268 items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
286 int ITEMS_PER_THREAD,
287 typename InputIteratorT>
290 InputIteratorT block_itr,
291 InputT (&items)[ITEMS_PER_THREAD],
294 InputIteratorT thread_itr = block_itr + linear_tid;
297 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
299 if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
301 items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
321 int ITEMS_PER_THREAD,
322 typename InputIteratorT>
325 InputIteratorT block_itr,
326 InputT (&items)[ITEMS_PER_THREAD],
328 DefaultT oob_default)
331 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
332 items[ITEM] = oob_default;
334 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
360 int ITEMS_PER_THREAD,
361 typename InputIteratorT>
364 InputIteratorT block_itr,
365 InputT (&items)[ITEMS_PER_THREAD])
367 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
368 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
369 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
371 InputIteratorT thread_itr = block_itr + warp_offset + tid ;
375 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
377 items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
396 int ITEMS_PER_THREAD,
397 typename InputIteratorT>
400 InputIteratorT block_itr,
401 InputT (&items)[ITEMS_PER_THREAD],
404 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
405 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
406 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
408 InputIteratorT thread_itr = block_itr + warp_offset + tid ;
412 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
414 if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
416 items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
437 int ITEMS_PER_THREAD,
438 typename InputIteratorT>
441 InputIteratorT block_itr,
442 InputT (&items)[ITEMS_PER_THREAD],
444 DefaultT oob_default)
448 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
449 items[ITEM] = oob_default;
635 int ITEMS_PER_THREAD,
661 template <BlockLoadAlgorithm _POLICY,
int DUMMY>
686 template <
typename InputIteratorT>
687 __device__ __forceinline__
void Load(
688 InputIteratorT block_itr,
689 InputT (&items)[ITEMS_PER_THREAD])
695 template <
typename InputIteratorT>
696 __device__ __forceinline__
void Load(
697 InputIteratorT block_itr,
698 InputT (&items)[ITEMS_PER_THREAD],
705 template <
typename InputIteratorT,
typename DefaultT>
706 __device__ __forceinline__
void Load(
707 InputIteratorT block_itr,
708 InputT (&items)[ITEMS_PER_THREAD],
710 DefaultT oob_default)
739 template <
typename InputIteratorT>
740 __device__ __forceinline__
void Load(
742 InputT (&items)[ITEMS_PER_THREAD])
744 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(
linear_tid, block_ptr, items);
748 template <
typename InputIteratorT>
749 __device__ __forceinline__
void Load(
750 const InputT *block_ptr,
751 InputT (&items)[ITEMS_PER_THREAD])
753 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(
linear_tid, block_ptr, items);
761 __device__ __forceinline__
void Load(
763 InputT (&items)[ITEMS_PER_THREAD])
765 InternalLoadDirectBlockedVectorized<MODIFIER>(
linear_tid, block_itr.
ptr, items);
769 template <
typename _InputIteratorT>
770 __device__ __forceinline__
void Load(
771 _InputIteratorT block_itr,
772 InputT (&items)[ITEMS_PER_THREAD])
778 template <
typename InputIteratorT>
779 __device__ __forceinline__
void Load(
780 InputIteratorT block_itr,
781 InputT (&items)[ITEMS_PER_THREAD],
788 template <
typename InputIteratorT,
typename DefaultT>
789 __device__ __forceinline__
void Load(
790 InputIteratorT block_itr,
791 InputT (&items)[ITEMS_PER_THREAD],
793 DefaultT oob_default)
808 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
833 template <
typename InputIteratorT>
834 __device__ __forceinline__
void Load(
835 InputIteratorT block_itr,
836 InputT (&items)[ITEMS_PER_THREAD])
838 LoadDirectStriped<BLOCK_THREADS>(
linear_tid, block_itr, items);
839 BlockExchange(
temp_storage).StripedToBlocked(items, items);
843 template <
typename InputIteratorT>
844 __device__ __forceinline__
void Load(
845 InputIteratorT block_itr,
846 InputT (&items)[ITEMS_PER_THREAD],
849 LoadDirectStriped<BLOCK_THREADS>(
linear_tid, block_itr, items, valid_items);
850 BlockExchange(
temp_storage).StripedToBlocked(items, items);
854 template <
typename InputIteratorT,
typename DefaultT>
855 __device__ __forceinline__
void Load(
856 InputIteratorT block_itr,
857 InputT (&items)[ITEMS_PER_THREAD],
859 DefaultT oob_default)
861 LoadDirectStriped<BLOCK_THREADS>(
linear_tid, block_itr, items, valid_items, oob_default);
862 BlockExchange(
temp_storage).StripedToBlocked(items, items);
876 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
883 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
908 template <
typename InputIteratorT>
909 __device__ __forceinline__
void Load(
910 InputIteratorT block_itr,
911 InputT (&items)[ITEMS_PER_THREAD])
914 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
918 template <
typename InputIteratorT>
919 __device__ __forceinline__
void Load(
920 InputIteratorT block_itr,
921 InputT (&items)[ITEMS_PER_THREAD],
925 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
930 template <
typename InputIteratorT,
typename DefaultT>
931 __device__ __forceinline__
void Load(
932 InputIteratorT block_itr,
933 InputT (&items)[ITEMS_PER_THREAD],
935 DefaultT oob_default)
938 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
951 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
958 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
983 template <
typename InputIteratorT>
984 __device__ __forceinline__
void Load(
985 InputIteratorT block_itr,
986 InputT (&items)[ITEMS_PER_THREAD])
989 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
993 template <
typename InputIteratorT>
994 __device__ __forceinline__
void Load(
995 InputIteratorT block_itr,
996 InputT (&items)[ITEMS_PER_THREAD],
1000 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
1005 template <
typename InputIteratorT,
typename DefaultT>
1006 __device__ __forceinline__
void Load(
1007 InputIteratorT block_itr,
1008 InputT (&items)[ITEMS_PER_THREAD],
1010 DefaultT oob_default)
1013 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
1038 return private_storage;
1129 template <
typename InputIteratorT>
1130 __device__ __forceinline__
void Load(
1131 InputIteratorT block_itr,
1132 InputT (&items)[ITEMS_PER_THREAD])
1175 template <
typename InputIteratorT>
1176 __device__ __forceinline__
void Load(
1177 InputIteratorT block_itr,
1178 InputT (&items)[ITEMS_PER_THREAD],
1223 template <
typename InputIteratorT,
typename DefaultT>
1224 __device__ __forceinline__
void Load(
1225 InputIteratorT block_itr,
1226 InputT (&items)[ITEMS_PER_THREAD],
1228 DefaultT oob_default)