39#include "../iterator/cache_modified_input_iterator.cuh"
40#include "../util_ptx.cuh"
41#include "../util_macro.cuh"
42#include "../util_type.cuh"
43#include "../util_namespace.cuh"
75 typename InputIteratorT>
78 InputIteratorT block_itr,
79 InputT (&items)[ITEMS_PER_THREAD])
81 InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
85 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
87 items[ITEM] = thread_itr[ITEM];
103 int ITEMS_PER_THREAD,
104 typename InputIteratorT>
107 InputIteratorT block_itr,
108 InputT (&items)[ITEMS_PER_THREAD],
111 InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
114 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
116 if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
118 items[ITEM] = thread_itr[ITEM];
136 int ITEMS_PER_THREAD,
137 typename InputIteratorT>
140 InputIteratorT block_itr,
141 InputT (&items)[ITEMS_PER_THREAD],
143 DefaultT oob_default)
146 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
147 items[ITEM] = oob_default;
153#ifndef DOXYGEN_SHOULD_SKIP_THIS
161 int ITEMS_PER_THREAD>
165 T (&items)[ITEMS_PER_THREAD])
172 TOTAL_WORDS =
sizeof(items) /
sizeof(DeviceWord),
174 VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
176 (TOTAL_WORDS % 2 == 0) ?
180 VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
187 Vector vec_items[VECTORS_PER_THREAD];
190 Vector* vec_ptr =
reinterpret_cast<Vector*
>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
194 for (
int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
196 vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
201 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
203 items[ITEM] = *(
reinterpret_cast<T*
>(vec_items) + ITEM);
226 int ITEMS_PER_THREAD>
230 T (&items)[ITEMS_PER_THREAD])
232 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
256 int ITEMS_PER_THREAD,
257 typename InputIteratorT>
260 InputIteratorT block_itr,
261 InputT (&items)[ITEMS_PER_THREAD])
263 InputIteratorT thread_itr = block_itr + linear_tid;
266 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
268 items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
286 int ITEMS_PER_THREAD,
287 typename InputIteratorT>
290 InputIteratorT block_itr,
291 InputT (&items)[ITEMS_PER_THREAD],
294 InputIteratorT thread_itr = block_itr + linear_tid;
297 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
299 if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
301 items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
321 int ITEMS_PER_THREAD,
322 typename InputIteratorT>
325 InputIteratorT block_itr,
326 InputT (&items)[ITEMS_PER_THREAD],
328 DefaultT oob_default)
331 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
332 items[ITEM] = oob_default;
334 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
360 int ITEMS_PER_THREAD,
361 typename InputIteratorT>
364 InputIteratorT block_itr,
365 InputT (&items)[ITEMS_PER_THREAD])
367 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
368 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
369 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
371 InputIteratorT thread_itr = block_itr + warp_offset + tid ;
375 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
377 items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
396 int ITEMS_PER_THREAD,
397 typename InputIteratorT>
400 InputIteratorT block_itr,
401 InputT (&items)[ITEMS_PER_THREAD],
404 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
405 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
406 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
408 InputIteratorT thread_itr = block_itr + warp_offset + tid ;
412 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
414 if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
416 items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
437 int ITEMS_PER_THREAD,
438 typename InputIteratorT>
441 InputIteratorT block_itr,
442 InputT (&items)[ITEMS_PER_THREAD],
444 DefaultT oob_default)
448 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
449 items[ITEM] = oob_default;
635 int ITEMS_PER_THREAD,
661 template <BlockLoadAlgorithm _POLICY,
int DUMMY>
686 template <
typename InputIteratorT>
687 __device__ __forceinline__
void Load(
688 InputIteratorT block_itr,
689 InputT (&items)[ITEMS_PER_THREAD])
695 template <
typename InputIteratorT>
696 __device__ __forceinline__
void Load(
697 InputIteratorT block_itr,
698 InputT (&items)[ITEMS_PER_THREAD],
705 template <
typename InputIteratorT,
typename DefaultT>
706 __device__ __forceinline__
void Load(
707 InputIteratorT block_itr,
708 InputT (&items)[ITEMS_PER_THREAD],
710 DefaultT oob_default)
739 template <
typename InputIteratorT>
740 __device__ __forceinline__
void Load(
742 InputT (&items)[ITEMS_PER_THREAD])
744 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(
linear_tid, block_ptr, items);
748 template <
typename InputIteratorT>
749 __device__ __forceinline__
void Load(
750 const InputT *block_ptr,
751 InputT (&items)[ITEMS_PER_THREAD])
753 InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(
linear_tid, block_ptr, items);
761 __device__ __forceinline__
void Load(
763 InputT (&items)[ITEMS_PER_THREAD])
765 InternalLoadDirectBlockedVectorized<MODIFIER>(
linear_tid, block_itr.
ptr, items);
769 template <
typename _InputIteratorT>
770 __device__ __forceinline__
void Load(
771 _InputIteratorT block_itr,
772 InputT (&items)[ITEMS_PER_THREAD])
778 template <
typename InputIteratorT>
779 __device__ __forceinline__
void Load(
780 InputIteratorT block_itr,
781 InputT (&items)[ITEMS_PER_THREAD],
788 template <
typename InputIteratorT,
typename DefaultT>
789 __device__ __forceinline__
void Load(
790 InputIteratorT block_itr,
791 InputT (&items)[ITEMS_PER_THREAD],
793 DefaultT oob_default)
808 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
833 template <
typename InputIteratorT>
834 __device__ __forceinline__
void Load(
835 InputIteratorT block_itr,
836 InputT (&items)[ITEMS_PER_THREAD])
838 LoadDirectStriped<BLOCK_THREADS>(
linear_tid, block_itr, items);
839 BlockExchange(
temp_storage).StripedToBlocked(items, items);
843 template <
typename InputIteratorT>
844 __device__ __forceinline__
void Load(
845 InputIteratorT block_itr,
846 InputT (&items)[ITEMS_PER_THREAD],
849 LoadDirectStriped<BLOCK_THREADS>(
linear_tid, block_itr, items, valid_items);
850 BlockExchange(
temp_storage).StripedToBlocked(items, items);
854 template <
typename InputIteratorT,
typename DefaultT>
855 __device__ __forceinline__
void Load(
856 InputIteratorT block_itr,
857 InputT (&items)[ITEMS_PER_THREAD],
859 DefaultT oob_default)
861 LoadDirectStriped<BLOCK_THREADS>(
linear_tid, block_itr, items, valid_items, oob_default);
862 BlockExchange(
temp_storage).StripedToBlocked(items, items);
876 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
883 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
908 template <
typename InputIteratorT>
909 __device__ __forceinline__
void Load(
910 InputIteratorT block_itr,
911 InputT (&items)[ITEMS_PER_THREAD])
914 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
918 template <
typename InputIteratorT>
919 __device__ __forceinline__
void Load(
920 InputIteratorT block_itr,
921 InputT (&items)[ITEMS_PER_THREAD],
925 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
930 template <
typename InputIteratorT,
typename DefaultT>
931 __device__ __forceinline__
void Load(
932 InputIteratorT block_itr,
933 InputT (&items)[ITEMS_PER_THREAD],
935 DefaultT oob_default)
938 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
951 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
958 typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
983 template <
typename InputIteratorT>
984 __device__ __forceinline__
void Load(
985 InputIteratorT block_itr,
986 InputT (&items)[ITEMS_PER_THREAD])
989 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
993 template <
typename InputIteratorT>
994 __device__ __forceinline__
void Load(
995 InputIteratorT block_itr,
996 InputT (&items)[ITEMS_PER_THREAD],
1000 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
1005 template <
typename InputIteratorT,
typename DefaultT>
1006 __device__ __forceinline__
void Load(
1007 InputIteratorT block_itr,
1008 InputT (&items)[ITEMS_PER_THREAD],
1010 DefaultT oob_default)
1013 BlockExchange(
temp_storage).WarpStripedToBlocked(items, items);
1038 return private_storage;
1129 template <
typename InputIteratorT>
1130 __device__ __forceinline__
void Load(
1131 InputIteratorT block_itr,
1132 InputT (&items)[ITEMS_PER_THREAD])
1175 template <
typename InputIteratorT>
1176 __device__ __forceinline__
void Load(
1177 InputIteratorT block_itr,
1178 InputT (&items)[ITEMS_PER_THREAD],
1223 template <
typename InputIteratorT,
typename DefaultT>
1224 __device__ __forceinline__
void Load(
1225 InputIteratorT block_itr,
1226 InputT (&items)[ITEMS_PER_THREAD],
1228 DefaultT oob_default)
Sparse Matrix implementation stub object when OpenFPM is compiled with no linear algebra support.
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
int linear_tid
Linear thread-id.
__device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a blocked arrangement across the thread block.
_TempStorage & temp_storage
Thread reference to shared storage.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range (skips vectorization)
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
_TempStorage & temp_storage
Thread reference to shared storage.
_TempStorage & temp_storage
Thread reference to shared storage.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
_TempStorage & temp_storage
Thread reference to shared storage.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
__device__ __forceinline__ void Load(CacheModifiedInputIterator< MODIFIER, ValueType, OffsetT > block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorizat...
int linear_tid
Linear thread-id.
__device__ __forceinline__ LoadInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
__device__ __forceinline__ LoadInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
__device__ __forceinline__ LoadInternal(TempStorage &, int linear_tid)
Constructor.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
__device__ __forceinline__ BlockLoad()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
NullType TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a striped arrangement across the thread block.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
__device__ __forceinline__ LoadInternal(TempStorage &, int linear_tid)
Constructor.
__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a warp-striped arrangement across the thread block.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
BlockLoadAlgorithm
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment...
__device__ __forceinline__ void Load(_InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorizati...
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a blocked arrangement across the thread block.
__device__ __forceinline__ void Load(const InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorizat...
int linear_tid
Linear thread-id.
__device__ __forceinline__ LoadInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
__device__ __forceinline__ void Load(InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorizat...
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
LoadInternal< ALGORITHM, 0 > InternalLoad
Internal load implementation to use.
int linear_tid
Linear thread-id.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
InternalLoad::TempStorage _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ BlockLoad(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
NullType TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
int linear_tid
Linear thread-id.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
int linear_tid
Linear thread-id.
@ BLOCK_LOAD_WARP_TRANSPOSE
@ BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
@ BLOCK_THREADS
The thread block size in threads.
#define CUB_STATIC_ASSERT(cond, msg)
Static assert.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
OffsetT OffsetT
[in] Total number of input data items
\smemstorage{BlockExchange}
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
A simple "NULL" marker type.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...