39#include "../util_ptx.cuh"
40#include "../util_macro.cuh"
41#include "../util_type.cuh"
42#include "../util_namespace.cuh"
/**
 * Store a blocked arrangement of items across a thread block into a linear
 * segment of memory.
 *
 * Thread \p linear_tid writes its \p ITEMS_PER_THREAD items to the contiguous
 * output range [linear_tid * ITEMS_PER_THREAD, (linear_tid + 1) * ITEMS_PER_THREAD).
 *
 * \param linear_tid    Linear thread-id within the thread block
 * \param block_itr     Random-access output iterator to the start of the block's segment
 * \param items         Per-thread register-resident data to store
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    // Offset this thread's iterator to the start of its contiguous range
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[ITEM] = items[ITEM];
    }
}
/**
 * Store a blocked arrangement of items across a thread block into a linear
 * segment of memory, guarded by range.
 *
 * Only elements whose global position (linear_tid * ITEMS_PER_THREAD + ITEM)
 * falls below \p valid_items are written; out-of-range items are skipped.
 *
 * \param linear_tid    Linear thread-id within the thread block
 * \param block_itr     Random-access output iterator to the start of the block's segment
 * \param items         Per-thread register-resident data to store
 * \param valid_items   Number of valid items in the final (partially-full) tile
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    // Offset this thread's iterator to the start of its contiguous range
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order, guarding each write by range
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
        {
            thread_itr[ITEM] = items[ITEM];
        }
    }
}
/**
 * Store a blocked arrangement of items across a thread block into a linear
 * segment of memory, specialized for native pointer types (attempts
 * vectorization).
 *
 * Items are repacked into CUDA vector types (via CubVector, which exposes a
 * member typedef \p Type naming the corresponding CUDA vector type when one
 * exists) so each thread can issue wider stores. Vectorization is only used
 * when the vector width is a power of two that evenly divides
 * ITEMS_PER_THREAD; otherwise the effective vector size degrades to 1.
 *
 * NOTE(review): assumes \p block_ptr is suitably aligned for the chosen
 * vector type — callers must guarantee this for vectorized stores.
 *
 * \param linear_tid    Linear thread-id within the thread block
 * \param block_ptr     Native pointer to the start of the block's output segment
 * \param items         Per-thread register-resident data to store
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int                 linear_tid,
    T                   *block_ptr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type corresponding to T at the chosen width
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias the global pointer as a pointer to vectors
    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

    // Alias per-thread items through a raw vector array (a plain array here
    // should be optimized away, preventing conservative local-memory spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy items into the vector-aliased staging array
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}
/**
 * Store a striped arrangement of data across the thread block into a linear
 * segment of items.
 *
 * Item \p ITEM of thread \p linear_tid is written to output position
 * (ITEM * BLOCK_THREADS) + linear_tid, so consecutive threads write
 * consecutive output elements on each iteration.
 *
 * \tparam BLOCK_THREADS    The thread block size in threads
 * \param linear_tid        Linear thread-id within the thread block
 * \param block_itr         Random-access output iterator to the start of the block's segment
 * \param items             Per-thread register-resident data to store
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    // Each thread starts at its own lane within the stripe
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
    }
}
/**
 * Store a striped arrangement of data across the thread block into a linear
 * segment of items, guarded by range.
 *
 * Item \p ITEM of thread \p linear_tid targets output position
 * (ITEM * BLOCK_THREADS) + linear_tid; the write is skipped when that
 * position is not below \p valid_items.
 *
 * \tparam BLOCK_THREADS    The thread block size in threads
 * \param linear_tid        Linear thread-id within the thread block
 * \param block_itr         Random-access output iterator to the start of the block's segment
 * \param items             Per-thread register-resident data to store
 * \param valid_items       Number of valid items in the final (partially-full) tile
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    // Each thread starts at its own lane within the stripe
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order, guarding each write by range
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
        {
            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
        }
    }
}
/**
 * Store a warp-striped arrangement of data across the thread block into a
 * linear segment of items.
 *
 * The thread block is decomposed into warps; each warp owns a contiguous
 * output region of CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD elements, inside
 * which item \p ITEM of lane \p tid goes to position
 * (ITEM * CUB_PTX_WARP_THREADS) + tid.
 *
 * \param linear_tid    Linear thread-id within the thread block
 * \param block_itr     Random-access output iterator to the start of the block's segment
 * \param items         Per-thread register-resident data to store
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);      // lane within warp
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;      // warp index
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Offset this lane's iterator into its warp's output region
    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}
/**
 * Store a warp-striped arrangement of data across the thread block into a
 * linear segment of items, guarded by range.
 *
 * As the unguarded variant, but each write is skipped when its global output
 * position warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) is not below
 * \p valid_items.
 *
 * \param linear_tid    Linear thread-id within the thread block
 * \param block_itr     Random-access output iterator to the start of the block's segment
 * \param items         Per-thread register-resident data to store
 * \param valid_items   Number of valid items in the final (partially-full) tile
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);      // lane within warp
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;      // warp index
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Offset this lane's iterator into its warp's output region
    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order, guarding each write by range
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
        }
    }
}
510 int ITEMS_PER_THREAD,
535 template <BlockStoreAlgorithm _POLICY,
int DUMMY>
560 template <
typename OutputIteratorT>
561 __device__ __forceinline__
void Store(
562 OutputIteratorT block_itr,
563 T (&items)[ITEMS_PER_THREAD])
569 template <
typename OutputIteratorT>
570 __device__ __forceinline__
void Store(
571 OutputIteratorT block_itr,
572 T (&items)[ITEMS_PER_THREAD],
601 __device__ __forceinline__
void Store(
603 T (&items)[ITEMS_PER_THREAD])
609 template <
typename OutputIteratorT>
610 __device__ __forceinline__
void Store(
611 OutputIteratorT block_itr,
612 T (&items)[ITEMS_PER_THREAD])
618 template <
typename OutputIteratorT>
619 __device__ __forceinline__
void Store(
620 OutputIteratorT block_itr,
621 T (&items)[ITEMS_PER_THREAD],
636 typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
664 template <
typename OutputIteratorT>
665 __device__ __forceinline__
void Store(
666 OutputIteratorT block_itr,
667 T (&items)[ITEMS_PER_THREAD])
670 StoreDirectStriped<BLOCK_THREADS>(
linear_tid, block_itr, items);
674 template <
typename OutputIteratorT>
675 __device__ __forceinline__
void Store(
676 OutputIteratorT block_itr,
677 T (&items)[ITEMS_PER_THREAD],
697 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
704 typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
732 template <
typename OutputIteratorT>
733 __device__ __forceinline__
void Store(
734 OutputIteratorT block_itr,
735 T (&items)[ITEMS_PER_THREAD])
737 BlockExchange(
temp_storage).BlockedToWarpStriped(items);
742 template <
typename OutputIteratorT>
743 __device__ __forceinline__
void Store(
744 OutputIteratorT block_itr,
745 T (&items)[ITEMS_PER_THREAD],
748 BlockExchange(
temp_storage).BlockedToWarpStriped(items);
765 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
772 typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
800 template <
typename OutputIteratorT>
801 __device__ __forceinline__
void Store(
802 OutputIteratorT block_itr,
803 T (&items)[ITEMS_PER_THREAD])
805 BlockExchange(
temp_storage).BlockedToWarpStriped(items);
810 template <
typename OutputIteratorT>
811 __device__ __forceinline__
void Store(
812 OutputIteratorT block_itr,
813 T (&items)[ITEMS_PER_THREAD],
816 BlockExchange(
temp_storage).BlockedToWarpStriped(items);
844 return private_storage;
938 template <
typename OutputIteratorT>
939 __device__ __forceinline__
void Store(
940 OutputIteratorT block_itr,
941 T (&items)[ITEMS_PER_THREAD])
987 template <
typename OutputIteratorT>
988 __device__ __forceinline__
void Store(
989 OutputIteratorT block_itr,
990 T (&items)[ITEMS_PER_THREAD],
Sparse Matrix implementation stub object when OpenFPM is compiled with no linear algebra support.
The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
Store items into a linear segment of memory, guarded by range.
NullType TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
Store items into a linear segment of memory, guarded by range.
_TempStorage & temp_storage
Thread reference to shared storage.
__device__ __forceinline__ StoreInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
__device__ __forceinline__ void Store(T *block_ptr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization).
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
Store items into a linear segment of memory, guarded by range.
_TempStorage & temp_storage
Thread reference to shared storage.
volatile int valid_items
Temporary storage for partially-full block guard.
__device__ __forceinline__ StoreInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
__device__ __forceinline__ BlockStore(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
int linear_tid
Linear thread-id.
__device__ __forceinline__ void StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store a blocked arrangement of items across a thread block into a linear segment of items.
int linear_tid
Linear thread-id.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
Store items into a linear segment of memory, guarded by range.
_TempStorage & temp_storage
Thread reference to shared storage.
BlockStoreAlgorithm
cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory.
NullType TempStorage
Shared memory storage layout type.
InternalStore::TempStorage _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory.
__device__ __forceinline__ void StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store a warp-striped arrangement of data across the thread block into a linear segment of items.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization).
_TempStorage & temp_storage
Thread reference to shared storage.
__device__ __forceinline__ StoreInternal(TempStorage &, int linear_tid)
Constructor.
int linear_tid
Linear thread-id.
StoreInternal< ALGORITHM, 0 > InternalStore
Internal store implementation to use.
volatile int valid_items
Temporary storage for partially-full block guard.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void StoreDirectStriped(int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store a striped arrangement of data across the thread block into a linear segment of items.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
Store items into a linear segment of memory, guarded by range.
__device__ __forceinline__ BlockStore()
Collective constructor using a private static allocation of shared memory as temporary storage.
volatile int valid_items
Temporary storage for partially-full block guard.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory.
int linear_tid
Linear thread-id.
int linear_tid
Linear thread-id.
__device__ __forceinline__ void StoreDirectBlockedVectorized(int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
Store a blocked arrangement of items across a thread block into a linear segment of items.
__device__ __forceinline__ StoreInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
Store items into a linear segment of memory.
int linear_tid
Linear thread-id.
__device__ __forceinline__ StoreInternal(TempStorage &, int linear_tid)
Constructor.
__device__ __forceinline__ void Store(OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
Store items into a linear segment of memory, guarded by range.
@ BLOCK_THREADS
The thread block size in threads.
@ BLOCK_STORE_WARP_TRANSPOSE
@ BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
#define CUB_MIN(a, b)
Select minimum(a, b)
#define CUB_STATIC_ASSERT(cond, msg)
Static assert.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
\smemstorage{BlockExchange}
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
A simple "NULL" marker type.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...