36 #include "../util_arch.cuh" 37 #include "../util_ptx.cuh" 38 #include "../util_macro.cuh" 39 #include "../util_type.cuh" 40 #include "../util_namespace.cuh" 80 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
83 WARP_THREADS = 1 << LOG_WARP_THREADS,
84 WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
94 T prev[BLOCK_THREADS];
95 T next[BLOCK_THREADS];
126 return private_storage;
194 unsigned int distance = 1)
200 unsigned int offset = threadIdx.x + distance;
201 if (offset >= BLOCK_THREADS)
202 offset -= BLOCK_THREADS;
216 template <
int ITEMS_PER_THREAD>
217 __device__ __forceinline__
void Up(
218 T (&input)[ITEMS_PER_THREAD],
219 T (&prev)[ITEMS_PER_THREAD])
226 for (
int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
227 prev[ITEM] = input[ITEM - 1];
243 template <
int ITEMS_PER_THREAD>
244 __device__ __forceinline__
void Up(
245 T (&input)[ITEMS_PER_THREAD],
246 T (&prev)[ITEMS_PER_THREAD],
262 template <
int ITEMS_PER_THREAD>
263 __device__ __forceinline__
void Down(
264 T (&input)[ITEMS_PER_THREAD],
265 T (&prev)[ITEMS_PER_THREAD])
272 for (
int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
273 prev[ITEM] = input[ITEM - 1];
288 template <
int ITEMS_PER_THREAD>
289 __device__ __forceinline__
void Down(
290 T (&input)[ITEMS_PER_THREAD],
291 T (&prev)[ITEMS_PER_THREAD],
__device__ __forceinline__ void Up(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD])
The thread block rotates its blocked arrangement of input items, shifting it up by one item.
Shared memory storage layout type (last element from each thread's input)
Optional outer namespace(s)
\smemstorage{BlockShuffle}
__device__ __forceinline__ void Down(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD], T &block_prefix)
The thread block rotates its blocked arrangement of input items, shifting it down by one item, additionally returning the block-wide prefix item in block_prefix.
__device__ __forceinline__ BlockShuffle(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
#define CUB_LOG_WARP_THREADS(arch)
Number of threads per warp.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
The BlockShuffle class provides collective methods for shuffling data partitioned across a CUDA thread block.
__device__ __forceinline__ void Down(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD])
The thread block rotates its blocked arrangement of input items, shifting it down by one item.
__device__ __forceinline__ void Rotate(T input, T &output, unsigned int distance=1)
Each thread i obtains the input provided by thread (i + distance) % BLOCK_THREADS.
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void Up(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD], T &block_suffix)
The thread block rotates its blocked arrangement of input items, shifting it up by one item, additionally returning the block-wide suffix item in block_suffix.
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
__device__ __forceinline__ BlockShuffle()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void Offset(T input, T &output, int distance=1)
Each thread i obtains the input provided by thread i + distance. The offset distance may be negative.