36#include "../util_arch.cuh"
37#include "../util_ptx.cuh"
38#include "../util_macro.cuh"
39#include "../util_type.cuh"
40#include "../util_namespace.cuh"
80 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
83 WARP_THREADS = 1 << LOG_WARP_THREADS,
84 WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
94 T prev[BLOCK_THREADS];
95 T next[BLOCK_THREADS];
126 return private_storage;
194 unsigned int distance = 1)
200 unsigned int offset = threadIdx.x + distance;
201 if (offset >= BLOCK_THREADS)
202 offset -= BLOCK_THREADS;
216 template <
int ITEMS_PER_THREAD>
217 __device__ __forceinline__
void Up(
218 T (&input)[ITEMS_PER_THREAD],
219 T (&prev)[ITEMS_PER_THREAD])
226 for (
int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
227 prev[ITEM] = input[ITEM - 1];
243 template <
int ITEMS_PER_THREAD>
244 __device__ __forceinline__
void Up(
245 T (&input)[ITEMS_PER_THREAD],
246 T (&prev)[ITEMS_PER_THREAD],
262 template <
int ITEMS_PER_THREAD>
263 __device__ __forceinline__
void Down(
264 T (&input)[ITEMS_PER_THREAD],
265 T (&prev)[ITEMS_PER_THREAD])
272 for (
int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
273 prev[ITEM] = input[ITEM - 1];
288 template <
int ITEMS_PER_THREAD>
289 __device__ __forceinline__
void Down(
290 T (&input)[ITEMS_PER_THREAD],
291 T (&prev)[ITEMS_PER_THREAD],
The BlockShuffle class provides collective methods for shuffling data partitioned across a CUDA thread block.
__device__ __forceinline__ void Up(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD], T &block_suffix)
The thread block rotates its blocked arrangement of input items, shifting it up by one item....
__device__ __forceinline__ BlockShuffle()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
__device__ __forceinline__ void Down(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD])
The thread block rotates its blocked arrangement of input items, shifting it down by one item.
__device__ __forceinline__ BlockShuffle(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void Down(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD], T &block_prefix)
The thread block rotates its blocked arrangement of input items, shifting it down by one item....
__device__ __forceinline__ void Rotate(T input, T &output, unsigned int distance=1)
Each thread_i obtains the input provided by thread_(i+distance).
__device__ __forceinline__ void Up(T(&input)[ITEMS_PER_THREAD], T(&prev)[ITEMS_PER_THREAD])
The thread block rotates its blocked arrangement of input items, shifting it up by one item.
_TempStorage & temp_storage
Shared storage reference.
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void Offset(T input, T &output, int distance=1)
Each thread_i obtains the input provided by thread_(i+distance). The offset distance may be negative.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Optional outer namespace(s)
\smemstorage{BlockShuffle}
Shared memory storage layout type (last element from each thread's input)
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_LOG_WARP_THREADS(arch)
Number of threads per warp.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).