The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.
More...
template<typename T, int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.
- Template Parameters
-
T | The type of data to be written. |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of consecutive items partitioned onto each thread. |
ALGORITHM | [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. |
WARP_TIME_SLICING | [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage). (default: false) Note: this parameter does not appear in the class's template parameter list shown above. |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
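PTX_ARCH | [optional] The PTX compute capability for which to specialize this collective, formatted as per the __CUDA_ARCH__ macro (e.g., 350 for sm_35). (default: CUB_PTX_ARCH) |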
- Overview
-
- A Simple Example
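- Every thread in the block uses the BlockStore class by first specializing the BlockStore type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.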
- The code snippet below illustrates the storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for BLOCK_STORE_WARP_TRANSPOSE, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
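#include <cub/cub.cuh>
__global__ void ExampleKernel(int *d_data, ...)
{
    // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
    // Allocate shared memory for BlockStore
    __shared__ BlockStore::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    int thread_data[4];
    ...
    // Store items to linear memory
    BlockStore(temp_storage).Store(d_data, thread_data);
}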
- Suppose the set of thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. The output d_data will be 0, 1, 2, 3, 4, 5, ... .
Definition at line 515 of file block_store.cuh.
|
|
__device__ __forceinline__ | BlockStore () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockStore (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage. More...
|
|
|
template<typename OutputIteratorT > |
__device__ __forceinline__ void | Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
| Store items into a linear segment of memory. More...
|
|
template<typename OutputIteratorT > |
__device__ __forceinline__ void | Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
| Store items into a linear segment of memory, guarded by range. More...
|
|
◆ anonymous enum
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
Constants.
Enumerator |
---|
BLOCK_THREADS | The thread block size in threads.
|
Definition at line 523 of file block_store.cuh.
◆ BlockStore()
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
Collective constructor using the specified memory allocation as temporary storage.
- Parameters
-
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 883 of file block_store.cuh.
◆ Store() [1/2]
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store |
( |
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
Store items into a linear segment of memory.
-
- Snippet
- The code snippet below illustrates the storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for BLOCK_STORE_WARP_TRANSPOSE, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
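#include <cub/cub.cuh>
__global__ void ExampleKernel(int *d_data, ...)
{
    // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
    // Allocate shared memory for BlockStore
    __shared__ BlockStore::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    int thread_data[4];
    ...
    // Store items to linear memory
    BlockStore(temp_storage).Store(d_data, thread_data);
}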
- Suppose the set of thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. The output d_data will be 0, 1, 2, 3, 4, 5, ... .
- Parameters
-
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 939 of file block_store.cuh.
◆ Store() [2/2]
template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store |
( |
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
|
inline |
Store items into a linear segment of memory, guarded by range.
-
- Snippet
- The code snippet below illustrates the guarded storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for BLOCK_STORE_WARP_TRANSPOSE, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
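#include <cub/cub.cuh>
__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
    // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
    // Allocate shared memory for BlockStore
    __shared__ BlockStore::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    int thread_data[4];
    ...
    // Store items to linear memory, writing at most valid_items items
    BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
}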
- Suppose the set of thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and valid_items is 5. The output d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with only the first two threads being unmasked to store portions of valid data.
- Parameters
-
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 988 of file block_store.cuh.
The documentation for this class was generated from the following file: