The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.
More...
template<typename InputT, int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
class cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.
- Template Parameters
-
InputT | The data type to read into (which must be convertible from the input iterator's value type). |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of consecutive items partitioned onto each thread. |
ALGORITHM | [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. |
WARP_TIME_SLICING | [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) — NOTE(review): this parameter does not appear in the template signature above; it looks retained from an earlier interface — verify against block_load.cuh. |
BLOCK_DIM_Y | [optional] The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | [optional] The thread block length in threads along the Z dimension (default: 1) |
PTX_ARCH | [optional] The PTX compute capability for which to specialize this collective, formatted as per the __CUDA_ARCH__ macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of CUB_PTX_ARCH, i.e., the PTX version of the current compiler pass.) |
- Overview
-
- A Simple Example
- Every thread in the block uses the BlockLoad class by first specializing the BlockLoad type, then instantiating an instance with parameterized temporary storage, and finally invoking one or more collective member functions.
- The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
BLOCK_LOAD_WARP_TRANSPOSE
, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh>
__global__ void ExampleKernel(int *d_data, ...)
{
int thread_data[4];
- Suppose the input
d_data
is 0, 1, 2, 3, 4, 5, ...
. The set of thread_data
across the block of threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }
.
Definition at line 640 of file block_load.cuh.
|
|
__device__ __forceinline__ | BlockLoad () |
| Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
__device__ __forceinline__ | BlockLoad (TempStorage &temp_storage) |
| Collective constructor using the specified memory allocation as temporary storage. More...
|
|
|
template<typename InputIteratorT > |
__device__ __forceinline__ void | Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
| Load a linear segment of items from memory. More...
|
|
template<typename InputIteratorT > |
__device__ __forceinline__ void | Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
| Load a linear segment of items from memory, guarded by range. More...
|
|
template<typename InputIteratorT , typename DefaultT > |
__device__ __forceinline__ void | Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
| Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements. More...
|
|
◆ anonymous enum
template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
Constants.
Enumerator |
---|
BLOCK_THREADS | The thread block size in threads.
|
Definition at line 649 of file block_load.cuh.
◆ BlockLoad()
template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad |
( |
TempStorage & |
temp_storage | ) |
|
|
inline |
Collective constructor using the specified memory allocation as temporary storage.
- Parameters
-
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 1076 of file block_load.cuh.
◆ Load() [1/3]
template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load |
( |
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
|
inline |
Load a linear segment of items from memory.
-
- Snippet
- The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
BLOCK_LOAD_WARP_TRANSPOSE
, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh>
__global__ void ExampleKernel(int *d_data, ...)
{
int thread_data[4];
- Suppose the input
d_data
is 0, 1, 2, 3, 4, 5, ...
. The set of thread_data
across the block of threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }
.
- Parameters
-
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 1130 of file block_load.cuh.
◆ Load() [2/3]
template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load |
( |
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
|
inline |
Load a linear segment of items from memory, guarded by range.
-
- Snippet
- The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
BLOCK_LOAD_WARP_TRANSPOSE
, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh>
__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
int thread_data[4];
- Suppose the input
d_data
is 0, 1, 2, 3, 4, 5, 6...
and valid_items
is 5
. The set of thread_data
across the block of threads will be { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }
, with only the first two threads being unmasked to load portions of valid data (and the remaining items left unassigned).
- Parameters
-
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 1176 of file block_load.cuh.
◆ Load() [3/3]
template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load |
( |
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items, |
|
|
DefaultT |
oob_default |
|
) |
| |
|
inline |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
-
- Snippet
- The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
BLOCK_LOAD_WARP_TRANSPOSE
, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh>
__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
int thread_data[4];
- Suppose the input
d_data
is 0, 1, 2, 3, 4, 5, 6...
, valid_items
is 5
, and the out-of-bounds default is -1
. The set of thread_data
across the block of threads in those threads will be { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }
, with only the first two threads being unmasked to load portions of valid data (and other items are assigned -1
)
- Parameters
-
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 1224 of file block_load.cuh.
The documentation for this class was generated from the following file: