Data Structures | |
class | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > |
The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block. More... | |
class | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > |
The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory. More... | |
struct | cub::IterateThreadLoad< COUNT, MAX > |
Helper structure for templated load iteration (inductive case) More... | |
struct | cub::IterateThreadLoad< MAX, MAX > |
Helper structure for templated load iteration (termination case) More... | |
struct | cub::IterateThreadStore< COUNT, MAX > |
Helper structure for templated store iteration (inductive case) More... | |
struct | cub::IterateThreadStore< MAX, MAX > |
Helper structure for templated store iteration (termination case) More... | |
Macros | |
#define | _CUB_LOAD_16(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_8(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_4(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_2(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_1(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_ALL(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_16(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_8(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_4(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_2(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_1(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_ALL(cub_modifier, ptx_modifier) |
Enumerations | |
enum | cub::CacheLoadModifier { cub::LOAD_DEFAULT, cub::LOAD_CA, cub::LOAD_CG, cub::LOAD_CS, cub::LOAD_CV, cub::LOAD_LDG, cub::LOAD_VOLATILE } |
Enumeration of cache modifiers for memory load operations. More... | |
enum | cub::CacheStoreModifier { cub::STORE_DEFAULT, cub::STORE_WB, cub::STORE_CG, cub::STORE_CS, cub::STORE_WT, cub::STORE_VOLATILE } |
Enumeration of cache modifiers for memory store operations. More... | |
Functions | |
template<typename InputIteratorT > | |
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type | cub::ThreadLoad (InputIteratorT itr, Int2Type< LOAD_DEFAULT >, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoad (T *ptr, Int2Type< LOAD_DEFAULT >, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoad (T *ptr, Int2Type< LOAD_VOLATILE >, Int2Type< true >) |
template<typename T , int MODIFIER> | |
__device__ __forceinline__ T | cub::ThreadLoad (T const *ptr, Int2Type< MODIFIER >, Int2Type< true >) |
template<typename OutputIteratorT , typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (OutputIteratorT itr, T val, Int2Type< STORE_DEFAULT >, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (T *ptr, T val, Int2Type< STORE_DEFAULT >, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (T *ptr, T val, Int2Type< STORE_VOLATILE >, Int2Type< true >) |
template<typename T , int MODIFIER> | |
__device__ __forceinline__ void | cub::ThreadStore (T *ptr, T val, Int2Type< MODIFIER >, Int2Type< true >) |
Blocked arrangement I/O (direct) | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a blocked arrangement across the thread block. More... | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. More... | |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More... | |
template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::InternalLoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
template<typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a blocked arrangement across the thread block. More... | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store a blocked arrangement of items across a thread block into a linear segment of items. More... | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range. More... | |
template<typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::StoreDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
Store a blocked arrangement of items across a thread block into a linear segment of items. More... | |
Striped arrangement I/O (direct) | |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a striped arrangement across the thread block. More... | |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range. More... | |
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More... | |
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store a striped arrangement of data across the thread block into a linear segment of items. More... | |
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range. More... | |
Warp-striped arrangement I/O (direct) | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a warp-striped arrangement across the thread block. More... | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range. More... | |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More... | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items. More... | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range. More... | |
Thread I/O (cache modified) | |
template<CacheLoadModifier MODIFIER, typename InputIteratorT > | |
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type | cub::ThreadLoad (InputIteratorT itr) |
Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. More... | |
template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (OutputIteratorT itr, T val) |
Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. More... | |
#define _CUB_LOAD_1 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
Definition at line 247 of file thread_load.cuh.
#define _CUB_LOAD_16 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
Definition at line 154 of file thread_load.cuh.
#define _CUB_LOAD_2 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
Definition at line 232 of file thread_load.cuh.
#define _CUB_LOAD_4 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
Definition at line 217 of file thread_load.cuh.
#define _CUB_LOAD_8 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
Definition at line 181 of file thread_load.cuh.
#define _CUB_LOAD_ALL | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define powers-of-two ThreadLoad specializations for the given Cache load modifier
Definition at line 267 of file thread_load.cuh.
#define _CUB_STORE_1 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned char (1B) ThreadStore specialization for the given cache store modifier
Definition at line 238 of file thread_store.cuh.
#define _CUB_STORE_16 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint4 (16B) ThreadStore specialization for the given cache store modifier
Definition at line 158 of file thread_store.cuh.
#define _CUB_STORE_2 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned short (2B) ThreadStore specialization for the given cache store modifier
Definition at line 225 of file thread_store.cuh.
#define _CUB_STORE_4 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned int (4B) ThreadStore specialization for the given cache store modifier
Definition at line 212 of file thread_store.cuh.
#define _CUB_STORE_8 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint2 (8B) ThreadStore specialization for the given cache store modifier
Definition at line 182 of file thread_store.cuh.
#define _CUB_STORE_ALL | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define powers-of-two ThreadStore specializations for the given cache store modifier
Definition at line 255 of file thread_store.cuh.
Enumeration of cache modifiers for memory load operations.
Definition at line 62 of file thread_load.cuh.
Enumeration of cache modifiers for memory store operations.
Definition at line 61 of file thread_store.cuh.
__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized | ( | int | linear_tid, |
T * | block_ptr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Internal implementation for load vectorization
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Input pointer for loading from |
[out] | items | Data to load |
Definition at line 162 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlocked | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a blocked arrangement across the thread block.
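Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |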
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 76 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlocked | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
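Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |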
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 105 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlocked | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items, | ||
DefaultT | oob_default | ||
) |
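Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |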
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 138 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized | ( | int | linear_tid, |
T * | block_ptr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a blocked arrangement across the thread block.
Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
The input offset (block_ptr + block_offset) must be quad-item aligned.
The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
- ITEMS_PER_THREAD is odd
- T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Input pointer for loading from |
[out] | items | Data to load |
Definition at line 227 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a striped arrangement across the thread block.
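Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
BLOCK_THREADS | The thread block size in threads |
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |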
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 258 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
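Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
BLOCK_THREADS | The thread block size in threads |
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |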
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 288 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items, | ||
DefaultT | oob_default | ||
) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
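Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
BLOCK_THREADS | The thread block size in threads |
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |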
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 323 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectWarpStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a warp-striped arrangement across the thread block.
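Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across the thread block, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |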
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 362 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectWarpStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
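Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across the thread block, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |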
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 398 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectWarpStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items, | ||
DefaultT | oob_default | ||
) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
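Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across the thread block, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |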
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 439 of file block_load.cuh.
__device__ __forceinline__ void cub::StoreDirectBlocked | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a blocked arrangement of items across a thread block into a linear segment of items.
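Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |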
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 74 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectBlocked | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.
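Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |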
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 103 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized | ( | int | linear_tid, |
T * | block_ptr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a blocked arrangement of items across a thread block into a linear segment of items.
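Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
The output offset (block_ptr + block_offset) must be quad-item aligned, which is the default starting offset returned by cudaMalloc().
The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
- ITEMS_PER_THREAD is odd
- T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Output pointer for storing to |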
[in] | items | Data to store |
Definition at line 143 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a striped arrangement of data across the thread block into a linear segment of items.
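Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |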
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 206 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.
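Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |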
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 237 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectWarpStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items.
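Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across the thread block, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |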
Definition at line 281 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectWarpStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.
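Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across the thread block, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |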
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 317 of file block_store.cuh.
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad | ( | InputIteratorT | itr | ) |
Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.
MODIFIER | [inferred] CacheLoadModifier enumeration |
InputIteratorT | [inferred] Input iterator type (may be a simple pointer type) |
ThreadLoad definition for generic modifiers
Definition at line 420 of file thread_load.cuh.
__device__ __forceinline__ std::iterator_traits<InputIteratorT>::value_type cub::ThreadLoad | ( | InputIteratorT | itr, |
Int2Type< LOAD_DEFAULT > | , | ||
Int2Type< false > | |||
) |
Define powers-of-two ThreadLoad specializations for the various cache load modifiers. ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
Definition at line 312 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoad | ( | T * | ptr, |
Int2Type< LOAD_DEFAULT > | , | ||
Int2Type< true > | |||
) |
ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
Definition at line 325 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoad | ( | T * | ptr, |
Int2Type< LOAD_VOLATILE > | , | ||
Int2Type< true > | |||
) |
ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
Definition at line 381 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoad | ( | T const * | ptr, |
Int2Type< MODIFIER > | , | ||
Int2Type< true > | |||
) |
ThreadLoad definition for generic modifiers on pointer types
Definition at line 395 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer | ( | T * | ptr, |
Int2Type< true > | |||
) |
ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
Definition at line 338 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer | ( | T * | ptr, |
Int2Type< false > | |||
) |
ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
Definition at line 351 of file thread_load.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | OutputIteratorT | itr, |
T | val | ||
) |
Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.
MODIFIER | [inferred] CacheStoreModifier enumeration |
OutputIteratorT | [inferred] Output iterator type (may be a simple pointer type) |
T | [inferred] Data type of output value |
ThreadStore definition for generic modifiers
Definition at line 404 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | OutputIteratorT | itr, |
T | val, | ||
Int2Type< STORE_DEFAULT > | , | ||
Int2Type< false > | |||
) |
Define ThreadStore specializations for the various cache store modifiers. ThreadStore definition for STORE_DEFAULT modifier on iterator types
Definition at line 292 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | T * | ptr, |
T | val, | ||
Int2Type< STORE_DEFAULT > | , | ||
Int2Type< true > | |||
) |
ThreadStore definition for STORE_DEFAULT modifier on pointer types
Definition at line 306 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | T * | ptr, |
T | val, | ||
Int2Type< STORE_VOLATILE > | , | ||
Int2Type< true > | |||
) |
ThreadStore definition for STORE_VOLATILE modifier on pointer types
Definition at line 361 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | T * | ptr, |
T | val, | ||
Int2Type< MODIFIER > | , | ||
Int2Type< true > | |||
) |
ThreadStore definition for generic modifiers on pointer types
Definition at line 375 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr | ( | T * | ptr, |
T | val, | ||
Int2Type< true > | |||
) |
ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
Definition at line 320 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr | ( | T * | ptr, |
T | val, | ||
Int2Type< false > | |||
) |
ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
Definition at line 333 of file thread_store.cuh.