OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
UtilIo

Data Structures

class  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< _POLICY, DUMMY >
 Load helper. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::TempStorage
 The operations exposed by BlockLoad require a temporary memory allocation of this nested type for thread communication. More...
 
class  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< _POLICY, DUMMY >
 Store helper. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::TempStorage
 The operations exposed by BlockStore require a temporary memory allocation of this nested type for thread communication. More...
 
struct  cub::IterateThreadLoad< COUNT, MAX >
 Helper structure for templated load iteration (inductive case) More...
 
struct  cub::IterateThreadLoad< MAX, MAX >
 Helper structure for templated load iteration (termination case) More...
 
struct  cub::IterateThreadStore< COUNT, MAX >
 Helper structure for templated store iteration (inductive case) More...
 
struct  cub::IterateThreadStore< MAX, MAX >
 Helper structure for templated store iteration (termination case) More...
 

Typedefs

typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::TempStorage
 Shared memory storage layout type.
 
typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::TempStorage
 Shared memory storage layout type.
 
typedef BlockExchange< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::BlockExchange
 
typedef BlockExchange< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::BlockExchange
 
typedef BlockExchange< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange
 
typedef LoadInternal< ALGORITHM, 0 > cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalLoad
 Internal load implementation to use.
 
typedef InternalLoad::TempStorage cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage
 Shared memory storage layout type.
 
typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::TempStorage
 Shared memory storage layout type.
 
typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::TempStorage
 Shared memory storage layout type.
 
typedef BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::BlockExchange
 
typedef BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::BlockExchange
 
typedef BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange
 
typedef StoreInternal< ALGORITHM, 0 > cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalStore
 Internal store implementation to use.
 
typedef InternalStore::TempStorage cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage
 Shared memory storage layout type.
 

Enumerations

enum  cub::BlockLoadAlgorithm {
  cub::BLOCK_LOAD_DIRECT , cub::BLOCK_LOAD_VECTORIZE , cub::BLOCK_LOAD_TRANSPOSE , cub::BLOCK_LOAD_WARP_TRANSPOSE ,
  cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
}
 cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. More...
 
enum  { cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z }
 Constants. More...
 
enum  { WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }
 
enum  { WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }
 
enum  cub::BlockStoreAlgorithm {
  cub::BLOCK_STORE_DIRECT , cub::BLOCK_STORE_VECTORIZE , cub::BLOCK_STORE_TRANSPOSE , cub::BLOCK_STORE_WARP_TRANSPOSE ,
  cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
}
 cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. More...
 
enum  { cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z }
 Constants. More...
 
enum  { WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }
 
enum  { WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }
 
enum  cub::CacheLoadModifier {
  cub::LOAD_DEFAULT , cub::LOAD_CA , cub::LOAD_CG , cub::LOAD_CS ,
  cub::LOAD_CV , cub::LOAD_LDG , cub::LOAD_VOLATILE
}
 Enumeration of cache modifiers for memory load operations. More...
 
enum  cub::CacheStoreModifier {
  cub::STORE_DEFAULT , cub::STORE_WB , cub::STORE_CG , cub::STORE_CS ,
  cub::STORE_WT , cub::STORE_VOLATILE
}
 Enumeration of cache modifiers for memory store operations. More...
 

Functions

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::LoadInternal (TempStorage &, int linear_tid)
 Constructor.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items from memory, guarded by range.
 
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
 
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::LoadInternal (TempStorage &, int linear_tid)
 Constructor.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (const InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
 
template<CacheLoadModifier MODIFIER, typename ValueType , typename OffsetT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (CacheModifiedInputIterator< MODIFIER, ValueType, OffsetT > block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
 
template<typename _InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (_InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items from memory, guarded by range (skips vectorization)
 
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
 
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid)
 Constructor.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items from memory, guarded by range.
 
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
 
 cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")
 
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid)
 Constructor.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items from memory, guarded by range.
 
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
 
 cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")
 
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid)
 Constructor.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items from memory, guarded by range.
 
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
 
__device__ __forceinline__ _TempStorage & cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ()
 Internal storage allocator.
 
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::StoreInternal (TempStorage &, int linear_tid)
 Constructor.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store items into a linear segment of memory.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store items into a linear segment of memory, guarded by range.
 
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::StoreInternal (TempStorage &, int linear_tid)
 Constructor.
 
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store items into a linear segment of memory, guarded by range.
 
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid)
 Constructor.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store items into a linear segment of memory.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store items into a linear segment of memory, guarded by range.
 
 cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")
 
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid)
 Constructor.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store items into a linear segment of memory.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store items into a linear segment of memory, guarded by range.
 
 cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")
 
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid)
 Constructor.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store items into a linear segment of memory.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store items into a linear segment of memory, guarded by range.
 
__device__ __forceinline__ _TempStorage & cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ()
 Internal storage allocator.
 
template<CacheLoadModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< COUNT, MAX >::Load (T const *ptr, T *vals)
 
template<typename InputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< COUNT, MAX >::Dereference (InputIteratorT itr, T *vals)
 
template<CacheLoadModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< MAX, MAX >::Load (T const *, T *)
 
template<typename InputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< MAX, MAX >::Dereference (InputIteratorT, T *)
 
template<CacheStoreModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< COUNT, MAX >::Store (T *ptr, T *vals)
 
template<typename OutputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< COUNT, MAX >::Dereference (OutputIteratorT ptr, T *vals)
 
template<CacheStoreModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< MAX, MAX >::Store (T *, T *)
 
template<typename OutputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< MAX, MAX >::Dereference (OutputIteratorT, T *)
 

Variables

int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::linear_tid
 Linear thread-id.
 
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::linear_tid
 Linear thread-id.
 
_TempStorage & cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::linear_tid
 Linear thread-id.
 
_TempStorage & cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::linear_tid
 Linear thread-id.
 
_TempStorage & cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid
 Linear thread-id.
 
_TempStorage & cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid
 Linear thread-id.
 
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::linear_tid
 Linear thread-id.
 
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::linear_tid
 Linear thread-id.
 
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::_TempStorage::valid_items
 Temporary storage for partially-full block guard.
 
_TempStorage & cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::linear_tid
 Linear thread-id.
 
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::_TempStorage::valid_items
 Temporary storage for partially-full block guard.
 
_TempStorage & cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::linear_tid
 Linear thread-id.
 
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage::valid_items
 Temporary storage for partially-full block guard.
 
_TempStorage & cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid
 Linear thread-id.
 
_TempStorage & cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage
 Thread reference to shared storage.
 
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid
 Linear thread-id.
 

Blocked arrangement I/O (direct)

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block.
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 
template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block.
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a blocked arrangement of items across a thread block into a linear segment of items.
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Store a blocked arrangement of items across a thread block into a linear segment of items.
 

Striped arrangement I/O (direct)

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a striped arrangement across the thread block.
 
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
 
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a striped arrangement of data across the thread block into a linear segment of items.
 
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.
 

Warp-striped arrangement I/O (direct)

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a warp-striped arrangement across the thread block.
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a warp-striped arrangement of data across the thread block into a linear segment of items.
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.
 

Collective constructors

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad ()
 Collective constructor using a private static allocation of shared memory as temporary storage.
 
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad (TempStorage &temp_storage)
 Collective constructor using the specified memory allocation as temporary storage.
 

Data movement

template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items from memory.
 
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items from memory, guarded by range.
 
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
 

Collective constructors

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore ()
 Collective constructor using a private static allocation of shared memory as temporary storage.
 
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore (TempStorage &temp_storage)
 Collective constructor using the specified memory allocation as temporary storage.
 

Data movement

template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store items into a linear segment of memory.
 
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store items into a linear segment of memory, guarded by range.
 

Thread I/O (cache modified)

template<CacheLoadModifier MODIFIER, typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad (InputIteratorT itr)
 Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.
 
template<typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad (InputIteratorT itr, Int2Type< LOAD_DEFAULT >, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoad (T *ptr, Int2Type< LOAD_DEFAULT >, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoad (T *ptr, Int2Type< LOAD_VOLATILE >, Int2Type< true >)
 
template<typename T , int MODIFIER>
__device__ __forceinline__ T cub::ThreadLoad (T const *ptr, Int2Type< MODIFIER >, Int2Type< true >)
 
template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore (OutputIteratorT itr, T val)
 Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.
 
template<typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore (OutputIteratorT itr, T val, Int2Type< STORE_DEFAULT >, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStore (T *ptr, T val, Int2Type< STORE_DEFAULT >, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStore (T *ptr, T val, Int2Type< STORE_VOLATILE >, Int2Type< true >)
 
template<typename T , int MODIFIER>
__device__ __forceinline__ void cub::ThreadStore (T *ptr, T val, Int2Type< MODIFIER >, Int2Type< true >)
 
#define _CUB_LOAD_16(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_8(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_4(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_2(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_1(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_16(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_8(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_4(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_2(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_1(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)
 

Detailed Description

Macro Definition Documentation

◆ _CUB_LOAD_1

#define _CUB_LOAD_1 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr) \
{ \
unsigned short retval; \
asm volatile ( \
"{" \
" .reg .u8 datum;" \
" ld."#ptx_modifier".u8 datum, [%1];" \
" cvt.u16.u8 %0, datum;" \
"}" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return (unsigned char) retval; \
}

Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier

Definition at line 247 of file thread_load.cuh.

◆ _CUB_LOAD_16

#define _CUB_LOAD_16 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr) \
{ \
uint4 retval; \
asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \
"=r"(retval.x), \
"=r"(retval.y), \
"=r"(retval.z), \
"=r"(retval.w) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr) \
{ \
ulonglong2 retval; \
asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \
"=l"(retval.x), \
"=l"(retval.y) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}

Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier

Definition at line 154 of file thread_load.cuh.

◆ _CUB_LOAD_2

#define _CUB_LOAD_2 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr) \
{ \
unsigned short retval; \
asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}

Define an unsigned short (2B) ThreadLoad specialization for the given Cache load modifier

Definition at line 232 of file thread_load.cuh.

◆ _CUB_LOAD_4

#define _CUB_LOAD_4 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr) \
{ \
unsigned int retval; \
asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \
"=r"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}

Define a uint (4B) ThreadLoad specialization for the given Cache load modifier

Definition at line 217 of file thread_load.cuh.

◆ _CUB_LOAD_8

#define _CUB_LOAD_8 (   cub_modifier,
  ptx_modifier 
)

Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier

Definition at line 181 of file thread_load.cuh.

◆ _CUB_LOAD_ALL

#define _CUB_LOAD_ALL (   cub_modifier,
  ptx_modifier 
)
Value:
_CUB_LOAD_16(cub_modifier, ptx_modifier) \
_CUB_LOAD_8(cub_modifier, ptx_modifier) \
_CUB_LOAD_4(cub_modifier, ptx_modifier) \
_CUB_LOAD_2(cub_modifier, ptx_modifier) \
_CUB_LOAD_1(cub_modifier, ptx_modifier) \
#define _CUB_LOAD_16(cub_modifier, ptx_modifier)

Define powers-of-two ThreadLoad specializations for the given Cache load modifier

Definition at line 267 of file thread_load.cuh.

◆ _CUB_STORE_1

#define _CUB_STORE_1 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
{ \
asm volatile ( \
"{" \
" .reg .u8 datum;" \
" cvt.u8.u16 datum, %1;" \
" st."#ptx_modifier".u8 [%0], datum;" \
"}" : : \
_CUB_ASM_PTR_(ptr), \
"h"((unsigned short) val)); \
}

Define an unsigned char (1B) ThreadStore specialization for the given Cache store modifier

Definition at line 238 of file thread_store.cuh.

◆ _CUB_STORE_16

#define _CUB_STORE_16 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y), \
"r"(val.z), \
"r"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val.x), \
"l"(val.y)); \
}

Define a uint4 (16B) ThreadStore specialization for the given Cache store modifier

Definition at line 158 of file thread_store.cuh.

◆ _CUB_STORE_2

#define _CUB_STORE_2 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
{ \
asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val)); \
}

Define an unsigned short (2B) ThreadStore specialization for the given Cache store modifier

Definition at line 225 of file thread_store.cuh.

◆ _CUB_STORE_4

#define _CUB_STORE_4 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
{ \
asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val)); \
}

Define an unsigned int (4B) ThreadStore specialization for the given Cache store modifier

Definition at line 212 of file thread_store.cuh.

◆ _CUB_STORE_8

#define _CUB_STORE_8 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val.x), \
"h"(val.y), \
"h"(val.z), \
"h"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
{ \
asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val)); \
}

Define a uint2 (8B) ThreadStore specialization for the given Cache store modifier

Definition at line 182 of file thread_store.cuh.

◆ _CUB_STORE_ALL

#define _CUB_STORE_ALL (   cub_modifier,
  ptx_modifier 
)
Value:
_CUB_STORE_16(cub_modifier, ptx_modifier) \
_CUB_STORE_8(cub_modifier, ptx_modifier) \
_CUB_STORE_4(cub_modifier, ptx_modifier) \
_CUB_STORE_2(cub_modifier, ptx_modifier) \
_CUB_STORE_1(cub_modifier, ptx_modifier) \
#define _CUB_STORE_16(cub_modifier, ptx_modifier)

Define powers-of-two ThreadStore specializations for the given Cache store modifier

Definition at line 255 of file thread_store.cuh.

Typedef Documentation

◆ _TempStorage [1/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
typedef InternalLoad::TempStorage cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage
private

Shared memory storage layout type.

Definition at line 1027 of file block_load.cuh.

◆ _TempStorage [2/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
typedef InternalStore::TempStorage cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage
private

Shared memory storage layout type.

Definition at line 833 of file block_store.cuh.

◆ BlockExchange [1/6]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 808 of file block_load.cuh.

◆ BlockExchange [2/6]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 883 of file block_load.cuh.

◆ BlockExchange [3/6]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange

Definition at line 958 of file block_load.cuh.

◆ BlockExchange [4/6]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 636 of file block_store.cuh.

◆ BlockExchange [5/6]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 704 of file block_store.cuh.

◆ BlockExchange [6/6]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange

Definition at line 772 of file block_store.cuh.

◆ InternalLoad

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
typedef LoadInternal<ALGORITHM, 0> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalLoad
private

Internal load implementation to use.

Definition at line 1023 of file block_load.cuh.

◆ InternalStore

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
typedef StoreInternal<ALGORITHM, 0> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalStore
private

Internal store implementation to use.

Definition at line 829 of file block_store.cuh.

◆ TempStorage [1/4]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 672 of file block_load.cuh.

◆ TempStorage [2/4]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 725 of file block_load.cuh.

◆ TempStorage [3/4]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 546 of file block_store.cuh.

◆ TempStorage [4/4]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 587 of file block_store.cuh.

Enumeration Type Documentation

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
anonymous enum
private

Constants.

Enumerator
BLOCK_THREADS 

The thread block size in threads.

Definition at line 523 of file block_store.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
anonymous enum

Definition at line 695 of file block_store.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
anonymous enum

Definition at line 763 of file block_store.cuh.

◆ anonymous enum

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
anonymous enum
private

Constants.

Enumerator
BLOCK_THREADS 

The thread block size in threads.

Definition at line 649 of file block_load.cuh.

◆ anonymous enum

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
anonymous enum

Definition at line 874 of file block_load.cuh.

◆ anonymous enum

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
anonymous enum

Definition at line 949 of file block_load.cuh.

◆ BlockLoadAlgorithm

cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.

cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.

Enumerator
BLOCK_LOAD_DIRECT 
Overview

A blocked arrangement of data is read directly from memory.

Performance Considerations
  • The utilization of memory transactions (coalescing) decreases as the access stride between threads increases (i.e., the number of items per thread).
BLOCK_LOAD_VECTORIZE 
Overview

A blocked arrangement of data is read from memory using CUDA's built-in vectorized loads as a coalescing optimization. For example, ld.global.v4.s32 instructions will be generated when T = int and ITEMS_PER_THREAD % 4 == 0.

Performance Considerations
  • The utilization of memory transactions (coalescing) remains high until the access stride between threads (i.e., the number of items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever is lower).
  • The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
    • ITEMS_PER_THREAD is odd
    • The InputIteratorT is not a simple pointer type
    • The block input offset is not quadword-aligned
    • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
BLOCK_LOAD_TRANSPOSE 
Overview

A striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement.

Performance Considerations
  • The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
  • The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
BLOCK_LOAD_WARP_TRANSPOSE 
Overview

A warp-striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement.

Usage Considerations
  • BLOCK_THREADS must be a multiple of WARP_THREADS
Performance Considerations
  • The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
  • The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
  • Provisions more shared storage, but incurs smaller latencies than the BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED 
Overview

Like BLOCK_LOAD_WARP_TRANSPOSE, a warp-striped arrangement of data is read directly from memory and then is locally transposed into a blocked arrangement. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps.

Usage Considerations
  • BLOCK_THREADS must be a multiple of WARP_THREADS
Performance Considerations
  • The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
  • Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.

Definition at line 473 of file block_load.cuh.

◆ BlockStoreAlgorithm

cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.

Enumerator
BLOCK_STORE_DIRECT 
Overview

A blocked arrangement of data is written directly to memory.

Performance Considerations
  • The utilization of memory transactions (coalescing) decreases as the access stride between threads increases (i.e., the number of items per thread).
BLOCK_STORE_VECTORIZE 
Overview

A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization. For example, st.global.v4.s32 instructions will be generated when T = int and ITEMS_PER_THREAD % 4 == 0.

Performance Considerations
  • The utilization of memory transactions (coalescing) remains high until the access stride between threads (i.e., the number of items per thread) exceeds the maximum vector store width (typically 4 items or 64B, whichever is lower).
  • The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
    • ITEMS_PER_THREAD is odd
    • The OutputIteratorT is not a simple pointer type
    • The block output offset is not quadword-aligned
    • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
BLOCK_STORE_TRANSPOSE 
Overview
A blocked arrangement is locally transposed and then efficiently written to memory as a striped arrangement.
Performance Considerations
  • The utilization of memory transactions (coalescing) remains high regardless of items written per thread.
  • The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
BLOCK_STORE_WARP_TRANSPOSE 
Overview
A blocked arrangement is locally transposed and then efficiently written to memory as a warp-striped arrangement.
Usage Considerations
  • BLOCK_THREADS must be a multiple of WARP_THREADS
Performance Considerations
  • The utilization of memory transactions (coalescing) remains high regardless of items written per thread.
  • The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED 
Overview
A blocked arrangement is locally transposed and then efficiently written to memory as a warp-striped arrangement. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps.
Usage Considerations
  • BLOCK_THREADS must be a multiple of WARP_THREADS
Performance Considerations
  • The utilization of memory transactions (coalescing) remains high regardless of items written per thread.
  • Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.

Definition at line 354 of file block_store.cuh.

◆ CacheLoadModifier

Enumeration of cache modifiers for memory load operations.

Enumerator
LOAD_DEFAULT 

Default (no modifier)

LOAD_CA 

Cache at all levels.

LOAD_CG 

Cache at global level.

LOAD_CS 

Cache streaming (likely to be accessed once)

LOAD_CV 

Cache as volatile (including cached system lines)

LOAD_LDG 

Cache as texture.

LOAD_VOLATILE 

Volatile (any memory space)

Definition at line 62 of file thread_load.cuh.

◆ CacheStoreModifier

Enumeration of cache modifiers for memory store operations.

Enumerator
STORE_DEFAULT 

Default (no modifier)

STORE_WB 

Cache write-back all coherent levels.

STORE_CG 

Cache at global level.

STORE_CS 

Cache streaming (likely to be accessed once)

STORE_WT 

Cache write-through (to system memory)

STORE_VOLATILE 

Volatile shared (any memory space)

Definition at line 61 of file thread_store.cuh.

Function Documentation

◆ BlockLoad() [1/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad ( )
inline

Collective constructor using a private static allocation of shared memory as temporary storage.

Definition at line 1066 of file block_load.cuh.

◆ BlockLoad() [2/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad ( TempStorage temp_storage)
inline

Collective constructor using the specified memory allocation as temporary storage.

Parameters
[in]temp_storageReference to memory allocation having layout type TempStorage

Definition at line 1076 of file block_load.cuh.

◆ BlockStore() [1/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore ( )
inline

Collective constructor using a private static allocation of shared memory as temporary storage.

Definition at line 873 of file block_store.cuh.

◆ BlockStore() [2/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore ( TempStorage temp_storage)
inline

Collective constructor using the specified memory allocation as temporary storage.

Parameters
[in]temp_storageReference to memory allocation having layout type TempStorage

Definition at line 883 of file block_store.cuh.

◆ Dereference() [1/4]

template<int COUNT, int MAX>
template<typename InputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< COUNT, MAX >::Dereference ( InputIteratorT  itr,
T *  vals 
)
inlinestatic

Definition at line 131 of file thread_load.cuh.

◆ Dereference() [2/4]

template<int MAX>
template<typename InputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< MAX, MAX >::Dereference ( InputIteratorT  ,
T *   
)
inlinestatic

Definition at line 147 of file thread_load.cuh.

◆ Dereference() [3/4]

template<int COUNT, int MAX>
template<typename OutputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< COUNT, MAX >::Dereference ( OutputIteratorT  ptr,
T *  vals 
)
inlinestatic

Definition at line 135 of file thread_store.cuh.

◆ Dereference() [4/4]

template<int MAX>
template<typename OutputIteratorT , typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< MAX, MAX >::Dereference ( OutputIteratorT  ,
T *   
)
inlinestatic

Definition at line 151 of file thread_store.cuh.

◆ InternalLoadDirectBlockedVectorized()

template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Internal implementation for load vectorization

Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_ptrInput pointer for loading from
[out]itemsData to load

Definition at line 162 of file block_load.cuh.

◆ Load() [1/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename _InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load ( _InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 770 of file block_load.cuh.

◆ Load() [2/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<CacheLoadModifier MODIFIER, typename ValueType , typename OffsetT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load ( CacheModifiedInputIterator< MODIFIER, ValueType, OffsetT >  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 761 of file block_load.cuh.

◆ Load() [3/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load ( const InputT *  block_ptr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)

Parameters
[in]block_ptrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 749 of file block_load.cuh.

◆ Load() [4/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 687 of file block_load.cuh.

◆ Load() [5/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 834 of file block_load.cuh.

◆ Load() [6/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 909 of file block_load.cuh.

◆ Load() [7/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 984 of file block_load.cuh.

◆ Load() [8/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory.

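  • Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
  • A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.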
Snippet
The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for BLOCK_LOAD_WARP_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
__global__ void ExampleKernel(int *d_data, ...)
{
// Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
// Allocate shared memory for BlockLoad
__shared__ typename BlockLoad::TempStorage temp_storage;
// Load a segment of consecutive items that are blocked across threads
int thread_data[4];
BlockLoad(temp_storage).Load(d_data, thread_data);
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
_TempStorage & temp_storage
Thread reference to shared storage.
__device__ __forceinline__ BlockLoad()
Collective constructor using a private static allocation of shared memory as temporary storage.
\smemstorage{BlockLoad}
Suppose the input d_data is 0, 1, 2, 3, 4, 5, .... The set of thread_data across the block of threads in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 1130 of file block_load.cuh.

◆ Load() [9/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Load a linear segment of items from memory, guarded by range.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 696 of file block_load.cuh.

◆ Load() [10/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Load a linear segment of items from memory, guarded by range (skips vectorization)

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 779 of file block_load.cuh.

◆ Load() [11/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Load a linear segment of items from memory, guarded by range.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 844 of file block_load.cuh.

◆ Load() [12/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Load a linear segment of items from memory, guarded by range.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 919 of file block_load.cuh.

◆ Load() [13/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Load a linear segment of items from memory, guarded by range.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 994 of file block_load.cuh.

◆ Load() [14/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Load a linear segment of items from memory, guarded by range.

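  • Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
  • A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.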
Snippet
The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for BLOCK_LOAD_WARP_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
// Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
// Allocate shared memory for BlockLoad
__shared__ typename BlockLoad::TempStorage temp_storage;
// Load a segment of consecutive items that are blocked across threads
int thread_data[4];
BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
Suppose the input d_data is 0, 1, 2, 3, 4, 5, 6... and valid_items is 5. The set of thread_data across the block of threads in those threads will be { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads being unmasked to load portions of valid data (and other items remaining unassigned).
Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 1176 of file block_load.cuh.

◆ Load() [15/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)
inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 706 of file block_load.cuh.

◆ Load() [16/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)
inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 789 of file block_load.cuh.

◆ Load() [17/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)
inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 855 of file block_load.cuh.

◆ Load() [18/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)
inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 931 of file block_load.cuh.

◆ Load() [19/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)
inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 1006 of file block_load.cuh.

◆ Load() [20/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load ( InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)
inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

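  • Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
  • A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.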
Snippet
The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for BLOCK_LOAD_WARP_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).
#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
// Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
// Allocate shared memory for BlockLoad
__shared__ typename BlockLoad::TempStorage temp_storage;
// Load a segment of consecutive items that are blocked across threads
int thread_data[4];
BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
Suppose the input d_data is 0, 1, 2, 3, 4, 5, 6..., valid_items is 5, and the out-of-bounds default is -1. The set of thread_data across the block of threads in those threads will be { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads being unmasked to load portions of valid data (and other items are assigned -1).
Parameters
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 1224 of file block_load.cuh.

◆ Load() [21/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename InputIteratorT >
__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load ( InputT *  block_ptr,
InputT(&)  items[ITEMS_PER_THREAD] 
)
inline

Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)

Parameters
[in]block_ptrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 740 of file block_load.cuh.

◆ Load() [22/23]

template<int MAX>
template<CacheLoadModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< MAX, MAX >::Load ( T const *  ,
T *   
)
inlinestatic

Definition at line 144 of file thread_load.cuh.

◆ Load() [23/23]

template<int COUNT, int MAX>
template<CacheLoadModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadLoad< COUNT, MAX >::Load ( T const *  ptr,
T *  vals 
)
inlinestatic

Definition at line 124 of file thread_load.cuh.

◆ LoadDirectBlocked() [1/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a blocked arrangement across the thread block.

\blocked

Template Parameters
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 76 of file block_load.cuh.

◆ LoadDirectBlocked() [2/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.

\blocked

Template Parameters
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 105 of file block_load.cuh.

◆ LoadDirectBlocked() [3/3]

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

\blocked

Template Parameters
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 138 of file block_load.cuh.

◆ LoadDirectBlockedVectorized()

template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a blocked arrangement across the thread block.

\blocked

The input offset (block_ptr + block_offset) must be quad-item aligned

The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:

  • ITEMS_PER_THREAD is odd
  • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
Template Parameters
T[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_ptrInput pointer for loading from
[out]itemsData to load

Definition at line 227 of file block_load.cuh.

◆ LoadDirectStriped() [1/3]

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a striped arrangement across the thread block.

\striped

Template Parameters
BLOCK_THREADSThe thread block size in threads
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 258 of file block_load.cuh.

◆ LoadDirectStriped() [2/3]

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range.

\striped

Template Parameters
BLOCK_THREADSThe thread block size in threads
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 288 of file block_load.cuh.

◆ LoadDirectStriped() [3/3]

template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

\striped

Template Parameters
BLOCK_THREADSThe thread block size in threads
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 323 of file block_load.cuh.

◆ LoadDirectWarpStriped() [1/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a warp-striped arrangement across the thread block.

\warpstriped

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load

Definition at line 362 of file block_load.cuh.

◆ LoadDirectWarpStriped() [2/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.

\warpstriped

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load

Definition at line 398 of file block_load.cuh.

◆ LoadDirectWarpStriped() [3/3]

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

\warpstriped

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT[inferred] The data type to load.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT[inferred] The random-access iterator type for input \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base input iterator for loading from
[out]itemsData to load
[in]valid_itemsNumber of valid items to load
[in]oob_defaultDefault value to assign out-of-bound items

Definition at line 439 of file block_load.cuh.

◆ LoadInternal() [1/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::LoadInternal ( TempStorage ,
int  linear_tid 
)
inline

Constructor.

Definition at line 678 of file block_load.cuh.

◆ LoadInternal() [2/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::LoadInternal ( TempStorage ,
int  linear_tid 
)
inline

Constructor.

Definition at line 731 of file block_load.cuh.

◆ LoadInternal() [3/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::LoadInternal ( TempStorage temp_storage,
int  linear_tid 
)
inline

Constructor.

Definition at line 824 of file block_load.cuh.

◆ LoadInternal() [4/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::LoadInternal ( TempStorage temp_storage,
int  linear_tid 
)
inline

Constructor.

Definition at line 899 of file block_load.cuh.

◆ LoadInternal() [5/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::LoadInternal ( TempStorage temp_storage,
int  linear_tid 
)
inline

Constructor.

Definition at line 974 of file block_load.cuh.

◆ PrivateStorage() [1/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ _TempStorage & cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ( )
inlineprivate

Internal storage allocator.

Definition at line 1035 of file block_load.cuh.

◆ PrivateStorage() [2/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ _TempStorage & cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ( )
inlineprivate

Internal storage allocator.

Definition at line 841 of file block_store.cuh.

◆ Store() [1/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)
inline

Store items into a linear segment of memory.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 561 of file block_store.cuh.

◆ Store() [2/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)
inline

Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 610 of file block_store.cuh.

◆ Store() [3/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)
inline

Store items into a linear segment of memory.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 665 of file block_store.cuh.

◆ Store() [4/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)
inline

Store items into a linear segment of memory.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 733 of file block_store.cuh.

◆ Store() [5/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)
inline

Store items into a linear segment of memory.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 801 of file block_store.cuh.

◆ Store() [6/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)
inline

Store items into a linear segment of memory.

  • \blocked
  • \smemreuse
Snippet
The code snippet below illustrates the storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for BLOCK_STORE_WARP_TRANSPOSE, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
#include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
__global__ void ExampleKernel(int *d_data, ...)
{
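// Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
// Allocate shared memory for BlockStore
__shared__ typename BlockStore::TempStorage temp_storage;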
// Obtain a segment of consecutive items that are blocked across threads
int thread_data[4];
...
// Store items to linear memory (thread_data is the array declared above)
BlockStore(temp_storage).Store(d_data, thread_data);
The BlockStore class provides collective data movement methods for writing a blocked arrangement of i...
_TempStorage & temp_storage
Thread reference to shared storage.
__device__ __forceinline__ BlockStore()
Collective constructor using a private static allocation of shared memory as temporary storage.
\smemstorage{BlockStore}
Suppose the set of thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. The output d_data will be 0, 1, 2, 3, 4, 5, ....
Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 939 of file block_store.cuh.

◆ Store() [7/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Store items into a linear segment of memory, guarded by range.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 570 of file block_store.cuh.

◆ Store() [8/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Store items into a linear segment of memory, guarded by range.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 619 of file block_store.cuh.

◆ Store() [9/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Store items into a linear segment of memory, guarded by range.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 675 of file block_store.cuh.

◆ Store() [10/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Store items into a linear segment of memory, guarded by range.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 743 of file block_store.cuh.

◆ Store() [11/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Store items into a linear segment of memory, guarded by range.

Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 811 of file block_store.cuh.

◆ Store() [12/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<typename OutputIteratorT >
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store ( OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)
inline

Store items into a linear segment of memory, guarded by range.

  • \blocked
  • \smemreuse
Snippet
The code snippet below illustrates the guarded storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for BLOCK_STORE_WARP_TRANSPOSE, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.
#include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>
__global__ void ExampleKernel(int *d_data, int valid_items, ...)
{
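// Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
// Allocate shared memory for BlockStore
__shared__ typename BlockStore::TempStorage temp_storage;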
// Obtain a segment of consecutive items that are blocked across threads
int thread_data[4];
...
// Store items to linear memory (thread_data is the array declared above)
BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
Suppose the set of thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and valid_items is 5. The output d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with only the first two threads being unmasked to store portions of valid data.
Parameters
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 988 of file block_store.cuh.

◆ Store() [13/15]

template<int MAX>
template<CacheStoreModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< MAX, MAX >::Store ( T *  ,
T *   
)
inlinestatic

Definition at line 148 of file thread_store.cuh.

◆ Store() [14/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store ( T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)
inline

Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)

Parameters
[in]block_ptrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 601 of file block_store.cuh.

◆ Store() [15/15]

template<int COUNT, int MAX>
template<CacheStoreModifier MODIFIER, typename T >
static __device__ __forceinline__ void cub::IterateThreadStore< COUNT, MAX >::Store ( T *  ptr,
T *  vals 
)
inlinestatic

Definition at line 128 of file thread_store.cuh.

◆ StoreDirectBlocked() [1/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a blocked arrangement of items across a thread block into a linear segment of items.

\blocked

Template Parameters
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 74 of file block_store.cuh.

◆ StoreDirectBlocked() [2/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.

\blocked

Template Parameters
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 103 of file block_store.cuh.

◆ StoreDirectBlockedVectorized()

template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a blocked arrangement of items across a thread block into a linear segment of items.

\blocked

The output offset (block_ptr + block_offset) must be quad-item aligned, which is the default starting offset returned by cudaMalloc()

The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
  • ITEMS_PER_THREAD is odd
  • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
Template Parameters
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_ptrOutput pointer for storing to
[in]itemsData to store

Definition at line 143 of file block_store.cuh.

◆ StoreDirectStriped() [1/2]

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a striped arrangement of data across the thread block into a linear segment of items.

\striped

Template Parameters
BLOCK_THREADSThe thread block size in threads
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 206 of file block_store.cuh.

◆ StoreDirectStriped() [2/2]

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.

\striped

Template Parameters
BLOCK_THREADSThe thread block size in threads
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 237 of file block_store.cuh.

◆ StoreDirectWarpStriped() [1/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a warp-striped arrangement of data across the thread block into a linear segment of items.

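Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))).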

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store

Definition at line 281 of file block_store.cuh.

◆ StoreDirectWarpStriped() [2/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.

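Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))).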

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
[in] valid_items: Number of valid items to write

Definition at line 317 of file block_store.cuh.

◆ StoreInternal() [1/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::StoreInternal ( TempStorage ,
int  linear_tid 
)
inline

Constructor.

Definition at line 552 of file block_store.cuh.

◆ StoreInternal() [2/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::StoreInternal ( TempStorage ,
int  linear_tid 
)
inline

Constructor.

Definition at line 593 of file block_store.cuh.

◆ StoreInternal() [3/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::StoreInternal ( TempStorage temp_storage,
int  linear_tid 
)
inline

Constructor.

Definition at line 655 of file block_store.cuh.

◆ StoreInternal() [4/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::StoreInternal ( TempStorage temp_storage,
int  linear_tid 
)
inline

Constructor.

Definition at line 723 of file block_store.cuh.

◆ StoreInternal() [5/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::StoreInternal ( TempStorage temp_storage,
int  linear_tid 
)
inline

Constructor.

Definition at line 791 of file block_store.cuh.

◆ ThreadLoad() [1/5]

template<CacheLoadModifier MODIFIER, typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad ( InputIteratorT  itr)

Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.

Example
#include <cub/cub.cuh> // or equivalently <cub/thread/thread_load.cuh>
// 32-bit load using cache-at-all-levels modifier:
int *d_in;
int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
// 16-bit load using default modifier
short *d_in;
short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
// 256-bit load using cache-volatile modifier
double4 *d_in;
double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
// 96-bit load using cache-streaming modifier
struct TestFoo { bool a; short b; };
TestFoo *d_struct;
TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
Template Parameters
MODIFIER: [inferred] CacheLoadModifier enumeration
InputIteratorT: [inferred] Input iterator type (may be a simple pointer type)

ThreadLoad definition for generic modifiers

Definition at line 420 of file thread_load.cuh.

◆ ThreadLoad() [2/5]

template<typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad ( InputIteratorT  itr,
Int2Type< LOAD_DEFAULT >  ,
Int2Type< false >   
)

Define powers-of-two ThreadLoad specializations for the various cache load modifiers. ThreadLoad definition for LOAD_DEFAULT modifier on iterator types.

Definition at line 312 of file thread_load.cuh.

◆ ThreadLoad() [3/5]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoad ( T *  ptr,
Int2Type< LOAD_DEFAULT >  ,
Int2Type< true >   
)

ThreadLoad definition for LOAD_DEFAULT modifier on pointer types

Definition at line 325 of file thread_load.cuh.

◆ ThreadLoad() [4/5]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoad ( T *  ptr,
Int2Type< LOAD_VOLATILE >  ,
Int2Type< true >   
)

ThreadLoad definition for LOAD_VOLATILE modifier on pointer types

Definition at line 381 of file thread_load.cuh.

◆ ThreadLoad() [5/5]

template<typename T , int MODIFIER>
__device__ __forceinline__ T cub::ThreadLoad ( T const *  ptr,
Int2Type< MODIFIER >  ,
Int2Type< true >   
)

ThreadLoad definition for generic modifiers on pointer types

Definition at line 395 of file thread_load.cuh.

◆ ThreadLoadVolatilePointer() [1/2]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer ( T *  ptr,
Int2Type< false >   
)

ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types

Definition at line 351 of file thread_load.cuh.

◆ ThreadLoadVolatilePointer() [2/2]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer ( T *  ptr,
Int2Type< true >   
)

ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types

Definition at line 338 of file thread_load.cuh.

◆ ThreadStore() [1/5]

template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore ( OutputIteratorT  itr,
T  val 
)

Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.

Example
#include <cub/cub.cuh> // or equivalently <cub/thread/thread_store.cuh>
// 32-bit store using cache-global modifier:
int *d_out;
int val;
cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
// 16-bit store using default modifier
short *d_out;
short val;
cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
// 256-bit store using write-through modifier
double4 *d_out;
double4 val;
cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
// 96-bit store using cache-streaming cache modifier
struct TestFoo { bool a; short b; };
TestFoo *d_struct;
TestFoo val;
cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Template Parameters
MODIFIER: [inferred] CacheStoreModifier enumeration
OutputIteratorT: [inferred] Output iterator type (may be a simple pointer type)
T: [inferred] Data type of output value

ThreadStore definition for generic modifiers

Definition at line 404 of file thread_store.cuh.

◆ ThreadStore() [2/5]

template<typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore ( OutputIteratorT  itr,
T  val,
Int2Type< STORE_DEFAULT >  ,
Int2Type< false >   
)

Define ThreadStore specializations for the various cache store modifiers. ThreadStore definition for STORE_DEFAULT modifier on iterator types.

Definition at line 292 of file thread_store.cuh.

◆ ThreadStore() [3/5]

template<typename T , int MODIFIER>
__device__ __forceinline__ void cub::ThreadStore ( T *  ptr,
T  val,
Int2Type< MODIFIER >  ,
Int2Type< true >   
)

ThreadStore definition for generic modifiers on pointer types

Definition at line 375 of file thread_store.cuh.

◆ ThreadStore() [4/5]

template<typename T >
__device__ __forceinline__ void cub::ThreadStore ( T *  ptr,
T  val,
Int2Type< STORE_DEFAULT >  ,
Int2Type< true >   
)

ThreadStore definition for STORE_DEFAULT modifier on pointer types

Definition at line 306 of file thread_store.cuh.

◆ ThreadStore() [5/5]

template<typename T >
__device__ __forceinline__ void cub::ThreadStore ( T *  ptr,
T  val,
Int2Type< STORE_VOLATILE >  ,
Int2Type< true >   
)

ThreadStore definition for STORE_VOLATILE modifier on pointer types

Definition at line 361 of file thread_store.cuh.

◆ ThreadStoreVolatilePtr() [1/2]

template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr ( T *  ptr,
T  val,
Int2Type< false >   
)

ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types

Definition at line 333 of file thread_store.cuh.

◆ ThreadStoreVolatilePtr() [2/2]

template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr ( T *  ptr,
T  val,
Int2Type< true >   
)

ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types

Definition at line 320 of file thread_store.cuh.

Variable Documentation

◆ linear_tid [1/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::linear_tid

Linear thread-id.

Definition at line 675 of file block_load.cuh.

◆ linear_tid [2/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 728 of file block_load.cuh.

◆ linear_tid [3/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 821 of file block_load.cuh.

◆ linear_tid [4/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 896 of file block_load.cuh.

◆ linear_tid [5/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid

Linear thread-id.

Definition at line 971 of file block_load.cuh.

◆ linear_tid [6/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid
private

Linear thread-id.

Definition at line 1050 of file block_load.cuh.

◆ linear_tid [7/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::linear_tid

Linear thread-id.

Definition at line 549 of file block_store.cuh.

◆ linear_tid [8/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 590 of file block_store.cuh.

◆ linear_tid [9/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 652 of file block_store.cuh.

◆ linear_tid [10/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 720 of file block_store.cuh.

◆ linear_tid [11/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid

Linear thread-id.

Definition at line 788 of file block_store.cuh.

◆ linear_tid [12/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid
private

Linear thread-id.

Definition at line 856 of file block_store.cuh.

◆ temp_storage [1/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 818 of file block_load.cuh.

◆ temp_storage [2/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 893 of file block_load.cuh.

◆ temp_storage [3/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 968 of file block_load.cuh.

◆ temp_storage [4/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage
private

Thread reference to shared storage.

Definition at line 1047 of file block_load.cuh.

◆ temp_storage [5/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 649 of file block_store.cuh.

◆ temp_storage [6/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 717 of file block_store.cuh.

◆ temp_storage [7/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 785 of file block_store.cuh.

◆ temp_storage [8/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage
private

Thread reference to shared storage.

Definition at line 853 of file block_store.cuh.

◆ valid_items [1/3]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::_TempStorage::valid_items

Temporary storage for partially-full block guard.

Definition at line 642 of file block_store.cuh.

◆ valid_items [2/3]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::_TempStorage::valid_items

Temporary storage for partially-full block guard.

Definition at line 710 of file block_store.cuh.

◆ valid_items [3/3]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>
template<int DUMMY>
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage::valid_items

Temporary storage for partially-full block guard.

Definition at line 778 of file block_store.cuh.