Data Structures
class	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
	The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< _POLICY, DUMMY >
	Load helper. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::_TempStorage
	Shared memory storage layout type. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::TempStorage
	Alias wrapper allowing storage to be unioned. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::_TempStorage
	Shared memory storage layout type. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::TempStorage
	Alias wrapper allowing storage to be unioned. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage
	Shared memory storage layout type. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::TempStorage
	Alias wrapper allowing storage to be unioned. More...

struct	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::TempStorage
	The operations exposed by BlockLoad require a temporary memory allocation of this nested type for thread communication. More...

class	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
	The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< _POLICY, DUMMY >
	Store helper. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::_TempStorage
	Shared memory storage layout type. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::TempStorage
	Alias wrapper allowing storage to be unioned. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::_TempStorage
	Shared memory storage layout type. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::TempStorage
	Alias wrapper allowing storage to be unioned. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage
	Shared memory storage layout type. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::TempStorage
	Alias wrapper allowing storage to be unioned. More...

struct	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::TempStorage
	The operations exposed by BlockStore require a temporary memory allocation of this nested type for thread communication. More...

struct	cub::IterateThreadLoad< COUNT, MAX >
	Helper structure for templated load iteration (inductive case) More...

struct	cub::IterateThreadLoad< MAX, MAX >
	Helper structure for templated load iteration (termination case) More...

struct	cub::IterateThreadStore< COUNT, MAX >
	Helper structure for templated store iteration (inductive case) More...

struct	cub::IterateThreadStore< MAX, MAX >
	Helper structure for templated store iteration (termination case) More...

Typedefs
typedef NullType	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::TempStorage
	Shared memory storage layout type.

typedef NullType	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::TempStorage
	Shared memory storage layout type.

typedef BlockExchange< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::BlockExchange

typedef BlockExchange< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::BlockExchange

typedef BlockExchange< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange

typedef LoadInternal< ALGORITHM, 0 >	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalLoad
	Internal load implementation to use.

typedef InternalLoad::TempStorage	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage
	Shared memory storage layout type.

typedef NullType	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::TempStorage
	Shared memory storage layout type.

typedef NullType	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::TempStorage
	Shared memory storage layout type.

typedef BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::BlockExchange

typedef BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::BlockExchange

typedef BlockExchange< T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange

typedef StoreInternal< ALGORITHM, 0 >	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalStore
	Internal store implementation to use.

typedef InternalStore::TempStorage	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage
	Shared memory storage layout type.

Enumerations
enum	cub::BlockLoadAlgorithm { cub::BLOCK_LOAD_DIRECT , cub::BLOCK_LOAD_VECTORIZE , cub::BLOCK_LOAD_TRANSPOSE , cub::BLOCK_LOAD_WARP_TRANSPOSE , cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED }
	cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. More...

enum	{ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z }
	Constants. More...

enum	{ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }

enum	{ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }

enum	cub::BlockStoreAlgorithm { cub::BLOCK_STORE_DIRECT , cub::BLOCK_STORE_VECTORIZE , cub::BLOCK_STORE_TRANSPOSE , cub::BLOCK_STORE_WARP_TRANSPOSE , cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED }
	cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. More...

enum	{ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z }
	Constants. More...

enum	{ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }

enum	{ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) }

enum	cub::CacheLoadModifier { cub::LOAD_DEFAULT , cub::LOAD_CA , cub::LOAD_CG , cub::LOAD_CS , cub::LOAD_CV , cub::LOAD_LDG , cub::LOAD_VOLATILE }
	Enumeration of cache modifiers for memory load operations. More...

enum	cub::CacheStoreModifier { cub::STORE_DEFAULT , cub::STORE_WB , cub::STORE_CG , cub::STORE_CS , cub::STORE_WT , cub::STORE_VOLATILE }
	Enumeration of cache modifiers for memory store operations. More...

Functions
__device__ __forceinline__	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::LoadInternal (TempStorage &, int linear_tid)
	Constructor.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items from memory, guarded by range.

template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

__device__ __forceinline__	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::LoadInternal (TempStorage &, int linear_tid)
	Constructor.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (const InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)

template<CacheLoadModifier MODIFIER, typename ValueType , typename OffsetT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (CacheModifiedInputIterator< MODIFIER, ValueType, OffsetT > block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory, specialized for cache-modified input iterators (attempts vectorization)

template<typename _InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (_InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items from memory, guarded by range (skips vectorization)

template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)

__device__ __forceinline__	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid)
	Constructor.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items from memory, guarded by range.

template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")

__device__ __forceinline__	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid)
	Constructor.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items from memory, guarded by range.

template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")

__device__ __forceinline__	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid)
	Constructor.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items from memory, guarded by range.

template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

__device__ __forceinline__ _TempStorage &	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ()
	Internal storage allocator.

__device__ __forceinline__	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::StoreInternal (TempStorage &, int linear_tid)
	Constructor.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store items into a linear segment of memory.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store items into a linear segment of memory, guarded by range.

__device__ __forceinline__	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::StoreInternal (TempStorage &, int linear_tid)
	Constructor.

__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (T *block_ptr, T(&items)[ITEMS_PER_THREAD])
	Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store items into a linear segment of memory, guarded by range.

__device__ __forceinline__	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid)
	Constructor.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store items into a linear segment of memory.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store items into a linear segment of memory, guarded by range.

	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")

__device__ __forceinline__	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid)
	Constructor.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store items into a linear segment of memory.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store items into a linear segment of memory, guarded by range.

	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS")

__device__ __forceinline__	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid)
	Constructor.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store items into a linear segment of memory.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store items into a linear segment of memory, guarded by range.

__device__ __forceinline__ _TempStorage &	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ()
	Internal storage allocator.

template<CacheLoadModifier MODIFIER, typename T >
static __device__ __forceinline__ void	cub::IterateThreadLoad< COUNT, MAX >::Load (T const ptr, T vals)

template<typename InputIteratorT , typename T >
static __device__ __forceinline__ void	cub::IterateThreadLoad< COUNT, MAX >::Dereference (InputIteratorT itr, T *vals)

template<CacheLoadModifier MODIFIER, typename T >
static __device__ __forceinline__ void	cub::IterateThreadLoad< MAX, MAX >::Load (T const , T )

template<typename InputIteratorT , typename T >
static __device__ __forceinline__ void	cub::IterateThreadLoad< MAX, MAX >::Dereference (InputIteratorT, T *)

template<CacheStoreModifier MODIFIER, typename T >
static __device__ __forceinline__ void	cub::IterateThreadStore< COUNT, MAX >::Store (T ptr, T vals)

template<typename OutputIteratorT , typename T >
static __device__ __forceinline__ void	cub::IterateThreadStore< COUNT, MAX >::Dereference (OutputIteratorT ptr, T *vals)

template<CacheStoreModifier MODIFIER, typename T >
static __device__ __forceinline__ void	cub::IterateThreadStore< MAX, MAX >::Store (T , T )

template<typename OutputIteratorT , typename T >
static __device__ __forceinline__ void	cub::IterateThreadStore< MAX, MAX >::Dereference (OutputIteratorT, T *)

Variables
int	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::linear_tid
	Linear thread-id.

int	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::linear_tid
	Linear thread-id.

_TempStorage &	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::temp_storage
	Thread reference to shared storage.

int	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::linear_tid
	Linear thread-id.

_TempStorage &	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::temp_storage
	Thread reference to shared storage.

int	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::linear_tid
	Linear thread-id.

_TempStorage &	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage
	Thread reference to shared storage.

int	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid
	Linear thread-id.

_TempStorage &	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage
	Thread reference to shared storage.

int	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid
	Linear thread-id.

int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::linear_tid
	Linear thread-id.

int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::linear_tid
	Linear thread-id.

volatile int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::_TempStorage::valid_items
	Temporary storage for partially-full block guard.

_TempStorage &	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::temp_storage
	Thread reference to shared storage.

int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::linear_tid
	Linear thread-id.

volatile int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::_TempStorage::valid_items
	Temporary storage for partially-full block guard.

_TempStorage &	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::temp_storage
	Thread reference to shared storage.

int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::linear_tid
	Linear thread-id.

volatile int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage::valid_items
	Temporary storage for partially-full block guard.

_TempStorage &	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage
	Thread reference to shared storage.

int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid
	Linear thread-id.

_TempStorage &	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage
	Thread reference to shared storage.

int	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid
	Linear thread-id.

Blocked arrangement I/O (direct)
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items into a blocked arrangement across the thread block.

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void	cub::InternalLoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])

template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void	cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items into a blocked arrangement across the thread block.

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void	cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store a blocked arrangement of items across a thread block into a linear segment of items.

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void	cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.

template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void	cub::StoreDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
	Store a blocked arrangement of items across a thread block into a linear segment of items.

Striped arrangement I/O (direct)
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items into a striped arrangement across the thread block.

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items into a striped arrangement across the thread block, guarded by range.

template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void	cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store a striped arrangement of data across the thread block into a linear segment of items.

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void	cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.

Warp-striped arrangement I/O (direct)
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items into a warp-striped arrangement across the thread block.

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void	cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void	cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store a warp-striped arrangement of data across the thread block into a linear segment of items.

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void	cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.

Collective constructors
__device__ __forceinline__	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad ()
	Collective constructor using a private static allocation of shared memory as temporary storage.

__device__ __forceinline__	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad (TempStorage &temp_storage)
	Collective constructor using the specified memory allocation as temporary storage.

Data movement
template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
	Load a linear segment of items from memory.

template<typename InputIteratorT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
	Load a linear segment of items from memory, guarded by range.

template<typename InputIteratorT , typename DefaultT >
__device__ __forceinline__ void	cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
	Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Collective constructors
__device__ __forceinline__	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore ()
	Collective constructor using a private static allocation of shared memory as temporary storage.

__device__ __forceinline__	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore (TempStorage &temp_storage)
	Collective constructor using the specified memory allocation as temporary storage.

Data movement
template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
	Store items into a linear segment of memory.

template<typename OutputIteratorT >
__device__ __forceinline__ void	cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
	Store items into a linear segment of memory, guarded by range.

Thread I/O (cache modified)
template<CacheLoadModifier MODIFIER, typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type	cub::ThreadLoad (InputIteratorT itr)
	Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.

template<typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type	cub::ThreadLoad (InputIteratorT itr, Int2Type< LOAD_DEFAULT >, Int2Type< false >)

template<typename T >
__device__ __forceinline__ T	cub::ThreadLoad (T *ptr, Int2Type< LOAD_DEFAULT >, Int2Type< true >)

template<typename T >
__device__ __forceinline__ T	cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< true >)

template<typename T >
__device__ __forceinline__ T	cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< false >)

template<typename T >
__device__ __forceinline__ T	cub::ThreadLoad (T *ptr, Int2Type< LOAD_VOLATILE >, Int2Type< true >)

template<typename T , int MODIFIER>
__device__ __forceinline__ T	cub::ThreadLoad (T const *ptr, Int2Type< MODIFIER >, Int2Type< true >)

template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >
__device__ __forceinline__ void	cub::ThreadStore (OutputIteratorT itr, T val)
	Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.

template<typename OutputIteratorT , typename T >
__device__ __forceinline__ void	cub::ThreadStore (OutputIteratorT itr, T val, Int2Type< STORE_DEFAULT >, Int2Type< false >)

template<typename T >
__device__ __forceinline__ void	cub::ThreadStore (T *ptr, T val, Int2Type< STORE_DEFAULT >, Int2Type< true >)

template<typename T >
__device__ __forceinline__ void	cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< true >)

template<typename T >
__device__ __forceinline__ void	cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< false >)

template<typename T >
__device__ __forceinline__ void	cub::ThreadStore (T *ptr, T val, Int2Type< STORE_VOLATILE >, Int2Type< true >)

template<typename T , int MODIFIER>
__device__ __forceinline__ void	cub::ThreadStore (T *ptr, T val, Int2Type< MODIFIER >, Int2Type< true >)

#define	_CUB_LOAD_16(cub_modifier, ptx_modifier)

#define	_CUB_LOAD_8(cub_modifier, ptx_modifier)

#define	_CUB_LOAD_4(cub_modifier, ptx_modifier)

#define	_CUB_LOAD_2(cub_modifier, ptx_modifier)

#define	_CUB_LOAD_1(cub_modifier, ptx_modifier)

#define	_CUB_LOAD_ALL(cub_modifier, ptx_modifier)

#define	_CUB_STORE_16(cub_modifier, ptx_modifier)

#define	_CUB_STORE_8(cub_modifier, ptx_modifier)

#define	_CUB_STORE_4(cub_modifier, ptx_modifier)

#define	_CUB_STORE_2(cub_modifier, ptx_modifier)

#define	_CUB_STORE_1(cub_modifier, ptx_modifier)

#define	_CUB_STORE_ALL(cub_modifier, ptx_modifier)

Detailed Description

Macro Definition Documentation

◆ _CUB_LOAD_1

#define _CUB_LOAD_1	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)                   \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile (                                                                      \
        "{"                                                                                 \
        "   .reg .u8 datum;"                                                                \
        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
        "    cvt.u16.u8 %0, datum;"                                                         \
        "}" :                                                                               \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return (unsigned char) retval;                                                      \
    }

Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier

Definition at line 247 of file thread_load.cuh.

◆ _CUB_LOAD_16

#define _CUB_LOAD_16	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                   \
    {                                                                                       \
        uint4 retval;                                                                       \
        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
            "=r"(retval.x),                                                                 \
            "=r"(retval.y),                                                                 \
            "=r"(retval.z),                                                                 \
            "=r"(retval.w) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)    \
    {                                                                                       \
        ulonglong2 retval;                                                                  \
        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
            "=l"(retval.x),                                                                 \
            "=l"(retval.y) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }

Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier

Definition at line 154 of file thread_load.cuh.

◆ _CUB_LOAD_2

#define _CUB_LOAD_2	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)                \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }

Define an unsigned short (2B) ThreadLoad specialization for the given Cache load modifier

Definition at line 232 of file thread_load.cuh.

◆ _CUB_LOAD_4

#define _CUB_LOAD_4	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)                      \
    {                                                                                       \
        unsigned int retval;                                                                \
        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
            "=r"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }

Define a uint (4B) ThreadLoad specialization for the given Cache load modifier

Definition at line 217 of file thread_load.cuh.

◆ _CUB_LOAD_8

#define _CUB_LOAD_8	(	cub_modifier,
		ptx_modifier
	)

Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier

Definition at line 181 of file thread_load.cuh.

◆ _CUB_LOAD_ALL

#define _CUB_LOAD_ALL	(	cub_modifier,
		ptx_modifier
	)

Value:

    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
    _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \

Define powers-of-two ThreadLoad specializations for the given Cache load modifier

Definition at line 267 of file thread_load.cuh.

◆ _CUB_STORE_1

#define _CUB_STORE_1	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
    {                                                                                       \
        asm volatile (                                                                      \
        "{"                                                                                 \
        "   .reg .u8 datum;"                                                                \
        "   cvt.u8.u16 datum, %1;"                                                          \
        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
        "}" : :                                                                             \
            _CUB_ASM_PTR_(ptr),                                                             \
            "h"((unsigned short) val));                                                               \
    }

Define an unsigned char (1B) ThreadStore specialization for the given Cache store modifier

Definition at line 238 of file thread_store.cuh.

◆ _CUB_STORE_16

#define _CUB_STORE_16	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
            _CUB_ASM_PTR_(ptr),                                                             \
            "r"(val.x),                                                                     \
            "r"(val.y),                                                                     \
            "r"(val.z),                                                                     \
            "r"(val.w));                                                                    \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
            _CUB_ASM_PTR_(ptr),                                                             \
            "l"(val.x),                                                                     \
            "l"(val.y));                                                                    \
    }

Define a uint4 (16B) ThreadStore specialization for the given Cache store modifier

Definition at line 158 of file thread_store.cuh.

◆ _CUB_STORE_2

#define _CUB_STORE_2	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
            _CUB_ASM_PTR_(ptr),                                                             \
            "h"(val));                                                                      \
    }

Define an unsigned short (2B) ThreadStore specialization for the given Cache store modifier

Definition at line 225 of file thread_store.cuh.

◆ _CUB_STORE_4

#define _CUB_STORE_4	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
            _CUB_ASM_PTR_(ptr),                                                             \
            "r"(val));                                                                      \
    }

Define an unsigned int (4B) ThreadStore specialization for the given Cache store modifier

Definition at line 212 of file thread_store.cuh.

◆ _CUB_STORE_8

#define _CUB_STORE_8	(	cub_modifier,
		ptx_modifier
	)

Value:

    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
            _CUB_ASM_PTR_(ptr),                                                             \
            "h"(val.x),                                                                     \
            "h"(val.y),                                                                     \
            "h"(val.z),                                                                     \
            "h"(val.w));                                                                    \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
            _CUB_ASM_PTR_(ptr),                                                             \
            "r"(val.x),                                                                     \
            "r"(val.y));                                                                    \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
            _CUB_ASM_PTR_(ptr),                                                             \
            "l"(val));                                                                      \
    }

Define a uint2 (8B) ThreadStore specialization for the given Cache store modifier

Definition at line 182 of file thread_store.cuh.

◆ _CUB_STORE_ALL

#define _CUB_STORE_ALL	(	cub_modifier,
		ptx_modifier
	)

Value:

    _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
    _CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \

Define powers-of-two ThreadStore specializations for the given Cache store modifier

Definition at line 255 of file thread_store.cuh.

Typedef Documentation

◆ _TempStorage [1/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

typedef InternalLoad::TempStorage cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage

private

Shared memory storage layout type.

Definition at line 1027 of file block_load.cuh.

◆ _TempStorage [2/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

typedef InternalStore::TempStorage cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::_TempStorage

private

Shared memory storage layout type.

Definition at line 833 of file block_store.cuh.

◆ BlockExchange [1/6]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 808 of file block_load.cuh.

◆ BlockExchange [2/6]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 883 of file block_load.cuh.

◆ BlockExchange [3/6]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange

Definition at line 958 of file block_load.cuh.

◆ BlockExchange [4/6]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 636 of file block_store.cuh.

◆ BlockExchange [5/6]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::BlockExchange

Definition at line 704 of file block_store.cuh.

◆ BlockExchange [6/6]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange

Definition at line 772 of file block_store.cuh.

◆ InternalLoad

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

typedef LoadInternal<ALGORITHM, 0> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalLoad

private

Internal load implementation to use.

Definition at line 1023 of file block_load.cuh.

◆ InternalStore

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

typedef StoreInternal<ALGORITHM, 0> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::InternalStore

private

Internal store implementation to use.

Definition at line 829 of file block_store.cuh.

◆ TempStorage [1/4]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 672 of file block_load.cuh.

◆ TempStorage [2/4]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 725 of file block_load.cuh.

◆ TempStorage [3/4]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 546 of file block_store.cuh.

◆ TempStorage [4/4]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::TempStorage

Shared memory storage layout type.

Definition at line 587 of file block_store.cuh.

Enumeration Type Documentation

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

anonymous enum

private

Constants.

Enumerator
BLOCK_THREADS	The thread block size in threads.

Definition at line 523 of file block_store.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

anonymous enum

Definition at line 695 of file block_store.cuh.

◆ anonymous enum

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

anonymous enum

Definition at line 763 of file block_store.cuh.

◆ anonymous enum

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

anonymous enum

private

Constants.

Enumerator
BLOCK_THREADS	The thread block size in threads.

Definition at line 649 of file block_load.cuh.

◆ anonymous enum

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

anonymous enum

Definition at line 874 of file block_load.cuh.

◆ anonymous enum

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

anonymous enum

Definition at line 949 of file block_load.cuh.

◆ BlockLoadAlgorithm

enum cub::BlockLoadAlgorithm

cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.

Enumerator
BLOCK_LOAD_DIRECT	Overview A blocked arrangement of data is read directly from memory. Performance Considerations The utilization of memory transactions (coalescing) decreases as the access stride between threads increases (i.e., the number of items per thread).
BLOCK_LOAD_VECTORIZE	Overview A blocked arrangement of data is read from memory using CUDA's built-in vectorized loads as a coalescing optimization. For example, `ld.global.v4.s32` instructions will be generated when `T` = `int` and `ITEMS_PER_THREAD` % 4 == 0. Performance Considerations The utilization of memory transactions (coalescing) remains high until the access stride between threads (i.e., the number of items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever is lower). The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: `ITEMS_PER_THREAD` is odd; the `InputIteratorT` is not a simple pointer type; the block input offset is not quadword-aligned; the data type `T` is not a built-in primitive or CUDA vector type (e.g., `short`, `int2`, `double`, `float2`, etc.).
BLOCK_LOAD_TRANSPOSE	Overview A striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement. Performance Considerations The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
BLOCK_LOAD_WARP_TRANSPOSE	Overview A warp-striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement. Usage Considerations BLOCK_THREADS must be a multiple of WARP_THREADS Performance Considerations The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. Provisions more shared storage, but incurs smaller latencies than the BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED	Overview Like `BLOCK_LOAD_WARP_TRANSPOSE`, a warp-striped arrangement of data is read directly from memory and then is locally transposed into a blocked arrangement. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps. Usage Considerations BLOCK_THREADS must be a multiple of WARP_THREADS Performance Considerations The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.

Definition at line 473 of file block_load.cuh.

◆ BlockStoreAlgorithm

enum cub::BlockStoreAlgorithm

cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.

Enumerator
BLOCK_STORE_DIRECT	Overview A blocked arrangement of data is written directly to memory. Performance Considerations The utilization of memory transactions (coalescing) decreases as the access stride between threads increases (i.e., the number of items per thread).
BLOCK_STORE_VECTORIZE	Overview A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization. For example, `st.global.v4.s32` instructions will be generated when `T` = `int` and `ITEMS_PER_THREAD` % 4 == 0. Performance Considerations The utilization of memory transactions (coalescing) remains high until the access stride between threads (i.e., the number of items per thread) exceeds the maximum vector store width (typically 4 items or 64B, whichever is lower). The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: `ITEMS_PER_THREAD` is odd; the `OutputIteratorT` is not a simple pointer type; the block output offset is not quadword-aligned; the data type `T` is not a built-in primitive or CUDA vector type (e.g., `short`, `int2`, `double`, `float2`, etc.).
BLOCK_STORE_TRANSPOSE	Overview A blocked arrangement is locally transposed and then efficiently written to memory as a striped arrangement. Performance Considerations The utilization of memory transactions (coalescing) remains high regardless of items written per thread. The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
BLOCK_STORE_WARP_TRANSPOSE	Overview A blocked arrangement is locally transposed and then efficiently written to memory as a warp-striped arrangement. Usage Considerations BLOCK_THREADS must be a multiple of WARP_THREADS Performance Considerations The utilization of memory transactions (coalescing) remains high regardless of items written per thread. The local reordering incurs slightly longer latencies and lower throughput than the direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED	Overview A blocked arrangement is locally transposed and then efficiently written to memory as a warp-striped arrangement. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps. Usage Considerations BLOCK_THREADS must be a multiple of WARP_THREADS Performance Considerations The utilization of memory transactions (coalescing) remains high regardless of items written per thread. Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.

Definition at line 354 of file block_store.cuh.

◆ CacheLoadModifier

enum cub::CacheLoadModifier

Enumeration of cache modifiers for memory load operations.

Enumerator
LOAD_DEFAULT	Default (no modifier)
LOAD_CA	Cache at all levels.
LOAD_CG	Cache at global level.
LOAD_CS	Cache streaming (likely to be accessed once)
LOAD_CV	Cache as volatile (including cached system lines)
LOAD_LDG	Cache as texture.
LOAD_VOLATILE	Volatile (any memory space)

Definition at line 62 of file thread_load.cuh.

◆ CacheStoreModifier

enum cub::CacheStoreModifier

Enumeration of cache modifiers for memory store operations.

Enumerator
STORE_DEFAULT	Default (no modifier)
STORE_WB	Cache write-back all coherent levels.
STORE_CG	Cache at global level.
STORE_CS	Cache streaming (likely to be accessed once)
STORE_WT	Cache write-through (to system memory)
STORE_VOLATILE	Volatile shared (any memory space)

Definition at line 61 of file thread_store.cuh.

Function Documentation

◆ BlockLoad() [1/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad ( )

inline

Collective constructor using a private static allocation of shared memory as temporary storage.

Definition at line 1066 of file block_load.cuh.

◆ BlockLoad() [2/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad ( TempStorage & temp_storage )

inline

Collective constructor using the specified memory allocation as temporary storage.

Parameters

[in] temp_storage Reference to memory allocation having layout type TempStorage

Definition at line 1076 of file block_load.cuh.

◆ BlockStore() [1/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore ( )

inline

Collective constructor using a private static allocation of shared memory as temporary storage.

Definition at line 873 of file block_store.cuh.

◆ BlockStore() [2/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore ( TempStorage & temp_storage )

inline

Collective constructor using the specified memory allocation as temporary storage.

Parameters

[in] temp_storage Reference to memory allocation having layout type TempStorage

Definition at line 883 of file block_store.cuh.

◆ Dereference() [1/4]

template<int COUNT, int MAX>

template<typename InputIteratorT , typename T >

static __device__ __forceinline__ void cub::IterateThreadLoad< COUNT, MAX >::Dereference	(	InputIteratorT	itr,
		T *	vals
	)

inline static

Definition at line 131 of file thread_load.cuh.

◆ Dereference() [2/4]

template<int MAX>

template<typename InputIteratorT , typename T >

static __device__ __forceinline__ void cub::IterateThreadLoad< MAX, MAX >::Dereference	(	InputIteratorT	,
		T *
	)

inline static

Definition at line 147 of file thread_load.cuh.

◆ Dereference() [3/4]

template<int COUNT, int MAX>

template<typename OutputIteratorT , typename T >

static __device__ __forceinline__ void cub::IterateThreadStore< COUNT, MAX >::Dereference	(	OutputIteratorT	ptr,
		T *	vals
	)

inline static

Definition at line 135 of file thread_store.cuh.

◆ Dereference() [4/4]

template<int MAX>

template<typename OutputIteratorT , typename T >

static __device__ __forceinline__ void cub::IterateThreadStore< MAX, MAX >::Dereference	(	OutputIteratorT	,
		T *
	)

inline static

Definition at line 151 of file thread_store.cuh.

◆ InternalLoadDirectBlockedVectorized()

template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD>

__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized	(	int	linear_tid,
		T *	block_ptr,
		T(&)	items[ITEMS_PER_THREAD]
	)

Internal implementation for load vectorization

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_ptr	Input pointer for loading from
[out]	items	Data to load

Definition at line 162 of file block_load.cuh.

◆ Load() [1/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename _InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load	(	_InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 770 of file block_load.cuh.

◆ Load() [2/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<CacheLoadModifier MODIFIER, typename ValueType , typename OffsetT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load	(	CacheModifiedInputIterator< MODIFIER, ValueType, OffsetT >	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory, specialized for CacheModifiedInputIterator types (attempts vectorization)

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 761 of file block_load.cuh.

◆ Load() [3/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load	(	const InputT *	block_ptr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)

Parameters

[in]	block_ptr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 749 of file block_load.cuh.

◆ Load() [4/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 687 of file block_load.cuh.

◆ Load() [5/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 834 of file block_load.cuh.

◆ Load() [6/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 909 of file block_load.cuh.

◆ Load() [7/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 984 of file block_load.cuh.

◆ Load() [8/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory.

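Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
A subsequent `__syncthreads()` thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., `temp_storage`) is to be reused or repurposed.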

Snippet: The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for BLOCK_LOAD_WARP_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).

#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>

__global__ void ExampleKernel(int *d_data, ...)

{

// Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each

typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;

// Allocate shared memory for BlockLoad

__shared__ typename BlockLoad::TempStorage temp_storage;

// Load a segment of consecutive items that are blocked across threads

int thread_data[4];

BlockLoad(temp_storage).Load(d_data, thread_data);

cub::BlockLoad
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
Definition block_load.cuh:641

cub::BlockLoad::temp_storage
_TempStorage & temp_storage
Thread reference to shared storage.
Definition block_load.cuh:1047

cub::BlockLoad::BlockLoad
__device__ __forceinline__ BlockLoad()
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition block_load.cuh:1066

cub::BlockLoad::TempStorage
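The operations exposed by BlockLoad require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the `__shared__` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse.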
Definition block_load.cuh:1055

Suppose the input d_data is 0, 1, 2, 3, 4, 5, .... The set of thread_data across the block of threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 1130 of file block_load.cuh.

◆ Load() [9/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Load a linear segment of items from memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 696 of file block_load.cuh.

◆ Load() [10/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Load a linear segment of items from memory, guarded by range (skips vectorization)

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 779 of file block_load.cuh.

◆ Load() [11/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Load a linear segment of items from memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 844 of file block_load.cuh.

◆ Load() [12/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Load a linear segment of items from memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 919 of file block_load.cuh.

◆ Load() [13/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Load a linear segment of items from memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 994 of file block_load.cuh.

◆ Load() [14/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Load a linear segment of items from memory, guarded by range.

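Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
A subsequent `__syncthreads()` thread block barrier should be invoked after calling this method if the collective's temporary storage (e.g., `temp_storage`) is to be reused or repurposed.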

Snippet: The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for BLOCK_LOAD_WARP_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).

#include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>

__global__ void ExampleKernel(int *d_data, int valid_items, ...)

{

// Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each

typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;

// Allocate shared memory for BlockLoad

__shared__ typename BlockLoad::TempStorage temp_storage;

// Load a segment of consecutive items that are blocked across threads

int thread_data[4];

BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);

Suppose the input d_data is 0, 1, 2, 3, 4, 5, 6... and valid_items is 5. The set of thread_data across the block of threads will be { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads being unmasked to load portions of valid data (and other items remaining unassigned).

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 1176 of file block_load.cuh.

◆ Load() [15/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT , typename DefaultT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 706 of file block_load.cuh.

◆ Load() [16/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT , typename DefaultT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 789 of file block_load.cuh.

◆ Load() [17/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT , typename DefaultT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 855 of file block_load.cuh.

◆ Load() [18/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT , typename DefaultT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 931 of file block_load.cuh.

◆ Load() [19/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT , typename DefaultT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 1006 of file block_load.cuh.

◆ Load() [20/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<typename InputIteratorT , typename DefaultT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load	(	InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

inline

Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.

\blocked
\smemreuse

Snippet: The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked" arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for BLOCK_LOAD_WARP_TRANSPOSE, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads).

: #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>

__global__ void ExampleKernel(int *d_data, int valid_items, ...)

{

// Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each

typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;

// Allocate shared memory for BlockLoad

__shared__ typename BlockLoad::TempStorage temp_storage;

// Load a segment of consecutive items that are blocked across threads

int thread_data[4];

BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);

Suppose the input d_data is 0, 1, 2, 3, 4, 5, 6..., valid_items is 5, and the out-of-bounds default is -1. The set of thread_data across the block of threads will be { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads being unmasked to load portions of valid data (all other items are assigned -1).

Parameters

[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 1224 of file block_load.cuh.

◆ Load() [21/23]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename InputIteratorT >

__device__ __forceinline__ void cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load	(	InputT *	block_ptr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

inline

Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)

Parameters

[in]	block_ptr	The thread block's base input pointer for loading from
[out]	items	Data to load

Definition at line 740 of file block_load.cuh.

◆ Load() [22/23]

template<int MAX>

template<CacheLoadModifier MODIFIER, typename T >

static __device__ __forceinline__ void cub::IterateThreadLoad< MAX, MAX >::Load	(	T const *	,
		T *
	)

inlinestatic

Definition at line 144 of file thread_load.cuh.

◆ Load() [23/23]

template<int COUNT, int MAX>

template<CacheLoadModifier MODIFIER, typename T >

static __device__ __forceinline__ void cub::IterateThreadLoad< COUNT, MAX >::Load	(	T const *	ptr,
		T *	vals
	)

inlinestatic

Definition at line 124 of file thread_load.cuh.

◆ LoadDirectBlocked() [1/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectBlocked	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

Load a linear segment of items into a blocked arrangement across the thread block.

\blocked

Template Parameters

InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 76 of file block_load.cuh.

◆ LoadDirectBlocked() [2/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectBlocked	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.

\blocked

Template Parameters

InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 105 of file block_load.cuh.

◆ LoadDirectBlocked() [3/3]

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectBlocked	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

\blocked

Template Parameters

InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 138 of file block_load.cuh.

◆ LoadDirectBlockedVectorized()

template<typename T , int ITEMS_PER_THREAD>

__device__ __forceinline__ void cub::LoadDirectBlockedVectorized	(	int	linear_tid,
		T *	block_ptr,
		T(&)	items[ITEMS_PER_THREAD]
	)

Load a linear segment of items into a blocked arrangement across the thread block.

\blocked

The input offset (block_ptr + block_offset) must be quad-item aligned

The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:

ITEMS_PER_THREAD is odd
The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)

Template Parameters

T	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_ptr	Input pointer for loading from
[out]	items	Data to load

Definition at line 227 of file block_load.cuh.

◆ LoadDirectStriped() [1/3]

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectStriped	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

Load a linear segment of items into a striped arrangement across the thread block.

\striped

Template Parameters

BLOCK_THREADS	The thread block size in threads
InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 258 of file block_load.cuh.

◆ LoadDirectStriped() [2/3]

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectStriped	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range.

\striped

Template Parameters

BLOCK_THREADS	The thread block size in threads
InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 288 of file block_load.cuh.

◆ LoadDirectStriped() [3/3]

template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectStriped	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

\striped

Template Parameters

BLOCK_THREADS	The thread block size in threads
InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 323 of file block_load.cuh.

◆ LoadDirectWarpStriped() [1/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectWarpStriped	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD]
	)

Load a linear segment of items into a warp-striped arrangement across the thread block.

\warpstriped

Usage Considerations: The number of threads in the thread block must be a multiple of the architecture's warp size.

Template Parameters

InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load

Definition at line 362 of file block_load.cuh.

◆ LoadDirectWarpStriped() [2/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectWarpStriped	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.

\warpstriped

Usage Considerations: The number of threads in the thread block must be a multiple of the architecture's warp size.

Template Parameters

InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load

Definition at line 398 of file block_load.cuh.

◆ LoadDirectWarpStriped() [3/3]

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >

__device__ __forceinline__ void cub::LoadDirectWarpStriped	(	int	linear_tid,
		InputIteratorT	block_itr,
		InputT(&)	items[ITEMS_PER_THREAD],
		int	valid_items,
		DefaultT	oob_default
	)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

\warpstriped

Usage Considerations: The number of threads in the thread block must be a multiple of the architecture's warp size.

Template Parameters

InputT	[inferred] The data type to load.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT	[inferred] The random-access iterator type for input \iterator.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base input iterator for loading from
[out]	items	Data to load
[in]	valid_items	Number of valid items to load
[in]	oob_default	Default value to assign out-of-bound items

Definition at line 439 of file block_load.cuh.

◆ LoadInternal() [1/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::LoadInternal	(	TempStorage &	,
		int	linear_tid
	)

inline

Constructor.

Definition at line 678 of file block_load.cuh.

◆ LoadInternal() [2/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::LoadInternal	(	TempStorage &	,
		int	linear_tid
	)

inline

Constructor.

Definition at line 731 of file block_load.cuh.

◆ LoadInternal() [3/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::LoadInternal	(	TempStorage &	temp_storage,
		int	linear_tid
	)

inline

Constructor.

Definition at line 824 of file block_load.cuh.

◆ LoadInternal() [4/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::LoadInternal	(	TempStorage &	temp_storage,
		int	linear_tid
	)

inline

Constructor.

Definition at line 899 of file block_load.cuh.

◆ LoadInternal() [5/5]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::LoadInternal	(	TempStorage &	temp_storage,
		int	linear_tid
	)

inline

Constructor.

Definition at line 974 of file block_load.cuh.

◆ PrivateStorage() [1/2]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ _TempStorage & cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ( )

inlineprivate

Internal storage allocator.

Definition at line 1035 of file block_load.cuh.

◆ PrivateStorage() [2/2]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

__device__ __forceinline__ _TempStorage & cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage ( )

inlineprivate

Internal storage allocator.

Definition at line 841 of file block_store.cuh.

◆ Store() [1/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

inline

Store items into a linear segment of memory.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 561 of file block_store.cuh.

◆ Store() [2/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

inline

Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 610 of file block_store.cuh.

◆ Store() [3/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

inline

Store items into a linear segment of memory.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 665 of file block_store.cuh.

◆ Store() [4/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

inline

Store items into a linear segment of memory.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 733 of file block_store.cuh.

◆ Store() [5/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

inline

Store items into a linear segment of memory.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 801 of file block_store.cuh.

◆ Store() [6/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

inline

Store items into a linear segment of memory.

\blocked
\smemreuse

Snippet: The code snippet below illustrates the storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for BLOCK_STORE_WARP_TRANSPOSE, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.

: #include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>

__global__ void ExampleKernel(int *d_data, ...)

{

// Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each

typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;

// Allocate shared memory for BlockStore

__shared__ typename BlockStore::TempStorage temp_storage;

// Obtain a segment of consecutive items that are blocked across threads

int thread_data[4];

...

// Store items to linear memory

int thread_data[4];

BlockStore(temp_storage).Store(d_data, thread_data);

cub::BlockStore
The BlockStore class provides collective data movement methods for writing a blocked arrangement of i...
Definition block_store.cuh:516

cub::BlockStore::temp_storage
_TempStorage & temp_storage
Thread reference to shared storage.
Definition block_store.cuh:853

cub::BlockStore::BlockStore
__device__ __forceinline__ BlockStore()
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition block_store.cuh:873

cub::BlockStore::TempStorage
\smemstorage{BlockStore}
Definition block_store.cuh:862

Suppose the set of thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. The output d_data will be 0, 1, 2, 3, 4, 5, ....

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 939 of file block_store.cuh.

◆ Store() [7/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Store items into a linear segment of memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 570 of file block_store.cuh.

◆ Store() [8/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Store items into a linear segment of memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 619 of file block_store.cuh.

◆ Store() [9/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Store items into a linear segment of memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 675 of file block_store.cuh.

◆ Store() [10/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Store items into a linear segment of memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 743 of file block_store.cuh.

◆ Store() [11/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Store items into a linear segment of memory, guarded by range.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 811 of file block_store.cuh.

◆ Store() [12/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<typename OutputIteratorT >

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store	(	OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

inline

Store items into a linear segment of memory, guarded by range.

\blocked
\smemreuse

Snippet: The code snippet below illustrates the guarded storing of a "blocked" arrangement of 512 integers across 128 threads (where each thread owns 4 consecutive items) into a linear segment of memory. The store is specialized for BLOCK_STORE_WARP_TRANSPOSE, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern.

: #include <cub/cub.cuh> // or equivalently <cub/block/block_store.cuh>

__global__ void ExampleKernel(int *d_data, int valid_items, ...)

{

// Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each

typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;

// Allocate shared memory for BlockStore

__shared__ typename BlockStore::TempStorage temp_storage;

// Obtain a segment of consecutive items that are blocked across threads

int thread_data[4];

...

// Store items to linear memory

int thread_data[4];

BlockStore(temp_storage).Store(d_data, thread_data, valid_items);

Suppose the set of thread_data across the block of threads is { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and valid_items is 5. The output d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with only the first two threads being unmasked to store portions of valid data.

Parameters

[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 988 of file block_store.cuh.

◆ Store() [13/15]

template<int MAX>

template<CacheStoreModifier MODIFIER, typename T >

static __device__ __forceinline__ void cub::IterateThreadStore< MAX, MAX >::Store	(	T *	,
		T *
	)

inlinestatic

Definition at line 148 of file thread_store.cuh.

◆ Store() [14/15]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ void cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store	(	T *	block_ptr,
		T(&)	items[ITEMS_PER_THREAD]
	)

inline

Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)

Parameters

[in]	block_ptr	The thread block's base output pointer for storing to
[in]	items	Data to store

Definition at line 601 of file block_store.cuh.

◆ Store() [15/15]

template<int COUNT, int MAX>

template<CacheStoreModifier MODIFIER, typename T >

static __device__ __forceinline__ void cub::IterateThreadStore< COUNT, MAX >::Store	(	T *	ptr,
		T *	vals
	)

inlinestatic

Definition at line 128 of file thread_store.cuh.

◆ StoreDirectBlocked() [1/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >

__device__ __forceinline__ void cub::StoreDirectBlocked	(	int	linear_tid,
		OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

Store a blocked arrangement of items across a thread block into a linear segment of items.

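Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.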

Template Parameters

T	[inferred] The data type to store.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT	[inferred] The random-access iterator type for output (may be a simple pointer type).

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 74 of file block_store.cuh.

◆ StoreDirectBlocked() [2/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >

__device__ __forceinline__ void cub::StoreDirectBlocked	(	int	linear_tid,
		OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.

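Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.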

Template Parameters

T	[inferred] The data type to store.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT	[inferred] The random-access iterator type for output (may be a simple pointer type).

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 103 of file block_store.cuh.

◆ StoreDirectBlockedVectorized()

template<typename T , int ITEMS_PER_THREAD>

__device__ __forceinline__ void cub::StoreDirectBlockedVectorized	(	int	linear_tid,
		T *	block_ptr,
		T(&)	items[ITEMS_PER_THREAD]
	)

Store a blocked arrangement of items across a thread block into a linear segment of items.

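Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.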

The output offset (block_ptr + block_offset) must be quad-item aligned, which is the default starting offset returned by cudaMalloc()

The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:

ITEMS_PER_THREAD is odd
The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)

Template Parameters

T	[inferred] The data type to store.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_ptr	Output pointer for storing to
[in]	items	Data to store

Definition at line 143 of file block_store.cuh.

◆ StoreDirectStriped() [1/2]

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >

__device__ __forceinline__ void cub::StoreDirectStriped	(	int	linear_tid,
		OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

Store a striped arrangement of data across the thread block into a linear segment of items.

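Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.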

Template Parameters

BLOCK_THREADS	The thread block size in threads
T	[inferred] The data type to store.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT	[inferred] The random-access iterator type for output (may be a simple pointer type).

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 206 of file block_store.cuh.

◆ StoreDirectStriped() [2/2]

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >

__device__ __forceinline__ void cub::StoreDirectStriped	(	int	linear_tid,
		OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.

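Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.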

Template Parameters

BLOCK_THREADS	The thread block size in threads
T	[inferred] The data type to store.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT	[inferred] The random-access iterator type for output (may be a simple pointer type).

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 237 of file block_store.cuh.

◆ StoreDirectWarpStriped() [1/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >

__device__ __forceinline__ void cub::StoreDirectWarpStriped	(	int	linear_tid,
		OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD]
	)

Store a warp-striped arrangement of data across the thread block into a linear segment of items.

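Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))).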

Usage Considerations: The number of threads in the thread block must be a multiple of the architecture's warp size.

Template Parameters

T	[inferred] The data type to store.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT	[inferred] The random-access iterator type for output (may be a simple pointer type).

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store

Definition at line 281 of file block_store.cuh.

◆ StoreDirectWarpStriped() [2/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >

__device__ __forceinline__ void cub::StoreDirectWarpStriped	(	int	linear_tid,
		OutputIteratorT	block_itr,
		T(&)	items[ITEMS_PER_THREAD],
		int	valid_items
	)

Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.

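Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))).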

Usage Considerations: The number of threads in the thread block must be a multiple of the architecture's warp size.

Template Parameters

T	[inferred] The data type to store.
ITEMS_PER_THREAD	[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT	[inferred] The random-access iterator type for output (may be a simple pointer type).

Parameters

[in]	linear_tid	A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + threadIdx.x` for 2D thread blocks)
[in]	block_itr	The thread block's base output iterator for storing to
[in]	items	Data to store
[in]	valid_items	Number of valid items to write

Definition at line 317 of file block_store.cuh.

◆ StoreInternal() [1/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::StoreInternal	(	TempStorage &	,
		int	linear_tid
	)

inline

Constructor.

Definition at line 552 of file block_store.cuh.

◆ StoreInternal() [2/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::StoreInternal	(	TempStorage &	,
		int	linear_tid
	)

inline

Constructor.

Definition at line 593 of file block_store.cuh.

◆ StoreInternal() [3/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::StoreInternal	(	TempStorage &	temp_storage,
		int	linear_tid
	)

inline

Constructor.

Definition at line 655 of file block_store.cuh.

◆ StoreInternal() [4/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::StoreInternal	(	TempStorage &	temp_storage,
		int	linear_tid
	)

inline

Constructor.

Definition at line 723 of file block_store.cuh.

◆ StoreInternal() [5/5]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

__device__ __forceinline__ cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::StoreInternal	(	TempStorage &	temp_storage,
		int	linear_tid
	)

inline

Constructor.

Definition at line 791 of file block_store.cuh.

◆ ThreadLoad() [1/5]

template<CacheLoadModifier MODIFIER, typename InputIteratorT >

__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad ( InputIteratorT itr )

Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.

Example: #include <cub/cub.cuh> // or equivalently <cub/thread/thread_load.cuh>

// 32-bit load using cache-at-all-levels modifier:

int *d_in;

int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);

// 16-bit load using default modifier

short *d_in;

short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);

// 256-bit load using cache-volatile modifier

double4 *d_in;

double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);

// 32-bit load of a user-defined struct using cache-streaming modifier

struct TestFoo { bool a; short b; };

TestFoo *d_struct;

TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);

Template Parameters

MODIFIER	[inferred] CacheLoadModifier enumeration
InputIteratorT	[inferred] Input iterator type (may be a simple pointer type)

ThreadLoad definition for generic modifiers

Definition at line 420 of file thread_load.cuh.

◆ ThreadLoad() [2/5]

template<typename InputIteratorT >

__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad	(	InputIteratorT	itr,
		Int2Type< LOAD_DEFAULT >	,
		Int2Type< false >
	)

Define powers-of-two ThreadLoad specializations for the various cache load modifiers. ThreadLoad definition for LOAD_DEFAULT modifier on iterator types.

Definition at line 312 of file thread_load.cuh.

◆ ThreadLoad() [3/5]

template<typename T >

__device__ __forceinline__ T cub::ThreadLoad	(	T *	ptr,
		Int2Type< LOAD_DEFAULT >	,
		Int2Type< true >
	)

ThreadLoad definition for LOAD_DEFAULT modifier on pointer types

Definition at line 325 of file thread_load.cuh.

◆ ThreadLoad() [4/5]

template<typename T >

__device__ __forceinline__ T cub::ThreadLoad	(	T *	ptr,
		Int2Type< LOAD_VOLATILE >	,
		Int2Type< true >
	)

ThreadLoad definition for LOAD_VOLATILE modifier on pointer types

Definition at line 381 of file thread_load.cuh.

◆ ThreadLoad() [5/5]

template<typename T , int MODIFIER>

__device__ __forceinline__ T cub::ThreadLoad	(	T const *	ptr,
		Int2Type< MODIFIER >	,
		Int2Type< true >
	)

ThreadLoad definition for generic modifiers on pointer types

Definition at line 395 of file thread_load.cuh.

◆ ThreadLoadVolatilePointer() [1/2]

template<typename T >

__device__ __forceinline__ T cub::ThreadLoadVolatilePointer	(	T *	ptr,
		Int2Type< false >
	)

ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types

Definition at line 351 of file thread_load.cuh.

◆ ThreadLoadVolatilePointer() [2/2]

template<typename T >

__device__ __forceinline__ T cub::ThreadLoadVolatilePointer	(	T *	ptr,
		Int2Type< true >
	)

ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types

Definition at line 338 of file thread_load.cuh.

◆ ThreadStore() [1/5]

template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >

__device__ __forceinline__ void cub::ThreadStore	(	OutputIteratorT	itr,
		T	val
	)

Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.

Example: #include <cub/cub.cuh> // or equivalently <cub/thread/thread_store.cuh>

// 32-bit store using cache-global modifier:

int *d_out;

int val;

cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);

// 16-bit store using default modifier

short *d_out;

short val;

cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);

// 256-bit store using write-through modifier

double4 *d_out;

double4 val;

cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);

// 32-bit store of a user-defined struct using cache-streaming modifier

struct TestFoo { bool a; short b; };

TestFoo *d_struct;

TestFoo val;

cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);

cub::d_out
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Definition dispatch_reduce.cuh:71

Template Parameters

MODIFIER	[inferred] CacheStoreModifier enumeration
OutputIteratorT	[inferred] Output iterator type (may be a simple pointer type)
T	[inferred] Data type of output value

ThreadStore definition for generic modifiers

Definition at line 404 of file thread_store.cuh.

◆ ThreadStore() [2/5]

template<typename OutputIteratorT , typename T >

__device__ __forceinline__ void cub::ThreadStore	(	OutputIteratorT	itr,
		T	val,
		Int2Type< STORE_DEFAULT >	,
		Int2Type< false >
	)

Define ThreadStore specializations for the various cache store modifiers. ThreadStore definition for STORE_DEFAULT modifier on iterator types.

Definition at line 292 of file thread_store.cuh.

◆ ThreadStore() [3/5]

template<typename T , int MODIFIER>

__device__ __forceinline__ void cub::ThreadStore	(	T *	ptr,
		T	val,
		Int2Type< MODIFIER >	,
		Int2Type< true >
	)

ThreadStore definition for generic modifiers on pointer types

Definition at line 375 of file thread_store.cuh.

◆ ThreadStore() [4/5]

template<typename T >

__device__ __forceinline__ void cub::ThreadStore	(	T *	ptr,
		T	val,
		Int2Type< STORE_DEFAULT >	,
		Int2Type< true >
	)

ThreadStore definition for STORE_DEFAULT modifier on pointer types

Definition at line 306 of file thread_store.cuh.

◆ ThreadStore() [5/5]

template<typename T >

__device__ __forceinline__ void cub::ThreadStore	(	T *	ptr,
		T	val,
		Int2Type< STORE_VOLATILE >	,
		Int2Type< true >
	)

ThreadStore definition for STORE_VOLATILE modifier on pointer types

Definition at line 361 of file thread_store.cuh.

◆ ThreadStoreVolatilePtr() [1/2]

template<typename T >

__device__ __forceinline__ void cub::ThreadStoreVolatilePtr	(	T *	ptr,
		T	val,
		Int2Type< false >
	)

ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types

Definition at line 333 of file thread_store.cuh.

◆ ThreadStoreVolatilePtr() [2/2]

template<typename T >

__device__ __forceinline__ void cub::ThreadStoreVolatilePtr	(	T *	ptr,
		T	val,
		Int2Type< true >
	)

ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types

Definition at line 320 of file thread_store.cuh.

Variable Documentation

◆ linear_tid [1/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::linear_tid

Linear thread-id.

Definition at line 675 of file block_load.cuh.

◆ linear_tid [2/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 728 of file block_load.cuh.

◆ linear_tid [3/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 821 of file block_load.cuh.

◆ linear_tid [4/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 896 of file block_load.cuh.

◆ linear_tid [5/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid

Linear thread-id.

Definition at line 971 of file block_load.cuh.

◆ linear_tid [6/12]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid

private

Linear thread-id.

Definition at line 1050 of file block_load.cuh.

◆ linear_tid [7/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::linear_tid

Linear thread-id.

Definition at line 549 of file block_store.cuh.

◆ linear_tid [8/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 590 of file block_store.cuh.

◆ linear_tid [9/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 652 of file block_store.cuh.

◆ linear_tid [10/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::linear_tid

Linear thread-id.

Definition at line 720 of file block_store.cuh.

◆ linear_tid [11/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid

Linear thread-id.

Definition at line 788 of file block_store.cuh.

◆ linear_tid [12/12]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::linear_tid

private

Linear thread-id.

Definition at line 856 of file block_store.cuh.

◆ temp_storage [1/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 818 of file block_load.cuh.

◆ temp_storage [2/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 893 of file block_load.cuh.

◆ temp_storage [3/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 968 of file block_load.cuh.

◆ temp_storage [4/8]

template<typename InputT , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage

private

Thread reference to shared storage.

Definition at line 1047 of file block_load.cuh.

◆ temp_storage [5/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 649 of file block_store.cuh.

◆ temp_storage [6/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 717 of file block_store.cuh.

◆ temp_storage [7/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage

Thread reference to shared storage.

Definition at line 785 of file block_store.cuh.

◆ temp_storage [8/8]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::temp_storage

private

Thread reference to shared storage.

Definition at line 853 of file block_store.cuh.

◆ valid_items [1/3]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::_TempStorage::valid_items

Temporary storage for partially-full block guard.

Definition at line 642 of file block_store.cuh.

◆ valid_items [2/3]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::_TempStorage::valid_items

Temporary storage for partially-full block guard.

Definition at line 710 of file block_store.cuh.

◆ valid_items [3/3]

template<typename T , int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH>

template<int DUMMY>

volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage::valid_items

Temporary storage for partially-full block guard.

Definition at line 778 of file block_store.cuh.

Data Structures

Typedefs

Enumerations

Functions

Variables

Blocked arrangement I/O (direct)

Striped arrangement I/O (direct)

Warp-striped arrangement I/O (direct)

Collective constructors

Data movement

Collective constructors

Data movement

Thread I/O (cache modified)

Detailed Description

Macro Definition Documentation

◆ _CUB_LOAD_1

◆ _CUB_LOAD_16

◆ _CUB_LOAD_2

◆ _CUB_LOAD_4

◆ _CUB_LOAD_8

◆ _CUB_LOAD_ALL

◆ _CUB_STORE_1

◆ _CUB_STORE_16

◆ _CUB_STORE_2

◆ _CUB_STORE_4

◆ _CUB_STORE_8

◆ _CUB_STORE_ALL

Typedef Documentation

◆ _TempStorage [1/2]

◆ _TempStorage [2/2]

◆ BlockExchange [1/6]

◆ BlockExchange [2/6]

◆ BlockExchange [3/6]

◆ BlockExchange [4/6]

◆ BlockExchange [5/6]

◆ BlockExchange [6/6]

◆ InternalLoad

◆ InternalStore

◆ TempStorage [1/4]

◆ TempStorage [2/4]

◆ TempStorage [3/4]

◆ TempStorage [4/4]

Enumeration Type Documentation

◆ anonymous enum

◆ anonymous enum

◆ anonymous enum

◆ anonymous enum

◆ anonymous enum

◆ anonymous enum

◆ BlockLoadAlgorithm

◆ BlockStoreAlgorithm

◆ CacheLoadModifier

◆ CacheStoreModifier

Function Documentation

◆ BlockLoad() [1/2]

◆ BlockLoad() [2/2]

◆ BlockStore() [1/2]

◆ BlockStore() [2/2]

◆ Dereference() [1/4]

◆ Dereference() [2/4]

◆ Dereference() [3/4]

◆ Dereference() [4/4]

◆ InternalLoadDirectBlockedVectorized()

◆ Load() [1/23]

◆ Load() [2/23]

◆ Load() [3/23]

◆ Load() [4/23]

◆ Load() [5/23]

◆ Load() [6/23]

◆ Load() [7/23]

◆ Load() [8/23]

◆ Load() [9/23]

◆ Load() [10/23]

◆ Load() [11/23]

◆ Load() [12/23]

◆ Load() [13/23]

◆ Load() [14/23]

◆ Load() [15/23]

◆ Load() [16/23]

◆ Load() [17/23]