Functions | |
__device__ __forceinline__ | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::LoadInternal (TempStorage &, int linear_tid) |
Constructor. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items from memory, guarded by range. | |
template<typename InputIteratorT , typename DefaultT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements. | |
__device__ __forceinline__ | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::LoadInternal (TempStorage &, int linear_tid) |
Constructor. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (const InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) | |
template<CacheLoadModifier MODIFIER, typename ValueType , typename OffsetT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (CacheModifiedInputIterator< MODIFIER, ValueType, OffsetT > block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) | |
template<typename _InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (_InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items from memory, guarded by range (skips vectorization) | |
template<typename InputIteratorT , typename DefaultT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) | |
__device__ __forceinline__ | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid) |
Constructor. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items from memory, guarded by range. | |
template<typename InputIteratorT , typename DefaultT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements. | |
cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS") | |
__device__ __forceinline__ | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid) |
Constructor. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items from memory, guarded by range. | |
template<typename InputIteratorT , typename DefaultT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements. | |
cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS") | |
__device__ __forceinline__ | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::LoadInternal (TempStorage &temp_storage, int linear_tid) |
Constructor. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items from memory, guarded by range. | |
template<typename InputIteratorT , typename DefaultT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements. | |
__device__ __forceinline__ _TempStorage & | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage () |
Internal storage allocator. | |
__device__ __forceinline__ | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::StoreInternal (TempStorage &, int linear_tid) |
Constructor. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store items into a linear segment of memory. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store items into a linear segment of memory, guarded by range. | |
__device__ __forceinline__ | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::StoreInternal (TempStorage &, int linear_tid) |
Constructor. | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization) | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store items into a linear segment of memory, guarded by range. | |
__device__ __forceinline__ | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid) |
Constructor. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store items into a linear segment of memory. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store items into a linear segment of memory, guarded by range. | |
cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS") | |
__device__ __forceinline__ | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid) |
Constructor. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store items into a linear segment of memory. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store items into a linear segment of memory, guarded by range. | |
cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::CUB_STATIC_ASSERT ((BLOCK_THREADS % WARP_THREADS==0), "BLOCK_THREADS must be a multiple of WARP_THREADS") | |
__device__ __forceinline__ | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::StoreInternal (TempStorage &temp_storage, int linear_tid) |
Constructor. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store items into a linear segment of memory. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store items into a linear segment of memory, guarded by range. | |
__device__ __forceinline__ _TempStorage & | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::PrivateStorage () |
Internal storage allocator. | |
template<CacheLoadModifier MODIFIER, typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadLoad< COUNT, MAX >::Load (T const *ptr, T *vals) |
template<typename InputIteratorT , typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadLoad< COUNT, MAX >::Dereference (InputIteratorT itr, T *vals) |
template<CacheLoadModifier MODIFIER, typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadLoad< MAX, MAX >::Load (T const *, T *) |
template<typename InputIteratorT , typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadLoad< MAX, MAX >::Dereference (InputIteratorT, T *) |
template<CacheStoreModifier MODIFIER, typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadStore< COUNT, MAX >::Store (T *ptr, T *vals) |
template<typename OutputIteratorT , typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadStore< COUNT, MAX >::Dereference (OutputIteratorT ptr, T *vals) |
template<CacheStoreModifier MODIFIER, typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadStore< MAX, MAX >::Store (T *, T *) |
template<typename OutputIteratorT , typename T > | |
static __device__ __forceinline__ void | cub::IterateThreadStore< MAX, MAX >::Dereference (OutputIteratorT, T *) |
Blocked arrangement I/O (direct) | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a blocked arrangement across the thread block. | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. | |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. | |
template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::InternalLoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
template<typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a blocked arrangement across the thread block. | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store a blocked arrangement of items across a thread block into a linear segment of items. | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range. | |
template<typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::StoreDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
Store a blocked arrangement of items across a thread block into a linear segment of items. | |
Striped arrangement I/O (direct) | |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a striped arrangement across the thread block. | |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range. | |
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. | |
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store a striped arrangement of data across the thread block into a linear segment of items. | |
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range. | |
Warp-striped arrangement I/O (direct) | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a warp-striped arrangement across the thread block. | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range. | |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items. | |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range. | |
Collective constructors | |
__device__ __forceinline__ | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad () |
Collective constructor using a private static allocation of shared memory as temporary storage. | |
__device__ __forceinline__ | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockLoad (TempStorage &temp_storage) |
Collective constructor using the specified memory allocation as temporary storage. | |
Data movement | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items from memory. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items from memory, guarded by range. | |
template<typename InputIteratorT , typename DefaultT > | |
__device__ __forceinline__ void | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Load (InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements. | |
Collective constructors | |
__device__ __forceinline__ | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore () |
Collective constructor using a private static allocation of shared memory as temporary storage. | |
__device__ __forceinline__ | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::BlockStore (TempStorage &temp_storage) |
Collective constructor using the specified memory allocation as temporary storage. | |
Data movement | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
Store items into a linear segment of memory. | |
template<typename OutputIteratorT > | |
__device__ __forceinline__ void | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::Store (OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
Store items into a linear segment of memory, guarded by range. | |
Thread I/O (cache modified) | |
template<CacheLoadModifier MODIFIER, typename InputIteratorT > | |
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type | cub::ThreadLoad (InputIteratorT itr) |
Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. | |
template<typename InputIteratorT > | |
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type | cub::ThreadLoad (InputIteratorT itr, Int2Type< LOAD_DEFAULT >, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoad (T *ptr, Int2Type< LOAD_DEFAULT >, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ T | cub::ThreadLoad (T *ptr, Int2Type< LOAD_VOLATILE >, Int2Type< true >) |
template<typename T , int MODIFIER> | |
__device__ __forceinline__ T | cub::ThreadLoad (T const *ptr, Int2Type< MODIFIER >, Int2Type< true >) |
template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (OutputIteratorT itr, T val) |
Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. | |
template<typename OutputIteratorT , typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (OutputIteratorT itr, T val, Int2Type< STORE_DEFAULT >, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (T *ptr, T val, Int2Type< STORE_DEFAULT >, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< true >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< false >) |
template<typename T > | |
__device__ __forceinline__ void | cub::ThreadStore (T *ptr, T val, Int2Type< STORE_VOLATILE >, Int2Type< true >) |
template<typename T , int MODIFIER> | |
__device__ __forceinline__ void | cub::ThreadStore (T *ptr, T val, Int2Type< MODIFIER >, Int2Type< true >) |
#define | _CUB_LOAD_16(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_8(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_4(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_2(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_1(cub_modifier, ptx_modifier) |
#define | _CUB_LOAD_ALL(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_16(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_8(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_4(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_2(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_1(cub_modifier, ptx_modifier) |
#define | _CUB_STORE_ALL(cub_modifier, ptx_modifier) |
#define _CUB_LOAD_1 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
Definition at line 247 of file thread_load.cuh.
#define _CUB_LOAD_16 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
Definition at line 154 of file thread_load.cuh.
#define _CUB_LOAD_2 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
Definition at line 232 of file thread_load.cuh.
#define _CUB_LOAD_4 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned int (4B) ThreadLoad specialization for the given Cache load modifier
Definition at line 217 of file thread_load.cuh.
#define _CUB_LOAD_8 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
Definition at line 181 of file thread_load.cuh.
#define _CUB_LOAD_ALL | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define powers-of-two ThreadLoad specializations for the given Cache load modifier
Definition at line 267 of file thread_load.cuh.
#define _CUB_STORE_1 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned char (1B) ThreadStore specialization for the given Cache store modifier
Definition at line 238 of file thread_store.cuh.
#define _CUB_STORE_16 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint4 (16B) ThreadStore specialization for the given Cache store modifier
Definition at line 158 of file thread_store.cuh.
#define _CUB_STORE_2 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned short (2B) ThreadStore specialization for the given Cache store modifier
Definition at line 225 of file thread_store.cuh.
#define _CUB_STORE_4 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define an unsigned int (4B) ThreadStore specialization for the given Cache store modifier
Definition at line 212 of file thread_store.cuh.
#define _CUB_STORE_8 | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define a uint2 (8B) ThreadStore specialization for the given Cache store modifier
Definition at line 182 of file thread_store.cuh.
#define _CUB_STORE_ALL | ( | cub_modifier, | |
ptx_modifier | |||
) |
Define powers-of-two ThreadStore specializations for the given Cache store modifier
Definition at line 255 of file thread_store.cuh.
|
private |
Shared memory storage layout type.
Definition at line 1027 of file block_load.cuh.
|
private |
Shared memory storage layout type.
Definition at line 833 of file block_store.cuh.
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::BlockExchange |
Definition at line 808 of file block_load.cuh.
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::BlockExchange |
Definition at line 883 of file block_load.cuh.
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange |
Definition at line 958 of file block_load.cuh.
typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::BlockExchange |
Definition at line 636 of file block_store.cuh.
typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::BlockExchange |
Definition at line 704 of file block_store.cuh.
typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::BlockExchange |
Definition at line 772 of file block_store.cuh.
|
private |
Internal load implementation to use.
Definition at line 1023 of file block_load.cuh.
|
private |
Internal load implementation to use.
Definition at line 829 of file block_store.cuh.
typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::TempStorage |
Shared memory storage layout type.
Definition at line 672 of file block_load.cuh.
typedef NullType cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::TempStorage |
Shared memory storage layout type.
Definition at line 725 of file block_load.cuh.
typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::TempStorage |
Shared memory storage layout type.
Definition at line 546 of file block_store.cuh.
typedef NullType cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::TempStorage |
Shared memory storage layout type.
Definition at line 587 of file block_store.cuh.
|
private |
Constants.
Enumerator | |
---|---|
BLOCK_THREADS | The thread block size in threads. |
Definition at line 523 of file block_store.cuh.
anonymous enum |
Definition at line 695 of file block_store.cuh.
anonymous enum |
Definition at line 763 of file block_store.cuh.
|
private |
Constants.
Enumerator | |
---|---|
BLOCK_THREADS | The thread block size in threads. |
Definition at line 649 of file block_load.cuh.
anonymous enum |
Definition at line 874 of file block_load.cuh.
anonymous enum |
Definition at line 949 of file block_load.cuh.
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
Enumerator | |
---|---|
BLOCK_LOAD_DIRECT |
A blocked arrangement of data is read directly from memory.
|
BLOCK_LOAD_VECTORIZE |
A blocked arrangement of data is read from memory using CUDA's built-in vectorized loads as a coalescing optimization. For example,
|
BLOCK_LOAD_TRANSPOSE |
A striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement.
|
BLOCK_LOAD_WARP_TRANSPOSE |
A warp-striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement.
|
BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED |
Like
|
Definition at line 473 of file block_load.cuh.
cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
Enumerator | |
---|---|
BLOCK_STORE_DIRECT |
A blocked arrangement of data is written directly to memory.
|
BLOCK_STORE_VECTORIZE |
A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization. For example,
|
BLOCK_STORE_TRANSPOSE |
|
BLOCK_STORE_WARP_TRANSPOSE |
|
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED |
|
Definition at line 354 of file block_store.cuh.
Enumeration of cache modifiers for memory load operations.
Definition at line 62 of file thread_load.cuh.
Enumeration of cache modifiers for memory store operations.
Definition at line 61 of file thread_store.cuh.
|
inline |
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition at line 1066 of file block_load.cuh.
|
inline |
Collective constructor using the specified memory allocation as temporary storage.
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 1076 of file block_load.cuh.
|
inline |
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition at line 873 of file block_store.cuh.
|
inline |
Collective constructor using the specified memory allocation as temporary storage.
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
Definition at line 883 of file block_store.cuh.
|
inlinestatic |
Definition at line 131 of file thread_load.cuh.
|
inlinestatic |
Definition at line 147 of file thread_load.cuh.
|
inlinestatic |
Definition at line 135 of file thread_store.cuh.
|
inlinestatic |
Definition at line 151 of file thread_store.cuh.
__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized | ( | int | linear_tid, |
T * | block_ptr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Internal implementation for load vectorization
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Input pointer for loading from |
[out] | items | Data to load |
Definition at line 162 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 770 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 761 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
[in] | block_ptr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 749 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 687 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 834 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 909 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 984 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory.
Example: the load is specialized for `BLOCK_LOAD_WARP_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads). Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, ...`. The set of `thread_data` across the block of threads will be `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 1130 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 696 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range (skips vectorization)
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 779 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 844 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 919 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 994 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range.
Example: the load is specialized for `BLOCK_LOAD_WARP_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads). Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, 6, ...` and `valid_items` is `5`. The set of `thread_data` across the block of threads will be `{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }`, with only the first two threads being unmasked to load portions of valid data (and other items remaining unassigned).
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 1176 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 706 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 789 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 855 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 931 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 1006 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
Example: the load is specialized for `BLOCK_LOAD_WARP_TRANSPOSE`, meaning memory references are efficiently coalesced using a warp-striped access pattern (after which items are locally reordered among threads). Suppose the input `d_data` is `0, 1, 2, 3, 4, 5, 6, ...`, `valid_items` is `5`, and the out-of-bounds default is `-1`. The set of `thread_data` across the block of threads will be `{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }`, with only the first two threads being unmasked to load portions of valid data (and other items are assigned `-1`).
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 1224 of file block_load.cuh.
|
inline |
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
[in] | block_ptr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 740 of file block_load.cuh.
|
inlinestatic |
Definition at line 144 of file thread_load.cuh.
|
inlinestatic |
Definition at line 124 of file thread_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlocked | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a blocked arrangement across the thread block.
\blocked
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 76 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlocked | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
\blocked
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 105 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlocked | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items, | ||
DefaultT | oob_default | ||
) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
\blocked
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 138 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized | ( | int | linear_tid, |
T * | block_ptr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a blocked arrangement across the thread block.
\blocked
The input offset (`block_ptr` + `block_offset`) must be quad-item aligned.
The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
- `ITEMS_PER_THREAD` is odd
- `T` is not a built-in primitive or CUDA vector type (e.g., `short`, `int2`, `double`, `float2`, etc.)
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Input pointer for loading from |
[out] | items | Data to load |
Definition at line 227 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a striped arrangement across the thread block.
\striped
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 258 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
\striped
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 288 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items, | ||
DefaultT | oob_default | ||
) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
\striped
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 323 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectWarpStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD] | ||
) |
Load a linear segment of items into a warp-striped arrangement across the thread block.
\warpstriped
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
Definition at line 362 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectWarpStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
\warpstriped
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
Definition at line 398 of file block_load.cuh.
__device__ __forceinline__ void cub::LoadDirectWarpStriped | ( | int | linear_tid, |
InputIteratorT | block_itr, | ||
InputT(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items, | ||
DefaultT | oob_default | ||
) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
\warpstriped
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
Definition at line 439 of file block_load.cuh.
|
inline |
Constructor.
Definition at line 678 of file block_load.cuh.
|
inline |
Constructor.
Definition at line 731 of file block_load.cuh.
|
inline |
Constructor.
Definition at line 824 of file block_load.cuh.
|
inline |
Constructor.
Definition at line 899 of file block_load.cuh.
|
inline |
Constructor.
Definition at line 974 of file block_load.cuh.
|
inlineprivate |
Internal storage allocator.
Definition at line 1035 of file block_load.cuh.
|
inlineprivate |
Internal storage allocator.
Definition at line 841 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 561 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 610 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 665 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 733 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 801 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory.
Example: the store is specialized for `BLOCK_STORE_WARP_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern. Suppose the set of `thread_data` across the block of threads is `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`. The output `d_data` will be `0, 1, 2, 3, 4, 5, ...`.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 939 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory, guarded by range.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 570 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory, guarded by range.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 619 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory, guarded by range.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 675 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory, guarded by range.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 743 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory, guarded by range.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 811 of file block_store.cuh.
|
inline |
Store items into a linear segment of memory, guarded by range.
Example: the store is specialized for `BLOCK_STORE_WARP_TRANSPOSE`, meaning items are locally reordered among threads so that memory references will be efficiently coalesced using a warp-striped access pattern. Suppose the set of `thread_data` across the block of threads is `{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }` and `valid_items` is `5`. The output `d_data` will be `0, 1, 2, 3, 4, ?, ?, ?, ...`, with only the first two threads being unmasked to store portions of valid data.
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 988 of file block_store.cuh.
|
inlinestatic |
Definition at line 148 of file thread_store.cuh.
|
inline |
Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
[in] | block_ptr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 601 of file block_store.cuh.
|
inlinestatic |
Definition at line 128 of file thread_store.cuh.
__device__ __forceinline__ void cub::StoreDirectBlocked | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a blocked arrangement of items across a thread block into a linear segment of items.
\blocked
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 74 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectBlocked | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.
\blocked
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 103 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized | ( | int | linear_tid, |
T * | block_ptr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a blocked arrangement of items across a thread block into a linear segment of items.
\blocked
The output offset (`block_ptr` + `block_offset`) must be quad-item aligned, which is the default starting offset returned by `cudaMalloc()`.
The following conditions will prevent vectorization:
- `ITEMS_PER_THREAD` is odd
- `T` is not a built-in primitive or CUDA vector type (e.g., `short`, `int2`, `double`, `float2`, etc.)
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Output pointer for storing to |
[in] | items | Data to store |
Definition at line 143 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a striped arrangement of data across the thread block into a linear segment of items.
\striped
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 206 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.
\striped
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 237 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectWarpStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD] | ||
) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items.
\warpstriped
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
Definition at line 281 of file block_store.cuh.
__device__ __forceinline__ void cub::StoreDirectWarpStriped | ( | int | linear_tid, |
OutputIteratorT | block_itr, | ||
T(&) | items[ITEMS_PER_THREAD], | ||
int | valid_items | ||
) |
Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.
\warpstriped
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output \iterator. |
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
Definition at line 317 of file block_store.cuh.
|
inline |
Constructor.
Definition at line 552 of file block_store.cuh.
|
inline |
Constructor.
Definition at line 593 of file block_store.cuh.
|
inline |
Constructor.
Definition at line 655 of file block_store.cuh.
|
inline |
Constructor.
Definition at line 723 of file block_store.cuh.
|
inline |
Constructor.
Definition at line 791 of file block_store.cuh.
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad | ( | InputIteratorT | itr | ) |
Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.
MODIFIER | [inferred] CacheLoadModifier enumeration |
InputIteratorT | [inferred] Input iterator type \iterator |
ThreadLoad definition for generic modifiers
Definition at line 420 of file thread_load.cuh.
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad | ( | InputIteratorT | itr, |
Int2Type< LOAD_DEFAULT > | , | ||
Int2Type< false > | |||
) |
Define powers-of-two ThreadLoad specializations for the various cache load modifiers. ThreadLoad definition for LOAD_DEFAULT modifier on iterator types.
Definition at line 312 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoad | ( | T * | ptr, |
Int2Type< LOAD_DEFAULT > | , | ||
Int2Type< true > | |||
) |
ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
Definition at line 325 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoad | ( | T * | ptr, |
Int2Type< LOAD_VOLATILE > | , | ||
Int2Type< true > | |||
) |
ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
Definition at line 381 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoad | ( | T const * | ptr, |
Int2Type< MODIFIER > | , | ||
Int2Type< true > | |||
) |
ThreadLoad definition for generic modifiers on pointer types
Definition at line 395 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer | ( | T * | ptr, |
Int2Type< false > | |||
) |
ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
Definition at line 351 of file thread_load.cuh.
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer | ( | T * | ptr, |
Int2Type< true > | |||
) |
ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
Definition at line 338 of file thread_load.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | OutputIteratorT | itr, |
T | val | ||
) |
Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.
MODIFIER | [inferred] CacheStoreModifier enumeration |
OutputIteratorT | [inferred] Output iterator type \iterator |
T | [inferred] Data type of output value |
ThreadStore definition for generic modifiers
Definition at line 404 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | OutputIteratorT | itr, |
T | val, | ||
Int2Type< STORE_DEFAULT > | , | ||
Int2Type< false > | |||
) |
Define ThreadStore specializations for the various cache store modifiers. ThreadStore definition for STORE_DEFAULT modifier on iterator types.
Definition at line 292 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | T * | ptr, |
T | val, | ||
Int2Type< MODIFIER > | , | ||
Int2Type< true > | |||
) |
ThreadStore definition for generic modifiers on pointer types
Definition at line 375 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | T * | ptr, |
T | val, | ||
Int2Type< STORE_DEFAULT > | , | ||
Int2Type< true > | |||
) |
ThreadStore definition for STORE_DEFAULT modifier on pointer types
Definition at line 306 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStore | ( | T * | ptr, |
T | val, | ||
Int2Type< STORE_VOLATILE > | , | ||
Int2Type< true > | |||
) |
ThreadStore definition for STORE_VOLATILE modifier on pointer types
Definition at line 361 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr | ( | T * | ptr, |
T | val, | ||
Int2Type< false > | |||
) |
ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
Definition at line 333 of file thread_store.cuh.
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr | ( | T * | ptr, |
T | val, | ||
Int2Type< true > | |||
) |
ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
Definition at line 320 of file thread_store.cuh.
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 675 of file block_load.cuh.
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 728 of file block_load.cuh.
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 821 of file block_load.cuh.
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 896 of file block_load.cuh.
int cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 971 of file block_load.cuh.
|
private |
Linear thread-id.
Definition at line 1050 of file block_load.cuh.
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_DIRECT, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 549 of file block_store.cuh.
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_VECTORIZE, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 590 of file block_store.cuh.
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 652 of file block_store.cuh.
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 720 of file block_store.cuh.
int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid |
Linear thread-id.
Definition at line 788 of file block_store.cuh.
|
private |
Linear thread-id.
Definition at line 856 of file block_store.cuh.
_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::temp_storage |
Thread reference to shared storage.
Definition at line 818 of file block_load.cuh.
_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::temp_storage |
Thread reference to shared storage.
Definition at line 893 of file block_load.cuh.
_TempStorage& cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage |
Thread reference to shared storage.
Definition at line 968 of file block_load.cuh.
|
private |
Thread reference to shared storage.
Definition at line 1047 of file block_load.cuh.
_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::temp_storage |
Thread reference to shared storage.
Definition at line 649 of file block_store.cuh.
_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::temp_storage |
Thread reference to shared storage.
Definition at line 717 of file block_store.cuh.
_TempStorage& cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage |
Thread reference to shared storage.
Definition at line 785 of file block_store.cuh.
|
private |
Thread reference to shared storage.
Definition at line 853 of file block_store.cuh.
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_TRANSPOSE, DUMMY >::_TempStorage::valid_items |
Temporary storage for partially-full block guard.
Definition at line 642 of file block_store.cuh.
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE, DUMMY >::_TempStorage::valid_items |
Temporary storage for partially-full block guard.
Definition at line 710 of file block_store.cuh.
volatile int cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::StoreInternal< BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage::valid_items |
Temporary storage for partially-full block guard.
Definition at line 778 of file block_store.cuh.