#include <iterator>
#include "block_exchange.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
Go to the source code of this file.
Namespaces | |
namespace | cub |
Optional outer namespace(s) | |
Enumerations | |
enum | cub::BlockLoadAlgorithm { cub::BLOCK_LOAD_DIRECT , cub::BLOCK_LOAD_VECTORIZE , cub::BLOCK_LOAD_TRANSPOSE , cub::BLOCK_LOAD_WARP_TRANSPOSE , cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED } |
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. More... | |
Functions | |
Blocked arrangement I/O (direct) | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a blocked arrangement across the thread block. | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. | |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. | |
template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::InternalLoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
template<typename T , int ITEMS_PER_THREAD> | |
__device__ __forceinline__ void | cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a blocked arrangement across the thread block. | |
Striped arrangement I/O (direct) | |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a striped arrangement across the thread block. | |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range. | |
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. | |
Warp-striped arrangement I/O (direct) | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
Load a linear segment of items into a warp-striped arrangement across the thread block. | |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range. | |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > | |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. | |
Operations for reading linear tiles of data into the CUDA thread block.
Definition in file block_load.cuh.