OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
block_load.cuh File Reference
#include <iterator>
#include "block_exchange.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

Go to the source code of this file.

Data Structures

class  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::TempStorage
 \smemstorage{BlockLoad} More...
 

Namespaces

namespace  cub
 Optional outer namespace(s)
 

Enumerations

enum  cub::BlockLoadAlgorithm {
  cub::BLOCK_LOAD_DIRECT , cub::BLOCK_LOAD_VECTORIZE , cub::BLOCK_LOAD_TRANSPOSE , cub::BLOCK_LOAD_WARP_TRANSPOSE ,
  cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
}
 cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. More...
 

Functions

Blocked arrangement I/O (direct)
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block.
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements..
 
template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block.
 
Striped arrangement I/O (direct)
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a striped arrangement across the thread block.
 
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
 
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 
Warp-striped arrangement I/O (direct)
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a warp-striped arrangement across the thread block.
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 

Detailed Description

Operations for reading linear tiles of data into the CUDA thread block.

Definition in file block_load.cuh.