OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
UtilIo

Data Structures

class  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block. More...
 
class  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory. More...
 
struct  cub::IterateThreadLoad< COUNT, MAX >
 Helper structure for templated load iteration (inductive case) More...
 
struct  cub::IterateThreadLoad< MAX, MAX >
 Helper structure for templated load iteration (termination case) More...
 
struct  cub::IterateThreadStore< COUNT, MAX >
 Helper structure for templated store iteration (inductive case) More...
 
struct  cub::IterateThreadStore< MAX, MAX >
 Helper structure for templated store iteration (termination case) More...
 

Macros

#define _CUB_LOAD_16(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_8(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_4(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_2(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_1(cub_modifier, ptx_modifier)
 
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_16(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_8(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_4(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_2(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_1(cub_modifier, ptx_modifier)
 
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)
 

Enumerations

enum  cub::CacheLoadModifier {
  cub::LOAD_DEFAULT, cub::LOAD_CA, cub::LOAD_CG, cub::LOAD_CS,
  cub::LOAD_CV, cub::LOAD_LDG, cub::LOAD_VOLATILE
}
 Enumeration of cache modifiers for memory load operations. More...
 
enum  cub::CacheStoreModifier {
  cub::STORE_DEFAULT, cub::STORE_WB, cub::STORE_CG, cub::STORE_CS,
  cub::STORE_WT, cub::STORE_VOLATILE
}
 Enumeration of cache modifiers for memory store operations. More...
 

Functions

template<typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad (InputIteratorT itr, Int2Type< LOAD_DEFAULT >, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoad (T *ptr, Int2Type< LOAD_DEFAULT >, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer (T *ptr, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ T cub::ThreadLoad (T *ptr, Int2Type< LOAD_VOLATILE >, Int2Type< true >)
 
template<typename T , int MODIFIER>
__device__ __forceinline__ T cub::ThreadLoad (T const *ptr, Int2Type< MODIFIER >, Int2Type< true >)
 
template<typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore (OutputIteratorT itr, T val, Int2Type< STORE_DEFAULT >, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStore (T *ptr, T val, Int2Type< STORE_DEFAULT >, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< true >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr (T *ptr, T val, Int2Type< false >)
 
template<typename T >
__device__ __forceinline__ void cub::ThreadStore (T *ptr, T val, Int2Type< STORE_VOLATILE >, Int2Type< true >)
 
template<typename T , int MODIFIER>
__device__ __forceinline__ void cub::ThreadStore (T *ptr, T val, Int2Type< MODIFIER >, Int2Type< true >)
 

Blocked arrangement I/O (direct)

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block. More...
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. More...
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 
template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a blocked arrangement of items across a thread block into a linear segment of items. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range. More...
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Store a blocked arrangement of items across a thread block into a linear segment of items. More...
 

Striped arrangement I/O (direct)

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a striped arrangement across the thread block. More...
 
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range. More...
 
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a striped arrangement of data across the thread block into a linear segment of items. More...
 
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range. More...
 

Warp-striped arrangement I/O (direct)

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a warp-striped arrangement across the thread block. More...
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range. More...
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a warp-striped arrangement of data across the thread block into a linear segment of items. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range. More...
 

Thread I/O (cache modified)

template<CacheLoadModifier MODIFIER, typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad (InputIteratorT itr)
 Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. More...
 
template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore (OutputIteratorT itr, T val)
 Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. More...
 

Detailed Description

Macro Definition Documentation

◆ _CUB_LOAD_1

#define _CUB_LOAD_1 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr) \
{ \
unsigned short retval; \
asm volatile ( \
"{" \
" .reg .u8 datum;" \
" ld."#ptx_modifier".u8 datum, [%1];" \
" cvt.u16.u8 %0, datum;" \
"}" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return (unsigned char) retval; \
}

Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier

Definition at line 247 of file thread_load.cuh.

◆ _CUB_LOAD_16

#define _CUB_LOAD_16 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr) \
{ \
uint4 retval; \
asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \
"=r"(retval.x), \
"=r"(retval.y), \
"=r"(retval.z), \
"=r"(retval.w) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr) \
{ \
ulonglong2 retval; \
asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \
"=l"(retval.x), \
"=l"(retval.y) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}

Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier

Definition at line 154 of file thread_load.cuh.

◆ _CUB_LOAD_2

#define _CUB_LOAD_2 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr) \
{ \
unsigned short retval; \
asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}

Define an unsigned short (2B) ThreadLoad specialization for the given Cache load modifier

Definition at line 232 of file thread_load.cuh.

◆ _CUB_LOAD_4

#define _CUB_LOAD_4 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr) \
{ \
unsigned int retval; \
asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \
"=r"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}

Define a uint (4B) ThreadLoad specialization for the given Cache load modifier

Definition at line 217 of file thread_load.cuh.

◆ _CUB_LOAD_8

#define _CUB_LOAD_8 (   cub_modifier,
  ptx_modifier 
)

Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier

Definition at line 181 of file thread_load.cuh.

◆ _CUB_LOAD_ALL

#define _CUB_LOAD_ALL (   cub_modifier,
  ptx_modifier 
)
Value:
_CUB_LOAD_16(cub_modifier, ptx_modifier) \
_CUB_LOAD_8(cub_modifier, ptx_modifier) \
_CUB_LOAD_4(cub_modifier, ptx_modifier) \
_CUB_LOAD_2(cub_modifier, ptx_modifier) \
_CUB_LOAD_1(cub_modifier, ptx_modifier)

Define powers-of-two ThreadLoad specializations for the given Cache load modifier

Definition at line 267 of file thread_load.cuh.

◆ _CUB_STORE_1

#define _CUB_STORE_1 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
{ \
asm volatile ( \
"{" \
" .reg .u8 datum;" \
" cvt.u8.u16 datum, %1;" \
" st."#ptx_modifier".u8 [%0], datum;" \
"}" : : \
_CUB_ASM_PTR_(ptr), \
"h"((unsigned short) val)); \
}

Define an unsigned char (1B) ThreadStore specialization for the given Cache store modifier

Definition at line 238 of file thread_store.cuh.

◆ _CUB_STORE_16

#define _CUB_STORE_16 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y), \
"r"(val.z), \
"r"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val.x), \
"l"(val.y)); \
}

Define a uint4 (16B) ThreadStore specialization for the given Cache store modifier

Definition at line 158 of file thread_store.cuh.

◆ _CUB_STORE_2

#define _CUB_STORE_2 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
{ \
asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val)); \
}

Define an unsigned short (2B) ThreadStore specialization for the given Cache store modifier

Definition at line 225 of file thread_store.cuh.

◆ _CUB_STORE_4

#define _CUB_STORE_4 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
{ \
asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val)); \
}

Define an unsigned int (4B) ThreadStore specialization for the given Cache store modifier

Definition at line 212 of file thread_store.cuh.

◆ _CUB_STORE_8

#define _CUB_STORE_8 (   cub_modifier,
  ptx_modifier 
)
Value:
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val.x), \
"h"(val.y), \
"h"(val.z), \
"h"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
{ \
asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val)); \
}

Define a uint2 (8B) ThreadStore specialization for the given Cache store modifier

Definition at line 182 of file thread_store.cuh.

◆ _CUB_STORE_ALL

#define _CUB_STORE_ALL (   cub_modifier,
  ptx_modifier 
)
Value:
_CUB_STORE_16(cub_modifier, ptx_modifier) \
_CUB_STORE_8(cub_modifier, ptx_modifier) \
_CUB_STORE_4(cub_modifier, ptx_modifier) \
_CUB_STORE_2(cub_modifier, ptx_modifier) \
_CUB_STORE_1(cub_modifier, ptx_modifier)

Define powers-of-two ThreadStore specializations for the given Cache store modifier

Definition at line 255 of file thread_store.cuh.

Enumeration Type Documentation

◆ CacheLoadModifier

Enumeration of cache modifiers for memory load operations.

Enumerator
LOAD_DEFAULT 

Default (no modifier)

LOAD_CA 

Cache at all levels.

LOAD_CG 

Cache at global level.

LOAD_CS 

Cache streaming (likely to be accessed once)

LOAD_CV 

Cache as volatile (including cached system lines)

LOAD_LDG 

Cache as texture.

LOAD_VOLATILE 

Volatile (any memory space)

Definition at line 62 of file thread_load.cuh.

◆ CacheStoreModifier

Enumeration of cache modifiers for memory store operations.

Enumerator
STORE_DEFAULT 

Default (no modifier)

STORE_WB 

Cache write-back all coherent levels.

STORE_CG 

Cache at global level.

STORE_CS 

Cache streaming (likely to be accessed once)

STORE_WT 

Cache write-through (to system memory)

STORE_VOLATILE 

Volatile shared (any memory space)

Definition at line 61 of file thread_store.cuh.

Function Documentation

◆ InternalLoadDirectBlockedVectorized()

template<CacheLoadModifier MODIFIER, typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::InternalLoadDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Internal implementation for load vectorization

Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_ptr: Input pointer for loading from
[out] items: Data to load

Definition at line 162 of file block_load.cuh.

◆ LoadDirectBlocked() [1/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a blocked arrangement across the thread block.

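Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.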

Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load

Definition at line 76 of file block_load.cuh.

◆ LoadDirectBlocked() [2/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.

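Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.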

Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load

Definition at line 105 of file block_load.cuh.

◆ LoadDirectBlocked() [3/3]

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

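Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.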

Template Parameters
InputT: [inferred] The data type to load.
DefaultT: [inferred] The type of the default value assigned to out-of-bound items.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
[in] oob_default: Default value to assign out-of-bound items

Definition at line 138 of file block_load.cuh.

◆ LoadDirectBlockedVectorized()

template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a blocked arrangement across the thread block.

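Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.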

The input offset (block_ptr + block_offset) must be quad-item aligned

The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:

  • ITEMS_PER_THREAD is odd
  • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
Template Parameters
T: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_ptr: Input pointer for loading from
[out] items: Data to load

Definition at line 227 of file block_load.cuh.

◆ LoadDirectStriped() [1/3]

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a striped arrangement across the thread block.

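Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.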

Template Parameters
BLOCK_THREADS: The thread block size in threads
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load

Definition at line 258 of file block_load.cuh.

◆ LoadDirectStriped() [2/3]

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range.

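Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.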

Template Parameters
BLOCK_THREADS: The thread block size in threads
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load

Definition at line 288 of file block_load.cuh.

◆ LoadDirectStriped() [3/3]

template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

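Assumes a striped arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.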

Template Parameters
BLOCK_THREADS: The thread block size in threads
InputT: [inferred] The data type to load.
DefaultT: [inferred] The type of the default value assigned to out-of-bound items.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
[in] oob_default: Default value to assign out-of-bound items

Definition at line 323 of file block_load.cuh.

◆ LoadDirectWarpStriped() [1/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a warp-striped arrangement across the thread block.

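Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across each warp, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.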

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load

Definition at line 362 of file block_load.cuh.

◆ LoadDirectWarpStriped() [2/3]

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.

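Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across each warp, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.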

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load

Definition at line 398 of file block_load.cuh.

◆ LoadDirectWarpStriped() [3/3]

template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

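Assumes a warp-striped arrangement of (warp-threads * items-per-thread) items across each warp, where warp i owns the i-th range of (warp-threads * items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads * (items-per-thread - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.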

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT: [inferred] The data type to load.
DefaultT: [inferred] The type of the default value assigned to out-of-bound items.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
[in] oob_default: Default value to assign out-of-bound items

Definition at line 439 of file block_load.cuh.

◆ StoreDirectBlocked() [1/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a blocked arrangement of items across a thread block into a linear segment of items.

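Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.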

Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store

Definition at line 74 of file block_store.cuh.

◆ StoreDirectBlocked() [2/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.

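Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.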

Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
[in] valid_items: Number of valid items to write

Definition at line 103 of file block_store.cuh.

◆ StoreDirectBlockedVectorized()

template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a blocked arrangement of items across a thread block into a linear segment of items.

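Assumes a blocked arrangement of (block-threads * items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.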

The output offset (block_ptr + block_offset) must be quad-item aligned, which is the default starting offset returned by cudaMalloc()

The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
  • ITEMS_PER_THREAD is odd
  • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_ptr: Output pointer for storing to
[in] items: Data to store

Definition at line 143 of file block_store.cuh.

◆ StoreDirectStriped() [1/2]

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a striped arrangement of data across the thread block into a linear segment of items.

\striped

Template Parameters
BLOCK_THREADSThe thread block size in threads
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 206 of file block_store.cuh.

◆ StoreDirectStriped() [2/2]

template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.

\striped

Template Parameters
BLOCK_THREADSThe thread block size in threads
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 237 of file block_store.cuh.

◆ StoreDirectWarpStriped() [1/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a warp-striped arrangement of data across the thread block into a linear segment of items.

\warpstriped

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store

Definition at line 281 of file block_store.cuh.

◆ StoreDirectWarpStriped() [2/2]

template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.

\warpstriped

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
T[inferred] The data type to store.
ITEMS_PER_THREAD[inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT[inferred] The random-access iterator type for output \iterator.
Parameters
[in]linear_tidA suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in]block_itrThe thread block's base output iterator for storing to
[in]itemsData to store
[in]valid_itemsNumber of valid items to write

Definition at line 317 of file block_store.cuh.

◆ ThreadLoad() [1/5]

template<CacheLoadModifier MODIFIER, typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type cub::ThreadLoad ( InputIteratorT  itr)

Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.

Example
#include <cub/cub.cuh> // or equivalently <cub/thread/thread_load.cuh>
// 32-bit load using cache-at-all-levels modifier:
int *d_in;
int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
// 16-bit load using default modifier
short *d_in;
short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
// 256-bit load using cache-volatile modifier
double4 *d_in;
double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
// Struct (non-primitive type) load using cache-streaming modifier
struct TestFoo { bool a; short b; };
TestFoo *d_struct;
TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
Template Parameters
MODIFIER CacheLoadModifier enumeration (must be specified explicitly; it cannot be inferred)
InputIteratorT[inferred] Input iterator type \iterator

ThreadLoad definition for generic modifiers

Definition at line 420 of file thread_load.cuh.

◆ ThreadLoad() [2/5]

template<typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits<InputIteratorT>::value_type cub::ThreadLoad ( InputIteratorT  itr,
Int2Type< LOAD_DEFAULT >  ,
Int2Type< false >   
)

Define powers-of-two ThreadLoad specializations for the various cache load modifiers. ThreadLoad definition for LOAD_DEFAULT modifier on iterator types

Definition at line 312 of file thread_load.cuh.

◆ ThreadLoad() [3/5]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoad ( T *  ptr,
Int2Type< LOAD_DEFAULT >  ,
Int2Type< true >   
)

ThreadLoad definition for LOAD_DEFAULT modifier on pointer types

Definition at line 325 of file thread_load.cuh.

◆ ThreadLoad() [4/5]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoad ( T *  ptr,
Int2Type< LOAD_VOLATILE >  ,
Int2Type< true >   
)

ThreadLoad definition for LOAD_VOLATILE modifier on pointer types

Definition at line 381 of file thread_load.cuh.

◆ ThreadLoad() [5/5]

template<typename T , int MODIFIER>
__device__ __forceinline__ T cub::ThreadLoad ( T const *  ptr,
Int2Type< MODIFIER >  ,
Int2Type< true >   
)

ThreadLoad definition for generic modifiers on pointer types

Definition at line 395 of file thread_load.cuh.

◆ ThreadLoadVolatilePointer() [1/2]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer ( T *  ptr,
Int2Type< true >   
)

ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types

Definition at line 338 of file thread_load.cuh.

◆ ThreadLoadVolatilePointer() [2/2]

template<typename T >
__device__ __forceinline__ T cub::ThreadLoadVolatilePointer ( T *  ptr,
Int2Type< false >   
)

ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types

Definition at line 351 of file thread_load.cuh.

◆ ThreadStore() [1/5]

template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore ( OutputIteratorT  itr,
T  val 
)

Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.

Example
#include <cub/cub.cuh> // or equivalently <cub/thread/thread_store.cuh>
// 32-bit store using cache-global modifier:
int *d_out;
int val;
cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
// 16-bit store using default modifier
short *d_out;
short val;
cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
// 256-bit store using write-through modifier
double4 *d_out;
double4 val;
cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
// Struct (non-primitive type) store using cache-streaming modifier
struct TestFoo { bool a; short b; };
TestFoo *d_struct;
TestFoo val;
cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
Template Parameters
MODIFIER CacheStoreModifier enumeration (must be specified explicitly; it cannot be inferred)
OutputIteratorT[inferred] Output iterator type \iterator
T[inferred] Data type of output value

ThreadStore definition for generic modifiers

Definition at line 404 of file thread_store.cuh.

◆ ThreadStore() [2/5]

template<typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore ( OutputIteratorT  itr,
T  val,
Int2Type< STORE_DEFAULT >  ,
Int2Type< false >   
)

Define ThreadStore specializations for the various cache store modifiers. ThreadStore definition for STORE_DEFAULT modifier on iterator types

Definition at line 292 of file thread_store.cuh.

◆ ThreadStore() [3/5]

template<typename T >
__device__ __forceinline__ void cub::ThreadStore ( T *  ptr,
T  val,
Int2Type< STORE_DEFAULT >  ,
Int2Type< true >   
)

ThreadStore definition for STORE_DEFAULT modifier on pointer types

Definition at line 306 of file thread_store.cuh.

◆ ThreadStore() [4/5]

template<typename T >
__device__ __forceinline__ void cub::ThreadStore ( T *  ptr,
T  val,
Int2Type< STORE_VOLATILE >  ,
Int2Type< true >   
)

ThreadStore definition for STORE_VOLATILE modifier on pointer types

Definition at line 361 of file thread_store.cuh.

◆ ThreadStore() [5/5]

template<typename T , int MODIFIER>
__device__ __forceinline__ void cub::ThreadStore ( T *  ptr,
T  val,
Int2Type< MODIFIER >  ,
Int2Type< true >   
)

ThreadStore definition for generic modifiers on pointer types

Definition at line 375 of file thread_store.cuh.

◆ ThreadStoreVolatilePtr() [1/2]

template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr ( T *  ptr,
T  val,
Int2Type< true >   
)

ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types

Definition at line 320 of file thread_store.cuh.

◆ ThreadStoreVolatilePtr() [2/2]

template<typename T >
__device__ __forceinline__ void cub::ThreadStoreVolatilePtr ( T *  ptr,
T  val,
Int2Type< false >   
)

ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types

Definition at line 333 of file thread_store.cuh.