40 #include "../util_ptx.cuh" 41 #include "../util_type.cuh" 42 #include "../util_namespace.cuh" 109 typename InputIteratorT>
110 __device__ __forceinline__
typename std::iterator_traits<InputIteratorT>::value_type
ThreadLoad(InputIteratorT itr);
116 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 120 template <
int COUNT,
int MAX>
123 template <CacheLoadModifier MODIFIER,
typename T>
124 static __device__ __forceinline__
void Load(T
const *ptr, T *vals)
126 vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
130 template <
typename InputIteratorT,
typename T>
131 static __device__ __forceinline__
void Dereference(InputIteratorT itr, T *vals)
133 vals[COUNT] = itr[COUNT];
143 template <CacheLoadModifier MODIFIER,
typename T>
144 static __device__ __forceinline__
void Load(T
const * , T * ) {}
146 template <
typename InputIteratorT,
typename T>
147 static __device__ __forceinline__
void Dereference(InputIteratorT , T * ) {}
154 #define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ 156 __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr) \ 159 asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ 164 _CUB_ASM_PTR_(ptr)); \ 168 __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr) \ 171 asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ 174 _CUB_ASM_PTR_(ptr)); \ 181 #define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ 183 __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr) \ 186 asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ 191 _CUB_ASM_PTR_(ptr)); \ 195 __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr) \ 198 asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ 201 _CUB_ASM_PTR_(ptr)); \ 205 __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr) \ 207 unsigned long long retval; \ 208 asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ 210 _CUB_ASM_PTR_(ptr)); \ 217 #define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ 219 __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr) \ 221 unsigned int retval; \ 222 asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ 224 _CUB_ASM_PTR_(ptr)); \ 232 #define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ 234 __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr) \ 236 unsigned short retval; \ 237 asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ 239 _CUB_ASM_PTR_(ptr)); \ 247 #define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ 249 __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr) \ 251 unsigned short retval; \ 255 " ld."#ptx_modifier".u8 datum, [%1];" \ 256 " cvt.u16.u8 %0, datum;" \ 259 _CUB_ASM_PTR_(ptr)); \ 260 return (unsigned char) retval; \ 267 #define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ 268 _CUB_LOAD_16(cub_modifier, ptx_modifier) \ 269 _CUB_LOAD_8(cub_modifier, ptx_modifier) \ 270 _CUB_LOAD_4(cub_modifier, ptx_modifier) \ 271 _CUB_LOAD_2(cub_modifier, ptx_modifier) \ 272 _CUB_LOAD_1(cub_modifier, ptx_modifier) \ 278 #if CUB_PTX_ARCH >= 200 291 #if CUB_PTX_ARCH >= 350 311 template <
typename InputIteratorT>
312 __device__ __forceinline__
typename std::iterator_traits<InputIteratorT>::value_type
ThreadLoad(
324 template <
typename T>
337 template <
typename T>
342 T retval = *reinterpret_cast<volatile T*>(ptr);
350 template <
typename T>
357 const int VOLATILE_MULTIPLE =
sizeof(T) /
sizeof(VolatileWord);
369 VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
371 reinterpret_cast<volatile VolatileWord*>(ptr),
380 template <
typename T>
394 template <
typename T,
int MODIFIER>
402 const int DEVICE_MULTIPLE =
sizeof(T) /
sizeof(DeviceWord);
404 DeviceWord words[DEVICE_MULTIPLE];
407 reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
410 return *reinterpret_cast<T*>(words);
419 typename InputIteratorT>
420 __device__ __forceinline__
typename std::iterator_traits<InputIteratorT>::value_type
ThreadLoad(InputIteratorT itr)
431 #endif // DOXYGEN_SHOULD_SKIP_THIS
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
Optional outer namespace(s)
Volatile (any memory space)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ T ThreadLoadVolatilePointer(T *ptr, Int2Type< true >)
Cache as volatile (including cached system lines)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Cache streaming (likely to be accessed once)
__device__ __forceinline__ std::iterator_traits< InputIteratorT >::value_type ThreadLoad(InputIteratorT itr)
Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load a...
Helper structure for templated load iteration (inductive case)