#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

// ...

/**
 * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.
 *        Can be used to store any data type.
 */
template <
    CacheStoreModifier  MODIFIER,
    typename            OutputIteratorT,
    typename            T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
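// Usage sketch (illustrative; the kernel, buffer name, and modifier choice are assumptions):
//
//     #include <cub/thread/thread_store.cuh>
//
//     __global__ void StoreKernel(int *d_out, int value)
//     {
//         // Each thread writes one int using a cache-global (bypass L1) store
//         cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, value);
//     }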
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Helper structure for templated store iteration (inductive case)
template <
    int COUNT,
    int MAX>
struct IterateThreadStore
{
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T *ptr, T *vals)
    {
        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
    }

    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
    {
        ptr[COUNT] = vals[COUNT];
        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
    }
};
/// Helper structure for templated store iteration (termination case)
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}

    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
};
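// Expansion sketch (illustrative; the element count and modifier are assumptions):
// IterateThreadStore<0, 2>::template Store<STORE_CG>(ptr, vals) instantiates the inductive
// case for COUNT = 0 and COUNT = 1, emitting
//
//     ThreadStore<STORE_CG>(ptr + 0, vals[0]);
//     ThreadStore<STORE_CG>(ptr + 1, vals[1]);
//
// and terminates at the IterateThreadStore<2, 2> specialization above, which is a no-op.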
// Define uint4 / ulonglong2 (16B) ThreadStore specializations for the given cache store modifier
#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                           \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
            _CUB_ASM_PTR_(ptr),                                                             \
            "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));                                \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
            _CUB_ASM_PTR_(ptr),                                                             \
            "l"(val.x), "l"(val.y));                                                        \
    }

// Define ushort4 / uint2 / unsigned long long (8B) ThreadStore specializations for the given cache store modifier
#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                            \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
            _CUB_ASM_PTR_(ptr),                                                             \
            "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w));                                \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
            _CUB_ASM_PTR_(ptr),                                                             \
            "r"(val.x), "r"(val.y));                                                        \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
            _CUB_ASM_PTR_(ptr),                                                             \
            "l"(val));                                                                      \
    }

// Define an unsigned int (4B) ThreadStore specialization for the given cache store modifier
#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                            \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)        \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
            _CUB_ASM_PTR_(ptr),                                                             \
            "r"(val));                                                                      \
    }

// Define an unsigned short (2B) ThreadStore specialization for the given cache store modifier
#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                            \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)    \
    {                                                                                       \
        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
            _CUB_ASM_PTR_(ptr),                                                             \
            "h"(val));                                                                      \
    }

// Define an unsigned char (1B) ThreadStore specialization for the given cache store modifier
#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                            \
    template<>                                                                              \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)        \
    {                                                                                       \
        asm volatile (                                                                      \
        "{"                                                                                 \
        "   .reg .u8 datum;"                                                                \
        "   cvt.u8.u16 datum, %1;"                                                          \
        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
        "}" : :                                                                             \
            _CUB_ASM_PTR_(ptr),                                                             \
            "h"((unsigned short) val));                                                     \
    }

// Define powers-of-two ThreadStore specializations for the given cache store modifier
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                          \
    _CUB_STORE_16(cub_modifier, ptx_modifier)                                               \
    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                \
    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                \
    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                \
    _CUB_STORE_1(cub_modifier, ptx_modifier)

// Instantiate ThreadStore specializations for each cache store modifier
#if CUB_PTX_ARCH >= 200
    _CUB_STORE_ALL(STORE_WB, wb)
    _CUB_STORE_ALL(STORE_CG, cg)
    _CUB_STORE_ALL(STORE_CS, cs)
    _CUB_STORE_ALL(STORE_WT, wt)
#else
    _CUB_STORE_ALL(STORE_WB, global)
    _CUB_STORE_ALL(STORE_CG, global)
    _CUB_STORE_ALL(STORE_CS, global)
    _CUB_STORE_ALL(STORE_WT, volatile.global)
#endif

// Macro cleanup
#undef _CUB_STORE_ALL
#undef _CUB_STORE_16
#undef _CUB_STORE_8
#undef _CUB_STORE_4
#undef _CUB_STORE_2
#undef _CUB_STORE_1

// ThreadStore definition for STORE_DEFAULT modifier on iterator types
template <typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(
    OutputIteratorT itr, T val, Int2Type<STORE_DEFAULT> /*modifier*/, Int2Type<false> /*is_pointer*/)
{
    *itr = val;
}
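// Expansion sketch (illustrative, hand-written rather than compiler output): each
// _CUB_STORE_ALL(modifier, ptx) line above stamps out fully specialized ThreadStore
// overloads whose inline PTX carries that cache qualifier. For instance, the 4-byte
// specialization produced by _CUB_STORE_ALL(STORE_CG, cg) is roughly:
//
//     template<>
//     __device__ __forceinline__ void ThreadStore<STORE_CG, unsigned int*, unsigned int>(
//         unsigned int* ptr, unsigned int val)
//     {
//         asm volatile ("st.cg.u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val));
//     }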
// ThreadStore definition for STORE_DEFAULT modifier on pointer types
template <typename T>
__device__ __forceinline__ void ThreadStore(
    T *ptr, T val, Int2Type<STORE_DEFAULT> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
    *ptr = val;
}
// ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
    T *ptr, T val, Int2Type<true> /*is_primitive*/)
{
    *reinterpret_cast<volatile T*>(ptr) = val;
}
// ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
    T *ptr, T val, Int2Type<false> /*is_primitive*/)
{
    // Stage the value using shuffle-words, then store using volatile-words
    typedef typename UnitWord<T>::VolatileWord  VolatileWord;
    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;

    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);

    VolatileWord words[VOLATILE_MULTIPLE];

    #pragma unroll
    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];

    IterateThreadStore<0, VOLATILE_MULTIPLE>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        words);
}
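// Worked sketch (illustrative; the value type and its UnitWord mapping are assumptions):
// for a 16-byte double2, UnitWord<double2>::VolatileWord is expected to be an 8-byte word
// and ShuffleWord a 4-byte word, so the loop above copies val into words[] as four 4-byte
// chunks and IterateThreadStore<0, 2>::Dereference then issues two volatile 8-byte stores.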
// ThreadStore definition for STORE_VOLATILE modifier on pointer types
template <typename T>
__device__ __forceinline__ void ThreadStore(
    T *ptr, T val, Int2Type<STORE_VOLATILE> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
}
// ThreadStore definition for generic modifiers on pointer types
template <typename T, int MODIFIER>
__device__ __forceinline__ void ThreadStore(
    T *ptr, T val, Int2Type<MODIFIER> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
    // Stage the value using shuffle-words, then store using device-words
    typedef typename UnitWord<T>::DeviceWord    DeviceWord;
    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;

    const int DEVICE_MULTIPLE  = sizeof(T) / sizeof(DeviceWord);
    const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);

    DeviceWord words[DEVICE_MULTIPLE];

    #pragma unroll
    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];

    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(ptr),
        words);
}
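// Worked sketch (illustrative; the struct and word sizes are assumptions): storing a
// 12-byte struct { int a, b, c; } with STORE_CG would select a 4-byte DeviceWord
// (an 8-byte word does not evenly divide 12), so the value is staged into three 4-byte
// words and IterateThreadStore<0, 3> emits three st.cg.u32 stores, one per word.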
// ThreadStore definition for generic modifiers
template <
    CacheStoreModifier  MODIFIER,
    typename            OutputIteratorT,
    typename            T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
{
    ThreadStore(
        itr,
        val,
        Int2Type<MODIFIER>(),
        Int2Type<IsPointer<OutputIteratorT>::VALUE>());
}
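// Dispatch sketch (illustrative): a call such as cub::ThreadStore<cub::STORE_WB>(d_ptr, x)
// on a plain pointer resolves here, forwards to the Int2Type<true> pointer overload above,
// and ultimately reaches the macro-generated st.wb specializations. A STORE_DEFAULT call
// on a non-pointer output iterator instead selects the Int2Type<false> overload, which
// reduces to a plain *itr = val.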
#endif  // DOXYGEN_SHOULD_SKIP_THIS