#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.
/// Can be used to store any data type.
template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
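/**
 * Usage sketch (d_out and val are hypothetical caller-side names):
 *
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
 *
 * // 32-bit store of a per-thread value using the cache-global modifier:
 * int *d_out;
 * int val;
 * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
 * \endcode
 */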
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
/// Helper structure for templated store iteration (inductive case)
template <int COUNT, int MAX>
struct IterateThreadStore
{
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T *ptr, T *vals)
    {
        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
    }

    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
    {
        ptr[COUNT] = vals[COUNT];
        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
    }
};
/// Helper structure for templated store iteration (termination case)
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}

    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
};
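/**
 * A compile-time unrolling sketch (the COUNT/MAX values are chosen for
 * illustration): a call such as
 *
 * \code
 * IterateThreadStore<0, 4>::template Store<STORE_CG>(ptr, vals);
 * \endcode
 *
 * unrolls into four cache-modified stores,
 *
 * \code
 * ThreadStore<STORE_CG>(ptr + 0, vals[0]);
 * ThreadStore<STORE_CG>(ptr + 1, vals[1]);
 * ThreadStore<STORE_CG>(ptr + 2, vals[2]);
 * ThreadStore<STORE_CG>(ptr + 3, vals[3]);
 * \endcode
 *
 * with the IterateThreadStore<4, 4> specialization terminating the recursion.
 */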
/// Define a uint4 (16B) ThreadStore specialization for the given CacheStoreModifier
#define _CUB_STORE_16(cub_modifier, ptx_modifier)  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)  \
    {  \
        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :  \
            _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));  \
    }  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)  \
    {  \
        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :  \
            _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y));  \
    }
/// Define a ushort4/uint2/unsigned long long (8B) ThreadStore specialization for the given CacheStoreModifier
#define _CUB_STORE_8(cub_modifier, ptx_modifier)  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)  \
    {  \
        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :  \
            _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w));  \
    }  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)  \
    {  \
        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :  \
            _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y));  \
    }  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)  \
    {  \
        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :  \
            _CUB_ASM_PTR_(ptr), "l"(val));  \
    }
/// Define an unsigned int (4B) ThreadStore specialization for the given CacheStoreModifier
#define _CUB_STORE_4(cub_modifier, ptx_modifier)  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)  \
    {  \
        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val));  \
    }
/// Define an unsigned short (2B) ThreadStore specialization for the given CacheStoreModifier
#define _CUB_STORE_2(cub_modifier, ptx_modifier)  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)  \
    {  \
        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "h"(val));  \
    }
/// Define an unsigned char (1B) ThreadStore specialization for the given CacheStoreModifier
#define _CUB_STORE_1(cub_modifier, ptx_modifier)  \
    template<>  \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)  \
    {  \
        asm volatile (  \
        "{   .reg .u8 datum;"  \
        "    cvt.u8.u16 datum, %1;"  \
        "    st."#ptx_modifier".u8 [%0], datum;"  \
        "}" : :  \
            _CUB_ASM_PTR_(ptr), "h"((unsigned short) val));  \
    }
/// Define powers-of-two ThreadStore specializations for the given CacheStoreModifier
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)  \
    _CUB_STORE_16(cub_modifier, ptx_modifier)       \
    _CUB_STORE_8(cub_modifier, ptx_modifier)        \
    _CUB_STORE_4(cub_modifier, ptx_modifier)        \
    _CUB_STORE_2(cub_modifier, ptx_modifier)        \
    _CUB_STORE_1(cub_modifier, ptx_modifier)
/// Define ThreadStore specializations for the various cache modifiers
#if CUB_PTX_ARCH >= 200
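/**
 * A sketch of what one expansion of the macro above produces on sm_20+ targets,
 * where each CacheStoreModifier maps to the PTX store qualifier of the same name
 * (e.g. STORE_CG -> "cg"). The code shown is illustrative, not a verbatim copy
 * of the generated specializations:
 *
 * \code
 * // _CUB_STORE_ALL(STORE_CG, cg) emits, among others, the 4B specialization:
 * template<>
 * __device__ __forceinline__ void ThreadStore<STORE_CG, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)
 * {
 *     asm volatile ("st.cg.u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val));
 * }
 * \endcode
 */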
/// ThreadStore definition for STORE_DEFAULT modifier on iterator types
template <typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val, Int2Type<STORE_DEFAULT> /*modifier*/, Int2Type<false> /*is_pointer*/)
{
    *itr = val;
}
/// ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(T *ptr, T val, Int2Type<true> /*is_primitive*/)
{
    *reinterpret_cast<volatile T*>(ptr) = val;
}
/// ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(T *ptr, T val, Int2Type<false> /*is_primitive*/)
{
    // Create a temporary using shuffle-words, then store using volatile-words
    typedef typename UnitWord<T>::VolatileWord  VolatileWord;
    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;

    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);

    VolatileWord words[VOLATILE_MULTIPLE];

    #pragma unroll
    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];

    IterateThreadStore<0, VOLATILE_MULTIPLE>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        words);
}
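/**
 * Usage sketch for the non-primitive path above (Pair, d_ptr, and StorePair are
 * hypothetical): a STORE_VOLATILE store of a user-defined aggregate routes
 * through this overload, which stages the value in word-sized chunks before the
 * volatile word writes.
 *
 * \code
 * struct Pair { long long key; long long value; };   // 16B, non-primitive
 *
 * __device__ void StorePair(Pair *d_ptr, Pair p)
 * {
 *     cub::ThreadStore<cub::STORE_VOLATILE>(d_ptr + threadIdx.x, p);
 * }
 * \endcode
 */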
/// ThreadStore definition for generic modifiers on pointer types
template <typename T, int MODIFIER>
__device__ __forceinline__ void ThreadStore(T *ptr, T val, Int2Type<MODIFIER> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
    // Create a temporary using shuffle-words, then store using device-words
    typedef typename UnitWord<T>::DeviceWord   DeviceWord;
    typedef typename UnitWord<T>::ShuffleWord  ShuffleWord;

    const int DEVICE_MULTIPLE  = sizeof(T) / sizeof(DeviceWord);
    const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);

    DeviceWord words[DEVICE_MULTIPLE];

    #pragma unroll
    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];

    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(ptr),
        words);
}
/// ThreadStore definition for generic modifiers
template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
{
    ThreadStore(itr, val, Int2Type<MODIFIER>(), Int2Type<IsPointer<OutputIteratorT>::VALUE>());
}
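/**
 * Dispatch sketch (d_out, itr, and val are hypothetical): the public entry point
 * above selects an implementation from the modifier and from whether
 * OutputIteratorT is a raw pointer.
 *
 * \code
 * // Raw pointer + STORE_VOLATILE: resolves to ThreadStoreVolatilePtr.
 * int *d_out;
 * int val;
 * cub::ThreadStore<cub::STORE_VOLATILE>(d_out + threadIdx.x, val);
 *
 * // Non-pointer output iterator + STORE_DEFAULT: resolves to a plain *itr = val
 * // (itr could be, e.g., a Thrust device iterator).
 * cub::ThreadStore<cub::STORE_DEFAULT>(itr, val);
 * \endcode
 */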
// Cache modifiers referenced above (cub::CacheStoreModifier, the enumeration of
// cache modifiers for memory store operations):
//   STORE_DEFAULT  - Default (no modifier)
//   STORE_WB       - Cache write-back all coherent levels
//   STORE_CG       - Cache at global level
//   STORE_CS       - Cache streaming (likely to be accessed once)
//   STORE_WT       - Cache write-through (to system memory)
//   STORE_VOLATILE - Volatile shared (any memory space)
//
// Int2Type allows an integral constant to be treated as a type at compile time
// (e.g., to achieve static call dispatch based on constant integral values).