OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
thread_store.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
36#include <cuda.h>
37
38#include "../util_ptx.cuh"
39#include "../util_type.cuh"
40#include "../util_namespace.cuh"
41
43CUB_NS_PREFIX
44
46namespace cub {
47
54//-----------------------------------------------------------------------------
55// Tags and constants
56//-----------------------------------------------------------------------------
57
62{
69};
70
71
110template <
111 CacheStoreModifier MODIFIER,
112 typename OutputIteratorT,
113 typename T>
114__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
115
116
118
119
120#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
121
122
124template <int COUNT, int MAX>
126{
127 template <CacheStoreModifier MODIFIER, typename T>
128 static __device__ __forceinline__ void Store(T *ptr, T *vals)
129 {
130 ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
131 IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
132 }
133
134 template <typename OutputIteratorT, typename T>
135 static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
136 {
137 ptr[COUNT] = vals[COUNT];
139 }
140
141};
142
144template <int MAX>
145struct IterateThreadStore<MAX, MAX>
146{
147 template <CacheStoreModifier MODIFIER, typename T>
148 static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}
149
150 template <typename OutputIteratorT, typename T>
151 static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
152};
153
154
158#define _CUB_STORE_16(cub_modifier, ptx_modifier) \
159 template<> \
160 __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
161 { \
162 asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
163 _CUB_ASM_PTR_(ptr), \
164 "r"(val.x), \
165 "r"(val.y), \
166 "r"(val.z), \
167 "r"(val.w)); \
168 } \
169 template<> \
170 __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
171 { \
172 asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
173 _CUB_ASM_PTR_(ptr), \
174 "l"(val.x), \
175 "l"(val.y)); \
176 }
177
178
182#define _CUB_STORE_8(cub_modifier, ptx_modifier) \
183 template<> \
184 __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
185 { \
186 asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
187 _CUB_ASM_PTR_(ptr), \
188 "h"(val.x), \
189 "h"(val.y), \
190 "h"(val.z), \
191 "h"(val.w)); \
192 } \
193 template<> \
194 __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
195 { \
196 asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
197 _CUB_ASM_PTR_(ptr), \
198 "r"(val.x), \
199 "r"(val.y)); \
200 } \
201 template<> \
202 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
203 { \
204 asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
205 _CUB_ASM_PTR_(ptr), \
206 "l"(val)); \
207 }
208
212#define _CUB_STORE_4(cub_modifier, ptx_modifier) \
213 template<> \
214 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
215 { \
216 asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
217 _CUB_ASM_PTR_(ptr), \
218 "r"(val)); \
219 }
220
221
225#define _CUB_STORE_2(cub_modifier, ptx_modifier) \
226 template<> \
227 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
228 { \
229 asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
230 _CUB_ASM_PTR_(ptr), \
231 "h"(val)); \
232 }
233
234
238#define _CUB_STORE_1(cub_modifier, ptx_modifier) \
239 template<> \
240 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
241 { \
242 asm volatile ( \
243 "{" \
244 " .reg .u8 datum;" \
245 " cvt.u8.u16 datum, %1;" \
246 " st."#ptx_modifier".u8 [%0], datum;" \
247 "}" : : \
248 _CUB_ASM_PTR_(ptr), \
249 "h"((unsigned short) val)); \
250 }
251
255#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \
256 _CUB_STORE_16(cub_modifier, ptx_modifier) \
257 _CUB_STORE_8(cub_modifier, ptx_modifier) \
258 _CUB_STORE_4(cub_modifier, ptx_modifier) \
259 _CUB_STORE_2(cub_modifier, ptx_modifier) \
260 _CUB_STORE_1(cub_modifier, ptx_modifier) \
261
262
266#if CUB_PTX_ARCH >= 200
271#else
272 _CUB_STORE_ALL(STORE_WB, global)
273 _CUB_STORE_ALL(STORE_CG, global)
274 _CUB_STORE_ALL(STORE_CS, global)
275 _CUB_STORE_ALL(STORE_WT, volatile.global)
276#endif
277
278
279// Macro cleanup
280#undef _CUB_STORE_ALL
281#undef _CUB_STORE_1
282#undef _CUB_STORE_2
283#undef _CUB_STORE_4
284#undef _CUB_STORE_8
285#undef _CUB_STORE_16
286
287
291template <typename OutputIteratorT, typename T>
292__device__ __forceinline__ void ThreadStore(
293 OutputIteratorT itr,
294 T val,
295 Int2Type<STORE_DEFAULT> /*modifier*/,
296 Int2Type<false> /*is_pointer*/)
297{
298 *itr = val;
299}
300
301
305template <typename T>
306__device__ __forceinline__ void ThreadStore(
307 T *ptr,
308 T val,
309 Int2Type<STORE_DEFAULT> /*modifier*/,
310 Int2Type<true> /*is_pointer*/)
311{
312 *ptr = val;
313}
314
315
319template <typename T>
320__device__ __forceinline__ void ThreadStoreVolatilePtr(
321 T *ptr,
322 T val,
323 Int2Type<true> /*is_primitive*/)
324{
325 *reinterpret_cast<volatile T*>(ptr) = val;
326}
327
328
332template <typename T>
333__device__ __forceinline__ void ThreadStoreVolatilePtr(
334 T *ptr,
335 T val,
336 Int2Type<false> /*is_primitive*/)
337{
338 // Create a temporary using shuffle-words, then store using volatile-words
339 typedef typename UnitWord<T>::VolatileWord VolatileWord;
340 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
341
342 const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
343 const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
344
345 VolatileWord words[VOLATILE_MULTIPLE];
346
347 #pragma unroll
348 for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
349 reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
350
352 reinterpret_cast<volatile VolatileWord*>(ptr),
353 words);
354}
355
356
360template <typename T>
361__device__ __forceinline__ void ThreadStore(
362 T *ptr,
363 T val,
364 Int2Type<STORE_VOLATILE> /*modifier*/,
365 Int2Type<true> /*is_pointer*/)
366{
368}
369
370
374template <typename T, int MODIFIER>
375__device__ __forceinline__ void ThreadStore(
376 T *ptr,
377 T val,
378 Int2Type<MODIFIER> /*modifier*/,
379 Int2Type<true> /*is_pointer*/)
380{
381 // Create a temporary using shuffle-words, then store using device-words
382 typedef typename UnitWord<T>::DeviceWord DeviceWord;
383 typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
384
385 const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
386 const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
387
388 DeviceWord words[DEVICE_MULTIPLE];
389
390 #pragma unroll
391 for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
392 reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
393
394 IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
395 reinterpret_cast<DeviceWord*>(ptr),
396 words);
397}
398
399
403template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
404__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
405{
407 itr,
408 val,
411}
412
413
414
415#endif // DOXYGEN_SHOULD_SKIP_THIS
416
417 // end group UtilIo
419
420
421} // CUB namespace
422CUB_NS_POSTFIX // Optional outer namespace(s)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
CacheStoreModifier
Enumeration of cache modifiers for memory store operations.
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store...
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)
__device__ __forceinline__ void ThreadStoreVolatilePtr(T *ptr, T val, Int2Type< true >)
@ STORE_CS
Cache streaming (likely to be accessed once)
@ STORE_DEFAULT
Default (no modifier)
@ STORE_WT
Cache write-through (to system memory)
@ STORE_CG
Cache at global level.
@ STORE_VOLATILE
Volatile shared (any memory space)
@ STORE_WB
Cache write-back all coherent levels.
Optional outer namespace(s)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Pointer vs. iterator.
Helper structure for templated store iteration (inductive case)
Type traits.