OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
thread_store.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include <cuda.h>
37 
38 #include "../util_ptx.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
54 //-----------------------------------------------------------------------------
55 // Tags and constants
56 //-----------------------------------------------------------------------------
57 
/**
 * \brief Enumeration of cache modifiers for memory store operations.
 *
 * NOTE(review): the enumerator list was lost in extraction; restored from the
 * modifiers referenced throughout this file (STORE_DEFAULT / STORE_VOLATILE
 * specializations and the _CUB_STORE_ALL instantiations below).
 */
enum CacheStoreModifier
{
    STORE_DEFAULT,              ///< Default (no modifier)
    STORE_WB,                   ///< Cache write-back all coherent levels
    STORE_CG,                   ///< Cache at global level
    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
    STORE_WT,                   ///< Cache write-through (to system memory)
    STORE_VOLATILE,             ///< Volatile shared (any memory space)
};
70 
71 
/**
 * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.
 *
 * \tparam MODIFIER         <b>[inferred]</b> CacheStoreModifier enumeration
 * \tparam OutputIteratorT  <b>[inferred]</b> Output iterator type (may be a simple pointer)
 * \tparam T                <b>[inferred]</b> Value type being stored
 *
 * \param itr  Iterator (or pointer) addressing the destination
 * \param val  Value to store
 */
template <
    CacheStoreModifier MODIFIER,
    typename OutputIteratorT,
    typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
115 
116 
118 
119 
120 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
121 
122 
/// Helper structure for templated store iteration (inductive case).
/// Writes elements [COUNT, MAX) of vals, one element per instantiation,
/// so the recursion unrolls fully at compile time.
template <int COUNT, int MAX>
struct IterateThreadStore
{
    // Store vals[COUNT] through ptr + COUNT using the given cache modifier,
    // then recurse on the next element.
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T *ptr, T *vals)
    {
        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
    }

    // Assign vals[COUNT] through the (possibly volatile-qualified) iterator,
    // then recurse on the next element.
    // NOTE(review): the struct header line and this recursive call were
    // missing from the extracted source; restored to mirror Store() and the
    // IterateThreadStore<MAX, MAX> termination specialization.
    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
    {
        ptr[COUNT] = vals[COUNT];
        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
    }
};
142 
/// Helper structure for templated store iteration (termination case).
/// Both compile-time recursions bottom out here and perform no work.
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/)
    {}

    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/)
    {}
};
153 
154 
/**
 * Define 16-byte ThreadStore specializations (uint4 and ulonglong2) for the
 * given cub_modifier, emitting inline-PTX "st" instructions tagged with the
 * corresponding ptx_modifier cache operator.
 */
#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :                \
            _CUB_ASM_PTR_(ptr),                                                              \
            "r"(val.x),                                                                      \
            "r"(val.y),                                                                      \
            "r"(val.z),                                                                      \
            "r"(val.w));                                                                     \
    }                                                                                        \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                        \
            _CUB_ASM_PTR_(ptr),                                                              \
            "l"(val.x),                                                                      \
            "l"(val.y));                                                                     \
    }
177 
178 
/**
 * Define 8-byte ThreadStore specializations (ushort4, uint2, and
 * unsigned long long) for the given cub_modifier, emitting inline-PTX "st"
 * instructions tagged with the corresponding ptx_modifier cache operator.
 */
#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :                \
            _CUB_ASM_PTR_(ptr),                                                              \
            "h"(val.x),                                                                      \
            "h"(val.y),                                                                      \
            "h"(val.z),                                                                      \
            "h"(val.w));                                                                     \
    }                                                                                        \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                        \
            _CUB_ASM_PTR_(ptr),                                                              \
            "r"(val.x),                                                                      \
            "r"(val.y));                                                                     \
    }                                                                                        \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                 \
            _CUB_ASM_PTR_(ptr),                                                              \
            "l"(val));                                                                       \
    }
208 
/**
 * Define a 4-byte ThreadStore specialization (unsigned int) for the given
 * cub_modifier, emitting an inline-PTX "st" instruction tagged with the
 * corresponding ptx_modifier cache operator.
 */
#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                 \
            _CUB_ASM_PTR_(ptr),                                                              \
            "r"(val));                                                                       \
    }
220 
221 
/**
 * Define a 2-byte ThreadStore specialization (unsigned short) for the given
 * cub_modifier, emitting an inline-PTX "st" instruction tagged with the
 * corresponding ptx_modifier cache operator.
 */
#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                 \
            _CUB_ASM_PTR_(ptr),                                                              \
            "h"(val));                                                                       \
    }
233 
234 
/**
 * Define a 1-byte ThreadStore specialization (unsigned char) for the given
 * cub_modifier.  PTX has no 8-bit register constraint, so the value is passed
 * as u16 and narrowed to a u8 scratch register before the tagged store.
 */
#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
    {                                                                                        \
        asm volatile (                                                                       \
        "{"                                                                                  \
        "   .reg .u8 datum;"                                                                 \
        "   cvt.u8.u16 datum, %1;"                                                           \
        "   st."#ptx_modifier".u8 [%0], datum;"                                              \
        "}" : :                                                                              \
            _CUB_ASM_PTR_(ptr),                                                              \
            "h"((unsigned short) val));                                                      \
    }
251 
/**
 * Define ThreadStore specializations for all store granularities
 * (16/8/4/2/1 bytes) for the given cub_modifier / ptx_modifier pair.
 *
 * NOTE(review): dropped the stray trailing line-continuation after the last
 * expansion — it silently spliced the following line into the macro body.
 */
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
    _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
    _CUB_STORE_1(cub_modifier, ptx_modifier)

262 
/**
 * Instantiate ThreadStore specializations for each cache modifier.
 * NOTE(review): the SM20+ branch was missing from the extracted source;
 * restored with the standard CUB mapping (st.wb / st.cg / st.cs / st.wt).
 * Pre-SM20 PTX lacks these cache operators, so the modifiers degrade to
 * plain global stores (volatile for STORE_WT).
 */
#if CUB_PTX_ARCH >= 200
    _CUB_STORE_ALL(STORE_WB, wb)
    _CUB_STORE_ALL(STORE_CG, cg)
    _CUB_STORE_ALL(STORE_CS, cs)
    _CUB_STORE_ALL(STORE_WT, wt)
#else
    _CUB_STORE_ALL(STORE_WB, global)
    _CUB_STORE_ALL(STORE_CG, global)
    _CUB_STORE_ALL(STORE_CS, global)
    _CUB_STORE_ALL(STORE_WT, volatile.global)
#endif
277 
278 
279 // Macro cleanup
280 #undef _CUB_STORE_ALL
281 #undef _CUB_STORE_1
282 #undef _CUB_STORE_2
283 #undef _CUB_STORE_4
284 #undef _CUB_STORE_8
285 #undef _CUB_STORE_16
286 
287 
/// ThreadStore definition for STORE_DEFAULT on iterator types: a plain
/// assignment through the iterator, with no cache modifier applied.
template <typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(
    OutputIteratorT itr, T val, Int2Type<STORE_DEFAULT> /*modifier*/, Int2Type<false> /*is_pointer*/)
{
    *itr = val;
}
300 
301 
/// ThreadStore definition for STORE_DEFAULT on pointer types: a plain
/// dereference-and-assign, with no cache modifier applied.
template <typename T>
__device__ __forceinline__ void ThreadStore(
    T *ptr, T val, Int2Type<STORE_DEFAULT> /*modifier*/, Int2Type<true> /*is_pointer*/)
{
    *ptr = val;
}
314 
315 
/// Helper for volatile-storing a primitive T: a single assignment through a
/// volatile-qualified view of the destination pointer.
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
    T *ptr, T val, Int2Type<true> /*is_primitive*/)
{
    volatile T *dest = reinterpret_cast<volatile T*>(ptr);
    *dest = val;
}
327 
328 
/// Helper for volatile-storing a non-primitive T: copy the value into an
/// array of volatile-storable words, then write each word through a
/// volatile-qualified view of the destination.
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
    T *ptr,
    T val,
    Int2Type<false> /*is_primitive*/)
{
    // Create a temporary using shuffle-words, then store using volatile-words
    typedef typename UnitWord<T>::VolatileWord VolatileWord;
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);

    VolatileWord words[VOLATILE_MULTIPLE];

    #pragma unroll
    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];

    // NOTE(review): the call head below was missing from the extracted
    // source (only its arguments survived); restored to write the words
    // through a volatile view of ptr via the unrolled helper.
    IterateThreadStore<0, VOLATILE_MULTIPLE>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        words);
}
355 
356 
/// ThreadStore definition for STORE_VOLATILE on pointer types.
template <typename T>
__device__ __forceinline__ void ThreadStore(
    T *ptr,
    T val,
    Int2Type<STORE_VOLATILE> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    // NOTE(review): body was missing from the extracted source; restored to
    // dispatch on whether T is a primitive (single volatile assignment) or
    // must be written word-by-word through a volatile view.
    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
}
369 
370 
/// ThreadStore definition for generic cache modifiers on pointer types:
/// reinterpret the value as an array of device words (copied in at
/// shuffle-word granularity), then emit one cache-modified store per word.
template <typename T, int MODIFIER>
__device__ __forceinline__ void ThreadStore(
    T *ptr,
    T val,
    Int2Type<MODIFIER> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    typedef typename UnitWord<T>::DeviceWord DeviceWord;
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int DEVICE_MULTIPLE  = sizeof(T) / sizeof(DeviceWord);
    const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);

    // Stage the value in a word-aligned temporary.
    DeviceWord raw[DEVICE_MULTIPLE];
    ShuffleWord *dst = reinterpret_cast<ShuffleWord*>(raw);
    ShuffleWord *src = reinterpret_cast<ShuffleWord*>(&val);

    #pragma unroll
    for (int word = 0; word < SHUFFLE_MULTIPLE; ++word)
        dst[word] = src[word];

    // Unrolled, modifier-tagged stores of each device word.
    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(ptr),
        raw);
}
398 
399 
/// Generic ThreadStore entry point: tag-dispatch on the cache modifier and
/// on whether OutputIteratorT is a native pointer.
template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
{
    // NOTE(review): the two Int2Type dispatch arguments were missing from
    // the extracted source; restored per the tag-dispatch overloads above.
    ThreadStore(
        itr,
        val,
        Int2Type<MODIFIER>(),
        Int2Type<IsPointer<OutputIteratorT>::VALUE>());
}
412 
413 
414 
415 #endif // DOXYGEN_SHOULD_SKIP_THIS
416 
417  // end group UtilIo
419 
420 
421 } // CUB namespace
422 CUB_NS_POSTFIX // Optional outer namespace(s)
Volatile shared (any memory space)
Type traits.
Definition: util_type.cuh:1158
Default (no modifier)
Cache at global level.
Optional outer namespace(s)
Cache write-back all coherent levels.
Cache streaming (likely to be accessed once)
CacheStoreModifier
Enumeration of cache modifiers for memory store operations.
Cache write-through (to system memory)
__device__ __forceinline__ void ThreadStoreVolatilePtr(T *ptr, T val, Int2Type< true >)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.
Helper structure for templated store iteration (inductive case)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
Pointer vs. iterator.
Definition: util_type.cuh:170
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)