OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures
thread_load.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Thread utilities for reading memory using PTX cache modifiers.
 */

#pragma once

#include <cuda.h>

#include <iterator>

#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */

//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------

/**
 * \brief Enumeration of cache modifiers for memory load operations.
 */
enum CacheLoadModifier
{
    LOAD_DEFAULT,       ///< Default (no modifier)
    LOAD_CA,            ///< Cache at all levels
    LOAD_CG,            ///< Cache at global level
    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
    LOAD_CV,            ///< Cache as volatile (including cached system lines)
    LOAD_LDG,           ///< Cache as texture
    LOAD_VOLATILE,      ///< Volatile (any memory space)
};

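/**
 * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.
 * Can be used to load any data type.
 *
 * \par Example (a minimal sketch; the device buffers \p d_in and \p d_in_s are illustrative)
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
 *
 * // 32-bit load using cache-global modifier:
 * int *d_in;
 * int val = cub::ThreadLoad<cub::LOAD_CG>(d_in + threadIdx.x);
 *
 * // 16-bit load using default modifier:
 * short *d_in_s;
 * short val_s = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in_s + threadIdx.x);
 * \endcode
 */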
template <
    CacheLoadModifier MODIFIER,
    typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


/// Helper structure for templated load iteration (inductive case)
template <int COUNT, int MAX>
struct IterateThreadLoad
{
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
    {
        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
    }

    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
    {
        vals[COUNT] = itr[COUNT];
        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
    }
};


/// Helper structure for templated load iteration (termination case)
template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}

    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
};

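// For illustration: a call such as
//     IterateThreadLoad<0, 4>::template Load<LOAD_CG>(ptr, vals);
// unrolls at compile time into
//     vals[0] = ThreadLoad<LOAD_CG>(ptr + 0);
//     ...
//     vals[3] = ThreadLoad<LOAD_CG>(ptr + 3);
// with the <MAX, MAX> specialization above terminating the recursion.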

/**
 * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr) \
    { \
        uint4 retval; \
        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \
            "=r"(retval.x), \
            "=r"(retval.y), \
            "=r"(retval.z), \
            "=r"(retval.w) : \
            _CUB_ASM_PTR_(ptr)); \
        return retval; \
    } \
    template<> \
    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr) \
    { \
        ulonglong2 retval; \
        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \
            "=l"(retval.x), \
            "=l"(retval.y) : \
            _CUB_ASM_PTR_(ptr)); \
        return retval; \
    }

/**
 * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr) \
    { \
        ushort4 retval; \
        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \
            "=h"(retval.x), \
            "=h"(retval.y), \
            "=h"(retval.z), \
            "=h"(retval.w) : \
            _CUB_ASM_PTR_(ptr)); \
        return retval; \
    } \
    template<> \
    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr) \
    { \
        uint2 retval; \
        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \
            "=r"(retval.x), \
            "=r"(retval.y) : \
            _CUB_ASM_PTR_(ptr)); \
        return retval; \
    } \
    template<> \
    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr) \
    { \
        unsigned long long retval; \
        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \
            "=l"(retval) : \
            _CUB_ASM_PTR_(ptr)); \
        return retval; \
    }

/**
 * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr) \
    { \
        unsigned int retval; \
        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \
            "=r"(retval) : \
            _CUB_ASM_PTR_(ptr)); \
        return retval; \
    }

/**
 * Define a ushort (2B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr) \
    { \
        unsigned short retval; \
        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \
            "=h"(retval) : \
            _CUB_ASM_PTR_(ptr)); \
        return retval; \
    }

/**
 * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
 */
#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr) \
    { \
        unsigned short retval; \
        asm volatile ( \
            "{" \
            "   .reg .u8 datum;" \
            "    ld."#ptx_modifier".u8 datum, [%1];" \
            "    cvt.u16.u8 %0, datum;" \
            "}" : \
            "=h"(retval) : \
            _CUB_ASM_PTR_(ptr)); \
        return (unsigned char) retval; \
    }

/**
 * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
 */
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \
    _CUB_LOAD_16(cub_modifier, ptx_modifier) \
    _CUB_LOAD_8(cub_modifier, ptx_modifier) \
    _CUB_LOAD_4(cub_modifier, ptx_modifier) \
    _CUB_LOAD_2(cub_modifier, ptx_modifier) \
    _CUB_LOAD_1(cub_modifier, ptx_modifier)

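// For example, _CUB_LOAD_4(LOAD_CG, cg) expands to the specialization
//
//     template<>
//     __device__ __forceinline__ unsigned int ThreadLoad<LOAD_CG, unsigned int const *>(unsigned int const *ptr)
//     {
//         unsigned int retval;
//         asm volatile ("ld.cg.u32 %0, [%1];" : "=r"(retval) : _CUB_ASM_PTR_(ptr));
//         return retval;
//     }
//
// i.e., one inline-PTX load instruction per (modifier, width) pairing.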

/**
 * Define ThreadLoad specializations for the various Cache load modifiers
 */
#if CUB_PTX_ARCH >= 200
    _CUB_LOAD_ALL(LOAD_CA, ca)
    _CUB_LOAD_ALL(LOAD_CG, cg)
    _CUB_LOAD_ALL(LOAD_CS, cs)
    _CUB_LOAD_ALL(LOAD_CV, cv)
#else
    _CUB_LOAD_ALL(LOAD_CA, global)
    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
    _CUB_LOAD_ALL(LOAD_CS, global)
    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
#endif

#if CUB_PTX_ARCH >= 350
    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
#else
    _CUB_LOAD_ALL(LOAD_LDG, global)
#endif


// Macro cleanup
#undef _CUB_LOAD_ALL
#undef _CUB_LOAD_1
#undef _CUB_LOAD_2
#undef _CUB_LOAD_4
#undef _CUB_LOAD_8
#undef _CUB_LOAD_16

/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
 */
template <typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
    InputIteratorT itr,
    Int2Type<LOAD_DEFAULT> /*modifier*/,
    Int2Type<false> /*is_pointer*/)
{
    return *itr;
}


/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T *ptr,
    Int2Type<LOAD_DEFAULT> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    return *ptr;
}


/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T *ptr,
    Int2Type<true> /*is_primitive*/)
{
    T retval = *reinterpret_cast<volatile T*>(ptr);
    return retval;
}


/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T *ptr,
    Int2Type<false> /*is_primitive*/)
{
    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying

    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
/*
    VolatileWord words[VOLATILE_MULTIPLE];

    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        words);

    return *reinterpret_cast<T*>(words);
*/

    T retval;
    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        words);
    return retval;
}


/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T *ptr,
    Int2Type<LOAD_VOLATILE> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    // Apply tags for partial-specialization
    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
}


/**
 * ThreadLoad definition for generic modifiers on pointer types
 */
template <typename T, int MODIFIER>
__device__ __forceinline__ T ThreadLoad(
    T const *ptr,
    Int2Type<MODIFIER> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    typedef typename UnitWord<T>::DeviceWord DeviceWord;

    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);

    DeviceWord words[DEVICE_MULTIPLE];

    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
        words);

    return *reinterpret_cast<T*>(words);
}
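// For example, loading a 16-byte aggregate whose UnitWord<T>::DeviceWord is
// unsigned long long gives DEVICE_MULTIPLE == 2: the unrolled iteration above
// issues two 8-byte cache-modified loads into words[], and the result is
// reassembled by the final reinterpret_cast.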


/**
 * ThreadLoad definition for generic modifiers
 */
template <
    CacheLoadModifier MODIFIER,
    typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
{
    // Apply tags for partial-specialization
    return ThreadLoad(
        itr,
        Int2Type<MODIFIER>(),
        Int2Type<IsPointer<InputIteratorT>::VALUE>());
}



#endif // DOXYGEN_SHOULD_SKIP_THIS

/** @} */       // end group UtilIo


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)