OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_load.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/

/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[ITEM];
    }
}


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
        {
            items[ITEM] = thread_itr[ITEM];
        }
    }
}


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range,
 * with a fall-back assignment of out-of-bound elements.
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
}

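// Usage sketch (not part of the original file; the kernel and buffer names
// are hypothetical): each thread gathers four consecutive ints in blocked
// order, with reads past num_items falling back to -1.
//
//     __global__ void ExampleBlockedKernel(int *d_in, int num_items)
//     {
//         int thread_data[4];
//         cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data, num_items, -1);
//         // thread 0 now holds items 0..3, thread 1 holds items 4..7, ...
//     }
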
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Internal implementation for load vectorization
 */
template <
    CacheLoadModifier   MODIFIER,
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
    int    linear_tid,
    T      *block_ptr,
    T      (&items)[ITEMS_PER_THREAD])
{
    // Biggest memory access word that T is a whole multiple of
    typedef typename UnitWord<T>::DeviceWord DeviceWord;

    enum
    {
        TOTAL_WORDS         = sizeof(items) / sizeof(DeviceWord),

        VECTOR_SIZE         = (TOTAL_WORDS % 4 == 0) ?
                                  4 :
                                  (TOTAL_WORDS % 2 == 0) ?
                                      2 :
                                      1,

        VECTORS_PER_THREAD  = TOTAL_WORDS / VECTOR_SIZE,
    };

    // Vector type (reconstructed line: the CubVector typedef was dropped by
    // the extraction; CubVector names the corresponding CUDA vector type)
    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;

    // Vector items
    Vector vec_items[VECTORS_PER_THREAD];

    // Aliased input ptr
    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);

    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
    {
        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
    }

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
    }
}

#endif // DOXYGEN_SHOULD_SKIP_THIS


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block,
 * using CUDA's built-in vectorized loads as a coalescing optimization.
 */
template <
    typename    T,
    int         ITEMS_PER_THREAD>
__device__ __forceinline__ void LoadDirectBlockedVectorized(
    int    linear_tid,
    T      *block_ptr,
    T      (&items)[ITEMS_PER_THREAD])
{
    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
}

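// Usage sketch (illustrative, not from the original file): the vectorized
// path requires a native pointer aligned to the resulting vector word (an
// int4 here, since four ints pack into one vector load). Buffers returned by
// cudaMalloc satisfy this alignment; the kernel name is hypothetical.
//
//     __global__ void ExampleVectorizedKernel(int *d_in)
//     {
//         int thread_data[4];
//         // Issues one int4 load per thread instead of four scalar int loads
//         cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
//     }
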

/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/

/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block.
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    InputIteratorT thread_itr = block_itr + linear_tid;

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
    }
}


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    InputIteratorT thread_itr = block_itr + linear_tid;

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
        {
            items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
        }
    }
}


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range,
 * with a fall-back assignment of out-of-bound elements.
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}

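// Usage sketch (illustrative; names are hypothetical): with BLOCK_THREADS =
// 128 supplied as the first template argument, thread t receives items
// t, t+128, t+256, and t+384, so consecutive threads touch consecutive
// addresses and each warp's loads coalesce.
//
//     __global__ void ExampleStripedKernel(int *d_in, int num_items)
//     {
//         int thread_data[4];
//         cub::LoadDirectStriped<128>(threadIdx.x, d_in, thread_data, num_items, 0);
//     }
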

/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/

/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
    }
}


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
        }
    }
}


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range,
 * with a fall-back assignment of out-of-bound elements.
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
}

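// Usage sketch (illustrative; names are hypothetical): in warp-striped order,
// lane l of warp w reads item (w * 32 * ITEMS_PER_THREAD) + l and then
// strides by the warp width, so each warp covers one contiguous tile. The
// thread block size must be a multiple of the warp size.
//
//     __global__ void ExampleWarpStripedKernel(int *d_in, int num_items)
//     {
//         int thread_data[4];
//         cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data, num_items);
//     }
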

//@}  // end group UtilIo


//-----------------------------------------------------------------------------
// Generic BlockLoad abstraction
//-----------------------------------------------------------------------------

/**
 * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad
 * to read a linear segment of data from memory into a blocked arrangement across a
 * CUDA thread block.
 */
enum BlockLoadAlgorithm
{
    /// A blocked arrangement of data is read directly from memory.
    BLOCK_LOAD_DIRECT,

    /// A blocked arrangement of data is read from memory using CUDA's built-in
    /// vectorized loads as a coalescing optimization.
    BLOCK_LOAD_VECTORIZE,

    /// A striped arrangement of data is read efficiently from memory and then
    /// locally transposed into a blocked arrangement.
    BLOCK_LOAD_TRANSPOSE,

    /// A warp-striped arrangement of data is read efficiently from memory and
    /// then locally transposed into a blocked arrangement.
    BLOCK_LOAD_WARP_TRANSPOSE,

    /// Like BLOCK_LOAD_WARP_TRANSPOSE, but the local transposition is
    /// time-sliced to reduce shared memory pressure.
    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
};

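// Usage sketch (illustrative): the algorithm is a compile-time choice, made
// through the fourth template parameter of cub::BlockLoad (defined below):
//
//     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> BlockLoadT;
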

/**
 * \brief The BlockLoad class provides collective data movement methods for loading a linear
 * segment of items from memory into a blocked arrangement across a CUDA thread block.
 */
template <
    typename            InputT,
    int                 BLOCK_DIM_X,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  ALGORITHM = BLOCK_LOAD_DIRECT,
    int                 BLOCK_DIM_Y = 1,
    int                 BLOCK_DIM_Z = 1,
    int                 PTX_ARCH = CUB_PTX_ARCH>
class BlockLoad
{
private:

    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Load helper
    template <BlockLoadAlgorithm _POLICY, int DUMMY>
    struct LoadInternal;


    /**
     * BLOCK_LOAD_DIRECT specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back
        /// assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };


    /**
     * BLOCK_LOAD_VECTORIZE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory, specialized for native pointer types
        /// (attempts vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputT  *block_ptr,
            InputT  (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for native pointer types
        /// (attempts vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            const InputT    *block_ptr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for CacheModifiedInputIterator
        /// (attempts vectorization)
        template <
            CacheLoadModifier   MODIFIER,
            typename            ValueType,
            typename            OffsetT>
        __device__ __forceinline__ void Load(
            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,
            InputT                                                      (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for opaque input iterators
        /// (skips vectorization)
        template <typename _InputIteratorT>
        __device__ __forceinline__ void Load(
            _InputIteratorT block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back
        /// assignment of out-of-bound elements (skips vectorization)
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };


    /**
     * BLOCK_LOAD_TRANSPOSE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back
        /// assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

    };


    /**
     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back
        /// assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }
    };


    /**
     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys (time-sliced variant)
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back
        /// assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }
    };


    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal load implementation to use
    typedef LoadInternal<ALGORITHM, 0> InternalLoad;

    /// Shared memory storage layout type
    typedef typename InternalLoad::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// \smemstorage{BlockLoad}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/

    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockLoad()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockLoad(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /******************************************************************//**
     * \name Data movement
     *********************************************************************/

    /// Load a linear segment of items from memory.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD])
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
    }

    /// Load a linear segment of items from memory, guarded by range.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD],
        int             valid_items)
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
    }

    /// Load a linear segment of items from memory, guarded by range, with a fall-back
    /// assignment of out-of-bound elements.
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD],
        int             valid_items,
        DefaultT        oob_default)
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
    }

    //@}  // end group DataMovement

};

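// Usage sketch (illustrative; names are hypothetical): a 128-thread block
// cooperatively loads a tile of 512 ints into a blocked arrangement via the
// transpose-based algorithm, sharing one TempStorage allocation.
//
//     __global__ void ExampleBlockLoadKernel(int *d_in, int num_items)
//     {
//         typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> BlockLoadT;
//         __shared__ typename BlockLoadT::TempStorage temp_storage;
//
//         int thread_data[4];
//         BlockLoadT(temp_storage).Load(d_in, thread_data, num_items);
//     }
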

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)