OpenFPM_pdata 4.1.0
Project that contains the implementation of distributed structures.
block_load.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Operations for reading linear segments of data from memory into the CUDA thread block.
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[ITEM];
    }
}

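// Usage sketch (not part of the original header): each thread pulls
// ITEMS_PER_THREAD consecutive items starting at linear_tid * ITEMS_PER_THREAD.
// The kernel and buffer names below are illustrative assumptions.
//
//     __global__ void ExampleKernel(int *d_in)
//     {
//         int thread_data[4];  // thread i receives items [4*i, 4*i + 3]
//         cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);
//     }
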
/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
        {
            items[ITEM] = thread_itr[ITEM];
        }
    }
}

/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
}

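// Usage sketch (illustrative, assuming a partially full tile): out-of-range
// reads are skipped, and with the five-argument overload every slot is first
// seeded with oob_default. Names below are assumptions for the example.
//
//     __global__ void GuardedExampleKernel(int *d_in, int valid_items)
//     {
//         int thread_data[4];
//         // Slots past valid_items keep the fall-back value -1
//         cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data, valid_items, -1);
//     }
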
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Internal implementation for load vectorization
 */
template <
    CacheLoadModifier   MODIFIER,
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
    int    linear_tid,
    T      *block_ptr,
    T      (&items)[ITEMS_PER_THREAD])
{
    // Biggest memory access word that T is a whole multiple of
    typedef typename UnitWord<T>::DeviceWord DeviceWord;

    enum
    {
        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),

        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
            4 :
            (TOTAL_WORDS % 2 == 0) ?
                2 :
                1,

        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
    };

    // Vector type
    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;

    // Vector items
    Vector vec_items[VECTORS_PER_THREAD];

    // Aliased input ptr
    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);

    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
    {
        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
    }

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
    }
}

#endif // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block (attempts vectorization).
 */
template <
    typename    T,
    int         ITEMS_PER_THREAD>
__device__ __forceinline__ void LoadDirectBlockedVectorized(
    int    linear_tid,
    T      *block_ptr,
    T      (&items)[ITEMS_PER_THREAD])
{
    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
}

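// Usage sketch (illustrative): same calling convention as LoadDirectBlocked,
// but block_ptr must be a native pointer (not an opaque iterator) and should
// be suitably aligned for the widened vector word. Names are assumptions.
//
//     __global__ void VectorizedExampleKernel(int *d_in)
//     {
//         int thread_data[4];  // 4 ints can fold into a single int4 load here
//         cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
//     }
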
/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block.
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    InputIteratorT thread_itr = block_itr + linear_tid;

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
    }
}

/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    InputIteratorT thread_itr = block_itr + linear_tid;

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
        {
            items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
        }
    }
}

/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}

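// Usage sketch (illustrative): in a striped arrangement thread i reads items
// i, i + BLOCK_THREADS, i + 2*BLOCK_THREADS, ..., so BLOCK_THREADS must be
// supplied explicitly as the first template argument. Names are assumptions.
//
//     __global__ void StripedExampleKernel(int *d_in)  // launched with 128 threads
//     {
//         int thread_data[4];
//         cub::LoadDirectStriped<128>(threadIdx.x, d_in, thread_data);
//     }
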
/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
    }
}

/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
        }
    }
}

/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        items[ITEM] = oob_default;

    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
}

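// Usage sketch (illustrative): each warp reads a contiguous chunk of
// WARP_THREADS * ITEMS_PER_THREAD items, striped across its own lanes; the
// block size must therefore be a whole multiple of the warp size. Names are
// assumptions.
//
//     __global__ void WarpStripedExampleKernel(int *d_in)  // block size multiple of 32
//     {
//         int thread_data[4];
//         cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data);
//     }
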
//@}  end member group

/** @} */       // end group UtilIo

//-----------------------------------------------------------------------------
// Generic BlockLoad abstraction
//-----------------------------------------------------------------------------

/**
 * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
 */
enum BlockLoadAlgorithm
{
    /// A blocked arrangement of data is read directly from memory.
    BLOCK_LOAD_DIRECT,

    /// A blocked arrangement of data is read directly from memory using CUDA's inbuilt vectorized loads as a coalescing optimization.
    BLOCK_LOAD_VECTORIZE,

    /// A striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement.
    BLOCK_LOAD_TRANSPOSE,

    /// A warp-striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement.
    BLOCK_LOAD_WARP_TRANSPOSE,

    /// Like \p BLOCK_LOAD_WARP_TRANSPOSE, but the local transposition is time-sliced to reduce shared memory pressure.
    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
};

/**
 * \brief The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.
 */
template <
    typename            InputT,
    int                 BLOCK_DIM_X,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  ALGORITHM = BLOCK_LOAD_DIRECT,
    int                 BLOCK_DIM_Y = 1,
    int                 BLOCK_DIM_Z = 1,
    int                 PTX_ARCH = CUB_PTX_ARCH>
class BlockLoad
{
private:

    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Load helper
    template <BlockLoadAlgorithm _POLICY, int DUMMY>
    struct LoadInternal;

    /**
     * BLOCK_LOAD_DIRECT specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };

    /**
     * BLOCK_LOAD_VECTORIZE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputT  *block_ptr,
            InputT  (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            const InputT    *block_ptr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
        template <
            CacheLoadModifier   MODIFIER,
            typename            ValueType,
            typename            OffsetT>
        __device__ __forceinline__ void Load(
            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,
            InputT                                                      (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
        template <typename _InputIteratorT>
        __device__ __forceinline__ void Load(
            _InputIteratorT   block_itr,
            InputT            (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };

    /**
     * BLOCK_LOAD_TRANSPOSE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).StripedToBlocked(items, items);
        }

    };

    /**
     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }
    };

    /**
     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys (time-sliced variant)
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
        }
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal load implementation to use
    typedef LoadInternal<ALGORITHM, 0> InternalLoad;

    /// Shared memory storage layout type
    typedef typename InternalLoad::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// \smemstorage{BlockLoad}
    struct TempStorage : Uninitialized<_TempStorage> {};

    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockLoad()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockLoad(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{

    /**
     * \brief Load a linear segment of items from memory.
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD])
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
    }

    /**
     * \brief Load a linear segment of items from memory, guarded by range.
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD],
        int             valid_items)
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
    }

    /**
     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
     */
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD],
        int             valid_items,
        DefaultT        oob_default)
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
    }

    //@}  end member group

};
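
// Usage sketch (illustrative, patterned on the CUB documentation snippets):
// specialize BlockLoad, allocate its TempStorage in shared memory, and load a
// tile of 512 ints (128 threads x 4 items each) into a blocked arrangement.
// ExampleKernel and d_data are assumed names.
//
//     #include <cub/cub.cuh>
//
//     __global__ void ExampleKernel(int *d_data)
//     {
//         // Specialize for a 1D block of 128 threads owning 4 ints each,
//         // read warp-striped and locally transposed to blocked order
//         typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
//
//         // Shared memory backing the local transpose
//         __shared__ typename BlockLoad::TempStorage temp_storage;
//
//         int thread_data[4];
//         BlockLoad(temp_storage).Load(d_data, thread_data);
//     }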

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)