OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
block_store.cuh
Go to the documentation of this file.
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Operations for writing linear segments of data from the CUDA thread block
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */

/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/
//@{

/// Store a blocked arrangement of items across a thread block into a linear segment of items.
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[ITEM] = items[ITEM];
    }
}


/// Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
        {
            thread_itr[ITEM] = items[ITEM];
        }
    }
}
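
/*
 * Illustrative sketch of the blocked mapping (the kernel and variable names
 * below, such as ExampleStoreBlockedKernel, d_out, and thread_data, are
 * hypothetical):
 *
 *     __global__ void ExampleStoreBlockedKernel(int *d_out)
 *     {
 *         int thread_data[4];
 *         ...                                              // produce thread_data
 *         cub::StoreDirectBlocked(threadIdx.x, d_out, thread_data);
 *     }
 *
 * With ITEMS_PER_THREAD == 4, item ITEM of thread linear_tid is written to
 * d_out[linear_tid * 4 + ITEM], i.e. thread 0 owns d_out[0..3], thread 1 owns
 * d_out[4..7], and so on.
 */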


/// Store a blocked arrangement of items across a thread block into a linear segment of items.
template <
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread
    T                   *block_ptr,                 ///< [in] The thread block's base output pointer for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias global pointer
    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}
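
/*
 * Illustrative note on the vectorization above: with T == int and
 * ITEMS_PER_THREAD == 4, VEC_SIZE resolves to 4 and each thread issues a
 * single int4 store; with ITEMS_PER_THREAD == 3 the power-of-two test fails
 * and VEC_SIZE falls back to 1 (plain element-wise stores).  Vectorized
 * stores assume block_ptr is suitably aligned for the resulting vector type;
 * when that cannot be guaranteed, the iterator-based StoreDirectBlocked
 * overloads are the safer choice.
 */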


//@}  end member group
/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/
//@{

/// Store a striped arrangement of data across the thread block into a linear segment of items.
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
    }
}


/// Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
        {
            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
        }
    }
}
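
/*
 * Illustrative sketch of the striped mapping (kernel and buffer names are
 * hypothetical):
 *
 *     __global__ void ExampleStoreStripedKernel(int *d_out)
 *     {
 *         int thread_data[4];
 *         ...                                              // produce thread_data
 *         cub::StoreDirectStriped<128>(threadIdx.x, d_out, thread_data);
 *     }
 *
 * With BLOCK_THREADS == 128 and ITEMS_PER_THREAD == 4, thread t writes to
 * d_out[t], d_out[t + 128], d_out[t + 256], and d_out[t + 384]; consecutive
 * threads touch consecutive addresses on every iteration, which keeps the
 * stores coalesced.
 */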


//@}  end member group
/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/
//@{

/// Store a warp-striped arrangement of data across the thread block into a linear segment of items.
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}


/// Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread
    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
    int                 valid_items)                ///< [in] Number of valid items to write
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
        }
    }
}
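
/*
 * Illustrative note on the warp-striped mapping: each warp owns a contiguous
 * segment of CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD items.  Assuming
 * 32-thread warps and ITEMS_PER_THREAD == 4, lane l of warp w writes to
 * block_itr[w*128 + l], block_itr[w*128 + l + 32], block_itr[w*128 + l + 64],
 * and block_itr[w*128 + l + 96], so each warp's stores remain coalesced
 * without any cross-warp coordination.
 */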


//@}  end member group

/** @} */       // end group UtilIo


//-----------------------------------------------------------------------------
// Generic BlockStore abstraction
//-----------------------------------------------------------------------------

/// cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a
/// blocked arrangement of items across a CUDA thread block to a linear segment of memory.
enum BlockStoreAlgorithm
{
    BLOCK_STORE_DIRECT,                     ///< Write the blocked arrangement directly to memory
    BLOCK_STORE_VECTORIZE,                  ///< Write the blocked arrangement using vectorized stores when possible
    BLOCK_STORE_TRANSPOSE,                  ///< Transpose to a striped arrangement (via BlockExchange), then write directly
    BLOCK_STORE_WARP_TRANSPOSE,             ///< Transpose to a warp-striped arrangement (via BlockExchange), then write directly
    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,  ///< As BLOCK_STORE_WARP_TRANSPOSE, but with a time-sliced shared-memory exchange
};


/// The BlockStore class provides collective data movement methods for writing a blocked
/// arrangement of items partitioned across a CUDA thread block to a linear segment of memory.
template <
    typename                T,
    int                     BLOCK_DIM_X,
    int                     ITEMS_PER_THREAD,
    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
    int                     BLOCK_DIM_Y         = 1,
    int                     BLOCK_DIM_Z         = 1,
    int                     PTX_ARCH            = CUB_PTX_ARCH>
class BlockStore
{
private:

    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Store helper
    template <BlockStoreAlgorithm _POLICY, int DUMMY>
    struct StoreInternal;


    /**
     * BLOCK_STORE_DIRECT specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_VECTORIZE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
        __device__ __forceinline__ void Store(
            T                   *block_ptr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };


    /**
     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };


    /**
     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal load implementation to use
    typedef StoreInternal<ALGORITHM, 0> InternalStore;


    /// Shared memory storage layout type
    typedef typename InternalStore::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// \smemstorage{BlockStore}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockStore()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockStore(
        TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{

    /// Store items into a linear segment of memory
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
    }

    /// Store items into a linear segment of memory, guarded by range
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
        int                 valid_items)                ///< [in] Number of valid items to write
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
    }
};
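
/*
 * Usage sketch for BlockStore (illustrative; the kernel and buffer names
 * below, such as ExampleBlockStoreKernel and d_out, are hypothetical):
 * a thread block of 128 threads cooperatively writing 512 ints that are
 * partitioned 4 per thread in blocked order.
 *
 *     __global__ void ExampleBlockStoreKernel(int *d_out, int num_valid)
 *     {
 *         // Specialize BlockStore for 128 threads owning 4 ints each,
 *         // using the warp-transpose algorithm
 *         typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
 *
 *         // Allocate shared memory for the collective
 *         __shared__ typename BlockStore::TempStorage temp_storage;
 *
 *         int thread_data[4];
 *         ...                                              // produce thread_data
 *
 *         // Unguarded store of the full 512-item tile
 *         BlockStore(temp_storage).Store(d_out, thread_data);
 *
 *         // Or, when only num_valid items should be written:
 *         // BlockStore(temp_storage).Store(d_out, thread_data, num_valid);
 *     }
 */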


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)