OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_store.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * Operations for writing linear segments of data from the CUDA thread block
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */

/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/
//@{
/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[ITEM] = items[ITEM];
    }
}

/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
        {
            thread_itr[ITEM] = items[ITEM];
        }
    }
}

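/*
 * Illustrative usage sketch (not part of the original header): a hypothetical
 * kernel that writes four items per thread with StoreDirectBlocked. The kernel
 * name, item count, and buffer arguments are assumptions chosen for the example.
 *
 * \code
 * __global__ void ExampleStoreBlockedKernel(int *d_out, int num_items)
 * {
 *     // Each thread owns 4 consecutive items in registers (blocked arrangement)
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x * 4 + i;
 *
 *     // Unguarded store of the full tile ...
 *     cub::StoreDirectBlocked(threadIdx.x, d_out, items);
 *
 *     // ... or a guarded store when the segment may be partially full
 *     cub::StoreDirectBlocked(threadIdx.x, d_out, items, num_items);
 * }
 * \endcode
 */
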
/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items (attempts vectorization).
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int                 linear_tid,
    T                   *block_ptr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias global pointer
    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}

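/*
 * Illustrative usage sketch (not part of the original header): vectorized
 * stores require a raw pointer that is suitably aligned for the vector type,
 * so the hypothetical kernel below passes the native pointer d_out directly.
 *
 * \code
 * __global__ void ExampleStoreVectorizedKernel(int *d_out)
 * {
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x * 4 + i;
 *
 *     // With 4 ints per thread this resolves to one int4 store per thread
 *     cub::StoreDirectBlockedVectorized(threadIdx.x, d_out, items);
 * }
 * \endcode
 */
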
//@}  end member group
/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
    }
}

/**
 * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
        {
            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
        }
    }
}

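/*
 * Illustrative usage sketch (not part of the original header): a hypothetical
 * 128-thread block storing a striped arrangement, where thread i owns output
 * items i, i+128, i+256, ... The block size must be passed explicitly as the
 * BLOCK_THREADS template argument.
 *
 * \code
 * __global__ void ExampleStoreStripedKernel(int *d_out, int num_items)
 * {
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x + i * 128;
 *
 *     // Guarded striped store of at most num_items elements
 *     cub::StoreDirectStriped<128>(threadIdx.x, d_out, items, num_items);
 * }
 * \endcode
 */
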
//@}  end member group
/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}

/**
 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
        }
    }
}

//@}  end member group

/** @} */       // end group UtilIo

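/*
 * Illustrative usage sketch (not part of the original header): in a
 * warp-striped arrangement each warp owns a contiguous chunk of
 * WARP_THREADS * ITEMS_PER_THREAD output items, and lane l writes items
 * l, l+32, l+64, ... within its chunk (assuming 32-thread warps). The kernel
 * name and initialization are assumptions chosen for the example.
 *
 * \code
 * __global__ void ExampleStoreWarpStripedKernel(int *d_out)
 * {
 *     int items[4];
 *     int lane = threadIdx.x & 31;
 *     int warp = threadIdx.x >> 5;
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = warp * 128 + lane + i * 32;   // matches the output slot each value lands in
 *
 *     cub::StoreDirectWarpStriped(threadIdx.x, d_out, items);
 * }
 * \endcode
 */
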
//-----------------------------------------------------------------------------
// Generic BlockStore abstraction
//-----------------------------------------------------------------------------

/**
 * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
 */
enum BlockStoreAlgorithm
{
    /// Store a blocked arrangement directly to memory
    BLOCK_STORE_DIRECT,

    /// Store a blocked arrangement directly to memory using CUDA vector types (native pointers only)
    BLOCK_STORE_VECTORIZE,

    /// Transpose the blocked arrangement into a striped arrangement in shared memory, then store it directly
    BLOCK_STORE_TRANSPOSE,

    /// Transpose the blocked arrangement into a warp-striped arrangement in shared memory, then store it directly
    BLOCK_STORE_WARP_TRANSPOSE,

    /// Same as BLOCK_STORE_WARP_TRANSPOSE, but using a time-sliced exchange that needs less shared memory
    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
};

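/*
 * Illustrative sketch (not part of the original header): the algorithm is
 * normally chosen through the ALGORITHM template parameter of BlockStore,
 * e.g. a 128-thread block storing 4 ints per thread via shared-memory
 * transposition:
 *
 * \code
 * typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 * \endcode
 */
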
/**
 * \brief The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    int                     ITEMS_PER_THREAD,
    BlockStoreAlgorithm     ALGORITHM = BLOCK_STORE_DIRECT,
    int                     BLOCK_DIM_Y = 1,
    int                     BLOCK_DIM_Z = 1,
    int                     PTX_ARCH = CUB_PTX_ARCH>
class BlockStore
{
private:
    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Store helper
    template <BlockStoreAlgorithm _POLICY, int DUMMY>
    struct StoreInternal;

    /**
     * BLOCK_STORE_DIRECT specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };

    /**
     * BLOCK_STORE_VECTORIZE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
        __device__ __forceinline__ void Store(
            T                   *block_ptr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };

    /**
     * BLOCK_STORE_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /**
     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /**
     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys (time-sliced variant)
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal store implementation to use
    typedef StoreInternal<ALGORITHM, 0> InternalStore;

    /// Shared memory storage layout type
    typedef typename InternalStore::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// \smemstorage{BlockStore}
    struct TempStorage : Uninitialized<_TempStorage> {};

    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockStore()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockStore(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{

    /**
     * \brief Store items into a linear segment of memory.
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,
        T                   (&items)[ITEMS_PER_THREAD])
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
    }

    /**
     * \brief Store items into a linear segment of memory, guarded by range.
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,
        T                   (&items)[ITEMS_PER_THREAD],
        int                 valid_items)
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
    }

    //@}  end member group
};

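/*
 * Illustrative usage sketch (not part of the original header): a hypothetical
 * kernel that couples BlockStore with its TempStorage. The block size, items
 * per thread, and the guarded-store call are assumptions chosen for the example.
 *
 * \code
 * __global__ void ExampleBlockStoreKernel(int *d_out, int valid_items)
 * {
 *     // Specialize BlockStore for 128 threads owning 4 ints each
 *     typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 *
 *     // Shared memory for the collective
 *     __shared__ typename BlockStoreT::TempStorage temp_storage;
 *
 *     // Blocked arrangement of items held in registers
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x * 4 + i;
 *
 *     // Guarded store of at most valid_items elements
 *     BlockStoreT(temp_storage).Store(d_out, items, valid_items);
 * }
 * \endcode
 */
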
}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)