OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
warp_scan.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
38 #include "../thread/thread_operators.cuh"
39 #include "../util_arch.cuh"
40 #include "../util_type.cuh"
41 #include "../util_namespace.cuh"
42 
44 CUB_NS_PREFIX
45 
47 namespace cub {
48 
142 template <
143  typename T,
144  int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
145  int PTX_ARCH = CUB_PTX_ARCH>
146 class WarpScan
147 {
148 private:
149 
150  /******************************************************************************
151  * Constants and type definitions
152  ******************************************************************************/
153 
154  enum
155  {
157  IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
158 
160  IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
161 
163  IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
164  };
165 
167  typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
170 
172  typedef typename InternalWarpScan::TempStorage _TempStorage;
173 
174 
175  /******************************************************************************
176  * Thread fields
177  ******************************************************************************/
178 
181  unsigned int lane_id;
182 
183 
184 
185  /******************************************************************************
186  * Public types
187  ******************************************************************************/
188 
189 public:
190 
192  struct TempStorage : Uninitialized<_TempStorage> {};
193 
194 
195  /******************************************************************/
199 
203  __device__ __forceinline__ WarpScan(
205  :
206  temp_storage(temp_storage.Alias()),
207  lane_id(IS_ARCH_WARP ?
208  LaneId() :
209  LaneId() % LOGICAL_WARP_THREADS)
210  {}
211 
212 
214  /******************************************************************/
218 
219 
254  __device__ __forceinline__ void InclusiveSum(
255  T input,
256  T &inclusive_output)
257  {
258  InclusiveScan(input, inclusive_output, cub::Sum());
259  }
260 
261 
297  __device__ __forceinline__ void InclusiveSum(
298  T input,
299  T &inclusive_output,
300  T &warp_aggregate)
301  {
302  InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);
303  }
304 
305 
307  /******************************************************************/
311 
312 
349  __device__ __forceinline__ void ExclusiveSum(
350  T input,
351  T &exclusive_output)
352  {
353  T initial_value = 0;
354  ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
355  }
356 
357 
394  __device__ __forceinline__ void ExclusiveSum(
395  T input,
396  T &exclusive_output,
397  T &warp_aggregate)
398  {
399  T initial_value = 0;
400  ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
401  }
402 
403 
405  /******************************************************************/
409 
446  template <typename ScanOp>
447  __device__ __forceinline__ void InclusiveScan(
448  T input,
449  T &inclusive_output,
450  ScanOp scan_op)
451  {
452  InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
453  }
454 
455 
496  template <typename ScanOp>
497  __device__ __forceinline__ void InclusiveScan(
498  T input,
499  T &inclusive_output,
500  ScanOp scan_op,
501  T &warp_aggregate)
502  {
503  InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
504  }
505 
506 
508  /******************************************************************/
512 
550  template <typename ScanOp>
551  __device__ __forceinline__ void ExclusiveScan(
552  T input,
553  T &exclusive_output,
554  ScanOp scan_op)
555  {
556  InternalWarpScan internal(temp_storage);
557 
558  T inclusive_output;
559  internal.InclusiveScan(input, inclusive_output, scan_op);
560 
561  internal.Update(
562  input,
563  inclusive_output,
564  exclusive_output,
565  scan_op,
567  }
568 
569 
606  template <typename ScanOp>
607  __device__ __forceinline__ void ExclusiveScan(
608  T input,
609  T &exclusive_output,
610  T initial_value,
611  ScanOp scan_op)
612  {
613  InternalWarpScan internal(temp_storage);
614 
615  T inclusive_output;
616  internal.InclusiveScan(input, inclusive_output, scan_op);
617 
618  internal.Update(
619  input,
620  inclusive_output,
621  exclusive_output,
622  scan_op,
623  initial_value,
625  }
626 
627 
667  template <typename ScanOp>
668  __device__ __forceinline__ void ExclusiveScan(
669  T input,
670  T &exclusive_output,
671  ScanOp scan_op,
672  T &warp_aggregate)
673  {
674  InternalWarpScan internal(temp_storage);
675 
676  T inclusive_output;
677  internal.InclusiveScan(input, inclusive_output, scan_op);
678 
679  internal.Update(
680  input,
681  inclusive_output,
682  exclusive_output,
683  warp_aggregate,
684  scan_op,
686  }
687 
688 
728  template <typename ScanOp>
729  __device__ __forceinline__ void ExclusiveScan(
730  T input,
731  T &exclusive_output,
732  T initial_value,
733  ScanOp scan_op,
734  T &warp_aggregate)
735  {
736  InternalWarpScan internal(temp_storage);
737 
738  T inclusive_output;
739  internal.InclusiveScan(input, inclusive_output, scan_op);
740 
741  internal.Update(
742  input,
743  inclusive_output,
744  exclusive_output,
745  warp_aggregate,
746  scan_op,
747  initial_value,
749  }
750 
751 
753  /******************************************************************/
757 
758 
798  template <typename ScanOp>
799  __device__ __forceinline__ void Scan(
800  T input,
801  T &inclusive_output,
802  T &exclusive_output,
803  ScanOp scan_op)
804  {
805  InternalWarpScan internal(temp_storage);
806 
807  internal.InclusiveScan(input, inclusive_output, scan_op);
808 
809  internal.Update(
810  input,
811  inclusive_output,
812  exclusive_output,
813  scan_op,
815  }
816 
817 
857  template <typename ScanOp>
858  __device__ __forceinline__ void Scan(
859  T input,
860  T &inclusive_output,
861  T &exclusive_output,
862  T initial_value,
863  ScanOp scan_op)
864  {
865  InternalWarpScan internal(temp_storage);
866 
867  internal.InclusiveScan(input, inclusive_output, scan_op);
868 
869  internal.Update(
870  input,
871  inclusive_output,
872  exclusive_output,
873  scan_op,
874  initial_value,
876  }
877 
878 
879 
881  /******************************************************************/
885 
922  __device__ __forceinline__ T Broadcast(
923  T input,
924  unsigned int src_lane)
925  {
926  return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
927  }
928 
930 
931 };
932  // end group WarpModule
934 
935 } // CUB namespace
936 CUB_NS_POSTFIX // Optional outer namespace(s)
If<(PTX_ARCH >=300) &&(IS_POW_OF_TWO), WarpScanShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >, WarpScanSmem< T, LOGICAL_WARP_THREADS, PTX_ARCH > >::Type InternalWarpScan
Internal specialization. Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS i...
Definition: warp_scan.cuh:169
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, T initial_value, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
Definition: warp_scan.cuh:607
WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA ...
Type traits.
Definition: util_type.cuh:1158
Optional outer namespace(s)
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, T initial_value, ScanOp scan_op, T &warp_aggregate)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
Definition: warp_scan.cuh:729
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
__device__ __forceinline__ void InclusiveSum(T input, T &inclusive_output, T &warp_aggregate)
Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wi...
Definition: warp_scan.cuh:297
__device__ __forceinline__ void ExclusiveSum(T input, T &exclusive_output)
Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial va...
Definition: warp_scan.cuh:349
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition: util_ptx.cuh:420
Whether the data type is an integer (which has fully-associative addition)
Definition: warp_scan.cuh:163
InternalWarpScan::TempStorage _TempStorage
Shared memory storage layout type for WarpScan.
Definition: warp_scan.cuh:172
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, T initial_value, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
Definition: warp_scan.cuh:858
__device__ __forceinline__ WarpScan(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage....
Definition: warp_scan.cuh:203
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &warp_aggregate)
Computes an inclusive prefix scan using the specified binary scan functor across the calling warp....
Definition: warp_scan.cuh:497
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
Definition: warp_scan.cuh:551
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
Definition: warp_scan.cuh:447
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
Definition: warp_scan.cuh:799
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:72
_TempStorage & temp_storage
Shared storage reference.
Definition: warp_scan.cuh:180
__device__ __forceinline__ void ExclusiveSum(T input, T &exclusive_output, T &warp_aggregate)
Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial va...
Definition: warp_scan.cuh:394
__device__ __forceinline__ void InclusiveSum(T input, T &inclusive_output)
Computes an inclusive prefix sum across the calling warp.
Definition: warp_scan.cuh:254
Whether the logical warp size and the PTX warp size coincide.
Definition: warp_scan.cuh:157
WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA ...
Default sum functor.
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lanesrc_lane to all lanes in the warp.
Definition: warp_scan.cuh:922
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &warp_aggregate)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
Definition: warp_scan.cuh:668
\smemstorage{WarpScan}
Definition: warp_scan.cuh:192
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
Definition: warp_scan.cuh:146
Whether the logical warp size is a power-of-two.
Definition: warp_scan.cuh:160