OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
Loading...
Searching...
No Matches
warp_scan.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
38#include "../thread/thread_operators.cuh"
39#include "../util_arch.cuh"
40#include "../util_type.cuh"
41#include "../util_namespace.cuh"
42
44CUB_NS_PREFIX
45
47namespace cub {
48
142template <
143 typename T,
144 int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
145 int PTX_ARCH = CUB_PTX_ARCH>
147{
148private:
149
150 /******************************************************************************
151 * Constants and type definitions
152 ******************************************************************************/
153
154 enum
155 {
157 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
158
160 IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
161
163 IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
164 };
165
167 typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
170
173
174
175 /******************************************************************************
176 * Thread fields
177 ******************************************************************************/
178
181 unsigned int lane_id;
182
183
184
185 /******************************************************************************
186 * Public types
187 ******************************************************************************/
188
189public:
190
192 struct TempStorage : Uninitialized<_TempStorage> {};
193
194
195 /******************************************************************/
199
203 __device__ __forceinline__ WarpScan(
205 :
206 temp_storage(temp_storage.Alias()),
207 lane_id(IS_ARCH_WARP ?
208 LaneId() :
209 LaneId() % LOGICAL_WARP_THREADS)
210 {}
211
212
214 /******************************************************************/
218
219
// Inclusive prefix sum over the logical warp: each lane receives the sum of
// its own input and the inputs of all lower-ranked lanes.
254 __device__ __forceinline__ void InclusiveSum(
255 T input,
256 T &inclusive_output)
257 {
258 InclusiveScan(input, inclusive_output, cub::Sum()); // delegate to the generic inclusive scan with the Sum functor
259 }
260
261
// Inclusive prefix sum over the logical warp; additionally returns the
// warp-wide total of all lanes' inputs in warp_aggregate.
297 __device__ __forceinline__ void InclusiveSum(
298 T input,
299 T &inclusive_output,
300 T &warp_aggregate) // receives the sum of all lanes' inputs
301 {
302 InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); // delegate to the aggregate-producing inclusive scan
303 }
304
305
307 /******************************************************************/
311
312
// Exclusive prefix sum over the logical warp: each lane receives the sum of
// all lower-ranked lanes' inputs; lane 0 receives the initial value 0.
349 __device__ __forceinline__ void ExclusiveSum(
350 T input,
351 T &exclusive_output)
352 {
353 T initial_value = 0; // 0 seeds the exclusive scan (assumes T is constructible from 0)
354 ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
355 }
356
357
// Exclusive prefix sum over the logical warp (initial value 0); additionally
// returns the warp-wide total of all lanes' inputs in warp_aggregate.
394 __device__ __forceinline__ void ExclusiveSum(
395 T input,
396 T &exclusive_output,
397 T &warp_aggregate) // receives the sum of all lanes' inputs
398 {
399 T initial_value = 0; // 0 seeds the exclusive scan (assumes T is constructible from 0)
400 ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
401 }
402
403
405 /******************************************************************/
409
// Inclusive prefix scan over the logical warp using an arbitrary binary
// scan functor scan_op (e.g. cub::Sum, cub::Max).
446 template <typename ScanOp>
447 __device__ __forceinline__ void InclusiveScan(
448 T input,
449 T &inclusive_output,
450 ScanOp scan_op) // binary combining functor
451 {
452 InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); // forward to the SHFL- or smem-based specialization
453 }
454
455
// Inclusive prefix scan over the logical warp with an arbitrary binary scan
// functor; additionally returns the warp-wide aggregate in warp_aggregate.
496 template <typename ScanOp>
497 __device__ __forceinline__ void InclusiveScan(
498 T input,
499 T &inclusive_output,
500 ScanOp scan_op, // binary combining functor
501 T &warp_aggregate) // receives the combination of all lanes' inputs
502 {
503 InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); // forward to the SHFL- or smem-based specialization
504 }
505
506
508 /******************************************************************/
512
550 template <typename ScanOp>
551 __device__ __forceinline__ void ExclusiveScan(
552 T input,
553 T &exclusive_output,
554 ScanOp scan_op)
555 {
557
558 T inclusive_output;
559 internal.InclusiveScan(input, inclusive_output, scan_op);
560
561 internal.Update(
562 input,
563 inclusive_output,
564 exclusive_output,
565 scan_op,
567 }
568
569
606 template <typename ScanOp>
607 __device__ __forceinline__ void ExclusiveScan(
608 T input,
609 T &exclusive_output,
610 T initial_value,
611 ScanOp scan_op)
612 {
614
615 T inclusive_output;
616 internal.InclusiveScan(input, inclusive_output, scan_op);
617
618 internal.Update(
619 input,
620 inclusive_output,
621 exclusive_output,
622 scan_op,
623 initial_value,
625 }
626
627
667 template <typename ScanOp>
668 __device__ __forceinline__ void ExclusiveScan(
669 T input,
670 T &exclusive_output,
671 ScanOp scan_op,
672 T &warp_aggregate)
673 {
675
676 T inclusive_output;
677 internal.InclusiveScan(input, inclusive_output, scan_op);
678
679 internal.Update(
680 input,
681 inclusive_output,
682 exclusive_output,
683 warp_aggregate,
684 scan_op,
686 }
687
688
728 template <typename ScanOp>
729 __device__ __forceinline__ void ExclusiveScan(
730 T input,
731 T &exclusive_output,
732 T initial_value,
733 ScanOp scan_op,
734 T &warp_aggregate)
735 {
737
738 T inclusive_output;
739 internal.InclusiveScan(input, inclusive_output, scan_op);
740
741 internal.Update(
742 input,
743 inclusive_output,
744 exclusive_output,
745 warp_aggregate,
746 scan_op,
747 initial_value,
749 }
750
751
753 /******************************************************************/
757
758
798 template <typename ScanOp>
799 __device__ __forceinline__ void Scan(
800 T input,
801 T &inclusive_output,
802 T &exclusive_output,
803 ScanOp scan_op)
804 {
806
807 internal.InclusiveScan(input, inclusive_output, scan_op);
808
809 internal.Update(
810 input,
811 inclusive_output,
812 exclusive_output,
813 scan_op,
815 }
816
817
857 template <typename ScanOp>
858 __device__ __forceinline__ void Scan(
859 T input,
860 T &inclusive_output,
861 T &exclusive_output,
862 T initial_value,
863 ScanOp scan_op)
864 {
866
867 internal.InclusiveScan(input, inclusive_output, scan_op);
868
869 internal.Update(
870 input,
871 inclusive_output,
872 exclusive_output,
873 scan_op,
874 initial_value,
876 }
877
878
879
881 /******************************************************************/
885
// Broadcasts input from warp-lane src_lane to every lane of the logical
// warp; all lanes receive src_lane's value as the return result.
922 __device__ __forceinline__ T Broadcast(
923 T input,
924 unsigned int src_lane) // lane whose value is distributed to all lanes
925 {
926 return InternalWarpScan(temp_storage).Broadcast(input, src_lane); // forward to the SHFL- or smem-based specialization
927 }
928
930
931};
932 // end group WarpModule
934
935} // CUB namespace
936CUB_NS_POSTFIX // Optional outer namespace(s)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
InternalWarpScan::TempStorage _TempStorage
Shared memory storage layout type for WarpScan.
__device__ __forceinline__ void ExclusiveSum(T input, T &exclusive_output, T &warp_aggregate)
Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial va...
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
__device__ __forceinline__ void InclusiveSum(T input, T &inclusive_output)
Computes an inclusive prefix sum across the calling warp.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &warp_aggregate)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
@ IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
@ IS_POW_OF_TWO
Whether the logical warp size is a power-of-two.
@ IS_INTEGER
Whether the data type is an integer (which has fully-associative addition)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &warp_aggregate)
Computes an inclusive prefix scan using the specified binary scan functor across the calling warp....
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lane src_lane to all lanes in the warp.
__device__ __forceinline__ WarpScan(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage....
__device__ __forceinline__ void ExclusiveSum(T input, T &exclusive_output)
Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial va...
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, T initial_value, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, T initial_value, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ void InclusiveSum(T input, T &inclusive_output, T &warp_aggregate)
Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wi...
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, T initial_value, ScanOp scan_op, T &warp_aggregate)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
If<(PTX_ARCH >=300)&&(IS_POW_OF_TWO), WarpScanShfl< T, LOGICAL_WARP_THREADS, PTX_ARCH >, WarpScanSmem< T, LOGICAL_WARP_THREADS, PTX_ARCH > >::Type InternalWarpScan
Internal specialization. Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS i...
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Default sum functor.
Type traits.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA ...
WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA ...
\smemstorage{WarpScan}
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition util_arch.cuh:53