36#include "../../thread/thread_operators.cuh"
37#include "../../util_ptx.cuh"
38#include "../../util_type.cuh"
39#include "../../util_macro.cuh"
40#include "../../util_namespace.cuh"
56 int LOGICAL_WARP_THREADS,
67 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
76 SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
118 member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
141 int shfl_c = last_lane |
SHFL_C;
144#ifdef CUB_USE_COOPERATIVE_GROUPS
149 " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
150 " @p add.u32 r0, r0, %4;"
153 :
"=r"(output) :
"r"(input),
"r"(offset),
"r"(shfl_c),
"r"(input),
"r"(
member_mask));
159 " shfl.down.b32 r0|p, %1, %2, %3;"
160 " @p add.u32 r0, r0, %4;"
163 :
"=r"(output) :
"r"(input),
"r"(offset),
"r"(shfl_c),
"r"(input));
178 int shfl_c = last_lane |
SHFL_C;
181#ifdef CUB_USE_COOPERATIVE_GROUPS
186 " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
187 " @p add.f32 r0, r0, %4;"
190 :
"=f"(output) :
"f"(input),
"r"(offset),
"r"(shfl_c),
"f"(input),
"r"(
member_mask));
196 " shfl.down.b32 r0|p, %1, %2, %3;"
197 " @p add.f32 r0, r0, %4;"
200 :
"=f"(output) :
"f"(input),
"r"(offset),
"r"(shfl_c),
"f"(input));
209 unsigned long long input,
214 unsigned long long output;
215 int shfl_c = last_lane |
SHFL_C;
217#ifdef CUB_USE_COOPERATIVE_GROUPS
223 " mov.b64 {lo, hi}, %1;"
224 " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
225 " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
226 " mov.b64 %0, {lo, hi};"
227 " @p add.u64 %0, %0, %1;"
229 :
"=l"(output) :
"l"(input),
"r"(offset),
"r"(shfl_c),
"r"(
member_mask));
236 " mov.b64 {lo, hi}, %1;"
237 " shfl.down.b32 lo|p, lo, %2, %3;"
238 " shfl.down.b32 hi|p, hi, %2, %3;"
239 " mov.b64 %0, {lo, hi};"
240 " @p add.u64 %0, %0, %1;"
242 :
"=l"(output) :
"l"(input),
"r"(offset),
"r"(shfl_c));
257 int shfl_c = last_lane |
SHFL_C;
260#ifdef CUB_USE_COOPERATIVE_GROUPS
266 " mov.b64 {lo, hi}, %1;"
267 " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
268 " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
269 " mov.b64 %0, {lo, hi};"
270 " @p add.s64 %0, %0, %1;"
272 :
"=l"(output) :
"l"(input),
"r"(offset),
"r"(shfl_c),
"r"(
member_mask));
279 " mov.b64 {lo, hi}, %1;"
280 " shfl.down.b32 lo|p, lo, %2, %3;"
281 " shfl.down.b32 hi|p, hi, %2, %3;"
282 " mov.b64 %0, {lo, hi};"
283 " @p add.s64 %0, %0, %1;"
285 :
"=l"(output) :
"l"(input),
"r"(offset),
"r"(shfl_c));
300 int shfl_c = last_lane |
SHFL_C;
303#ifdef CUB_USE_COOPERATIVE_GROUPS
311 " mov.b64 {lo, hi}, %1;"
312 " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
313 " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
314 " mov.b64 r0, {lo, hi};"
315 " @p add.f64 %0, %0, r0;"
317 :
"=d"(output) :
"d"(input),
"r"(offset),
"r"(shfl_c),
"r"(
member_mask));
326 " mov.b64 {lo, hi}, %1;"
327 " shfl.down.b32 lo|p, lo, %2, %3;"
328 " shfl.down.b32 hi|p, hi, %2, %3;"
329 " mov.b64 r0, {lo, hi};"
330 " @p add.f64 %0, %0, r0;"
332 :
"=d"(output) :
"d"(input),
"r"(offset),
"r"(shfl_c));
340 template <
typename ValueT,
typename KeyT>
349 KeyT other_key = ShuffleDown<LOGICAL_WARP_THREADS>(input.
key, offset, last_lane,
member_mask);
359 if (input.
key != other_key)
368 template <
typename ValueT,
typename OffsetT>
388 template <
typename _T,
typename ReductionOp>
397 _T temp = ShuffleDown<LOGICAL_WARP_THREADS>(output, offset, last_lane,
member_mask);
400 if (offset +
lane_id <= last_lane)
408 template <
typename _T,
typename ReductionOp>
421 template <
typename _T,
typename ReductionOp>
437 template <
typename ReductionOp,
int STEP>
449 template <
typename ReductionOp>
464 bool ALL_LANES_VALID,
465 typename ReductionOp>
471 int last_lane = (ALL_LANES_VALID) ?
472 LOGICAL_WARP_THREADS - 1 :
495 typename ReductionOp>
518 warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
521 int last_lane = __clz(__brev(warp_flags));
Binary operator wrapper for switching non-commutative scan arguments.
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
__device__ __forceinline__ unsigned int LaneMaskGe()
Returns the warp lane mask of all lanes greater than or equal to the calling thread.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A key identifier paired with a corresponding value.
Statically determine log2(N), rounded up.
A simple "NULL" marker type.
< Binary reduction operator to apply to values
Reduce-by-segment functor.
@ IS_SMALL_UNSIGNED
Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction...
WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA ...
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset)
Reduction step (generic)
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< false >)
Reduction step (specialized for types other than small unsigned integers size 32b or less)
__device__ __forceinline__ double ReduceStep(double input, cub::Sum, int last_lane, int offset)
Reduction (specialized for summation across double types)
__device__ __forceinline__ WarpReduceShfl(TempStorage &)
Constructor.
unsigned int lane_id
Lane index in logical warp.
unsigned int member_mask
32-thread physical warp member mask of logical warp
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type< true >)
Reduction step (specialized for small unsigned integers size 32b or less)
__device__ __forceinline__ KeyValuePair< OffsetT, ValueT > ReduceStep(KeyValuePair< OffsetT, ValueT > input, SwizzleScanOp< ReduceBySegmentOp< cub::Sum > >, int last_lane, int offset)
Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT,...
__device__ __forceinline__ float ReduceStep(float input, cub::Sum, int last_lane, int offset)
Reduction (specialized for summation across fp32 types)
__device__ __forceinline__ unsigned long long ReduceStep(unsigned long long input, cub::Sum, int last_lane, int offset)
Reduction (specialized for summation across unsigned long long types)
__device__ __forceinline__ long long ReduceStep(long long input, cub::Sum, int last_lane, int offset)
Reduction (specialized for summation across long long types)
NullType TempStorage
Shared memory storage layout type.
__device__ __forceinline__ KeyValuePair< KeyT, ValueT > ReduceStep(KeyValuePair< KeyT, ValueT > input, SwizzleScanOp< ReduceByKeyOp< cub::Sum > >, int last_lane, int offset)
Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT,...
@ IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
@ SHFL_C
The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up.
@ LOGICAL_WARPS
Number of logical warps in a PTX warp.
@ STEPS
The number of warp reduction steps.
__device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op)
Reduction.
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
Segmented reduction.
__device__ __forceinline__ unsigned int ReduceStep(unsigned int input, cub::Sum, int last_lane, int offset)
Reduction (specialized for summation across uint32 types)
__device__ __forceinline__ void ReduceStep(T &input, ReductionOp reduction_op, int last_lane, Int2Type< STEP >)
unsigned int warp_id
Logical warp index in 32-thread physical warp.