#include "../../thread/thread_operators.cuh"
#include "../../util_ptx.cuh"
#include "../../util_type.cuh"
#include "../../util_macro.cuh"
#include "../../util_namespace.cuh"

// WarpReduceShfl provides SHFL-based variants of parallel reduction of items
// partitioned across a CUDA thread warp.
template <
    typename    T,                      // Data type being reduced
    int         LOGICAL_WARP_THREADS,   // Number of threads per logical warp (must be a power of two)
    int         PTX_ARCH>               // The PTX compute capability for which to specialize
struct WarpReduceShfl
{
    enum
    {
        // Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        // The number of warp reduction steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        // Number of logical warps in a PTX warp
        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,

        // The 5-bit SHFL mask for logically splitting warps into sub-segments
        // starts 8-bits up
        SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
    };

    // Whether the data type is a small (32b or less) unsigned integer for
    // which we can use a single SHFL instruction per exchange
    template <typename S>
    struct IsInteger
    {
        enum {
            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) &&
                                (sizeof(S) <= sizeof(unsigned int))
        };
    };

    // Shared memory storage layout type
    typedef NullType TempStorage;

    unsigned int lane_id;       // Lane index in logical warp
    unsigned int warp_id;       // Logical warp index in 32-thread physical warp
    unsigned int member_mask;   // 32-thread physical warp member mask of logical warp
    // Constructor
    __device__ __forceinline__ WarpReduceShfl(TempStorage &/*temp_storage*/)
    :
        lane_id(LaneId()),
        warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)),
        member_mask(0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS))
    {
        if (!IS_ARCH_WARP)
        {
            // Reduce to the logical lane index and shift the member mask up
            // to cover this logical warp's physical lanes
            lane_id      = lane_id % LOGICAL_WARP_THREADS;
            member_mask  = member_mask << (warp_id * LOGICAL_WARP_THREADS);
        }
    }
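    // Worked example of the constants and the constructor above (illustrative):
    // for a 16-thread logical warp on a 32-thread PTX warp,
    //
    //   SHFL_C      = (32 - 16) << 8    = 0x1000      (segment mask, bits [12:8])
    //   member_mask = 0xffffffffu >> 16 = 0x0000ffff  (lanes 0..15)
    //
    // and the second logical warp (warp_id == 1) shifts its member mask up to
    // 0xffff0000, naming physical lanes 16..31. Each step below then builds
    // shfl_c = last_lane | SHFL_C (e.g. 15 | 0x1000 = 0x100f), packing the
    // source-lane clamp into bits [4:0] alongside the segment mask.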
    // Reduction (specialized for summation across uint32 types)
    __device__ __forceinline__ unsigned int ReduceStep(
        unsigned int input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        unsigned int output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Use the predicate set by SHFL to guard against invalid peers
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 r0;"
            "   .reg .pred p;"
            "   shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
            "   @p add.u32 r0, r0, %4;"
            "   mov.u32 %0, r0; }"
            : "=r"(output)
            : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 r0;"
            "   .reg .pred p;"
            "   shfl.down.b32 r0|p, %1, %2, %3;"
            "   @p add.u32 r0, r0, %4;"
            "   mov.u32 %0, r0; }"
            : "=r"(output)
            : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
#endif

        return output;
    }
    // Reduction (specialized for summation across fp32 types)
    __device__ __forceinline__ float ReduceStep(
        float input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        float output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Use the predicate set by SHFL to guard against invalid peers
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .f32 r0;"
            "   .reg .pred p;"
            "   shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
            "   @p add.f32 r0, r0, %4;"
            "   mov.f32 %0, r0; }"
            : "=f"(output)
            : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .f32 r0;"
            "   .reg .pred p;"
            "   shfl.down.b32 r0|p, %1, %2, %3;"
            "   @p add.f32 r0, r0, %4;"
            "   mov.f32 %0, r0; }"
            : "=f"(output)
            : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
#endif

        return output;
    }
    // Reduction (specialized for summation across unsigned long long types)
    __device__ __forceinline__ unsigned long long ReduceStep(
        unsigned long long input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        unsigned long long output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Shuffle as two 32-bit halves, then add if the peer is valid
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
            "   shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.u64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.down.b32 lo|p, lo, %2, %3;"
            "   shfl.down.b32 hi|p, hi, %2, %3;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.u64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c));
#endif

        return output;
    }
    // Reduction (specialized for summation across long long types)
    __device__ __forceinline__ long long ReduceStep(
        long long input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        long long output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Shuffle as two 32-bit halves, then add if the peer is valid
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
            "   shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.s64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.down.b32 lo|p, lo, %2, %3;"
            "   shfl.down.b32 hi|p, hi, %2, %3;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.s64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c));
#endif

        return output;
    }
    // Reduction (specialized for summation across double types)
    __device__ __forceinline__ double ReduceStep(
        double input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        double output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Shuffle as two 32-bit halves, reassemble, then add if the peer is valid
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;  .reg .f64 r0;"
            "   mov.b64 %0, %1;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
            "   shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
            "   mov.b64 r0, {lo, hi};"
            "   @p add.f64 %0, %0, r0; }"
            : "=d"(output)
            : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;  .reg .f64 r0;"
            "   mov.b64 %0, %1;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.down.b32 lo|p, lo, %2, %3;"
            "   shfl.down.b32 hi|p, hi, %2, %3;"
            "   mov.b64 r0, {lo, hi};"
            "   @p add.f64 %0, %0, r0; }"
            : "=d"(output)
            : "d"(input), "r"(offset), "r"(shfl_c));
#endif

        return output;
    }
    // Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across
    // KeyValuePair<KeyT, ValueT> types)
    template <typename ValueT, typename KeyT>
    __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
        KeyValuePair<KeyT, ValueT> input, SwizzleScanOp<ReduceByKeyOp<cub::Sum> > /*reduction_op*/,
        int last_lane, int offset)
    {
        KeyValuePair<KeyT, ValueT> output;
        KeyT other_key = ShuffleDown<LOGICAL_WARP_THREADS>(input.key, offset, last_lane, member_mask);

        output.key   = input.key;
        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset,
                                  Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());

        // Don't fold the peer's partial sum across a key boundary
        if (input.key != other_key)
            output.value = input.value;

        return output;
    }
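    // Worked example (illustrative): one such step with offset == 1 across an
    // 8-lane warp of (key, value) pairs, summing values within key runs:
    //
    //   lane:        0      1      2      3      4      5      6      7
    //   input:      (0,1)  (0,1)  (0,1)  (1,1)  (1,1)  (2,1)  (2,1)  (2,1)
    //   other_key:   0      0      1      1      2      2      2      2
    //   output:     (0,2)  (0,2)  (0,1)  (1,2)  (1,1)  (2,2)  (2,2)  (2,1)
    //
    // Lanes 2 and 4 face a key boundary and keep their own values; lane 7 has
    // no valid peer, so the guarded add leaves its value unchanged.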
    // Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across
    // KeyValuePair<OffsetT, ValueT> types)
    template <typename ValueT, typename OffsetT>
    __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
        KeyValuePair<OffsetT, ValueT> input, SwizzleScanOp<ReduceBySegmentOp<cub::Sum> > /*reduction_op*/,
        int last_lane, int offset)
    {
        KeyValuePair<OffsetT, ValueT> output;

        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset,
                                  Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
        output.key   = ReduceStep(input.key, cub::Sum(), last_lane, offset,
                                  Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());

        // A nonzero key marks a segment boundary below the peer
        if (input.key > 0)
            output.value = input.value;

        return output;
    }
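    // Looking back at the 64-bit summation specializations: each moves its
    // operand as two 32-bit halves because SHFL exchanges 32 bits per
    // instruction. Sketched with intrinsics (the ShuffleDown64Sketch name is
    // ours, for illustration only; current CUDA can also shuffle 64-bit
    // operands directly, which compiles to the same pair of 32-bit shuffles):
    __device__ __forceinline__ unsigned long long ShuffleDown64Sketch(
        unsigned long long x, int offset)
    {
        unsigned int lo = (unsigned int) (x & 0xffffffffull);
        unsigned int hi = (unsigned int) (x >> 32);

        lo = __shfl_down_sync(member_mask, lo, offset);   // one 32-bit shuffle per half
        hi = __shfl_down_sync(member_mask, hi, offset);

        return ((unsigned long long) hi << 32) | lo;
    }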
    // Reduction step (generic)
    template <typename _T, typename ReductionOp>
    __device__ __forceinline__ _T ReduceStep(
        _T input, ReductionOp reduction_op, int last_lane, int offset)
    {
        _T output = input;
        _T temp = ShuffleDown<LOGICAL_WARP_THREADS>(output, offset, last_lane, member_mask);

        // Perform reduction op if the peer lane lies within the segment
        if (offset + lane_id <= last_lane)
            output = reduction_op(input, temp);

        return output;
    }
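    // A rough intrinsic-level rendering of the step above, instantiated for
    // the uint32 sum case (a sketch; the ReduceStepSketch name is ours, and it
    // assumes a full 32-thread logical warp and CUDA 9+). CUB prefers the PTX
    // forms earlier in this file because they consume the shuffle's predicate
    // directly instead of re-deriving peer validity from lane arithmetic:
    __device__ __forceinline__ unsigned int ReduceStepSketch(
        unsigned int input, int last_lane, int offset)
    {
        unsigned int temp = __shfl_down_sync(member_mask, input, offset);

        // This test plays the role of the @p predicate guard in the PTX versions
        if (lane_id + offset <= (unsigned int) last_lane)
            input += temp;

        return input;
    }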
    // Reduction step (specialized for small (32b or less) unsigned integers)
    template <typename _T, typename ReductionOp>
    __device__ __forceinline__ _T ReduceStep(
        _T input, ReductionOp reduction_op, int last_lane, int offset,
        Int2Type<true> /*is_small_unsigned*/)
    {
        return ReduceStep(input, reduction_op, last_lane, offset);
    }

    // Reduction step (specialized for types other than small (32b or less)
    // unsigned integers)
    template <typename _T, typename ReductionOp>
    __device__ __forceinline__ _T ReduceStep(
        _T input, ReductionOp reduction_op, int last_lane, int offset,
        Int2Type<false> /*is_small_unsigned*/)
    {
        return ReduceStep(input, reduction_op, last_lane, offset);
    }
    // Reduction step (statically unrolled over the warp's shuffle offsets)
    template <typename ReductionOp, int STEP>
    __device__ __forceinline__ void ReduceStep(
        T &input, ReductionOp reduction_op, int last_lane, Int2Type<STEP> /*step*/)
    {
        input = ReduceStep(input, reduction_op, last_lane, 1 << STEP,
                           Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
        ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
    }

    // Termination of the static unrolling
    template <typename ReductionOp>
    __device__ __forceinline__ void ReduceStep(
        T &/*input*/, ReductionOp /*reduction_op*/, int /*last_lane*/, Int2Type<STEPS> /*step*/)
    {}
    // Reduction
    template <
        bool ALL_LANES_VALID,       // Whether all lanes are contributing a valid item
        typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T input, int valid_items, ReductionOp reduction_op)
    {
        // Last lane in the logical warp that is folding valid data
        int last_lane = (ALL_LANES_VALID) ?
                            LOGICAL_WARP_THREADS - 1 :
                            valid_items - 1;

        T output = input;
        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
        return output;
    }
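    // Worked example (illustrative): Reduce() over an 8-thread logical warp of
    // all-ones input with ALL_LANES_VALID (last_lane == 7) runs STEPS == 3
    // ReduceStep passes with offsets 1, 2, 4; '*' marks lanes whose guard
    // (offset + lane_id <= last_lane) fails, leaving their value unchanged:
    //
    //   lane:           0   1   2   3   4   5   6   7
    //   after offset 1: 2   2   2   2   2   2   2   1*
    //   after offset 2: 4   4   4   4   4   3   2*  1*
    //   after offset 4: 8   7   6   5   4*  3*  2*  1*
    //
    // Lane 0 ends holding the aggregate of all eight inputs.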
    // Segmented reduction
    template <bool HEAD_SEGMENTED, typename FlagT, typename ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T input, FlagT flag, ReductionOp reduction_op)
    {
        // Get a ballot of the segment flags across the warp, convert head
        // flags to tail flags, and mask out the lanes below the calling thread
        int warp_flags = WARP_BALLOT(flag, member_mask);
        if (HEAD_SEGMENTED)
            warp_flags >>= 1;
        warp_flags &= LaneMaskGe();

        // Mask in the last lane of logical warp
        warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);

        // Find the next set flag: the last lane of the calling thread's segment
        int last_lane = __clz(__brev(warp_flags));

        T output = input;
        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
        return output;
    }
};
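The last_lane computation relies on a bit trick: __brev moves the lowest set bit k of warp_flags to bit position 31 - k, so __clz then returns k, the index of the nearest tail flag at or above the calling lane (the flags below it were already masked off with LaneMaskGe). A minimal host-side sketch with portable stand-ins for the two device intrinsics:

    #include <cstdio>

    unsigned int brev32(unsigned int x)     // stand-in for the __brev intrinsic
    {
        unsigned int r = 0;
        for (int i = 0; i < 32; ++i)
            r |= ((x >> i) & 1u) << (31 - i);
        return r;
    }

    int clz32(unsigned int x)               // stand-in for the __clz intrinsic
    {
        for (int i = 31; i >= 0; --i)
            if ((x >> i) & 1u) return 31 - i;
        return 32;
    }

    int main()
    {
        unsigned int warp_flags = (1u << 5) | (1u << 20);       // tail flags at lanes 5 and 20
        printf("last_lane = %d\n", clz32(brev32(warp_flags)));  // prints 5: the lowest set bit wins
        return 0;
    }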
Member reference for WarpReduceShfl:

WarpReduceShfl
    WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.

Constants and nested types:

typedef NullType TempStorage
    Shared memory storage layout type.
IS_ARCH_WARP
    Whether the logical warp size and the PTX warp size coincide.
STEPS
    The number of warp reduction steps.
LOGICAL_WARPS
    Number of logical warps in a PTX warp.
SHFL_C
    The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up.
IsInteger<S>::IS_SMALL_UNSIGNED
    Whether the data type is a small (32b or less) unsigned integer for which we can use a single SHFL instruction per exchange.

Data members:

unsigned int lane_id
    Lane index in logical warp.
unsigned int warp_id
    Logical warp index in 32-thread physical warp.
unsigned int member_mask
    32-thread physical warp member mask of logical warp.

Construction:

__device__ __forceinline__ WarpReduceShfl(TempStorage &)
    Constructor.

Reduction steps:

__device__ __forceinline__ unsigned int ReduceStep(unsigned int input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across uint32 types).
__device__ __forceinline__ float ReduceStep(float input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across fp32 types).
__device__ __forceinline__ unsigned long long ReduceStep(unsigned long long input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across unsigned long long types).
__device__ __forceinline__ long long ReduceStep(long long input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across long long types).
__device__ __forceinline__ double ReduceStep(double input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across double types).
__device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(KeyValuePair<KeyT, ValueT> input, SwizzleScanOp<ReduceByKeyOp<cub::Sum> >, int last_lane, int offset)
    Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types).
__device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(KeyValuePair<OffsetT, ValueT> input, SwizzleScanOp<ReduceBySegmentOp<cub::Sum> >, int last_lane, int offset)
    Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types).
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset)
    Reduction step (generic).
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type<true>)
    Reduction step (specialized for small (32b or less) unsigned integers).
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type<false>)
    Reduction step (specialized for types other than small (32b or less) unsigned integers).
__device__ __forceinline__ void ReduceStep(T &input, ReductionOp reduction_op, int last_lane, Int2Type<STEP>)
    Reduction step (statically unrolled driver over the warp's shuffle offsets).

Reduction interface:

__device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op)
    Reduction.
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
    Segmented reduction.

Referenced utilities:

__device__ __forceinline__ unsigned int LaneId()
    Returns the warp lane ID of the calling thread.
__device__ __forceinline__ unsigned int LaneMaskGe()
    Returns the warp lane mask of all lanes greater than or equal to the calling thread.
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
    Warp-wide ballot across the lanes named by member_mask.
KeyValuePair
    A key identifier paired with a corresponding value.
Int2Type
    Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
Log2
    Statically determine log2(N), rounded up.
NullType
    A simple "NULL" marker type.
SwizzleScanOp
    Binary operator wrapper for switching non-commutative scan arguments.
CUB_NS_PREFIX / CUB_NS_POSTFIX
    Optional outer namespace(s).
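For context, here is how these internals are typically reached through CUB's public warp-reduction interface, which dispatches to WarpReduceShfl on SHFL-capable architectures (a minimal sketch, assuming a single 32-thread warp per block and CUDA 9+; Sum() exercises Reduce(), and HeadSegmentedSum() exercises SegmentedReduce()):

    #include <cub/cub.cuh>

    __global__ void WarpReduceExample(const int *d_in, const int *d_head_flags, int *d_out)
    {
        typedef cub::WarpReduce<int> WarpReduce;

        // Separate storage for each reduction (unused by the SHFL
        // specialization, but required by the interface)
        __shared__ typename WarpReduce::TempStorage sum_storage;
        __shared__ typename WarpReduce::TempStorage seg_storage;

        int item      = d_in[threadIdx.x];
        int head_flag = d_head_flags[threadIdx.x];

        // Warp-wide sum: lane 0 receives the aggregate of all 32 items
        int warp_aggregate = WarpReduce(sum_storage).Sum(item);

        // Head-flagged segmented sum: each segment's first lane receives its sum
        int segment_aggregate = WarpReduce(seg_storage).HeadSegmentedSum(item, head_flag);

        if (threadIdx.x == 0)
            d_out[0] = warp_aggregate;
        if (head_flag)
            d_out[1 + threadIdx.x] = segment_aggregate;
    }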