#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"
int LOGICAL_WARP_THREADS,       ///< Number of threads per logical warp

/// Whether the logical warp size and the PTX warp size coincide
IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
unsigned int lane_id;
unsigned int member_mask;
temp_storage(temp_storage.Alias()),

lane_id(IS_ARCH_WARP ?
    LaneId() :
    LaneId() % LOGICAL_WARP_THREADS),

member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !PTX_ARCH) ?
    0 :     // arch-width and non-device-constructed warps need no shifting
    ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
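The member_mask initializer above carves a contiguous lane mask out of the physical warp when several logical warps are packed into one. A minimal host-side sketch of that arithmetic (editorial, not part of the original header; LOGICAL_WARP_THREADS = 8 and the helper name are hypothetical):

#include <cassert>

unsigned int MemberMaskFor(unsigned int physical_lane)
{
    const unsigned int LOGICAL_WARP_THREADS = 8;
    unsigned int base = 0xffffffffu >> (32u - LOGICAL_WARP_THREADS);    // 0x000000ff
    // Shift the lane mask up to the logical warp that contains this lane
    return base << ((physical_lane / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS);
}

int main()
{
    assert(MemberMaskFor(3)  == 0x000000ffu);   // logical warp 0 covers lanes 0..7
    assert(MemberMaskFor(19) == 0x00ff0000u);   // logical warp 2 covers lanes 16..23
    return 0;
}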
bool ALL_LANES_VALID,       ///< Whether all lanes in each warp are contributing a valid fold of items
typename ReductionOp,
const int OFFSET = 1 << STEP;

// Share input through buffer
ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

// Update input if peer_addend is in range
if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
{
    T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
    input = reduction_op(input, peer_addend);
}
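Unrolled across all STEPS, the recursion above is a share / sync / combine ladder with doubling offsets. A self-contained sketch of that pattern (editorial, not CUB's code; assumes sum-reduction over int, one 32-thread warp per block, and CUDA 9+ for __syncwarp(); the kernel and buffer names are hypothetical):

__global__ void warp_smem_sum_kernel(const int *in, int *out)
{
    __shared__ volatile int buf[32 + 16];   // 1.5 warps-worth, like WARP_SMEM_ELEMENTS

    int lane  = threadIdx.x;                // assumes blockDim.x == 32
    int input = in[lane];

    for (int OFFSET = 1; OFFSET < 32; OFFSET <<= 1)
    {
        buf[lane] = input;                  // share input through the buffer
        __syncwarp();
        // Upper lanes read the padding half-warp; those reads stay in bounds
        // and never feed into lane 0's running total
        input += buf[lane + OFFSET];
        __syncwarp();
    }

    if (lane == 0)
        *out = input;                       // lane 0 holds the warp-wide sum
}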
bool ALL_LANES_VALID,
typename ReductionOp>
typename ReductionOp>
// Accommodate packing of multiple logical warps in a single physical warp
warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
// Find next flag
int next_flag = __clz(__brev(warp_flags));
// Clip the next segment at the logical warp boundary if necessary
if (LOGICAL_WARP_THREADS != 32)
    next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
for (int STEP = 0; STEP < STEPS; STEP++)
const int OFFSET = 1 << STEP;
// Share input into buffer
ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
// Update input if the peer lies before the next segment flag
if (OFFSET + lane_id < next_flag)
{
    T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
    input = reduction_op(input, peer_addend);
}
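The __clz(__brev(warp_flags)) idiom above recovers the index of the lowest set bit, i.e. the nearest segment flag above the calling lane once the ballot has been masked with LaneMaskGt(). A host-side analogue (editorial sketch using the GCC/Clang __builtin_ctz intrinsic; the helper name is hypothetical):

#include <cassert>
#include <cstdint>

static int NextFlagLane(uint32_t warp_flags)
{
    // __builtin_ctz returns the lowest set bit's index, which is exactly
    // what __clz(__brev(x)) computes on the device; 32 means "no flag ahead"
    return warp_flags ? __builtin_ctz(warp_flags) : 32;
}

int main()
{
    // Flags at lanes 5 and 20, already masked down to lanes above the caller:
    assert(NextFlagLane((1u << 5) | (1u << 20)) == 5);
    assert(NextFlagLane(0u) == 32);     // no flag ahead: reduce to the warp end
    return 0;
}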
typename ReductionOp>
// Alias flags onto shared data storage
volatile SmemFlag *flag_storage = temp_storage.flags;
for (int STEP = 0; STEP < STEPS; STEP++)
const int OFFSET = 1 << STEP;
ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);      // share input
T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
flag_storage[lane_id] = flag_status;                                    // share flag
SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
if (lane_id < LOGICAL_WARP_THREADS - OFFSET)    // update input if peer was in range
{
    if (HEAD_SEGMENTED)
    {
        if ((flag_status & SEEN) == 0)          // no successor head flag seen yet
        {
            if (peer_flag_status & SET) flag_status |= SEEN;
            else input = reduction_op(input, peer_addend);
            flag_status |= (peer_flag_status & SEEN);
        }
    }
    else    // tail-segmented: accumulate until flagged, then just propagate
    {
        if (!flag_status) input = reduction_op(input, peer_addend);
        flag_status |= peer_flag_status;
    }
}
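Both SegmentedReduce variants sit behind cub::WarpReduce's public segmented interface. A minimal usage sketch of that interface (editorial; assumes one 32-thread warp per block, with flags of 1 marking segment heads):

#include <cub/warp/warp_reduce.cuh>

__global__ void head_segmented_sum_kernel(const int *in, const int *heads, int *out)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    int input     = in[threadIdx.x];
    int head_flag = heads[threadIdx.x];     // 1 at each segment head, else 0

    // Each lane holding a head flag receives the sum of its segment
    int segment_sum = WarpReduce(temp_storage).HeadSegmentedSum(input, head_flag);

    out[threadIdx.x] = segment_sum;
}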
bool ALL_LANES_VALID,
typename ReductionOp>
typename ReductionOp>
// Dispatch on whether the target arch provides warp ballot
return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
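The Int2Type<(PTX_ARCH >= 200)> argument turns a compile-time boolean into a distinct type, so overload resolution picks the ballot-based or smem-flag variant with no runtime branch. The idiom in isolation (editorial sketch with a local Int2Type; CUB's own lives in util_type.cuh, and the Run name is hypothetical):

#include <cstdio>

template <int VALUE>
struct Int2Type { enum { V = VALUE }; };    // an integral constant as a type

void Run(Int2Type<1>) { std::printf("ballot path\n"); }     // arch has ballot
void Run(Int2Type<0>) { std::printf("smem-flag path\n"); }  // fallback path

int main()
{
    const int PTX_ARCH = 350;               // hypothetical target arch
    Run(Int2Type<(PTX_ARCH >= 200)>());     // overload resolved at compile time
    return 0;
}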
// Referenced symbols (from the Doxygen cross-references):

#define CUB_MIN(a, b)
    Select minimum(a, b).
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskGt()
    Returns the warp lane mask of all lanes greater than the calling thread.
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneId()
    Returns the warp lane ID of the calling thread.
CUB_NS_PREFIX / CUB_NS_POSTFIX
    Optional outer namespace(s).
Int2Type
    Allows for the treatment of an integral constant as a type at compile-time
    (e.g., to achieve static call dispatch based on constant integral values).
Log2
    Statically determine log2(N), rounded up.
PowerOfTwo
    Statically determine if N is a power-of-two.
Uninitialized
    A storage-backing wrapper that allows types with non-trivial constructors
    to be aliased in unions.
// WarpReduceSmem member summary:

struct WarpReduceSmem
    WarpReduceSmem provides smem-based variants of parallel reduction of items
    partitioned across a CUDA thread warp.

Constants:
    IS_ARCH_WARP        Whether the logical warp size and the PTX warp size coincide.
    IS_POW_OF_TWO       Whether the logical warp size is a power-of-two.
    STEPS               The number of warp scan steps.
    HALF_WARP_THREADS   The number of threads in half a warp.
    WARP_SMEM_ELEMENTS  The number of shared memory elements per warp.
    UNSET               FlagT status (when not using ballot).

Types:
    typedef unsigned char SmemFlag
        Shared memory flag type.
    struct _TempStorage
        Shared memory storage layout type (1.5 warps-worth of elements for each warp).

Members:
    __device__ __forceinline__ WarpReduceSmem(TempStorage &temp_storage)
        Constructor.
    __device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op)
    __device__ __forceinline__ T ReduceStep(T input, int valid_items, ReductionOp reduction_op, Int2Type<STEP>)
    __device__ __forceinline__ T ReduceStep(T input, int valid_items, ReductionOp, Int2Type<STEPS>)
    __device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
    __device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type<true>)
    __device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type<false>)
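The Reduce member above backs cub::WarpReduce's partially-full-warp interface. A minimal usage sketch against the public CUB 1.x API (editorial; assumes one 32-thread warp per block and a runtime valid_items count):

#include <cub/warp/warp_reduce.cuh>

__global__ void partial_warp_sum_kernel(const int *in, int *out, int valid_items)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    // Lanes past valid_items still participate but their inputs are ignored
    int input = (threadIdx.x < valid_items) ? in[threadIdx.x] : 0;
    int aggregate = WarpReduce(temp_storage).Sum(input, valid_items);

    if (threadIdx.x == 0)
        *out = aggregate;   // only lane 0 holds the valid aggregate
}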