#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"

template <
    typename    T,                      ///< Data type being reduced
    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize
struct WarpReduceSmem
{
    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of threads in half a warp
        HALF_WARP_THREADS = 1 << (STEPS - 1),

        /// The number of shared memory elements per warp
        WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,

        /// FlagT status (when not using ballot)
        UNSET = 0x0,    // Is initially unset
        SET   = 0x1,    // Is initially set
        SEEN  = 0x2,    // Has seen another head flag from a lane within its segment
    };

    /// Shared memory flag type
    typedef unsigned char SmemFlag;

    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
    struct _TempStorage
    {
        T        reduce[WARP_SMEM_ELEMENTS];
        SmemFlag flags[WARP_SMEM_ELEMENTS];
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};

    _TempStorage    &temp_storage;
    unsigned int    lane_id;
    unsigned int    member_mask;

    /// Constructor
    __device__ __forceinline__ WarpReduceSmem(TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),

        lane_id(IS_ARCH_WARP ?
            LaneId() :
            LaneId() % LOGICAL_WARP_THREADS),

        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO) ?
            0 :     // arch-width and non-power-of-two sub-warps cannot be tiled within the arch-warp
            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
    {}
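To make the initializer arithmetic concrete, a worked trace (the values are chosen purely for illustration), assuming LOGICAL_WARP_THREADS = 16 so that two logical warps tile one 32-lane arch warp, viewed from a thread with LaneId() == 20:

    // LaneId() = 20, LOGICAL_WARP_THREADS = 16 (power-of-two, not arch-width):
    //   lane_id     = 20 % 16                    = 4
    //   base mask   = 0xffffffff >> (32 - 16)    = 0x0000ffff
    //   shift       = (20 / 16) * 16             = 16
    //   member_mask = 0x0000ffff << 16           = 0xffff0000
    // i.e. the thread is lane 4 of the upper 16-lane logical warp.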
    /// Reduction step
    template <
        bool        ALL_LANES_VALID,    ///< Whether all lanes in each warp are contributing a valid fold of items
        typename    ReductionOp,
        int         STEP>
    __device__ __forceinline__ T ReduceStep(
        T               input,          ///< [in] Calling thread's input
        int             valid_items,    ///< [in] Total number of valid items across the logical warp
        ReductionOp     reduction_op,   ///< [in] Binary reduction operator
        Int2Type<STEP>  /*step*/)
    {
        const int OFFSET = 1 << STEP;

        // Share input through buffer
        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

        WARP_SYNC(member_mask);

        // Update input if peer_addend is in range
        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
        {
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
            input = reduction_op(input, peer_addend);
        }

        WARP_SYNC(member_mask);

        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());
    }
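For intuition, the recursion above unrolls into STEPS rounds of pairwise folding. An illustrative trace, assuming LOGICAL_WARP_THREADS = 8 (so STEPS = 3), summation as the operator, and every lane contributing the value 1:

    // STEP 0 (OFFSET 1): lane i adds lane i+1  ->  lanes hold 2,2,2,2,2,2,2,1
    // STEP 1 (OFFSET 2): lane i adds lane i+2  ->  lanes hold 4,4,4,4,4,3,2,1
    // STEP 2 (OFFSET 4): lane i adds lane i+4  ->  lane 0 holds 8, the warp total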
    /// Reduction step (terminate)
    template <
        bool        ALL_LANES_VALID,    ///< Whether all lanes in each warp are contributing a valid fold of items
        typename    ReductionOp>
    __device__ __forceinline__ T ReduceStep(
        T               input,          ///< [in] Calling thread's input
        int             valid_items,    ///< [in] Total number of valid items across the logical warp
        ReductionOp     /*reduction_op*/,
        Int2Type<STEPS> /*step*/)
    {
        return input;
    }
    /// Ballot-based segmented reduce
    template <
        bool        HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename    FlagT,
        typename    ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,          ///< [in] Calling thread's input
        FlagT           flag,           ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op,   ///< [in] Binary reduction operator
        Int2Type<true>  /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality
    {
        // Get the start flags for each thread in the warp
        int warp_flags = WARP_BALLOT(flag, member_mask);

        if (!HEAD_SEGMENTED)
            warp_flags <<= 1;

        // Keep bits above the current thread
        warp_flags &= LaneMaskGt();

        // Accommodate packing of multiple logical warps in a single physical warp
        if (!IS_ARCH_WARP)
            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;

        // Find next flag
        int next_flag = __clz(__brev(warp_flags));

        // Clip the next segment at the warp boundary if necessary
        if (LOGICAL_WARP_THREADS != 32)
            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);

        #pragma unroll
        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input into buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Update input if peer_addend is in range
            if (OFFSET + lane_id < next_flag)
            {
                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
                input = reduction_op(input, peer_addend);
            }

            WARP_SYNC(member_mask);
        }

        return input;
    }
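A short worked trace of the flag arithmetic above (an assumed example: head-segmented, full 32-lane warp, head flags at lanes 0 and 5, viewed from lane 2):

    // WARP_BALLOT(...)          -> warp_flags has bits 0 and 5 set (0x21)
    // warp_flags &= LaneMaskGt(): lane 2 keeps only bits above itself -> bit 5
    // __brev(warp_flags)        : bit 5 moves to bit 31 - 5 = 26
    // __clz(...)                = 5  -> next_flag = 5
    // Lane 2 may therefore fold peers at lanes 3 and 4, but never the next
    // segment's head at lane 5.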
    /// Smem-based segmented reduce
    template <
        bool        HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename    FlagT,
        typename    ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,          ///< [in] Calling thread's input
        FlagT           flag,           ///< [in] Whether or not the current lane is a segment head/tail
        ReductionOp     reduction_op,   ///< [in] Binary reduction operator
        Int2Type<false> /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality
    {
        // Alias flags onto shared data storage
        volatile SmemFlag *flag_storage = temp_storage.flags;

        SmemFlag flag_status = (flag) ? SET : UNSET;

        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input through buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Get peer from buffer
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);

            WARP_SYNC(member_mask);

            // Share flag through buffer
            flag_storage[lane_id] = flag_status;

            WARP_SYNC(member_mask);

            // Get peer flag from buffer
            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];

            WARP_SYNC(member_mask);

            // Update input if peer was in range
            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
            {
                if (HEAD_SEGMENTED)
                {
                    // Head-segmented: fold only until a head flag is seen
                    if ((flag_status & SEEN) == 0)
                    {
                        if (peer_flag_status & SET)
                            flag_status |= SEEN;    // peer is a segment head: stop folding
                        else
                            input = reduction_op(input, peer_addend);

                        // Propagate the peer's seen status
                        flag_status |= (peer_flag_status & SEEN);
                    }
                }
                else
                {
                    // Tail-segmented: fold while no tail flag has been encountered
                    if (!flag_status)
                    {
                        input = reduction_op(input, peer_addend);
                        flag_status |= peer_flag_status;
                    }
                }
            }
        }

        return input;
    }
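As a usage illustration (not part of this header): a minimal kernel sketch exercising the head-segmented path through the public cub::WarpReduce front-end, which dispatches to WarpReduceSmem on architectures without shuffle. The kernel name and flag pattern are hypothetical.

    #include <cub/cub.cuh>

    // Hypothetical kernel (launch with one 32-thread block): head flags at
    // lanes 0, 8, 16, and 24 split the warp into four 8-lane segments; each
    // segment head receives its segment's sum.
    __global__ void SegmentedWarpSumKernel(int *d_out)
    {
        typedef cub::WarpReduce<int> WarpReduce;
        __shared__ typename WarpReduce::TempStorage temp_storage;

        int head_flag   = (threadIdx.x % 8 == 0) ? 1 : 0;
        int thread_data = 1;    // every lane contributes 1

        int segment_sum = WarpReduce(temp_storage).HeadSegmentedSum(thread_data, head_flag);

        if (head_flag)
            d_out[threadIdx.x / 8] = segment_sum;   // each head writes 8
    }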
    /// Reduction interface
    template <
        bool        ALL_LANES_VALID,    ///< Whether all lanes in each warp are contributing a valid fold of items
        typename    ReductionOp>
    __device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op)
    {
        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());
    }

    /// Segmented reduction interface
    template <
        bool        HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
        typename    FlagT,
        typename    ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
    {
        // Statically dispatch on whether the target arch has ballot functionality
        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
    }
};
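A companion sketch for the unsegmented interface, again through cub::WarpReduce: a partial reduction where only the first valid_items lanes contribute, exercising the ALL_LANES_VALID = false path underneath. The kernel name is hypothetical.

    #include <cub/cub.cuh>

    // Hypothetical kernel (launch with one 32-thread block): the warp sums
    // only its first valid_items inputs.
    __global__ void PartialWarpSumKernel(int *d_out, int valid_items)
    {
        typedef cub::WarpReduce<int> WarpReduce;
        __shared__ typename WarpReduce::TempStorage temp_storage;

        int thread_data = 1;    // every lane offers 1; only valid_items are folded
        int sum = WarpReduce(temp_storage).Sum(thread_data, valid_items);

        if (threadIdx.x == 0)
            d_out[0] = sum;     // the warp-wide result is only defined on lane 0
    }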
Member and cross-reference summaries:

__device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op)
    Warp-wide reduction interface.

__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
    Segmented reduction interface.

__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type< true >)
    Ballot-based segmented reduce.

WarpReduceSmem
    WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.

IS_ARCH_WARP
    Whether the logical warp size and the PTX warp size coincide.

STEPS
    The number of warp scan steps.

WARP_SMEM_ELEMENTS
    The number of shared memory elements per warp.

UNSET / SET / SEEN
    FlagT status (when not using ballot).

_TempStorage
    Shared memory storage layout type (1.5 warps-worth of elements for each warp).

PowerOfTwo
    Statically determine if N is a power-of-two.

__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
    Warp-synchronous ballot across the lanes named in member_mask.

__device__ __forceinline__ unsigned int LaneId()
    Returns the warp lane ID of the calling thread.

CUB_NS_PREFIX
    Optional outer namespace(s).
HALF_WARP_THREADS
    The number of threads in half a warp.

IS_POW_OF_TWO
    Whether the logical warp size is a power-of-two.

unsigned char SmemFlag
    Shared memory flag type.

__device__ __forceinline__ WarpReduceSmem(TempStorage &temp_storage)
    Constructor.

__device__ __forceinline__ T ReduceStep(T input, int valid_items, ReductionOp reduction_op, Int2Type< STEP >)
    Reduction step.

__device__ __forceinline__ T ReduceStep(T input, int valid_items, ReductionOp, Int2Type< STEPS >)
    Reduction step (terminate).

__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type< false >)
    Smem-based segmented reduce.

__device__ __forceinline__ unsigned int LaneMaskGt()
    Returns the warp lane mask of all lanes greater than the calling thread.

__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
    Warp-synchronous barrier across the lanes named in member_mask.

#define CUB_MIN(a, b)
    Select minimum(a, b).

Int2Type
    Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).

Uninitialized
    A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.

Log2
    Statically determine log2(N), rounded up.
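The Int2Type idiom referenced throughout deserves one standalone illustration: a minimal sketch (plain C++, hypothetical names) of how the Int2Type<STEP> overloads unroll a loop at compile time, the same way Reduce() kicks off ReduceStep(..., Int2Type<0>()) and the Int2Type<STEPS> overload terminates the recursion.

    // Minimal restatement of cub::Int2Type
    template <int VALUE>
    struct Int2Type { static const int V = VALUE; };

    const int STEPS = 3;    // stand-in for Log2<LOGICAL_WARP_THREADS>::VALUE

    // Terminal overload: matched once the step counter reaches STEPS
    inline int FoldDemo(int acc, Int2Type<STEPS>) { return acc; }

    // Recursive overload: fold one step, then recurse with STEP + 1
    template <int STEP>
    inline int FoldDemo(int acc, Int2Type<STEP>)
    {
        acc += (1 << STEP);     // stand-in for one pairwise reduction step
        return FoldDemo(acc, Int2Type<STEP + 1>());
    }

    // FoldDemo(0, Int2Type<0>()) statically unrolls to 0 + 1 + 2 + 4 = 7.

The same idiom drives dispatch: Int2Type<(PTX_ARCH >= 200)> resolves to Int2Type<1> or Int2Type<0>, selecting the ballot-based or smem-based SegmentedReduce overload with no runtime branch.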