36#include "../../thread/thread_operators.cuh"
37#include "../../thread/thread_load.cuh"
38#include "../../thread/thread_store.cuh"
39#include "../../util_type.cuh"
40#include "../../util_namespace.cuh"
53 int LOGICAL_WARP_THREADS,
64 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
95 unsigned int member_mask;
106 temp_storage(temp_storage.Alias()),
110 LaneId() % LOGICAL_WARP_THREADS),
114 ((
LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
132 const int OFFSET = 1 << STEP;
140 if (HAS_IDENTITY || (lane_id >= OFFSET))
142 T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
HALF_WARP_THREADS + lane_id - OFFSET]);
143 partial =
scan_op(addend, partial);
170 ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (
CellT) identity);
181 template <
typename ScanOp,
int IS_PRIMITIVE>
205 unsigned int src_lane)
207 if (lane_id == src_lane)
209 ThreadStore<STORE_VOLATILE>(temp_storage, (
CellT) input);
214 return (T)ThreadLoad<LOAD_VOLATILE>(temp_storage);
223 template <
typename ScanOp>
234 template <
typename ScanOp>
248 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
WARP_SMEM_ELEMENTS - 1]);
259 template <
typename ScanOpT,
typename IsIntegerT>
272 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
HALF_WARP_THREADS + lane_id - 1]);
284 exclusive = inclusive - input;
288 template <
typename ScanOpT,
typename IsIntegerT>
297 inclusive =
scan_op(initial_value, inclusive);
302 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
HALF_WARP_THREADS + lane_id - 1]);
304 exclusive = initial_value;
316 inclusive =
scan_op(initial_value, inclusive);
317 exclusive = inclusive - input;
322 template <
typename ScanOpT,
typename IsIntegerT>
336 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
HALF_WARP_THREADS + lane_id - 1]);
337 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
WARP_SMEM_ELEMENTS - 1]);
354 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
WARP_SMEM_ELEMENTS - 1]);
355 exclusive = inclusive - input;
359 template <
typename ScanOpT,
typename IsIntegerT>
374 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
WARP_SMEM_ELEMENTS - 1]);
379 inclusive =
scan_op(initial_value, inclusive);
386 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[
HALF_WARP_THREADS + lane_id - 2]);
389 exclusive = initial_value;
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Type selection (IF ? ThenType : ElseType)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Statically determine log2(N), rounded up.
Statically determine if N is a power-of-two.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA ...
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, Int2Type< IS_PRIMITIVE >)
Inclusive prefix scan.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum, Int2Type< true >)
Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Inclusive scan.
__device__ __forceinline__ WarpScanSmem(TempStorage &temp_storage)
Constructor.
@ IS_POW_OF_TWO
Whether the logical warp size is a power-of-two.
@ HALF_WARP_THREADS
The number of threads in half a warp.
@ STEPS
The number of warp scan steps.
@ IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
@ WARP_SMEM_ELEMENTS
The number of shared memory elements per warp.
CellT _TempStorage[WARP_SMEM_ELEMENTS]
Shared memory storage layout type (1.5 warps-worth of elements for each warp)
__device__ __forceinline__ void ScanStep(T &partial, ScanOp scan_op, Int2Type< STEP >)
Basic inclusive scan iteration (template unrolled, inductive-case specialization)
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT, IsIntegerT)
Update inclusive, exclusive, and warp aggregate using input and inclusive.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum scan_op, T initial_value, Int2Type< true >)
Update inclusive and exclusive using initial value using input and inclusive (specialized for summati...
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT scan_op, T initial_value, IsIntegerT)
Update inclusive and exclusive using initial value using input, inclusive, and initial value.
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT, IsIntegerT)
Update inclusive and exclusive using input and inclusive.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, T &warp_aggregate, cub::Sum, Int2Type< true >)
Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation ...
__device__ __forceinline__ void InclusiveScan(T input, T &output, Sum scan_op, Int2Type< true >)
Inclusive prefix scan (specialized for summation across primitive types)
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast.
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT)
Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value.
__device__ __forceinline__ void ScanStep(T &, ScanOp, Int2Type< STEPS >)
Basic inclusive scan iteration(template unrolled, base-case specialization)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &warp_aggregate)
Inclusive scan with aggregate.