#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_discontinuity.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../iterator/constant_input_iterator.cuh"
#include "../util_namespace.cuh"
    int _ITEMS_PER_THREAD,                          ///< Items per thread (per tile of input)
    typename AgentSegmentFixupPolicyT,              ///< Parameterized AgentSegmentFixupPolicy tuning policy type
    typename PairsInputIteratorT,                   ///< Random-access input iterator type for the key-value pairs
    typename AggregatesOutputIteratorT,             ///< Random-access output iterator type for the value aggregates
    typename EqualityOpT,                           ///< KeyT equality operator type
    typename ReductionOpT,                          ///< ValueT reduction operator type
    // The key-value pair type of the input iterator, and its value type
    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
    typedef typename KeyValuePairT::Value ValueT;
        BLOCK_THREADS    = AgentSegmentFixupPolicyT::BLOCK_THREADS,    // Threads per thread block
        ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, // Items per thread (per tile of input)
        TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,           // Items per tile
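    // For orientation: the constants above come from the tuning policy. The instantiation
    // below is only a sketch; the block size, items per thread, and the chosen load/scan
    // algorithms are illustrative assumptions, not the library's shipped tuning.
    typedef AgentSegmentFixupPolicy<
            128,                        // _BLOCK_THREADS: threads per thread block
            9,                          // _ITEMS_PER_THREAD: items per thread (per tile of input)
            BLOCK_LOAD_DIRECT,          // _LOAD_ALGORITHM: the BlockLoad algorithm to use
            LOAD_LDG,                   // _LOAD_MODIFIER: cache load modifier for reading input elements
            BLOCK_SCAN_WARP_SCANS>      // _SCAN_ALGORITHM: the BlockScan algorithm to use
        SegmentFixupPolicySketchT;      // TILE_ITEMS would then be 128 * 9 = 1152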
    // If the supplied pairs iterator is a raw pointer, wrap it with a cache-modified input
    // iterator that applies the policy's LOAD_MODIFIER; otherwise use the iterator directly
            PairsInputIteratorT>::Type
        WrappedPairsInputIteratorT;

    // Same selection for the iterator used to read back previously written aggregates during fixup
            AggregatesOutputIteratorT>::Type
        WrappedFixupInputIteratorT;
    // BlockLoad type for the key-value pairs, parameterized by the policy's load algorithm
            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
    // BlockScan type for the reduce-by-key prefix scan, parameterized by the policy's scan algorithm
            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
    __device__ __forceinline__
    // Process an input tile using global atomics to apply the fixup (Int2Type<true> specialization)
    template <bool IS_LAST_TILE>
        KeyValuePairT pairs[ITEMS_PER_THREAD];

        // Out-of-bounds default pair used when loading a partially-full last tile
        KeyValuePairT oob_pair;

        // Local run-length reduction: accumulate values while the key repeats, then
        // atomically add each completed run's total into the output
        #pragma unroll
        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
            if (pairs[ITEM].key != pairs[ITEM - 1].key)
                atomicAdd(d_scatter, pairs[ITEM - 1].value);
            else
                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
        }

        // Flush the final run if it is valid (an out-of-bounds key is negative)
        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
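    // The loop above is the core of the atomic path: each thread folds runs of equal keys in
    // registers and issues one atomicAdd per completed run. As a standalone sketch of that
    // pattern (not the agent's code; the kernel name, the fixed ITEMS = 4, and the int/float
    // types are assumptions), the same run-length fixup applied to a flat array of pairs:
    __global__ void RleAtomicFixupSketch(
        const int*   keys,              // one key per pair
        const float* values,            // one partial value per pair
        float*       aggregates_out,    // indexed by key; receives the fixed-up totals
        int          num_pairs)
    {
        const int ITEMS = 4;            // pairs owned by each thread (illustrative)
        int base = (blockIdx.x * blockDim.x + threadIdx.x) * ITEMS;
        if (base >= num_pairs) return;

        int   run_key   = keys[base];
        float run_total = values[base];

        for (int i = 1; (i < ITEMS) && (base + i < num_pairs); ++i)
        {
            if (keys[base + i] != run_key)
            {
                // Run ended: flush its total with a single atomic, then start the next run
                atomicAdd(aggregates_out + run_key, run_total);
                run_key   = keys[base + i];
                run_total = values[base + i];
            }
            else
            {
                // Same key: keep accumulating in registers
                run_total += values[base + i];
            }
        }

        // Flush the final run
        atomicAdd(aggregates_out + run_key, run_total);
    }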
    // Process an input tile using a block-wide reduce-by-key scan to apply the fixup (Int2Type<false> specialization)
    template <bool IS_LAST_TILE>
        KeyValuePairT pairs[ITEMS_PER_THREAD];
        KeyValuePairT scatter_pairs[ITEMS_PER_THREAD];

        // Out-of-bounds default pair used when loading a partially-full last tile
        KeyValuePairT oob_pair;

        // Block-wide aggregate of this tile's scanned pairs
        KeyValuePairT tile_aggregate;

        // The first scatter key is undefined after the exclusive scan; reset it so it does not trigger a flush
        if (threadIdx.x == 0)
            scatter_pairs[0].key = pairs[0].key;

        // Subsequent tiles obtain their running prefix (and block aggregate) from the decoupled look-back callback
        tile_aggregate = prefix_op.GetBlockAggregate();
        // Scatter: wherever the scanned key differs from the original key, a segment ended and
        // its partial must be folded into the value already stored at that key's output slot
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
            {
                ValueT value = d_fixup_in[scatter_pairs[ITEM].key];
                value = reduction_op(value, scatter_pairs[ITEM].value);
                d_aggregates_out[scatter_pairs[ITEM].key] = value;
            }
        // On the last tile, the last thread finalizes the last segment if the tile is full
        if (threadIdx.x == BLOCK_THREADS - 1)
            if (num_remaining == TILE_ITEMS)
            {
                OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
                d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
            }
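    // Why the scan path reads d_fixup_in before writing: an earlier tile (or an upstream pass)
    // may already have stored a partial aggregate at the same key, so the new partial has to be
    // combined with it via reduction_op rather than overwrite it. A small host-side sketch of
    // that read-modify-write, assuming addition as the reduction operator:
    #include <cstdio>

    int main()
    {
        // Analogue of d_aggregates_out / d_fixup_in: key 2 already holds a partial total of 3.0
        float aggregates_out[4] = {1.0f, 5.0f, 3.0f, 2.0f};

        int   key     = 2;      // segment whose remaining partial this pass carries
        float partial = 4.0f;   // the partial produced for that segment by another tile

        float value = aggregates_out[key];   // read the previously written partial
        value = value + partial;             // reduction_op(value, partial)
        aggregates_out[key] = value;         // key 2 now holds the complete total, 7.0

        printf("aggregate for key %d = %f\n", key, aggregates_out[key]);
        return 0;
    }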
        // One tile of input per thread block; blocks are launched in tile order
        int     tile_idx      = (blockIdx.x * gridDim.y) + blockIdx.y;
        OffsetT tile_offset   = tile_idx * TILE_ITEMS;
        OffsetT num_remaining = num_items - tile_offset;

        if (num_remaining > TILE_ITEMS)
            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());   // Full tile
        else if (num_remaining > 0)
            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());    // Partially-full last tile
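    // For context: an enclosing kernel typically owns the agent's shared-memory temp storage,
    // constructs the agent, and hands it the whole range. The outline below is a sketch built
    // only from the constructor and ConsumeRange signatures shown in this listing; the kernel
    // name and the choice of cub::Equality / cub::Sum as the functors are assumptions.
    template <
        typename AgentSegmentFixupPolicyT,
        typename PairsInputIteratorT,
        typename AggregatesOutputIteratorT,
        typename OffsetT,
        typename ScanTileStateT>
    __global__ void SegmentFixupKernelSketch(
        PairsInputIteratorT         d_pairs_in,
        AggregatesOutputIteratorT   d_aggregates_out,
        int                         num_items,
        int                         num_tiles,
        ScanTileStateT              tile_state)
    {
        // Specialize the agent for this kernel's types (OffsetT assumed to be the agent's last parameter)
        typedef cub::AgentSegmentFixup<
                AgentSegmentFixupPolicyT,
                PairsInputIteratorT,
                AggregatesOutputIteratorT,
                cub::Equality,
                cub::Sum,
                OffsetT>
            AgentSegmentFixupT;

        // Shared memory backing for the agent's collective operations
        __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;

        // Construct the agent, then consume one tile per thread block
        AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum())
            .ConsumeRange(num_items, num_tiles, tile_state);
    }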
BlockLoad
The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.
BlockScan
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op)
Computes an exclusive block-wide prefix scan using the specified binary scan_op functor.
BlockLoadAlgorithm
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
int num_items
[in] Total number of input data items
int num_tiles
[in] Total number of tiles for the entire problem
BlockScanAlgorithm
BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
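The two collectives described above are exactly what the agent composes: BlockLoad stages each tile of pairs into registers and BlockScan's ExclusiveScan combines the per-thread results. The kernel below is a minimal, self-contained sketch of that pairing; the 128-thread block, 4 items per thread, int data, and cub::Sum are illustrative assumptions, not the agent's configuration.

    #include <cub/block/block_load.cuh>
    #include <cub/block/block_scan.cuh>

    __global__ void LoadThenScanSketch(const int* d_in, int* d_out)
    {
        const int BLOCK_THREADS    = 128;
        const int ITEMS_PER_THREAD = 4;

        // Parameterize the collectives, as the agent does from its tuning policy
        typedef cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT> BlockLoadT;
        typedef cub::BlockScan<int, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>               BlockScanT;

        // Shared memory for the collectives (aliased in a union, as is conventional)
        __shared__ union
        {
            typename BlockLoadT::TempStorage load;
            typename BlockScanT::TempStorage scan;
        } temp_storage;

        // Collectively load a linear segment of items into a blocked arrangement
        int items[ITEMS_PER_THREAD];
        BlockLoadT(temp_storage.load).Load(d_in, items);
        __syncthreads();

        // Reduce each thread's items, then exclusive-scan the per-thread totals
        int thread_total = 0;
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            thread_total += items[i];

        int thread_prefix;
        BlockScanT(temp_storage.scan).ExclusiveScan(thread_total, thread_prefix, 0, cub::Sum());

        // Each thread writes its exclusive prefix (bounds handling omitted for brevity)
        d_out[threadIdx.x] = thread_prefix;
    }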
ScanTileStateT tile_state
Tile status interface.
EqualityOpT equality_op
KeyT equality operator.
static const BlockLoadAlgorithm LOAD_ALGORITHM
The BlockLoad algorithm to use.
static const CacheLoadModifier LOAD_MODIFIER
Cache load modifier for reading input elements.
static const BlockScanAlgorithm SCAN_ALGORITHM
The BlockScan algorithm to use.
BLOCK_THREADS
Threads per thread block.
ITEMS_PER_THREAD
Items per thread (per tile of input).
AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
WrappedFixupInputIteratorT d_fixup_in
Fixup input values.
__device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state, Int2Type< false > use_atomic_fixup)
__device__ __forceinline__ void ConsumeRange(int num_items, int num_tiles, ScanTileStateT &tile_state)
AggregatesOutputIteratorT d_aggregates_out
Output value aggregates.
ReduceBySegmentOpT scan_op
Reduce-by-segment scan operator.
WrappedPairsInputIteratorT d_pairs_in
Input key-value pairs.
ReductionOpT reduction_op
Reduction operator.
_TempStorage & temp_storage
Reference to temp_storage.
__device__ __forceinline__ AgentSegmentFixup(TempStorage &temp_storage, PairsInputIteratorT d_pairs_in, AggregatesOutputIteratorT d_aggregates_out, EqualityOpT equality_op, ReductionOpT reduction_op)
__device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state, Int2Type< true > use_atomic_fixup)
InequalityWrapper< EqualityOpT > inequality_op
KeyT inequality operator.
Type selection (IF ? ThenType : ElseType)
Inequality functor (wraps equality functor)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
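The two ConsumeTile overloads in the listing are selected with the Int2Type device described above: a compile-time flag becomes a distinct type, so overload resolution picks the atomic or the scan-based path with no runtime branch. A minimal sketch of the mechanism follows; the Process functions and the USE_ATOMIC flag are illustrative, not the agent's code.

    #include <cub/util_type.cuh>
    #include <cstdio>

    // Two overloads distinguished only by the Int2Type tag type
    void Process(cub::Int2Type<true>  /*use_atomic_fixup*/) { printf("atomic fixup path\n"); }
    void Process(cub::Int2Type<false> /*use_atomic_fixup*/) { printf("scan-based fixup path\n"); }

    int main()
    {
        // A compile-time constant (in the agent, derived from CUB_PTX_ARCH and the value type)
        // is turned into a type, so the dispatch is resolved entirely at compile time
        const bool USE_ATOMIC = true;
        Process(cub::Int2Type<USE_ATOMIC>());
        return 0;
    }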