#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_discontinuity.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../iterator/constant_input_iterator.cuh"
#include "../util_namespace.cuh"

    int _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
    typename    AgentSegmentFixupPolicyT,       ///< Parameterized tuning policy type
    typename    PairsInputIteratorT,            ///< Random-access input iterator type for the key-value pairs
    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for the value aggregates
    typename    ReductionOpT,                   ///< ValueT reduction operator type
    // The key-value pair type loaded from the pairs input iterator
    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;

    // The value type of the pairs
    typedef typename KeyValuePairT::Value ValueT;
        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,     ///< Threads per thread block
        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,            ///< Items per tile
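//
// Aside: a minimal sketch of the reduce-by-key combining step that a reduce-by-segment scan
// operator applies to such key-value pairs. This is an illustration, not CUB's definition:
// the Pair struct, the SumReduceByKey functor, and the host driver below are assumptions made
// for the example; the agent itself scans KeyValuePairT items using its ReductionOpT.
//

#include <cstdio>

struct Pair { int key; float value; };

struct SumReduceByKey
{
    // Combine two partials scanned left to right: equal keys reduce their values,
    // while a new key on the right-hand side starts a fresh segment.
    __host__ __device__ Pair operator()(const Pair &a, const Pair &b) const
    {
        Pair retval;
        retval.key   = b.key;
        retval.value = (a.key == b.key) ? a.value + b.value : b.value;
        return retval;
    }
};

int main()
{
    SumReduceByKey op;
    Pair a{3, 2.0f}, b{3, 5.0f}, c{4, 1.0f};
    Pair ab  = op(a, b);    // same key:   {3, 7}
    Pair abc = op(ab, c);   // key change: {4, 1}
    printf("{%d, %g} {%d, %g}\n", ab.key, ab.value, abc.key, abc.value);
    return 0;
}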
    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,
            PairsInputIteratorT>::Type
        WrappedPairsInputIteratorT;
    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,
            AggregatesOutputIteratorT>::Type
        WrappedFixupInputIteratorT;
            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>   ///< The BlockLoad algorithm to use
            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>   ///< The BlockScan algorithm to use
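//
// Aside: a minimal sketch of how a policy's LOAD_ALGORITHM / SCAN_ALGORITHM parameterize the
// cub::BlockLoad and cub::BlockScan collectives, analogous to the typedefs above. The kernel
// name, block size, item count, and algorithm choices are assumptions for the example only.
//

#include <cub/block/block_load.cuh>
#include <cub/block/block_scan.cuh>

__global__ void ExampleTileScan(const int *d_in, int *d_out)
{
    enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD };

    // Collectives specialized the same way a tuning policy would specialize them
    typedef cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
    typedef cub::BlockScan<int, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>                       BlockScanT;

    // The two collectives run in different phases, so their temp storage may be aliased
    __shared__ union
    {
        typename BlockLoadT::TempStorage load;
        typename BlockScanT::TempStorage scan;
    } temp_storage;

    int items[ITEMS_PER_THREAD];
    BlockLoadT(temp_storage.load).Load(d_in + blockIdx.x * TILE_ITEMS, items);

    __syncthreads();    // Required because the aliased temp storage is reused below

    BlockScanT(temp_storage.scan).ExclusiveSum(items, items);

    // Write the blocked arrangement straight back (a BlockStore would normally be used here)
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        d_out[blockIdx.x * TILE_ITEMS + threadIdx.x * ITEMS_PER_THREAD + ITEM] = items[ITEM];
}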
    __device__ __forceinline__
    AgentSegmentFixup(TempStorage &temp_storage, PairsInputIteratorT d_pairs_in, AggregatesOutputIteratorT d_aggregates_out, EqualityOpT equality_op, ReductionOpT reduction_op)
    template <bool IS_LAST_TILE>
        KeyValuePairT pairs[ITEMS_PER_THREAD];

        // Out-of-bounds default pair for loading the last (partial) tile
        KeyValuePairT oob_pair;

        // Merge consecutive pairs that share a key; flush each completed run with an atomicAdd
        #pragma unroll
        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            ValueT *d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
            if (pairs[ITEM].key != pairs[ITEM - 1].key)
                atomicAdd(d_scatter, pairs[ITEM - 1].value);
            else
                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
        }

        // Flush the thread's last pair unless it is an out-of-bounds item in the last tile
        ValueT *d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
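//
// Aside: a standalone sketch of the atomic fixup idea used by this path. Each thread walks its
// items in key order, folds together consecutive items that share a key, and atomically adds
// every completed run's total into the output slot for that key. The kernel name, item count,
// and raw key/value arrays are assumptions; the real agent loads cub::KeyValuePair tiles.
//

#include <cstdio>
#include <cuda_runtime.h>

__global__ void AtomicFixupSketch(const int *d_keys, const float *d_values, float *d_out, int num_items)
{
    const int ITEMS_PER_THREAD = 4;
    int base = (blockIdx.x * blockDim.x + threadIdx.x) * ITEMS_PER_THREAD;
    if (base >= num_items) return;

    // Merge consecutive same-key items in registers, flushing each finished run atomically
    int   run_key   = d_keys[base];
    float run_total = d_values[base];
    for (int i = 1; i < ITEMS_PER_THREAD && base + i < num_items; ++i)
    {
        if (d_keys[base + i] != run_key)
        {
            atomicAdd(d_out + run_key, run_total);      // run ended: commit its total
            run_key   = d_keys[base + i];
            run_total = d_values[base + i];
        }
        else
        {
            run_total += d_values[base + i];            // same segment: keep reducing
        }
    }
    atomicAdd(d_out + run_key, run_total);              // commit the thread's last run
}

int main()
{
    const int n = 8;
    int   h_keys[n]   = {0, 0, 1, 1, 1, 2, 2, 3};
    float h_values[n] = {1, 2, 3, 4, 5, 6, 7, 8};
    float h_out[4]    = {0, 0, 0, 0};

    int *d_keys; float *d_values, *d_out;
    cudaMalloc(&d_keys, sizeof(h_keys));
    cudaMalloc(&d_values, sizeof(h_values));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_keys, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
    cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, h_out, sizeof(h_out), cudaMemcpyHostToDevice);

    AtomicFixupSketch<<<1, 2>>>(d_keys, d_values, d_out, n);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 4; ++i)
        printf("segment %d: %g\n", i, h_out[i]);        // expect 3 12 13 8

    cudaFree(d_keys);
    cudaFree(d_values);
    cudaFree(d_out);
    return 0;
}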
    template <bool IS_LAST_TILE>
        KeyValuePairT pairs[ITEMS_PER_THREAD];             // Pairs as loaded
        KeyValuePairT scatter_pairs[ITEMS_PER_THREAD];     // Pairs after the exclusive scan

        KeyValuePairT oob_pair;                            // Out-of-bounds default pair for the last tile

        KeyValuePairT tile_aggregate;                      // Block-wide aggregate of this tile's pairs

        // (First tile) thread0 restores its first scanned key, which the exclusive scan leaves invalid
        if (threadIdx.x == 0)
            scatter_pairs[0].key = pairs[0].key;

        // (Subsequent tiles) the running-prefix callback also yields the block-wide aggregate
        tile_aggregate = prefix_op.GetBlockAggregate();
        // Fix up segments whose scanned key differs from the loaded key
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
            {
                // Combine with the partial aggregate previously written for this segment
                ValueT value = d_fixup_in[scatter_pairs[ITEM].key];
                value = reduction_op(value, scatter_pairs[ITEM].value);
                d_aggregates_out[scatter_pairs[ITEM].key] = value;
            }
        // The last thread of a full last tile folds the tile aggregate into the trailing segment's slot
        if (threadIdx.x == BLOCK_THREADS - 1)
            if (num_remaining == TILE_ITEMS)
            {
                OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
                d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
            }
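//
// Aside: a minimal sketch of the read-modify-write "fixup" applied above when a scanned pair's key
// differs from the key that was loaded: the partial aggregate already written for that segment is
// read back, combined with the carried-in partial, and stored again. FixupSlot and the host driver
// are hypothetical names used only for illustration; the agent does this inline with d_fixup_in,
// reduction_op, and d_aggregates_out, and a plain sum stands in for reduction_op here.
//

#include <cstdio>

__host__ __device__ void FixupSlot(
    const float *partial_in,        // previously written partial aggregates (read side)
    float       *aggregates_out,    // aggregate output (write side)
    int          segment,           // segment index owning the slot
    float        carried)           // partial value carried into this segment by the scan
{
    float value = partial_in[segment];      // existing partial for the segment
    value = value + carried;                // plain sum stands in for reduction_op
    aggregates_out[segment] = value;        // overwrite with the fixed-up total
}

int main()
{
    float partials[3] = {10.0f, 20.0f, 30.0f};    // partials written by an earlier pass
    FixupSlot(partials, partials, 1, 5.0f);       // carry 5 into segment 1
    printf("%g %g %g\n", partials[0], partials[1], partials[2]);   // 10 25 30
    return 0;
}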
        // Blocks are launched in increasing order, so just assign one tile of input per block
        int     tile_idx      = (blockIdx.x * gridDim.y) + blockIdx.y;   // Current tile index
        OffsetT tile_offset   = tile_idx * TILE_ITEMS;                   // Global offset for the current tile
        OffsetT num_remaining = num_items - tile_offset;                 // Items remaining (including this tile)

        if (num_remaining > TILE_ITEMS)
            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());   // Full tile
        else if (num_remaining > 0)
            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());    // Partial (last) tile
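//
// Aside: a host-side sketch of the per-block tile bookkeeping performed above: one tile of
// TILE_ITEMS per thread block, with only the final block taking the partial-tile path. The tile
// size and item count below are arbitrary example values.
//

#include <cstdio>

int main()
{
    const int TILE_ITEMS = 512;     // BLOCK_THREADS * ITEMS_PER_THREAD for some policy
    const int num_items  = 1300;
    const int num_tiles  = (num_items + TILE_ITEMS - 1) / TILE_ITEMS;

    for (int tile_idx = 0; tile_idx < num_tiles; ++tile_idx)
    {
        int tile_offset   = tile_idx * TILE_ITEMS;
        int num_remaining = num_items - tile_offset;

        if (num_remaining > TILE_ITEMS)
            printf("tile %d: full tile at offset %d\n", tile_idx, tile_offset);
        else if (num_remaining > 0)
            printf("tile %d: last (partial) tile with %d items\n", tile_idx, num_remaining);
    }
    return 0;
}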