AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. More...
AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
< Signed integer type for global offsets
Definition at line 98 of file agent_reduce_by_key.cuh.
Data Structures | |
union | _TempStorage |
struct | GuardedInequalityWrapper |
struct | TempStorage |
Public Types | |
enum | { BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), HAS_IDENTITY_ZERO = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE) } |
typedef std::iterator_traits< KeysInputIteratorT >::value_type | KeyInputT |
typedef If<(Equals< typename std::iterator_traits< UniqueOutputIteratorT >::value_type, void >::VALUE), typename std::iterator_traits< KeysInputIteratorT >::value_type, typename std::iterator_traits< UniqueOutputIteratorT >::value_type >::Type | KeyOutputT |
typedef std::iterator_traits< ValuesInputIteratorT >::value_type | ValueInputT |
typedef If<(Equals< typename std::iterator_traits< AggregatesOutputIteratorT >::value_type, void >::VALUE), typename std::iterator_traits< ValuesInputIteratorT >::value_type, typename std::iterator_traits< AggregatesOutputIteratorT >::value_type >::Type | ValueOutputT |
typedef KeyValuePair< OffsetT, ValueOutputT > | OffsetValuePairT |
typedef KeyValuePair< KeyOutputT, ValueOutputT > | KeyValuePairT |
typedef ReduceByKeyScanTileState< ValueOutputT, OffsetT > | ScanTileStateT |
typedef If< IsPointer< KeysInputIteratorT >::VALUE, CacheModifiedInputIterator< AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT >, KeysInputIteratorT >::Type | WrappedKeysInputIteratorT |
typedef If< IsPointer< ValuesInputIteratorT >::VALUE, CacheModifiedInputIterator< AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT >, ValuesInputIteratorT >::Type | WrappedValuesInputIteratorT |
typedef If< IsPointer< AggregatesOutputIteratorT >::VALUE, CacheModifiedInputIterator< AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT >, AggregatesOutputIteratorT >::Type | WrappedFixupInputIteratorT |
typedef ReduceBySegmentOp< ReductionOpT > | ReduceBySegmentOpT |
typedef BlockLoad< KeyOutputT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentReduceByKeyPolicyT::LOAD_ALGORITHM > | BlockLoadKeysT |
typedef BlockLoad< ValueOutputT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentReduceByKeyPolicyT::LOAD_ALGORITHM > | BlockLoadValuesT |
typedef BlockDiscontinuity< KeyOutputT, BLOCK_THREADS > | BlockDiscontinuityKeys |
typedef BlockScan< OffsetValuePairT, BLOCK_THREADS, AgentReduceByKeyPolicyT::SCAN_ALGORITHM > | BlockScanT |
typedef TilePrefixCallbackOp< OffsetValuePairT, ReduceBySegmentOpT, ScanTileStateT > | TilePrefixCallbackOpT |
typedef KeyOutputT | KeyExchangeT[TILE_ITEMS+1] |
typedef ValueOutputT | ValueExchangeT[TILE_ITEMS+1] |
Public Member Functions | |
__device__ __forceinline__ | AgentReduceByKey (TempStorage &temp_storage, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op) |
__device__ __forceinline__ void | ScatterDirect (KeyValuePairT(&scatter_items)[ITEMS_PER_THREAD], OffsetT(&segment_flags)[ITEMS_PER_THREAD], OffsetT(&segment_indices)[ITEMS_PER_THREAD]) |
__device__ __forceinline__ void | ScatterTwoPhase (KeyValuePairT(&scatter_items)[ITEMS_PER_THREAD], OffsetT(&segment_flags)[ITEMS_PER_THREAD], OffsetT(&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) |
__device__ __forceinline__ void | Scatter (KeyValuePairT(&scatter_items)[ITEMS_PER_THREAD], OffsetT(&segment_flags)[ITEMS_PER_THREAD], OffsetT(&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) |
template<bool IS_LAST_TILE> | |
__device__ __forceinline__ void | ConsumeTile (OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) |
< Whether the current tile is the last tile More... | |
__device__ __forceinline__ void | ConsumeRange (int num_items, ScanTileStateT &tile_state, int start_tile) |
Data Fields | |
_TempStorage & | temp_storage |
Reference to temp_storage. | |
WrappedKeysInputIteratorT | d_keys_in |
Input keys. | |
UniqueOutputIteratorT | d_unique_out |
Unique output keys. | |
WrappedValuesInputIteratorT | d_values_in |
Input values. | |
AggregatesOutputIteratorT | d_aggregates_out |
Output value aggregates. | |
NumRunsOutputIteratorT | d_num_runs_out |
Output pointer for total number of segments identified. | |
EqualityOpT | equality_op |
KeyT equality operator. | |
ReductionOpT | reduction_op |
Reduction operator. | |
ReduceBySegmentOpT | scan_op |
Reduce-by-segment scan operator. | |
|
inline |
temp_storage | Reference to temp_storage |
d_keys_in | Input keys |
d_unique_out | Unique output keys |
d_values_in | Input values |
d_aggregates_out | Output value aggregates |
d_num_runs_out | Output pointer for total number of segments identified |
equality_op | KeyT equality operator |
reduction_op | ValueT reduction operator |
Definition at line 271 of file agent_reduce_by_key.cuh.
|
inline |
Scan tiles of items as part of a dynamic chained scan
num_items | Total number of input items |
tile_state | Global tile state descriptor |
start_tile | The starting tile for the current grid |
Definition at line 520 of file agent_reduce_by_key.cuh.
|
inline |
< Whether the current tile is the last tile
Process a tile of input (dynamic chained scan)
num_remaining | Number of global input items remaining (including this tile) |
tile_idx | Tile index |
tile_offset | Tile offset |
tile_state | Global tile state descriptor |
Definition at line 392 of file agent_reduce_by_key.cuh.
|
inline |
Scatter flagged items
Definition at line 357 of file agent_reduce_by_key.cuh.
|
inline |
Directly scatter flagged items to output offsets
Definition at line 300 of file agent_reduce_by_key.cuh.
|
inline |
2-phase scatter flagged items to output offsets
The exclusive scan causes each head flag to be paired with the previous value aggregate: the scatter offsets must be decremented for value aggregates
Definition at line 324 of file agent_reduce_by_key.cuh.