AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. More...
Data Structures | |
| union | _TempStorage |
| struct | GuardedInequalityWrapper |
| struct | TempStorage |
Public Types | |
| enum | { BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), HAS_IDENTITY_ZERO = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE) } |
| typedef std::iterator_traits< KeysInputIteratorT >::value_type | KeyInputT |
| typedef If<(Equals< typename std::iterator_traits< UniqueOutputIteratorT >::value_type, void >::VALUE), typename std::iterator_traits< KeysInputIteratorT >::value_type, typename std::iterator_traits< UniqueOutputIteratorT >::value_type >::Type | KeyOutputT |
| typedef std::iterator_traits< ValuesInputIteratorT >::value_type | ValueInputT |
| typedef If<(Equals< typename std::iterator_traits< AggregatesOutputIteratorT >::value_type, void >::VALUE), typename std::iterator_traits< ValuesInputIteratorT >::value_type, typename std::iterator_traits< AggregatesOutputIteratorT >::value_type >::Type | ValueOutputT |
| typedef KeyValuePair< OffsetT, ValueOutputT > | OffsetValuePairT |
| typedef KeyValuePair< KeyOutputT, ValueOutputT > | KeyValuePairT |
| typedef ReduceByKeyScanTileState< ValueOutputT, OffsetT > | ScanTileStateT |
| typedef If< IsPointer< KeysInputIteratorT >::VALUE, CacheModifiedInputIterator< AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT >, KeysInputIteratorT >::Type | WrappedKeysInputIteratorT |
| typedef If< IsPointer< ValuesInputIteratorT >::VALUE, CacheModifiedInputIterator< AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT >, ValuesInputIteratorT >::Type | WrappedValuesInputIteratorT |
| typedef If< IsPointer< AggregatesOutputIteratorT >::VALUE, CacheModifiedInputIterator< AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT >, AggregatesOutputIteratorT >::Type | WrappedFixupInputIteratorT |
| typedef ReduceBySegmentOp< ReductionOpT > | ReduceBySegmentOpT |
| typedef BlockLoad< KeyOutputT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentReduceByKeyPolicyT::LOAD_ALGORITHM > | BlockLoadKeysT |
| typedef BlockLoad< ValueOutputT, BLOCK_THREADS, ITEMS_PER_THREAD, AgentReduceByKeyPolicyT::LOAD_ALGORITHM > | BlockLoadValuesT |
| typedef BlockDiscontinuity< KeyOutputT, BLOCK_THREADS > | BlockDiscontinuityKeys |
| typedef BlockScan< OffsetValuePairT, BLOCK_THREADS, AgentReduceByKeyPolicyT::SCAN_ALGORITHM > | BlockScanT |
| typedef TilePrefixCallbackOp< OffsetValuePairT, ReduceBySegmentOpT, ScanTileStateT > | TilePrefixCallbackOpT |
| typedef KeyOutputT | KeyExchangeT[TILE_ITEMS+1] |
| typedef ValueOutputT | ValueExchangeT[TILE_ITEMS+1] |
Public Member Functions | |
| __device__ __forceinline__ | AgentReduceByKey (TempStorage &temp_storage, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op) |
| __device__ __forceinline__ void | ScatterDirect (KeyValuePairT(&scatter_items)[ITEMS_PER_THREAD], OffsetT(&segment_flags)[ITEMS_PER_THREAD], OffsetT(&segment_indices)[ITEMS_PER_THREAD]) |
| __device__ __forceinline__ void | ScatterTwoPhase (KeyValuePairT(&scatter_items)[ITEMS_PER_THREAD], OffsetT(&segment_flags)[ITEMS_PER_THREAD], OffsetT(&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) |
| __device__ __forceinline__ void | Scatter (KeyValuePairT(&scatter_items)[ITEMS_PER_THREAD], OffsetT(&segment_flags)[ITEMS_PER_THREAD], OffsetT(&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) |
| template<bool IS_LAST_TILE> | |
| __device__ __forceinline__ void | ConsumeTile (OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) |
| < Whether the current tile is the last tile More... | |
| __device__ __forceinline__ void | ConsumeRange (int num_items, ScanTileStateT &tile_state, int start_tile) |
Data Fields | |
| _TempStorage & | temp_storage |
| Reference to temp_storage. | |
| WrappedKeysInputIteratorT | d_keys_in |
| Input keys. | |
| UniqueOutputIteratorT | d_unique_out |
| Unique output keys. | |
| WrappedValuesInputIteratorT | d_values_in |
| Input values. | |
| AggregatesOutputIteratorT | d_aggregates_out |
| Output value aggregates. | |
| NumRunsOutputIteratorT | d_num_runs_out |
| Output pointer for total number of segments identified. | |
| EqualityOpT | equality_op |
| KeyT equality operator. | |
| ReductionOpT | reduction_op |
| Reduction operator. | |
| ReduceBySegmentOpT | scan_op |
| Reduce-by-segment scan operator. | |
AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
< Signed integer type for global offsets
Definition at line 98 of file agent_reduce_by_key.cuh.
|
inline |
| temp_storage | Reference to temp_storage |
| d_keys_in | Input keys |
| d_unique_out | Unique output keys |
| d_values_in | Input values |
| d_aggregates_out | Output value aggregates |
| d_num_runs_out | Output pointer for total number of segments identified |
| equality_op | KeyT equality operator |
| reduction_op | ValueT reduction operator |
Definition at line 273 of file agent_reduce_by_key.cuh.
|
inline |
Scan tiles of items as part of a dynamic chained scan
| num_items | Total number of input items |
| tile_state | Global tile state descriptor |
| start_tile | The starting tile for the current grid |
Definition at line 522 of file agent_reduce_by_key.cuh.
|
inline |
< Whether the current tile is the last tile
Process a tile of input (dynamic chained scan)
| num_remaining | Number of global input items remaining (including this tile) |
| tile_idx | Tile index |
| tile_offset | Tile offset |
| tile_state | Global tile state descriptor |
Definition at line 394 of file agent_reduce_by_key.cuh.
|
inline |
Scatter flagged items
Definition at line 359 of file agent_reduce_by_key.cuh.
|
inline |
Directly scatter flagged items to output offsets
Definition at line 302 of file agent_reduce_by_key.cuh.
|
inline |
2-phase scatter flagged items to output offsets
The exclusive scan causes each head flag to be paired with the previous value aggregate: the scatter offsets must be decremented for value aggregates
Definition at line 326 of file agent_reduce_by_key.cuh.