OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
agent_segment_fixup.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
 */

#pragma once

#include <iterator>

#include "single_pass_scan_operators.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_discontinuity.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../iterator/constant_input_iterator.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentSegmentFixup
 */
template <
    int                 _BLOCK_THREADS,         ///< Threads per thread block
    int                 _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
    BlockLoadAlgorithm  _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
    CacheLoadModifier   _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
    BlockScanAlgorithm  _SCAN_ALGORITHM>        ///< The BlockScan algorithm to use
struct AgentSegmentFixupPolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
    };

    static const BlockLoadAlgorithm LOAD_ALGORITHM  = _LOAD_ALGORITHM;  ///< The BlockLoad algorithm to use
    static const CacheLoadModifier  LOAD_MODIFIER   = _LOAD_MODIFIER;   ///< Cache load modifier for reading input elements
    static const BlockScanAlgorithm SCAN_ALGORITHM  = _SCAN_ALGORITHM;  ///< The BlockScan algorithm to use
};
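
// Illustrative note (added commentary, not part of the original file): the dispatch
// layer supplies the concrete tuning values.  A hypothetical instantiation could be
//
//     typedef AgentSegmentFixupPolicy<
//         128,                      // _BLOCK_THREADS
//         9,                        // _ITEMS_PER_THREAD
//         BLOCK_LOAD_DIRECT,        // _LOAD_ALGORITHM
//         LOAD_LDG,                 // _LOAD_MODIFIER
//         BLOCK_SCAN_WARP_SCANS>    // _SCAN_ALGORITHM
//         ExampleSegmentFixupPolicyT;
//
// where ExampleSegmentFixupPolicyT is a placeholder name, not a type defined by CUB.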


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
 */
template <
    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
    typename    EqualityOpT,                    ///< KeyT equality operator type
    typename    ReductionOpT,                   ///< ValueT reduction operator type
    typename    OffsetT>                        ///< Signed integer type for global offsets
struct AgentSegmentFixup
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // Data type of key-value input iterator
    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;

    // Value type
    typedef typename KeyValuePairT::Value ValueT;

    // Tile status descriptor interface type
    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;

    // Constants
    enum
    {
        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Whether or not do fixup using RLE + global atomics
        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) &&
                                (Equals<ValueT, float>::VALUE ||
                                 Equals<ValueT, int>::VALUE ||
                                 Equals<ValueT, unsigned int>::VALUE ||
                                 Equals<ValueT, unsigned long long>::VALUE),

        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
    };

    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
            PairsInputIteratorT>::Type                                                                     // Directly use the supplied input iterator type
        WrappedPairsInputIteratorT;

    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,          // Wrap the native input pointer with CacheModifiedValuesInputIterator
            AggregatesOutputIteratorT>::Type                                                               // Directly use the supplied input iterator type
        WrappedFixupInputIteratorT;

    // Reduce-value-by-segment scan operator
    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;

    // Parameterized BlockLoad type for pairs
    typedef BlockLoad<
            KeyValuePairT,
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
        BlockLoadPairs;

    // Parameterized BlockScan type
    typedef BlockScan<
            KeyValuePairT,
            BLOCK_THREADS,
            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
        BlockScanT;

    // Callback type for obtaining tile prefix during block scan
    typedef TilePrefixCallbackOp<
            KeyValuePairT,
            ReduceBySegmentOpT,
            ScanTileStateT>
        TilePrefixCallbackOpT;

    // Shared memory type for this thread block
    union _TempStorage
    {
        struct
        {
            typename BlockScanT::TempStorage            scan;       // Smem needed for tile scanning
            typename TilePrefixCallbackOpT::TempStorage prefix;     // Smem needed for cooperative prefix callback
        };

        // Smem needed for loading keys
        typename BlockLoadPairs::TempStorage load_pairs;
    };

    // Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};
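
    // Note (added commentary, not in the original source): the anonymous struct holding
    // the scan and prefix-callback storage shares memory with the BlockLoad storage,
    // since loading and scanning never run at the same time.  Uninitialized<> allows this
    // union of non-trivially-constructible types to be placed in __shared__ memory.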


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
    WrappedPairsInputIteratorT      d_pairs_in;         ///< Input keys
    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
    WrappedFixupInputIteratorT      d_fixup_in;         ///< Fixup input values
    InequalityWrapper<EqualityOpT>  inequality_op;      ///< KeyT inequality operator
    ReductionOpT                    reduction_op;       ///< Reduction operator
    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator


    //---------------------------------------------------------------------
    // Constructor
    //---------------------------------------------------------------------

    // Constructor
    __device__ __forceinline__
    AgentSegmentFixup(
        TempStorage&                temp_storage,       ///< Reference to temp_storage
        PairsInputIteratorT         d_pairs_in,         ///< Input keys
        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
        EqualityOpT                 equality_op,        ///< KeyT equality operator
        ReductionOpT                reduction_op)       ///< ValueT reduction operator
    :
        temp_storage(temp_storage.Alias()),
        d_pairs_in(d_pairs_in),
        d_aggregates_out(d_aggregates_out),
        d_fixup_in(d_aggregates_out),
        inequality_op(equality_op),
        reduction_op(reduction_op),
        scan_op(reduction_op)
    {}


    //---------------------------------------------------------------------
    // Cooperatively scan a device-wide sequence of tiles with other CTAs
    //---------------------------------------------------------------------


    /**
     * Process input tile.  Specialized for atomic-fixup
     */
    template <bool IS_LAST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
    {
        KeyValuePairT pairs[ITEMS_PER_THREAD];

        // Load pairs
        KeyValuePairT oob_pair;
        oob_pair.key = -1;

        if (IS_LAST_TILE)
            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
        else
            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);

        // RLE
        #pragma unroll
        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
            if (pairs[ITEM].key != pairs[ITEM - 1].key)
                atomicAdd(d_scatter, pairs[ITEM - 1].value);
            else
                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
        }

        // Flush last item if valid
        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
    }
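
    // Added commentary (not in the original source): each thread walks its
    // ITEMS_PER_THREAD pairs in order, folding values of equal consecutive keys
    // together and emitting an atomicAdd whenever the key changes.  Assuming the
    // reduction behaves like a sum (which the atomicAdd path implies), a thread
    // holding (key,value) pairs (2,1) (2,3) (5,2) folds 1+3=4, atomically adds 4
    // to d_aggregates_out[2] when the key switches from 2 to 5, and flushes the
    // trailing (5,2) with the final atomicAdd (skipped for the out-of-bounds
    // key -1 in a partial last tile).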


    /**
     * Process input tile.  Specialized for reduce-by-key fixup
     */
    template <bool IS_LAST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
        int                 tile_idx,           ///< Tile index
        OffsetT             tile_offset,        ///< Tile offset
        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
    {
        KeyValuePairT pairs[ITEMS_PER_THREAD];
        KeyValuePairT scatter_pairs[ITEMS_PER_THREAD];

        // Load pairs
        KeyValuePairT oob_pair;
        oob_pair.key = -1;

        if (IS_LAST_TILE)
            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
        else
            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);

        CTA_SYNC();

        KeyValuePairT tile_aggregate;
        if (tile_idx == 0)
        {
            // Exclusive scan of values and segment_flags
            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);

            // Update tile status if this is not the last tile
            if (threadIdx.x == 0)
            {
                // Set first segment id to not trigger a flush (invalid from exclusive scan)
                scatter_pairs[0].key = pairs[0].key;

                if (!IS_LAST_TILE)
                    tile_state.SetInclusive(0, tile_aggregate);

            }
        }
        else
        {
            // Exclusive scan of values and segment_flags
            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
            tile_aggregate = prefix_op.GetBlockAggregate();
        }

        // Scatter updated values
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
            {
                // Update the value at the key location
                ValueT value    = d_fixup_in[scatter_pairs[ITEM].key];
                value           = reduction_op(value, scatter_pairs[ITEM].value);

                d_aggregates_out[scatter_pairs[ITEM].key] = value;
            }
        }

        // Finalize the last item
        if (IS_LAST_TILE)
        {
            // Last thread will output final count and last item, if necessary
            if (threadIdx.x == BLOCK_THREADS - 1)
            {
                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
                if (num_remaining == TILE_ITEMS)
                {
                    // Update the value at the key location
                    OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
                    d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
                }
            }
        }
    }
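
    // Added commentary (not in the original source): the exclusive reduce-by-key scan
    // leaves in scatter_pairs[ITEM] the running (key, value) reduction for the segment
    // that precedes item ITEM, including the prefix carried in from earlier tiles.
    // When scatter_pairs[ITEM].key differs from pairs[ITEM].key, item ITEM starts a new
    // segment, so the scanned value is the complete partial total for the previous key
    // and is combined with the aggregate already stored at that key.  The IS_LAST_TILE
    // branch finalizes the last segment, whose total exists only in the block-wide
    // aggregate when the last tile is full.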


    /**
     * Scan tiles of items as part of a dynamic chained scan
     */
    __device__ __forceinline__ void ConsumeRange(
        int                 num_items,          ///< Total number of input items
        int                 num_tiles,          ///< Total number of input tiles
        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
    {
        // Blocks are launched in increasing order, so just assign one tile per block
        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;   // Current tile index
        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                   // Global offset for the current tile
        OffsetT num_remaining   = num_items - tile_offset;                 // Remaining items (including this tile)

        if (num_remaining > TILE_ITEMS)
        {
            // Not the last tile (full)
            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
        }
        else if (num_remaining > 0)
        {
            // The last tile (possibly partially-full)
            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
        }
    }
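
    // Added commentary (not in the original source): tiles are mapped to thread blocks
    // through a (possibly 2D) grid.  For example, with gridDim.y == 4 the block at
    // blockIdx = (3, 2) processes tile_idx = 3 * 4 + 2 = 14, which starts at input
    // offset 14 * TILE_ITEMS.  Only the last tile (0 < num_remaining <= TILE_ITEMS)
    // takes the guarded, possibly partial path.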

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

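The listing above is only the thread-block agent; a kernel supplies the shared memory and drives ConsumeRange. The following sketch shows how such a kernel could look. It is illustrative only: ExampleSegmentFixupKernel is a hypothetical name (the real kernels live in CUB's dispatch layer), cub::Equality and cub::Sum are assumed as the equality and reduction operators, and the tile_state argument is expected to be the agent's own ScanTileStateT type.

// Illustrative sketch (not part of this file): a minimal kernel wrapping AgentSegmentFixup.
template <
    typename AgentSegmentFixupPolicyT,
    typename PairsInputIteratorT,
    typename AggregatesOutputIteratorT,
    typename OffsetT,
    typename ScanTileStateT>
__global__ void ExampleSegmentFixupKernel(
    PairsInputIteratorT         d_pairs_in,         // (key, partial value) pairs to fold into the output
    AggregatesOutputIteratorT   d_aggregates_out,   // value aggregates, indexed by key
    OffsetT                     num_items,          // total number of pairs
    int                         num_tiles,          // total number of tiles
    ScanTileStateT              tile_state)         // tile status descriptors for the chained scan
{
    // Thread-block abstraction type for the fixup pass
    typedef cub::AgentSegmentFixup<
            AgentSegmentFixupPolicyT,
            PairsInputIteratorT,
            AggregatesOutputIteratorT,
            cub::Equality,
            cub::Sum,
            OffsetT>
        AgentSegmentFixupT;

    // Shared memory required by the agent
    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;

    // Each thread block consumes one tile of input
    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum())
        .ConsumeRange(num_items, num_tiles, tile_state);
}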