OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
agent_reduce.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include <iterator>
37 
38 #include "../block/block_load.cuh"
39 #include "../block/block_reduce.cuh"
40 #include "../grid/grid_mapping.cuh"
41 #include "../grid/grid_even_share.cuh"
42 #include "../util_type.cuh"
43 #include "../iterator/cache_modified_input_iterator.cuh"
44 #include "../util_namespace.cuh"
45 
46 
48 CUB_NS_PREFIX
49 
51 namespace cub {
52 
53 
54 /******************************************************************************
55  * Tuning policy types
56  ******************************************************************************/
57 
61 template <
62  int _BLOCK_THREADS,
63  int _ITEMS_PER_THREAD,
64  int _VECTOR_LOAD_LENGTH,
65  BlockReduceAlgorithm _BLOCK_ALGORITHM,
66  CacheLoadModifier _LOAD_MODIFIER>
67 struct AgentReducePolicy
68 {
69  enum
70  {
71  BLOCK_THREADS = _BLOCK_THREADS,
72  ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
73  VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
74  };
75 
76  static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
77  static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
78 };
79 
80 
81 
82 /******************************************************************************
83  * Thread block abstractions
84  ******************************************************************************/
85 
93 template <
94  typename AgentReducePolicy,
95  typename InputIteratorT,
96  typename OutputIteratorT,
97  typename OffsetT,
98  typename ReductionOp>
99 struct AgentReduce
100 {
101 
102  //---------------------------------------------------------------------
103  // Types and constants
104  //---------------------------------------------------------------------
105 
107  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
108 
110  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
111  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
112  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
113 
115  typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
116 
118  typedef typename If<IsPointer<InputIteratorT>::VALUE,
119  CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
120  InputIteratorT>::Type // Directly use the supplied input iterator type
121  WrappedInputIteratorT;
122 
124  enum
125  {
126  BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS,
127  ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD,
128  VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
129  TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
130 
131  // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
132  ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
133  (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
134  (Traits<InputT>::PRIMITIVE),
135 
136  };
137 
138  static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER;
139  static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
140 
142  typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
143 
145  struct _TempStorage
146  {
147  typename BlockReduceT::TempStorage reduce;
148  };
149 
151  struct TempStorage : Uninitialized<_TempStorage> {};
152 
153 
154  //---------------------------------------------------------------------
155  // Per-thread fields
156  //---------------------------------------------------------------------
157 
158  _TempStorage &temp_storage;
159  InputIteratorT d_in;
160  WrappedInputIteratorT d_wrapped_in;
161  ReductionOp reduction_op;
162 
163 
164  //---------------------------------------------------------------------
165  // Utility
166  //---------------------------------------------------------------------
167 
168 
169  // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
170  template <typename Iterator>
171  static __device__ __forceinline__ bool IsAligned(
172  Iterator d_in,
173  Int2Type<true> /*can_vectorize*/)
174  {
175  return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
176  }
177 
178  // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
179  template <typename Iterator>
180  static __device__ __forceinline__ bool IsAligned(
181  Iterator /*d_in*/,
182  Int2Type<false> /*can_vectorize*/)
183  {
184  return false;
185  }
186 
187 
188  //---------------------------------------------------------------------
189  // Constructor
190  //---------------------------------------------------------------------
191 
195  __device__ __forceinline__ AgentReduce(
196  TempStorage &temp_storage,
197  InputIteratorT d_in,
198  ReductionOp reduction_op)
199  :
200  temp_storage(temp_storage.Alias()),
201  d_in(d_in),
202  d_wrapped_in(d_in),
203  reduction_op(reduction_op)
204  {}
205 
206 
207  //---------------------------------------------------------------------
208  // Tile consumption
209  //---------------------------------------------------------------------
210 
214  template <int IS_FIRST_TILE>
215  __device__ __forceinline__ void ConsumeTile(
216  OutputT &thread_aggregate,
217  OffsetT block_offset,
218  int /*valid_items*/,
219  Int2Type<true> /*is_full_tile*/,
220  Int2Type<false> /*can_vectorize*/)
221  {
222  OutputT items[ITEMS_PER_THREAD];
223 
224  // Load items in striped fashion
225  LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
226 
227  // Reduce items within each thread stripe
228  thread_aggregate = (IS_FIRST_TILE) ?
229  internal::ThreadReduce(items, reduction_op) :
230  internal::ThreadReduce(items, reduction_op, thread_aggregate);
231  }
232 
233 
237  template <int IS_FIRST_TILE>
238  __device__ __forceinline__ void ConsumeTile(
239  OutputT &thread_aggregate,
240  OffsetT block_offset,
241  int /*valid_items*/,
242  Int2Type<true> /*is_full_tile*/,
243  Int2Type<true> /*can_vectorize*/)
244  {
245  // Alias items as an array of VectorT and load it in striped fashion
246  enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
247 
248  // Fabricate a vectorized input iterator
249  InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
250  CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
251  reinterpret_cast<VectorT*>(d_in_unqualified));
252 
253  // Load items as vector items
254  InputT input_items[ITEMS_PER_THREAD];
255  VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
256  #pragma unroll
257  for (int i = 0; i < WORDS; ++i)
258  vec_items[i] = d_vec_in[BLOCK_THREADS * i];
259 
260  // Convert from input type to output type
261  OutputT items[ITEMS_PER_THREAD];
262  #pragma unroll
263  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
264  items[i] = input_items[i];
265 
266  // Reduce items within each thread stripe
267  thread_aggregate = (IS_FIRST_TILE) ?
268  internal::ThreadReduce(items, reduction_op) :
269  internal::ThreadReduce(items, reduction_op, thread_aggregate);
270  }
271 
272 
276  template <int IS_FIRST_TILE, int CAN_VECTORIZE>
277  __device__ __forceinline__ void ConsumeTile(
278  OutputT &thread_aggregate,
279  OffsetT block_offset,
280  int valid_items,
281  Int2Type<false> /*is_full_tile*/,
282  Int2Type<CAN_VECTORIZE> /*can_vectorize*/)
283  {
284  // Partial tile
285  int thread_offset = threadIdx.x;
286 
287  // Read first item
288  if ((IS_FIRST_TILE) && (thread_offset < valid_items))
289  {
290  thread_aggregate = d_wrapped_in[block_offset + thread_offset];
291  thread_offset += BLOCK_THREADS;
292  }
293 
294  // Continue reading items (block-striped)
295  while (thread_offset < valid_items)
296  {
297  OutputT item = d_wrapped_in[block_offset + thread_offset];
298  thread_aggregate = reduction_op(thread_aggregate, item);
299  thread_offset += BLOCK_THREADS;
300  }
301  }
302 
303 
304  //---------------------------------------------------------------------
305  // Consume a contiguous segment of tiles
306  //---------------------------------------------------------------------
307 
311  template <int CAN_VECTORIZE>
312  __device__ __forceinline__ OutputT ConsumeRange(
313  GridEvenShare<OffsetT> &even_share,
314  Int2Type<CAN_VECTORIZE> can_vectorize)
315  {
316  OutputT thread_aggregate;
317 
318  if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
319  {
320  // First tile isn't full (not all threads have valid items)
321  int valid_items = even_share.block_end - even_share.block_offset;
322  ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
323  return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
324  }
325 
326  // At least one full block
327  ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
328  even_share.block_offset += even_share.block_stride;
329 
330  // Consume subsequent full tiles of input
331  while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
332  {
333  ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
334  even_share.block_offset += even_share.block_stride;
335  }
336 
337  // Consume a partially-full tile
338  if (even_share.block_offset < even_share.block_end)
339  {
340  int valid_items = even_share.block_end - even_share.block_offset;
341  ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
342  }
343 
344  // Compute block-wide reduction (all threads have valid items)
345  return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
346  }
347 
348 
352  __device__ __forceinline__ OutputT ConsumeRange(
353  OffsetT block_offset,
354  OffsetT block_end)
355  {
356  GridEvenShare<OffsetT> even_share;
357  even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
358 
359  return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
360  ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
361  ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
362  }
363 
364 
368  __device__ __forceinline__ OutputT ConsumeTiles(
369  GridEvenShare<OffsetT> &even_share)
370  {
371  // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
372  even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
373 
374  return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
375  ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
376  ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
377 
378  }
379 
380 };
381 
382 
383 } // CUB namespace
384 CUB_NS_POSTFIX // Optional outer namespace(s)
385 
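For orientation, AgentReducePolicy above is a compile-time bundle of the launch constants consumed by AgentReduce: threads per block, items per thread, vector load width, block-wide reduction algorithm, and cache load modifier. A hypothetical instantiation might look as follows; the numeric values and the chosen algorithm/modifier are illustrative assumptions, not the defaults shipped with CUB:

    // Hypothetical tuning policy (example values only):
    // 256 threads per block, 16 items per thread, 4-wide vectorized loads,
    // warp-reduction-based block reduction, and LDG cache-modified loads.
    typedef cub::AgentReducePolicy<
            256,                                // _BLOCK_THREADS
            16,                                 // _ITEMS_PER_THREAD
            4,                                  // _VECTOR_LOAD_LENGTH
            cub::BLOCK_REDUCE_WARP_REDUCTIONS,  // _BLOCK_ALGORITHM
            cub::LOAD_LDG>                      // _LOAD_MODIFIER
        ExampleReducePolicy;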
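A device-wide reduction kernel typically instantiates AgentReduce once per thread block, lets it consume the tiles assigned to that block by a GridEvenShare descriptor, and has thread 0 write out the block-wide aggregate. The sketch below illustrates that pattern under assumed names (ExampleReducePolicy from the previous snippet, a hypothetical d_block_aggregates output array); it mirrors the structure of the library's dispatch kernels but is not the actual implementation:

    // Sketch of a kernel driving AgentReduce (illustrative only).
    template <
        typename PolicyT,
        typename InputIteratorT,
        typename OutputIteratorT,
        typename OffsetT,
        typename ReductionOpT>
    __global__ void ExampleReduceKernel(
        InputIteratorT              d_in,                // [in] Input data to reduce
        OutputIteratorT             d_block_aggregates,  // [out] One partial aggregate per thread block
        cub::GridEvenShare<OffsetT> even_share,          // [in] Descriptor of tile-aligned work ranges
        ReductionOpT                reduction_op)        // [in] Binary reduction operator
    {
        typedef cub::AgentReduce<PolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT> AgentReduceT;

        // Shared memory backing the block-wide reduction
        __shared__ typename AgentReduceT::TempStorage temp_storage;

        // Consume the tiles assigned to this thread block (strip-mined even-share mapping)
        typename AgentReduceT::OutputT block_aggregate =
            AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

        // Only thread 0 holds the valid block-wide result
        if (threadIdx.x == 0)
            d_block_aggregates[blockIdx.x] = block_aggregate;
    }

Such a kernel would be instantiated with a concrete tuning policy (for example, the hypothetical ExampleReducePolicy above) and launched on a grid sized from the even-share descriptor; a second, single-block pass would then reduce the per-block partial aggregates.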