OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_reduce.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::BlockReduce provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.
 */

#pragma once

#include "specializations/block_reduce_raking.cuh"
#include "specializations/block_reduce_raking_commutative_only.cuh"
#include "specializations/block_reduce_warp_reductions.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/******************************************************************************
 * Algorithmic variants
 ******************************************************************************/

/**
 * BlockReduceAlgorithm enumerates alternative algorithms for parallel
 * reduction across a CUDA thread block.
 */
enum BlockReduceAlgorithm
{
    /// Raking-based reduction in shared memory, valid only for commutative reduction operators.
    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,

    /// Raking-based reduction in shared memory, supporting both commutative and non-commutative reduction operators.
    BLOCK_REDUCE_RAKING,

    /// Parallel reduction built from warp-synchronous warp reductions.
    BLOCK_REDUCE_WARP_REDUCTIONS,
};
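
// Illustrative sketch (not part of the original header): the ALGORITHM
// template parameter of BlockReduce below selects among these variants.
// The type alias name is hypothetical:
//
//     // Specialize BlockReduce for a 1D block of 128 threads of type int,
//     // explicitly requesting the raking variant:
//     typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduceT;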


/******************************************************************************
 * Block reduce
 ******************************************************************************/

/**
 * The BlockReduce class provides collective methods for computing a parallel
 * reduction of items partitioned across a CUDA thread block.
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    BlockReduceAlgorithm    ALGORITHM   = BLOCK_REDUCE_WARP_REDUCTIONS,
    int                     BLOCK_DIM_Y = 1,
    int                     BLOCK_DIM_Z = 1,
    int                     PTX_ARCH    = CUB_PTX_ARCH>
class BlockReduce
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>          WarpReductions;
    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>   RakingCommutativeOnly;
    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                  Raking;

    /// Internal specialization type
    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
        WarpReductions,
        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
            RakingCommutativeOnly,
            Raking>::Type>::Type InternalBlockReduce;       // BlockReduceRaking

    /// Shared memory storage layout type for BlockReduce
    typedef typename InternalBlockReduce::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;


public:

    /// \smemstorage{BlockReduce}
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockReduce()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockReduce(
        TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
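
    // Usage sketch (not part of the original header): the two construction
    // modes side by side; type alias and variable names are hypothetical.
    //
    //     typedef cub::BlockReduce<int, 128> BlockReduceT;
    //     __shared__ typename BlockReduceT::TempStorage temp_storage;
    //     BlockReduceT reduce_private;              // private static shared memory
    //     BlockReduceT reduce_shared(temp_storage); // caller-provided storage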


    //@}  end member group
    /******************************************************************//**
     * \name Generic reductions
     *********************************************************************/
    //@{

    /// Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element.
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,                  ///< [in] Calling thread's input
        ReductionOp     reduction_op)           ///< [in] Binary reduction functor
    {
        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
    }

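    // Usage sketch (not part of the original header): a block-wide max
    // reduction over a 1D block of 128 threads, one item per thread. The
    // kernel and variable names are hypothetical; cub::Max() is the functor
    // defined in thread_operators.cuh.
    //
    //     __global__ void ExampleKernel(int *d_in, int *d_out)
    //     {
    //         typedef cub::BlockReduce<int, 128> BlockReduceT;
    //         __shared__ typename BlockReduceT::TempStorage temp_storage;
    //         int thread_data = d_in[threadIdx.x];
    //         // Only thread0 receives the valid block-wide aggregate
    //         int aggregate = BlockReduceT(temp_storage).Reduce(thread_data, cub::Max());
    //         if (threadIdx.x == 0) d_out[0] = aggregate;
    //     }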

    /// Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
    template <
        int         ITEMS_PER_THREAD,
        typename    ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor
    {
        // Reduce partials
        T partial = internal::ThreadReduce(inputs, reduction_op);
        return Reduce(partial, reduction_op);
    }

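    // Usage sketch (not part of the original header): each thread contributes
    // four consecutive items; they are first reduced per-thread, then
    // block-wide. Names are hypothetical.
    //
    //     typedef cub::BlockReduce<int, 128> BlockReduceT;
    //     __shared__ typename BlockReduceT::TempStorage temp_storage;
    //     int items[4];
    //     // ... each thread fills items[0..3] ...
    //     int aggregate = BlockReduceT(temp_storage).Reduce(items, cub::Max());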

    /// Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first num_valid threads each contribute one input element.
    template <typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,                  ///< [in] Calling thread's input
        ReductionOp     reduction_op,           ///< [in] Binary reduction functor
        int             num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we can skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
        }
    }

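    // Usage sketch (not part of the original header): a partially-full tile
    // where only num_valid threads hold data. All threads must still call in,
    // but only the first num_valid inputs participate. Names are hypothetical.
    //
    //     int aggregate = BlockReduceT(temp_storage).Reduce(thread_data, cub::Max(), num_valid);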

    //@}  end member group
    /******************************************************************//**
     * \name Summation reductions
     *********************************************************************/
    //@{


    /// Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element.
    __device__ __forceinline__ T Sum(
        T input)                                ///< [in] Calling thread's input
    {
        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
    }

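    // Usage sketch (not part of the original header): a block-wide sum with
    // one item per thread; names are hypothetical.
    //
    //     typedef cub::BlockReduce<int, 128> BlockReduceT;
    //     __shared__ typename BlockReduceT::TempStorage temp_storage;
    //     int aggregate = BlockReduceT(temp_storage).Sum(thread_data);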

    /// Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ T Sum(
        T (&inputs)[ITEMS_PER_THREAD])          ///< [in] Calling thread's input segment
    {
        // Reduce partials
        T partial = internal::ThreadReduce(inputs, cub::Sum());
        return Sum(partial);
    }

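    // Usage sketch (not part of the original header): summing four items per
    // thread; the per-thread partial is folded with cub::Sum() as shown in the
    // method body above. Names are hypothetical.
    //
    //     int items[4];
    //     // ... each thread fills items[0..3] ...
    //     int aggregate = BlockReduceT(temp_storage).Sum(items);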

    /// Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first num_valid threads each contribute one input element.
    __device__ __forceinline__ T Sum(
        T   input,                              ///< [in] Calling thread's input
        int num_valid)                          ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
    {
        // Determine if we can skip bounds checking
        if (num_valid >= BLOCK_THREADS)
        {
            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
        }
        else
        {
            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
        }
    }

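    // Usage sketch (not part of the original header): a guarded sum over a
    // partially-full tile of num_valid inputs; names are hypothetical.
    //
    //     int aggregate = BlockReduceT(temp_storage).Sum(thread_data, num_valid);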
    //@}  end member group

};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

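A minimal end-to-end sketch (not part of the original header) showing how this
class is typically used from a kernel. It assumes CUB is on the include path;
the kernel name, block size, and buffer names are hypothetical:

    #include <cub/block/block_reduce.cuh>

    __global__ void BlockSumKernel(int *d_in, int *d_out)
    {
        // Specialize BlockReduce for a 1D block of 128 threads of type int
        typedef cub::BlockReduce<int, 128> BlockReduceT;

        // Allocate shared memory for the collective
        __shared__ typename BlockReduceT::TempStorage temp_storage;

        // Each thread obtains one input item
        int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

        // Compute the block-wide sum; only thread0 receives the valid aggregate
        int aggregate = BlockReduceT(temp_storage).Sum(thread_data);

        if (threadIdx.x == 0)
            d_out[blockIdx.x] = aggregate;
    }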