#include "../thread/thread_operators.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
template <typename T, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>
class WarpReduce
__device__ __forceinline__ T Sum(T input)
__device__ __forceinline__ T Sum(T input, int valid_items)
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)

template <typename ReductionOp>
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int valid_items)

template <typename ReductionOp, typename FlagT>
__device__ __forceinline__ T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op)

template <typename ReductionOp, typename FlagT>
__device__ __forceinline__ T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op)
The WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
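For illustration, a minimal usage sketch (the kernel name, block shape, and buffer names are assumptions, not part of the header): each of a 128-thread block's four warps sums the items held by its threads.

#include <cub/warp/warp_reduce.cuh>

__global__ void WarpSumKernel(int *d_in, int *d_out)
{
    // Specialize WarpReduce for int across full 32-thread warps
    typedef cub::WarpReduce<int> WarpReduce;

    // One TempStorage per warp in a 128-thread block
    __shared__ typename WarpReduce::TempStorage temp_storage[4];

    int warp_id = threadIdx.x / 32;
    int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // Warp-wide sum; the result is valid only in each warp's lane 0
    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);

    if (threadIdx.x % 32 == 0)
        d_out[blockIdx.x * 4 + warp_id] = aggregate;
}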
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op)
Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane 0.
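A short sketch of this generic overload using the cub::Max() functor from thread_operators.cuh (the kernel and buffer names are illustrative, assuming a single-warp launch):

#include <cub/warp/warp_reduce.cuh>

__global__ void WarpMaxKernel(int *d_in, int *d_out)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    // Generic reduction with a binary functor; result valid only in lane 0
    int warp_max = WarpReduce(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0)
        *d_out = warp_max;
}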
__device__ __forceinline__ T HeadSegmentedSum(T input, FlagT head_flag)
Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane 0).
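Sketch of head-flagged segmentation, assuming a single-warp launch and illustrative buffer names; a flag of 1 marks the first item of each segment:

#include <cub/warp/warp_reduce.cuh>

__global__ void SegmentedSumKernel(int *d_in, int *d_flags, int *d_out)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];
    int head_flag   = d_flags[threadIdx.x];  // 1 at each segment head, else 0

    // Each segment's sum is returned to that segment's first lane
    int segment_sum = WarpReduce(temp_storage).HeadSegmentedSum(thread_data, head_flag);

    if (head_flag)
        d_out[threadIdx.x] = segment_sum;
}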
__device__ __forceinline__ T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op)
Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane 0).
__device__ __forceinline__ T TailSegmentedSum(T input, FlagT tail_flag)
Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane 0).
IS_POW_OF_TWO
Whether the logical warp size is a power-of-two.
IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
_TempStorage & temp_storage
Shared storage reference.
typedef InternalWarpReduce::TempStorage _TempStorage
Shared memory storage layout type for WarpReduce.
__device__ __forceinline__ T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op)
Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane 0).
__device__ __forceinline__ T Sum(T input)
Computes a warp-wide sum in the calling warp. The output is valid in warp lane 0.
__device__ __forceinline__ WarpReduce(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x.
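The constructor also supports sub-warp-sized logical warps via LOGICAL_WARP_THREADS; below is a sketch with four logical warps of 16 threads in a 64-thread block (kernel and buffer names are assumptions). Each logical warp needs its own TempStorage entry, indexed by its warp id.

#include <cub/warp/warp_reduce.cuh>

__global__ void LogicalWarpSumKernel(int *d_in, int *d_out)
{
    // Four independent logical "warps" of 16 threads each
    typedef cub::WarpReduce<int, 16> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage[4];

    int warp_id = threadIdx.x / 16;

    // The collective constructor binds this thread to its logical warp's storage
    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(d_in[threadIdx.x]);

    if (threadIdx.x % 16 == 0)
        d_out[warp_id] = aggregate;
}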
__device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int valid_items)
Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane 0.
__device__ __forceinline__ T Sum(T input, int valid_items)
Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane 0.
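Sketch of a partially-full warp sum where only the first valid_items lanes carry data (kernel shape and names are illustrative):

#include <cub/warp/warp_reduce.cuh>

__global__ void PartialSumKernel(int *d_in, int *d_out, int valid_items)
{
    typedef cub::WarpReduce<int> WarpReduce;
    __shared__ typename WarpReduce::TempStorage temp_storage;

    // Guarded load: lanes at or past valid_items hold no real data
    int thread_data = (threadIdx.x < valid_items) ? d_in[threadIdx.x] : 0;

    // Only the first valid_items inputs participate; result valid in lane 0
    int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);

    if (threadIdx.x == 0)
        *d_out = aggregate;
}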
typedef typename If<(PTX_ARCH >= 300) && IS_POW_OF_TWO, WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>, WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH>>::Type InternalWarpReduce
Internal specialization. Use SHFL-based reduction if the architecture is >= SM30 and LOGICAL_WARP_THREADS is a power-of-two; otherwise fall back to the smem-based variant.
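The same If<Condition, ThenType, ElseType>::Type selection, written out for one concrete instantiation (the resulting typedef name is illustrative):

#include <cub/cub.cuh>

// With T = int and LOGICAL_WARP_THREADS = 32 (a power of two), an SM30+
// device pass selects the SHFL path; otherwise the smem path is chosen.
typedef cub::If<
    (CUB_PTX_ARCH >= 300) && cub::PowerOfTwo<32>::VALUE,
    cub::WarpReduceShfl<int, 32, CUB_PTX_ARCH>,
    cub::WarpReduceSmem<int, 32, CUB_PTX_ARCH> >::Type InternalReduce32;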
Optional outer namespace(s)
ReductionOpT reduction_op
[in] Binary reduction functor
Alias wrapper allowing storage to be unioned.
Type selection (IF ? ThenType : ElseType)
Statically determine if N is a power-of-two.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
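Illustrative use of the macro to gate architecture-specific code at compile time:

#if (CUB_PTX_ARCH >= 300)
    // Device pass for SM30+: warp-shuffle based paths are usable here
#else
    // Host pass (CUB_PTX_ARCH == 0) or a pre-SM30 device target
#endif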