doxygen/openfpm/warp__reduce__smem_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include "../../thread/thread_operators.cuh"

#include "../../thread/thread_load.cuh"

#include "../../thread/thread_store.cuh"

#include "../../util_type.cuh"

#include "../../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


template <

    typename    T,

    int         LOGICAL_WARP_THREADS,

    int         PTX_ARCH>

struct WarpReduceSmem

{

    /******************************************************************************

     * Constants and type definitions

     ******************************************************************************/


    enum

    {

        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),


        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,


        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,


        HALF_WARP_THREADS = 1 << (STEPS - 1),


        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,


        UNSET   = 0x0,  // Is initially unset

        SET     = 0x1,  // Is initially set

        SEEN    = 0x2,  // Has seen another head flag from a successor peer

    };


    typedef unsigned char SmemFlag;


    struct _TempStorage

    {

        T           reduce[WARP_SMEM_ELEMENTS];

        SmemFlag    flags[WARP_SMEM_ELEMENTS];

    };


    // Alias wrapper allowing storage to be unioned

    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************************

     * Thread fields

     ******************************************************************************/


    _TempStorage    &temp_storage;

    unsigned int    lane_id;

    unsigned int    member_mask;


    /******************************************************************************

     * Construction

     ******************************************************************************/


    __device__ __forceinline__ WarpReduceSmem(

        TempStorage     &temp_storage)

    :

        temp_storage(temp_storage.Alias()),


        lane_id(IS_ARCH_WARP ?

            LaneId() :

            LaneId() % LOGICAL_WARP_THREADS),


        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?

            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp

            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))

    {}


    /******************************************************************************

     * Utility methods

     ******************************************************************************/


    //---------------------------------------------------------------------

    // Regular reduction

    //---------------------------------------------------------------------


    template <

        bool                ALL_LANES_VALID,

        typename            ReductionOp,

        int                 STEP>

    __device__ __forceinline__ T ReduceStep(

        T                   input,

        int                 valid_items,

        ReductionOp         reduction_op,

        Int2Type<STEP>      /*step*/)

    {

        const int OFFSET = 1 << STEP;


        // Share input through buffer

        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);


        WARP_SYNC(member_mask);


        // Update input if peer_addend is in range

        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))

        {

            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);

            input = reduction_op(input, peer_addend);

        }


        WARP_SYNC(member_mask);


        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());

    }


    template <

        bool                ALL_LANES_VALID,

        typename            ReductionOp>

    __device__ __forceinline__ T ReduceStep(

        T                   input,

        int                 valid_items,

        ReductionOp         /*reduction_op*/,

        Int2Type<STEPS>     /*step*/)

    {

        return input;

    }


    //---------------------------------------------------------------------

    // Segmented reduction

    //---------------------------------------------------------------------


    template <

        bool            HEAD_SEGMENTED,

        typename        FlagT,

        typename        ReductionOp>

    __device__ __forceinline__ T SegmentedReduce(

        T               input,

        FlagT           flag,

        ReductionOp     reduction_op,

        Int2Type<true>  /*has_ballot*/)

    {

        // Get the start flags for each thread in the warp.

        int warp_flags = WARP_BALLOT(flag, member_mask);


        if (!HEAD_SEGMENTED)

            warp_flags <<= 1;


        // Keep bits above the current thread.

        warp_flags &= LaneMaskGt();


        // Accommodate packing of multiple logical warps in a single physical warp

        if (!IS_ARCH_WARP)

        {

            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;

        }


        // Find next flag

        int next_flag = __clz(__brev(warp_flags));


        // Clip the next segment at the warp boundary if necessary

        if (LOGICAL_WARP_THREADS != 32)

            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);


        #pragma unroll

        for (int STEP = 0; STEP < STEPS; STEP++)

        {

            const int OFFSET = 1 << STEP;


            // Share input into buffer

            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);


            WARP_SYNC(member_mask);


            // Update input if peer_addend is in range

            if (OFFSET + lane_id < next_flag)

            {

                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);

                input = reduction_op(input, peer_addend);

            }


            WARP_SYNC(member_mask);

        }


        return input;

    }


    template <

        bool            HEAD_SEGMENTED,

        typename        FlagT,

        typename        ReductionOp>

    __device__ __forceinline__ T SegmentedReduce(

        T               input,

        FlagT           flag,

        ReductionOp     reduction_op,

        Int2Type<false> /*has_ballot*/)

    {

        enum

        {

            UNSET   = 0x0,  // Is initially unset

            SET     = 0x1,  // Is initially set

            SEEN    = 0x2,  // Has seen another head flag from a successor peer

        };


        // Alias flags onto shared data storage

        volatile SmemFlag *flag_storage = temp_storage.flags;


        SmemFlag flag_status = (flag) ? SET : UNSET;


        for (int STEP = 0; STEP < STEPS; STEP++)

        {

            const int OFFSET = 1 << STEP;


            // Share input through buffer

            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);


            WARP_SYNC(member_mask);


            // Get peer from buffer

            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);


            WARP_SYNC(member_mask);


            // Share flag through buffer

            flag_storage[lane_id] = flag_status;


            // Get peer flag from buffer

            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];


            // Update input if peer was in range

            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)

            {

                if (HEAD_SEGMENTED)

                {

                    // Head-segmented

                    if ((flag_status & SEEN) == 0)

                    {

                        // Has not seen a more distant head flag

                        if (peer_flag_status & SET)

                        {

                            // Has now seen a head flag

                            flag_status |= SEEN;

                        }

                        else

                        {

                            // Peer is not a head flag: grab its count

                            input = reduction_op(input, peer_addend);

                        }


                        // Update seen status to include that of peer

                        flag_status |= (peer_flag_status & SEEN);

                    }

                }

                else

                {

                    // Tail-segmented.  Simply propagate flag status

                    if (!flag_status)

                    {

                        input = reduction_op(input, peer_addend);

                        flag_status |= peer_flag_status;

                    }


                }

            }

        }


        return input;

    }


    /******************************************************************************

     * Interface

     ******************************************************************************/


    template <

        bool                ALL_LANES_VALID,

        typename            ReductionOp>

    __device__ __forceinline__ T Reduce(

        T                   input,

        int                 valid_items,

        ReductionOp         reduction_op)

    {

        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());

    }


    template <

        bool            HEAD_SEGMENTED,

        typename        FlagT,

        typename        ReductionOp>

    __device__ __forceinline__ T SegmentedReduce(

        T               input,

        FlagT            flag,

        ReductionOp     reduction_op)

    {

        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());

    }


};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)

CUB_MIN
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition util_macro.cuh:66

cub::WARP_SYNC
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
Definition util_ptx.cuh:273

cub::LaneMaskGt
__device__ __forceinline__ unsigned int LaneMaskGt()
Returns the warp lane mask of all lanes greater than the calling thread.
Definition util_ptx.cuh:461

cub::WARP_BALLOT
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
Definition util_ptx.cuh:310

cub::LaneId
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::reduction_op
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
Definition dispatch_reduce.cuh:75

cub::Int2Type
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition util_type.cuh:276

cub::Log2
Statically determine log2(N), rounded up.
Definition util_type.cuh:133

cub::PowerOfTwo
Statically determine if N is a power-of-two.
Definition util_type.cuh:156

cub::Uninitialized
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition util_type.cuh:635

cub::WarpReduceSmem::TempStorage
Definition warp_reduce_smem.cuh:95

cub::WarpReduceSmem::_TempStorage
Shared memory storage layout type (1.5 warps-worth of elements for each warp)
Definition warp_reduce_smem.cuh:89

cub::WarpReduceSmem
WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA ...
Definition warp_reduce_smem.cuh:56

cub::WarpReduceSmem::IS_POW_OF_TWO
@ IS_POW_OF_TWO
Whether the logical warp size is a power-of-two.
Definition warp_reduce_smem.cuh:67

cub::WarpReduceSmem::IS_ARCH_WARP
@ IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
Definition warp_reduce_smem.cuh:64

cub::WarpReduceSmem::HALF_WARP_THREADS
@ HALF_WARP_THREADS
The number of threads in half a warp.
Definition warp_reduce_smem.cuh:73

cub::WarpReduceSmem::WARP_SMEM_ELEMENTS
@ WARP_SMEM_ELEMENTS
The number of shared memory elements per warp.
Definition warp_reduce_smem.cuh:76

cub::WarpReduceSmem::UNSET
@ UNSET
FlagT status (when not using ballot)
Definition warp_reduce_smem.cuh:79

cub::WarpReduceSmem::STEPS
@ STEPS
The number of warp scan steps.
Definition warp_reduce_smem.cuh:70

cub::WarpReduceSmem::ReduceStep
__device__ __forceinline__ T ReduceStep(T input, int valid_items, ReductionOp reduction_op, Int2Type< STEP >)
Definition warp_reduce_smem.cuh:141

cub::WarpReduceSmem::SegmentedReduce
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type< false >)
Definition warp_reduce_smem.cuh:254

cub::WarpReduceSmem::ReduceStep
__device__ __forceinline__ T ReduceStep(T input, int valid_items, ReductionOp, Int2Type< STEPS >)
Definition warp_reduce_smem.cuh:173

cub::WarpReduceSmem::SegmentedReduce
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type< true >)
Definition warp_reduce_smem.cuh:195

cub::WarpReduceSmem::WarpReduceSmem
__device__ __forceinline__ WarpReduceSmem(TempStorage &temp_storage)
Constructor.
Definition warp_reduce_smem.cuh:112

cub::WarpReduceSmem::SegmentedReduce
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
Definition warp_reduce_smem.cuh:359

cub::WarpReduceSmem::SmemFlag
unsigned char SmemFlag
Shared memory flag type.
Definition warp_reduce_smem.cuh:85

cub::WarpReduceSmem::Reduce
__device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op)
Definition warp_reduce_smem.cuh:343