#include "../../thread/thread_operators.cuh"
#include "../../util_ptx.cuh"
#include "../../util_type.cuh"
#include "../../util_macro.cuh"
#include "../../util_namespace.cuh"

// WarpReduceShfl provides SHFL-based variants of parallel reduction of items
// partitioned across a CUDA thread warp.
template <
    typename    T,                      // Data type being reduced
    int         LOGICAL_WARP_THREADS,   // Number of threads per logical warp (must be a power of two)
    int         PTX_ARCH>               // The PTX compute capability for which to specialize
struct WarpReduceShfl
{
    enum
    {
        // Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        // The number of warp reduction steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        // Number of logical warps in a PTX warp
        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,

        // The 5-bit SHFL mask for logically splitting warps into sub-segments
        // starts 8-bits up
        SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
    };

    // Whether the data type is a small (32b or less) unsigned integer for
    // which we can use a single SHFL instruction per exchange
    template <typename S>
    struct IsInteger
    {
        enum {
            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) &&
                                (sizeof(S) <= sizeof(unsigned int))
        };
    };

    // Shared memory storage layout type
    typedef NullType TempStorage;

    unsigned int lane_id;       // Lane index in logical warp
    unsigned int warp_id;       // Logical warp index in 32-thread physical warp
    unsigned int member_mask;   // 32-thread physical warp member mask of logical warp
    // Constructor
    __device__ __forceinline__ WarpReduceShfl(TempStorage &/*temp_storage*/)
    :
        lane_id(LaneId()),
        warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)),
        member_mask(0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS))
    {
        if (!IS_ARCH_WARP)
        {
            // Reduce to the logical lane index and shift the member mask up
            // to cover this logical warp's physical lanes
            lane_id      = lane_id % LOGICAL_WARP_THREADS;
            member_mask  = member_mask << (warp_id * LOGICAL_WARP_THREADS);
        }
    }
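    // Worked example of the constants and the constructor above (illustrative):
    // for a 16-thread logical warp on a 32-thread PTX warp,
    //
    //   SHFL_C      = (32 - 16) << 8    = 0x1000      (segment mask, bits [12:8])
    //   member_mask = 0xffffffffu >> 16 = 0x0000ffff  (lanes 0..15)
    //
    // and the second logical warp (warp_id == 1) shifts its member mask up to
    // 0xffff0000, naming physical lanes 16..31. Each step below then builds
    // shfl_c = last_lane | SHFL_C (e.g. 15 | 0x1000 = 0x100f), packing the
    // source-lane clamp into bits [4:0] alongside the segment mask.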
    // Reduction (specialized for summation across uint32 types)
    __device__ __forceinline__ unsigned int ReduceStep(
        unsigned int input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        unsigned int output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Use the predicate set by SHFL to guard against invalid peers
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 r0;"
            "   .reg .pred p;"
            "   shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
            "   @p add.u32 r0, r0, %4;"
            "   mov.u32 %0, r0; }"
            : "=r"(output)
            : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 r0;"
            "   .reg .pred p;"
            "   shfl.down.b32 r0|p, %1, %2, %3;"
            "   @p add.u32 r0, r0, %4;"
            "   mov.u32 %0, r0; }"
            : "=r"(output)
            : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
#endif

        return output;
    }
    // Reduction (specialized for summation across fp32 types)
    __device__ __forceinline__ float ReduceStep(
        float input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        float output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Use the predicate set by SHFL to guard against invalid peers
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .f32 r0;"
            "   .reg .pred p;"
            "   shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
            "   @p add.f32 r0, r0, %4;"
            "   mov.f32 %0, r0; }"
            : "=f"(output)
            : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .f32 r0;"
            "   .reg .pred p;"
            "   shfl.down.b32 r0|p, %1, %2, %3;"
            "   @p add.f32 r0, r0, %4;"
            "   mov.f32 %0, r0; }"
            : "=f"(output)
            : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
#endif

        return output;
    }
    // Reduction (specialized for summation across unsigned long long types)
    __device__ __forceinline__ unsigned long long ReduceStep(
        unsigned long long input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        unsigned long long output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Shuffle as two 32-bit halves, then add if the peer is valid
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
            "   shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.u64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.down.b32 lo|p, lo, %2, %3;"
            "   shfl.down.b32 hi|p, hi, %2, %3;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.u64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c));
#endif

        return output;
    }
    // Reduction (specialized for summation across long long types)
    __device__ __forceinline__ long long ReduceStep(
        long long input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        long long output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Shuffle as two 32-bit halves, then add if the peer is valid
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
            "   shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.s64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.down.b32 lo|p, lo, %2, %3;"
            "   shfl.down.b32 hi|p, hi, %2, %3;"
            "   mov.b64 %0, {lo, hi};"
            "   @p add.s64 %0, %0, %1; }"
            : "=l"(output)
            : "l"(input), "r"(offset), "r"(shfl_c));
#endif

        return output;
    }
    // Reduction (specialized for summation across double types)
    __device__ __forceinline__ double ReduceStep(
        double input, cub::Sum /*reduction_op*/, int last_lane, int offset)
    {
        double output;
        int shfl_c = last_lane | SHFL_C;    // Shuffle control (mask and last_lane)

        // Shuffle as two 32-bit halves, reassemble, then add if the peer is valid
#ifdef CUB_USE_COOPERATIVE_GROUPS
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;  .reg .f64 r0;"
            "   mov.b64 %0, %1;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
            "   shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
            "   mov.b64 r0, {lo, hi};"
            "   @p add.f64 %0, %0, r0; }"
            : "=d"(output)
            : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
#else
        asm volatile(
            "{  .reg .u32 lo, hi;  .reg .pred p;  .reg .f64 r0;"
            "   mov.b64 %0, %1;"
            "   mov.b64 {lo, hi}, %1;"
            "   shfl.down.b32 lo|p, lo, %2, %3;"
            "   shfl.down.b32 hi|p, hi, %2, %3;"
            "   mov.b64 r0, {lo, hi};"
            "   @p add.f64 %0, %0, r0; }"
            : "=d"(output)
            : "d"(input), "r"(offset), "r"(shfl_c));
#endif

        return output;
    }
    // Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across
    // KeyValuePair<KeyT, ValueT> types)
    template <typename ValueT, typename KeyT>
    __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
        KeyValuePair<KeyT, ValueT> input, SwizzleScanOp<ReduceByKeyOp<cub::Sum> > /*reduction_op*/,
        int last_lane, int offset)
    {
        KeyValuePair<KeyT, ValueT> output;
        KeyT other_key = ShuffleDown<LOGICAL_WARP_THREADS>(input.key, offset, last_lane, member_mask);

        output.key   = input.key;
        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset,
                                  Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());

        // Don't fold the peer's partial sum across a key boundary
        if (input.key != other_key)
            output.value = input.value;

        return output;
    }
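    // Worked example (illustrative): one such step with offset == 1 across an
    // 8-lane warp of (key, value) pairs, summing values within key runs:
    //
    //   lane:        0      1      2      3      4      5      6      7
    //   input:      (0,1)  (0,1)  (0,1)  (1,1)  (1,1)  (2,1)  (2,1)  (2,1)
    //   other_key:   0      0      1      1      2      2      2      2
    //   output:     (0,2)  (0,2)  (0,1)  (1,2)  (1,1)  (2,2)  (2,2)  (2,1)
    //
    // Lanes 2 and 4 face a key boundary and keep their own values; lane 7 has
    // no valid peer, so the guarded add leaves its value unchanged.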
    // Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across
    // KeyValuePair<OffsetT, ValueT> types)
    template <typename ValueT, typename OffsetT>
    __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
        KeyValuePair<OffsetT, ValueT> input, SwizzleScanOp<ReduceBySegmentOp<cub::Sum> > /*reduction_op*/,
        int last_lane, int offset)
    {
        KeyValuePair<OffsetT, ValueT> output;

        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset,
                                  Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
        output.key   = ReduceStep(input.key, cub::Sum(), last_lane, offset,
                                  Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());

        // A nonzero key marks a segment boundary below the peer
        if (input.key > 0)
            output.value = input.value;

        return output;
    }
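    // Looking back at the 64-bit summation specializations: each moves its
    // operand as two 32-bit halves because SHFL exchanges 32 bits per
    // instruction. Sketched with intrinsics (the ShuffleDown64Sketch name is
    // ours, for illustration only; current CUDA can also shuffle 64-bit
    // operands directly, which compiles to the same pair of 32-bit shuffles):
    __device__ __forceinline__ unsigned long long ShuffleDown64Sketch(
        unsigned long long x, int offset)
    {
        unsigned int lo = (unsigned int) (x & 0xffffffffull);
        unsigned int hi = (unsigned int) (x >> 32);

        lo = __shfl_down_sync(member_mask, lo, offset);   // one 32-bit shuffle per half
        hi = __shfl_down_sync(member_mask, hi, offset);

        return ((unsigned long long) hi << 32) | lo;
    }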
    // Reduction step (generic)
    template <typename _T, typename ReductionOp>
    __device__ __forceinline__ _T ReduceStep(
        _T input, ReductionOp reduction_op, int last_lane, int offset)
    {
        _T output = input;
        _T temp = ShuffleDown<LOGICAL_WARP_THREADS>(output, offset, last_lane, member_mask);

        // Perform reduction op if the peer lane lies within the segment
        if (offset + lane_id <= last_lane)
            output = reduction_op(input, temp);

        return output;
    }
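    // A rough intrinsic-level rendering of the step above, instantiated for
    // the uint32 sum case (a sketch; the ReduceStepSketch name is ours, and it
    // assumes a full 32-thread logical warp and CUDA 9+). CUB prefers the PTX
    // forms earlier in this file because they consume the shuffle's predicate
    // directly instead of re-deriving peer validity from lane arithmetic:
    __device__ __forceinline__ unsigned int ReduceStepSketch(
        unsigned int input, int last_lane, int offset)
    {
        unsigned int temp = __shfl_down_sync(member_mask, input, offset);

        // This test plays the role of the @p predicate guard in the PTX versions
        if (lane_id + offset <= (unsigned int) last_lane)
            input += temp;

        return input;
    }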
    // Reduction step (specialized for small (32b or less) unsigned integers)
    template <typename _T, typename ReductionOp>
    __device__ __forceinline__ _T ReduceStep(
        _T input, ReductionOp reduction_op, int last_lane, int offset,
        Int2Type<true> /*is_small_unsigned*/)
    {
        return ReduceStep(input, reduction_op, last_lane, offset);
    }

    // Reduction step (specialized for types other than small (32b or less)
    // unsigned integers)
    template <typename _T, typename ReductionOp>
    __device__ __forceinline__ _T ReduceStep(
        _T input, ReductionOp reduction_op, int last_lane, int offset,
        Int2Type<false> /*is_small_unsigned*/)
    {
        return ReduceStep(input, reduction_op, last_lane, offset);
    }
    // Reduction step (statically unrolled over the warp's shuffle offsets)
    template <typename ReductionOp, int STEP>
    __device__ __forceinline__ void ReduceStep(
        T &input, ReductionOp reduction_op, int last_lane, Int2Type<STEP> /*step*/)
    {
        input = ReduceStep(input, reduction_op, last_lane, 1 << STEP,
                           Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
        ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
    }

    // Termination of the static unrolling
    template <typename ReductionOp>
    __device__ __forceinline__ void ReduceStep(
        T &/*input*/, ReductionOp /*reduction_op*/, int /*last_lane*/, Int2Type<STEPS> /*step*/)
    {}
    // Reduction
    template <
        bool ALL_LANES_VALID,       // Whether all lanes are contributing a valid item
        typename ReductionOp>
    __device__ __forceinline__ T Reduce(
        T input, int valid_items, ReductionOp reduction_op)
    {
        // Last lane in the logical warp that is folding valid data
        int last_lane = (ALL_LANES_VALID) ?
                            LOGICAL_WARP_THREADS - 1 :
                            valid_items - 1;

        T output = input;
        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
        return output;
    }
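    // Worked example (illustrative): Reduce() over an 8-thread logical warp of
    // all-ones input with ALL_LANES_VALID (last_lane == 7) runs STEPS == 3
    // ReduceStep passes with offsets 1, 2, 4; '*' marks lanes whose guard
    // (offset + lane_id <= last_lane) fails, leaving their value unchanged:
    //
    //   lane:           0   1   2   3   4   5   6   7
    //   after offset 1: 2   2   2   2   2   2   2   1*
    //   after offset 2: 4   4   4   4   4   3   2*  1*
    //   after offset 4: 8   7   6   5   4*  3*  2*  1*
    //
    // Lane 0 ends holding the aggregate of all eight inputs.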
    // Segmented reduction
    template <bool HEAD_SEGMENTED, typename FlagT, typename ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T input, FlagT flag, ReductionOp reduction_op)
    {
        // Get a ballot of the segment flags across the warp, convert head
        // flags to tail flags, and mask out the lanes below the calling thread
        int warp_flags = WARP_BALLOT(flag, member_mask);
        if (HEAD_SEGMENTED)
            warp_flags >>= 1;
        warp_flags &= LaneMaskGe();

        // Mask in the last lane of logical warp
        warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);

        // Find the next set flag: the last lane of the calling thread's segment
        int last_lane = __clz(__brev(warp_flags));

        T output = input;
        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
        return output;
    }
};
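The last_lane computation relies on a bit trick: __brev moves the lowest set bit k of warp_flags to bit position 31 - k, so __clz then returns k, the index of the nearest tail flag at or above the calling lane (the flags below it were already masked off with LaneMaskGe). A minimal host-side sketch with portable stand-ins for the two device intrinsics:

    #include <cstdio>

    unsigned int brev32(unsigned int x)     // stand-in for the __brev intrinsic
    {
        unsigned int r = 0;
        for (int i = 0; i < 32; ++i)
            r |= ((x >> i) & 1u) << (31 - i);
        return r;
    }

    int clz32(unsigned int x)               // stand-in for the __clz intrinsic
    {
        for (int i = 31; i >= 0; --i)
            if ((x >> i) & 1u) return 31 - i;
        return 32;
    }

    int main()
    {
        unsigned int warp_flags = (1u << 5) | (1u << 20);       // tail flags at lanes 5 and 20
        printf("last_lane = %d\n", clz32(brev32(warp_flags)));  // prints 5: the lowest set bit wins
        return 0;
    }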
Member reference for WarpReduceShfl:

WarpReduceShfl
    WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.

Constants and nested types:

typedef NullType TempStorage
    Shared memory storage layout type.
IS_ARCH_WARP
    Whether the logical warp size and the PTX warp size coincide.
STEPS
    The number of warp reduction steps.
LOGICAL_WARPS
    Number of logical warps in a PTX warp.
SHFL_C
    The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up.
IsInteger<S>::IS_SMALL_UNSIGNED
    Whether the data type is a small (32b or less) unsigned integer for which we can use a single SHFL instruction per exchange.

Data members:

unsigned int lane_id
    Lane index in logical warp.
unsigned int warp_id
    Logical warp index in 32-thread physical warp.
unsigned int member_mask
    32-thread physical warp member mask of logical warp.

Construction:

__device__ __forceinline__ WarpReduceShfl(TempStorage &)
    Constructor.

Reduction steps:

__device__ __forceinline__ unsigned int ReduceStep(unsigned int input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across uint32 types).
__device__ __forceinline__ float ReduceStep(float input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across fp32 types).
__device__ __forceinline__ unsigned long long ReduceStep(unsigned long long input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across unsigned long long types).
__device__ __forceinline__ long long ReduceStep(long long input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across long long types).
__device__ __forceinline__ double ReduceStep(double input, cub::Sum, int last_lane, int offset)
    Reduction (specialized for summation across double types).
__device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(KeyValuePair<KeyT, ValueT> input, SwizzleScanOp<ReduceByKeyOp<cub::Sum> >, int last_lane, int offset)
    Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types).
__device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(KeyValuePair<OffsetT, ValueT> input, SwizzleScanOp<ReduceBySegmentOp<cub::Sum> >, int last_lane, int offset)
    Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types).
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset)
    Reduction step (generic).
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type<true>)
    Reduction step (specialized for small (32b or less) unsigned integers).
__device__ __forceinline__ _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type<false>)
    Reduction step (specialized for types other than small (32b or less) unsigned integers).
__device__ __forceinline__ void ReduceStep(T &input, ReductionOp reduction_op, int last_lane, Int2Type<STEP>)
    Reduction step (statically unrolled driver over the warp's shuffle offsets).

Reduction interface:

__device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op)
    Reduction.
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
    Segmented reduction.

Referenced utilities:

__device__ __forceinline__ unsigned int LaneId()
    Returns the warp lane ID of the calling thread.
__device__ __forceinline__ unsigned int LaneMaskGe()
    Returns the warp lane mask of all lanes greater than or equal to the calling thread.
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
    Warp-wide ballot across the lanes named by member_mask.
KeyValuePair
    A key identifier paired with a corresponding value.
Int2Type
    Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
Log2
    Statically determine log2(N), rounded up.
NullType
    A simple "NULL" marker type.
SwizzleScanOp
    Binary operator wrapper for switching non-commutative scan arguments.
CUB_NS_PREFIX / CUB_NS_POSTFIX
    Optional outer namespace(s).
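For context, here is how these internals are typically reached through CUB's public warp-reduction interface, which dispatches to WarpReduceShfl on SHFL-capable architectures (a minimal sketch, assuming a single 32-thread warp per block and CUDA 9+; Sum() exercises Reduce(), and HeadSegmentedSum() exercises SegmentedReduce()):

    #include <cub/cub.cuh>

    __global__ void WarpReduceExample(const int *d_in, const int *d_head_flags, int *d_out)
    {
        typedef cub::WarpReduce<int> WarpReduce;

        // Separate storage for each reduction (unused by the SHFL
        // specialization, but required by the interface)
        __shared__ typename WarpReduce::TempStorage sum_storage;
        __shared__ typename WarpReduce::TempStorage seg_storage;

        int item      = d_in[threadIdx.x];
        int head_flag = d_head_flags[threadIdx.x];

        // Warp-wide sum: lane 0 receives the aggregate of all 32 items
        int warp_aggregate = WarpReduce(sum_storage).Sum(item);

        // Head-flagged segmented sum: each segment's first lane receives its sum
        int segment_aggregate = WarpReduce(seg_storage).HeadSegmentedSum(item, head_flag);

        if (threadIdx.x == 0)
            d_out[0] = warp_aggregate;
        if (head_flag)
            d_out[1 + threadIdx.x] = segment_aggregate;
    }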