doxygen/openfpm/block__scan__warp__scans2_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include "../../util_arch.cuh"

#include "../../util_ptx.cuh"

#include "../../warp/warp_scan.cuh"

#include "../../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/**
 * \brief Warp-scan based parallel prefix scan across a CUDA thread block.
 *
 * Every warp first scans its own items with a WARP_THREADS-wide WarpScan.  The
 * per-warp aggregates are then exchanged through shared memory and combined
 * into a per-warp prefix and a block-wide aggregate.  The exclusive scans
 * combine the aggregates with a second, WARPS-wide WarpScan; the inclusive
 * scans combine them with a compile-time unrolled serial pass.
 *
 * NOTE(review): only the thread with lane_id == WARP_THREADS - 1 publishes a
 * warp's aggregate.  If BLOCK_THREADS is not a multiple of WARP_THREADS the
 * last warp presumably has no such lane, leaving its slot unwritten --
 * confirm that callers only instantiate this type with whole warps.
 *
 * \tparam T             Data type being scanned
 * \tparam BLOCK_DIM_X   Thread block length in threads along the X dimension
 * \tparam BLOCK_DIM_Y   Thread block length in threads along the Y dimension
 * \tparam BLOCK_DIM_Z   Thread block length in threads along the Z dimension
 * \tparam PTX_ARCH      PTX architecture value forwarded to CUB_WARP_THREADS and WarpScan
 */
template <
    typename    T,
    int         BLOCK_DIM_X,
    int         BLOCK_DIM_Y,
    int         BLOCK_DIM_Z,
    int         PTX_ARCH>
struct BlockScanWarpScans
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// Constants
    enum
    {
        /// Number of warp threads
        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),

        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        /// Number of active warps (BLOCK_THREADS / WARP_THREADS, rounded up)
        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
    };

    /// WarpScan utility type used by each warp to scan its own items
    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;

    /// WarpScan utility type used to scan the WARPS per-warp aggregates
    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];          ///< Buffer for the scans over warp aggregates (one slot per warp)
        typename WarpScanT::TempStorage             warp_scan[WARPS];           ///< Buffer for the per-warp item scans (one slot per warp)
        T                                           warp_aggregates[WARPS];     ///< Aggregate published by each warp
        T                                           block_prefix;               ///< Shared prefix for the entire thread block
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage    &temp_storage;      ///< Reference to the aliased temporary storage
    unsigned int    linear_tid;         ///< Row-major linear thread identifier within the block
    unsigned int    warp_id;            ///< Index of the calling thread's warp (linear_tid / WARP_THREADS)
    unsigned int    lane_id;            ///< Warp lane of the calling thread, as returned by LaneId()


    //---------------------------------------------------------------------
    // Constructors
    //---------------------------------------------------------------------

    /// Constructor.  Binds the temporary storage and caches the thread's identifiers.
    __device__ __forceinline__ BlockScanWarpScans(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
        lane_id(LaneId())
    {}


    //---------------------------------------------------------------------
    // Utility methods
    //---------------------------------------------------------------------

    /**
     * \brief One step of the compile-time unrolled serial pass over the warp aggregates.
     *
     * Records the running aggregate as the prefix of warp \p WARP, folds that
     * warp's published aggregate into the running aggregate, then recurses
     * for warp WARP + 1.
     */
    template <typename ScanOp, int WARP>
    __device__ __forceinline__ void ApplyWarpAggregates(
        T               &warp_prefix,           ///< [out] The calling thread's warp prefix (written only when warp_id == WARP)
        ScanOp          scan_op,                ///< [in] Binary scan operator
        T               &block_aggregate,       ///< [in,out] Running aggregate of warps 0 .. WARP-1 on entry
        Int2Type<WARP>  addend_warp)            ///< [in] Tag selecting the warp whose aggregate is folded in
    {
        if (warp_id == WARP)
            warp_prefix = block_aggregate;

        T addend = temp_storage.warp_aggregates[WARP];
        block_aggregate = scan_op(block_aggregate, addend);

        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
    }

    /// Recursion terminator: every warp's aggregate has been applied.
    template <typename ScanOp>
    __device__ __forceinline__ void ApplyWarpAggregates(
        T               &warp_prefix,
        ScanOp          scan_op,
        T               &block_aggregate,
        Int2Type<WARPS> addend_warp)
    {}

    /**
     * \brief Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns the block-wide aggregate.
     *
     * Contains a block-wide barrier, so every thread of the block must call it.
     *
     * \return The calling warp's prefix.  The value is not assigned (invalid) for warp 0.
     */
    template <typename ScanOp>
    __device__ __forceinline__ T ComputeWarpPrefix(
        ScanOp          scan_op,                ///< [in] Binary scan operator
        T               warp_aggregate,         ///< [in] Value the warp's last lane publishes as the warp aggregate
        T               &block_aggregate)       ///< [out] Aggregate over all warp aggregates
    {
        // Last lane in each warp shares its warp-aggregate
        if (lane_id == WARP_THREADS - 1)
            temp_storage.warp_aggregates[warp_id] = warp_aggregate;

        CTA_SYNC();

        // Accumulate block aggregates and save the one that is our warp's prefix
        T warp_prefix;
        block_aggregate = temp_storage.warp_aggregates[0];

        // Serial pass over warps 1 .. WARPS-1, expressed as template recursion
        // (since the PTX backend can't handle unrolling the equivalent loop for SM1x)
        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());

        return warp_prefix;
    }

    /**
     * \brief Use the warp-wide aggregates and \p initial_value to compute the calling warp's prefix.  Also returns the block-wide aggregate.
     *
     * The returned prefix is seeded with \p initial_value; \p block_aggregate is not.
     *
     * \return \p initial_value for warp 0, otherwise scan_op(initial_value, unseeded warp prefix).
     */
    template <typename ScanOp>
    __device__ __forceinline__ T ComputeWarpPrefix(
        ScanOp          scan_op,                ///< [in] Binary scan operator
        T               warp_aggregate,         ///< [in] Value the warp's last lane publishes as the warp aggregate
        T               &block_aggregate,       ///< [out] Aggregate over all warp aggregates (excludes initial_value)
        const T         &initial_value)         ///< [in] Seed for the scan
    {
        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);

        // Warp 0 has no unseeded prefix, so do not hand the (unassigned) value
        // to the scan operator; the branch is uniform across each warp.
        if (warp_id == 0)
            warp_prefix = initial_value;
        else
            warp_prefix = scan_op(initial_value, warp_prefix);

        return warp_prefix;
    }

    /**
     * \brief Scan the per-warp aggregates with a WARPS-wide warp scan.
     *
     * Each warp's last lane publishes \p warp_aggregate to shared memory.
     * After a block-wide barrier, the first WARPS lanes of every warp scan the
     * published aggregates, and the results are broadcast within the warp:
     * the exclusive result of lane warp_id becomes \p warp_prefix and the
     * inclusive result of lane WARPS - 1 becomes \p block_aggregate.
     *
     * Contains a block-wide barrier, so every thread of the block must call it.
     */
    template <typename ScanOp>
    __device__ __forceinline__ void ScanWarpAggregates(
        WarpScanT       &my_warp_scan,          ///< [in] The calling warp's WarpScan instance (used for the broadcasts)
        T               warp_aggregate,         ///< [in] Value the warp's last lane publishes as the warp aggregate
        ScanOp          scan_op,                ///< [in] Binary scan operator
        T               &warp_prefix,           ///< [out] The calling warp's unseeded prefix (invalid for warp 0)
        T               &block_aggregate)       ///< [out] Aggregate over all warp aggregates
    {
        // Last lane in each warp shares its warp-aggregate
        if (lane_id == WARP_THREADS - 1)
            temp_storage.warp_aggregates[warp_id] = warp_aggregate;

        CTA_SYNC();

        // Scan the warp aggregates; only the first WARPS lanes hold one
        T aggregates_inclusive, aggregates_exclusive;
        if (lane_id < WARPS)
        {
            T warp_val = temp_storage.warp_aggregates[lane_id];
            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, aggregates_inclusive, aggregates_exclusive, scan_op);
        }

        // Lane warp_id holds this warp's prefix; lane WARPS-1 holds the block aggregate
        warp_prefix         = my_warp_scan.Broadcast(aggregates_exclusive, warp_id);
        block_aggregate     = my_warp_scan.Broadcast(aggregates_inclusive, WARPS - 1);
    }


    //---------------------------------------------------------------------
    // Exclusive scans
    //---------------------------------------------------------------------

    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  With no initial value, the output computed for thread 0 is invalid.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                  ///< [in] Calling thread's input item
        T               &exclusive_output,      ///< [out] Calling thread's output item
        ScanOp          scan_op)                ///< [in] Binary scan operator
    {
        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
        T block_aggregate;
        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
    }

    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor, seeded with \p initial_value.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                  ///< [in] Calling thread's input item
        T               &exclusive_output,      ///< [out] Calling thread's output item
        const T         &initial_value,         ///< [in] Seed for the scan (output of thread 0)
        ScanOp          scan_op)                ///< [in] Binary scan operator
    {
        T block_aggregate;
        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
    }

    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Also provides every thread with the block-wide \p block_aggregate.  With no initial value, the output computed for thread 0 is invalid.
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                  ///< [in] Calling thread's input item
        T               &exclusive_output,      ///< [out] Calling thread's output item
        ScanOp          scan_op,                ///< [in] Binary scan operator
        T               &block_aggregate)       ///< [out] Aggregate over all warp aggregates
    {
        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);

        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
        T inclusive_output;
        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);

        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
        T warp_prefix;
        ScanWarpAggregates(my_warp_scan, inclusive_output, scan_op, warp_prefix, block_aggregate);

        // Apply warp prefix to our lane's partial
        if (warp_id != 0)
        {
            exclusive_output = scan_op(warp_prefix, exclusive_output);

            // Lane 0 has no valid partial of its own: its output is exactly the warp prefix
            if (lane_id == 0)
                exclusive_output = warp_prefix;
        }
    }

    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor, seeded with \p initial_value.  Also provides every thread with the block-wide \p block_aggregate, which does not include \p initial_value (matching ComputeWarpPrefix).
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T               input,                  ///< [in] Calling thread's input item
        T               &exclusive_output,      ///< [out] Calling thread's output item
        const T         &initial_value,         ///< [in] Seed for the scan (output of thread 0)
        ScanOp          scan_op,                ///< [in] Binary scan operator
        T               &block_aggregate)       ///< [out] Aggregate over all warp aggregates (excludes initial_value)
    {
        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);

        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
        T inclusive_output;
        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);

        // Compute the unseeded warp-wide prefix and block-wide aggregate for each warp.
        // The aggregates are scanned without the seed so that block_aggregate stays unseeded.
        T warp_prefix;
        ScanWarpAggregates(my_warp_scan, inclusive_output, scan_op, warp_prefix, block_aggregate);

        // Seed the warp prefix.  Warp 0 has no unseeded prefix, so its prefix is the seed itself
        // (the branch is uniform across each warp).
        if (warp_id == 0)
            warp_prefix = initial_value;
        else
            warp_prefix = scan_op(initial_value, warp_prefix);

        // Apply warp prefix to our lane's partial
        exclusive_output = scan_op(warp_prefix, exclusive_output);

        // Lane 0 has no valid partial of its own: its output is exactly the warp prefix
        if (lane_id == 0)
            exclusive_output = warp_prefix;
    }

    /**
     * \brief Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor, seeded by a call-back functor.
     *
     * \p block_prefix_callback_op is invoked with the block-wide aggregate by
     * the threads of warp 0; the value returned in lane 0 is shared with the
     * block and becomes the output of thread 0.
     */
    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void ExclusiveScan(
        T                       input,                          ///< [in] Calling thread's input item
        T                       &exclusive_output,              ///< [out] Calling thread's output item
        ScanOp                  scan_op,                        ///< [in] Binary scan operator
        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in,out] Call-back functor producing the thread block prefix
    {
        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
        T block_aggregate;
        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);

        // Use the first warp to determine the thread block prefix, returning the result in lane0
        if (warp_id == 0)
        {
            T block_prefix = block_prefix_callback_op(block_aggregate);
            if (lane_id == 0)
            {
                // Share the prefix with all threads
                temp_storage.block_prefix = block_prefix;
                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
            }
        }

        CTA_SYNC();

        // Incorporate thread block prefix into outputs (tid0 already holds its final value)
        T block_prefix = temp_storage.block_prefix;
        if (linear_tid > 0)
        {
            exclusive_output = scan_op(block_prefix, exclusive_output);
        }
    }


    //---------------------------------------------------------------------
    // Inclusive scans
    //---------------------------------------------------------------------

    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T               input,                  ///< [in] Calling thread's input item
        T               &inclusive_output,      ///< [out] Calling thread's output item
        ScanOp          scan_op)                ///< [in] Binary scan operator
    {
        T block_aggregate;
        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
    }

    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Also provides every thread with the block-wide \p block_aggregate.
    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T               input,                  ///< [in] Calling thread's input item
        T               &inclusive_output,      ///< [out] Calling thread's output item
        ScanOp          scan_op,                ///< [in] Binary scan operator
        T               &block_aggregate)       ///< [out] Aggregate over all warp aggregates
    {
        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);

        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);

        // Apply warp prefix to our lane's partial
        if (warp_id != 0)
        {
            inclusive_output = scan_op(warp_prefix, inclusive_output);
        }
    }

    /**
     * \brief Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor, seeded by a call-back functor.
     *
     * \p block_prefix_callback_op is invoked with the block-wide aggregate by
     * the threads of warp 0; the value returned in lane 0 is shared with the
     * block and folded into every thread's output.
     *
     * Despite its name, \p exclusive_output receives the calling thread's
     * inclusive scan result.
     */
    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void InclusiveScan(
        T                       input,                          ///< [in] Calling thread's input item
        T                       &exclusive_output,              ///< [out] Calling thread's (inclusive) output item
        ScanOp                  scan_op,                        ///< [in] Binary scan operator
        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in,out] Call-back functor producing the thread block prefix
    {
        T block_aggregate;
        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);

        // Use the first warp to determine the thread block prefix, returning the result in lane0
        if (warp_id == 0)
        {
            T block_prefix = block_prefix_callback_op(block_aggregate);
            if (lane_id == 0)
            {
                // Share the prefix with all threads
                temp_storage.block_prefix = block_prefix;
            }
        }

        CTA_SYNC();

        // Incorporate thread block prefix into outputs
        T block_prefix = temp_storage.block_prefix;
        exclusive_output = scan_op(block_prefix, exclusive_output);
    }

};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


cub::WarpScan
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
Definition warp_scan.cuh:147

cub::WarpScan::Scan
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
Definition warp_scan.cuh:799

cub::WarpScan::Broadcast
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lane src_lane to all lanes in the warp.
Definition warp_scan.cuh:922

cub::RowMajorTid
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409

cub::LaneId
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420

cub::CTA_SYNC
CTA_SYNC()
Definition util_ptx.cuh:255

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::scan_op
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Definition dispatch_scan.cuh:109

cub::BlockScanWarpScans::TempStorage
Alias wrapper allowing storage to be unioned.
Definition block_scan_warp_scans.cuh:92

cub::BlockScanWarpScans::_TempStorage
Shared memory storage layout type.
Definition block_scan_warp_scans2.cuh:83

cub::BlockScanWarpScans::_TempStorage::warp_scan
WarpScanT::TempStorage warp_scan[WARPS]
Buffer for warp-synchronous scans.
Definition block_scan_warp_scans2.cuh:85

cub::BlockScanWarpScans::_TempStorage::block_prefix
T block_prefix
Shared prefix for the entire thread block.
Definition block_scan_warp_scans2.cuh:87

cub::BlockScanWarpScans::_TempStorage::inner_scan
WarpAggregateScanT::TempStorage inner_scan[WARPS]
Buffer for warp-synchronous scans.
Definition block_scan_warp_scans2.cuh:84

cub::BlockScanWarpScans::ExclusiveScan
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
Definition block_scan_warp_scans.cuh:208

cub::BlockScanWarpScans::WARP_THREADS
@ WARP_THREADS
Number of warp threads.
Definition block_scan_warp_scans.cuh:66

cub::BlockScanWarpScans::WARPS
@ WARPS
Number of active warps.
Definition block_scan_warp_scans.cuh:72

cub::BlockScanWarpScans::InclusiveScan
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
Definition block_scan_warp_scans.cuh:323

cub::BlockScanWarpScans::ApplyWarpAggregates
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP > addend_warp)
Definition block_scan_warp_scans2.cuh:126

cub::BlockScanWarpScans::ComputeWarpPrefix
__device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate)
Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate ...
Definition block_scan_warp_scans2.cuh:152

cub::BlockScanWarpScans::BlockScanWarpScans
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
Definition block_scan_warp_scans2.cuh:111

cub::BlockScanWarpScans::ApplyWarpAggregates
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARPS > addend_warp)
Definition block_scan_warp_scans2.cuh:142

cub::BlockScanWarpScans::WarpScanT
WarpScan< T, WARP_THREADS, PTX_ARCH > WarpScanT
WarpScan utility type.
Definition block_scan_warp_scans2.cuh:76

cub::BlockScanWarpScans::BLOCK_THREADS
@ BLOCK_THREADS
The thread block size in threads.
Definition block_scan_warp_scans.cuh:69

cub::BlockScanWarpScans::WarpAggregateScanT
WarpScan< T, WARPS, PTX_ARCH > WarpAggregateScanT
WarpScan utility type.
Definition block_scan_warp_scans2.cuh:79

cub::BlockScanWarpScans::ApplyWarpAggregates
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP >)
Definition block_scan_warp_scans.cuh:126

cub::Int2Type
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition util_type.cuh:276

cub::Uninitialized
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition util_type.cuh:635

cub::WarpScan::TempStorage
\smemstorage{WarpScan}
Definition warp_scan.cuh:192