#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_scan.cuh"
#include "../block/block_exchange.cuh"
#include "../block/block_discontinuity.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../iterator/constant_input_iterator.cuh"
#include "../util_namespace.cuh"
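/**
 * Illustrative usage sketch (not part of this header): AgentRle is the
 * per-tile agent underlying cub::DeviceRunLengthEncode::NonTrivialRuns.
 * A minimal host-side invocation might look like the following (buffer
 * names are hypothetical; d_in holds num_items keys):
 *
 * \code
 * void   *d_temp_storage     = NULL;
 * size_t  temp_storage_bytes = 0;
 *
 * // First call: query required temporary storage size
 * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *     d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
 *
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 * // Second call: compute offsets and lengths of all non-trivial runs
 * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
 *     d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
 * \endcode
 */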
int  _ITEMS_PER_THREAD,
bool _STORE_WARP_TIME_SLICING,
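// Illustrative sketch: a concrete tuning policy fixes these compile-time
// knobs. The values below are hypothetical, and the parameter order is
// assumed from the policy's members (BLOCK_THREADS, ITEMS_PER_THREAD,
// LOAD_ALGORITHM, LOAD_MODIFIER, STORE_WARP_TIME_SLICING, SCAN_ALGORITHM):
//
//   typedef AgentRlePolicy<
//       128,                        // _BLOCK_THREADS
//       8,                          // _ITEMS_PER_THREAD
//       BLOCK_LOAD_WARP_TRANSPOSE,  // _LOAD_ALGORITHM
//       LOAD_LDG,                   // _LOAD_MODIFIER
//       true,                       // _STORE_WARP_TIME_SLICING
//       BLOCK_SCAN_WARP_SCANS>      // _SCAN_ALGORITHM
//   MyAgentRlePolicy;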
typename AgentRlePolicyT,
typename InputIteratorT,
typename OffsetsOutputIteratorT,
typename LengthsOutputIteratorT,
typename EqualityOpT,
// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type T;
// The lengths output value type: if LengthsOutputIteratorT's value type is
// void, lengths default to OffsetT; otherwise use the iterator's value type
typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),
    OffsetT,
    typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;
WARP_THREADS     = CUB_WARP_THREADS(PTX_ARCH),
BLOCK_THREADS    = AgentRlePolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD,
WARP_ITEMS       = WARP_THREADS * ITEMS_PER_THREAD,
TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,
WARPS            = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
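// Worked example (hypothetical values): with WARP_THREADS = 32,
// BLOCK_THREADS = 128, and ITEMS_PER_THREAD = 4, each warp covers
// WARP_ITEMS = 32 * 4 = 128 items, each tile covers TILE_ITEMS = 128 * 4 = 512
// items, and the block comprises WARPS = (128 + 31) / 32 = 4 warps.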
// Functor for flagging inequality between adjacent items. On the last tile,
// positions at or beyond num_remaining are forced to register as
// discontinuities so out-of-bounds items can never extend a run.
template <bool LAST_TILE>
struct OobInequalityOp
{
    OffsetT     num_remaining;
    EqualityOpT equality_op;

    __device__ __forceinline__ OobInequalityOp(OffsetT num_remaining, EqualityOpT equality_op)
    :
        num_remaining(num_remaining),
        equality_op(equality_op)
    {}

    template <typename Index>
    __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
    {
        if (!LAST_TILE || (idx < num_remaining))
            return !equality_op(first, second);
        else
            return true;
    }
};
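// Worked example (hypothetical): on the last tile with num_remaining = 5, the
// comparison at idx = 5 returns true regardless of the item values, so the run
// containing item 4 terminates at the logical end of the input rather than
// being extended by uninitialized out-of-bounds items.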
// Wrap a native input pointer with CacheModifiedInputIterator when possible
// (otherwise use InputIteratorT directly)
typedef typename If<IsPointer<InputIteratorT>::VALUE,
    CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,
    InputIteratorT>::Type WrappedInputIteratorT;
// Parameterized BlockLoad type for data items
typedef BlockLoad<
        T,
        AgentRlePolicyT::BLOCK_THREADS,
        AgentRlePolicyT::ITEMS_PER_THREAD,
        AgentRlePolicyT::LOAD_ALGORITHM>
    BlockLoadT;
unsigned long long       align;
WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS];
// Constructor
__device__ __forceinline__
AgentRle(
    TempStorage            &temp_storage,   // Reference to shared temporary storage
    InputIteratorT         d_in,            // Pointer to the input sequence of data items
    OffsetsOutputIteratorT d_offsets_out,   // Pointer to the output sequence of run offsets
    LengthsOutputIteratorT d_lengths_out,   // Pointer to the output sequence of run lengths
    EqualityOpT            equality_op,     // T equality operator
    OffsetT                num_items)       // Total number of input items
template <bool FIRST_TILE, bool LAST_TILE>
__device__ __forceinline__ void InitializeSelections(
    OffsetT          tile_offset,
    OffsetT          num_remaining,
    T                (&items)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
{
    bool head_flags[ITEMS_PER_THREAD];
    bool tail_flags[ITEMS_PER_THREAD];

    OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
    if (FIRST_TILE && LAST_TILE)
    {
        // First-and-last tile: head-flag the first item and tail-flag the last item
        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tail_flags, items, inequality_op);
    }
    else if (FIRST_TILE)
    {
        // First-but-not-last tile: requires the next tile's first item as successor
        T tile_successor_item;
        if (threadIdx.x == BLOCK_THREADS - 1)
            tile_successor_item = d_in[tile_offset + TILE_ITEMS];

        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tail_flags, tile_successor_item, items, inequality_op);
    }
    else if (LAST_TILE)
    {
        // Last-but-not-first tile: requires the previous tile's last item as predecessor
        T tile_predecessor_item;
        if (threadIdx.x == 0)
            tile_predecessor_item = d_in[tile_offset - 1];

        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
    }
    else
    {
        // Middle tile: requires both a predecessor and a successor item
        T tile_successor_item;
        if (threadIdx.x == BLOCK_THREADS - 1)
            tile_successor_item = d_in[tile_offset + TILE_ITEMS];

        T tile_predecessor_item;
        if (threadIdx.x == 0)
            tile_predecessor_item = d_in[tile_offset - 1];

        BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
            head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
    }
    // Zip head/tail flags into (run-count, run-length) scan pairs: key flags
    // the start of a non-trivial run; value marks membership in one
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        lengths_and_num_runs[ITEM].key   = head_flags[ITEM] && (!tail_flags[ITEM]);
        lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
    }
}
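// Worked example (hypothetical 6-item sequence "a a b c c c"):
//   head_flags             = 1 0 1 1 0 0
//   tail_flags             = 0 1 1 0 0 1
//   key   (head && !tail)  = 1 0 0 1 0 0   -> start of each non-trivial run
//   value (!head || !tail) = 1 1 0 1 1 1   -> items inside non-trivial runs
//                                             ("b" is a trivial length-1 run)
// Scanning these pairs with the reduce-by-segment operator simultaneously
// numbers the non-trivial runs (key) and accumulates their lengths (value).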
// Warp-synchronous scan of run counts and lengths, producing tile-, warp-, and
// thread-level aggregates and exclusive prefixes
__device__ __forceinline__ void WarpScanAllocations(
    LengthOffsetPair &tile_aggregate,
    LengthOffsetPair &warp_aggregate,
    LengthOffsetPair &warp_exclusive_in_tile,
    LengthOffsetPair &thread_exclusive_in_warp,
    LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD])
{
    // Perform warpscans
    unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
    int lane_id = LaneId();

    LengthOffsetPair identity;
    identity.key   = 0;
    identity.value = 0;

    LengthOffsetPair thread_inclusive;
    LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
    WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
        thread_aggregate, thread_inclusive, thread_exclusive_in_warp, scan_op);

    // The last lane of each warp publishes its warp-wide aggregate
    if (lane_id == WARP_THREADS - 1)
        temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;

    CTA_SYNC();

    // Read back per-warp aggregates to seed the tile-wide fold
    warp_exclusive_in_tile = identity;
    warp_aggregate         = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
    tile_aggregate         = temp_storage.aliasable.warp_aggregates.Alias()[0];
    #pragma unroll
    for (int WARP = 1; WARP < WARPS; ++WARP)
    {
        // The warp with index WARP takes the running fold as its exclusive prefix
        if (warp_id == WARP)
            warp_exclusive_in_tile = tile_aggregate;

        tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
    }
}
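// The fold above combines per-warp LengthOffsetPair aggregates with the
// reduce-by-segment scan operator. A minimal sketch of the assumed semantics
// (mirroring cub::ReduceBySegmentOp): run counts add, while the accumulated
// length restarts whenever the right-hand partial begins any new run:
//
//   LengthOffsetPair combine(LengthOffsetPair a, LengthOffsetPair b)
//   {
//       LengthOffsetPair c;
//       c.key   = a.key + b.key;               // total run count
//       c.value = (b.key) ? b.value            // a run boundary resets the length
//                         : a.value + b.value; // otherwise lengths accumulate
//       return c;
//   }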
// Two-phase scatter, specialized for warp time-slicing
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
    Int2Type<true>   is_warp_time_slice)
{
    unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
    int lane_id = LaneId();

    // Locally compact items within the warp (first warp)
    if (warp_id == 0)
    {
        WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
            lengths_and_offsets, thread_num_runs_exclusive_in_warp);
    }
    // Locally compact items within the warp (remaining warps, sharing the single time-sliced slot)
    #pragma unroll
    for (int SLICE = 1; SLICE < WARPS; ++SLICE)
    {
        CTA_SYNC();

        if (warp_id == SLICE)
        {
            WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
                lengths_and_offsets, thread_num_runs_exclusive_in_warp);
        }
    }
    // Global scatter
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
        {
            OffsetT item_offset =
                tile_num_runs_exclusive_in_global +
                warp_num_runs_exclusive_in_tile +
                (ITEM * WARP_THREADS) + lane_id;

            d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;

            // Scatter length if not the very first (global) length
            if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
                d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
        }
    }
}
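// Note: each scattered pair carries the offset of run i together with the
// accumulated length of run i-1 (a run's length is only final once the next
// run begins), so lengths land one slot behind offsets and the very first
// length write is skipped. The final run's length is patched in at the end of
// ConsumeRange from the last tile's running total.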
// Two-phase scatter (no warp time-slicing)
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterTwoPhase(
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD],
    Int2Type<false>  is_warp_time_slice)
{
    unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
    int lane_id = LaneId();

    // Unzip, so offsets and lengths can be exchanged through separate storage
    OffsetT run_offsets[ITEMS_PER_THREAD];
    LengthT run_lengths[ITEMS_PER_THREAD];

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
        run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
    }

    WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
        run_offsets, thread_num_runs_exclusive_in_warp);

    WARP_SYNC(0xffffffff);

    WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
        run_lengths, thread_num_runs_exclusive_in_warp);
    // Global scatter
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
        {
            OffsetT item_offset =
                tile_num_runs_exclusive_in_global +
                warp_num_runs_exclusive_in_tile +
                (ITEM * WARP_THREADS) + lane_id;

            d_offsets_out[item_offset] = run_offsets[ITEM];

            // Scatter length if not the very first (global) length
            if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
                d_lengths_out[item_offset - 1] = run_lengths[ITEM];
        }
    }
}
// Direct scatter
template <bool FIRST_TILE>
__device__ __forceinline__ void ScatterDirect(
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
        {
            OffsetT item_offset =
                tile_num_runs_exclusive_in_global +
                warp_num_runs_exclusive_in_tile +
                thread_num_runs_exclusive_in_warp[ITEM];

            d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;

            // Scatter length if not the very first (global) length
            if (item_offset >= 1)
                d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
        }
    }
}
// Scatter the computed runs. Direct scattering is cheapest when the tile holds
// fewer runs than threads; otherwise a two-phase scatter first compacts runs
// in shared memory so that global writes coalesce.
template <bool FIRST_TILE>
__device__ __forceinline__ void Scatter(
    OffsetT          tile_num_runs_aggregate,
    OffsetT          tile_num_runs_exclusive_in_global,
    OffsetT          warp_num_runs_aggregate,
    OffsetT          warp_num_runs_exclusive_in_tile,
    OffsetT          (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
    LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD])
{
    if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
    {
        // Direct scatter if the warp has any runs
        if (warp_num_runs_aggregate)
        {
            ScatterDirect<FIRST_TILE>(
                tile_num_runs_exclusive_in_global,
                warp_num_runs_aggregate,
                warp_num_runs_exclusive_in_tile,
                thread_num_runs_exclusive_in_warp,
                lengths_and_offsets);
        }
    }
    else
    {
        // Two-phase scatter
        ScatterTwoPhase<FIRST_TILE>(
            tile_num_runs_exclusive_in_global,
            warp_num_runs_aggregate,
            warp_num_runs_exclusive_in_tile,
            thread_num_runs_exclusive_in_warp,
            lengths_and_offsets,
            Int2Type<STORE_WARP_TIME_SLICING>());
    }
}
// Process a tile of input, returning the tile's running (inclusive) aggregate
// of run counts and lengths
template <bool LAST_TILE>
__device__ __forceinline__ LengthOffsetPair ConsumeTile(
    OffsetT        num_items,
    OffsetT        num_remaining,
    int            tile_idx,
    OffsetT        tile_offset,
    ScanTileStateT &tile_status)
{
    if (tile_idx == 0)
    {
        // First tile
        T items[ITEMS_PER_THREAD];      // Tile items (loaded from d_in + tile_offset via BlockLoadT)

        // Set flags
        LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
        InitializeSelections<true, LAST_TILE>(
            tile_offset,
            num_remaining,
            items,
            lengths_and_num_runs);
        // Exclusive scan of lengths and runs
        LengthOffsetPair tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp;
        WarpScanAllocations(
            tile_aggregate,
            warp_aggregate,
            warp_exclusive_in_tile,
            thread_exclusive_in_warp,
            lengths_and_num_runs);
        // Record the tile-wide inclusive aggregate for subsequent tiles
        if (!LAST_TILE && (threadIdx.x == 0))
            tile_status.SetInclusive(0, tile_aggregate);

        // Fold in the warp-wide running length if no run boundary precedes this thread within its warp
        if (thread_exclusive_in_warp.key == 0)
            thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
        OffsetT          thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD];

        // Downsweep scan through lengths_and_num_runs
        internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);

        // Zip
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
            lengths_and_offsets[ITEM].key           = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
            thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
                                                          lengths_and_num_runs2[ITEM].key :      // keep
                                                          WARP_THREADS * ITEMS_PER_THREAD;       // discard
        }
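        // Note: thread_num_runs_exclusive_in_warp[ITEM] now holds the run's
        // rank within the warp when the item begins a non-trivial run, or the
        // out-of-range sentinel WARP_THREADS * ITEMS_PER_THREAD otherwise; the
        // scatter routines use the sentinel to skip non-run items.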
        OffsetT tile_num_runs_aggregate           = tile_aggregate.key;
        OffsetT tile_num_runs_exclusive_in_global = 0;
        OffsetT warp_num_runs_aggregate           = warp_aggregate.key;
        OffsetT warp_num_runs_exclusive_in_tile   = warp_exclusive_in_tile.key;
        // Scatter
        Scatter<true>(
            tile_num_runs_aggregate,
            tile_num_runs_exclusive_in_global,
            warp_num_runs_aggregate,
            warp_num_runs_exclusive_in_tile,
            thread_num_runs_exclusive_in_warp,
            lengths_and_offsets);
        // Return running total (inclusive of this tile)
        return tile_aggregate;
    }
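    // The first tile needs no look-back: its global exclusive prefix is
    // identically zero (tile_num_runs_exclusive_in_global above). Subsequent
    // tiles instead resolve their global prefix through the tile status
    // interface, as in the path below.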
    else
    {
        // Not the first tile
        T items[ITEMS_PER_THREAD];      // Tile items (loaded from d_in + tile_offset via BlockLoadT)

        // Set flags
        LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD];
        InitializeSelections<false, LAST_TILE>(
            tile_offset,
            num_remaining,
            items,
            lengths_and_num_runs);
        // Exclusive scan of lengths and runs
        LengthOffsetPair tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp;
        WarpScanAllocations(
            tile_aggregate,
            warp_aggregate,
            warp_exclusive_in_tile,
            thread_exclusive_in_warp,
            lengths_and_num_runs);
        // First warp resolves the tile-wide exclusive prefix via the tile status interface
        TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, scan_op, tile_idx);
        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
        if (warp_id == 0)
        {
            prefix_op(tile_aggregate);
            if (threadIdx.x == 0)
                temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
        }

        CTA_SYNC();

        LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
        // Fold the tile- and warp-wide running lengths into this thread's exclusive prefix
        LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
        if (thread_exclusive_in_warp.key == 0)
            thread_exclusive_in_warp.value += thread_exclusive.value;
        OffsetT          thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD];
        LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD];

        // Downsweep scan through lengths_and_num_runs
        internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);

        // Zip
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
            lengths_and_offsets[ITEM].key           = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
            thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
                                                          lengths_and_num_runs2[ITEM].key :      // keep
                                                          WARP_THREADS * ITEMS_PER_THREAD;       // discard
        }
        OffsetT tile_num_runs_aggregate           = tile_aggregate.key;
        OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key;
        OffsetT warp_num_runs_aggregate           = warp_aggregate.key;
        OffsetT warp_num_runs_exclusive_in_tile   = warp_exclusive_in_tile.key;
        // Scatter
        Scatter<false>(
            tile_num_runs_aggregate,
            tile_num_runs_exclusive_in_global,
            warp_num_runs_aggregate,
            warp_num_runs_exclusive_in_tile,
            thread_num_runs_exclusive_in_warp,
            lengths_and_offsets);
        // Return running total (inclusive of this tile)
        return prefix_op.inclusive_prefix;
    }
}
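// Sketch of the decoupled look-back contract assumed above: each tile posts its
// local aggregate to tile_status as soon as it is known; prefix_op then
// inspects predecessor tiles' status words, summing their aggregates (or
// consuming an already-complete inclusive prefix) until this tile's global
// exclusive prefix is resolved, after which this tile's inclusive prefix is
// published for its successors.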
template <typename NumRunsIteratorT>
__device__ __forceinline__ void ConsumeRange(
    int              num_tiles,       // Total number of input tiles
    ScanTileStateT   &tile_status,    // Global tile status interface
    NumRunsIteratorT d_num_runs_out)  // Output pointer for the total number of runs
{
    // Blocks are launched in increasing order, so just assign one tile of input per block
    int     tile_idx      = (blockIdx.x * gridDim.y) + blockIdx.y;  // Current tile index
    OffsetT tile_offset   = tile_idx * TILE_ITEMS;                  // Global offset for the current tile
    OffsetT num_remaining = num_items - tile_offset;                // Remaining items (including this tile)

    if (tile_idx < num_tiles - 1)
    {
        // Not the last tile (full)
        ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
    }
    else if (num_remaining > 0)
    {
        // The last tile (possibly partially full)
        LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);

        if (threadIdx.x == 0)
        {
            // Output the total number of runs encountered
            *d_num_runs_out = running_total.key;

            // The inclusive prefix carries the accumulated length of the last
            // run; patch it in, since no subsequent run ever closed it
            if (running_total.key > 0)
                d_lengths_out[running_total.key - 1] = running_total.value;
        }
    }
}