doxygen/openfpm/dispatch__radix__sort_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include <stdio.h>

#include <iterator>


#include "../../agent/agent_radix_sort_upsweep.cuh"

#include "../../agent/agent_radix_sort_downsweep.cuh"

#include "../../agent/agent_scan.cuh"

#include "../../block/block_radix_sort.cuh"

#include "../../grid/grid_even_share.cuh"

#include "../../util_type.cuh"

#include "../../util_debug.cuh"

#include "../../util_device.cuh"

#include "../../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/******************************************************************************

 * Kernel entry points

 *****************************************************************************/


template <

    typename                ChainedPolicyT,

    bool                    ALT_DIGIT_BITS,

    bool                    IS_DESCENDING,

    typename                KeyT,

    typename                OffsetT>

__launch_bounds__ (int((ALT_DIGIT_BITS) ?

    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :

    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))

__global__ void DeviceRadixSortUpsweepKernel(

    const KeyT              *d_keys,

    OffsetT                 *d_spine,

    OffsetT                 /*num_items*/,

    int                     current_bit,

    int                     num_bits,

    GridEvenShare<OffsetT>  even_share)

{

    enum {

        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *

                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD

    };


    // Parameterize AgentRadixSortUpsweep type for the current configuration

    typedef AgentRadixSortUpsweep<

            typename If<(ALT_DIGIT_BITS),

                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,

                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,

            KeyT,

            OffsetT>

        AgentRadixSortUpsweepT;


    // Shared memory storage

    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;


    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block

    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();


    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);


    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);


    CTA_SYNC();


    // Write out digit counts (striped)

    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);

}


template <

    typename                ChainedPolicyT,

    typename                OffsetT>

__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)

__global__ void RadixSortScanBinsKernel(

    OffsetT                 *d_spine,

    int                     num_counts)

{

    // Parameterize the AgentScan type for the current configuration

    typedef AgentScan<

            typename ChainedPolicyT::ActivePolicy::ScanPolicy,

            OffsetT*,

            OffsetT*,

            cub::Sum,

            OffsetT,

            OffsetT>

        AgentScanT;


    // Shared memory storage

    __shared__ typename AgentScanT::TempStorage temp_storage;


    // Block scan instance

    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ;


    // Process full input tiles

    int block_offset = 0;

    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());

    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)

    {

        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);

        block_offset += AgentScanT::TILE_ITEMS;

    }

}


template <

    typename                ChainedPolicyT,

    bool                    ALT_DIGIT_BITS,

    bool                    IS_DESCENDING,

    typename                KeyT,

    typename                ValueT,

    typename                OffsetT>

__launch_bounds__ (int((ALT_DIGIT_BITS) ?

    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :

    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))

__global__ void DeviceRadixSortDownsweepKernel(

    const KeyT              *d_keys_in,

    KeyT                    *d_keys_out,

    const ValueT            *d_values_in,

    ValueT                  *d_values_out,

    OffsetT                 *d_spine,

    OffsetT                 num_items,

    int                     current_bit,

    int                     num_bits,

    GridEvenShare<OffsetT>  even_share)

{

    enum {

        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *

                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD

    };


    // Parameterize AgentRadixSortDownsweep type for the current configuration

    typedef AgentRadixSortDownsweep<

            typename If<(ALT_DIGIT_BITS),

                typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,

                typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,

            IS_DESCENDING,

            KeyT,

            ValueT,

            OffsetT>

        AgentRadixSortDownsweepT;


    // Shared memory storage

    __shared__  typename AgentRadixSortDownsweepT::TempStorage temp_storage;


    // Initialize even-share descriptor for this thread block

    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();


    // Process input tiles

    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(

        even_share.block_offset,

        even_share.block_end);

}


template <

    typename                ChainedPolicyT,

    bool                    IS_DESCENDING,

    typename                KeyT,

    typename                ValueT,

    typename                OffsetT>

__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)

__global__ void DeviceRadixSortSingleTileKernel(

    const KeyT              *d_keys_in,

    KeyT                    *d_keys_out,

    const ValueT            *d_values_in,

    ValueT                  *d_values_out,

    OffsetT                 num_items,

    int                     current_bit,

    int                     end_bit)

{

    // Constants

    enum

    {

        BLOCK_THREADS           = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,

        ITEMS_PER_THREAD        = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,

        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,

    };


    // BlockRadixSort type

    typedef BlockRadixSort<

            KeyT,

            BLOCK_THREADS,

            ITEMS_PER_THREAD,

            ValueT,

            ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,

            (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),

            ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>

        BlockRadixSortT;


    // BlockLoad type (keys)

    typedef BlockLoad<

        KeyT,

        BLOCK_THREADS,

        ITEMS_PER_THREAD,

        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;


    // BlockLoad type (values)

    typedef BlockLoad<

        ValueT,

        BLOCK_THREADS,

        ITEMS_PER_THREAD,

        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;


    // Unsigned word for key bits

    typedef typename Traits<KeyT>::UnsignedBits UnsignedBitsT;


    // Shared memory storage

    __shared__ union TempStorage

    {

        typename BlockRadixSortT::TempStorage       sort;

        typename BlockLoadKeys::TempStorage         load_keys;

        typename BlockLoadValues::TempStorage       load_values;


    } temp_storage;


    // Keys and values for the block

    KeyT            keys[ITEMS_PER_THREAD];

    ValueT          values[ITEMS_PER_THREAD];


    // Get default (min/max) value for out-of-bounds keys

    UnsignedBitsT   default_key_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;

    KeyT            default_key = reinterpret_cast<KeyT&>(default_key_bits);


    // Load keys

    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key);


    CTA_SYNC();


    // Load values

    if (!KEYS_ONLY)

    {

        // Register pressure work-around: moving num_items through shfl prevents compiler

        // from reusing guards/addressing from prior guarded loads

        num_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(num_items, 0, 0xffffffff);


        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);


        CTA_SYNC();

    }


    // Sort tile

    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(

        keys,

        values,

        current_bit,

        end_bit,

        Int2Type<IS_DESCENDING>(),

        Int2Type<KEYS_ONLY>());


    // Store keys and values

    #pragma unroll

    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)

    {

        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;

        if (item_offset < num_items)

        {

            d_keys_out[item_offset] = keys[ITEM];

            if (!KEYS_ONLY)

                d_values_out[item_offset] = values[ITEM];

        }

    }

}


template <

    typename                ChainedPolicyT,

    bool                    ALT_DIGIT_BITS,

    bool                    IS_DESCENDING,

    typename                KeyT,

    typename                ValueT,

    typename                OffsetIteratorT,

    typename                OffsetT>

__launch_bounds__ (int((ALT_DIGIT_BITS) ?

    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :

    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))

__global__ void DeviceSegmentedRadixSortKernel(

    const KeyT              *d_keys_in,

    KeyT                    *d_keys_out,

    const ValueT            *d_values_in,

    ValueT                  *d_values_out,

    OffsetIteratorT         d_begin_offsets,

    OffsetIteratorT         d_end_offsets,

    int                     /*num_segments*/,

    int                     current_bit,

    int                     pass_bits)

{

    //

    // Constants

    //


    typedef typename If<(ALT_DIGIT_BITS),

        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,

        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;


    enum

    {

        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,

        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,

        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,

        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        RADIX_DIGITS        = 1 << RADIX_BITS,

        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,

    };


    // Upsweep type

    typedef AgentRadixSortUpsweep<

            AgentRadixSortUpsweepPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, SegmentedPolicyT::LOAD_MODIFIER, RADIX_BITS>,

            KeyT,

            OffsetT>

        BlockUpsweepT;


    // Digit-scan type

    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;


    // Downsweep type

    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;


    enum

    {

        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD

    };


    //

    // Process input tiles

    //


    // Shared memory storage

    __shared__ union

    {

        typename BlockUpsweepT::TempStorage     upsweep;

        typename BlockDownsweepT::TempStorage   downsweep;

        struct

        {

            volatile OffsetT                        reverse_counts_in[RADIX_DIGITS];

            volatile OffsetT                        reverse_counts_out[RADIX_DIGITS];

            typename DigitScanT::TempStorage        scan;

        };


    } temp_storage;


    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];

    OffsetT segment_end     = d_end_offsets[blockIdx.x];

    OffsetT num_items       = segment_end - segment_begin;


    // Check if empty segment

    if (num_items <= 0)

        return;


    // Upsweep

    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);

    upsweep.ProcessRegion(segment_begin, segment_end);


    CTA_SYNC();


    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)

    OffsetT bin_count[BINS_TRACKED_PER_THREAD];

    upsweep.ExtractCounts(bin_count);


    CTA_SYNC();


    if (IS_DESCENDING)

    {

        // Reverse bin counts

        #pragma unroll

        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)

        {

            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;


            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))

                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];

        }


        CTA_SYNC();


        #pragma unroll

        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)

        {

            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;


            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))

                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];

        }

    }


    // Scan

    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];     // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)

    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);


    #pragma unroll

    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)

    {

        bin_offset[track] += segment_begin;

    }


    if (IS_DESCENDING)

    {

        // Reverse bin offsets

        #pragma unroll

        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)

        {

            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;


            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))

                temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track];

        }


        CTA_SYNC();


        #pragma unroll

        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)

        {

            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;


            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))

                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];

        }

    }


    CTA_SYNC();


    // Downsweep

    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);

    downsweep.ProcessRegion(segment_begin, segment_end);

}


/******************************************************************************

 * Policy

 ******************************************************************************/


template <

    typename KeyT,

    typename ValueT,

    typename OffsetT>

struct DeviceRadixSortPolicy

{

    //------------------------------------------------------------------------------

    // Constants

    //------------------------------------------------------------------------------


    enum

    {

        // Whether this is a keys-only (or key-value) sort

        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),

    };


    // Dominant-sized key/value type

    typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT;


    //------------------------------------------------------------------------------

    // Architecture-specific tuning policies

    //------------------------------------------------------------------------------


    struct Policy200 : ChainedPolicy<200, Policy200, Policy200>

    {

        enum {

            PRIMARY_RADIX_BITS      = 5,

            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,


            // Relative size of KeyT type to a 4-byte word

            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,

        };


        // Keys-only upsweep policies

        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;

        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;


        // Key-value pairs upsweep policies

        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;

        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;


        // Upsweep policies

        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;

        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;


        // Scan policy

        typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;


        // Keys-only downsweep policies

        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;

        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyKeys;


        // Key-value pairs downsweep policies

        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyPairs;

        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;


        // Downsweep policies

        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;

        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;


        // Single-tile policy

        typedef DownsweepPolicy SingleTilePolicy;


        // Segmented policies

        typedef DownsweepPolicy     SegmentedPolicy;

        typedef AltDownsweepPolicy  AltSegmentedPolicy;

    };


    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>

    {

        enum {

            PRIMARY_RADIX_BITS      = 5,

            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,


            // Relative size of KeyT type to a 4-byte word

            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,

        };


        // Keys-only upsweep policies

        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;

        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;


        // Key-value pairs upsweep policies

        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyPairs;

        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyPairs;


        // Upsweep policies

        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;

        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;


        // Scan policy

        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;


        // Keys-only downsweep policies

        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyKeys;

        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;


        // Key-value pairs downsweep policies

        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;

        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyPairs;


        // Downsweep policies

        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;

        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;


        // Single-tile policy

        typedef DownsweepPolicy SingleTilePolicy;


        // Segmented policies

        typedef DownsweepPolicy     SegmentedPolicy;

        typedef AltDownsweepPolicy  AltSegmentedPolicy;

    };


    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>

    {

        enum {

            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 6 : 5,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)

        };


        // Scan policy

        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;


        // Keys-only downsweep policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(128, 9, DominantT), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(64, 18, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;


        // Key-value pairs downsweep policies

        typedef DownsweepPolicyKeys DownsweepPolicyPairs;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(128, 15, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;


        // Downsweep policies

        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;

        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;


        // Upsweep policies

        typedef DownsweepPolicy UpsweepPolicy;

        typedef AltDownsweepPolicy AltUpsweepPolicy;


        // Single-tile policy

        typedef DownsweepPolicy SingleTilePolicy;


        // Segmented policies

        typedef DownsweepPolicy     SegmentedPolicy;

        typedef AltDownsweepPolicy  AltSegmentedPolicy;


    };


    struct Policy500 : ChainedPolicy<500, Policy500, Policy350>

    {

        enum {

            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)

            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,

            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.1B 32b segmented keys/s (TitanX)

        };


        // ScanPolicy

        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;


        // Downsweep policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(160, 39, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;


        // Upsweep policies

        typedef DownsweepPolicy UpsweepPolicy;

        typedef AltDownsweepPolicy AltUpsweepPolicy;


        // Single-tile policy

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;


        // Segmented policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 31, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 11, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;

    };


    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>

    {

        enum {

            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)

            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,

            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 5.9B 32b segmented keys/s (Quadro P100)

        };


        // ScanPolicy

        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;


        // Downsweep policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;


        // Upsweep policies

        typedef DownsweepPolicy UpsweepPolicy;

        typedef AltDownsweepPolicy AltUpsweepPolicy;


        // Single-tile policy

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;


        // Segmented policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;


    };


    struct Policy610 : ChainedPolicy<610, Policy610, Policy600>

    {

        enum {

            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)

            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,

            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.3B 32b segmented keys/s (1080)

        };


        // ScanPolicy

        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;


        // Downsweep policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 31, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 35, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;


        // Upsweep policies

        typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;

        typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;


        // Single-tile policy

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;


        // Segmented policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;

    };


    struct Policy620 : ChainedPolicy<620, Policy620, Policy610>

    {

        enum {

            PRIMARY_RADIX_BITS      = 5,

            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,

        };


        // ScanPolicy

        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;


        // Downsweep policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;


        // Upsweep policies

        typedef DownsweepPolicy UpsweepPolicy;

        typedef AltDownsweepPolicy AltUpsweepPolicy;


        // Single-tile policy

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;


        // Segmented policies

        typedef DownsweepPolicy     SegmentedPolicy;

        typedef AltDownsweepPolicy  AltSegmentedPolicy;

    };


    struct Policy700 : ChainedPolicy<700, Policy700, Policy620>

    {

        enum {

            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 7.62B 32b keys/s (GV100)

            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,

            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 8.7B 32b segmented keys/s (GV100)

        };


        // ScanPolicy

        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;


        // Downsweep policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;


        // Upsweep policies

        typedef DownsweepPolicy UpsweepPolicy;

        typedef AltDownsweepPolicy AltUpsweepPolicy;


        // Single-tile policy

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;


        // Segmented policies

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;

        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;

    };


    typedef Policy700 MaxPolicy;


};


/******************************************************************************

 * Single-problem dispatch

 ******************************************************************************/


template <

    bool     IS_DESCENDING,

    typename KeyT,

    typename ValueT,

    typename OffsetT>

struct DispatchRadixSort :

    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>

{

    //------------------------------------------------------------------------------

    // Constants

    //------------------------------------------------------------------------------


    enum

    {

        // Whether this is a keys-only (or key-value) sort

        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),

    };


    //------------------------------------------------------------------------------

    // Problem state

    //------------------------------------------------------------------------------


    void                    *d_temp_storage;

    size_t                  &temp_storage_bytes;

    DoubleBuffer<KeyT>      &d_keys;

    DoubleBuffer<ValueT>    &d_values;

    OffsetT                 num_items;

    int                     begin_bit;

    int                     end_bit;

    cudaStream_t            stream;

    bool                    debug_synchronous;

    int                     ptx_version;

    bool                    is_overwrite_okay;


    //------------------------------------------------------------------------------

    // Constructor

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    DispatchRadixSort(

        void*                   d_temp_storage,

        size_t                  &temp_storage_bytes,

        DoubleBuffer<KeyT>      &d_keys,

        DoubleBuffer<ValueT>    &d_values,

        OffsetT                 num_items,

        int                     begin_bit,

        int                     end_bit,

        bool                    is_overwrite_okay,

        cudaStream_t            stream,

        bool                    debug_synchronous,

        int                     ptx_version)

    :

        d_temp_storage(d_temp_storage),

        temp_storage_bytes(temp_storage_bytes),

        d_keys(d_keys),

        d_values(d_values),

        num_items(num_items),

        begin_bit(begin_bit),

        end_bit(end_bit),

        stream(stream),

        debug_synchronous(debug_synchronous),

        ptx_version(ptx_version),

        is_overwrite_okay(is_overwrite_okay)

    {}


    //------------------------------------------------------------------------------

    // Small-problem (single tile) invocation

    //------------------------------------------------------------------------------


    template <

        typename                ActivePolicyT,

        typename                SingleTileKernelT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokeSingleTile(

        SingleTileKernelT       single_tile_kernel)

    {

#ifndef CUB_RUNTIME_ENABLED

        (void)single_tile_kernel;

        // Kernel launch not supported from this device

        return CubDebug(cudaErrorNotSupported );

#else

        cudaError error = cudaSuccess;

        do

        {

            // Return if the caller is simply requesting the size of the storage allocation

            if (d_temp_storage == NULL)

            {

                temp_storage_bytes = 1;

                break;

            }


            // Return if empty problem

            if (num_items == 0)

                break;


            // Log single_tile_kernel configuration

            if (debug_synchronous)

                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",

                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,

                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);


            // Invoke upsweep_kernel with same grid size as downsweep_kernel

            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(

                d_keys.Current(),

                d_keys.Alternate(),

                d_values.Current(),

                d_values.Alternate(),

                num_items,

                begin_bit,

                end_bit);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;


            // Update selector

            d_keys.selector ^= 1;

            d_values.selector ^= 1;

        }

        while (0);


        return error;


#endif // CUB_RUNTIME_ENABLED

    }


    //------------------------------------------------------------------------------

    // Normal problem size invocation

    //------------------------------------------------------------------------------


    template <typename PassConfigT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokePass(

        const KeyT      *d_keys_in,

        KeyT            *d_keys_out,

        const ValueT    *d_values_in,

        ValueT          *d_values_out,

        OffsetT         *d_spine,

        int             spine_length,

        int             &current_bit,

        PassConfigT     &pass_config)

    {

        cudaError error = cudaSuccess;

        do

        {

            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));


            // Log upsweep_kernel configuration

            if (debug_synchronous)

                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",

                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,

                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);


            // Invoke upsweep_kernel with same grid size as downsweep_kernel

            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(

                d_keys_in,

                d_spine,

                num_items,

                current_bit,

                pass_bits,

                pass_config.even_share);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;


            // Log scan_kernel configuration

            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",

                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);


            // Invoke scan_kernel

            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(

                d_spine,

                spine_length);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;


            // Log downsweep_kernel configuration

            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",

                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,

                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);


            // Invoke downsweep_kernel

            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(

                d_keys_in,

                d_keys_out,

                d_values_in,

                d_values_out,

                d_spine,

                num_items,

                current_bit,

                pass_bits,

                pass_config.even_share);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;


            // Update current bit

            current_bit += pass_bits;

        }

        while (0);


        return error;

    }


    template <

        typename UpsweepKernelT,

        typename ScanKernelT,

        typename DownsweepKernelT>

    struct PassConfig

    {

        UpsweepKernelT          upsweep_kernel;

        KernelConfig            upsweep_config;

        ScanKernelT             scan_kernel;

        KernelConfig            scan_config;

        DownsweepKernelT        downsweep_kernel;

        KernelConfig            downsweep_config;

        int                     radix_bits;

        int                     radix_digits;

        int                     max_downsweep_grid_size;

        GridEvenShare<OffsetT>  even_share;


        template <

            typename UpsweepPolicyT,

            typename ScanPolicyT,

            typename DownsweepPolicyT>

        CUB_RUNTIME_FUNCTION __forceinline__

        cudaError_t InitPassConfig(

            UpsweepKernelT      upsweep_kernel,

            ScanKernelT         scan_kernel,

            DownsweepKernelT    downsweep_kernel,

            int                 ptx_version,

            int                 sm_count,

            int                 num_items)

        {

            cudaError error = cudaSuccess;

            do

            {

                this->upsweep_kernel    = upsweep_kernel;

                this->scan_kernel       = scan_kernel;

                this->downsweep_kernel  = downsweep_kernel;

                radix_bits              = DownsweepPolicyT::RADIX_BITS;

                radix_digits            = 1 << radix_bits;


                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;

                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;

                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;


                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);


                even_share.DispatchInit(

                    num_items,

                    max_downsweep_grid_size,

                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));


            }

            while (0);

            return error;

        }


    };


    template <

        typename            ActivePolicyT,

        typename            UpsweepKernelT,

        typename            ScanKernelT,

        typename            DownsweepKernelT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokePasses(

        UpsweepKernelT      upsweep_kernel,

        UpsweepKernelT      alt_upsweep_kernel,

        ScanKernelT         scan_kernel,

        DownsweepKernelT    downsweep_kernel,

        DownsweepKernelT    alt_downsweep_kernel)

    {

#ifndef CUB_RUNTIME_ENABLED

        (void)upsweep_kernel;

        (void)alt_upsweep_kernel;

        (void)scan_kernel;

        (void)downsweep_kernel;

        (void)alt_downsweep_kernel;


        // Kernel launch not supported from this device

        return CubDebug(cudaErrorNotSupported );

#else


        cudaError error = cudaSuccess;

        do

        {

            // Get device ordinal

            int device_ordinal;

            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;


            // Get SM count

            int sm_count;

            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;


            // Init regular and alternate-digit kernel configurations

            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;

            if ((error = pass_config.template InitPassConfig<

                    typename ActivePolicyT::UpsweepPolicy,

                    typename ActivePolicyT::ScanPolicy,

                    typename ActivePolicyT::DownsweepPolicy>(

                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;


            if ((error = alt_pass_config.template InitPassConfig<

                    typename ActivePolicyT::AltUpsweepPolicy,

                    typename ActivePolicyT::ScanPolicy,

                    typename ActivePolicyT::AltDownsweepPolicy>(

                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;


            // Get maximum spine length

            int max_grid_size       = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);

            int spine_length        = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;


            // Temporary storage allocation requirements

            void* allocations[3];

            size_t allocation_sizes[3] =

            {

                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms

                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer

                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer

            };


            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)

            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;


            // Return if the caller is simply requesting the size of the storage allocation

            if (d_temp_storage == NULL)

                return cudaSuccess;


            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size

            int num_bits            = end_bit - begin_bit;

            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;

            bool is_num_passes_odd  = num_passes & 1;

            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;

            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));


            // Alias the temporary storage allocations

            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);


            DoubleBuffer<KeyT> d_keys_remaining_passes(

                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),

                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());


            DoubleBuffer<ValueT> d_values_remaining_passes(

                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),

                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());


            // Run first pass, consuming from the input's current buffers

            int current_bit = begin_bit;

            if (CubDebug(error = InvokePass(

                d_keys.Current(), d_keys_remaining_passes.Current(),

                d_values.Current(), d_values_remaining_passes.Current(),

                d_spine, spine_length, current_bit,

                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;


            // Run remaining passes

            while (current_bit < end_bit)

            {

                if (CubDebug(error = InvokePass(

                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],

                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],

                    d_spine, spine_length, current_bit,

                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;


                // Invert selectors

                d_keys_remaining_passes.selector ^= 1;

                d_values_remaining_passes.selector ^= 1;

            }


            // Update selector

            if (!is_overwrite_okay) {

                num_passes = 1; // Sorted data always ends up in the other vector

            }


            d_keys.selector = (d_keys.selector + num_passes) & 1;

            d_values.selector = (d_values.selector + num_passes) & 1;

        }

        while (0);


        return error;


#endif // CUB_RUNTIME_ENABLED

    }


    //------------------------------------------------------------------------------

    // Chained policy invocation

    //------------------------------------------------------------------------------


    template <typename ActivePolicyT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t Invoke()

    {

        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;

        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;


        // Force kernel code-generation in all compiler passes

        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))

        {

            // Small, single tile size

            return InvokeSingleTile<ActivePolicyT>(

                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);

        }

        else

        {

            // Regular size

            return InvokePasses<ActivePolicyT>(

                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,   IS_DESCENDING, KeyT, OffsetT>,

                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT>,

                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,

                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,

                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);

        }

    }


    //------------------------------------------------------------------------------

    // Dispatch entrypoints

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    static cudaError_t Dispatch(

        void*                   d_temp_storage,

        size_t                  &temp_storage_bytes,

        DoubleBuffer<KeyT>      &d_keys,

        DoubleBuffer<ValueT>    &d_values,

        OffsetT                 num_items,

        int                     begin_bit,

        int                     end_bit,

        bool                    is_overwrite_okay,

        cudaStream_t            stream,

        bool                    debug_synchronous)

    {

        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;


        cudaError_t error;

        do {

            // Get PTX version

            int ptx_version;

            if (CubDebug(error = PtxVersion(ptx_version))) break;


            // Create dispatch functor

            DispatchRadixSort dispatch(

                d_temp_storage, temp_storage_bytes,

                d_keys, d_values,

                num_items, begin_bit, end_bit, is_overwrite_okay,

                stream, debug_synchronous, ptx_version);


            // Dispatch to chained policy

            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;


        } while (0);


        return error;

    }

};


/******************************************************************************

 * Segmented dispatch

 ******************************************************************************/


template <

    bool     IS_DESCENDING,

    typename KeyT,

    typename ValueT,

    typename OffsetIteratorT,

    typename OffsetT>

struct DispatchSegmentedRadixSort :

    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>

{

    //------------------------------------------------------------------------------

    // Constants

    //------------------------------------------------------------------------------


    enum

    {

        // Whether this is a keys-only (or key-value) sort

        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),

    };


    //------------------------------------------------------------------------------

    // Parameter members

    //------------------------------------------------------------------------------


    void                    *d_temp_storage;

    size_t                  &temp_storage_bytes;

    DoubleBuffer<KeyT>      &d_keys;

    DoubleBuffer<ValueT>    &d_values;

    OffsetT                 num_items;

    OffsetT                 num_segments;

    OffsetIteratorT         d_begin_offsets;

    OffsetIteratorT         d_end_offsets;

    int                     begin_bit;

    int                     end_bit;

    cudaStream_t            stream;

    bool                    debug_synchronous;

    int                     ptx_version;

    bool                    is_overwrite_okay;


    //------------------------------------------------------------------------------

    // Constructors

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    DispatchSegmentedRadixSort(

        void*                   d_temp_storage,

        size_t                  &temp_storage_bytes,

        DoubleBuffer<KeyT>      &d_keys,

        DoubleBuffer<ValueT>    &d_values,

        OffsetT                 num_items,

        OffsetT                 num_segments,

        OffsetIteratorT         d_begin_offsets,

        OffsetIteratorT         d_end_offsets,

        int                     begin_bit,

        int                     end_bit,

        bool                    is_overwrite_okay,

        cudaStream_t            stream,

        bool                    debug_synchronous,

        int                     ptx_version)

    :

        d_temp_storage(d_temp_storage),

        temp_storage_bytes(temp_storage_bytes),

        d_keys(d_keys),

        d_values(d_values),

        num_items(num_items),

        num_segments(num_segments),

        d_begin_offsets(d_begin_offsets),

        d_end_offsets(d_end_offsets),

        begin_bit(begin_bit),

        end_bit(end_bit),

        is_overwrite_okay(is_overwrite_okay),

        stream(stream),

        debug_synchronous(debug_synchronous),

        ptx_version(ptx_version)

    {}


    //------------------------------------------------------------------------------

    // Multi-segment invocation

    //------------------------------------------------------------------------------


    template <typename PassConfigT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokePass(

        const KeyT      *d_keys_in,

        KeyT            *d_keys_out,

        const ValueT    *d_values_in,

        ValueT          *d_values_out,

        int             &current_bit,

        PassConfigT     &pass_config)

    {

        cudaError error = cudaSuccess;

        do

        {

            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));


            // Log kernel configuration

            if (debug_synchronous)

                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",

                    num_segments, pass_config.segmented_config.block_threads, (long long) stream,

                pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);


            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(

                d_keys_in, d_keys_out,

                d_values_in,  d_values_out,

                d_begin_offsets, d_end_offsets, num_segments,

                current_bit, pass_bits);


            // Check for failure to launch

            if (CubDebug(error = cudaPeekAtLastError())) break;


            // Sync the stream if specified to flush runtime errors

            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;


            // Update current bit

            current_bit += pass_bits;

        }

        while (0);


        return error;

    }


    template <typename SegmentedKernelT>

    struct PassConfig

    {

        SegmentedKernelT    segmented_kernel;

        KernelConfig        segmented_config;

        int                 radix_bits;

        int                 radix_digits;


        template <typename SegmentedPolicyT>

        CUB_RUNTIME_FUNCTION __forceinline__

        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)

        {

            this->segmented_kernel  = segmented_kernel;

            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;

            this->radix_digits      = 1 << radix_bits;


            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));

        }

    };


    template <

        typename                ActivePolicyT,

        typename                SegmentedKernelT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t InvokePasses(

        SegmentedKernelT     segmented_kernel,

        SegmentedKernelT     alt_segmented_kernel)

    {

#ifndef CUB_RUNTIME_ENABLED

      (void)segmented_kernel;

      (void)alt_segmented_kernel;


        // Kernel launch not supported from this device

        return CubDebug(cudaErrorNotSupported );

#else


        cudaError error = cudaSuccess;

        do

        {

            // Init regular and alternate kernel configurations

            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;

            if ((error = pass_config.template       InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;

            if ((error = alt_pass_config.template   InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;


            // Temporary storage allocation requirements

            void* allocations[2];

            size_t allocation_sizes[2] =

            {

                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                      // bytes needed for 3rd keys buffer

                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),     // bytes needed for 3rd values buffer

            };


            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)

            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;


            // Return if the caller is simply requesting the size of the storage allocation

            if (d_temp_storage == NULL)

            {

                if (temp_storage_bytes == 0)

                    temp_storage_bytes = 1;

                return cudaSuccess;

            }


            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size

            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;

            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;

            int num_bits            = end_bit - begin_bit;

            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;

            bool is_num_passes_odd  = num_passes & 1;

            int max_alt_passes      = (num_passes * radix_bits) - num_bits;

            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));


            DoubleBuffer<KeyT> d_keys_remaining_passes(

                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),

                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());


            DoubleBuffer<ValueT> d_values_remaining_passes(

                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),

                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());


            // Run first pass, consuming from the input's current buffers

            int current_bit = begin_bit;


            if (CubDebug(error = InvokePass(

                d_keys.Current(), d_keys_remaining_passes.Current(),

                d_values.Current(), d_values_remaining_passes.Current(),

                current_bit,

                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;


            // Run remaining passes

            while (current_bit < end_bit)

            {

                if (CubDebug(error = InvokePass(

                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],

                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],

                    current_bit,

                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;


                // Invert selectors and update current bit

                d_keys_remaining_passes.selector ^= 1;

                d_values_remaining_passes.selector ^= 1;

            }


            // Update selector

            if (!is_overwrite_okay) {

                num_passes = 1; // Sorted data always ends up in the other vector

            }


            d_keys.selector = (d_keys.selector + num_passes) & 1;

            d_values.selector = (d_values.selector + num_passes) & 1;

        }

        while (0);


        return error;


#endif // CUB_RUNTIME_ENABLED

    }


    //------------------------------------------------------------------------------

    // Chained policy invocation

    //------------------------------------------------------------------------------


    template <typename ActivePolicyT>

    CUB_RUNTIME_FUNCTION __forceinline__

    cudaError_t Invoke()

    {

        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;


        // Force kernel code-generation in all compiler passes

        return InvokePasses<ActivePolicyT>(

            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,

            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);

    }


    //------------------------------------------------------------------------------

    // Dispatch entrypoints

    //------------------------------------------------------------------------------


    CUB_RUNTIME_FUNCTION __forceinline__

    static cudaError_t Dispatch(

        void*                   d_temp_storage,

        size_t                  &temp_storage_bytes,

        DoubleBuffer<KeyT>      &d_keys,

        DoubleBuffer<ValueT>    &d_values,

        int                     num_items,

        int                     num_segments,

        OffsetIteratorT         d_begin_offsets,

        OffsetIteratorT         d_end_offsets,

        int                     begin_bit,

        int                     end_bit,

        bool                    is_overwrite_okay,

        cudaStream_t            stream,

        bool                    debug_synchronous)

    {

        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;


        cudaError_t error;

        do {

            // Get PTX version

            int ptx_version;

            if (CubDebug(error = PtxVersion(ptx_version))) break;


            // Create dispatch functor

            DispatchSegmentedRadixSort dispatch(

                d_temp_storage, temp_storage_bytes,

                d_keys, d_values,

                num_items, num_segments, d_begin_offsets, d_end_offsets,

                begin_bit, end_bit, is_overwrite_okay,

                stream, debug_synchronous, ptx_version);


            // Dispatch to chained policy

            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;


        } while (0);


        return error;

    }

};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


cub::BlockLoad
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
Definition block_load.cuh:641

cub::BlockRadixRank
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
Definition block_radix_rank.cuh:98

cub::BlockRadixSort
The BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thre...
Definition block_radix_sort.cuh:133

cub::BlockScan
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
Definition block_scan.cuh:194

cub::BlockLoad::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Definition block_load.cuh:1130

cub::BLOCK_LOAD_DIRECT
@ BLOCK_LOAD_DIRECT
Definition block_load.cuh:485

cub::BLOCK_LOAD_WARP_TRANSPOSE
@ BLOCK_LOAD_WARP_TRANSPOSE
Definition block_load.cuh:541

cub::BLOCK_LOAD_TRANSPOSE
@ BLOCK_LOAD_TRANSPOSE
Definition block_load.cuh:520

cub::LOAD_LDG
@ LOAD_LDG
Cache as texture.
Definition thread_load.cuh:69

cub::LOAD_DEFAULT
@ LOAD_DEFAULT
Default (no modifier)
Definition thread_load.cuh:64

_CubLog
#define _CubLog(format,...)
Log macro for printf statements.
Definition util_debug.cuh:112

cub::PtxVersion
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
Definition util_device.cuh:118

cub::AliasTemporaries
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Definition util_device.cuh:62

CubDebug
#define CubDebug(e)
Debug macro.
Definition util_debug.cuh:94

cub::SyncStream
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Definition util_device.cuh:199

CUB_MAX
#define CUB_MAX(a, b)
Select maximum(a, b)
Definition util_macro.cuh:61

CUB_MIN
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition util_macro.cuh:66

cub::CTA_SYNC
CTA_SYNC()
Definition util_ptx.cuh:255

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::d_values_out
KeyT const ValueT ValueT * d_values_out
[in] Output values buffer
Definition dispatch_radix_sort.cuh:166

cub::d_spine
OffsetT * d_spine
< [in] Input keys buffer
Definition dispatch_radix_sort.cuh:74

cub::end_bit
KeyT const ValueT ValueT OffsetT int int end_bit
< [in] The past-the-end (most-significant) bit index needed for key comparison
Definition dispatch_radix_sort.cuh:220

cub::int
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
Definition dispatch_radix_sort.cuh:336

cub::num_items
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:168

cub::d_keys_out
KeyT * d_keys_out
< [in] Input keys buffer
Definition dispatch_radix_sort.cuh:164

cub::__launch_bounds__
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets

cub::num_bits
OffsetT int int num_bits
[in] Number of bits of current radix digit
Definition dispatch_radix_sort.cuh:77

cub::d_values_in
KeyT const ValueT * d_values_in
[in] Input values buffer
Definition dispatch_radix_sort.cuh:165

cub::d_begin_offsets
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
Definition dispatch_radix_sort.cuh:334

cub::num_counts
int num_counts
< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block,...
Definition dispatch_radix_sort.cuh:121

cub::current_bit
OffsetT int current_bit
[in] Bit position of current radix digit
Definition dispatch_radix_sort.cuh:76

cub::even_share
OffsetT int int GridEvenShare< OffsetT > even_share
< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
Definition dispatch_radix_sort.cuh:79

cub::BLOCK_SCAN_WARP_SCANS
@ BLOCK_SCAN_WARP_SCANS
Definition block_scan.cuh:108

cub::BLOCK_SCAN_RAKING_MEMOIZE
@ BLOCK_SCAN_RAKING_MEMOIZE
Definition block_scan.cuh:88

cub::BINS_TRACKED_PER_THREAD
@ BINS_TRACKED_PER_THREAD
Number of bin-starting offsets tracked per thread.
Definition dispatch_radix_sort.cuh:374

cub::OffsetT
OffsetT OffsetT
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:75

cub::d_end_offsets
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
Definition dispatch_radix_sort.cuh:335

cub::pass_bits
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int int pass_bits
< [in] Number of bits of current radix digit
Definition dispatch_radix_sort.cuh:339

block_offset
Definition SparseGridGpu_ker.cuh:14

cub::AgentRadixSortDownsweepPolicy
< The number of radix bits, i.e., log2(bins)
Definition agent_radix_sort_downsweep.cuh:81

cub::AgentRadixSortDownsweep::TempStorage
Alias wrapper allowing storage to be unioned.
Definition agent_radix_sort_downsweep.cuh:199

cub::AgentRadixSortDownsweep
AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in ...
Definition agent_radix_sort_downsweep.cuh:114

cub::AgentRadixSortDownsweep::ProcessRegion
__device__ __forceinline__ void ProcessRegion(OffsetT block_offset, OffsetT block_end)
Definition agent_radix_sort_downsweep.cuh:750

cub::AgentRadixSortDownsweep::BINS_TRACKED_PER_THREAD
@ BINS_TRACKED_PER_THREAD
Number of bin-starting offsets tracked per thread.
Definition agent_radix_sort_downsweep.cuh:157

cub::AgentRadixSortUpsweepPolicy
< The number of radix bits, i.e., log2(bins)
Definition agent_radix_sort_upsweep.cuh:63

cub::AgentRadixSortUpsweep::TempStorage
Alias wrapper allowing storage to be unioned.
Definition agent_radix_sort_upsweep.cuh:153

cub::AgentRadixSortUpsweep
AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in de...
Definition agent_radix_sort_upsweep.cuh:87

cub::AgentRadixSortUpsweep::ProcessRegion
__device__ __forceinline__ void ProcessRegion(OffsetT block_offset, const OffsetT &block_end)
Definition agent_radix_sort_upsweep.cuh:352

cub::AgentRadixSortUpsweep::ExtractCounts
__device__ __forceinline__ void ExtractCounts(OffsetT *counters, int bin_stride=1, int bin_offset=0)
Definition agent_radix_sort_upsweep.cuh:403

cub::AgentScanPolicy
< The BlockScan algorithm to use
Definition agent_scan.cuh:68

cub::AgentScan
AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide pr...
Definition agent_scan.cuh:99

cub::BlockLoad::TempStorage
\smemstorage{BlockLoad}
Definition block_load.cuh:1055

cub::BlockRadixSort::TempStorage
\smemstorage{BlockRadixSort}
Definition block_radix_sort.cuh:402

cub::BlockScanRunningPrefixOp
< Wrapped scan operator type
Definition single_pass_scan_operators.cuh:66

cub::ChainedPolicy
Helper for dispatching into a policy chain.
Definition util_device.cuh:305

cub::DeviceRadixSortPolicy::Policy200
SM20.
Definition dispatch_radix_sort.cuh:514

cub::DeviceRadixSortPolicy::Policy300
SM30.
Definition dispatch_radix_sort.cuh:560

cub::DeviceRadixSortPolicy::Policy350
SM35.
Definition dispatch_radix_sort.cuh:607

cub::DeviceRadixSortPolicy::Policy500
SM50.
Definition dispatch_radix_sort.cuh:644

cub::DeviceRadixSortPolicy::Policy600
SM60 (GP100)
Definition dispatch_radix_sort.cuh:673

cub::DeviceRadixSortPolicy::Policy610
SM61 (GP104)
Definition dispatch_radix_sort.cuh:703

cub::DeviceRadixSortPolicy::Policy620
SM62 (Tegra, less RF)
Definition dispatch_radix_sort.cuh:732

cub::DeviceRadixSortPolicy::Policy700
SM70 (GV100)
Definition dispatch_radix_sort.cuh:760

cub::DeviceRadixSortPolicy
< Signed integer type for global offsets
Definition dispatch_radix_sort.cuh:494

cub::DeviceRadixSortPolicy::MaxPolicy
Policy700 MaxPolicy
MaxPolicy.
Definition dispatch_radix_sort.cuh:788

cub::DispatchRadixSort::PassConfig
Pass configuration structure.
Definition dispatch_radix_sort.cuh:1035

cub::DispatchRadixSort::PassConfig::InitPassConfig
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitPassConfig(UpsweepKernelT upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, int ptx_version, int sm_count, int num_items)
Initialize pass configuration.
Definition dispatch_radix_sort.cuh:1053

cub::DispatchRadixSort
< Signed integer type for global offsets
Definition dispatch_radix_sort.cuh:809

cub::DispatchRadixSort::d_temp_storage
void * d_temp_storage
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is wr...
Definition dispatch_radix_sort.cuh:825

cub::DispatchRadixSort::debug_synchronous
bool debug_synchronous
[in] Whether or not to synchronize the stream after every kernel launch to check for errors....
Definition dispatch_radix_sort.cuh:833

cub::DispatchRadixSort::Invoke
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
Invocation.
Definition dispatch_radix_sort.cuh:1222

cub::DispatchRadixSort::InvokePass
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePass(const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, OffsetT *d_spine, int spine_length, int &current_bit, PassConfigT &pass_config)
Definition dispatch_radix_sort.cuh:945

cub::DispatchRadixSort::ptx_version
int ptx_version
[in] PTX version
Definition dispatch_radix_sort.cuh:834

cub::DispatchRadixSort::d_keys
DoubleBuffer< KeyT > & d_keys
[in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return,...
Definition dispatch_radix_sort.cuh:827

cub::DispatchRadixSort::InvokePasses
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(UpsweepKernelT upsweep_kernel, UpsweepKernelT alt_upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, DownsweepKernelT alt_downsweep_kernel)
Invocation (run multiple digit passes)
Definition dispatch_radix_sort.cuh:1096

cub::DispatchRadixSort::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
Definition dispatch_radix_sort.cuh:1255

cub::DispatchRadixSort::d_values
DoubleBuffer< ValueT > & d_values
[in,out] Double-buffer whose current buffer contains the unsorted input values and,...
Definition dispatch_radix_sort.cuh:828

cub::DispatchRadixSort::temp_storage_bytes
size_t & temp_storage_bytes
[in,out] Reference to size in bytes of d_temp_storage allocation
Definition dispatch_radix_sort.cuh:826

cub::DispatchRadixSort::num_items
OffsetT num_items
[in] Number of items to sort
Definition dispatch_radix_sort.cuh:829

cub::DispatchRadixSort::is_overwrite_okay
bool is_overwrite_okay
[in] Whether is okay to overwrite source buffers
Definition dispatch_radix_sort.cuh:835

cub::DispatchRadixSort::DispatchRadixSort
CUB_RUNTIME_FUNCTION __forceinline__ DispatchRadixSort(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version)
Constructor.
Definition dispatch_radix_sort.cuh:844

cub::DispatchRadixSort::stream
cudaStream_t stream
[in] CUDA stream to launch kernels within. Default is stream0.
Definition dispatch_radix_sort.cuh:832

cub::DispatchRadixSort::begin_bit
int begin_bit
[in] The beginning (least-significant) bit index needed for key comparison
Definition dispatch_radix_sort.cuh:830

cub::DispatchRadixSort::end_bit
int end_bit
[in] The past-the-end (most-significant) bit index needed for key comparison
Definition dispatch_radix_sort.cuh:831

cub::DispatchRadixSort::InvokeSingleTile
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeSingleTile(SingleTileKernelT single_tile_kernel)
Invoke a single block to sort in-core.
Definition dispatch_radix_sort.cuh:880

cub::DispatchSegmentedRadixSort::PassConfig
PassConfig data structure.
Definition dispatch_radix_sort.cuh:1430

cub::DispatchSegmentedRadixSort::PassConfig::InitPassConfig
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
Initialize pass configuration.
Definition dispatch_radix_sort.cuh:1439

cub::DispatchSegmentedRadixSort
< Signed integer type for global offsets
Definition dispatch_radix_sort.cuh:1309

cub::DispatchSegmentedRadixSort::num_items
OffsetT num_items
[in] Number of items to sort
Definition dispatch_radix_sort.cuh:1329

cub::DispatchSegmentedRadixSort::stream
cudaStream_t stream
[in] CUDA stream to launch kernels within. Default is stream0.
Definition dispatch_radix_sort.cuh:1335

cub::DispatchSegmentedRadixSort::num_segments
OffsetT num_segments
[in] The number of segments that comprise the sorting data
Definition dispatch_radix_sort.cuh:1330

cub::DispatchSegmentedRadixSort::d_begin_offsets
OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
Definition dispatch_radix_sort.cuh:1331

cub::DispatchSegmentedRadixSort::temp_storage_bytes
size_t & temp_storage_bytes
[in,out] Reference to size in bytes of d_temp_storage allocation
Definition dispatch_radix_sort.cuh:1326

cub::DispatchSegmentedRadixSort::d_end_offsets
OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
Definition dispatch_radix_sort.cuh:1332

cub::DispatchSegmentedRadixSort::d_keys
DoubleBuffer< KeyT > & d_keys
[in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return,...
Definition dispatch_radix_sort.cuh:1327

cub::DispatchSegmentedRadixSort::ptx_version
int ptx_version
[in] PTX version
Definition dispatch_radix_sort.cuh:1337

cub::DispatchSegmentedRadixSort::debug_synchronous
bool debug_synchronous
[in] Whether or not to synchronize the stream after every kernel launch to check for errors....
Definition dispatch_radix_sort.cuh:1336

cub::DispatchSegmentedRadixSort::d_values
DoubleBuffer< ValueT > & d_values
[in,out] Double-buffer whose current buffer contains the unsorted input values and,...
Definition dispatch_radix_sort.cuh:1328

cub::DispatchSegmentedRadixSort::Invoke
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
Invocation.
Definition dispatch_radix_sort.cuh:1557

cub::DispatchSegmentedRadixSort::InvokePass
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePass(const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int &current_bit, PassConfigT &pass_config)
Invoke a three-kernel sorting pass at the current bit.
Definition dispatch_radix_sort.cuh:1387

cub::DispatchSegmentedRadixSort::InvokePasses
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel)
Invocation (run multiple digit passes)
Definition dispatch_radix_sort.cuh:1455

cub::DispatchSegmentedRadixSort::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
Internal dispatch routine.
Definition dispatch_radix_sort.cuh:1575

cub::DispatchSegmentedRadixSort::DispatchSegmentedRadixSort
CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedRadixSort(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, OffsetT num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version)
Constructor.
Definition dispatch_radix_sort.cuh:1347

cub::DispatchSegmentedRadixSort::is_overwrite_okay
bool is_overwrite_okay
[in] Whether is okay to overwrite source buffers
Definition dispatch_radix_sort.cuh:1338

cub::DispatchSegmentedRadixSort::end_bit
int end_bit
[in] The past-the-end (most-significant) bit index needed for key comparison
Definition dispatch_radix_sort.cuh:1334

cub::DispatchSegmentedRadixSort::begin_bit
int begin_bit
[in] The beginning (least-significant) bit index needed for key comparison
Definition dispatch_radix_sort.cuh:1333

cub::DispatchSegmentedRadixSort::d_temp_storage
void * d_temp_storage
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is wr...
Definition dispatch_radix_sort.cuh:1325

cub::DoubleBuffer
Double-buffer storage wrapper for multi-pass stream transformations that require more than one storag...
Definition util_type.cuh:792

cub::DoubleBuffer::Current
__host__ __device__ __forceinline__ T * Current()
Return pointer to the currently valid buffer.
Definition util_type.cuh:818

cub::DoubleBuffer::selector
int selector
Selector into d_buffers (i.e., the active/valid buffer)
Definition util_type.cuh:797

cub::DoubleBuffer::Alternate
__host__ __device__ __forceinline__ T * Alternate()
Return pointer to the currently invalid buffer.
Definition util_type.cuh:821

cub::Equals
Type equality test.
Definition util_type.cuh:99

cub::GridEvenShare
GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-sha...
Definition grid_even_share.cuh:75

cub::If
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73

cub::KernelConfig
Definition util_device.cuh:279

cub::Sum
Default sum functor.
Definition thread_operators.cuh:110

cub::Traits
Type traits.
Definition util_type.cuh:1158

cub::TempStorage
Definition dispatch_radix_sort.cuh:259

CUB_SUBSCRIPTION_FACTOR
#define CUB_SUBSCRIPTION_FACTOR(arch)
Oversubscription factor.
Definition util_arch.cuh:99

CUB_SCALED_GRANULARITIES
#define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)
Define both nominal threads-per-block and items-per-thread.
Definition util_arch.cuh:141