#include "../thread/thread_reduce.cuh"
#include "../thread/thread_scan.cuh"
#include "../block/block_scan.cuh"
#include "../util_ptx.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
    // Defaults: memoize the outer raking scan in registers on sm_35+; assume four-byte shared-memory banks
    bool                MEMOIZE_OUTER_SCAN  = (CUB_PTX_ARCH >= 350) ? true : false,
    cudaSharedMemConfig SMEM_CONFIG         = cudaSharedMemBankSizeFourByte,
    typedef unsigned short DigitCounter;

    // Counter-packing type: 64-bit for eight-byte shared-memory bank mode, 32-bit otherwise
    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
        unsigned long long, unsigned int>::Type PackedCounter;

    BLOCK_THREADS           = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

    RADIX_DIGITS            = 1 << RADIX_BITS,

    WARP_THREADS            = 1 << LOG_WARP_THREADS,
    WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

    BYTES_PER_COUNTER       = sizeof(DigitCounter),

    // Number of DigitCounters packed into each PackedCounter
    PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),

    // Always at least one counter lane
    LOG_COUNTER_LANES       = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),
    COUNTER_LANES           = 1 << LOG_COUNTER_LANES,

    // Each thread's raking segment spans the counter lanes plus one lane of padding
    PADDED_COUNTER_LANES    = COUNTER_LANES + 1,
    RAKING_SEGMENT          = PADDED_COUNTER_LANES,
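    // Worked example (illustrative, not part of this file): with RADIX_BITS = 5,
    // DigitCounter = unsigned short (2 bytes) and PackedCounter = unsigned long long
    // (8 bytes, eight-byte bank mode):
    //   PACKING_RATIO     = 8 / 2 = 4        (LOG_PACKING_RATIO = 2)
    //   LOG_COUNTER_LANES = max(5 - 2, 0) = 3, so COUNTER_LANES = 8
    //   RADIX_DIGITS      = 32 = COUNTER_LANES * PACKING_RATIO,
    // i.e. each of the 32 digits maps to exactly one (counter_lane, sub_counter)
    // slot in the shared-memory counter grid.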
    // BlockScan type for scanning packed digit counters
    typedef BlockScan<PackedCounter, BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockScan;
    // Shared-memory layout: the digit counters alias the packed raking grid
    struct __align__(16) _TempStorage
    {
        DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
        // Private shared-memory allocation used when the caller supplies no TempStorage
        __shared__ _TempStorage private_storage;
        return private_storage;
        // Upsweep: raking reduction over this thread's segment of packed counters
        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
        PackedCounter *raking_ptr;

        if (MEMOIZE_OUTER_SCAN)
        {
            // Copy the segment into registers so the downsweep can reuse it
            #pragma unroll
            for (int i = 0; i < RAKING_SEGMENT; i++)
                cached_segment[i] = smem_raking_ptr[i];

            raking_ptr = cached_segment;
        }
        else
        {
            raking_ptr = smem_raking_ptr;
        }
        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
        // Downsweep: exclusive raking scan seeded with this thread's exclusive partial
        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? cached_segment : smem_raking_ptr;

        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);

        if (MEMOIZE_OUTER_SCAN)
        {
            // Copy the scanned segment back to shared memory
            #pragma unroll
            for (int i = 0; i < RAKING_SEGMENT; i++)
                smem_raking_ptr[i] = cached_segment[i];
        }
        // Reset this thread's packed digit counters in every lane
        #pragma unroll
        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
        {
            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
        }
        // Propagate totals across the packed fields: each left shift by one field
        // width adds the lower fields' totals into the higher fields (see note below)
        #pragma unroll
        for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
        {
            block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
        }
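        // Illustrative note (not from this file): with two 16-bit counters packed in a
        // 32-bit word and block_aggregate = (hi_total << 16) | lo_total, the single
        // PACKED = 1 shift gives block_prefix = block_aggregate << 16 = lo_total << 16,
        // i.e. an exclusive prefix of 0 for the low field and lo_total for the high
        // field, which is exactly the seed the packed exclusive scan needs.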
        // Block-wide exclusive scan of the upsweep partials, then the raking downsweep
        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
        ExclusiveDownsweep(exclusive_partial);
        // Default constructor: rank within a private static shared-memory allocation
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))

        // Constructor using caller-supplied temporary storage
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    // Rank keys by the digit in bits [current_bit, current_bit + num_bits)
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits)

        DigitCounter  thread_prefixes[KEYS_PER_THREAD];  // per-key count of prior same-digit keys in this thread
        DigitCounter* digit_counters[KEYS_PER_THREAD];   // per-key pointer to its shared-memory digit counter

        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            // Map the key's digit to a (counter_lane, sub_counter) slot
            unsigned int digit        = BFE(keys[ITEM], current_bit, num_bits);
            unsigned int sub_counter  = digit >> LOG_COUNTER_LANES;
            unsigned int counter_lane = digit & (COUNTER_LANES - 1);

            // Descending order indexes the counter grid from the top (guard elided in this listing)
            sub_counter  = PACKING_RATIO - 1 - sub_counter;
            counter_lane = COUNTER_LANES - 1 - counter_lane;

            // Load the thread-exclusive prefix, then store the inclusive prefix
            digit_counters[ITEM]  = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
            thread_prefixes[ITEM] = *digit_counters[ITEM];
            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
        }

        // After the block-wide counter scan, each counter holds the key's block-exclusive
        // digit prefix; add back the thread-local prefix to get the final rank
        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
        }
    // Rank keys, also reporting per-bin exclusive digit prefixes for the lower RADIX_DIGITS threads
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                // Descending order reverses the bin index (guard elided in this listing)
                bin_idx = RADIX_DIGITS - bin_idx - 1;

                // The exclusive prefixes all reside in counter column zero
                unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1));
                unsigned int sub_counter  = bin_idx >> (LOG_COUNTER_LANES);
                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
            }
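// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of block_radix_rank.cuh): a minimal
// kernel ranking four keys per thread through the RankKeys overload shown
// above.  The kernel name, block size, and radix width are assumptions for
// illustration, and the code presumes user code that has included
// <cub/block/block_radix_rank.cuh>.
__global__ void ExampleRankKernel(const unsigned int *d_keys_in, int *d_ranks_out)
{
    // Specialize for a 1D block of 128 threads, 5-bit digits (32 bins), ascending order
    typedef cub::BlockRadixRank<128, 5, false> BlockRadixRankT;
    __shared__ typename BlockRadixRankT::TempStorage temp_storage;

    // Each thread loads four consecutive keys of its block's 512-key tile
    unsigned int keys[4];
    int ranks[4];
    for (int i = 0; i < 4; ++i)
        keys[i] = d_keys_in[(blockIdx.x * 512) + (threadIdx.x * 4) + i];

    // Rank keys within the thread block by the digit in bits [0, 5)
    BlockRadixRankT(temp_storage).RankKeys(keys, ranks, 0, 5);

    for (int i = 0; i < 4; ++i)
        d_ranks_out[(blockIdx.x * 512) + (threadIdx.x * 4) + i] = ranks[i];
}
// ---------------------------------------------------------------------------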
    // BlockRadixRankMatch: ranking via warp-synchronous digit matching
    typedef int32_t RankT;
    typedef int32_t DigitCounterT;

    BLOCK_THREADS           = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

    RADIX_DIGITS            = 1 << RADIX_BITS,

    WARP_THREADS            = 1 << LOG_WARP_THREADS,
    WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

    // Pad even counts up to the next odd value (padding for the shared-memory layout)
    PADDED_WARPS            = ((WARPS & 0x1) == 0) ? WARPS + 1 : WARPS,

    COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
    RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
    PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ? RAKING_SEGMENT + 1 : RAKING_SEGMENT,
    // BlockScan type for scanning the digit counters
    typedef BlockScan<DigitCounterT, BLOCK_THREADS, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockScanT;
        // Aliasable shared storage: per-warp digit counters, also viewed as a raking grid
        volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
        DigitCounterT          raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];
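        // Illustrative note (not from this file): the two members above alias the same
        // storage.  The RADIX_DIGITS x PADDED_WARPS counter matrix is re-viewed as
        // BLOCK_THREADS rows of PADDED_RAKING_SEGMENT counters, so a single block-wide
        // exclusive scan over the raking grid yields, in digit-major order, the
        // exclusive prefix for every (digit, warp) counter at once.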
    // Rank keys by the digit in bits [current_bit, current_bit + num_bits), using warp-synchronous digit matching
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits)

        // Zero this thread's slice of the shared digit counters
        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;

        volatile DigitCounterT *digit_counters[KEYS_PER_THREAD];
        uint32_t warp_id      = linear_tid >> LOG_WARP_THREADS;
        uint32_t lane_mask_lt = LaneMaskLt();

        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
            // Descending order reverses the digit (guard elided in this listing)
            digit = RADIX_DIGITS - digit - 1;

            // Mask of warp peers holding the same digit
            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);

            // This warp's shared counter for the digit, and its running total so far
            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];

            // How many peers share the digit, and how many of them occupy lower lanes
            int32_t digit_count       = __popc(peer_mask);
            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);

            if (peer_digit_prefix == 0)
            {
                // The lowest-laned peer bumps the warp's counter for this digit
                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
            }
            // Warp-local rank: keys from earlier strips plus lower-laned peers
            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
        }

        // Block-wide exclusive scan of the warp digit counters (via the raking grid)
        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];

        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);

        #pragma unroll
        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];

        // Seed each rank with the scanned counter value for its (digit, warp) bin
        #pragma unroll
        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
            ranks[ITEM] += *digit_counters[ITEM];
    // Overload that also reports per-bin exclusive digit prefixes for the lower RADIX_DIGITS threads
    template <typename UnsignedBits, int KEYS_PER_THREAD>
    __device__ __forceinline__ void RankKeys(
        UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD],
        int current_bit, int num_bits, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                // Descending order reverses the bin index (guard elided in this listing)
                bin_idx = RADIX_DIGITS - bin_idx - 1;

                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
            }
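// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the warp-level matching step
// used above, written with raw CUDA intrinsics.  Assumes a 1D block and
// sm_70+ for __match_any_sync; CUB's MatchAny<RADIX_BITS> produces the same
// peer mask portably.  The function name is hypothetical.
__device__ __forceinline__ int WarpRankByDigitSketch(unsigned int digit)
{
    unsigned int lane      = threadIdx.x & 31;                      // lane within the warp
    unsigned int peer_mask = __match_any_sync(0xffffffffu, digit);  // lanes with the same digit
    unsigned int lt_mask   = (1u << lane) - 1;                      // lanes below this one
    return __popc(peer_mask & lt_mask);                             // rank among same-digit peers
}
// ---------------------------------------------------------------------------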