#include "../thread/thread_load.cuh"
#include "../thread/thread_store.cuh"
#include "../warp/warp_reduce.cuh"
#include "../util_arch.cuh"
#include "../util_device.cuh"
#include "../util_namespace.cuh"
    // BlockScanRunningPrefixOp::operator() (prefix callback)
    T operator()(const T &block_aggregate)
 
    SCAN_TILE_INVALID = 99,     // Not yet processed (enum ScanTileStatus)
 
    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>
 
    // ScanTileState<T, true>: status and value combined into one descriptor word
    // that can be read and written in a single access

    // Status word type
    typedef typename If<(sizeof(T) == 8),
        long long,
        typename If<(sizeof(T) == 4),
            int,
            typename If<(sizeof(T) == 2),
                short,
                char>::Type>::Type>::Type StatusWord;

    // Unit word type
    typedef typename If<(sizeof(T) == 8),
        longlong2,
        typename If<(sizeof(T) == 4),
            int2,
            typename If<(sizeof(T) == 2),
                int,
                uchar2>::Type>::Type>::Type TxnWord;
 
    // Device descriptor type, aliased to one TxnWord
    struct TileDescriptor
    {
        StatusWord  status;
        T           value;
    };

    TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,   // pad the front of the status array by one warp

    TxnWord *d_tile_descriptors;                  // device storage
 
    // Constructor
    __host__ __device__ __forceinline__
    ScanTileState()
    :
        d_tile_descriptors(NULL)
    {}

    // Initializer
    __host__ __device__ __forceinline__
    cudaError_t Init(
        int     /*num_tiles*/,
        void    *d_temp_storage,
        size_t  /*temp_storage_bytes*/)
    {
        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
        return cudaSuccess;
    }

    // Compute device memory needed for tile status
    __host__ __device__ __forceinline__
    static cudaError_t AllocationSize(
        int     num_tiles,
        size_t  &temp_storage_bytes)
    {
        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);
        return cudaSuccess;
    }
 
    // Initialize tile status (from device; one thread per tile)
    __device__ __forceinline__ void InitializeStatus(int num_tiles)
    {
        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;

        TxnWord val = TxnWord();
        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);

        if (tile_idx < num_tiles)
        {
            descriptor->status = StatusWord(SCAN_TILE_INVALID);    // not yet processed
            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
        }

        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
        {
            descriptor->status = StatusWord(SCAN_TILE_OOB);        // front padding
            d_tile_descriptors[threadIdx.x] = val;
        }
    }
 
    // Update the specified tile's inclusive value and corresponding status
    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
    {
        TileDescriptor tile_descriptor;
        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
        tile_descriptor.value = tile_inclusive;

        TxnWord alias;
        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
    }
 
    // Update the specified tile's partial value and corresponding status
    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
    {
        TileDescriptor tile_descriptor;
        tile_descriptor.status = SCAN_TILE_PARTIAL;
        tile_descriptor.value = tile_partial;

        TxnWord alias;
        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
    }
 
    // Wait for the corresponding tile to become non-invalid, then return its status and value
    __device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, T &value)
    {
        TileDescriptor tile_descriptor;
        do
        {
            __threadfence_block();      // prevent hoisting loads from loop
            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);

        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));

        status = tile_descriptor.status;
        value = tile_descriptor.value;
    }
 
    // ScanTileState<T, false>: status word, partial value, and inclusive value
    // kept in separate arrays (value type cannot be packed into one word)

    // Status word type
    typedef char StatusWord;

    TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,

    // Device storage
    StatusWord  *d_tile_status;
    T           *d_tile_partial;
    T           *d_tile_inclusive;

    // Constructor
    __host__ __device__ __forceinline__
    ScanTileState()
    :
        d_tile_status(NULL),
        d_tile_partial(NULL),
        d_tile_inclusive(NULL)
    {}
 
  314    __host__ __device__ __forceinline__
 
  317        void    *d_temp_storage,                    
 
  318        size_t  temp_storage_bytes)                 
 
  320        cudaError_t error = cudaSuccess;
 
  323            void*   allocations[3];
 
  324            size_t  allocation_sizes[3];
 
  326            allocation_sizes[0] = (
num_tiles + TILE_STATUS_PADDING) * 
sizeof(StatusWord);           
 
  334            d_tile_status       = 
reinterpret_cast<StatusWord*
>(allocations[0]);
 
  335            d_tile_partial      = 
reinterpret_cast<T*
>(allocations[1]);
 
  336            d_tile_inclusive    = 
reinterpret_cast<T*
>(allocations[2]);
 
    // Compute device memory needed for tile status
    __host__ __device__ __forceinline__
    static cudaError_t AllocationSize(
        int     num_tiles,
        size_t  &temp_storage_bytes)
    {
        // Specify storage allocation requirements
        size_t  allocation_sizes[3];
        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // tile status words
        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // per-tile partials
        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // per-tile inclusives

        // Query the combined blob size (no storage pointer supplied)
        void* allocations[3];
        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
    }
 
    // Initialize tile status (from device; one thread per tile)
    __device__ __forceinline__ void InitializeStatus(int num_tiles)
    {
        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
        if (tile_idx < num_tiles)
        {
            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);   // not yet processed
        }
        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
        {
            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);                          // front padding
        }
    }
 
    // Update the specified tile's inclusive value and corresponding status
    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
    {
        // Update tile inclusive value
        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);

        // Fence: make the value visible before the status word is flipped
        __threadfence();

        // Update tile status
        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
    }

    // Update the specified tile's partial value and corresponding status
    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
    {
        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);

        __threadfence();

        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
    }
 
    // Wait for the corresponding tile to become non-invalid, then return its status and value
    __device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, T &value)
    {
        do
        {
            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);

            __threadfence();    // prevent status loads from being hoisted out of the loop

        } while (status == SCAN_TILE_INVALID);

        if (status == StatusWord(SCAN_TILE_PARTIAL))
            value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
        else
            value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
    }
 
    bool        SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)>

    // Constructor (the multi-word variant simply wraps ScanTileState<KeyValuePair<KeyT, ValueT> >)
    __host__ __device__ __forceinline__
    ReduceByKeyScanTileState()

    // ReduceByKeyScanTileState<ValueT, KeyT, true>: constants
    PAIR_SIZE           = sizeof(ValueT) + sizeof(KeyT),
    TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,   // round (pair + status) up to a power of two
    STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,

    TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
 
    // Status word type
    typedef typename If<(STATUS_WORD_SIZE == 8),
        long long,
        typename If<(STATUS_WORD_SIZE == 4),
            int,
            typename If<(STATUS_WORD_SIZE == 2),
                short,
                char>::Type>::Type>::Type StatusWord;

    // Unit word type
    typedef typename If<(TXN_WORD_SIZE == 16),
        longlong2,
        typename If<(TXN_WORD_SIZE == 8),
            long long,
            int>::Type>::Type TxnWord;

    // Device descriptor types: both carry a key, a value, and a status word,
    // differing only in member layout
    struct TileDescriptorBigStatus
    struct TileDescriptorLittleStatus

    // Device descriptor type selection
    typedef typename If<
            (sizeof(ValueT) == sizeof(KeyT)),
            TileDescriptorBigStatus,
            TileDescriptorLittleStatus>::Type
        TileDescriptor;
 
    // Constructor
    __host__ __device__ __forceinline__
    ReduceByKeyScanTileState()
    :
        d_tile_descriptors(NULL)
    {}

    // Initializer
    __host__ __device__ __forceinline__
    cudaError_t Init(
        int     /*num_tiles*/,
        void    *d_temp_storage,
        size_t  /*temp_storage_bytes*/)
    {
        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
        return cudaSuccess;
    }
 
    // Compute device memory needed for tile status
    __host__ __device__ __forceinline__
    static cudaError_t AllocationSize(int num_tiles, size_t &temp_storage_bytes)
    {
        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);
        return cudaSuccess;
    }

    // Initialize tile status (from device)
    __device__ __forceinline__ void InitializeStatus(int num_tiles)
    {
        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
        TxnWord val = TxnWord();
        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);

        if (tile_idx < num_tiles)
        {
            descriptor->status = StatusWord(SCAN_TILE_INVALID);   // not yet processed
            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
        }
        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
        {
            descriptor->status = StatusWord(SCAN_TILE_OOB);       // front padding
            d_tile_descriptors[threadIdx.x] = val;
        }
    }
 
    // Update the specified tile's inclusive value and corresponding status
    __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
    {
        TileDescriptor tile_descriptor;
        tile_descriptor.status  = SCAN_TILE_INCLUSIVE;
        tile_descriptor.value   = tile_inclusive.value;
        tile_descriptor.key     = tile_inclusive.key;

        TxnWord alias;
        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
    }

    // Update the specified tile's partial value and corresponding status
    __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
    {
        TileDescriptor tile_descriptor;
        tile_descriptor.status  = SCAN_TILE_PARTIAL;
        tile_descriptor.value   = tile_partial.value;
        tile_descriptor.key     = tile_partial.key;

        TxnWord alias;
        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
    }
 
    // Wait for the corresponding tile to become non-invalid, then return its status and key-value pair
    __device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, KeyValuePairT &value)
    {
        TileDescriptor tile_descriptor;
        do
        {
            __threadfence_block();      // prevent hoisting loads from loop
            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);

        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));

        status      = tile_descriptor.status;
        value.value = tile_descriptor.value;
        value.key   = tile_descriptor.key;
    }
 
    typename    ScanTileStateT,     // tile status interface type (template parameter of TilePrefixCallbackOp)

    // Status word type borrowed from the tile status interface
    typedef typename ScanTileStateT::StatusWord StatusWord;

    // Constructor
    __device__ __forceinline__
    TilePrefixCallbackOp(ScanTileStateT &tile_status, TempStorage &temp_storage, ScanOpT scan_op, int tile_idx)
 
    // Block until all predecessors within the preceding warp-wide window have valid status
    __device__ __forceinline__
    void ProcessWindow(
        int         predecessor_idx,
        StatusWord  &predecessor_status,
        T           &window_aggregate)
    {
        T value;
        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);

        // Segmented reduction over the window; a predecessor with an inclusive prefix ends the segment
        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(value, tail_flag, SwizzleScanOp<ScanOpT>(scan_op));
    }
 
    // BlockScan prefix callback functor (invoked by the first warp of the block)
    __device__ __forceinline__
    T operator()(T block_aggregate)
    {
        // Publish this tile's aggregate as its partial so successors can begin looking back
        if (threadIdx.x == 0)
            tile_status.SetPartial(tile_idx, block_aggregate);

        int         predecessor_idx = tile_idx - threadIdx.x - 1;
        StatusWord  predecessor_status;
        T           window_aggregate;

        // Wait for the warp-wide window of predecessor tiles to become valid
        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
        exclusive_prefix = window_aggregate;

        // Keep sliding the window back until some predecessor reports an inclusive prefix
        while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
        {
            predecessor_idx -= CUB_PTX_WARP_THREADS;

            // Fold this window's aggregate into the exclusive prefix
            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
        }

        // Record this tile's inclusive prefix and flip its status to SCAN_TILE_INCLUSIVE
        if (threadIdx.x == 0)
        {
            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
            tile_status.SetInclusive(tile_idx, inclusive_prefix);
        }

        return exclusive_prefix;
    }
 
    // The exclusive prefix for the tile
    __device__ __forceinline__
    T GetExclusivePrefix()

    // The inclusive prefix for the tile
    __device__ __forceinline__
    T GetInclusivePrefix()

    // This tile's block-wide aggregate
    __device__ __forceinline__
    T GetBlockAggregate()
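The following is a hedged sketch, not CUB's own AgentScan, of how the pieces above combine in a scan kernel: each thread block scans one tile and chains to earlier tiles through ScanTileState via TilePrefixCallbackOp. ExampleScanKernel and BLOCK_THREADS are invented names, and the TilePrefixCallbackOp constructor argument order assumed here (tile status, temp storage, scan op, tile index) mirrors the field list documented later in this listing. BLOCK_THREADS must be at least one warp, since the look-back window is warp-wide.

#include <cub/cub.cuh>

template <int BLOCK_THREADS>
__global__ void ExampleScanKernel(
    const int               *d_in,
    int                     *d_out,
    cub::ScanTileState<int>  tile_state,    // by value: just a view of device storage
    int                      num_items)
{
    typedef cub::BlockScan<int, BLOCK_THREADS>                                  BlockScanT;
    typedef cub::TilePrefixCallbackOp<int, cub::Sum, cub::ScanTileState<int> >  PrefixOpT;

    __shared__ typename BlockScanT::TempStorage scan_storage;
    __shared__ typename PrefixOpT::TempStorage  prefix_storage;

    int tile_idx = blockIdx.x;                              // one tile per thread block
    int item_idx = tile_idx * BLOCK_THREADS + threadIdx.x;
    int item     = (item_idx < num_items) ? d_in[item_idx] : 0;

    if (tile_idx == 0)
    {
        // First tile: ordinary block-wide scan, then publish its inclusive total
        int block_aggregate;
        BlockScanT(scan_storage).ExclusiveSum(item, item, block_aggregate);
        if (threadIdx.x == 0)
            tile_state.SetInclusive(0, block_aggregate);
    }
    else
    {
        // Later tiles: the prefix callback runs the decoupled look-back shown above
        PrefixOpT prefix_op(tile_state, prefix_storage, cub::Sum(), tile_idx);
        BlockScanT(scan_storage).ExclusiveSum(item, item, prefix_op);
    }

    if (item_idx < num_items)
        d_out[item_idx] = item;
}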
 
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
 
Binary operator wrapper for switching non-commutative scan arguments.
 
The WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
 
__device__ __forceinline__ T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op)
Computes a segmented reduction in the calling warp where segments are defined by tail-flags; the reduction of each segment is returned to the first lane in that segment.
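A minimal sketch of TailSegmentedReduce, the same primitive ProcessWindow uses across a window of predecessor tiles: a set tail_flag ends a segment, and each segment's reduction lands in that segment's first lane. Kernel and variable names are illustrative only; launch with a single warp of 32 threads.

#include <cub/cub.cuh>

__global__ void TailSegmentedReduceExample(int *d_out)
{
    typedef cub::WarpReduce<int> WarpReduceT;
    __shared__ typename WarpReduceT::TempStorage temp_storage;

    int lane      = threadIdx.x & 31;
    int input     = 1;                    // every lane contributes 1
    int tail_flag = ((lane % 8) == 7);    // segments of 8 lanes: 0-7, 8-15, 16-23, 24-31

    // The result is defined only in the first lane of each segment (0, 8, 16, 24),
    // where it equals that segment's sum (8)
    int segment_sum = WarpReduceT(temp_storage).TailSegmentedReduce(input, tail_flag, cub::Sum());

    if ((lane % 8) == 0)
        d_out[lane / 8] = segment_sum;
}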
 
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Carves one blob of externally-allocated device storage into multiple aligned sub-allocations, or, when d_temp_storage is NULL, reports the required blob size in temp_storage_bytes.
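A hedged sketch of the query-then-carve pattern behind AliasTemporaries, the same helper used by the multi-word Init and AllocationSize above. CarveExample is an invented name and the element sizes are arbitrary illustration values.

#include <cub/cub.cuh>
#include <cstdio>

cudaError_t CarveExample(int num_tiles)
{
    void*  allocations[3];
    size_t allocation_sizes[3];
    allocation_sizes[0] = num_tiles * sizeof(char);   // e.g., status words
    allocation_sizes[1] = num_tiles * sizeof(int);    // e.g., partials
    allocation_sizes[2] = num_tiles * sizeof(int);    // e.g., inclusives

    // Pass 1: NULL storage pointer, so only the required blob size is computed
    size_t temp_storage_bytes = 0;
    cudaError_t error = cub::AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes);
    if (error != cudaSuccess) return error;

    void *d_temp_storage = NULL;
    if ((error = cudaMalloc(&d_temp_storage, temp_storage_bytes)) != cudaSuccess) return error;

    // Pass 2: carve the blob into three suitably aligned sub-allocations
    error = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
    printf("blob: %zu bytes; allocations at %p %p %p\n",
           temp_storage_bytes, allocations[0], allocations[1], allocations[2]);

    cudaFree(d_temp_storage);
    return error;
}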
 
#define CubDebug(e)
Debug macro.
 
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
 
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
 
#define CUB_NS_PREFIX
Optional outer namespace(s).
 
 
BlockScanRunningPrefixOp: prefix functor for maintaining a running prefix while a single thread block scans consecutive tiles.

__device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
Constructor.

__device__ __forceinline__ BlockScanRunningPrefixOp(T starting_prefix, ScanOpT op)
Constructor.

__device__ __forceinline__ T operator()(const T &block_aggregate)
Prefix callback operator: returns the current running total, then folds block_aggregate into it.

T running_total
Running block-wide prefix.

ScanOpT op
Wrapped scan operator.
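A hedged sketch of BlockScanRunningPrefixOp as a BlockScan prefix callback: a single block scans a long array tile by tile, carrying the running prefix between tiles. SingleBlockScanKernel and BLOCK_THREADS are invented names, and the template parameter order <T, ScanOpT> is assumed from the members listed above.

#include <cub/cub.cuh>

template <int BLOCK_THREADS>
__global__ void SingleBlockScanKernel(int *d_data, int num_items)
{
    typedef cub::BlockScan<int, BLOCK_THREADS> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    // Running prefix starts at 0; cub::Sum folds each tile aggregate into it
    cub::BlockScanRunningPrefixOp<int, cub::Sum> prefix_op(0, cub::Sum());

    for (int tile_base = 0; tile_base < num_items; tile_base += BLOCK_THREADS)
    {
        int idx  = tile_base + threadIdx.x;
        int item = (idx < num_items) ? d_data[idx] : 0;

        // Each call scans one tile; prefix_op seeds it with the running total
        BlockScanT(temp_storage).ExclusiveSum(item, item, prefix_op);
        __syncthreads();   // temp_storage is reused on the next iteration

        if (idx < num_items)
            d_data[idx] = item;
    }
}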
 
Type selection (IF ? ThenType : ElseType)
 
A key identifier paired with a corresponding value.
 
Statically determine log2(N), rounded up.
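A small illustration of the three utility templates referenced above (cub::If, cub::KeyValuePair, cub::Log2); the resulting values and types are noted in comments.

#include <cub/cub.cuh>

// If<Condition, ThenType, ElseType>::Type: compile-time type selection
typedef cub::If<(sizeof(double) == 8), long long, int>::Type StatusWordFor8B;   // long long

// KeyValuePair<Key, Value>: the .key / .value pair used by the reduce-by-key
// tile descriptors above
cub::KeyValuePair<int, float> example_pair;   // example_pair.key, example_pair.value

// Log2<N>::VALUE: static log2(N) rounded up, e.g. for rounding a descriptor
// size up to a power of two
enum { ROUNDED_UP = 1 << cub::Log2<5 + 1>::VALUE };   // 1 << 3 == 8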
 
__host__ __device__ __forceinline__ ReduceByKeyScanTileState()
Constructor.
 
__host__ __device__ __forceinline__ cudaError_t Init(int, void *d_temp_storage, size_t)
Initializer.
 
__host__ __device__ __forceinline__ ReduceByKeyScanTileState()
Constructor.
 
__device__ __forceinline__ void InitializeStatus(int num_tiles)
 
__device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, KeyValuePairT &value)
 
__device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
 
__device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
 
__host__ __device__ static __forceinline__ cudaError_t AllocationSize(int num_tiles, size_t &temp_storage_bytes)
 
__device__ __forceinline__ void InitializeStatus(int num_tiles)
 
__device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
 
__device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, T &value)
 
__device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
 
__host__ __device__ static __forceinline__ cudaError_t AllocationSize(int num_tiles, size_t &temp_storage_bytes)
 
__host__ __device__ __forceinline__ ScanTileState()
Constructor.
 
__host__ __device__ __forceinline__ cudaError_t Init(int num_tiles, void *d_temp_storage, size_t temp_storage_bytes)
Initializer.
 
__device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
 
__device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, T &value)
 
__host__ __device__ __forceinline__ ScanTileState()
Constructor.
 
__host__ __device__ static __forceinline__ cudaError_t AllocationSize(int num_tiles, size_t &temp_storage_bytes)
 
__host__ __device__ __forceinline__ cudaError_t Init(int, void *d_temp_storage, size_t)
Initializer.
 
__device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
 
__device__ __forceinline__ void InitializeStatus(int num_tiles)
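The ScanTileState interface summarized above follows a fixed host-side life cycle: query AllocationSize, allocate the blob, Init, then launch a small kernel that calls InitializeStatus before any scan block may look back. A hedged sketch follows; InitStatusKernel, PrepareTileState, and the launch configuration are assumptions for illustration, not CUB's own dispatch code.

#include <cub/cub.cuh>

__global__ void InitStatusKernel(cub::ScanTileState<int> tile_state, int num_tiles)
{
    // Each thread marks one tile descriptor as SCAN_TILE_INVALID (plus front padding)
    tile_state.InitializeStatus(num_tiles);
}

cudaError_t PrepareTileState(int num_items, int tile_size)
{
    int num_tiles = (num_items + tile_size - 1) / tile_size;

    // 1. How many bytes of device storage does the tile status need?
    size_t temp_storage_bytes = 0;
    cudaError_t error = cub::ScanTileState<int>::AllocationSize(num_tiles, temp_storage_bytes);
    if (error != cudaSuccess) return error;

    // 2. Allocate the blob and bind it
    void *d_temp_storage = NULL;
    if ((error = cudaMalloc(&d_temp_storage, temp_storage_bytes)) != cudaSuccess) return error;

    cub::ScanTileState<int> tile_state;
    if ((error = tile_state.Init(num_tiles, d_temp_storage, temp_storage_bytes)) != cudaSuccess) return error;

    // 3. Mark every tile as not yet computed
    int init_block = 128;
    int init_grid  = (num_tiles + init_block - 1) / init_block;
    InitStatusKernel<<<init_grid, init_block>>>(tile_state, num_tiles);

    // ... launch the tile-scanning kernel (e.g., the sketch shown earlier),
    //     then cudaFree(d_temp_storage) once it has completed ...
    return cudaGetLastError();
}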
 
_TempStorage & temp_storage
Reference to a warp-reduction instance.
 
ScanTileStateT & tile_status
Interface to tile status.
 
int tile_idx
The current tile index.
 
ScanOpT scan_op
Binary scan operator.
 
T exclusive_prefix
Exclusive prefix for the tile.
 
__device__ __forceinline__ void ProcessWindow(int predecessor_idx, StatusWord &predecessor_status, T &window_aggregate)
 
T inclusive_prefix
Inclusive prefix for the tile.
 
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
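A minimal sketch of cub::Uninitialized: reserve shared memory for a type with a non-trivial constructor without running that constructor, then view the storage through Alias(). Accumulator and UninitializedExample are invented names.

#include <cub/cub.cuh>

struct Accumulator
{
    int   count;
    float total;
    __device__ Accumulator() : count(0), total(0.0f) {}   // non-trivial constructor
};

__global__ void UninitializedExample()
{
    __shared__ cub::Uninitialized<Accumulator> raw_storage;   // no constructor runs here
    Accumulator &acc = raw_storage.Alias();                   // reinterpret as Accumulator

    if (threadIdx.x == 0)
    {
        acc.count = 1;
        acc.total = 3.5f;
    }
}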
 
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
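An illustrative use of CUB_PTX_ARCH for compile-time tuning: selecting a cache modifier per targeted PTX version. The chosen threshold and modifiers are an example policy, not a value prescribed by CUB.

#include <cub/cub.cuh>

// LOAD_LDG routes reads through the read-only data cache on sm_35 and newer;
// older targets (and the host pass, where CUB_PTX_ARCH is 0) fall back to plain loads.
static const cub::CacheLoadModifier TILE_LOAD_MODIFIER =
    (CUB_PTX_ARCH >= 350) ? cub::LOAD_LDG : cub::LOAD_DEFAULT;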