38#include "../thread/thread_load.cuh"
39#include "../thread/thread_store.cuh"
40#include "../warp/warp_reduce.cuh"
41#include "../util_arch.cuh"
42#include "../util_device.cuh"
43#include "../util_namespace.cuh"
89 const T &block_aggregate)
108 SCAN_TILE_INVALID = 99,
119 bool SINGLE_WORD = Traits<T>::PRIMITIVE>
132 typedef typename If<(
sizeof(T) == 8),
134 typename If<(
sizeof(T) == 4),
136 typename If<(
sizeof(T) == 2),
138 char>::Type>::Type>::Type StatusWord;
142 typedef typename If<(
sizeof(T) == 8),
144 typename If<(
sizeof(T) == 4),
146 typename If<(
sizeof(T) == 2),
148 uchar2>::Type>::Type>::Type
TxnWord;
152 struct TileDescriptor
162 TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
167 TxnWord *d_tile_descriptors;
170 __host__ __device__ __forceinline__
173 d_tile_descriptors(NULL)
178 __host__ __device__ __forceinline__
181 void *d_temp_storage,
184 d_tile_descriptors =
reinterpret_cast<TxnWord*
>(d_temp_storage);
192 __host__ __device__ __forceinline__
195 size_t &temp_storage_bytes)
197 temp_storage_bytes = (
num_tiles + TILE_STATUS_PADDING) *
sizeof(TileDescriptor);
207 int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
210 TileDescriptor *descriptor =
reinterpret_cast<TileDescriptor*
>(&val);
215 descriptor->status = StatusWord(SCAN_TILE_INVALID);
216 d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
219 if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
222 descriptor->status = StatusWord(SCAN_TILE_OOB);
223 d_tile_descriptors[threadIdx.x] = val;
231 __device__ __forceinline__
void SetInclusive(
int tile_idx, T tile_inclusive)
233 TileDescriptor tile_descriptor;
234 tile_descriptor.status = SCAN_TILE_INCLUSIVE;
235 tile_descriptor.value = tile_inclusive;
238 *
reinterpret_cast<TileDescriptor*
>(&alias) = tile_descriptor;
239 ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
246 __device__ __forceinline__
void SetPartial(
int tile_idx, T tile_partial)
248 TileDescriptor tile_descriptor;
249 tile_descriptor.status = SCAN_TILE_PARTIAL;
250 tile_descriptor.value = tile_partial;
253 *
reinterpret_cast<TileDescriptor*
>(&alias) = tile_descriptor;
254 ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
265 TileDescriptor tile_descriptor;
268 __threadfence_block();
269 TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
270 tile_descriptor =
reinterpret_cast<TileDescriptor&
>(alias);
272 }
while (
WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
274 status = tile_descriptor.status;
275 value = tile_descriptor.value;
290 typedef char StatusWord;
295 TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
299 StatusWord *d_tile_status;
304 __host__ __device__ __forceinline__
308 d_tile_partial(NULL),
309 d_tile_inclusive(NULL)
314 __host__ __device__ __forceinline__
317 void *d_temp_storage,
318 size_t temp_storage_bytes)
320 cudaError_t error = cudaSuccess;
323 void* allocations[3];
324 size_t allocation_sizes[3];
326 allocation_sizes[0] = (
num_tiles + TILE_STATUS_PADDING) *
sizeof(StatusWord);
334 d_tile_status =
reinterpret_cast<StatusWord*
>(allocations[0]);
335 d_tile_partial =
reinterpret_cast<T*
>(allocations[1]);
336 d_tile_inclusive =
reinterpret_cast<T*
>(allocations[2]);
347 __host__ __device__ __forceinline__
350 size_t &temp_storage_bytes)
353 size_t allocation_sizes[3];
354 allocation_sizes[0] = (
num_tiles + TILE_STATUS_PADDING) *
sizeof(StatusWord);
359 void* allocations[3];
369 int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
373 d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
376 if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
379 d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
387 __device__ __forceinline__
void SetInclusive(
int tile_idx, T tile_inclusive)
390 ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
396 ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
403 __device__ __forceinline__
void SetPartial(
int tile_idx, T tile_partial)
406 ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
412 ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
424 status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
428 }
while (status == SCAN_TILE_INVALID);
430 if (status == StatusWord(SCAN_TILE_PARTIAL))
431 value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
433 value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
449 bool SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (
sizeof(ValueT) +
sizeof(KeyT) < 16)>
466 __host__ __device__ __forceinline__
485 PAIR_SIZE =
sizeof(ValueT) +
sizeof(KeyT),
487 STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE,
489 TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
493 typedef typename If<(STATUS_WORD_SIZE == 8),
495 typename If<(STATUS_WORD_SIZE == 4),
497 typename If<(STATUS_WORD_SIZE == 2),
499 char>::Type>::Type>::Type StatusWord;
502 typedef typename If<(TXN_WORD_SIZE == 16),
504 typename If<(TXN_WORD_SIZE == 8),
509 struct TileDescriptorBigStatus
517 struct TileDescriptorLittleStatus
526 (
sizeof(ValueT) ==
sizeof(KeyT)),
527 TileDescriptorBigStatus,
528 TileDescriptorLittleStatus>::Type
537 __host__ __device__ __forceinline__
540 d_tile_descriptors(NULL)
545 __host__ __device__ __forceinline__
548 void *d_temp_storage,
551 d_tile_descriptors =
reinterpret_cast<TxnWord*
>(d_temp_storage);
559 __host__ __device__ __forceinline__
562 size_t &temp_storage_bytes)
574 int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
581 descriptor->status = StatusWord(SCAN_TILE_INVALID);
582 d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
585 if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
588 descriptor->status = StatusWord(SCAN_TILE_OOB);
589 d_tile_descriptors[threadIdx.x] = val;
600 tile_descriptor.status = SCAN_TILE_INCLUSIVE;
601 tile_descriptor.value = tile_inclusive.
value;
602 tile_descriptor.key = tile_inclusive.
key;
606 ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
616 tile_descriptor.status = SCAN_TILE_PARTIAL;
617 tile_descriptor.value = tile_partial.
value;
618 tile_descriptor.key = tile_partial.
key;
622 ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
651 __threadfence_block();
652 TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
655 }
while (
WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
657 status = tile_descriptor.status;
658 value.
value = tile_descriptor.value;
659 value.
key = tile_descriptor.key;
678 typename ScanTileStateT,
698 typedef typename ScanTileStateT::StatusWord StatusWord;
709 __device__ __forceinline__
723 __device__ __forceinline__
726 StatusWord &predecessor_status,
730 tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
735 int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
744 __device__ __forceinline__
745 T operator()(T block_aggregate)
749 if (threadIdx.x == 0)
755 int predecessor_idx =
tile_idx - threadIdx.x - 1;
756 StatusWord predecessor_status;
760 ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
766 while (
WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
768 predecessor_idx -= CUB_PTX_WARP_THREADS;
771 ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
776 if (threadIdx.x == 0)
790 __device__ __forceinline__
791 T GetExclusivePrefix()
797 __device__ __forceinline__
798 T GetInclusivePrefix()
804 __device__ __forceinline__
805 T GetBlockAggregate()
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
Binary operator wrapper for switching non-commutative scan arguments.
The WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
__device__ __forceinline__ T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op)
Computes a segmented reduction in the calling warp where segments are defined by tail-flags.
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
#define CubDebug(e)
Debug macro.
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
OffsetsOutputIteratorT LengthsOutputIteratorT NumRunsOutputIteratorT ScanTileStateT EqualityOpT OffsetT int num_tiles
< [in] Total number of tiles for the entire problem
< Wrapped scan operator type
__device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
Constructor.
T running_total
Running block-wide prefix.
__device__ __forceinline__ BlockScanRunningPrefixOp(T starting_prefix, ScanOpT op)
Constructor.
__device__ __forceinline__ T operator()(const T &block_aggregate)
ScanOpT op
Wrapped scan operator.
Type selection (IF ? ThenType : ElseType)
A key identifier paired with a corresponding value.
Statically determine log2(N), rounded up.
__host__ __device__ __forceinline__ ReduceByKeyScanTileState()
Constructor.
__host__ __device__ __forceinline__ cudaError_t Init(int, void *d_temp_storage, size_t)
Initializer.
__host__ __device__ __forceinline__ ReduceByKeyScanTileState()
Constructor.
__device__ __forceinline__ void InitializeStatus(int num_tiles)
__device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, KeyValuePairT &value)
__device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
__device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
__host__ __device__ static __forceinline__ cudaError_t AllocationSize(int num_tiles, size_t &temp_storage_bytes)
__device__ __forceinline__ void InitializeStatus(int num_tiles)
__device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
__device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, T &value)
__device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
__host__ __device__ static __forceinline__ cudaError_t AllocationSize(int num_tiles, size_t &temp_storage_bytes)
__host__ __device__ __forceinline__ ScanTileState()
Constructor.
__host__ __device__ __forceinline__ cudaError_t Init(int num_tiles, void *d_temp_storage, size_t temp_storage_bytes)
Initializer.
__device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
__device__ __forceinline__ void WaitForValid(int tile_idx, StatusWord &status, T &value)
__host__ __device__ __forceinline__ ScanTileState()
Constructor.
__host__ __device__ static __forceinline__ cudaError_t AllocationSize(int num_tiles, size_t &temp_storage_bytes)
__host__ __device__ __forceinline__ cudaError_t Init(int, void *d_temp_storage, size_t)
Initializer.
__device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
__device__ __forceinline__ void InitializeStatus(int num_tiles)
_TempStorage & temp_storage
Reference to a warp-reduction instance.
ScanTileStateT & tile_status
Interface to tile status.
int tile_idx
The current tile index.
ScanOpT scan_op
Binary scan operator.
T exclusive_prefix
Exclusive prefix for the tile.
__device__ __forceinline__ void ProcessWindow(int predecessor_idx, StatusWord &predecessor_status, T &window_aggregate)
T inclusive_prefix
Inclusive prefix for the tile.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).