#include "../../agent/agent_radix_sort_upsweep.cuh"
#include "../../agent/agent_radix_sort_downsweep.cuh"
#include "../../agent/agent_scan.cuh"
#include "../../block/block_radix_sort.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
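// The kernels below implement device-wide radix sort as repeated three-step
// digit passes: an upsweep kernel histograms the current digit per block, a
// scan kernel prefix-sums those histograms into global scatter offsets, and a
// downsweep kernel redistributes keys (and values) accordingly.  Small inputs
// take a separate single-tile path; segmented inputs take a fused per-segment path.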
template <
    typename ChainedPolicyT,
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
__global__ void DeviceRadixSortUpsweepKernel(
        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
    typename If<(ALT_DIGIT_BITS),
        typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
        typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
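// The spine is striped by digit value: all blocks' counts for digit 0, then
// all blocks' counts for digit 1, and so on, which is why ExtractCounts takes
// the grid size as the bin stride and the block index as the bin offset.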
template <
    typename ChainedPolicyT,
__global__ void RadixSortScanBinsKernel(
    typename ChainedPolicyT::ActivePolicy::ScanPolicy,
    __shared__ typename AgentScanT::TempStorage temp_storage;
        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
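// A single thread block (launched <<<1, BLOCK_THREADS>>> below) performs an
// exclusive prefix sum over the striped spine, converting per-block digit
// counts into each (digit, block) pair's global scatter base offset.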
template <
    typename ChainedPolicyT,
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
__global__ void DeviceRadixSortDownsweepKernel(
    const KeyT *d_keys_in,
        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
    typename If<(ALT_DIGIT_BITS),
        typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
        typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,
    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
__global__ void DeviceRadixSortSingleTileKernel(
    const KeyT *d_keys_in,
        BLOCK_THREADS    = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,
        (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;
    KeyT   keys[ITEMS_PER_THREAD];
    ValueT values[ITEMS_PER_THREAD];
    KeyT default_key = reinterpret_cast<KeyT&>(default_key_bits);
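// default_key_bits presumably holds the maximum representable key pattern
// (or the minimum, for descending sorts), so threads owning out-of-range
// slots pad the partial tile with keys that sort past every valid item.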
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
        Int2Type<IS_DESCENDING>(),
        Int2Type<KEYS_ONLY>());
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
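// SortBlockedToStriped leaves the tile in striped arrangement, so this loop's
// item_offset (ITEM * BLOCK_THREADS + threadIdx.x) lets consecutive threads
// write consecutive elements back to global memory in coalesced fashion.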
template <
    typename ChainedPolicyT,
    typename OffsetIteratorT,
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedRadixSortKernel(
    const KeyT *d_keys_in,
    typedef typename If<(ALT_DIGIT_BITS),
        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;
        BLOCK_THREADS    = SegmentedPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD,
        RADIX_BITS       = SegmentedPolicyT::RADIX_BITS,
        TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,
        RADIX_DIGITS     = 1 << RADIX_BITS,
        typename BlockUpsweepT::TempStorage     upsweep;
        typename BlockDownsweepT::TempStorage   downsweep;
            volatile OffsetT reverse_counts_in[RADIX_DIGITS];
            volatile OffsetT reverse_counts_out[RADIX_DIGITS];
            typename DigitScanT::TempStorage    scan;
        if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
        if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);
        bin_offset[track] += segment_begin;
        if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            temp_storage.reverse_counts_out[bin_idx] = bin_offset[track];
        if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
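// For descending sorts, the digit histogram and the scanned offsets are each
// reversed through shared memory (above) so the downsweep scatters the highest
// digit values first without changing the ranking machinery itself.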
    downsweep.ProcessRegion(segment_begin, segment_end);
    typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT;
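// e.g., 4-byte keys with 8-byte values make DominantT the 8-byte ValueT, while
// keys-only sorts or 8-byte-key/4-byte-value sorts leave DominantT = KeyT; the
// tuning granularities below scale with whichever type dominates storage cost.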
        PRIMARY_RADIX_BITS = 5,
        ALT_RADIX_BITS     = PRIMARY_RADIX_BITS - 1,
        SCALE_FACTOR_4B    = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
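// SCALE_FACTOR_4B rounds the wider of the key/value sizes up to whole 4-byte
// units: 4-byte types give (4 + 3) / 4 = 1, 8-byte types give (8 + 3) / 4 = 2;
// presumably it divides down items-per-thread for wider keys/values.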
    typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
        PRIMARY_RADIX_BITS = 5,
        ALT_RADIX_BITS     = PRIMARY_RADIX_BITS - 1,
        SCALE_FACTOR_4B    = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
    typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
        PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
    typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
        PRIMARY_RADIX_BITS     = (sizeof(KeyT) > 1) ? 7 : 5,
        SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
        SEGMENTED_RADIX_BITS   = (sizeof(KeyT) > 1) ? 6 : 5,
    typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
        PRIMARY_RADIX_BITS     = (sizeof(KeyT) > 1) ? 7 : 5,
        SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
        SEGMENTED_RADIX_BITS   = (sizeof(KeyT) > 1) ? 6 : 5,
    typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
        PRIMARY_RADIX_BITS     = (sizeof(KeyT) > 1) ? 7 : 5,
        SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
        SEGMENTED_RADIX_BITS   = (sizeof(KeyT) > 1) ? 6 : 5,
    typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
        PRIMARY_RADIX_BITS = 5,
        ALT_RADIX_BITS     = PRIMARY_RADIX_BITS - 1,
    typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
        PRIMARY_RADIX_BITS     = (sizeof(KeyT) > 1) ? 7 : 5,
        SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
        SEGMENTED_RADIX_BITS   = (sizeof(KeyT) > 1) ? 6 : 5,
    typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
    CUB_RUNTIME_FUNCTION __forceinline__
        typename ActivePolicyT,
        typename SingleTileKernelT>
    CUB_RUNTIME_FUNCTION __forceinline__
        SingleTileKernelT single_tile_kernel)
#ifndef CUB_RUNTIME_ENABLED
        (void)single_tile_kernel;
        return CubDebug(cudaErrorNotSupported);
        cudaError error = cudaSuccess;
            _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
            if (CubDebug(error = cudaPeekAtLastError())) break;
    template <typename PassConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
        const KeyT *d_keys_in,
        PassConfigT &pass_config)
        cudaError error = cudaSuccess;
            _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
                pass_config.even_share);
            if (CubDebug(error = cudaPeekAtLastError())) break;
                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
            if (CubDebug(error = cudaPeekAtLastError())) break;
            if (debug_synchronous)
                _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
                    pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
                pass_config.even_share);
            if (CubDebug(error = cudaPeekAtLastError())) break;
        typename UpsweepKernelT,
        typename ScanKernelT,
        typename DownsweepKernelT>
        UpsweepKernelT      upsweep_kernel;
        ScanKernelT         scan_kernel;
        DownsweepKernelT    downsweep_kernel;
        int                 max_downsweep_grid_size;
            typename UpsweepPolicyT,
            typename ScanPolicyT,
            typename DownsweepPolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
            UpsweepKernelT upsweep_kernel,
            ScanKernelT scan_kernel,
            DownsweepKernelT downsweep_kernel,
            cudaError error = cudaSuccess;
                this->upsweep_kernel   = upsweep_kernel;
                this->scan_kernel      = scan_kernel;
                this->downsweep_kernel = downsweep_kernel;
                radix_bits             = DownsweepPolicyT::RADIX_BITS;
                radix_digits           = 1 << radix_bits;
                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
                    max_downsweep_grid_size,
                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
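// The even-share descriptor is initialized with the larger of the upsweep and
// downsweep tile sizes so that both kernels of a pass partition the input into
// identical per-block ranges.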
        typename ActivePolicyT,
        typename UpsweepKernelT,
        typename ScanKernelT,
        typename DownsweepKernelT>
    CUB_RUNTIME_FUNCTION __forceinline__
        UpsweepKernelT      upsweep_kernel,
        UpsweepKernelT      alt_upsweep_kernel,
        ScanKernelT         scan_kernel,
        DownsweepKernelT    downsweep_kernel,
        DownsweepKernelT    alt_downsweep_kernel)
#ifndef CUB_RUNTIME_ENABLED
        (void)upsweep_kernel;
        (void)alt_upsweep_kernel;
        (void)downsweep_kernel;
        (void)alt_downsweep_kernel;
        return CubDebug(cudaErrorNotSupported);
        cudaError error = cudaSuccess;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
            if ((error = pass_config.template InitPassConfig<
                typename ActivePolicyT::UpsweepPolicy,
                typename ActivePolicyT::ScanPolicy,
                typename ActivePolicyT::DownsweepPolicy>(
            if ((error = alt_pass_config.template InitPassConfig<
                typename ActivePolicyT::AltUpsweepPolicy,
                typename ActivePolicyT::ScanPolicy,
                typename ActivePolicyT::AltDownsweepPolicy>(
                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
            int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
            int spine_length  = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
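// Spine sizing: one counter per (digit, block) pair across the widest grid,
// plus one scan tile of padding, presumably so the single-block scan (which
// consumes whole tiles) never reads past the end.  e.g., 5-bit digits (32
// bins) and a 1024-block grid need 32 * 1024 counters plus the padding.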
            void* allocations[3];
            size_t allocation_sizes[3] =
                spine_length * sizeof(OffsetT),
            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
            bool is_num_passes_odd  = num_passes & 1;
            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
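// Worked example: sorting 32 bits with 7-bit primary / 6-bit alternate digits
// gives num_passes = ceil(32 / 7) = 5 and max_alt_passes = 35 - 32 = 3, so the
// first three passes use the 6-bit configuration (3*6 + 2*7 = 32) and no pass
// wastes digit bits beyond end_bit.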
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
                d_keys_remaining_passes.selector   ^= 1;
                d_values_remaining_passes.selector ^= 1;
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
        typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT;
        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
            return InvokeSingleTile<ActivePolicyT>(
                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
            return InvokePasses<ActivePolicyT>(
                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,  IS_DESCENDING, KeyT, OffsetT>,
                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,   IS_DESCENDING, KeyT, OffsetT>,
                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,  IS_DESCENDING, KeyT, ValueT, OffsetT>,
                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,   IS_DESCENDING, KeyT, ValueT, OffsetT>);
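// How this dispatch layer is typically reached (a hedged sketch of the
// standard two-call CUB pattern; cub::DeviceRadixSort::SortKeys forwards to
// DispatchRadixSort::Dispatch, which walks the policy chain above):
//
//     size_t temp_bytes = 0;
//     cub::DeviceRadixSort::SortKeys(NULL, temp_bytes,        // size query only
//                                    d_keys_in, d_keys_out, num_items);
//     cudaMalloc(&d_temp, temp_bytes);
//     cub::DeviceRadixSort::SortKeys(d_temp, temp_bytes,      // actual sort
//                                    d_keys_in, d_keys_out, num_items);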
    CUB_RUNTIME_FUNCTION __forceinline__
        typename OffsetIteratorT,
    CUB_RUNTIME_FUNCTION __forceinline__
    template <typename PassConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
        const KeyT *d_keys_in,
        PassConfigT &pass_config)
        cudaError error = cudaSuccess;
            _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
            if (CubDebug(error = cudaPeekAtLastError())) break;
    template <typename SegmentedKernelT>
        SegmentedKernelT segmented_kernel;
        template <typename SegmentedPolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
            this->segmented_kernel = segmented_kernel;
            this->radix_bits       = SegmentedPolicyT::RADIX_BITS;
            this->radix_digits     = 1 << radix_bits;
            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
        typename ActivePolicyT,
        typename SegmentedKernelT>
    CUB_RUNTIME_FUNCTION __forceinline__
        SegmentedKernelT segmented_kernel,
        SegmentedKernelT alt_segmented_kernel)
#ifndef CUB_RUNTIME_ENABLED
        (void)segmented_kernel;
        (void)alt_segmented_kernel;
        return CubDebug(cudaErrorNotSupported);
        cudaError error = cudaSuccess;
            if ((error = pass_config.template InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
            if ((error = alt_pass_config.template InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
            void* allocations[2];
            size_t allocation_sizes[2] =
            int radix_bits      = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
            int alt_radix_bits  = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
            bool is_num_passes_odd  = num_passes & 1;
            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
                d_keys_remaining_passes.selector   ^= 1;
                d_values_remaining_passes.selector ^= 1;
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedRadixSortKernel<MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,  IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
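// Typical entry (a hedged sketch of the same two-call pattern; the public
// cub::DeviceSegmentedRadixSort::SortKeys funnels into
// DispatchSegmentedRadixSort::Dispatch and thence into the chain above):
//
//     size_t temp_bytes = 0;
//     cub::DeviceSegmentedRadixSort::SortKeys(NULL, temp_bytes,
//         d_keys_in, d_keys_out, num_items, num_segments,
//         d_begin_offsets, d_end_offsets);
//     cudaMalloc(&d_temp, temp_bytes);
//     cub::DeviceSegmentedRadixSort::SortKeys(d_temp, temp_bytes,
//         d_keys_in, d_keys_out, num_items, num_segments,
//         d_begin_offsets, d_end_offsets);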
    CUB_RUNTIME_FUNCTION __forceinline__