OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
dispatch_radix_sort.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix
 * sort across a sequence of data items residing within device-accessible memory.
 */

#pragma once

#include <stdio.h>
#include <iterator>

#include "../../agent/agent_radix_sort_upsweep.cuh"
#include "../../agent/agent_radix_sort_downsweep.cuh"
#include "../../agent/agent_scan.cuh"
#include "../../block/block_radix_sort.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_type.cuh"
#include "../../util_debug.cuh"
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

/**
 * Upsweep digit-counting kernel entry point (multi-block).  Computes privatized digit histograms, one per block.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
__global__ void DeviceRadixSortUpsweepKernel(
    const KeyT              *d_keys,                        ///< [in] Input keys buffer
    OffsetT                 *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
    OffsetT                 /*num_items*/,                  ///< [in] Total number of input data items
    int                     current_bit,                    ///< [in] Bit position of current radix digit
    int                     num_bits,                       ///< [in] Number of bits of current radix digit
    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
{
    enum {
        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
    };

    // Parameterize AgentRadixSortUpsweep type for the current configuration
    typedef AgentRadixSortUpsweep<
            typename If<(ALT_DIGIT_BITS),
                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
            KeyT,
            OffsetT>
        AgentRadixSortUpsweepT;

    // Shared memory storage
    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;

    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();

    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);

    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);

    CTA_SYNC();

    // Write out digit counts (striped)
    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
}


/**
 * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortScanBinsKernel(
    OffsetT                 *d_spine,                       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
    int                     num_counts)                     ///< [in] Total number of bin-counts
{
    // Parameterize the AgentScan type for the current configuration
    typedef AgentScan<
            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
            OffsetT*,
            OffsetT*,
            cub::Sum,
            OffsetT,
            OffsetT>
        AgentScanT;

    // Shared memory storage
    __shared__ typename AgentScanT::TempStorage temp_storage;

    // Block scan instance
    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0));

    // Process full input tiles
    int block_offset = 0;
    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
    {
        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
        block_offset += AgentScanT::TILE_ITEMS;
    }
}


/**
 * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) into corresponding bins for the current digit place.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                ValueT,                         ///< Value type
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
__global__ void DeviceRadixSortDownsweepKernel(
    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
    const ValueT            *d_values_in,                   ///< [in] Input values buffer
    ValueT                  *d_values_out,                  ///< [in] Output values buffer
    OffsetT                 *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
    OffsetT                 num_items,                      ///< [in] Total number of input data items
    int                     current_bit,                    ///< [in] Bit position of current radix digit
    int                     num_bits,                       ///< [in] Number of bits of current radix digit
    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
{
    enum {
        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
    };

    // Parameterize AgentRadixSortDownsweep type for the current configuration
    typedef AgentRadixSortDownsweep<
            typename If<(ALT_DIGIT_BITS),
                typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
                typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,
            IS_DESCENDING,
            KeyT,
            ValueT,
            OffsetT>
        AgentRadixSortDownsweepT;

    // Shared memory storage
    __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage;

    // Initialize even-share descriptor for this thread block
    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();

    // Process input tiles
    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
        even_share.block_offset,
        even_share.block_end);
}


/**
 * Single pass kernel entry point (single-block).  Fully sorts a tile of input.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                ValueT,                         ///< Value type
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceRadixSortSingleTileKernel(
    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
    const ValueT            *d_values_in,                   ///< [in] Input values buffer
    ValueT                  *d_values_out,                  ///< [in] Output values buffer
    OffsetT                 num_items,                      ///< [in] Total number of input data items
    int                     current_bit,                    ///< [in] Bit position of current radix digit
    int                     end_bit)                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
{
    // Constants
    enum
    {
        BLOCK_THREADS       = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,
        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
    };

    // BlockRadixSort type
    typedef BlockRadixSort<
            KeyT,
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            ValueT,
            ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,
            (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
            ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>
        BlockRadixSortT;

    // BlockLoad type (keys)
    typedef BlockLoad<
        KeyT,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;

    // BlockLoad type (values)
    typedef BlockLoad<
        ValueT,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;

    // Unsigned word for key bits
    typedef typename Traits<KeyT>::UnsignedBits UnsignedBitsT;

    // Shared memory storage
    __shared__ union TempStorage
    {
        typename BlockRadixSortT::TempStorage   sort;
        typename BlockLoadKeys::TempStorage     load_keys;
        typename BlockLoadValues::TempStorage   load_values;

    } temp_storage;

    // Keys and values for the block
    KeyT    keys[ITEMS_PER_THREAD];
    ValueT  values[ITEMS_PER_THREAD];

    // Get default (min/max) value for out-of-bounds keys
    UnsignedBitsT   default_key_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;
    KeyT            default_key = reinterpret_cast<KeyT&>(default_key_bits);

    // Load keys
    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key);

    CTA_SYNC();

    // Load values
    if (!KEYS_ONLY)
    {
        // Register pressure work-around: moving num_items through shfl prevents compiler
        // from reusing guards/addressing from prior guarded loads
        num_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(num_items, 0, 0xffffffff);

        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);

        CTA_SYNC();
    }
    // Sort tile
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
        keys,
        values,
        current_bit,
        end_bit,
        Int2Type<IS_DESCENDING>(),
        Int2Type<KEYS_ONLY>());

    // Store keys and values
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
        if (item_offset < num_items)
        {
            d_keys_out[item_offset] = keys[ITEM];
            if (!KEYS_ONLY)
                d_values_out[item_offset] = values[ITEM];
        }
    }
}
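
// Note: DispatchRadixSort::Invoke() further below selects this single-tile kernel
// whenever num_items <= SingleTilePolicy::BLOCK_THREADS * SingleTilePolicy::ITEMS_PER_THREAD;
// larger inputs go through the multi-pass upsweep/scan/downsweep pipeline instead.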


/**
 * Segmented radix sorting pass (one block per segment).
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                ValueT,                         ///< Value type
    typename                OffsetIteratorT,                ///< Random-access input iterator type for reading segment offsets
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedRadixSortKernel(
    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
    const ValueT            *d_values_in,                   ///< [in] Input values buffer
    ValueT                  *d_values_out,                  ///< [in] Output values buffer
    OffsetIteratorT         d_begin_offsets,                ///< [in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i] is the first element of the i-th segment
    OffsetIteratorT         d_end_offsets,                  ///< [in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 is the last element of the i-th segment
    int                     /*num_segments*/,               ///< [in] The number of segments that comprise the sorting data
    int                     current_bit,                    ///< [in] Bit position of current radix digit
    int                     pass_bits)                      ///< [in] Number of bits of current radix digit
{
    //
    // Constants
    //

    typedef typename If<(ALT_DIGIT_BITS),
        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;

    enum
    {
        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,
        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
        RADIX_DIGITS        = 1 << RADIX_BITS,
        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
    };

    // Upsweep type
    typedef AgentRadixSortUpsweep<
            AgentRadixSortUpsweepPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, SegmentedPolicyT::LOAD_MODIFIER, RADIX_BITS>,
            KeyT,
            OffsetT>
        BlockUpsweepT;

    // Digit-scan type
    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;

    // Downsweep type
    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;

    enum
    {
        /// Number of bin-starting offsets tracked per thread
        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD
    };

    //
    // Process input tiles
    //

    // Shared memory storage
    __shared__ union
    {
        typename BlockUpsweepT::TempStorage     upsweep;
        typename BlockDownsweepT::TempStorage   downsweep;
        struct
        {
            volatile OffsetT                    reverse_counts_in[RADIX_DIGITS];
            volatile OffsetT                    reverse_counts_out[RADIX_DIGITS];
            typename DigitScanT::TempStorage    scan;
        };

    } temp_storage;

    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
    OffsetT segment_end     = d_end_offsets[blockIdx.x];
    OffsetT num_items       = segment_end - segment_begin;

    // Check if empty segment
    if (num_items <= 0)
        return;

    // Upsweep
    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);
    upsweep.ProcessRegion(segment_begin, segment_end);

    CTA_SYNC();

    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)
    OffsetT bin_count[BINS_TRACKED_PER_THREAD];
    upsweep.ExtractCounts(bin_count);

    CTA_SYNC();

    if (IS_DESCENDING)
    {
        // Reverse bin counts
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
        }

        CTA_SYNC();

        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
        }
    }

    // Scan
    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];    // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)
    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);

    #pragma unroll
    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
    {
        bin_offset[track] += segment_begin;
    }

    if (IS_DESCENDING)
    {
        // Reverse bin offsets
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                temp_storage.reverse_counts_out[bin_idx] = bin_offset[track];
        }

        CTA_SYNC();

        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
        }
    }

    CTA_SYNC();

    // Downsweep
    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);
    downsweep.ProcessRegion(segment_begin, segment_end);
}



/******************************************************************************
 * Policy
 ******************************************************************************/

/**
 * Tuning policy for kernel specialization
 */
template <
    typename KeyT,          ///< Key type
    typename ValueT,        ///< Value type
    typename OffsetT>       ///< Signed integer type for global offsets
struct DeviceRadixSortPolicy
{
    //------------------------------------------------------------------------------
    // Constants
    //------------------------------------------------------------------------------

    enum
    {
        // Whether this is a keys-only (or key-value) sort
        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
    };

    // Dominant-sized key/value type
    typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT;

    //------------------------------------------------------------------------------
    // Architecture-specific tuning policies
    //------------------------------------------------------------------------------

    /// SM20
    struct Policy200 : ChainedPolicy<200, Policy200, Policy200>
    {
        enum {
            PRIMARY_RADIX_BITS  = 5,
            ALT_RADIX_BITS      = PRIMARY_RADIX_BITS - 1,

            // Relative size of KeyT type to a 4-byte word
            SCALE_FACTOR_4B     = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
        };

        // Keys-only upsweep policies
        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys;
        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys;

        // Key-value pairs upsweep policies
        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs;
        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs;

        // Upsweep policies
        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;

        // Scan policy
        typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

        // Keys-only downsweep policies
        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys;

        // Key-value pairs downsweep policies
        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs;
        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs;

        // Downsweep policies
        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;

        // Single-tile policy
        typedef DownsweepPolicy SingleTilePolicy;

        // Segmented policies
        typedef DownsweepPolicy     SegmentedPolicy;
        typedef AltDownsweepPolicy  AltSegmentedPolicy;
    };

    /// SM30
    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
    {
        enum {
            PRIMARY_RADIX_BITS  = 5,
            ALT_RADIX_BITS      = PRIMARY_RADIX_BITS - 1,

            // Relative size of KeyT type to a 4-byte word
            SCALE_FACTOR_4B     = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
        };

        // Keys-only upsweep policies
        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys;
        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys;

        // Key-value pairs upsweep policies
        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs;
        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs;

        // Upsweep policies
        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;

        // Scan policy
        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;

        // Keys-only downsweep policies
        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys;

        // Key-value pairs downsweep policies
        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs;
        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs;

        // Downsweep policies
        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;

        // Single-tile policy
        typedef DownsweepPolicy SingleTilePolicy;

        // Segmented policies
        typedef DownsweepPolicy     SegmentedPolicy;
        typedef AltDownsweepPolicy  AltSegmentedPolicy;
    };


    /// SM35
    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
    {
        enum {
            PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
        };

        // Scan policy
        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;

        // Keys-only downsweep policies
        // ... (typedefs elided in this listing)

        // Key-value pairs downsweep policies
        // ... (typedefs elided in this listing)

        // Downsweep policies
        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;

        // Upsweep policies
        // ... (typedefs elided in this listing)

        // Single-tile policy
        // ... (typedef elided in this listing)

        // Segmented policies
        // ... (typedefs elided in this listing)

    };


    /// SM50
    struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
    {
        enum {
            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.1B 32b segmented keys/s (TitanX)
        };

        // ScanPolicy
        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

        // Downsweep policies
        // ... (typedefs elided in this listing)

        // Upsweep policies
        // ... (typedefs elided in this listing)

        // Single-tile policy
        // ... (typedef elided in this listing)

        // Segmented policies
        // ... (typedefs elided in this listing)
    };


    /// SM60
    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
    {
        enum {
            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)
            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 5.9B 32b segmented keys/s (Quadro P100)
        };

        // ScanPolicy
        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

        // Downsweep policies
        // ... (typedefs elided in this listing)

        // Upsweep policies
        // ... (typedefs elided in this listing)

        // Single-tile policy
        // ... (typedef elided in this listing)

        // Segmented policies
        // ... (typedefs elided in this listing)
    };


    /// SM61
    struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
    {
        enum {
            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.3B 32b segmented keys/s (1080)
        };

        // ScanPolicy
        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

        // Downsweep policies
        // ... (typedefs elided in this listing)

        // Upsweep policies
        // ... (typedefs elided in this listing)

        // Single-tile policy
        // ... (typedef elided in this listing)

        // Segmented policies
        // ... (typedefs elided in this listing)
    };


    /// SM62
    struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
    {
        enum {
            PRIMARY_RADIX_BITS  = 5,
            ALT_RADIX_BITS      = PRIMARY_RADIX_BITS - 1,
        };

        // ScanPolicy
        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

        // Downsweep policies
        // ... (typedefs elided in this listing)

        // Upsweep policies
        // ... (typedefs elided in this listing)

        // Single-tile policy
        // ... (typedef elided in this listing)

        // Segmented policies
        // ... (typedefs elided in this listing)
    };


    /// SM70
    struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
    {
        enum {
            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 7.62B 32b keys/s (GV100)
            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 8.7B 32b segmented keys/s (GV100)
        };

        // ScanPolicy
        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;

        // Downsweep policies
        // ... (typedefs elided in this listing)

        // Upsweep policies
        // ... (typedefs elided in this listing)

        // Single-tile policy
        // ... (typedef elided in this listing)

        // Segmented policies
        // ... (typedefs elided in this listing)
    };


    /// MaxPolicy
    typedef Policy700 MaxPolicy;

};
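
// How the policy chain resolves (a sketch of the ChainedPolicy mechanism): each
// PolicyXXX above inherits ChainedPolicy<XXX, PolicyXXX, PrevPolicy>, and
// MaxPolicy::Invoke(ptx_version, dispatch) walks the chain from Policy700
// downward, selecting the first policy whose architecture does not exceed the
// device's PTX version and invoking dispatch.template Invoke<PolicyXXX>() with
// that policy exposed as ChainedPolicyT::ActivePolicy.  For example, a device
// reporting ptx_version 610 skips Policy700 and Policy620 and runs Policy610.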


/******************************************************************************
 * Single-problem dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort
 */
template <
    bool     IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
    typename KeyT,          ///< Key type
    typename ValueT,        ///< Value type
    typename OffsetT>       ///< Signed integer type for global offsets
struct DispatchRadixSort :
    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
{
    //------------------------------------------------------------------------------
    // Constants
    //------------------------------------------------------------------------------

    enum
    {
        // Whether this is a keys-only (or key-value) sort
        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
    };


    //------------------------------------------------------------------------------
    // Problem state
    //------------------------------------------------------------------------------

    void                    *d_temp_storage;        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
    OffsetT                 num_items;              ///< [in] Number of items to sort
    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    int                     ptx_version;            ///< [in] PTX version
    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers


    //------------------------------------------------------------------------------
    // Constructor
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchRadixSort(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        DoubleBuffer<KeyT>      &d_keys,
        DoubleBuffer<ValueT>    &d_values,
        OffsetT                 num_items,
        int                     begin_bit,
        int                     end_bit,
        bool                    is_overwrite_okay,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_keys(d_keys),
        d_values(d_values),
        num_items(num_items),
        begin_bit(begin_bit),
        end_bit(end_bit),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version),
        is_overwrite_okay(is_overwrite_okay)
    {}


    //------------------------------------------------------------------------------
    // Small-problem (single tile) invocation
    //------------------------------------------------------------------------------

    /// Invoke a single block to sort in-core
    template <
        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
        typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokeSingleTile(
        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)single_tile_kernel;
        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else
        cudaError error = cudaSuccess;
        do
        {
            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                temp_storage_bytes = 1;
                break;
            }

            // Return if empty problem
            if (num_items == 0)
                break;

            // Log single_tile_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

            // Invoke single_tile_kernel
            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
                d_keys.Current(),
                d_keys.Alternate(),
                d_values.Current(),
                d_values.Alternate(),
                num_items,
                begin_bit,
                end_bit);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update selector
            d_keys.selector ^= 1;
            d_values.selector ^= 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------
    // Normal problem size invocation
    //------------------------------------------------------------------------------

    /// Invoke a three-kernel sorting pass at the current bit
    template <typename PassConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePass(
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        OffsetT         *d_spine,
        int             spine_length,
        int             &current_bit,
        PassConfigT     &pass_config)
    {
        cudaError error = cudaSuccess;
        do
        {
            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

            // Log upsweep_kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
                    pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

            // Invoke upsweep_kernel with same grid size as downsweep_kernel
            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
                d_keys_in,
                d_spine,
                num_items,
                current_bit,
                pass_bits,
                pass_config.even_share);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log scan_kernel configuration
            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

            // Invoke scan_kernel
            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
                d_spine,
                spine_length);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Log downsweep_kernel configuration
            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);

            // Invoke downsweep_kernel
            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
                d_keys_in,
                d_keys_out,
                d_values_in,
                d_values_out,
                d_spine,
                num_items,
                current_bit,
                pass_bits,
                pass_config.even_share);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update current bit
            current_bit += pass_bits;
        }
        while (0);

        return error;
    }



    /// Pass configuration structure
    template <
        typename UpsweepKernelT,
        typename ScanKernelT,
        typename DownsweepKernelT>
    struct PassConfig
    {
        UpsweepKernelT          upsweep_kernel;
        KernelConfig            upsweep_config;
        ScanKernelT             scan_kernel;
        KernelConfig            scan_config;
        DownsweepKernelT        downsweep_kernel;
        KernelConfig            downsweep_config;
        int                     radix_bits;
        int                     radix_digits;
        int                     max_downsweep_grid_size;
        GridEvenShare<OffsetT>  even_share;

        /// Initialize pass configuration
        template <
            typename UpsweepPolicyT,
            typename ScanPolicyT,
            typename DownsweepPolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        cudaError_t InitPassConfig(
            UpsweepKernelT      upsweep_kernel,
            ScanKernelT         scan_kernel,
            DownsweepKernelT    downsweep_kernel,
            int                 ptx_version,
            int                 sm_count,
            int                 num_items)
        {
            cudaError error = cudaSuccess;
            do
            {
                this->upsweep_kernel    = upsweep_kernel;
                this->scan_kernel       = scan_kernel;
                this->downsweep_kernel  = downsweep_kernel;
                radix_bits              = DownsweepPolicyT::RADIX_BITS;
                radix_digits            = 1 << radix_bits;

                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;

                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);

                even_share.DispatchInit(
                    num_items,
                    max_downsweep_grid_size,
                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));

            }
            while (0);
            return error;
        }

    };


    /// Invocation (run multiple digit passes)
    template <
        typename            ActivePolicyT,          ///< Umbrella policy active for the target device
        typename            UpsweepKernelT,         ///< Function type of cub::DeviceRadixSortUpsweepKernel
        typename            ScanKernelT,            ///< Function type of cub::RadixSortScanBinsKernel
        typename            DownsweepKernelT>       ///< Function type of cub::DeviceRadixSortDownsweepKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
        UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
        ScanKernelT         scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::RadixSortScanBinsKernel
        DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
        DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)upsweep_kernel;
        (void)alt_upsweep_kernel;
        (void)scan_kernel;
        (void)downsweep_kernel;
        (void)alt_downsweep_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else

        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Init regular and alternate-digit kernel configurations
            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
            if ((error = pass_config.template InitPassConfig<
                    typename ActivePolicyT::UpsweepPolicy,
                    typename ActivePolicyT::ScanPolicy,
                    typename ActivePolicyT::DownsweepPolicy>(
                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;

            if ((error = alt_pass_config.template InitPassConfig<
                    typename ActivePolicyT::AltUpsweepPolicy,
                    typename ActivePolicyT::ScanPolicy,
                    typename ActivePolicyT::AltDownsweepPolicy>(
                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;

            // Get maximum spine length
            int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
            int spine_length  = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;

            // Temporary storage allocation requirements
            void* allocations[3];
            size_t allocation_sizes[3] =
            {
                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms
                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
                return cudaSuccess;

            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
            int num_bits            = end_bit - begin_bit;
            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
            bool is_num_passes_odd  = num_passes & 1;
            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
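
            // A worked example of the pass plan (illustrative numbers, not from the
            // original source): for 32-bit keys with radix_bits = 7 and an alternate
            // digit size of 6 bits, num_passes = ceil(32 / 7) = 5 and
            // max_alt_passes = 5*7 - 32 = 3, so alt_end_bit = begin_bit + 3*6 = 18.
            // The first three passes each consume 6 bits and the last two consume
            // 7 bits each: 3*6 + 2*7 = 32 bits total, with no wasted digit places.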

            // Alias the temporary storage allocations
            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);

            DoubleBuffer<KeyT> d_keys_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());

            DoubleBuffer<ValueT> d_values_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());

            // Run first pass, consuming from the input's current buffers
            int current_bit = begin_bit;
            if (CubDebug(error = InvokePass(
                d_keys.Current(), d_keys_remaining_passes.Current(),
                d_values.Current(), d_values_remaining_passes.Current(),
                d_spine, spine_length, current_bit,
                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

            // Run remaining passes
            while (current_bit < end_bit)
            {
                if (CubDebug(error = InvokePass(
                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_spine, spine_length, current_bit,
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

                // Invert selectors
                d_keys_remaining_passes.selector ^= 1;
                d_values_remaining_passes.selector ^= 1;
            }

            // Update selector
            if (!is_overwrite_okay) {
                num_passes = 1; // Sorted data always ends up in the other vector
            }

            d_keys.selector = (d_keys.selector + num_passes) & 1;
            d_values.selector = (d_values.selector + num_passes) & 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
        typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT;

        // Force kernel code-generation in all compiler passes
        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
        {
            // Small, single tile size
            return InvokeSingleTile<ActivePolicyT>(
                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
        }
        else
        {
            // Regular size
            return InvokePasses<ActivePolicyT>(
                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,  IS_DESCENDING, KeyT, OffsetT>,
                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,   IS_DESCENDING, KeyT, OffsetT>,
                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,  IS_DESCENDING, KeyT, ValueT, OffsetT>,
                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,   IS_DESCENDING, KeyT, ValueT, OffsetT>);
        }
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------

    /// Internal dispatch routine
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,         ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        OffsetT                 num_items,              ///< [in] Number of items to sort
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;

        cudaError_t error;
        do {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchRadixSort dispatch(
                d_temp_storage, temp_storage_bytes,
                d_keys, d_values,
                num_items, begin_bit, end_bit, is_overwrite_okay,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        } while (0);

        return error;
    }
};



/******************************************************************************
 * Segmented dispatch
 ******************************************************************************/

/**
 * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort
 */
template <
    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
    typename KeyT,              ///< Key type
    typename ValueT,            ///< Value type
    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets
    typename OffsetT>           ///< Signed integer type for global offsets
struct DispatchSegmentedRadixSort :
    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
{
    //------------------------------------------------------------------------------
    // Constants
    //------------------------------------------------------------------------------

    enum
    {
        // Whether this is a keys-only (or key-value) sort
        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
    };


    //------------------------------------------------------------------------------
    // Parameter members
    //------------------------------------------------------------------------------

    void                    *d_temp_storage;        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
    OffsetT                 num_items;              ///< [in] Number of items to sort
    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
    OffsetIteratorT         d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length num_segments
    OffsetIteratorT         d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length num_segments
    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    int                     ptx_version;            ///< [in] PTX version
    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers


    //------------------------------------------------------------------------------
    // Constructors
    //------------------------------------------------------------------------------

    /// Constructor
    CUB_RUNTIME_FUNCTION __forceinline__
    DispatchSegmentedRadixSort(
        void*                   d_temp_storage,
        size_t                  &temp_storage_bytes,
        DoubleBuffer<KeyT>      &d_keys,
        DoubleBuffer<ValueT>    &d_values,
        OffsetT                 num_items,
        OffsetT                 num_segments,
        OffsetIteratorT         d_begin_offsets,
        OffsetIteratorT         d_end_offsets,
        int                     begin_bit,
        int                     end_bit,
        bool                    is_overwrite_okay,
        cudaStream_t            stream,
        bool                    debug_synchronous,
        int                     ptx_version)
    :
        d_temp_storage(d_temp_storage),
        temp_storage_bytes(temp_storage_bytes),
        d_keys(d_keys),
        d_values(d_values),
        num_items(num_items),
        num_segments(num_segments),
        d_begin_offsets(d_begin_offsets),
        d_end_offsets(d_end_offsets),
        begin_bit(begin_bit),
        end_bit(end_bit),
        is_overwrite_okay(is_overwrite_okay),
        stream(stream),
        debug_synchronous(debug_synchronous),
        ptx_version(ptx_version)
    {}


    //------------------------------------------------------------------------------
    // Multi-segment invocation
    //------------------------------------------------------------------------------

    /// Invoke a three-kernel sorting pass at the current bit
    template <typename PassConfigT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePass(
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        int             &current_bit,
        PassConfigT     &pass_config)
    {
        cudaError error = cudaSuccess;
        do
        {
            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));

            // Log kernel configuration
            if (debug_synchronous)
                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
                    num_segments, pass_config.segmented_config.block_threads, (long long) stream,
                    pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);

            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
                d_keys_in, d_keys_out,
                d_values_in, d_values_out,
                d_begin_offsets, d_end_offsets, num_segments,
                current_bit, pass_bits);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Update current bit
            current_bit += pass_bits;
        }
        while (0);

        return error;
    }


    /// PassConfig data structure
    template <typename SegmentedKernelT>
    struct PassConfig
    {
        SegmentedKernelT    segmented_kernel;
        KernelConfig        segmented_config;
        int                 radix_bits;
        int                 radix_digits;

        /// Initialize pass configuration
        template <typename SegmentedPolicyT>
        CUB_RUNTIME_FUNCTION __forceinline__
        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
        {
            this->segmented_kernel  = segmented_kernel;
            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;
            this->radix_digits      = 1 << radix_bits;

            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
        }
    };


    /// Invocation (run multiple digit passes)
    template <
        typename            ActivePolicyT,          ///< Umbrella policy active for the target device
        typename            SegmentedKernelT>       ///< Function type of cub::DeviceSegmentedRadixSortKernel
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        SegmentedKernelT    segmented_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
        SegmentedKernelT    alt_segmented_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)segmented_kernel;
        (void)alt_segmented_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);
#else

        cudaError error = cudaSuccess;
        do
        {
            // Init regular and alternate kernel configurations
            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
            if ((error = pass_config.template InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
            if ((error = alt_pass_config.template InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;

            // Temporary storage allocation requirements
            void* allocations[2];
            size_t allocation_sizes[2] =
            {
                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

            // Return if the caller is simply requesting the size of the storage allocation
            if (d_temp_storage == NULL)
            {
                if (temp_storage_bytes == 0)
                    temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
            int num_bits            = end_bit - begin_bit;
            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
            bool is_num_passes_odd  = num_passes & 1;
            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));

            DoubleBuffer<KeyT> d_keys_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());

            DoubleBuffer<ValueT> d_values_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());

            // Run first pass, consuming from the input's current buffers
            int current_bit = begin_bit;

            if (CubDebug(error = InvokePass(
                d_keys.Current(), d_keys_remaining_passes.Current(),
                d_values.Current(), d_values_remaining_passes.Current(),
                current_bit,
                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

            // Run remaining passes
            while (current_bit < end_bit)
            {
                if (CubDebug(error = InvokePass(
                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    current_bit,
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

                // Invert selectors and update current bit
                d_keys_remaining_passes.selector ^= 1;
                d_values_remaining_passes.selector ^= 1;
            }

            // Update selector
            if (!is_overwrite_okay) {
                num_passes = 1; // Sorted data always ends up in the other vector
            }

            d_keys.selector = (d_keys.selector + num_passes) & 1;
            d_values.selector = (d_values.selector + num_passes) & 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }


    //------------------------------------------------------------------------------
    // Chained policy invocation
    //------------------------------------------------------------------------------

    /// Invocation
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
    }


    //------------------------------------------------------------------------------
    // Dispatch entrypoints
    //------------------------------------------------------------------------------

    /// Internal dispatch routine
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*                   d_temp_storage,         ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to temp_storage_bytes and no work is done.
        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of d_temp_storage allocation
        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        int                     num_items,              ///< [in] Number of items to sort
        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
        OffsetIteratorT         d_begin_offsets,        ///< [in] Pointer to the sequence of beginning offsets of length num_segments
        OffsetIteratorT         d_end_offsets,          ///< [in] Pointer to the sequence of ending offsets of length num_segments
        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream 0.
        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors
    {
        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;

        cudaError_t error;
        do {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Create dispatch functor
            DispatchSegmentedRadixSort dispatch(
                d_temp_storage, temp_storage_bytes,
                d_keys, d_values,
                num_items, num_segments, d_begin_offsets, d_end_offsets,
                begin_bit, end_bit, is_overwrite_okay,
                stream, debug_synchronous, ptx_version);

            // Dispatch to chained policy
            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;

        } while (0);

        return error;
    }
};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

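For orientation, here is a minimal host-side sketch of how this dispatch layer is normally reached through the public cub::DeviceRadixSort front-end. The key values and buffer contents are illustrative assumptions; the two-call pattern (the first call only sizes the temporary storage, the second call sorts) and the DoubleBuffer selector behavior are as implemented by the dispatch code above.

#include <cub/cub.cuh>   // public front-end over the dispatch layer in this file

int main()
{
    const int num_items = 7;
    unsigned int h_keys[num_items] = {8, 6, 7, 5, 3, 0, 9};   // illustrative input

    // Device ping-pong buffers for the multi-pass sort
    unsigned int *d_key_buf, *d_key_alt_buf;
    cudaMalloc(&d_key_buf,     num_items * sizeof(unsigned int));
    cudaMalloc(&d_key_alt_buf, num_items * sizeof(unsigned int));
    cudaMemcpy(d_key_buf, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);

    // DoubleBuffer wraps the pair; its selector tracks which buffer holds valid data
    cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt_buf);

    // First call with d_temp_storage == NULL only writes temp_storage_bytes
    void   *d_temp_storage    = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);

    // Second call performs the sort (DispatchRadixSort::Dispatch underneath)
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);

    // d_keys.Current() now points at the buffer holding the sorted keys
    cudaMemcpy(h_keys, d_keys.Current(), sizeof(h_keys), cudaMemcpyDeviceToHost);

    cudaFree(d_key_buf); cudaFree(d_key_alt_buf); cudaFree(d_temp_storage);
    return 0;
}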
1619