OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
dispatch_radix_sort.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "../../agent/agent_radix_sort_upsweep.cuh"
41 #include "../../agent/agent_radix_sort_downsweep.cuh"
42 #include "../../agent/agent_scan.cuh"
43 #include "../../block/block_radix_sort.cuh"
44 #include "../../grid/grid_even_share.cuh"
45 #include "../../util_type.cuh"
46 #include "../../util_debug.cuh"
47 #include "../../util_device.cuh"
48 #include "../../util_namespace.cuh"
49 
51 CUB_NS_PREFIX
52 
54 namespace cub {
55 
56 /******************************************************************************
57  * Kernel entry points
58  *****************************************************************************/
59 
/**
 * Upsweep digit-counting kernel entry point (multi-block).
 * Computes privatized per-block digit histograms for the current radix digit
 * place and writes them to d_spine in striped order.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    ALT_DIGIT_BITS,                 ///< Whether to use the alternate (lower-bits) policy
    bool                    IS_DESCENDING,                  ///< Whether the sorted order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
__global__ void DeviceRadixSortUpsweepKernel(
    const KeyT              *d_keys,                        ///< [in] Input keys buffer
    OffsetT                 *d_spine,                       ///< [out] Privatized (per-block) digit histograms (striped: all 0s counts, then all 1s counts, ...)
    OffsetT                 /*num_items*/,                  ///< [in] Total number of input items (unused; even_share carries the per-block range)
    int                     current_bit,                    ///< [in] Bit position of the current radix digit
    int                     num_bits,                       ///< [in] Number of bits in the current radix digit
    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor mapping an equal share of tiles onto each block
{
    enum {
        // NOTE(review): tile size is always taken from the Alt policy, even when
        // ALT_DIGIT_BITS is false — this mirrors the downsweep kernel so both
        // passes walk the identical even-share tiling; verify both policies
        // share the same tile granularity before changing.
        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
    };

    // Parameterize AgentRadixSortUpsweep type for the current configuration
    typedef AgentRadixSortUpsweep<
            typename If<(ALT_DIGIT_BITS),
                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
            KeyT,
            OffsetT>
        AgentRadixSortUpsweepT;

    // Shared memory storage
    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;

    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();

    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);

    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);

    CTA_SYNC();

    // Write out digit counts (striped)
    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
}
109 
110 
/**
 * Spine-scan kernel entry point (single block).
 * Computes an exclusive prefix sum over the privatized digit histograms,
 * in place, turning counts into global scatter base offsets.
 */
template <
    typename    ChainedPolicyT,     ///< Chained tuning policy
    typename    OffsetT>            ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortScanBinsKernel(
    OffsetT     *d_spine,           ///< [in,out] Privatized (per-block) digit histograms, scanned in place
    int         num_counts)         ///< [in] Total number of bin counts
{
    // Parameterize the AgentScan type for the current configuration
    typedef AgentScan<
            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
            OffsetT*,
            OffsetT*,
            cub::Sum,
            OffsetT,
            OffsetT>
        AgentScanT;

    // Shared memory storage
    __shared__ typename AgentScanT::TempStorage temp_storage;

    // Block scan instance (input and output both alias d_spine)
    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0));

    // Process full input tiles, carrying the running aggregate across tiles.
    // (This declaration was dropped by the doc extraction; restored.)
    int block_offset = 0;
    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
    {
        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
        block_offset += AgentScanT::TILE_ITEMS;
    }
}
147 
148 
/**
 * Downsweep scatter kernel entry point (multi-block).
 * Uses the scanned spine of digit offsets to scatter keys (and values) into
 * their destination bins for the current radix digit place.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    ALT_DIGIT_BITS,                 ///< Whether to use the alternate (lower-bits) policy
    bool                    IS_DESCENDING,                  ///< Whether the sorted order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                ValueT,                         ///< Value type
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
__global__ void DeviceRadixSortDownsweepKernel(
    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
    KeyT                    *d_keys_out,                    ///< [out] Output keys buffer
    const ValueT            *d_values_in,                   ///< [in] Input values buffer
    ValueT                  *d_values_out,                  ///< [out] Output values buffer
    OffsetT                 *d_spine,                       ///< [in] Scanned digit offsets (striped)
    OffsetT                 num_items,                      ///< [in] Total number of input data items
    int                     current_bit,                    ///< [in] Bit position of the current radix digit
    int                     num_bits,                       ///< [in] Number of bits in the current radix digit
    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor mapping an equal share of tiles onto each block
{
    enum {
        // Same tile granularity expression as the upsweep kernel so both passes
        // walk the identical even-share tiling of the input
        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
    };

    // Parameterize AgentRadixSortDownsweep type for the current configuration
    typedef AgentRadixSortDownsweep<
            typename If<(ALT_DIGIT_BITS),
                typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
                typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,
            IS_DESCENDING,
            KeyT,
            ValueT,
            OffsetT>
        AgentRadixSortDownsweepT;

    // Shared memory storage
    __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage;

    // Initialize even-share descriptor for this thread block
    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();

    // Process input tiles
    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
        even_share.block_offset,
        even_share.block_end);
}
200 
201 
/**
 * Single-block kernel entry point: sorts an entire (small) problem in one tile.
 * Loads all items, radix-sorts them in shared memory, and stores the result.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    IS_DESCENDING,                  ///< Whether the sorted order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                ValueT,                         ///< Value type
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceRadixSortSingleTileKernel(
    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
    KeyT                    *d_keys_out,                    ///< [out] Output keys buffer
    const ValueT            *d_values_in,                   ///< [in] Input values buffer
    ValueT                  *d_values_out,                  ///< [out] Output values buffer
    OffsetT                 num_items,                      ///< [in] Total number of input data items
    int                     current_bit,                    ///< [in] Bit position of the current (least-significant) radix digit
    int                     end_bit)                        ///< [in] Past-the-end (most-significant) bit index needed for key comparison
{
    // Constants
    enum
    {
        BLOCK_THREADS       = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,
        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
    };

    // BlockRadixSort type
    typedef BlockRadixSort<
            KeyT,
            BLOCK_THREADS,
            ITEMS_PER_THREAD,
            ValueT,
            ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,
            (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
            ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>
        BlockRadixSortT;

    // BlockLoad type (keys)
    typedef BlockLoad<
        KeyT,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;

    // BlockLoad type (values)
    typedef BlockLoad<
        ValueT,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;

    // Unsigned word for key bits
    typedef typename Traits<KeyT>::UnsignedBits UnsignedBitsT;

    // Shared memory storage (union: the three uses are separated by CTA_SYNC barriers)
    __shared__ union TempStorage
    {
        typename BlockRadixSortT::TempStorage   sort;
        typename BlockLoadKeys::TempStorage     load_keys;
        typename BlockLoadValues::TempStorage   load_values;

    } temp_storage;

    // Keys and values for the block
    KeyT    keys[ITEMS_PER_THREAD];
    ValueT  values[ITEMS_PER_THREAD];

    // Get default (min/max) value for out-of-bounds keys so padding sorts to the end
    UnsignedBitsT default_key_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;
    KeyT default_key = reinterpret_cast<KeyT&>(default_key_bits);

    // Load keys
    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key);

    CTA_SYNC();

    // Load values
    if (!KEYS_ONLY)
    {
        // Register pressure work-around: moving num_items through shfl prevents compiler
        // from reusing guards/addressing from prior guarded loads
        num_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(num_items, 0, 0xffffffff);

        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);

        CTA_SYNC();
    }

    // Sort tile over the full [current_bit, end_bit) digit range
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
        keys,
        values,
        current_bit,
        end_bit,
        Int2Type<IS_DESCENDING>(),
        Int2Type<KEYS_ONLY>());

    // Store keys and values (striped arrangement), guarding the tail
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
        if (item_offset < num_items)
        {
            d_keys_out[item_offset] = keys[ITEM];
            if (!KEYS_ONLY)
                d_values_out[item_offset] = values[ITEM];
        }
    }
}
313 
314 
/**
 * Segmented radix-sort kernel entry point: one thread block per segment.
 * Each block performs upsweep (digit counting), a block-wide exclusive scan of
 * the digit counts, and downsweep (scatter) over its own segment.
 */
template <
    typename                ChainedPolicyT,                 ///< Chained tuning policy
    bool                    ALT_DIGIT_BITS,                 ///< Whether to use the alternate (lower-bits) policy
    bool                    IS_DESCENDING,                  ///< Whether the sorted order is high-to-low
    typename                KeyT,                           ///< Key type
    typename                ValueT,                         ///< Value type
    typename                OffsetIteratorT,                ///< Random-access iterator type for segment offsets
    typename                OffsetT>                        ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedRadixSortKernel(
    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
    KeyT                    *d_keys_out,                    ///< [out] Output keys buffer
    const ValueT            *d_values_in,                   ///< [in] Input values buffer
    ValueT                  *d_values_out,                  ///< [out] Output values buffer
    OffsetIteratorT         d_begin_offsets,                ///< [in] Sequence of segment begin-offsets
    OffsetIteratorT         d_end_offsets,                  ///< [in] Sequence of segment end-offsets
    int                     /*num_segments*/,               ///< [in] Number of segments (unused; gridDim.x spans the segments)
    int                     current_bit,                    ///< [in] Bit position of the current radix digit
    int                     pass_bits)                      ///< [in] Number of bits in the current radix digit
{
    //
    // Constants
    //

    typedef typename If<(ALT_DIGIT_BITS),
        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;

    enum
    {
        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,
        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,
        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
        RADIX_DIGITS        = 1 << RADIX_BITS,
        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
    };

    // Upsweep type
    typedef AgentRadixSortUpsweep<
            SegmentedPolicyT,
            KeyT,
            OffsetT>
        BlockUpsweepT;

    // Digit-scan type
    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;

    // Downsweep type
    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;

    enum
    {
        /// Number of bin-starting offsets tracked per thread
        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD
    };

    //
    // Process input tiles
    //

    // Shared memory storage (union: phases are separated by CTA_SYNC barriers)
    __shared__ union
    {
        typename BlockUpsweepT::TempStorage     upsweep;
        typename BlockDownsweepT::TempStorage   downsweep;
        struct
        {
            volatile OffsetT                    reverse_counts_in[RADIX_DIGITS];
            volatile OffsetT                    reverse_counts_out[RADIX_DIGITS];
            typename DigitScanT::TempStorage    scan;
        };

    } temp_storage;

    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
    OffsetT segment_end     = d_end_offsets[blockIdx.x];
    OffsetT num_items       = segment_end - segment_begin;

    // Check if empty segment
    if (num_items <= 0)
        return;

    // Upsweep
    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);
    upsweep.ProcessRegion(segment_begin, segment_end);

    CTA_SYNC();

    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)
    OffsetT bin_count[BINS_TRACKED_PER_THREAD];
    upsweep.ExtractCounts(bin_count);

    CTA_SYNC();

    if (IS_DESCENDING)
    {
        // Reverse bin counts
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
        }

        CTA_SYNC();

        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
        }
    }

    // Scan
    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];    // Global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)
    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);

    #pragma unroll
    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
    {
        bin_offset[track] += segment_begin;
    }

    if (IS_DESCENDING)
    {
        // Reverse bin offsets
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            // NOTE(review): the write is indexed by threadIdx.x while the mirrored
            // read below uses bin_idx; these agree only when BINS_TRACKED_PER_THREAD
            // == 1 (i.e. BLOCK_THREADS == RADIX_DIGITS). Kept as in the original —
            // confirm against upstream before changing.
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track];
        }

        CTA_SYNC();

        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
        }
    }

    CTA_SYNC();

    // Downsweep
    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);
    downsweep.ProcessRegion(segment_begin, segment_end);
}
479 
480 
481 
482 /******************************************************************************
483  * Policy
484  ******************************************************************************/
485 
489 template <
490  typename KeyT,
491  typename ValueT,
492  typename OffsetT>
494 {
495  //------------------------------------------------------------------------------
496  // Constants
497  //------------------------------------------------------------------------------
498 
499  enum
500  {
501  // Whether this is a keys-only (or key-value) sort
502  KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
503  };
504 
505  // Dominant-sized key/value type
506  typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT;
507 
508  //------------------------------------------------------------------------------
509  // Architecture-specific tuning policies
510  //------------------------------------------------------------------------------
511 
// Tuning policies for PTX arch 200 (SM 2.0).
513  struct Policy200 : ChainedPolicy<200, Policy200, Policy200>
514  {
515  enum {
516  PRIMARY_RADIX_BITS = 5,
517  ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1,
518 
519  // Relative size of KeyT type to a 4-byte word
520  SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
521  };
522 
523  // Keys-only upsweep policies
524  typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys;
525  typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys;
526 
527  // Key-value pairs upsweep policies
528  typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs;
529  typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs;
530 
531  // Upsweep policies
// NOTE(review): doc extraction dropped original lines 532-533 here (the
// UpsweepPolicy / AltUpsweepPolicy selection typedefs) — restore from the original source.
534 
535  // Scan policy
// NOTE(review): original line 536 (the ScanPolicy typedef) was dropped — restore from the original source.
537 
538  // Keys-only downsweep policies
539  typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
540  typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys;
541 
542  // Key-value pairs downsweep policies
543  typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs;
544  typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs;
545 
546  // Downsweep policies
// NOTE(review): original lines 547-548 (DownsweepPolicy / AltDownsweepPolicy selection typedefs) were dropped — restore from the original source.
549 
550  // Single-tile policy
// NOTE(review): original line 551 (SingleTilePolicy typedef) was dropped — restore from the original source.
552 
553  // Segmented policies
// NOTE(review): original lines 554-555 (SegmentedPolicy / AltSegmentedPolicy typedefs) were dropped — restore from the original source.
556  };
557 
// Tuning policies for PTX arch 300 (SM 3.0).
559  struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
560  {
561  enum {
562  PRIMARY_RADIX_BITS = 5,
563  ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1,
564 
565  // Relative size of KeyT type to a 4-byte word
566  SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
567  };
568 
569  // Keys-only upsweep policies
570  typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys;
571  typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys;
572 
573  // Key-value pairs upsweep policies
574  typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs;
575  typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs;
576 
577  // Upsweep policies
// NOTE(review): doc extraction dropped original lines 578-579 here (the
// UpsweepPolicy / AltUpsweepPolicy selection typedefs) — restore from the original source.
580 
581  // Scan policy
// NOTE(review): original line 582 (the ScanPolicy typedef) was dropped — restore from the original source.
583 
584  // Keys-only downsweep policies
585  typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
586  typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys;
587 
588  // Key-value pairs downsweep policies
589  typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs;
590  typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs;
591 
592  // Downsweep policies
// NOTE(review): original lines 593-594 (DownsweepPolicy / AltDownsweepPolicy selection typedefs) were dropped — restore from the original source.
595 
596  // Single-tile policy
// NOTE(review): original line 597 (SingleTilePolicy typedef) was dropped — restore from the original source.
598 
599  // Segmented policies
// NOTE(review): original lines 600-601 (SegmentedPolicy / AltSegmentedPolicy typedefs) were dropped — restore from the original source.
602  };
603 
604 
// Tuning policies for PTX arch 350 (SM 3.5).
606  struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
607  {
608  enum {
609  PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
610  };
611 
612  // Scan policy
// NOTE(review): original line 613 (the ScanPolicy typedef) was dropped by the doc extraction — restore from the original source.
614 
615  // Keys-only downsweep policies
// NOTE(review): original line 616 (the primary DownsweepPolicyKeys typedef) was dropped — restore from the original source.
617  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(64, 18, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
618 
619  // Key-value pairs downsweep policies
// NOTE(review): original line 620 (the primary DownsweepPolicyPairs typedef) was dropped — restore from the original source.
621  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(128, 15, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
622 
623  // Downsweep policies
// NOTE(review): original lines 624-625 (DownsweepPolicy / AltDownsweepPolicy selection typedefs) were dropped — restore from the original source.
626 
627  // Upsweep policies
// NOTE(review): original lines 628-629 (UpsweepPolicy / AltUpsweepPolicy typedefs) were dropped — restore from the original source.
630 
631  // Single-tile policy
// NOTE(review): original line 632 (SingleTilePolicy typedef) was dropped — restore from the original source.
633 
634  // Segmented policies
// NOTE(review): original lines 635-636 (SegmentedPolicy / AltSegmentedPolicy typedefs) were dropped — restore from the original source.
637 
638 
639  };
640 
641 
// Tuning policies for PTX arch 500 (SM 5.0, Maxwell).
643  struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
644  {
645  enum {
646  PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
647  SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
648  SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.1B 32b segmented keys/s (TitanX)
649  };
650 
651  // ScanPolicy
// NOTE(review): original line 652 (the ScanPolicy typedef) was dropped by the doc extraction — restore from the original source.
653 
654  // Downsweep policies
655  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(160, 39, DominantT), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy;
656  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy;
657 
658  // Upsweep policies
// NOTE(review): original lines 659-660 (UpsweepPolicy / AltUpsweepPolicy typedefs) were dropped — restore from the original source.
661 
662  // Single-tile policy
663  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
664 
665  // Segmented policies
666  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 31, DominantT), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy;
667  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 11, DominantT), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
668  };
669 
670 
// Tuning policies for PTX arch 600 (SM 6.0, Pascal).
672  struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
673  {
674  enum {
675  PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100)
676  SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
677  SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100)
678  };
679 
680  // ScanPolicy
// NOTE(review): original line 681 (the ScanPolicy typedef) was dropped by the doc extraction — restore from the original source.
682 
683  // Downsweep policies
684  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy;
685  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy;
686 
687  // Upsweep policies
// NOTE(review): original lines 688-689 (UpsweepPolicy / AltUpsweepPolicy typedefs) were dropped — restore from the original source.
690 
691  // Single-tile policy
692  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
693 
694  // Segmented policies
695  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy;
696  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
697 
698  };
699 
700 
// Tuning policies for PTX arch 610 (SM 6.1).
702  struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
703  {
704  enum {
705  PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
706  SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
707  SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.3B 32b segmented keys/s (1080)
708  };
709 
710  // ScanPolicy
// NOTE(review): original line 711 (the ScanPolicy typedef) was dropped by the doc extraction — restore from the original source.
712 
713  // Downsweep policies
714  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 31, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy;
715  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 35, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy;
716 
717  // Upsweep policies
718  typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicy;
719  typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy;
720 
721  // Single-tile policy
722  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
723 
724  // Segmented policies
725  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy;
726  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
727  };
728 
729 
// Tuning policies for PTX arch 620 (SM 6.2).
731  struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
732  {
733  enum {
734  PRIMARY_RADIX_BITS = 5,
735  ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1,
736  };
737 
738  // ScanPolicy
// NOTE(review): original line 739 (the ScanPolicy typedef) was dropped by the doc extraction — restore from the original source.
740 
741  // Downsweep policies
742  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy;
// NOTE(review): original line 743 (the AltDownsweepPolicy typedef) was dropped — restore from the original source.
744 
745  // Upsweep policies
// NOTE(review): original lines 746-747 (UpsweepPolicy / AltUpsweepPolicy typedefs) were dropped — restore from the original source.
748 
749  // Single-tile policy
750  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
751 
752  // Segmented policies
// NOTE(review): original lines 753-754 (SegmentedPolicy / AltSegmentedPolicy typedefs) were dropped — restore from the original source.
755  };
756 
757 
// Tuning policies for PTX arch 700 (SM 7.0, Volta).
759  struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
760  {
761  enum {
762  PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100)
763  SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5,
764  SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 8.7B 32b segmented keys/s (GV100)
765  };
766 
767  // ScanPolicy
// NOTE(review): original line 768 (the ScanPolicy typedef) was dropped by the doc extraction — restore from the original source.
769 
770  // Downsweep policies
771  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy;
772  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy;
773 
774  // Upsweep policies
// NOTE(review): original lines 775-776 (UpsweepPolicy / AltUpsweepPolicy typedefs) were dropped — restore from the original source.
777 
778  // Single-tile policy
779  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
780 
781  // Segmented policies
782  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy;
783  typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
784  };
785 
786 
789 
790 
791 };
792 
793 
794 
795 /******************************************************************************
796  * Single-problem dispatch
797  ******************************************************************************/
798 
802 template <
803  bool IS_DESCENDING,
804  typename KeyT,
805  typename ValueT,
806  typename OffsetT>
808  DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
809 {
810  //------------------------------------------------------------------------------
811  // Constants
812  //------------------------------------------------------------------------------
813 
814  enum
815  {
816  // Whether this is a keys-only (or key-value) sort
817  KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
818  };
819 
820 
821  //------------------------------------------------------------------------------
822  // Problem state
823  //------------------------------------------------------------------------------
824 
830  int begin_bit;
831  int end_bit;
832  cudaStream_t stream;
836 
837 
838  //------------------------------------------------------------------------------
839  // Constructor
840  //------------------------------------------------------------------------------
841 
// Constructor: captures the problem state for subsequent Invoke* calls.
843  CUB_RUNTIME_FUNCTION __forceinline__
// NOTE(review): the doc extraction dropped original line 844 (the constructor name line,
// presumably `DispatchRadixSort(`) and lines 847-849 (likely the d_keys / d_values /
// num_items parameters) — restore from the original source.
845  void* d_temp_storage,
846  size_t &temp_storage_bytes,
850  int begin_bit,
851  int end_bit,
852  bool is_overwrite_okay,
853  cudaStream_t stream,
854  bool debug_synchronous,
855  int ptx_version)
856  :
// NOTE(review): original lines 857-858 and 860-862 (initializer-list entries for the
// dropped members) are also missing — restore from the original source.
859  d_keys(d_keys),
863  end_bit(end_bit),
864  stream(stream),
// NOTE(review): original lines 865-867 (remaining initializer-list entries) are missing.
868  {}
869 
870 
871  //------------------------------------------------------------------------------
872  // Small-problem (single tile) invocation
873  //------------------------------------------------------------------------------
874 
876  template <
877  typename ActivePolicyT,
878  typename SingleTileKernelT>
879  CUB_RUNTIME_FUNCTION __forceinline__
880  cudaError_t InvokeSingleTile(
881  SingleTileKernelT single_tile_kernel)
882  {
883 #ifndef CUB_RUNTIME_ENABLED
884  (void)single_tile_kernel;
885  // Kernel launch not supported from this device
886  return CubDebug(cudaErrorNotSupported );
887 #else
888  cudaError error = cudaSuccess;
889  do
890  {
891  // Return if the caller is simply requesting the size of the storage allocation
892  if (d_temp_storage == NULL)
893  {
894  temp_storage_bytes = 1;
895  break;
896  }
897 
898  // Return if empty problem
899  if (num_items == 0)
900  break;
901 
902  // Log single_tile_kernel configuration
903  if (debug_synchronous)
904  _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
905  1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
906  ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
907 
908  // Invoke upsweep_kernel with same grid size as downsweep_kernel
909  single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
910  d_keys.Current(),
911  d_keys.Alternate(),
912  d_values.Current(),
914  num_items,
915  begin_bit,
916  end_bit);
917 
918  // Check for failure to launch
919  if (CubDebug(error = cudaPeekAtLastError())) break;
920 
921  // Sync the stream if specified to flush runtime errors
922  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
923 
924  // Update selector
925  d_keys.selector ^= 1;
926  d_values.selector ^= 1;
927  }
928  while (0);
929 
930  return error;
931 
932 #endif // CUB_RUNTIME_ENABLED
933  }
934 
935 
936  //------------------------------------------------------------------------------
937  // Normal problem size invocation
938  //------------------------------------------------------------------------------
939 
943  template <typename PassConfigT>
944  CUB_RUNTIME_FUNCTION __forceinline__
945  cudaError_t InvokePass(
946  const KeyT *d_keys_in,
947  KeyT *d_keys_out,
948  const ValueT *d_values_in,
949  ValueT *d_values_out,
950  OffsetT *d_spine,
951  int spine_length,
952  int &current_bit,
953  PassConfigT &pass_config)
954  {
955  cudaError error = cudaSuccess;
956  do
957  {
958  int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
959 
960  // Log upsweep_kernel configuration
961  if (debug_synchronous)
962  _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
963  pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
964  pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
965 
966  // Invoke upsweep_kernel with same grid size as downsweep_kernel
967  pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
968  d_keys_in,
969  d_spine,
970  num_items,
971  current_bit,
972  pass_bits,
973  pass_config.even_share);
974 
975  // Check for failure to launch
976  if (CubDebug(error = cudaPeekAtLastError())) break;
977 
978  // Sync the stream if specified to flush runtime errors
979  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
980 
981  // Log scan_kernel configuration
982  if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
983  1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
984 
985  // Invoke scan_kernel
986  pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
987  d_spine,
988  spine_length);
989 
990  // Check for failure to launch
991  if (CubDebug(error = cudaPeekAtLastError())) break;
992 
993  // Sync the stream if specified to flush runtime errors
994  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
995 
996  // Log downsweep_kernel configuration
997  if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
998  pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
999  pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
1000 
1001  // Invoke downsweep_kernel
1002  pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
1003  d_keys_in,
1004  d_keys_out,
1005  d_values_in,
1006  d_values_out,
1007  d_spine,
1008  num_items,
1009  current_bit,
1010  pass_bits,
1011  pass_config.even_share);
1012 
1013  // Check for failure to launch
1014  if (CubDebug(error = cudaPeekAtLastError())) break;
1015 
1016  // Sync the stream if specified to flush runtime errors
1017  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
1018 
1019  // Update current bit
1021  }
1022  while (0);
1023 
1024  return error;
1025  }
1026 
1027 
1028 
1030  template <
1031  typename UpsweepKernelT,
1032  typename ScanKernelT,
1033  typename DownsweepKernelT>
1034  struct PassConfig
1035  {
1036  UpsweepKernelT upsweep_kernel;
1037  KernelConfig upsweep_config;
1038  ScanKernelT scan_kernel;
1039  KernelConfig scan_config;
1040  DownsweepKernelT downsweep_kernel;
1041  KernelConfig downsweep_config;
1042  int radix_bits;
1043  int radix_digits;
1044  int max_downsweep_grid_size;
1045  GridEvenShare<OffsetT> even_share;
1046 
1048  template <
1049  typename UpsweepPolicyT,
1050  typename ScanPolicyT,
1051  typename DownsweepPolicyT>
1052  CUB_RUNTIME_FUNCTION __forceinline__
1053  cudaError_t InitPassConfig(
1054  UpsweepKernelT upsweep_kernel,
1055  ScanKernelT scan_kernel,
1056  DownsweepKernelT downsweep_kernel,
1057  int ptx_version,
1058  int sm_count,
1059  int num_items)
1060  {
1061  cudaError error = cudaSuccess;
1062  do
1063  {
1064  this->upsweep_kernel = upsweep_kernel;
1065  this->scan_kernel = scan_kernel;
1066  this->downsweep_kernel = downsweep_kernel;
1067  radix_bits = DownsweepPolicyT::RADIX_BITS;
1068  radix_digits = 1 << radix_bits;
1069 
1070  if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
1071  if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
1072  if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
1073 
1074  max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);
1075 
1076  even_share.DispatchInit(
1077  num_items,
1078  max_downsweep_grid_size,
1079  CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
1080 
1081  }
1082  while (0);
1083  return error;
1084  }
1085 
1086  };
1087 
1088 
1090  template <
1091  typename ActivePolicyT,
1092  typename UpsweepKernelT,
1093  typename ScanKernelT,
1094  typename DownsweepKernelT>
1095  CUB_RUNTIME_FUNCTION __forceinline__
1096  cudaError_t InvokePasses(
1097  UpsweepKernelT upsweep_kernel,
1098  UpsweepKernelT alt_upsweep_kernel,
1099  ScanKernelT scan_kernel,
1100  DownsweepKernelT downsweep_kernel,
1101  DownsweepKernelT alt_downsweep_kernel)
1102  {
1103 #ifndef CUB_RUNTIME_ENABLED
1104  (void)upsweep_kernel;
1105  (void)alt_upsweep_kernel;
1106  (void)scan_kernel;
1107  (void)downsweep_kernel;
1108  (void)alt_downsweep_kernel;
1109 
1110  // Kernel launch not supported from this device
1111  return CubDebug(cudaErrorNotSupported );
1112 #else
1113 
1114  cudaError error = cudaSuccess;
1115  do
1116  {
1117  // Get device ordinal
1118  int device_ordinal;
1119  if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
1120 
1121  // Get SM count
1122  int sm_count;
1123  if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
1124 
1125  // Init regular and alternate-digit kernel configurations
1126  PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
1127  if ((error = pass_config.template InitPassConfig<
1128  typename ActivePolicyT::UpsweepPolicy,
1129  typename ActivePolicyT::ScanPolicy,
1130  typename ActivePolicyT::DownsweepPolicy>(
1131  upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
1132 
1133  if ((error = alt_pass_config.template InitPassConfig<
1134  typename ActivePolicyT::AltUpsweepPolicy,
1135  typename ActivePolicyT::ScanPolicy,
1136  typename ActivePolicyT::AltDownsweepPolicy>(
1137  alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
1138 
1139  // Get maximum spine length
1140  int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
1141  int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
1142 
1143  // Temporary storage allocation requirements
1144  void* allocations[3];
1145  size_t allocation_sizes[3] =
1146  {
1147  spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms
1148  (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer
1149  (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer
1150  };
1151 
1152  // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
1153  if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
1154 
1155  // Return if the caller is simply requesting the size of the storage allocation
1156  if (d_temp_storage == NULL)
1157  return cudaSuccess;
1158 
1159  // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
1160  int num_bits = end_bit - begin_bit;
1161  int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
1162  bool is_num_passes_odd = num_passes & 1;
1163  int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
1164  int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
1165 
1166  // Alias the temporary storage allocations
1167  OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
1168 
1169  DoubleBuffer<KeyT> d_keys_remaining_passes(
1170  (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
1171  (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
1172 
1173  DoubleBuffer<ValueT> d_values_remaining_passes(
1174  (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
1175  (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
1176 
1177  // Run first pass, consuming from the input's current buffers
1178  int current_bit = begin_bit;
1179  if (CubDebug(error = InvokePass(
1180  d_keys.Current(), d_keys_remaining_passes.Current(),
1181  d_values.Current(), d_values_remaining_passes.Current(),
1182  d_spine, spine_length, current_bit,
1183  (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
1184 
1185  // Run remaining passes
1186  while (current_bit < end_bit)
1187  {
1188  if (CubDebug(error = InvokePass(
1189  d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
1190  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
1191  d_spine, spine_length, current_bit,
1192  (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
1193 
1194  // Invert selectors
1195  d_keys_remaining_passes.selector ^= 1;
1196  d_values_remaining_passes.selector ^= 1;
1197  }
1198 
1199  // Update selector
1200  if (!is_overwrite_okay) {
1201  num_passes = 1; // Sorted data always ends up in the other vector
1202  }
1203 
1204  d_keys.selector = (d_keys.selector + num_passes) & 1;
1205  d_values.selector = (d_values.selector + num_passes) & 1;
1206  }
1207  while (0);
1208 
1209  return error;
1210 
1211 #endif // CUB_RUNTIME_ENABLED
1212  }
1213 
1214 
1215  //------------------------------------------------------------------------------
1216  // Chained policy invocation
1217  //------------------------------------------------------------------------------
1218 
1220  template <typename ActivePolicyT>
1221  CUB_RUNTIME_FUNCTION __forceinline__
1222  cudaError_t Invoke()
1223  {
1224  typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
1225  typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT;
1226 
1227  // Force kernel code-generation in all compiler passes
1228  if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
1229  {
1230  // Small, single tile size
1231  return InvokeSingleTile<ActivePolicyT>(
1232  DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
1233  }
1234  else
1235  {
1236  // Regular size
1237  return InvokePasses<ActivePolicyT>(
1238  DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>,
1239  DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>,
1240  RadixSortScanBinsKernel< MaxPolicyT, OffsetT>,
1241  DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>,
1242  DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>);
1243  }
1244  }
1245 
1246 
1247  //------------------------------------------------------------------------------
1248  // Dispatch entrypoints
1249  //------------------------------------------------------------------------------
1250 
1254  CUB_RUNTIME_FUNCTION __forceinline__
1255  static cudaError_t Dispatch(
1256  void* d_temp_storage,
1257  size_t &temp_storage_bytes,
1258  DoubleBuffer<KeyT> &d_keys,
1259  DoubleBuffer<ValueT> &d_values,
1260  OffsetT num_items,
1261  int begin_bit,
1262  int end_bit,
1263  bool is_overwrite_okay,
1264  cudaStream_t stream,
1265  bool debug_synchronous)
1266  {
1267  typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
1268 
1269  cudaError_t error;
1270  do {
1271  // Get PTX version
1272  int ptx_version;
1273  if (CubDebug(error = PtxVersion(ptx_version))) break;
1274 
1275  // Create dispatch functor
1276  DispatchRadixSort dispatch(
1277  d_temp_storage, temp_storage_bytes,
1278  d_keys, d_values,
1279  num_items, begin_bit, end_bit, is_overwrite_okay,
1280  stream, debug_synchronous, ptx_version);
1281 
1282  // Dispatch to chained policy
1283  if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
1284 
1285  } while (0);
1286 
1287  return error;
1288  }
1289 };
1290 
1291 
1292 
1293 
1294 /******************************************************************************
1295  * Segmented dispatch
1296  ******************************************************************************/
1297 
/**
 * Utility class for dispatching the appropriately-tuned kernels for segmented
 * device-wide radix sort.
 *
 * (Reconstructed: the `struct DispatchSegmentedRadixSort :` line was dropped
 * from this copy; the name is confirmed by the `DispatchSegmentedRadixSort::MaxPolicy`
 * typedef and the constructor signature later in this file.)
 */
template <
    bool     IS_DESCENDING,     ///< Whether or not the sorted order is high-to-low
    typename KeyT,              ///< Key type
    typename ValueT,            ///< Value type
    typename OffsetIteratorT,   ///< Random-access iterator type for reading segment offsets
    typename OffsetT>           ///< Signed integer type for global offsets
struct DispatchSegmentedRadixSort :
    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
{
1310  //------------------------------------------------------------------------------
1311  // Constants
1312  //------------------------------------------------------------------------------
1313 
    enum
    {
        // Whether this is a keys-only (or key-value) sort:
        // true when ValueT is NullType, i.e. no values accompany the keys
        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
    };
1319 
1320 
1321  //------------------------------------------------------------------------------
1322  // Parameter members
1323  //------------------------------------------------------------------------------
1324 
    // NOTE(review): several sibling parameter members (d_temp_storage,
    // temp_storage_bytes, d_keys, d_values, num_items, num_segments, begin_bit,
    // is_overwrite_okay, debug_synchronous, ptx_version) appear to have been
    // dropped from this copy -- the constructor below initializes them; confirm
    // against the upstream header.
    OffsetIteratorT d_begin_offsets;    ///< [in] Sequence of beginning offsets, one per segment
    OffsetIteratorT d_end_offsets;      ///< [in] Sequence of ending offsets, one per segment
    int end_bit;                        ///< [in] Past-the-end (most-significant) bit index needed for key comparison
    cudaStream_t stream;                ///< [in] CUDA stream to launch kernels within
1339 
1340 
1341  //------------------------------------------------------------------------------
1342  // Constructors
1343  //------------------------------------------------------------------------------
1344 
1346  CUB_RUNTIME_FUNCTION __forceinline__
1348  void* d_temp_storage,
1349  size_t &temp_storage_bytes,
1350  DoubleBuffer<KeyT> &d_keys,
1351  DoubleBuffer<ValueT> &d_values,
1353  OffsetT num_segments,
1354  OffsetIteratorT d_begin_offsets,
1355  OffsetIteratorT d_end_offsets,
1356  int begin_bit,
1357  int end_bit,
1358  bool is_overwrite_okay,
1359  cudaStream_t stream,
1360  bool debug_synchronous,
1361  int ptx_version)
1362  :
1363  d_temp_storage(d_temp_storage),
1364  temp_storage_bytes(temp_storage_bytes),
1365  d_keys(d_keys),
1366  d_values(d_values),
1368  num_segments(num_segments),
1371  begin_bit(begin_bit),
1372  end_bit(end_bit),
1373  is_overwrite_okay(is_overwrite_okay),
1374  stream(stream),
1375  debug_synchronous(debug_synchronous),
1376  ptx_version(ptx_version)
1377  {}
1378 
1379 
1380  //------------------------------------------------------------------------------
1381  // Multi-segment invocation
1382  //------------------------------------------------------------------------------
1383 
1385  template <typename PassConfigT>
1386  CUB_RUNTIME_FUNCTION __forceinline__
1387  cudaError_t InvokePass(
1388  const KeyT *d_keys_in,
1389  KeyT *d_keys_out,
1390  const ValueT *d_values_in,
1391  ValueT *d_values_out,
1392  int &current_bit,
1393  PassConfigT &pass_config)
1394  {
1395  cudaError error = cudaSuccess;
1396  do
1397  {
1398  int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
1399 
1400  // Log kernel configuration
1401  if (debug_synchronous)
1402  _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
1403  num_segments, pass_config.segmented_config.block_threads, (long long) stream,
1404  pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
1405 
1406  pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
1407  d_keys_in, d_keys_out,
1409  d_begin_offsets, d_end_offsets, num_segments,
1411 
1412  // Check for failure to launch
1413  if (CubDebug(error = cudaPeekAtLastError())) break;
1414 
1415  // Sync the stream if specified to flush runtime errors
1416  if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
1417 
1418  // Update current bit
1420  }
1421  while (0);
1422 
1423  return error;
1424  }
1425 
1426 
1428  template <typename SegmentedKernelT>
1429  struct PassConfig
1430  {
1431  SegmentedKernelT segmented_kernel;
1432  KernelConfig segmented_config;
1433  int radix_bits;
1434  int radix_digits;
1435 
1437  template <typename SegmentedPolicyT>
1438  CUB_RUNTIME_FUNCTION __forceinline__
1439  cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
1440  {
1441  this->segmented_kernel = segmented_kernel;
1442  this->radix_bits = SegmentedPolicyT::RADIX_BITS;
1443  this->radix_digits = 1 << radix_bits;
1444 
1445  return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
1446  }
1447  };
1448 
1449 
    /// Invocation (run multiple digit passes).
    /// Plans the sequence of segmented radix passes, carves temporary storage
    /// into optional third key/value buffers, then ping-pongs InvokePass until
    /// all bits in [begin_bit, end_bit) are consumed.
    template <
        typename ActivePolicyT,
        typename SegmentedKernelT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t InvokePasses(
        SegmentedKernelT segmented_kernel,      ///< [in] Preferred-digit-size segmented kernel
        SegmentedKernelT alt_segmented_kernel)  ///< [in] Alternate (smaller digit) segmented kernel
    {
#ifndef CUB_RUNTIME_ENABLED
        (void)segmented_kernel;
        (void)alt_segmented_kernel;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported );
#else

        cudaError error = cudaSuccess;
        do
        {
            // Init regular and alternate kernel configurations
            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
            if ((error = pass_config.template InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
            if ((error = alt_pass_config.template InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;

            // Temporary storage allocation requirements
            // (no spine needed: each segment is sorted independently in-core)
            void* allocations[2];
            size_t allocation_sizes[2] =
            {
                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
            };

            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;

            // Return if the caller is simply requesting the size of the storage allocation
            // (report at least 1 byte so the caller's subsequent allocation is non-NULL)
            if (d_temp_storage == NULL)
            {
                if (temp_storage_bytes == 0)
                    temp_storage_bytes = 1;
                return cudaSuccess;
            }

            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
            int num_bits            = end_bit - begin_bit;
            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
            bool is_num_passes_odd  = num_passes & 1;
            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));

            // Arrange the ping-pong buffers so the final pass writes into the
            // buffer the caller expects (d_keys.Alternate(), or the 3rd buffer
            // when overwriting the inputs is not permitted)
            DoubleBuffer<KeyT> d_keys_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());

            DoubleBuffer<ValueT> d_values_remaining_passes(
                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());

            // Run first pass, consuming from the input's current buffers
            int current_bit = begin_bit;

            if (CubDebug(error = InvokePass(
                d_keys.Current(), d_keys_remaining_passes.Current(),
                d_values.Current(), d_values_remaining_passes.Current(),
                current_bit,
                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

            // Run remaining passes
            while (current_bit < end_bit)
            {
                if (CubDebug(error = InvokePass(
                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
                    current_bit,
                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;

                // Invert selectors and update current bit
                d_keys_remaining_passes.selector ^= 1;
                d_values_remaining_passes.selector ^= 1;
            }

            // Update selector
            if (!is_overwrite_okay) {
                num_passes = 1; // Sorted data always ends up in the other vector
            }

            d_keys.selector = (d_keys.selector + num_passes) & 1;
            d_values.selector = (d_values.selector + num_passes) & 1;
        }
        while (0);

        return error;

#endif // CUB_RUNTIME_ENABLED
    }
1548 
1549 
1550  //------------------------------------------------------------------------------
1551  // Chained policy invocation
1552  //------------------------------------------------------------------------------
1553 
    /// Invocation: dispatches the segmented multi-pass path for the active
    /// tuning policy.
    template <typename ActivePolicyT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Invoke()
    {
        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;

        // Force kernel code-generation in all compiler passes
        // (instantiates both the preferred-digit-size kernel and its alternate
        // variant; InvokePasses chooses between them per pass)
        return InvokePasses<ActivePolicyT>(
            DeviceSegmentedRadixSortKernel<MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
            DeviceSegmentedRadixSortKernel<MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
    }
1566 
1567 
1568  //------------------------------------------------------------------------------
1569  // Dispatch entrypoints
1570  //------------------------------------------------------------------------------
1571 
1572 
1574  CUB_RUNTIME_FUNCTION __forceinline__
1575  static cudaError_t Dispatch(
1576  void* d_temp_storage,
1577  size_t &temp_storage_bytes,
1578  DoubleBuffer<KeyT> &d_keys,
1579  DoubleBuffer<ValueT> &d_values,
1580  int num_items,
1581  int num_segments,
1582  OffsetIteratorT d_begin_offsets,
1583  OffsetIteratorT d_end_offsets,
1584  int begin_bit,
1585  int end_bit,
1586  bool is_overwrite_okay,
1587  cudaStream_t stream,
1588  bool debug_synchronous)
1589  {
1590  typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
1591 
1592  cudaError_t error;
1593  do {
1594  // Get PTX version
1595  int ptx_version;
1596  if (CubDebug(error = PtxVersion(ptx_version))) break;
1597 
1598  // Create dispatch functor
1599  DispatchSegmentedRadixSort dispatch(
1600  d_temp_storage, temp_storage_bytes,
1601  d_keys, d_values,
1602  num_items, num_segments, d_begin_offsets, d_end_offsets,
1603  begin_bit, end_bit, is_overwrite_okay,
1604  stream, debug_synchronous, ptx_version);
1605 
1606  // Dispatch to chained policy
1607  if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
1608 
1609  } while (0);
1610 
1611  return error;
1612  }
1613 };
1614 
1615 
1616 } // CUB namespace
1617 CUB_NS_POSTFIX // Optional outer namespace(s)
1618 
1619 
< The number of radix bits, i.e., log2(bins)
Cache as texture.
Definition: thread_load.cuh:69
Type equality test.
Definition: util_type.cuh:98
int end_bit
[in] The past-the-end (most-significant) bit index needed for key comparison
OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
DoubleBuffer< KeyT > & d_keys
[in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return,...
< Signed integer type for global offsets
#define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)
Define both nominal threads-per-block and items-per-thread.
Definition: util_arch.cuh:141
OffsetT num_segments
[in] The number of segments that comprise the sorting data
Type traits.
Definition: util_type.cuh:1158
AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide pr...
Definition: agent_scan.cuh:98
Default (no modifier)
Definition: thread_load.cuh:64
cudaStream_t stream
[in] CUDA stream to launch kernels within. Default is stream0.
DoubleBuffer< KeyT > & d_keys
[in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return,...
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in ...
__device__ __forceinline__ void ProcessRegion(OffsetT block_offset, OffsetT block_end)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
Definition: util_device.cuh:62
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
int end_bit
[in] The past-the-end (most-significant) bit index needed for key comparison
Optional outer namespace(s)
Number of bin-starting offsets tracked per thread.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
DoubleBuffer< ValueT > & d_values
[in,out] Double-buffer whose current buffer contains the unsorted input values and,...
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
int num_counts
< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block,...
\smemstorage{BlockLoad}
DoubleBuffer< ValueT > & d_values
[in,out] Double-buffer whose current buffer contains the unsorted input values and,...
OffsetT int current_bit
[in] Bit position of current radix digit
int begin_bit
[in] The beginning (least-significant) bit index needed for key comparison
OffsetT int int num_bits
[in] Number of bits of current radix digit
The BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thre...
CTA_SYNC()
Definition: util_ptx.cuh:255
__device__ __forceinline__ void ProcessRegion(OffsetT block_offset, const OffsetT &block_end)
< The number of radix bits, i.e., log2(bins)
KeyT * d_keys_out
< [in] Input keys buffer
GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-sha...
int ptx_version
[in] PTX version
__host__ __device__ __forceinline__ T * Current()
Return pointer to the currently valid buffer.
Definition: util_type.cuh:818
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(UpsweepKernelT upsweep_kernel, UpsweepKernelT alt_upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, DownsweepKernelT alt_downsweep_kernel)
Invocation (run multiple digit passes)
Alias wrapper allowing storage to be unioned.
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
Definition: block_load.cuh:640
int begin_bit
[in] The beginning (least-significant) bit index needed for key comparison
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
#define _CubLog(format,...)
Log macro for printf statements.
Definition: util_debug.cuh:112
OffsetT OffsetT
[in] Total number of input data items
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int int pass_bits
< [in] Number of bits of current radix digit
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitPassConfig(UpsweepKernelT upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, int ptx_version, int sm_count, int num_items)
Initialize pass configuration.
< The BlockScan algorithm to use
Definition: agent_scan.cuh:67
OffsetT num_items
[in] Number of items to sort
OffsetT * d_spine
< [in] Input keys buffer
int selector
Selector into d_buffers (i.e., the active/valid buffer)
Definition: util_type.cuh:797
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
Internal dispatch routine.
\smemstorage{BlockRadixSort}
CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedRadixSort(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, OffsetT num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version)
Constructor.
size_t & temp_storage_bytes
[in,out] Reference to size in bytes of d_temp_storage allocation
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePass(const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, OffsetT *d_spine, int spine_length, int &current_bit, PassConfigT &pass_config)
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePass(const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int &current_bit, PassConfigT &pass_config)
Invoke a three-kernel sorting pass at the current bit.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
Initialize pass configuration.
KeyT const ValueT * d_values_in
[in] Input values buffer
KeyT const ValueT ValueT * d_values_out
[in] Output values buffer
bool is_overwrite_okay
[in] Whether is okay to overwrite source buffers
size_t & temp_storage_bytes
[in,out] Reference to size in bytes of d_temp_storage allocation
Helper for dispatching into a policy chain.
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Number of bin-starting offsets tracked per thread.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
< Signed integer type for global offsets
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:72
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel)
Invocation (run multiple digit passes)
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition: util_macro.cuh:66
KeyT const ValueT ValueT OffsetT int int end_bit
< [in] The past-the-end (most-significant) bit index needed for key comparison
The BlockScan class provides collective methods for computing a parallel prefix sum/scan of items par...
Definition: block_scan.cuh:193
OffsetT num_items
[in] Number of items to sort
bool is_overwrite_okay
[in] Whether is okay to overwrite source buffers
AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in de...
bool debug_synchronous
[in] Whether or not to synchronize the stream after every kernel launch to check for errors....
bool debug_synchronous
[in] Whether or not to synchronize the stream after every kernel launch to check for errors....
OffsetT int int GridEvenShare< OffsetT > even_share
< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
Invocation.
void * d_temp_storage
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is wr...
void * d_temp_storage
[in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is wr...
cudaStream_t stream
[in] CUDA stream to launch kernels within. Default is stream0.
Default sum functor.
__device__ __forceinline__ void ExtractCounts(OffsetT *counters, int bin_stride=1, int bin_offset=0)
#define CUB_MAX(a, b)
Select maximum(a, b)
Definition: util_macro.cuh:61
#define CubDebug(e)
Debug macro.
Definition: util_debug.cuh:94
#define CUB_SUBSCRIPTION_FACTOR(arch)
Oversubscription factor.
Definition: util_arch.cuh:99
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
Invocation.
Alias wrapper allowing storage to be unioned.
OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
__host__ __device__ __forceinline__ T * Alternate()
Return pointer to the currently invalid buffer.
Definition: util_type.cuh:821
CUB_RUNTIME_FUNCTION __forceinline__ DispatchRadixSort(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version)
Constructor.
__launch_bounds__(int(AgentHistogramPolicyT::BLOCK_THREADS)) __global__ void DeviceHistogramSweepKernel(SampleIteratorT d_samples
< Signed integer type for global offsets
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InvokeSingleTile(SingleTileKernelT single_tile_kernel)
Invoke a single block to sort in-core.
< Signed integer type for global offsets