OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
agent_radix_sort_downsweep.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks
 * for participating in device-wide radix sort downsweep.
 */

#pragma once

#include <stdint.h>

#include "../thread/thread_load.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_radix_rank.cuh"
#include "../block/block_exchange.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"

CUB_NS_PREFIX

namespace cub {


/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Radix ranking algorithm
 */
enum RadixRankAlgorithm
{
    RADIX_RANK_BASIC,
    RADIX_RANK_MEMOIZE,
    RADIX_RANK_MATCH
};

/**
 * Parameterizable tuning policy type for AgentRadixSortDownsweep
 */
template <
    int                     _BLOCK_THREADS,     ///< Threads per thread block
    int                     _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
    BlockLoadAlgorithm      _LOAD_ALGORITHM,    ///< The BlockLoad algorithm to use
    CacheLoadModifier       _LOAD_MODIFIER,     ///< Cache load modifier for reading keys (and values)
    RadixRankAlgorithm      _RANK_ALGORITHM,    ///< The radix ranking algorithm to use
    BlockScanAlgorithm      _SCAN_ALGORITHM,    ///< The BlockScan algorithm to use
    int                     _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
struct AgentRadixSortDownsweepPolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
    };

    static const BlockLoadAlgorithm LOAD_ALGORITHM  = _LOAD_ALGORITHM;  ///< The BlockLoad algorithm to use
    static const CacheLoadModifier  LOAD_MODIFIER   = _LOAD_MODIFIER;   ///< Cache load modifier for reading keys (and values)
    static const RadixRankAlgorithm RANK_ALGORITHM  = _RANK_ALGORITHM;  ///< The radix ranking algorithm to use
    static const BlockScanAlgorithm SCAN_ALGORITHM  = _SCAN_ALGORITHM;  ///< The BlockScan algorithm to use
};

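// Example (illustrative only): one plausible instantiation of the tuning policy.
// The specific values below (128 threads, 9 items per thread, 5 radix bits) and the
// typedef name ExampleDownsweepPolicy are assumptions chosen for demonstration,
// not tuning constants taken from this file.
typedef AgentRadixSortDownsweepPolicy<
        128,                        // _BLOCK_THREADS
        9,                          // _ITEMS_PER_THREAD
        BLOCK_LOAD_WARP_TRANSPOSE,  // _LOAD_ALGORITHM
        LOAD_DEFAULT,               // _LOAD_MODIFIER
        RADIX_RANK_BASIC,           // _RANK_ALGORITHM
        BLOCK_SCAN_WARP_SCANS,      // _SCAN_ALGORITHM
        5>                          // _RADIX_BITS (32 bins)
    ExampleDownsweepPolicy;
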

/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/


/**
 * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks
 * for participating in device-wide radix sort downsweep.
 */
template <
    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
    bool     IS_DESCENDING,                     ///< Whether or not the sorted order is high-to-low
    typename KeyT,                              ///< KeyT type
    typename ValueT,                            ///< ValueT type
    typename OffsetT>                           ///< Signed integer type for global offsets
struct AgentRadixSortDownsweep
{
    //---------------------------------------------------------------------
    // Type definitions and constants
    //---------------------------------------------------------------------

    // Appropriate unsigned-bits representation of KeyT
    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;

    static const UnsignedBits LOWEST_KEY = Traits<KeyT>::LOWEST_KEY;
    static const UnsignedBits MAX_KEY    = Traits<KeyT>::MAX_KEY;

    static const BlockLoadAlgorithm LOAD_ALGORITHM  = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
    static const CacheLoadModifier  LOAD_MODIFIER   = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
    static const RadixRankAlgorithm RANK_ALGORITHM  = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
    static const BlockScanAlgorithm SCAN_ALGORITHM  = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;

    enum
    {
        BLOCK_THREADS       = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
        RADIX_BITS          = AgentRadixSortDownsweepPolicy::RADIX_BITS,
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        RADIX_DIGITS        = 1 << RADIX_BITS,
        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
    };

    // Input iterator wrapper types (for applying cache modifier)
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;

    // Radix ranking type to use
    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
            >::Type
        >::Type BlockRadixRankT;

    enum
    {
        /// Number of bin-starting offsets tracked per thread
        BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
    };

    // BlockLoad type (keys)
    typedef BlockLoad<
        UnsignedBits,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        LOAD_ALGORITHM> BlockLoadKeysT;

    // BlockLoad type (values)
    typedef BlockLoad<
        ValueT,
        BLOCK_THREADS,
        ITEMS_PER_THREAD,
        LOAD_ALGORITHM> BlockLoadValuesT;

    // Value exchange array type
    typedef ValueT ValueExchangeT[TILE_ITEMS];

    /**
     * Shared memory storage layout
     */
    union __align__(16) _TempStorage
    {
        typename BlockLoadKeysT::TempStorage    load_keys;
        typename BlockLoadValuesT::TempStorage  load_values;
        typename BlockRadixRankT::TempStorage   radix_rank;

        struct
        {
            UnsignedBits    exchange_keys[TILE_ITEMS];
            OffsetT         relative_bin_offsets[RADIX_DIGITS];
        };

        Uninitialized<ValueExchangeT>   exchange_values;

        OffsetT     exclusive_digit_prefix[RADIX_DIGITS];
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Thread fields
    //---------------------------------------------------------------------

    // Shared storage for this CTA
    _TempStorage    &temp_storage;

    // Input and output device pointers
    KeysItr         d_keys_in;
    ValuesItr       d_values_in;
    UnsignedBits    *d_keys_out;
    ValueT          *d_values_out;

    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];

    // The least-significant bit position of the current digit to extract
    int             current_bit;

    // Number of bits in current digit
    int             num_bits;

    // Whether to short-circuit
    int             short_circuit;

    //---------------------------------------------------------------------
    // Utility methods
    //---------------------------------------------------------------------

    /**
     * Scatter ranked keys through shared memory, then to device-accessible memory
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ScatterKeys(
        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
        int             (&ranks)[ITEMS_PER_THREAD],
        OffsetT         valid_items)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
        }

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
            UnsignedBits digit          = BFE(key, current_bit, num_bits);
            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];

            // Un-twiddle
            key = Traits<KeyT>::TwiddleOut(key);

            if (FULL_TILE ||
                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
            {
                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
            }
        }
    }

    /**
     * Scatter ranked values through shared memory, then to device-accessible memory
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ScatterValues(
        ValueT      (&values)[ITEMS_PER_THREAD],
        OffsetT     (&relative_bin_offsets)[ITEMS_PER_THREAD],
        int         (&ranks)[ITEMS_PER_THREAD],
        OffsetT     valid_items)
    {
        CTA_SYNC();

        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            exchange_values[ranks[ITEM]] = values[ITEM];
        }

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];

            if (FULL_TILE ||
                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
            {
                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
            }
        }
    }

    /**
     * Load a tile of keys (specialized for full tile, any ranking algorithm)
     */
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<true>              is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        BlockLoadKeysT(temp_storage.load_keys).Load(
            d_keys_in + block_offset, keys);

        CTA_SYNC();
    }

    /**
     * Load a tile of keys (specialized for partial tile, any ranking algorithm)
     */
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<false>             is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        // Register pressure work-around: moving valid_items through shfl prevents compiler
        // from reusing guards/addressing from prior guarded loads
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        BlockLoadKeysT(temp_storage.load_keys).Load(
            d_keys_in + block_offset, keys, valid_items, oob_item);

        CTA_SYNC();
    }

    /**
     * Load a tile of keys (specialized for full tile, match ranking algorithm)
     */
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<true>              is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
    }

    /**
     * Load a tile of keys (specialized for partial tile, match ranking algorithm)
     */
    __device__ __forceinline__ void LoadKeys(
        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        UnsignedBits                oob_item,
        Int2Type<false>             is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        // Register pressure work-around: moving valid_items through shfl prevents compiler
        // from reusing guards/addressing from prior guarded loads
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
    }

    /**
     * Load a tile of values (specialized for full tile, any ranking algorithm)
     */
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<true>              is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        BlockLoadValuesT(temp_storage.load_values).Load(
            d_values_in + block_offset, values);

        CTA_SYNC();
    }

    /**
     * Load a tile of values (specialized for partial tile, any ranking algorithm)
     */
    template <int _RANK_ALGORITHM>
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<false>             is_full_tile,
        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
    {
        // Register pressure work-around: moving valid_items through shfl prevents compiler
        // from reusing guards/addressing from prior guarded loads
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        BlockLoadValuesT(temp_storage.load_values).Load(
            d_values_in + block_offset, values, valid_items);

        CTA_SYNC();
    }

    /**
     * Load a tile of values (specialized for full tile, match ranking algorithm)
     */
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<true>              is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
    }

    /**
     * Load a tile of values (specialized for partial tile, match ranking algorithm)
     */
    __device__ __forceinline__ void LoadValues(
        ValueT                      (&values)[ITEMS_PER_THREAD],
        OffsetT                     block_offset,
        OffsetT                     valid_items,
        Int2Type<false>             is_full_tile,
        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
    {
        // Register pressure work-around: moving valid_items through shfl prevents compiler
        // from reusing guards/addressing from prior guarded loads
        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);

        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
    }

    /**
     * Gather/scatter the values associated with a tile of keys (specialized for key-value sorting)
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void GatherScatterValues(
        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
        int             (&ranks)[ITEMS_PER_THREAD],
        OffsetT         block_offset,
        OffsetT         valid_items,
        Int2Type<false> /*is_keys_only*/)
    {
        ValueT values[ITEMS_PER_THREAD];

        CTA_SYNC();

        LoadValues(
            values,
            block_offset,
            valid_items,
            Int2Type<FULL_TILE>(),
            Int2Type<RANK_ALGORITHM>());

        ScatterValues<FULL_TILE>(
            values,
            relative_bin_offsets,
            ranks,
            valid_items);
    }

    /**
     * Gather/scatter values (specialized for keys-only sorting; no-op)
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void GatherScatterValues(
        OffsetT         (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
        int             (&/*ranks*/)[ITEMS_PER_THREAD],
        OffsetT         /*block_offset*/,
        OffsetT         /*valid_items*/,
        Int2Type<true>  /*is_keys_only*/)
    {}

    /**
     * Process one tile of keys (and associated values)
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ProcessTile(
        OffsetT         block_offset,
        const OffsetT   &valid_items = TILE_ITEMS)
    {
        UnsignedBits    keys[ITEMS_PER_THREAD];
        int             ranks[ITEMS_PER_THREAD];
        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];

        // Assign default (min/max) value to all keys
        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;

        // Load tile of keys
        LoadKeys(
            keys,
            block_offset,
            valid_items,
            default_key,
            Int2Type<FULL_TILE>(),
            Int2Type<RANK_ALGORITHM>());

        // Twiddle key bits if necessary
        #pragma unroll
        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
        {
            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
        }

        // Rank the twiddled keys
        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
            keys,
            ranks,
            current_bit,
            num_bits,
            exclusive_digit_prefix);

        CTA_SYNC();

        // Share exclusive digit prefix
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                // Store exclusive prefix
                temp_storage.exclusive_digit_prefix[bin_idx] =
                    exclusive_digit_prefix[track];
            }
        }

        CTA_SYNC();

        // Get inclusive digit prefix
        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];

        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                if (IS_DESCENDING)
                {
                    // Get inclusive digit prefix from exclusive prefix (higher bins come first)
                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
                }
                else
                {
                    // Get inclusive digit prefix from exclusive prefix (lower bins come first)
                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
                }
            }
        }

        CTA_SYNC();

        // Update global scatter base offsets for each digit
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                bin_offset[track] -= exclusive_digit_prefix[track];
                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
                bin_offset[track] += inclusive_digit_prefix[track];
            }
        }

        CTA_SYNC();

        // Scatter keys
        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);

        // Gather/scatter values
        GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
    }
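
    // Worked example for ProcessTile's digit-offset arithmetic (illustrative numbers,
    // assuming an ascending sort with RADIX_DIGITS = 4 and TILE_ITEMS = 16): if a tile
    // contains digit counts {3, 5, 2, 6}, the exclusive digit prefixes are {0, 3, 8, 10}
    // and the inclusive prefixes are {3, 8, 10, 16}. With incoming global bases
    // bin_offset = {100, 210, 320, 400}, relative_bin_offsets becomes {100, 207, 312, 390},
    // so a key of digit 1 with tile rank 5 scatters to 207 + 5 = 212 (i.e., 210 + (5 - 3)).
    // Afterwards bin_offset advances by each digit's tile count to {103, 215, 322, 406},
    // ready for the next tile.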

    //---------------------------------------------------------------------
    // Copy shortcut
    //---------------------------------------------------------------------

    /**
     * Copy tiles within the range of input
     */
    template <
        typename InputIteratorT,
        typename T>
    __device__ __forceinline__ void Copy(
        InputIteratorT  d_in,
        T               *d_out,
        OffsetT         block_offset,
        OffsetT         block_end)
    {
        // Simply copy the input
        while (block_offset + TILE_ITEMS <= block_end)
        {
            T items[ITEMS_PER_THREAD];

            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);

            block_offset += TILE_ITEMS;
        }

        // Clean up last partial tile with guarded-I/O
        if (block_offset < block_end)
        {
            OffsetT valid_items = block_end - block_offset;

            T items[ITEMS_PER_THREAD];

            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
        }
    }

    /**
     * Copy tiles within the range of input (specialized for NullType, i.e., keys-only sorting; no-op)
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Copy(
        InputIteratorT  /*d_in*/,
        NullType        * /*d_out*/,
        OffsetT         /*block_offset*/,
        OffsetT         /*block_end*/)
    {}


    //---------------------------------------------------------------------
    // Interface
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ AgentRadixSortDownsweep(
        TempStorage     &temp_storage,
        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],
        OffsetT         num_items,
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        int             current_bit,
        int             num_bits)
    :
        temp_storage(temp_storage.Alias()),
        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
        d_values_in(d_values_in),
        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
        d_values_out(d_values_out),
        current_bit(current_bit),
        num_bits(num_bits),
        short_circuit(1)
    {
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            this->bin_offset[track] = bin_offset[track];

            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                // Short circuit if every bin count is either zero or the problem size
                short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
            }
        }

        short_circuit = CTA_SYNC_AND(short_circuit);
    }


    /**
     * Constructor
     */
    __device__ __forceinline__ AgentRadixSortDownsweep(
        TempStorage     &temp_storage,
        OffsetT         num_items,
        OffsetT         *d_spine,
        const KeyT      *d_keys_in,
        KeyT            *d_keys_out,
        const ValueT    *d_values_in,
        ValueT          *d_values_out,
        int             current_bit,
        int             num_bits)
    :
        temp_storage(temp_storage.Alias()),
        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
        d_values_in(d_values_in),
        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
        d_values_out(d_values_out),
        current_bit(current_bit),
        num_bits(num_bits),
        short_circuit(1)
    {
        #pragma unroll
        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
        {
            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;

            // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
            {
                if (IS_DESCENDING)
                    bin_idx = RADIX_DIGITS - bin_idx - 1;

                // Short circuit if, in the first block's histogram, every bin count is either zero or the problem size
                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
                short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));

                // Load my block's bin offset for my bin
                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
            }
        }

        short_circuit = CTA_SYNC_AND(short_circuit);
    }


    /**
     * Distribute keys (and values) from a range of input tiles
     */
    __device__ __forceinline__ void ProcessRegion(
        OffsetT block_offset,
        OffsetT block_end)
    {
        if (short_circuit)
        {
            // Copy keys
            Copy(d_keys_in, d_keys_out, block_offset, block_end);

            // Copy values
            Copy(d_values_in, d_values_out, block_offset, block_end);
        }
        else
        {
            // Process full tiles of TILE_ITEMS each
            #pragma unroll 1
            while (block_offset + TILE_ITEMS <= block_end)
            {
                ProcessTile<true>(block_offset);
                block_offset += TILE_ITEMS;

                CTA_SYNC();
            }

            // Clean up last partial tile with guarded-I/O
            if (block_offset < block_end)
            {
                ProcessTile<false>(block_offset, block_end - block_offset);
            }
        }
    }

};


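// Usage sketch (illustrative): a downsweep kernel typically materializes the agent's
// shared TempStorage, constructs the agent from the spine of per-block digit offsets
// produced by the upsweep pass, and lets it sort this thread block's input range.
// The kernel name, template parameter names, and the explicit [block_offset, block_end)
// arguments below are assumptions chosen for demonstration, not this file's API.
template <
    typename DownsweepPolicyT,
    bool     IS_DESCENDING,
    typename KeyT,
    typename ValueT,
    typename OffsetT>
__global__ void ExampleRadixSortDownsweepKernel(
    const KeyT      *d_keys_in,
    KeyT            *d_keys_out,
    const ValueT    *d_values_in,
    ValueT          *d_values_out,
    OffsetT         *d_spine,
    OffsetT         num_items,
    int             current_bit,
    int             num_bits,
    OffsetT         block_offset,
    OffsetT         block_end)
{
    typedef AgentRadixSortDownsweep<DownsweepPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> AgentT;

    // Shared memory for the agent
    __shared__ typename AgentT::TempStorage temp_storage;

    // Construct from the spine of digit offsets, then sort this block's range of tiles
    AgentT(temp_storage, num_items, d_spine,
           d_keys_in, d_keys_out, d_values_in, d_values_out,
           current_bit, num_bits).ProcessRegion(block_offset, block_end);
}
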

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
