OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
device_segmented_radix_sort.cuh
Go to the documentation of this file.
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39
41#include "../util_arch.cuh"
42#include "../util_namespace.cuh"
43
45CUB_NS_PREFIX
46
48namespace cub {
49
50
77{
78
79 /******************************************************************/
83
138 template <
139 typename KeyT,
140 typename ValueT,
141 typename OffsetIteratorT>
142 CUB_RUNTIME_FUNCTION
143 static cudaError_t SortPairs(
144 void *d_temp_storage,
145 size_t &temp_storage_bytes,
146 const KeyT *d_keys_in,
147 KeyT *d_keys_out,
148 const ValueT *d_values_in,
149 ValueT *d_values_out,
150 int num_items,
151 int num_segments,
152 OffsetIteratorT d_begin_offsets,
153 OffsetIteratorT d_end_offsets,
154 int begin_bit = 0,
155 int end_bit = sizeof(KeyT) * 8,
156 cudaStream_t stream = 0,
157 bool debug_synchronous = false)
158 {
159 // Signed integer type for global offsets
160 typedef int OffsetT;
161
162 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
163 DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
164
166 d_temp_storage,
167 temp_storage_bytes,
168 d_keys,
169 d_values,
170 num_items,
171 num_segments,
174 begin_bit,
175 end_bit,
176 false,
177 stream,
178 debug_synchronous);
179 }
180
181
247 template <
248 typename KeyT,
249 typename ValueT,
250 typename OffsetIteratorT>
251 CUB_RUNTIME_FUNCTION
252 static cudaError_t SortPairs(
253 void *d_temp_storage,
254 size_t &temp_storage_bytes,
255 DoubleBuffer<KeyT> &d_keys,
256 DoubleBuffer<ValueT> &d_values,
257 int num_items,
258 int num_segments,
259 OffsetIteratorT d_begin_offsets,
260 OffsetIteratorT d_end_offsets,
261 int begin_bit = 0,
262 int end_bit = sizeof(KeyT) * 8,
263 cudaStream_t stream = 0,
264 bool debug_synchronous = false)
265 {
266 // Signed integer type for global offsets
267 typedef int OffsetT;
268
270 d_temp_storage,
271 temp_storage_bytes,
272 d_keys,
273 d_values,
274 num_items,
275 num_segments,
278 begin_bit,
279 end_bit,
280 true,
281 stream,
282 debug_synchronous);
283 }
284
285
340 template <
341 typename KeyT,
342 typename ValueT,
343 typename OffsetIteratorT>
344 CUB_RUNTIME_FUNCTION
345 static cudaError_t SortPairsDescending(
346 void *d_temp_storage,
347 size_t &temp_storage_bytes,
348 const KeyT *d_keys_in,
349 KeyT *d_keys_out,
350 const ValueT *d_values_in,
351 ValueT *d_values_out,
352 int num_items,
353 int num_segments,
354 OffsetIteratorT d_begin_offsets,
355 OffsetIteratorT d_end_offsets,
356 int begin_bit = 0,
357 int end_bit = sizeof(KeyT) * 8,
358 cudaStream_t stream = 0,
359 bool debug_synchronous = false)
360 {
361 // Signed integer type for global offsets
362 typedef int OffsetT;
363
364 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
365 DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
366
368 d_temp_storage,
369 temp_storage_bytes,
370 d_keys,
371 d_values,
372 num_items,
373 num_segments,
376 begin_bit,
377 end_bit,
378 false,
379 stream,
380 debug_synchronous);
381 }
382
383
449 template <
450 typename KeyT,
451 typename ValueT,
452 typename OffsetIteratorT>
453 CUB_RUNTIME_FUNCTION
454 static cudaError_t SortPairsDescending(
455 void *d_temp_storage,
456 size_t &temp_storage_bytes,
457 DoubleBuffer<KeyT> &d_keys,
458 DoubleBuffer<ValueT> &d_values,
459 int num_items,
460 int num_segments,
461 OffsetIteratorT d_begin_offsets,
462 OffsetIteratorT d_end_offsets,
463 int begin_bit = 0,
464 int end_bit = sizeof(KeyT) * 8,
465 cudaStream_t stream = 0,
466 bool debug_synchronous = false)
467 {
468 // Signed integer type for global offsets
469 typedef int OffsetT;
470
472 d_temp_storage,
473 temp_storage_bytes,
474 d_keys,
475 d_values,
476 num_items,
477 num_segments,
480 begin_bit,
481 end_bit,
482 true,
483 stream,
484 debug_synchronous);
485 }
486
487
489 /******************************************************************/
493
494
542 template <
543 typename KeyT,
544 typename OffsetIteratorT>
545 CUB_RUNTIME_FUNCTION
546 static cudaError_t SortKeys(
547 void *d_temp_storage,
548 size_t &temp_storage_bytes,
549 const KeyT *d_keys_in,
550 KeyT *d_keys_out,
551 int num_items,
552 int num_segments,
553 OffsetIteratorT d_begin_offsets,
554 OffsetIteratorT d_end_offsets,
555 int begin_bit = 0,
556 int end_bit = sizeof(KeyT) * 8,
557 cudaStream_t stream = 0,
558 bool debug_synchronous = false)
559 {
560 // Signed integer type for global offsets
561 typedef int OffsetT;
562
563 // Null value type
564 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
565 DoubleBuffer<NullType> d_values;
566
568 d_temp_storage,
569 temp_storage_bytes,
570 d_keys,
571 d_values,
572 num_items,
573 num_segments,
576 begin_bit,
577 end_bit,
578 false,
579 stream,
580 debug_synchronous);
581 }
582
583
641 template <
642 typename KeyT,
643 typename OffsetIteratorT>
644 CUB_RUNTIME_FUNCTION
645 static cudaError_t SortKeys(
646 void *d_temp_storage,
647 size_t &temp_storage_bytes,
648 DoubleBuffer<KeyT> &d_keys,
649 int num_items,
650 int num_segments,
651 OffsetIteratorT d_begin_offsets,
652 OffsetIteratorT d_end_offsets,
653 int begin_bit = 0,
654 int end_bit = sizeof(KeyT) * 8,
655 cudaStream_t stream = 0,
656 bool debug_synchronous = false)
657 {
658 // Signed integer type for global offsets
659 typedef int OffsetT;
660
661 // Null value type
662 DoubleBuffer<NullType> d_values;
663
665 d_temp_storage,
666 temp_storage_bytes,
667 d_keys,
668 d_values,
669 num_items,
670 num_segments,
673 begin_bit,
674 end_bit,
675 true,
676 stream,
677 debug_synchronous);
678 }
679
730 template <
731 typename KeyT,
732 typename OffsetIteratorT>
733 CUB_RUNTIME_FUNCTION
734 static cudaError_t SortKeysDescending(
735 void *d_temp_storage,
736 size_t &temp_storage_bytes,
737 const KeyT *d_keys_in,
738 KeyT *d_keys_out,
739 int num_items,
740 int num_segments,
741 OffsetIteratorT d_begin_offsets,
742 OffsetIteratorT d_end_offsets,
743 int begin_bit = 0,
744 int end_bit = sizeof(KeyT) * 8,
745 cudaStream_t stream = 0,
746 bool debug_synchronous = false)
747 {
748 // Signed integer type for global offsets
749 typedef int OffsetT;
750
751 DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
752 DoubleBuffer<NullType> d_values;
753
755 d_temp_storage,
756 temp_storage_bytes,
757 d_keys,
758 d_values,
759 num_items,
760 num_segments,
763 begin_bit,
764 end_bit,
765 false,
766 stream,
767 debug_synchronous);
768 }
769
770
828 template <
829 typename KeyT,
830 typename OffsetIteratorT>
831 CUB_RUNTIME_FUNCTION
832 static cudaError_t SortKeysDescending(
833 void *d_temp_storage,
834 size_t &temp_storage_bytes,
835 DoubleBuffer<KeyT> &d_keys,
836 int num_items,
837 int num_segments,
838 OffsetIteratorT d_begin_offsets,
839 OffsetIteratorT d_end_offsets,
840 int begin_bit = 0,
841 int end_bit = sizeof(KeyT) * 8,
842 cudaStream_t stream = 0,
843 bool debug_synchronous = false)
844 {
845 // Signed integer type for global offsets
846 typedef int OffsetT;
847
848 // Null value type
849 DoubleBuffer<NullType> d_values;
850
852 d_temp_storage,
853 temp_storage_bytes,
854 d_keys,
855 d_values,
856 num_items,
857 num_segments,
860 begin_bit,
861 end_bit,
862 true,
863 stream,
864 debug_synchronous);
865 }
866
867
869
870
871};
872
873} // CUB namespace
874CUB_NS_POSTFIX // Optional outer namespace(s)
875
876
Optional outer namespace(s)
KeyT const ValueT ValueT * d_values_out
[in] Output values buffer
KeyT const ValueT ValueT OffsetT int int end_bit
< [in] The past-the-end (most-significant) bit index needed for key comparison
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
KeyT * d_keys_out
< [in] Input keys buffer
KeyT const ValueT * d_values_in
[in] Input values buffer
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
OffsetT OffsetT
[in] Total number of input data items
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort...
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required)
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into ascending order. (~2N auxiliary storage required)
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required)
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into ascending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into descending order. (~2N auxiliary storage required).
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
Internal dispatch routine.
Double-buffer storage wrapper for multi-pass stream transformations that require more than one storag...