OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
device_segmented_radix_sort.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
41 #include "../util_arch.cuh"
42 #include "../util_namespace.cuh"
43 
45 CUB_NS_PREFIX
46 
48 namespace cub {
49 
50 
77 {
78 
79  /******************************************************************/
83 
138  template <
139  typename KeyT,
140  typename ValueT,
141  typename OffsetIteratorT>
142  CUB_RUNTIME_FUNCTION
143  static cudaError_t SortPairs(
144  void *d_temp_storage,
145  size_t &temp_storage_bytes,
146  const KeyT *d_keys_in,
147  KeyT *d_keys_out,
148  const ValueT *d_values_in,
149  ValueT *d_values_out,
150  int num_items,
151  int num_segments,
152  OffsetIteratorT d_begin_offsets,
153  OffsetIteratorT d_end_offsets,
154  int begin_bit = 0,
155  int end_bit = sizeof(KeyT) * 8,
156  cudaStream_t stream = 0,
157  bool debug_synchronous = false)
158  {
159  // Signed integer type for global offsets
160  typedef int OffsetT;
161 
162  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
163  DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
164 
166  d_temp_storage,
167  temp_storage_bytes,
168  d_keys,
169  d_values,
170  num_items,
171  num_segments,
174  begin_bit,
175  end_bit,
176  false,
177  stream,
178  debug_synchronous);
179  }
180 
181 
247  template <
248  typename KeyT,
249  typename ValueT,
250  typename OffsetIteratorT>
251  CUB_RUNTIME_FUNCTION
252  static cudaError_t SortPairs(
253  void *d_temp_storage,
254  size_t &temp_storage_bytes,
255  DoubleBuffer<KeyT> &d_keys,
256  DoubleBuffer<ValueT> &d_values,
257  int num_items,
258  int num_segments,
259  OffsetIteratorT d_begin_offsets,
260  OffsetIteratorT d_end_offsets,
261  int begin_bit = 0,
262  int end_bit = sizeof(KeyT) * 8,
263  cudaStream_t stream = 0,
264  bool debug_synchronous = false)
265  {
266  // Signed integer type for global offsets
267  typedef int OffsetT;
268 
270  d_temp_storage,
271  temp_storage_bytes,
272  d_keys,
273  d_values,
274  num_items,
275  num_segments,
278  begin_bit,
279  end_bit,
280  true,
281  stream,
282  debug_synchronous);
283  }
284 
285 
340  template <
341  typename KeyT,
342  typename ValueT,
343  typename OffsetIteratorT>
344  CUB_RUNTIME_FUNCTION
345  static cudaError_t SortPairsDescending(
346  void *d_temp_storage,
347  size_t &temp_storage_bytes,
348  const KeyT *d_keys_in,
349  KeyT *d_keys_out,
350  const ValueT *d_values_in,
351  ValueT *d_values_out,
352  int num_items,
353  int num_segments,
354  OffsetIteratorT d_begin_offsets,
355  OffsetIteratorT d_end_offsets,
356  int begin_bit = 0,
357  int end_bit = sizeof(KeyT) * 8,
358  cudaStream_t stream = 0,
359  bool debug_synchronous = false)
360  {
361  // Signed integer type for global offsets
362  typedef int OffsetT;
363 
364  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
365  DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
366 
368  d_temp_storage,
369  temp_storage_bytes,
370  d_keys,
371  d_values,
372  num_items,
373  num_segments,
376  begin_bit,
377  end_bit,
378  false,
379  stream,
380  debug_synchronous);
381  }
382 
383 
449  template <
450  typename KeyT,
451  typename ValueT,
452  typename OffsetIteratorT>
453  CUB_RUNTIME_FUNCTION
454  static cudaError_t SortPairsDescending(
455  void *d_temp_storage,
456  size_t &temp_storage_bytes,
457  DoubleBuffer<KeyT> &d_keys,
458  DoubleBuffer<ValueT> &d_values,
459  int num_items,
460  int num_segments,
461  OffsetIteratorT d_begin_offsets,
462  OffsetIteratorT d_end_offsets,
463  int begin_bit = 0,
464  int end_bit = sizeof(KeyT) * 8,
465  cudaStream_t stream = 0,
466  bool debug_synchronous = false)
467  {
468  // Signed integer type for global offsets
469  typedef int OffsetT;
470 
472  d_temp_storage,
473  temp_storage_bytes,
474  d_keys,
475  d_values,
476  num_items,
477  num_segments,
480  begin_bit,
481  end_bit,
482  true,
483  stream,
484  debug_synchronous);
485  }
486 
487 
489  /******************************************************************/
493 
494 
542  template <
543  typename KeyT,
544  typename OffsetIteratorT>
545  CUB_RUNTIME_FUNCTION
546  static cudaError_t SortKeys(
547  void *d_temp_storage,
548  size_t &temp_storage_bytes,
549  const KeyT *d_keys_in,
550  KeyT *d_keys_out,
551  int num_items,
552  int num_segments,
553  OffsetIteratorT d_begin_offsets,
554  OffsetIteratorT d_end_offsets,
555  int begin_bit = 0,
556  int end_bit = sizeof(KeyT) * 8,
557  cudaStream_t stream = 0,
558  bool debug_synchronous = false)
559  {
560  // Signed integer type for global offsets
561  typedef int OffsetT;
562 
563  // Null value type
564  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
565  DoubleBuffer<NullType> d_values;
566 
568  d_temp_storage,
569  temp_storage_bytes,
570  d_keys,
571  d_values,
572  num_items,
573  num_segments,
576  begin_bit,
577  end_bit,
578  false,
579  stream,
580  debug_synchronous);
581  }
582 
583 
641  template <
642  typename KeyT,
643  typename OffsetIteratorT>
644  CUB_RUNTIME_FUNCTION
645  static cudaError_t SortKeys(
646  void *d_temp_storage,
647  size_t &temp_storage_bytes,
648  DoubleBuffer<KeyT> &d_keys,
649  int num_items,
650  int num_segments,
651  OffsetIteratorT d_begin_offsets,
652  OffsetIteratorT d_end_offsets,
653  int begin_bit = 0,
654  int end_bit = sizeof(KeyT) * 8,
655  cudaStream_t stream = 0,
656  bool debug_synchronous = false)
657  {
658  // Signed integer type for global offsets
659  typedef int OffsetT;
660 
661  // Null value type
662  DoubleBuffer<NullType> d_values;
663 
665  d_temp_storage,
666  temp_storage_bytes,
667  d_keys,
668  d_values,
669  num_items,
670  num_segments,
673  begin_bit,
674  end_bit,
675  true,
676  stream,
677  debug_synchronous);
678  }
679 
730  template <
731  typename KeyT,
732  typename OffsetIteratorT>
733  CUB_RUNTIME_FUNCTION
734  static cudaError_t SortKeysDescending(
735  void *d_temp_storage,
736  size_t &temp_storage_bytes,
737  const KeyT *d_keys_in,
738  KeyT *d_keys_out,
739  int num_items,
740  int num_segments,
741  OffsetIteratorT d_begin_offsets,
742  OffsetIteratorT d_end_offsets,
743  int begin_bit = 0,
744  int end_bit = sizeof(KeyT) * 8,
745  cudaStream_t stream = 0,
746  bool debug_synchronous = false)
747  {
748  // Signed integer type for global offsets
749  typedef int OffsetT;
750 
751  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
752  DoubleBuffer<NullType> d_values;
753 
755  d_temp_storage,
756  temp_storage_bytes,
757  d_keys,
758  d_values,
759  num_items,
760  num_segments,
763  begin_bit,
764  end_bit,
765  false,
766  stream,
767  debug_synchronous);
768  }
769 
770 
828  template <
829  typename KeyT,
830  typename OffsetIteratorT>
831  CUB_RUNTIME_FUNCTION
832  static cudaError_t SortKeysDescending(
833  void *d_temp_storage,
834  size_t &temp_storage_bytes,
835  DoubleBuffer<KeyT> &d_keys,
836  int num_items,
837  int num_segments,
838  OffsetIteratorT d_begin_offsets,
839  OffsetIteratorT d_end_offsets,
840  int begin_bit = 0,
841  int end_bit = sizeof(KeyT) * 8,
842  cudaStream_t stream = 0,
843  bool debug_synchronous = false)
844  {
845  // Signed integer type for global offsets
846  typedef int OffsetT;
847 
848  // Null value type
849  DoubleBuffer<NullType> d_values;
850 
852  d_temp_storage,
853  temp_storage_bytes,
854  d_keys,
855  d_values,
856  num_items,
857  num_segments,
860  begin_bit,
861  end_bit,
862  true,
863  stream,
864  debug_synchronous);
865  }
866 
867 
869 
870 
871 };
872 
873 } // CUB namespace
874 CUB_NS_POSTFIX // Optional outer namespace(s)
875 
876 
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required)
DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort...
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into descending order. (~2N auxiliary storage required).
KeyT * d_keys_out
< [in] Input keys buffer
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into descending order. (~N auxiliary storage required).
OffsetT OffsetT
[in] Total number of input data items
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
Internal dispatch routine.
KeyT const ValueT * d_values_in
[in] Input values buffer
KeyT const ValueT ValueT * d_values_out
[in] Output values buffer
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into ascending order. (~2N auxiliary storage required)
KeyT const ValueT ValueT OffsetT int int end_bit
< [in] The past-the-end (most-significant) bit index needed for key comparison
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into ascending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required)