OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
device_radix_sort.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
41 #include "../util_arch.cuh"
42 #include "../util_namespace.cuh"
43 
45 CUB_NS_PREFIX
46 
48 namespace cub {
49 
50 
84 {
85 
86  /******************************************************************/
90 
144  template <
145  typename KeyT,
146  typename ValueT>
147  CUB_RUNTIME_FUNCTION
148  static cudaError_t SortPairs(
149  void *d_temp_storage,
150  size_t &temp_storage_bytes,
151  const KeyT *d_keys_in,
152  KeyT *d_keys_out,
153  const ValueT *d_values_in,
154  ValueT *d_values_out,
155  int num_items,
156  int begin_bit = 0,
157  int end_bit = sizeof(KeyT) * 8,
158  cudaStream_t stream = 0,
159  bool debug_synchronous = false)
160  {
161  // Signed integer type for global offsets
162  typedef int OffsetT;
163 
164  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
165  DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
166 
168  d_temp_storage,
169  temp_storage_bytes,
170  d_keys,
171  d_values,
172  num_items,
173  begin_bit,
174  end_bit,
175  false,
176  stream,
177  debug_synchronous);
178  }
179 
180 
245  template <
246  typename KeyT,
247  typename ValueT>
248  CUB_RUNTIME_FUNCTION
249  static cudaError_t SortPairs(
250  void *d_temp_storage,
251  size_t &temp_storage_bytes,
252  DoubleBuffer<KeyT> &d_keys,
253  DoubleBuffer<ValueT> &d_values,
254  int num_items,
255  int begin_bit = 0,
256  int end_bit = sizeof(KeyT) * 8,
257  cudaStream_t stream = 0,
258  bool debug_synchronous = false)
259  {
260  // Signed integer type for global offsets
261  typedef int OffsetT;
262 
264  d_temp_storage,
265  temp_storage_bytes,
266  d_keys,
267  d_values,
268  num_items,
269  begin_bit,
270  end_bit,
271  true,
272  stream,
273  debug_synchronous);
274  }
275 
276 
325  template <
326  typename KeyT,
327  typename ValueT>
328  CUB_RUNTIME_FUNCTION
329  static cudaError_t SortPairsDescending(
330  void *d_temp_storage,
331  size_t &temp_storage_bytes,
332  const KeyT *d_keys_in,
333  KeyT *d_keys_out,
334  const ValueT *d_values_in,
335  ValueT *d_values_out,
336  int num_items,
337  int begin_bit = 0,
338  int end_bit = sizeof(KeyT) * 8,
339  cudaStream_t stream = 0,
340  bool debug_synchronous = false)
341  {
342  // Signed integer type for global offsets
343  typedef int OffsetT;
344 
345  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
346  DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
347 
349  d_temp_storage,
350  temp_storage_bytes,
351  d_keys,
352  d_values,
353  num_items,
354  begin_bit,
355  end_bit,
356  false,
357  stream,
358  debug_synchronous);
359  }
360 
361 
421  template <
422  typename KeyT,
423  typename ValueT>
424  CUB_RUNTIME_FUNCTION
425  static cudaError_t SortPairsDescending(
426  void *d_temp_storage,
427  size_t &temp_storage_bytes,
428  DoubleBuffer<KeyT> &d_keys,
429  DoubleBuffer<ValueT> &d_values,
430  int num_items,
431  int begin_bit = 0,
432  int end_bit = sizeof(KeyT) * 8,
433  cudaStream_t stream = 0,
434  bool debug_synchronous = false)
435  {
436  // Signed integer type for global offsets
437  typedef int OffsetT;
438 
440  d_temp_storage,
441  temp_storage_bytes,
442  d_keys,
443  d_values,
444  num_items,
445  begin_bit,
446  end_bit,
447  true,
448  stream,
449  debug_synchronous);
450  }
451 
452 
454  /******************************************************************/
458 
459 
505  template <typename KeyT>
506  CUB_RUNTIME_FUNCTION
507  static cudaError_t SortKeys(
508  void *d_temp_storage,
509  size_t &temp_storage_bytes,
510  const KeyT *d_keys_in,
511  KeyT *d_keys_out,
512  int num_items,
513  int begin_bit = 0,
514  int end_bit = sizeof(KeyT) * 8,
515  cudaStream_t stream = 0,
516  bool debug_synchronous = false)
517  {
518  // Signed integer type for global offsets
519  typedef int OffsetT;
520 
521  // Null value type
522  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
523  DoubleBuffer<NullType> d_values;
524 
526  d_temp_storage,
527  temp_storage_bytes,
528  d_keys,
529  d_values,
530  num_items,
531  begin_bit,
532  end_bit,
533  false,
534  stream,
535  debug_synchronous);
536  }
537 
538 
594  template <typename KeyT>
595  CUB_RUNTIME_FUNCTION
596  static cudaError_t SortKeys(
597  void *d_temp_storage,
598  size_t &temp_storage_bytes,
599  DoubleBuffer<KeyT> &d_keys,
600  int num_items,
601  int begin_bit = 0,
602  int end_bit = sizeof(KeyT) * 8,
603  cudaStream_t stream = 0,
604  bool debug_synchronous = false)
605  {
606  // Signed integer type for global offsets
607  typedef int OffsetT;
608 
609  // Null value type
610  DoubleBuffer<NullType> d_values;
611 
613  d_temp_storage,
614  temp_storage_bytes,
615  d_keys,
616  d_values,
617  num_items,
618  begin_bit,
619  end_bit,
620  true,
621  stream,
622  debug_synchronous);
623  }
624 
669  template <typename KeyT>
670  CUB_RUNTIME_FUNCTION
671  static cudaError_t SortKeysDescending(
672  void *d_temp_storage,
673  size_t &temp_storage_bytes,
674  const KeyT *d_keys_in,
675  KeyT *d_keys_out,
676  int num_items,
677  int begin_bit = 0,
678  int end_bit = sizeof(KeyT) * 8,
679  cudaStream_t stream = 0,
680  bool debug_synchronous = false)
681  {
682  // Signed integer type for global offsets
683  typedef int OffsetT;
684 
685  DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
686  DoubleBuffer<NullType> d_values;
687 
689  d_temp_storage,
690  temp_storage_bytes,
691  d_keys,
692  d_values,
693  num_items,
694  begin_bit,
695  end_bit,
696  false,
697  stream,
698  debug_synchronous);
699  }
700 
701 
753  template <typename KeyT>
754  CUB_RUNTIME_FUNCTION
755  static cudaError_t SortKeysDescending(
756  void *d_temp_storage,
757  size_t &temp_storage_bytes,
758  DoubleBuffer<KeyT> &d_keys,
759  int num_items,
760  int begin_bit = 0,
761  int end_bit = sizeof(KeyT) * 8,
762  cudaStream_t stream = 0,
763  bool debug_synchronous = false)
764  {
765  // Signed integer type for global offsets
766  typedef int OffsetT;
767 
768  // Null value type
769  DoubleBuffer<NullType> d_values;
770 
772  d_temp_storage,
773  temp_storage_bytes,
774  d_keys,
775  d_values,
776  num_items,
777  begin_bit,
778  end_bit,
779  true,
780  stream,
781  debug_synchronous);
782  }
783 
784 
786 
787 
788 };
789 
794 } // CUB namespace
795 CUB_NS_POSTFIX // Optional outer namespace(s)
796 
797 
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Optional outer namespace(s)
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into descending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into ascending order. (~N auxiliary storage required).
DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequenc...
KeyT * d_keys_out
< [in] Input keys buffer
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into descending order. (~2N auxiliary storage required).
OffsetT OffsetT
[in] Total number of input data items
KeyT const ValueT * d_values_in
[in] Input values buffer
KeyT const ValueT ValueT * d_values_out
[in] Output values buffer
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into ascending order. (~N auxiliary storage required)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
KeyT const ValueT ValueT OffsetT int int end_bit
< [in] The past-the-end (most-significant) bit index needed for key comparison
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into ascending order. (~2N auxiliary storage required)
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into ascending order. (~2N auxiliary storage required)