41#include "../util_arch.cuh"
42#include "../util_namespace.cuh"
149 void *d_temp_storage,
150 size_t &temp_storage_bytes,
151 const KeyT *d_keys_in,
157 int end_bit =
sizeof(KeyT) * 8,
158 cudaStream_t stream = 0,
159 bool debug_synchronous =
false)
250 void *d_temp_storage,
251 size_t &temp_storage_bytes,
256 int end_bit =
sizeof(KeyT) * 8,
257 cudaStream_t stream = 0,
258 bool debug_synchronous =
false)
330 void *d_temp_storage,
331 size_t &temp_storage_bytes,
332 const KeyT *d_keys_in,
338 int end_bit =
sizeof(KeyT) * 8,
339 cudaStream_t stream = 0,
340 bool debug_synchronous =
false)
426 void *d_temp_storage,
427 size_t &temp_storage_bytes,
432 int end_bit =
sizeof(KeyT) * 8,
433 cudaStream_t stream = 0,
434 bool debug_synchronous =
false)
505 template <
typename KeyT>
508 void *d_temp_storage,
509 size_t &temp_storage_bytes,
510 const KeyT *d_keys_in,
514 int end_bit =
sizeof(KeyT) * 8,
515 cudaStream_t stream = 0,
516 bool debug_synchronous =
false)
594 template <
typename KeyT>
597 void *d_temp_storage,
598 size_t &temp_storage_bytes,
602 int end_bit =
sizeof(KeyT) * 8,
603 cudaStream_t stream = 0,
604 bool debug_synchronous =
false)
669 template <
typename KeyT>
672 void *d_temp_storage,
673 size_t &temp_storage_bytes,
674 const KeyT *d_keys_in,
678 int end_bit =
sizeof(KeyT) * 8,
679 cudaStream_t stream = 0,
680 bool debug_synchronous =
false)
753 template <
typename KeyT>
756 void *d_temp_storage,
757 size_t &temp_storage_bytes,
761 int end_bit =
sizeof(KeyT) * 8,
762 cudaStream_t stream = 0,
763 bool debug_synchronous =
false)
Optional outer namespace(s)
KeyT const ValueT ValueT * d_values_out
[in] Output values buffer
KeyT const ValueT ValueT OffsetT int int end_bit
[in] The past-the-end (most-significant) bit index needed for key comparison
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
KeyT * d_keys_out
[in] Output keys buffer
KeyT const ValueT * d_values_in
[in] Input values buffer
OffsetT OffsetT
[in] Total number of input data items
DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items.
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into ascending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into ascending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into ascending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into descending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts keys into descending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts key-value pairs into ascending order. (~N auxiliary storage required).
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array.