41#include "../util_arch.cuh"
42#include "../util_namespace.cuh"
141 typename OffsetIteratorT>
144 void *d_temp_storage,
145 size_t &temp_storage_bytes,
146 const KeyT *d_keys_in,
155 int end_bit =
sizeof(KeyT) * 8,
156 cudaStream_t stream = 0,
157 bool debug_synchronous =
false)
250 typename OffsetIteratorT>
253 void *d_temp_storage,
254 size_t &temp_storage_bytes,
262 int end_bit =
sizeof(KeyT) * 8,
263 cudaStream_t stream = 0,
264 bool debug_synchronous =
false)
343 typename OffsetIteratorT>
346 void *d_temp_storage,
347 size_t &temp_storage_bytes,
348 const KeyT *d_keys_in,
357 int end_bit =
sizeof(KeyT) * 8,
358 cudaStream_t stream = 0,
359 bool debug_synchronous =
false)
452 typename OffsetIteratorT>
455 void *d_temp_storage,
456 size_t &temp_storage_bytes,
464 int end_bit =
sizeof(KeyT) * 8,
465 cudaStream_t stream = 0,
466 bool debug_synchronous =
false)
544 typename OffsetIteratorT>
547 void *d_temp_storage,
548 size_t &temp_storage_bytes,
549 const KeyT *d_keys_in,
556 int end_bit =
sizeof(KeyT) * 8,
557 cudaStream_t stream = 0,
558 bool debug_synchronous =
false)
643 typename OffsetIteratorT>
646 void *d_temp_storage,
647 size_t &temp_storage_bytes,
654 int end_bit =
sizeof(KeyT) * 8,
655 cudaStream_t stream = 0,
656 bool debug_synchronous =
false)
732 typename OffsetIteratorT>
735 void *d_temp_storage,
736 size_t &temp_storage_bytes,
737 const KeyT *d_keys_in,
744 int end_bit =
sizeof(KeyT) * 8,
745 cudaStream_t stream = 0,
746 bool debug_synchronous =
false)
830 typename OffsetIteratorT>
833 void *d_temp_storage,
834 size_t &temp_storage_bytes,
841 int end_bit =
sizeof(KeyT) * 8,
842 cudaStream_t stream = 0,
843 bool debug_synchronous =
false)
Optional outer namespace(s)
KeyT const ValueT ValueT * d_values_out
[in] Output values buffer
KeyT const ValueT ValueT OffsetT int int end_bit
[in] The past-the-end (most-significant) bit index needed for key comparison
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
KeyT * d_keys_out
[in] Output keys buffer
KeyT const ValueT * d_values_in
[in] Input values buffer
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
OffsetT OffsetT
[in] Total number of input data items
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort...
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required)
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into ascending order. (~2N auxiliary storage required)
static CUB_RUNTIME_FUNCTION cudaError_t SortPairs(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required)
static CUB_RUNTIME_FUNCTION cudaError_t SortKeys(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into ascending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortPairsDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into descending order. (~N auxiliary storage required).
static CUB_RUNTIME_FUNCTION cudaError_t SortKeysDescending(void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit=0, int end_bit=sizeof(KeyT) *8, cudaStream_t stream=0, bool debug_synchronous=false)
Sorts segments of keys into descending order. (~2N auxiliary storage required).
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, DoubleBuffer< KeyT > &d_keys, DoubleBuffer< ValueT > &d_values, int num_items, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous)
Internal dispatch routine.
Double-buffer storage wrapper for multi-pass stream transformations that require more than one storag...