40#if (__CUDACC_VER_MAJOR__ >= 9)
41 #include <cuda_fp16.h>
44#include "util_macro.cuh"
71template <
bool IF,
typename ThenType,
typename ElseType>
78#ifndef DOXYGEN_SHOULD_SKIP_THIS
80template <
typename ThenType,
typename ElseType>
81struct If<false, ThenType, ElseType>
83 typedef ElseType
Type;
97template <
typename A,
typename B>
106#ifndef DOXYGEN_SHOULD_SKIP_THIS
131template <
int N,
int CURRENT_VAL = N,
int COUNT = 0>
135 enum { VALUE =
Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
138#ifndef DOXYGEN_SHOULD_SKIP_THIS
140template <
int N,
int COUNT>
143 enum {VALUE = (1 << (COUNT - 1) < N) ?
157 enum { VALUE = ((N & (N - 1)) == 0) };
169template <
typename Tp>
175#ifndef DOXYGEN_SHOULD_SKIP_THIS
177template <
typename Tp>
194template <
typename Tp>
200#ifndef DOXYGEN_SHOULD_SKIP_THIS
202template <
typename Tp>
221template <
typename Tp,
typename Up = Tp>
228#ifndef DOXYGEN_SHOULD_SKIP_THIS
230template <
typename Tp,
typename Up>
236template <
typename Tp,
typename Up>
242template <
typename Tp,
typename Up>
258#ifndef DOXYGEN_SHOULD_SKIP_THIS
260 template <
typename T>
261 __host__ __device__ __forceinline__
NullType& operator =(
const T&) {
return *
this; }
263 __host__ __device__ __forceinline__
bool operator ==(
const NullType&) {
return true; }
265 __host__ __device__ __forceinline__
bool operator !=(
const NullType&) {
return false; }
281#ifndef DOXYGEN_SHOULD_SKIP_THIS
312#define __CUB_ALIGN_BYTES(t, b) \
313 template <> struct AlignBytes<t> \
314 { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; };
316__CUB_ALIGN_BYTES(short4, 8)
317__CUB_ALIGN_BYTES(ushort4, 8)
318__CUB_ALIGN_BYTES(int2, 8)
319__CUB_ALIGN_BYTES(uint2, 8)
320__CUB_ALIGN_BYTES(
long long, 8)
321__CUB_ALIGN_BYTES(
unsigned long long, 8)
322__CUB_ALIGN_BYTES(float2, 8)
323__CUB_ALIGN_BYTES(
double, 8)
325 __CUB_ALIGN_BYTES(long2, 8)
326 __CUB_ALIGN_BYTES(ulong2, 8)
328 __CUB_ALIGN_BYTES(long2, 16)
329 __CUB_ALIGN_BYTES(ulong2, 16)
331__CUB_ALIGN_BYTES(int4, 16)
332__CUB_ALIGN_BYTES(uint4, 16)
333__CUB_ALIGN_BYTES(float4, 16)
334__CUB_ALIGN_BYTES(long4, 16)
335__CUB_ALIGN_BYTES(ulong4, 16)
336__CUB_ALIGN_BYTES(longlong2, 16)
337__CUB_ALIGN_BYTES(ulonglong2, 16)
338__CUB_ALIGN_BYTES(double2, 16)
339__CUB_ALIGN_BYTES(longlong4, 16)
340__CUB_ALIGN_BYTES(ulonglong4, 16)
341__CUB_ALIGN_BYTES(double4, 16)
356 template <
typename Unit>
361 IS_MULTIPLE = (
sizeof(T) %
sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
396#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
411#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
427#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
454template <
typename T,
int vec_elements>
struct CubVector;
456#ifndef DOXYGEN_SHOULD_SKIP_THIS
523#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \
525 template<> struct CubVector<base_type, 1> : short_type##1 \
527 typedef base_type BaseType; \
528 typedef short_type##1 Type; \
529 __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
531 retval.x = x + other.x; \
534 __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
536 retval.x = x - other.x; \
541 template<> struct CubVector<base_type, 2> : short_type##2 \
543 typedef base_type BaseType; \
544 typedef short_type##2 Type; \
545 __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
547 retval.x = x + other.x; \
548 retval.y = y + other.y; \
551 __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
553 retval.x = x - other.x; \
554 retval.y = y - other.y; \
559 template<> struct CubVector<base_type, 3> : short_type##3 \
561 typedef base_type BaseType; \
562 typedef short_type##3 Type; \
563 __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
565 retval.x = x + other.x; \
566 retval.y = y + other.y; \
567 retval.z = z + other.z; \
570 __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
572 retval.x = x - other.x; \
573 retval.y = y - other.y; \
574 retval.z = z - other.z; \
579 template<> struct CubVector<base_type, 4> : short_type##4 \
581 typedef base_type BaseType; \
582 typedef short_type##4 Type; \
583 __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
585 retval.x = x + other.x; \
586 retval.y = y + other.y; \
587 retval.z = z + other.z; \
588 retval.w = w + other.w; \
591 __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
593 retval.x = x - other.x; \
594 retval.y = y - other.y; \
595 retval.z = z - other.z; \
596 retval.w = w - other.w; \
620#undef CUB_DEFINE_VECTOR_TYPE
648 __host__ __device__ __forceinline__ T&
Alias()
650 return reinterpret_cast<T&
>(*this);
661#if defined(_WIN32) && !defined(_WIN64)
675 __host__ __device__ __forceinline__
679 __host__ __device__ __forceinline__
683 __host__ __device__ __forceinline__
bool operator !=(
const KeyValuePair &b)
685 return (value != b.value) || (key != b.key);
689#if defined(_WIN32) && !defined(_WIN64)
704template <
typename K,
typename V>
705struct KeyValuePair<K, V, true, false>
717 __host__ __device__ __forceinline__
721 __host__ __device__ __forceinline__
722 KeyValuePair(Key
const& key, Value
const& value) : key(key), value(value) {}
725 __host__ __device__ __forceinline__
bool operator !=(
const KeyValuePair &b)
727 return (value != b.value) || (key != b.key);
733template <
typename K,
typename V>
734struct KeyValuePair<K, V, false, true>
746 __host__ __device__ __forceinline__
750 __host__ __device__ __forceinline__
751 KeyValuePair(Key
const& key, Value
const& value) : key(key), value(value) {}
754 __host__ __device__ __forceinline__
bool operator !=(
const KeyValuePair &b)
756 return (value != b.value) || (key != b.key);
763#ifndef DOXYGEN_SHOULD_SKIP_THIS
769template <
typename T,
int COUNT>
813 d_buffers[0] = d_current;
814 d_buffers[1] = d_alternate;
818 __host__ __device__ __forceinline__ T*
Current() {
return d_buffers[selector]; }
821 __host__ __device__ __forceinline__ T*
Alternate() {
return d_buffers[selector ^ 1]; }
835#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \
836 template <typename T> \
837 struct detector_name \
839 template <typename C> \
840 static char& test(typename C::nested_type_name*); \
841 template <typename> \
842 static int& test(...); \
845 VALUE = sizeof(test<T>(0)) < sizeof(int) \
858template <
bool Condition,
class T =
void>
878template <
typename T,
typename BinaryOp>
888 template <
typename BinaryOpT,
bool (BinaryOpT::*)(const T &a, const T &b,
int idx) const>
struct SFINAE5 {};
889 template <
typename BinaryOpT,
bool (BinaryOpT::*)(const T &a, const T &b,
int idx)>
struct SFINAE6 {};
890 template <
typename BinaryOpT,
bool (BinaryOpT::*)(T a, T b,
int idx) const>
struct SFINAE7 {};
891 template <
typename BinaryOpT,
bool (BinaryOpT::*)(T a, T b,
int idx)>
struct SFINAE8 {};
898 template <
typename BinaryOpT> __host__ __device__
static char Test(
SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
899 template <
typename BinaryOpT> __host__ __device__
static char Test(
SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
900 template <
typename BinaryOpT> __host__ __device__
static char Test(
SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
901 template <
typename BinaryOpT> __host__ __device__
static char Test(
SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
903 template <
typename BinaryOpT>
static int Test(...);
908 static const bool HAS_PARAM =
sizeof(Test<BinaryOp>(NULL)) ==
sizeof(
char);
940template <Category _CATEGORY,
bool _PRIMITIVE,
bool _NULL_TYPE,
typename _Un
signedBits,
typename T>
947 PRIMITIVE = _PRIMITIVE,
948 NULL_TYPE = _NULL_TYPE,
956template <
typename _Un
signedBits,
typename T>
959 typedef _UnsignedBits UnsignedBits;
961 static const Category CATEGORY = UNSIGNED_INTEGER;
962 static const UnsignedBits LOWEST_KEY = UnsignedBits(0);
963 static const UnsignedBits MAX_KEY = UnsignedBits(-1);
972 static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
977 static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
982 static __host__ __device__ __forceinline__ T
Max()
984 UnsignedBits retval = MAX_KEY;
985 return reinterpret_cast<T&
>(retval);
988 static __host__ __device__ __forceinline__ T Lowest()
990 UnsignedBits retval = LOWEST_KEY;
991 return reinterpret_cast<T&
>(retval);
999template <
typename _Un
signedBits,
typename T>
1002 typedef _UnsignedBits UnsignedBits;
1004 static const Category CATEGORY = SIGNED_INTEGER;
1005 static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((
sizeof(UnsignedBits) * 8) - 1);
1006 static const UnsignedBits LOWEST_KEY = HIGH_BIT;
1007 static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT;
1015 static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
1017 return key ^ HIGH_BIT;
1020 static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
1022 return key ^ HIGH_BIT;
1025 static __host__ __device__ __forceinline__ T
Max()
1027 UnsignedBits retval = MAX_KEY;
1028 return reinterpret_cast<T&
>(retval);
1031 static __host__ __device__ __forceinline__ T Lowest()
1033 UnsignedBits retval = LOWEST_KEY;
1034 return reinterpret_cast<T&
>(retval);
1038template <
typename _T>
1044 static __host__ __device__ __forceinline__
float Max() {
1048 static __host__ __device__ __forceinline__
float Lowest() {
1049 return FLT_MAX * float(-1);
1056 static __host__ __device__ __forceinline__
double Max() {
1060 static __host__ __device__ __forceinline__
double Lowest() {
1061 return DBL_MAX * double(-1);
1066#if (__CUDACC_VER_MAJOR__ >= 9)
1070 static __host__ __device__ __forceinline__ __half
Max() {
1071 unsigned short max_word = 0x7BFF;
1072 return reinterpret_cast<__half&
>(max_word);
1075 static __host__ __device__ __forceinline__ __half Lowest() {
1076 unsigned short lowest_word = 0xFBFF;
1077 return reinterpret_cast<__half&
>(lowest_word);
1086template <
typename _Un
signedBits,
typename T>
1089 typedef _UnsignedBits UnsignedBits;
1091 static const Category CATEGORY = FLOATING_POINT;
1092 static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((
sizeof(UnsignedBits) * 8) - 1);
1093 static const UnsignedBits LOWEST_KEY = UnsignedBits(-1);
1094 static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT;
1102 static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
1104 UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
1108 static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
1110 UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
1114 static __host__ __device__ __forceinline__ T
Max() {
1118 static __host__ __device__ __forceinline__ T Lowest() {
1131template <>
struct NumericTraits<char> :
BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {};
1146#if (__CUDACC_VER_MAJOR__ >= 9)
1157template <
typename T>
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
#define CUB_DEFINE_VECTOR_TYPE(base_type, short_type)
Category
Basic type traits categories.
@ MAX_VEC_ELEMENTS
The maximum number of elements in CUDA vector types.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
T Type
The "truly aligned" type.
@ ALIGN_BYTES
The "true CUDA" alignment of T in bytes.
A wrapper for passing simple static arrays as kernel parameters.
__host__ __device__ __forceinline__ ArrayWrapper()
Constructor.
Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a,...
Exposes a member typedef Type that names the corresponding CUDA vector type if one exists....
Double-buffer storage wrapper for multi-pass stream transformations that require more than one storag...
__host__ __device__ __forceinline__ T * Current()
Return pointer to the currently valid buffer.
__host__ __device__ __forceinline__ DoubleBuffer(T *d_current, T *d_alternate)
Constructor.
__host__ __device__ __forceinline__ DoubleBuffer()
Constructor.
int selector
Selector into d_buffers (i.e., the active/valid buffer)
__host__ __device__ __forceinline__ T * Alternate()
Return pointer to the currently invalid buffer.
Simple enable-if (similar to Boost)
T Type
Enable-if type for SFINAE dummy variables.
Type selection (IF ? ThenType : ElseType)
ThenType Type
Conditional type result.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
A key identifier paired with a corresponding value.
__host__ __device__ __forceinline__ KeyValuePair()
Constructor.
__host__ __device__ __forceinline__ KeyValuePair(Key const &key, Value const &value)
Constructor.
_Value Value
Value data type.
Statically determine log2(N), rounded up.
A simple "NULL" marker type.
Statically determine if N is a power-of-two.
Removes const and volatile qualifiers from type Tp.
Up Type
Type without const and volatile qualifiers.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
DeviceWord storage[WORDS]
Backing storage.
__host__ __device__ __forceinline__ T & Alias()
Alias.
UnitWord< T >::DeviceWord DeviceWord
Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T.
Unit-words of data movement.
If< IsMultiple< longlong2 >::IS_MULTIPLE, ulonglong2, VolatileWord >::Type DeviceWord
Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T.
If< IsMultiple< longlong >::IS_MULTIPLE, unsignedlonglong, ShuffleWord >::Type VolatileWord
Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T.
If< IsMultiple< int4 >::IS_MULTIPLE, uint4, typenameIf< IsMultiple< int2 >::IS_MULTIPLE, uint2, ShuffleWord >::Type >::Type TextureWord
Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of ...
If< IsMultiple< int >::IS_MULTIPLE, unsignedint, typenameIf< IsMultiple< short >::IS_MULTIPLE, unsignedshort, unsignedchar >::Type >::Type ShuffleWord
Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T.