#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Register modifiers for pointer types when inlining PTX assembly:
 * "l"/u64 on 64-bit platforms, "r"/u32 on 32-bit platforms.
 */
#if defined(_WIN64) || defined(__LP64__)
    #define __CUB_LP64__ 1
    // 64-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "l"
    #define _CUB_ASM_PTR_SIZE_ "u64"
#else
    #define __CUB_LP64__ 0
    // 32-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "r"
    #define _CUB_ASM_PTR_SIZE_ "u32"
#endif

#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHR_ADD(
    unsigned int x,         ///< [in] value to shift
    unsigned int shift,     ///< [in] right-shift amount
    unsigned int addend)    ///< [in] value added after the shift
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM20+
    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x >> shift) + addend;
#endif
    return ret;
}
/**
 * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHL_ADD(
    unsigned int x,         ///< [in] value to shift
    unsigned int shift,     ///< [in] left-shift amount
    unsigned int addend)    ///< [in] value added after the shift
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM20+
    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x << shift) + addend;
#endif
    return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Bitfield-extract (generic byte length).  Extracts \p num_bits from
 * \p source starting at bit-offset \p bit_start, returned in the
 * least-significant bits of the result.
 */
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits        source,         ///< [in] word to extract from
    unsigned int        bit_start,      ///< [in] bit offset of the field
    unsigned int        num_bits,       ///< [in] width of the field
    Int2Type<BYTE_LEN>  /*byte_len*/)   ///< [in] compile-time dispatch on sizeof(UnsignedBits)
{
    unsigned int bits;
#if CUB_PTX_ARCH >= 200
    asm ("bfe.u32 %0, %1, %2, %3;" :
        "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
    // Portable fallback: mask out the field after shifting it down
    const unsigned int MASK = (1 << num_bits) - 1;
    bits = (source >> bit_start) & MASK;
#endif
    return bits;
}
/**
 * Bitfield-extract for 64-bit types.  Extracts \p num_bits from \p source
 * starting at bit-offset \p bit_start; bfe.u32 only handles 32-bit words,
 * so this is done with a shift and a 64-bit mask.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits    source,         ///< [in] word to extract from
    unsigned int    bit_start,      ///< [in] bit offset of the field
    unsigned int    num_bits,       ///< [in] width of the field
    Int2Type<8>     /*byte_len*/)   ///< [in] dispatch tag: sizeof(UnsignedBits) == 8
{
    const unsigned long long MASK = (1ull << num_bits) - 1;
    return (source >> bit_start) & MASK;
}

#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at
 * bit-offset \p bit_start, returned in the least-significant bits of the result.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits    source,     ///< [in] word to extract from
    unsigned int    bit_start,  ///< [in] bit offset of the field
    unsigned int    num_bits)   ///< [in] width of the field
{
    // Dispatch on the byte-length of UnsignedBits (32-bit vs. 64-bit paths)
    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
/**
 * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of
 * \p y into \p x at bit-offset \p bit_start; the result is written to \p ret.
 */
__device__ __forceinline__ void BFI(
    unsigned int &ret,          ///< [out] result word
    unsigned int x,             ///< [in] base word
    unsigned int y,             ///< [in] word supplying the inserted bits
    unsigned int bit_start,     ///< [in] bit offset of the field
    unsigned int num_bits)      ///< [in] width of the field
{
#if CUB_PTX_ARCH >= 200
    // bfi.b32 d,a,b,c,d inserts the low num_bits of operand a (= y) into
    // operand b (= x) at position bit_start
    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
    // Fixed: the previous fallback combined the operands the wrong way
    // around ((y & ~MASK) | (x & MASK)), disagreeing with both the
    // documented contract and the bfi.b32 path above.
    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
    unsigned int MASK_Y = ~MASK_X;
    ret = (x & MASK_Y) | ((y << bit_start) & MASK_X);
#endif
}
/**
 * \brief Three-operand add.  Returns \p x + \p y + \p z.
 */
__device__ __forceinline__ unsigned int IADD3(
    unsigned int x,
    unsigned int y,
    unsigned int z)
{
#if CUB_PTX_ARCH >= 200
    // Fused video-instruction form on SM20+ (accumulates into x)
    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" :
        "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
    x = x + y + z;
#endif
    return x;
}
/**
 * \brief Byte-permute.  Picks four arbitrary bytes from two 32-bit registers
 * (selected by the nibbles of \p index) and reassembles them into a 32-bit
 * destination register.
 */
__device__ __forceinline__ int PRMT(
    unsigned int a,         ///< [in] low source word (bytes 0-3)
    unsigned int b,         ///< [in] high source word (bytes 4-7)
    unsigned int index)     ///< [in] per-nibble byte selectors
{
    int ret;
    asm ("prmt.b32 %0, %1, %2, %3;" :
        "=r"(ret) : "r"(a), "r"(b), "r"(index));
    return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Barrier at named barrier #1 for \p count threads.
 */
__device__ __forceinline__ void BAR(int count)
{
    asm volatile("bar.sync 1, %0;" : : "r"(count));
}
/**
 * CTA barrier with predicate reduction: returns nonzero iff \p p is
 * nonzero for all threads in the block.
 */
__device__ __forceinline__ int CTA_SYNC_AND(int p)
{
    return __syncthreads_and(p);
}
/**
 * Warp barrier for the lanes in \p member_mask.  Compiles to a no-op when
 * CUB_USE_COOPERATIVE_GROUPS is not defined (pre-Volta implicit warp synchrony).
 */
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    __syncwarp(member_mask);
#endif
}
/**
 * Warp any-vote: returns nonzero iff \p predicate is nonzero for any
 * participating lane.  \p member_mask is ignored on the legacy path.
 */
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __any_sync(member_mask, predicate);
#else
    return ::__any(predicate);
#endif
}
/**
 * Warp all-vote: returns nonzero iff \p predicate is nonzero for every
 * participating lane.  \p member_mask is ignored on the legacy path.
 */
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __all_sync(member_mask, predicate);
#else
    return ::__all(predicate);
#endif
}
/**
 * Warp ballot: returns a 32-bit mask with bit i set iff \p predicate is
 * nonzero for lane i.  \p member_mask is ignored on the legacy path.
 */
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    return __ballot_sync(member_mask, predicate);
#else
    return __ballot(predicate);
#endif
}
/**
 * Warp synchronous shfl_up: copies \p word from the lane \p src_offset
 * below the caller.  \p flags carries the packed clamp/segment bound
 * (operand c of shfl); \p member_mask is unused on the legacy path.
 */
__device__ __forceinline__
unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
/**
 * Warp synchronous shfl_down: copies \p word from the lane \p src_offset
 * above the caller.  \p flags carries the packed clamp/segment bound
 * (operand c of shfl); \p member_mask is unused on the legacy path.
 */
__device__ __forceinline__
unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
    return word;
}
/**
 * Warp synchronous shfl_idx: copies \p word from lane \p src_lane.
 * \p flags carries the packed clamp/segment bound (operand c of shfl);
 * \p member_mask is unused on the legacy path.
 */
__device__ __forceinline__
unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
#else
    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
        : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
#endif
    return word;
}
/**
 * Floating-point multiply with round-toward-zero (prevents contraction
 * into fused multiply-add).
 */
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
    float d;
    asm ("mul.rz.f32 %0, %1, %2;" :
        "=f"(d) : "f"(a), "f"(b));
    return d;
}
/**
 * Floating-point fused multiply-add (a * b + c) with round-toward-zero.
 */
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
    float d;
    asm ("fma.rz.f32 %0, %1, %2, %3;" :
        "=f"(d) : "f"(a), "f"(b), "f"(c));
    return d;
}
/**
 * \brief Terminates the calling thread.
 */
__device__ __forceinline__ void ThreadExit()
{
    asm volatile("exit;");
}
/**
 * \brief Aborts execution and generates an interrupt to the host CPU.
 */
__device__ __forceinline__ void ThreadTrap()
{
    asm volatile("trap;");
}
/**
 * \brief Returns the row-major linear thread identifier for a
 * multidimensional thread block.  Dimensions of extent 1 contribute
 * nothing (their threadIdx component is necessarily 0).
 */
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
{
    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
           ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
           threadIdx.x;
}
/**
 * \brief Returns the warp lane ID of the calling thread (%laneid).
 */
__device__ __forceinline__ unsigned int LaneId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp ID of the calling thread (%warpid).  Unique among
 * concurrently resident warps, but may change as warps are rescheduled.
 */
__device__ __forceinline__ unsigned int WarpId()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes less than the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskLt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskLe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes greater than the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskGt()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread.
 */
__device__ __forceinline__ unsigned int LaneMaskGe()
{
    unsigned int ret;
    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
    return ret;
}
/**
 * \brief Shuffle-up for any data type.  Each warp-lane i obtains the value
 * \p input contributed by warp-lane i - \p src_offset.  Lanes below
 * \p first_thread keep their own \p input words.
 *
 * The value is moved word-by-word through 32-bit shuffles, so T may be of
 * arbitrary size.
 */
template <int LOGICAL_WARP_THREADS, typename T>
__device__ __forceinline__ T ShuffleUp(
    T               input,          ///< [in] the value to shuffle
    int             src_offset,     ///< [in] relative down-offset of the peer to read from
    int             first_thread,   ///< [in] index of the first lane in the logical warp (typically 0)
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Segment bound packed into bits [8..12] of the shfl c-operand
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    // NOTE(review): typedef reconstructed — confirm against UnitWord<T>
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias  = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * \brief Shuffle-down for any data type.  Each warp-lane i obtains the value
 * \p input contributed by warp-lane i + \p src_offset.  Lanes above
 * \p last_thread keep their own \p input words.
 *
 * The value is moved word-by-word through 32-bit shuffles, so T may be of
 * arbitrary size.
 */
template <int LOGICAL_WARP_THREADS, typename T>
__device__ __forceinline__ T ShuffleDown(
    T               input,          ///< [in] the value to shuffle
    int             src_offset,     ///< [in] relative up-offset of the peer to read from
    int             last_thread,    ///< [in] index of the last lane in the logical warp
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Segment bound packed into bits [8..12] of the shfl c-operand
        SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
    };

    // NOTE(review): typedef reconstructed — confirm against UnitWord<T>
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias  = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * \brief Shuffle-broadcast for any data type.  Each warp-lane i obtains the
 * value \p input contributed by warp-lane \p src_lane.
 *
 * The value is moved word-by-word through 32-bit shuffles, so T may be of
 * arbitrary size.
 */
template <int LOGICAL_WARP_THREADS, typename T>
__device__ __forceinline__ T ShuffleIndex(
    T               input,          ///< [in] the value to broadcast
    int             src_lane,       ///< [in] lane to read from
    unsigned int    member_mask)    ///< [in] 32-bit mask of participating warp lanes
{
    enum {
        // Segment bound in bits [8..12], lane clamp in bits [0..4] of the shfl c-operand
        SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
    };

    // NOTE(review): typedef reconstructed — confirm against UnitWord<T>
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias  = reinterpret_cast<ShuffleWord *>(&input);

    unsigned int shuffle_word;
    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
                                 src_lane,
                                 SHFL_C,
                                 member_mask);
    output_alias[0] = shuffle_word;

    #pragma unroll
    for (int WORD = 1; WORD < WORDS; ++WORD)
    {
        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
                                     src_lane,
                                     SHFL_C,
                                     member_mask);
        output_alias[WORD] = shuffle_word;
    }

    return output;
}
/**
 * Computes a 32-bit mask of the warp lanes whose low LABEL_BITS bits of
 * \p label match those of the calling thread, by ballot-voting each bit
 * position in turn and intersecting the per-bit peer masks.
 */
template <int LABEL_BITS>
inline __device__ unsigned int MatchAny(unsigned int label)
{
    unsigned int retval;

    // Extract masks of common threads for each bit
    #pragma unroll
    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
    {
        unsigned int mask;
        unsigned int current_bit = 1 << BIT;
        asm ("{\n"
            "    .reg .pred p;\n"
            "    and.b32 %0, %1, %2;"
            "    setp.eq.u32 p, %0, %2;\n"
#ifdef CUB_USE_COOPERATIVE_GROUPS
            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
#else
            "    vote.ballot.b32 %0, p;\n"
#endif
            // Lanes with the bit clear take the complement of the ballot
            "    @!p not.b32 %0, %0;\n"
            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));

        // Remove peers who differ at this bit position
        retval = (BIT == 0) ? mask : retval & mask;
    }

    return retval;
}
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
__device__ __forceinline__ void BFI(unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
__device__ __forceinline__ void BAR(int count)
__device__ __forceinline__ unsigned int LaneMaskLt()
Returns the warp lane mask of all lanes less than the calling thread.
__device__ __forceinline__ unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-left then add. Returns (x << shift) + addend.
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
Three-operand add. Returns x + y + z.
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskGt()
Returns the warp lane mask of all lanes greater than the calling thread.
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
__device__ __forceinline__ int CTA_SYNC_AND(int p)
__device__ __forceinline__ void ThreadExit()
Terminates the calling thread.
__device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskLe()
Returns the warp lane mask of all lanes less than or equal to the calling thread.
__device__ __forceinline__ void ThreadTrap()
Abort execution and generate an interrupt to the host CPU.
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
__device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
__device__ __forceinline__ unsigned int LaneMaskGe()
Returns the warp lane mask of all lanes greater than or equal to the calling thread.
__device__ __forceinline__ unsigned int WarpId()
Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps,...
__device__ __forceinline__ float FMUL_RZ(float a, float b)
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
__device__ __forceinline__ unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type< BYTE_LEN >)
__device__ __forceinline__ T ShuffleIndex(T input, int src_lane, unsigned int member_mask)
Shuffle-broadcast for any data type. Each warp-lane i obtains the value input contributed by warp-lane src_lane.
__device__ __forceinline__ T ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask)
Shuffle-up for any data type. Each warp-lane i obtains the value input contributed by warp-lane i-src_offset.
__device__ __forceinline__ T ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask)
Shuffle-down for any data type. Each warp-lane i obtains the value input contributed by warp-lane i+src_offset.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
__device__ unsigned int MatchAny(unsigned int label)
OffsetT int int num_bits
[in] Number of bits of current radix digit
OffsetT int current_bit
[in] Bit position of current radix digit
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...