OpenFPM  5.2.0
Project that contains the implementation of distributed structures
block_shuffle.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include "../util_arch.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief The BlockShuffle class provides collective methods for shuffling data partitioned across a CUDA thread block.
 */
template <
    typename    T,
    int         BLOCK_DIM_X,
    int         BLOCK_DIM_Y     = 1,
    int         BLOCK_DIM_Z     = 1,
    int         PTX_ARCH        = CUB_PTX_ARCH>
class BlockShuffle
{
private:

    /******************************************************************************
     * Constants
     ******************************************************************************/

    enum
    {
        BLOCK_THREADS       = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        LOG_WARP_THREADS    = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS        = 1 << LOG_WARP_THREADS,
        WARPS               = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Shared memory storage layout type (last element from each thread's input)
    struct _TempStorage
    {
        T prev[BLOCK_THREADS];
        T next[BLOCK_THREADS];
    };


public:

    /// \smemstorage{BlockShuffle}
    struct TempStorage : Uninitialized<_TempStorage> {};

private:

    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    unsigned int linear_tid;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


public:

    /******************************************************************
     * Collective constructors
     ******************************************************************/

    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockShuffle()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockShuffle(
        TempStorage &temp_storage)      ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /******************************************************************
     * Shuffle movement
     ******************************************************************/

    /// Each thread_i obtains the input provided by thread_(i+distance). The offset distance may be negative.
    __device__ __forceinline__ void Offset(
        T   input,                  ///< [in] The input item from the calling thread (thread_i)
        T&  output,                 ///< [out] The input item from thread_(i+distance); only updated when 0 <= (i + distance) < BLOCK_THREADS
        int distance = 1)           ///< [in] Offset distance (may be negative)
    {
        temp_storage.prev[linear_tid] = input;

        CTA_SYNC();

        if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS))
            output = temp_storage.prev[linear_tid + distance];
    }


    /// Each thread_i obtains the input provided by thread_(i+distance), wrapping around the thread block.
    __device__ __forceinline__ void Rotate(
        T               input,          ///< [in] The calling thread's input item
        T&              output,         ///< [out] The input item from thread_(i+distance) (may be aliased to input)
        unsigned int    distance = 1)   ///< [in] Offset distance (0 < distance < BLOCK_THREADS)
    {
        temp_storage.prev[linear_tid] = input;

        CTA_SYNC();

        unsigned int offset = threadIdx.x + distance;
        if (offset >= BLOCK_THREADS)
            offset -= BLOCK_THREADS;

        output = temp_storage.prev[offset];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it up by one item.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Up(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to input); prev[0] is not updated for thread_0
    {
        temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
            prev[ITEM] = input[ITEM - 1];

        if (linear_tid > 0)
            prev[0] = temp_storage.prev[linear_tid - 1];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it up by one item. All threads also receive the last item of thread_(BLOCK_THREADS-1).
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Up(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to input); prev[0] is not updated for thread_0
        T &block_suffix)                ///< [out] The item input[ITEMS_PER_THREAD - 1] from thread_(BLOCK_THREADS-1), provided to all threads
    {
        Up(input, prev);
        block_suffix = temp_storage.prev[BLOCK_THREADS - 1];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it down by one item.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Down(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to input)
    {
        temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];

        CTA_SYNC();

        #pragma unroll
        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
            prev[ITEM] = input[ITEM - 1];

        if (linear_tid > 0)
            prev[0] = temp_storage.prev[linear_tid - 1];
    }


    /// The thread block rotates its blocked arrangement of input items, shifting it down by one item.
    template <int ITEMS_PER_THREAD>
    __device__ __forceinline__ void Down(
        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to input)
        T &block_prefix)                ///< [out] The item provided to all threads
    {
        Up(input, prev);
        block_prefix = temp_storage.prev[BLOCK_THREADS - 1];
    }


};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
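
As a usage illustration (not part of the header above), the following minimal sketch shows how a kernel might use BlockShuffle::Offset so that each thread obtains the value held by the next thread in its block. The include path, kernel name, and block size are assumptions for illustration, and the sketch presumes a build in which BlockShuffle instantiates cleanly.

#include <cub/block/block_shuffle.cuh>   // assumed include path

__global__ void NeighborExchangeKernel(const float *d_in, float *d_out)
{
    const int THREADS_PER_BLOCK = 256;   // must match the launch configuration
    typedef cub::BlockShuffle<float, THREADS_PER_BLOCK> BlockShuffleT;

    // Shared memory backing the collective
    __shared__ typename BlockShuffleT::TempStorage temp_storage;

    int   gid = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    float val = d_in[gid];

    // Each thread_i receives the value of thread_(i+1); the last thread's output
    // is not updated by Offset, so seed it with the thread's own value.
    float next = val;
    BlockShuffleT(temp_storage).Offset(val, next, 1);

    d_out[gid] = next;
}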
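
A second sketch, under the same assumptions, exercises the blocked-arrangement overload Up(): each thread owns TILE_ITEMS consecutive items, the tile is shifted up by one item, and the result is used to form adjacent differences. TILE_THREADS, TILE_ITEMS, the kernel name, and the assumption of full tiles (no bounds checks) are illustrative.

#include <cub/block/block_shuffle.cuh>   // assumed include path

constexpr int TILE_THREADS = 128;
constexpr int TILE_ITEMS   = 4;

__global__ void AdjacentDifferenceKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockShuffle<int, TILE_THREADS> BlockShuffleT;
    __shared__ typename BlockShuffleT::TempStorage temp_storage;

    // Blocked arrangement: thread t owns items [t*TILE_ITEMS, (t+1)*TILE_ITEMS)
    int tile_base = blockIdx.x * TILE_THREADS * TILE_ITEMS;
    int items[TILE_ITEMS];
    for (int i = 0; i < TILE_ITEMS; ++i)
        items[i] = d_in[tile_base + threadIdx.x * TILE_ITEMS + i];

    // Shift the tile up by one item: pred[i] receives the item preceding items[i].
    // pred[0] of thread 0 is not written, so seed it with items[0] (its difference becomes 0).
    int pred[TILE_ITEMS];
    pred[0] = items[0];
    BlockShuffleT(temp_storage).Up(items, pred);

    // Adjacent difference within the blocked arrangement
    for (int i = 0; i < TILE_ITEMS; ++i)
        d_out[tile_base + threadIdx.x * TILE_ITEMS + i] = items[i] - pred[i];
}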