OpenFPM_pdata  3.1.0
Project that contain the implementation of distributed structures
block_exchange.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../util_ptx.cuh"
37 #include "../util_arch.cuh"
38 #include "../util_macro.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
108 template <
109  typename InputT,
110  int BLOCK_DIM_X,
111  int ITEMS_PER_THREAD,
112  bool WARP_TIME_SLICING = false,
113  int BLOCK_DIM_Y = 1,
114  int BLOCK_DIM_Z = 1,
115  int PTX_ARCH = CUB_PTX_ARCH>
117 {
118 private:
119 
120  /******************************************************************************
121  * Constants
122  ******************************************************************************/
123 
    /// Compile-time constants governing the exchange layout
    enum
    {
        /// Total number of threads in the thread block
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS = 1 << LOG_WARP_THREADS,
        // Number of (possibly partial) warps in the block (ceiling division)
        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
        SMEM_BANKS = 1 << LOG_SMEM_BANKS,

        /// Total number of items in the block-wide tile
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

        // When warp-timeslicing, the exchange is serialized into one round per warp
        TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1,

        // Threads/items participating in a single time slice (whole block when not slicing)
        TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
        TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        // Per-warp slice extents (used for warp-striped exchanges regardless of slicing)
        WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
        WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
        INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
        PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
    };
151 
152  /******************************************************************************
153  * Type definitions
154  ******************************************************************************/
155 
    /// Shared memory storage layout type (16-byte aligned; sized for one time slice plus any bank-conflict padding)
    struct __align__(16) _TempStorage
    {
        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
    };
161 
162 public:
163 
    /// \smemstorage{BlockExchange}
    struct TempStorage : Uninitialized<_TempStorage> {};
166 
167 private:
168 
169 
170  /******************************************************************************
171  * Thread fields
172  ******************************************************************************/
173 
    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id within the (possibly multidimensional) thread block
    unsigned int linear_tid;
    /// Lane index within the warp
    unsigned int lane_id;
    /// Warp index within the block (0 when the block is a single warp)
    unsigned int warp_id;
    /// Starting offset of this warp's region in the shared buffer
    unsigned int warp_offset;
182 
183 
184  /******************************************************************************
185  * Utility methods
186  ******************************************************************************/
187 
    /// Internal storage allocator: provides a per-block __shared__ buffer when the
    /// caller does not supply a TempStorage of its own
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }
194 
195 
199  template <typename OutputT>
200  __device__ __forceinline__ void BlockedToStriped(
201  InputT input_items[ITEMS_PER_THREAD],
202  OutputT output_items[ITEMS_PER_THREAD],
203  Int2Type<false> /*time_slicing*/)
204  {
205  #pragma unroll
206  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
207  {
208  int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
209  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
210  temp_storage.buff[item_offset] = input_items[ITEM];
211  }
212 
213  CTA_SYNC();
214 
215  #pragma unroll
216  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
217  {
218  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
219  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
220  output_items[ITEM] = temp_storage.buff[item_offset];
221  }
222  }
223 
224 
    /**
     * Transposes data items from <em>blocked</em> to <em>striped</em> arrangement.
     * Specialized for warp-timeslicing: only one warp's worth of shared memory is
     * provisioned, so the exchange proceeds in TIME_SLICES rounds, one warp
     * writing per round while all threads harvest the overlapping strips.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Per-thread staging for results gathered across rounds
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            // Only the warp owning this slice deposits its blocked items
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
                const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;

                // Harvest only the portion of this strip covered by the current slice
                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage.buff[item_offset];
                    }
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
283 
284 
288  template <typename OutputT>
289  __device__ __forceinline__ void BlockedToWarpStriped(
290  InputT input_items[ITEMS_PER_THREAD],
291  OutputT output_items[ITEMS_PER_THREAD],
292  Int2Type<false> /*time_slicing*/)
293  {
294  #pragma unroll
295  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
296  {
297  int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
298  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
299  temp_storage.buff[item_offset] = input_items[ITEM];
300  }
301 
302  WARP_SYNC(0xffffffff);
303 
304  #pragma unroll
305  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
306  {
307  int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
308  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
309  output_items[ITEM] = temp_storage.buff[item_offset];
310  }
311  }
312 
    /**
     * Transposes data items from <em>blocked</em> to <em>warp-striped</em> arrangement.
     * Specialized for warp-timeslicing: all warps share a single warp-sized shared
     * buffer, so they take turns — warp 0 first (no preceding CTA barrier needed),
     * then warps 1..TIME_SLICES-1 each behind a CTA-wide barrier.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        if (warp_id == 0)
        {
            // Warp 0's round: write blocked, warp-barrier, read warp-striped
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                temp_storage.buff[item_offset] = input_items[ITEM];
            }

            WARP_SYNC(0xffffffff);

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                output_items[ITEM] = temp_storage.buff[item_offset];
            }
        }

        // Remaining warps take their turn, each round fenced by a CTA barrier
        #pragma unroll
        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
        {
            CTA_SYNC();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }

                WARP_SYNC(0xffffffff);

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    output_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }
    }
370 
371 
375  template <typename OutputT>
376  __device__ __forceinline__ void StripedToBlocked(
377  InputT input_items[ITEMS_PER_THREAD],
378  OutputT output_items[ITEMS_PER_THREAD],
379  Int2Type<false> /*time_slicing*/)
380  {
381  #pragma unroll
382  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
383  {
384  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
385  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
386  temp_storage.buff[item_offset] = input_items[ITEM];
387  }
388 
389  CTA_SYNC();
390 
391  // No timeslicing
392  #pragma unroll
393  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
394  {
395  int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
396  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
397  output_items[ITEM] = temp_storage.buff[item_offset];
398  }
399  }
400 
401 
    /**
     * Transposes data items from <em>striped</em> to <em>blocked</em> arrangement.
     * Specialized for warp-timeslicing: the shared buffer holds only one slice,
     * so each round all threads write the overlapping strips and the owning warp
     * harvests its blocked segment.
     */
    template <typename OutputT>
    __device__ __forceinline__ void StripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Warp time-slicing
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Write a strip of items
                const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
                const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;

                // Deposit only the portion of this strip covered by the current slice
                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_storage.buff[item_offset] = input_items[ITEM];
                    }
                }
            }

            CTA_SYNC();

            // The warp owning this slice gathers its blocked segment
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
461 
462 
466  template <typename OutputT>
467  __device__ __forceinline__ void WarpStripedToBlocked(
468  InputT input_items[ITEMS_PER_THREAD],
469  OutputT output_items[ITEMS_PER_THREAD],
470  Int2Type<false> /*time_slicing*/)
471  {
472  #pragma unroll
473  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
474  {
475  int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
476  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
477  temp_storage.buff[item_offset] = input_items[ITEM];
478  }
479 
480  WARP_SYNC(0xffffffff);
481 
482  #pragma unroll
483  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
484  {
485  int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
486  if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
487  output_items[ITEM] = temp_storage.buff[item_offset];
488  }
489  }
490 
491 
    /**
     * Transposes data items from <em>warp-striped</em> to <em>blocked</em> arrangement.
     * Specialized for warp-timeslicing: warps share the single warp-sized buffer and
     * take turns in slice order, each round fenced by a CTA-wide barrier.
     */
    template <typename OutputT>
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        #pragma unroll
        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
        {
            CTA_SYNC();

            // Only the warp owning this slice exchanges during this round
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }

                WARP_SYNC(0xffffffff);

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    output_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }
    }
528 
529 
533  template <typename OutputT, typename OffsetT>
534  __device__ __forceinline__ void ScatterToBlocked(
535  InputT input_items[ITEMS_PER_THREAD],
536  OutputT output_items[ITEMS_PER_THREAD],
537  OffsetT ranks[ITEMS_PER_THREAD],
538  Int2Type<false> /*time_slicing*/)
539  {
540  #pragma unroll
541  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
542  {
543  int item_offset = ranks[ITEM];
544  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
545  temp_storage.buff[item_offset] = input_items[ITEM];
546  }
547 
548  CTA_SYNC();
549 
550  #pragma unroll
551  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
552  {
553  int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
554  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
555  output_items[ITEM] = temp_storage.buff[item_offset];
556  }
557  }
558 
    /**
     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.
     * Specialized for warp-timeslicing: each round, threads scatter only the items
     * whose ranks fall inside the current slice, then the owning warp gathers.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Per-thread staging for results gathered across rounds
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;

            // Scatter only the items whose ranks land in the current slice
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            // The warp owning this slice gathers its blocked segment
            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_items[ITEM] = temp_storage.buff[item_offset];
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
610 
611 
615  template <typename OutputT, typename OffsetT>
616  __device__ __forceinline__ void ScatterToStriped(
617  InputT input_items[ITEMS_PER_THREAD],
618  OutputT output_items[ITEMS_PER_THREAD],
619  OffsetT ranks[ITEMS_PER_THREAD],
620  Int2Type<false> /*time_slicing*/)
621  {
622  #pragma unroll
623  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
624  {
625  int item_offset = ranks[ITEM];
626  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
627  temp_storage.buff[item_offset] = input_items[ITEM];
628  }
629 
630  CTA_SYNC();
631 
632  #pragma unroll
633  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
634  {
635  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
636  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
637  output_items[ITEM] = temp_storage.buff[item_offset];
638  }
639  }
640 
641 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement.
     * Specialized for warp-timeslicing: each round, threads scatter the items whose
     * ranks fall inside the current slice, then all threads harvest the strips that
     * overlap that slice.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD],
        Int2Type<true> /*time_slicing*/)
    {
        // Per-thread staging for results gathered across rounds
        InputT temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;

            // Barrier before reusing the shared buffer for this round
            CTA_SYNC();

            // Scatter only the items whose ranks land in the current slice
            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage.buff[item_offset] = input_items[ITEM];
                }
            }

            CTA_SYNC();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
                const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;

                // Harvest only the portion of this strip covered by the current slice
                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage.buff[item_offset];
                    }
                }
            }
        }

        // Copy staged results to the output
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            output_items[ITEM] = temp_items[ITEM];
        }
    }
701 
702 
703 public:
704 
705  /******************************************************************/
709 
713  __device__ __forceinline__ BlockExchange()
714  :
716  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
717  warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
718  lane_id(LaneId()),
719  warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
720  {}
721 
722 
726  __device__ __forceinline__ BlockExchange(
728  :
729  temp_storage(temp_storage.Alias()),
730  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
731  lane_id(LaneId()),
732  warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
733  warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
734  {}
735 
736 
738  /******************************************************************/
742 
    /**
     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void StripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
786 
787 
    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
835 
836 
837 
    /**
     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
883 
884 
885 
    /**
     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em>
     * arrangement, dispatching at compile time to the timesliced or non-timesliced
     * specialization.
     */
    template <typename OutputT>
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD])
    {
        BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
    }
934 
935 
936 
938  /******************************************************************/
942 
943 
    /**
     * Exchanges data items annotated by rank into <em>blocked</em> arrangement,
     * dispatching at compile time to the timesliced or non-timesliced specialization.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
    }
960 
961 
962 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement,
     * dispatching at compile time to the timesliced or non-timesliced specialization.
     */
    template <typename OutputT, typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
    }
979 
980 
981 
990  template <typename OutputT, typename OffsetT>
991  __device__ __forceinline__ void ScatterToStripedGuarded(
992  InputT input_items[ITEMS_PER_THREAD],
993  OutputT output_items[ITEMS_PER_THREAD],
994  OffsetT ranks[ITEMS_PER_THREAD])
995  {
996  #pragma unroll
997  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
998  {
999  int item_offset = ranks[ITEM];
1000  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1001  if (ranks[ITEM] >= 0)
1002  temp_storage.buff[item_offset] = input_items[ITEM];
1003  }
1004 
1005  CTA_SYNC();
1006 
1007  #pragma unroll
1008  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
1009  {
1010  int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
1011  if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
1012  output_items[ITEM] = temp_storage.buff[item_offset];
1013  }
1014  }
1015 
1016 
1017 
1018 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement,
     * guarded by flag: only items whose is_valid flag is set are scattered (output
     * slots of unwritten offsets receive whatever the shared buffer holds).
     */
    template <typename OutputT, typename OffsetT, typename ValidFlag>
    __device__ __forceinline__ void ScatterToStripedFlagged(
        InputT input_items[ITEMS_PER_THREAD],
        OutputT output_items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD],
        ValidFlag is_valid[ITEMS_PER_THREAD])
    {
        // Scatter only flagged items
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            if (is_valid[ITEM])
                temp_storage.buff[item_offset] = input_items[ITEM];
        }

        CTA_SYNC();

        // Gather in striped order
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            output_items[ITEM] = temp_storage.buff[item_offset];
        }
    }
1054 
1055 
1057 
1058 
1059 
1060 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
1061 
1062 
    /// In-place overload: transposes items from striped to blocked arrangement
    __device__ __forceinline__ void StripedToBlocked(
        InputT items[ITEMS_PER_THREAD])
    {
        StripedToBlocked(items, items);
    }
1068 
    /// In-place overload: transposes items from blocked to striped arrangement
    __device__ __forceinline__ void BlockedToStriped(
        InputT items[ITEMS_PER_THREAD])
    {
        BlockedToStriped(items, items);
    }
1074 
    /// In-place overload: transposes items from warp-striped to blocked arrangement
    __device__ __forceinline__ void WarpStripedToBlocked(
        InputT items[ITEMS_PER_THREAD])
    {
        WarpStripedToBlocked(items, items);
    }
1080 
    /// In-place overload: transposes items from blocked to warp-striped arrangement
    __device__ __forceinline__ void BlockedToWarpStriped(
        InputT items[ITEMS_PER_THREAD])
    {
        BlockedToWarpStriped(items, items);
    }
1086 
    /// In-place overload: exchanges rank-annotated items into blocked arrangement
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToBlocked(
        InputT items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToBlocked(items, items, ranks);
    }
1094 
    /// In-place overload: exchanges rank-annotated items into striped arrangement
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        InputT items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToStriped(items, items, ranks);
    }
1102 
    /// In-place overload: guarded scatter into striped arrangement (negative ranks skipped)
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToStripedGuarded(
        InputT items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        ScatterToStripedGuarded(items, items, ranks);
    }
1110 
1111  template <typename OffsetT, typename ValidFlag>
1112  __device__ __forceinline__ void ScatterToStripedFlagged(
1113  InputT items[ITEMS_PER_THREAD],
1114  OffsetT ranks[ITEMS_PER_THREAD],
1115  ValidFlag is_valid[ITEMS_PER_THREAD])
1116  {
1117  ScatterToStriped(items, items, ranks, is_valid);
1118  }
1119 
1120 #endif // DOXYGEN_SHOULD_SKIP_THIS
1121 
1122 
1123 };
1124 
1125 
1126 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
1127 
1128 
1129 template <
1130  typename T,
1131  int ITEMS_PER_THREAD,
1132  int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
1133  int PTX_ARCH = CUB_PTX_ARCH>
1135 {
1136 private:
1137 
1138  /******************************************************************************
1139  * Constants
1140  ******************************************************************************/
1141 
    /// Compile-time constants governing the warp-level exchange layout
    enum
    {
        // Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        // One extra slot beyond the logical warp's tile
        WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,

        LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
        SMEM_BANKS = 1 << LOG_SMEM_BANKS,

        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
        INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
        PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
    };
1157 
1158  /******************************************************************************
1159  * Type definitions
1160  ******************************************************************************/
1161 
1164  {
1165  T buff[WARP_ITEMS + PADDING_ITEMS];
1166  };
1167 
1168 public:
1169 
    /// \smemstorage{WarpExchange}
    struct TempStorage : Uninitialized<_TempStorage> {};
1172 
1173 private:
1174 
1175 
1176  /******************************************************************************
1177  * Thread fields
1178  ******************************************************************************/
1179 
    /// Lane index of the calling thread within its logical warp
    int lane_id;
1182 
1183 public:
1184 
1185  /******************************************************************************
1186  * Construction
1187  ******************************************************************************/
1188 
1190  __device__ __forceinline__ WarpExchange(
1192  :
1193  temp_storage(temp_storage.Alias()),
1194  lane_id(IS_ARCH_WARP ?
1195  LaneId() :
1196  LaneId() % LOGICAL_WARP_THREADS)
1197  {}
1198 
1199 
1200  /******************************************************************************
1201  * Interface
1202  ******************************************************************************/
1203 
    /**
     * Exchanges data items annotated by rank into <em>striped</em> arrangement
     * across the logical warp.
     *
     * NOTE(review): when INSERT_PADDING is enabled, the first loop overwrites the
     * caller's ranks[] with the padded offsets — confirm no caller reuses ranks
     * after this call.
     */
    template <typename OffsetT>
    __device__ __forceinline__ void ScatterToStriped(
        T items[ITEMS_PER_THREAD],
        OffsetT ranks[ITEMS_PER_THREAD])
    {
        // Scatter each item to its (possibly padded) rank
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
            temp_storage.buff[ranks[ITEM]] = items[ITEM];
        }

        WARP_SYNC(0xffffffff);

        // Gather in striped order across the logical warp
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            items[ITEM] = temp_storage.buff[item_offset];
        }
    }
1234 
1235 };
1236 
1237 
1238 
1239 
1240 #endif // DOXYGEN_SHOULD_SKIP_THIS
1241 
1242 
1243 
1244 
1245 
1246 } // CUB namespace
1247 CUB_NS_POSTFIX // Optional outer namespace(s)
1248 
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
#define CUB_LOG_SMEM_BANKS(arch)
Number of smem banks.
Definition: util_arch.cuh:85
__device__ __forceinline__ BlockExchange(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
Optional outer namespace(s)
struct __align__(16) _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ void ScatterToStripedFlagged(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
\smemstorage{BlockExchange}
The BlockExchange class provides collective methods for rearranging data partitioned across a CUDA th...
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
#define CUB_LOG_WARP_THREADS(arch)
Number of threads per warp.
Definition: util_arch.cuh:73
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host p...
Definition: util_arch.cuh:53
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< false >)
CTA_SYNC()
Definition: util_ptx.cuh:255
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition: util_ptx.cuh:420
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement.
unsigned int linear_tid
Linear thread-id.
__device__ __forceinline__ void ScatterToBlocked(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
OffsetT OffsetT
[in] Total number of input data items
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to warp-striped arrangement.
\smemstorage{WarpExchange}
__device__ __forceinline__ void BlockedToWarpStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
Statically determine if N is a power-of-two.
Definition: util_type.cuh:155
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void BlockedToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from blocked arrangement to striped arrangement.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void StripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from striped arrangement to blocked arrangement.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
__device__ __forceinline__ void BlockedToStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedFlagged(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], ValidFlag is_valid[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
__device__ __forceinline__ unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend)
Shift-right then add. Returns (x >> shift) + addend.
Definition: util_ptx.cuh:87
Shared memory storage layout type.
__device__ __forceinline__ void BlockedToWarpStriped(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], Int2Type< false >)
The thread block size in threads.
#define CUB_MIN(a, b)
Select minimum(a, b)
Definition: util_macro.cuh:66
__device__ __forceinline__ void ScatterToStriped(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type< true >)
__device__ __forceinline__ void StripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ BlockExchange()
Collective constructor using a private static allocation of shared memory as temporary storage.
__device__ __forceinline__ void ScatterToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into blocked arrangement.
__device__ __forceinline__ void WarpStripedToBlocked(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD])
Transposes data items from warp-striped arrangement to blocked arrangement.
__device__ __forceinline__ void WarpStripedToBlocked(InputT items[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedGuarded(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
Definition: util_ptx.cuh:273
_TempStorage & temp_storage
Shared storage reference.
__device__ __forceinline__ WarpExchange(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void ScatterToStriped(T items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges valid data items annotated by rank into striped arrangement.
__device__ __forceinline__ void ScatterToStriped(InputT items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
__device__ __forceinline__ void ScatterToStripedGuarded(InputT input_items[ITEMS_PER_THREAD], OutputT output_items[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD])
Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged...
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.