doxygen/openfpm/block__load_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include <iterator>


#include "block_exchange.cuh"

#include "../iterator/cache_modified_input_iterator.cuh"

#include "../util_ptx.cuh"

#include "../util_macro.cuh"

#include "../util_type.cuh"

#include "../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/******************************************************************/


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
 *
 * The calling thread copies the ITEMS_PER_THREAD consecutive elements that
 * begin at offset (linear_tid * ITEMS_PER_THREAD) from \p block_itr into
 * \p items.  No range check is performed.
 *
 * \tparam InputT            Element type of the destination array
 * \tparam ITEMS_PER_THREAD  Number of consecutive items read by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr; must support `itr + int` and `itr[int]`
 *
 * \param[in]  linear_tid  Linear identifier of the calling thread
 * \param[in]  block_itr   Iterator to the first item of the segment
 * \param[out] items       Destination for the items read by this thread
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    // Offset of the first element owned by this thread
    const int thread_offset = linear_tid * ITEMS_PER_THREAD;
    InputIteratorT src = block_itr + thread_offset;

    // Copy this thread's contiguous run of elements
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = src[i];
    }
}


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
 *
 * The calling thread owns offsets
 * [linear_tid * ITEMS_PER_THREAD, linear_tid * ITEMS_PER_THREAD + ITEMS_PER_THREAD).
 * Only offsets strictly below \p valid_items are read; the remaining
 * elements of \p items are left unmodified.
 *
 * \tparam InputT            Element type of the destination array
 * \tparam ITEMS_PER_THREAD  Number of consecutive items owned by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr; must support `itr + int` and `itr[int]`
 *
 * \param[in]     linear_tid   Linear identifier of the calling thread
 * \param[in]     block_itr    Iterator to the first item of the segment
 * \param[in,out] items        Destination; only in-range elements are overwritten
 * \param[in]     valid_items  Number of readable items in the segment
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    InputIteratorT src = block_itr + (linear_tid * ITEMS_PER_THREAD);

    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        // Segment-relative offset of the i-th item owned by this thread
        const int segment_offset = (linear_tid * ITEMS_PER_THREAD) + i;

        if (segment_offset < valid_items)
        {
            items[i] = src[i];
        }
    }
}


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block,
 *        guarded by range, with a fall-back assignment of out-of-bounds elements.
 *
 * Every element of \p items is first set to \p oob_default; the range-guarded
 * overload is then invoked, which overwrites the elements whose offset is
 * below \p valid_items.
 *
 * \tparam InputT            Element type of the destination array
 * \tparam DefaultT          Type of the fall-back value (must be assignable to InputT)
 * \tparam ITEMS_PER_THREAD  Number of consecutive items owned by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr
 *
 * \param[in]  linear_tid   Linear identifier of the calling thread
 * \param[in]  block_itr    Iterator to the first item of the segment
 * \param[out] items        Destination for the items of this thread
 * \param[in]  valid_items  Number of readable items in the segment
 * \param[in]  oob_default  Value assigned to elements that are out of range
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    // Pre-fill with the fall-back value
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = oob_default;
    }

    // Overwrite the in-range elements
    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
}


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


/**
 * \brief Internal implementation for loading a blocked arrangement of items
 *        through vector-typed reads issued with cache modifier MODIFIER.
 *
 * The ITEMS_PER_THREAD items of the calling thread are viewed as TOTAL_WORDS
 * device words, which are grouped into vectors of 4, 2 or 1 words (the widest
 * of these that divides TOTAL_WORDS evenly).  The vectors are fetched with
 * ThreadLoad<MODIFIER> and then copied element-wise into \p items.
 *
 * NOTE(review): \p block_ptr is reinterpreted as a pointer to the vector
 * type, so it presumably has to be suitably aligned for that type - confirm
 * against the callers.
 *
 * \tparam MODIFIER          Cache modifier forwarded to ThreadLoad
 * \tparam T                 Element type
 * \tparam ITEMS_PER_THREAD  Number of consecutive items read by the calling thread
 *
 * \param[in]  linear_tid  Linear identifier of the calling thread
 * \param[in]  block_ptr   Pointer to the first item of the segment
 * \param[out] items       Destination for the items read by this thread
 */
template <
    CacheLoadModifier   MODIFIER,
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
    int    linear_tid,
    T      *block_ptr,
    T      (&items)[ITEMS_PER_THREAD])
{
    // Biggest memory access word that T is a whole multiple of
    typedef typename UnitWord<T>::DeviceWord WordT;

    enum
    {
        // Number of device words covering this thread's items
        TOTAL_WORDS = sizeof(items) / sizeof(WordT),

        // Widest vector width (in words) that evenly divides TOTAL_WORDS
        VECTOR_SIZE = ((TOTAL_WORDS % 4) == 0) ? 4 : (((TOTAL_WORDS % 2) == 0) ? 2 : 1),

        // Number of vector reads issued by this thread
        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE
    };

    // Vector type used for the reads
    typedef typename CubVector<WordT, VECTOR_SIZE>::Type VectorT;

    // Staging area for the vectors read by this thread
    VectorT staged[VECTORS_PER_THREAD];

    // Input viewed as vectors, advanced to this thread's first vector
    VectorT *thread_vec_ptr = reinterpret_cast<VectorT*>(block_ptr);
    thread_vec_ptr += (linear_tid * VECTORS_PER_THREAD);

    // Fetch the vectors in thread-blocked order
    #pragma unroll
    for (int v = 0; v < VECTORS_PER_THREAD; ++v)
    {
        staged[v] = ThreadLoad<MODIFIER>(thread_vec_ptr + v);
    }

    // Reinterpret the staged vectors as items and copy them out
    T *staged_items = reinterpret_cast<T*>(staged);

    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = staged_items[i];
    }
}


#endif // DOXYGEN_SHOULD_SKIP_THIS


/**
 * \brief Load a linear segment of items into a blocked arrangement across the thread block.
 *
 * Forwards to InternalLoadDirectBlockedVectorized with the LOAD_DEFAULT cache
 * modifier, which reads the calling thread's items through vector-typed
 * accesses.
 *
 * The pointer parameter is declared `const T*` so that both `T*` and
 * `const T*` inputs are accepted (with a plain `T*` parameter, a
 * pointer-to-const argument makes the deduction of T conflict with \p items).
 * The input is only read.
 *
 * NOTE(review): the helper reinterprets \p block_ptr as a pointer to a vector
 * type, so the pointer presumably has to be aligned accordingly - confirm
 * before calling with arbitrary offsets.
 *
 * \tparam T                 Element type
 * \tparam ITEMS_PER_THREAD  Number of consecutive items read by the calling thread
 *
 * \param[in]  linear_tid  Linear identifier of the calling thread
 * \param[in]  block_ptr   Pointer to the first item of the segment
 * \param[out] items       Destination for the items read by this thread
 */
template <
    typename        T,
    int             ITEMS_PER_THREAD>
__device__ __forceinline__ void LoadDirectBlockedVectorized(
    int         linear_tid,
    const T     *block_ptr,
    T           (&items)[ITEMS_PER_THREAD])
{
    // The internal helper is declared with a non-const pointer but only reads
    // through it, so dropping the qualifier here is safe.
    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, const_cast<T*>(block_ptr), items);
}


/******************************************************************/


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block.
 *
 * The calling thread reads the elements at offsets
 * linear_tid + k * BLOCK_THREADS for k in [0, ITEMS_PER_THREAD).
 * No range check is performed.
 *
 * \tparam BLOCK_THREADS     Stride (in items) between successive items of one thread
 * \tparam InputT            Element type of the destination array
 * \tparam ITEMS_PER_THREAD  Number of items read by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr; must support `itr + int` and `itr[int]`
 *
 * \param[in]  linear_tid  Linear identifier of the calling thread
 * \param[in]  block_itr   Iterator to the first item of the segment
 * \param[out] items       Destination for the items read by this thread
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    // Position of this thread's first element
    InputIteratorT src = block_itr + linear_tid;

    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        const int stripe_offset = i * BLOCK_THREADS;
        items[i] = src[stripe_offset];
    }
}


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
 *
 * The calling thread owns offsets linear_tid + k * BLOCK_THREADS for
 * k in [0, ITEMS_PER_THREAD).  Only offsets strictly below \p valid_items
 * are read; the remaining elements of \p items are left unmodified.
 *
 * \tparam BLOCK_THREADS     Stride (in items) between successive items of one thread
 * \tparam InputT            Element type of the destination array
 * \tparam ITEMS_PER_THREAD  Number of items owned by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr; must support `itr + int` and `itr[int]`
 *
 * \param[in]     linear_tid   Linear identifier of the calling thread
 * \param[in]     block_itr    Iterator to the first item of the segment
 * \param[in,out] items        Destination; only in-range elements are overwritten
 * \param[in]     valid_items  Number of readable items in the segment
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    InputIteratorT src = block_itr + linear_tid;

    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        // Segment-relative offset of the i-th item owned by this thread
        const int segment_offset = linear_tid + (i * BLOCK_THREADS);

        if (segment_offset < valid_items)
        {
            items[i] = src[i * BLOCK_THREADS];
        }
    }
}


/**
 * \brief Load a linear segment of items into a striped arrangement across the thread block,
 *        guarded by range, with a fall-back assignment of out-of-bounds elements.
 *
 * Every element of \p items is first set to \p oob_default; the range-guarded
 * overload is then invoked, which overwrites the elements whose offset is
 * below \p valid_items.
 *
 * \tparam BLOCK_THREADS     Stride (in items) between successive items of one thread
 * \tparam InputT            Element type of the destination array
 * \tparam DefaultT          Type of the fall-back value (must be assignable to InputT)
 * \tparam ITEMS_PER_THREAD  Number of items owned by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr
 *
 * \param[in]  linear_tid   Linear identifier of the calling thread
 * \param[in]  block_itr    Iterator to the first item of the segment
 * \param[out] items        Destination for the items of this thread
 * \param[in]  valid_items  Number of readable items in the segment
 * \param[in]  oob_default  Value assigned to elements that are out of range
 */
template <
    int             BLOCK_THREADS,
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    // Pre-fill with the fall-back value
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = oob_default;
    }

    // Overwrite the in-range elements
    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}


/******************************************************************/


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
 *
 * \p linear_tid is split into a lane (low bits, masked with
 * CUB_PTX_WARP_THREADS - 1) and a warp index (shifted right by
 * CUB_PTX_LOG_WARP_THREADS).  Each warp starts at offset
 * warp index * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD, and within it the
 * calling thread reads the elements at lane + k * CUB_PTX_WARP_THREADS for
 * k in [0, ITEMS_PER_THREAD).  No range check is performed.
 *
 * NOTE(review): the mask/shift pair presumably relies on
 * CUB_PTX_WARP_THREADS being 2^CUB_PTX_LOG_WARP_THREADS - confirm in the
 * header that defines these macros.
 *
 * \tparam InputT            Element type of the destination array
 * \tparam ITEMS_PER_THREAD  Number of items read by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr; must support `itr + int` and `itr[int]`
 *
 * \param[in]  linear_tid  Linear identifier of the calling thread
 * \param[in]  block_itr   Iterator to the first item of the segment
 * \param[out] items       Destination for the items read by this thread
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD])
{
    // Lane within the warp, warp index, and offset of the warp's first element
    const int lane      = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    const int warp_idx  = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    const int warp_base = warp_idx * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT src = block_itr + warp_base + lane;

    // Read one element per warp-wide stripe
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = src[(i * CUB_PTX_WARP_THREADS)];
    }
}


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
 *
 * \p linear_tid is split into a lane (low bits, masked with
 * CUB_PTX_WARP_THREADS - 1) and a warp index (shifted right by
 * CUB_PTX_LOG_WARP_THREADS).  The calling thread owns offsets
 * warp_base + lane + k * CUB_PTX_WARP_THREADS for k in [0, ITEMS_PER_THREAD),
 * where warp_base = warp index * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD.
 * Only offsets strictly below \p valid_items are read; the remaining
 * elements of \p items are left unmodified.
 *
 * \tparam InputT            Element type of the destination array
 * \tparam ITEMS_PER_THREAD  Number of items owned by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr; must support `itr + int` and `itr[int]`
 *
 * \param[in]     linear_tid   Linear identifier of the calling thread
 * \param[in]     block_itr    Iterator to the first item of the segment
 * \param[in,out] items        Destination; only in-range elements are overwritten
 * \param[in]     valid_items  Number of readable items in the segment
 */
template <
    typename        InputT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    // Lane within the warp, warp index, and offset of the warp's first element
    const int lane      = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    const int warp_idx  = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    const int warp_base = warp_idx * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    InputIteratorT src = block_itr + warp_base + lane;

    // Read one element per warp-wide stripe, skipping out-of-range offsets
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        const int segment_offset = warp_base + lane + (i * CUB_PTX_WARP_THREADS);

        if (segment_offset < valid_items)
        {
            items[i] = src[(i * CUB_PTX_WARP_THREADS)];
        }
    }
}


/**
 * \brief Load a linear segment of items into a warp-striped arrangement across the thread block,
 *        guarded by range, with a fall-back assignment of out-of-bounds elements.
 *
 * Every element of \p items is first set to \p oob_default; the range-guarded
 * overload is then invoked, which overwrites the elements whose offset is
 * below \p valid_items.
 *
 * \tparam InputT            Element type of the destination array
 * \tparam DefaultT          Type of the fall-back value (must be assignable to InputT)
 * \tparam ITEMS_PER_THREAD  Number of items owned by the calling thread
 * \tparam InputIteratorT    Type of \p block_itr
 *
 * \param[in]  linear_tid   Linear identifier of the calling thread
 * \param[in]  block_itr    Iterator to the first item of the segment
 * \param[out] items        Destination for the items of this thread
 * \param[in]  valid_items  Number of readable items in the segment
 * \param[in]  oob_default  Value assigned to elements that are out of range
 */
template <
    typename        InputT,
    typename        DefaultT,
    int             ITEMS_PER_THREAD,
    typename        InputIteratorT>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIteratorT  block_itr,
    InputT          (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    DefaultT        oob_default)
{
    // Pre-fill with the fall-back value
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        items[i] = oob_default;
    }

    // Overwrite the in-range elements
    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
}


       // end group UtilIo


//-----------------------------------------------------------------------------

// Generic BlockLoad abstraction

//-----------------------------------------------------------------------------


/**
 * \brief Enumerates the alternative algorithms cub::BlockLoad can use to read
 *        a linear segment of items.  The enumerator selects which
 *        BlockLoad::LoadInternal specialization performs the load.
 */
enum BlockLoadAlgorithm
{
    /// Items are read with LoadDirectBlocked.
    BLOCK_LOAD_DIRECT                       = 0,

    /// Unguarded loads through a CacheModifiedInputIterator use
    /// InternalLoadDirectBlockedVectorized; the remaining overloads use
    /// LoadDirectBlocked.
    BLOCK_LOAD_VECTORIZE                    = 1,

    /// Items are read with LoadDirectStriped and then rearranged with
    /// BlockExchange::StripedToBlocked.
    BLOCK_LOAD_TRANSPOSE                    = 2,

    /// Items are read with LoadDirectWarpStriped and then rearranged with
    /// BlockExchange::WarpStripedToBlocked.
    BLOCK_LOAD_WARP_TRANSPOSE               = 3,

    /// Same as BLOCK_LOAD_WARP_TRANSPOSE, but the BlockExchange type is
    /// instantiated with its fourth template argument set to true.
    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED    = 4
};


/**
 * \brief The BlockLoad class provides collective data movement methods for
 *        loading a linear segment of items from memory into the per-thread
 *        arrays of a CUDA thread block.
 *
 * The work is delegated to a private LoadInternal helper that is specialized
 * on ALGORITHM; every public Load() overload constructs that helper from the
 * stored temporary storage reference and linear thread-id and forwards to it.
 *
 * \tparam InputT            Element type of the per-thread destination array
 * \tparam BLOCK_DIM_X       Thread block length in threads along the X dimension
 * \tparam ITEMS_PER_THREAD  Number of items loaded by each thread
 * \tparam ALGORITHM         cub::BlockLoadAlgorithm policy (default: BLOCK_LOAD_DIRECT)
 * \tparam BLOCK_DIM_Y       Thread block length in threads along the Y dimension (default: 1)
 * \tparam BLOCK_DIM_Z       Thread block length in threads along the Z dimension (default: 1)
 * \tparam PTX_ARCH          PTX compute capability being targeted (default: CUB_PTX_ARCH)
 */
template <
    typename            InputT,
    int                 BLOCK_DIM_X,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
    int                 BLOCK_DIM_Y         = 1,
    int                 BLOCK_DIM_Z         = 1,
    int                 PTX_ARCH            = CUB_PTX_ARCH>
class BlockLoad
{
private:

    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    enum
    {
        /// Product of the three block dimensions
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Load helper; one partial specialization per BlockLoadAlgorithm follows
    template <BlockLoadAlgorithm _POLICY, int DUMMY>
    struct LoadInternal;


    /**
     * BLOCK_LOAD_DIRECT specialization of the load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor (the storage argument is not used)
        __device__ __forceinline__ LoadInternal(TempStorage & /*unused*/, int tid)
        :
            linear_tid(tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bounds elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }
    };


    /**
     * BLOCK_LOAD_VECTORIZE specialization of the load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor (the storage argument is not used)
        __device__ __forceinline__ LoadInternal(TempStorage & /*unused*/, int tid)
        :
            linear_tid(tid)
        {}

        /**
         * Load a linear segment of items from memory, specialized for native
         * pointer types (attempts vectorization).
         *
         * NOTE(review): InputIteratorT does not appear in the parameter list, so
         * it cannot be deduced from a call of the form Load(ptr, items).  Such
         * calls (as issued by BlockLoad::Load) therefore appear to resolve to the
         * generic _InputIteratorT overload instead - confirm whether this
         * overload is reachable.
         */
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputT  *block_ptr,
            InputT  (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /**
         * Load a linear segment of items from memory, specialized for native
         * pointer-to-const types (attempts vectorization).
         *
         * NOTE(review): same reachability concern as the non-const pointer
         * overload (InputIteratorT is not deducible from the arguments).
         */
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            const InputT  *block_ptr,
            InputT        (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory through a
        /// CacheModifiedInputIterator: its wrapped pointer is read with the
        /// iterator's own cache modifier (attempts vectorization)
        template <
            CacheLoadModifier   MODIFIER,
            typename            ValueType,
            typename            OffsetT>
        __device__ __forceinline__ void Load(
            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,
            InputT                                                      (&items)[ITEMS_PER_THREAD])
        {
            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for opaque
        /// input iterators (skips vectorization)
        template <typename _InputIteratorT>
        __device__ __forceinline__ void Load(
            _InputIteratorT  block_itr,
            InputT           (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bounds elements (skips vectorization)
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }
    };


    /**
     * BLOCK_LOAD_TRANSPOSE specialization of the load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
    {
        /// BlockExchange utility type used to rearrange the loaded items
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(TempStorage &storage, int tid)
        :
            temp_storage(storage.Alias()),
            linear_tid(tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            // Striped read, then exchange into blocked order
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);

            BlockExchange exchange(temp_storage);
            exchange.StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);

            BlockExchange exchange(temp_storage);
            exchange.StripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bounds elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);

            BlockExchange exchange(temp_storage);
            exchange.StripedToBlocked(items, items);
        }
    };


    /**
     * BLOCK_LOAD_WARP_TRANSPOSE specialization of the load helper
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        /// BlockExchange utility type used to rearrange the loaded items
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(TempStorage &storage, int tid)
        :
            temp_storage(storage.Alias()),
            linear_tid(tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            // Warp-striped read, then exchange into blocked order
            LoadDirectWarpStriped(linear_tid, block_itr, items);

            BlockExchange exchange(temp_storage);
            exchange.WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);

            BlockExchange exchange(temp_storage);
            exchange.WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bounds elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);

            BlockExchange exchange(temp_storage);
            exchange.WarpStripedToBlocked(items, items);
        }
    };


    /**
     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of the load helper
     * (identical to BLOCK_LOAD_WARP_TRANSPOSE except that the fourth
     * BlockExchange template argument is true)
     */
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        /// BlockExchange utility type used to rearrange the loaded items
        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {};

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(TempStorage &storage, int tid)
        :
            temp_storage(storage.Alias()),
            linear_tid(tid)
        {}

        /// Load a linear segment of items from memory
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD])
        {
            // Warp-striped read, then exchange into blocked order
            LoadDirectWarpStriped(linear_tid, block_itr, items);

            BlockExchange exchange(temp_storage);
            exchange.WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        template <typename InputIteratorT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);

            BlockExchange exchange(temp_storage);
            exchange.WarpStripedToBlocked(items, items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bounds elements
        template <typename InputIteratorT, typename DefaultT>
        __device__ __forceinline__ void Load(
            InputIteratorT  block_itr,
            InputT          (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            DefaultT        oob_default)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);

            BlockExchange exchange(temp_storage);
            exchange.WarpStripedToBlocked(items, items);
        }
    };


    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal load implementation to use
    typedef LoadInternal<ALGORITHM, 0> InternalLoad;

    /// Shared memory storage layout type
    typedef typename InternalLoad::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator: returns a reference to a function-local
    /// __shared__ instance of _TempStorage
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// Storage type expected by the constructor that takes a TempStorage reference
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************/

    /**
     * \brief Collective constructor using a private static allocation of
     *        shared memory as temporary storage.
     *
     * The linear thread-id is obtained from
     * RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z).
     */
    __device__ __forceinline__ BlockLoad()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /**
     * \brief Collective constructor using the specified memory allocation as
     *        temporary storage.
     *
     * \param[in] temp_storage  Reference to the memory allocation to alias
     */
    __device__ __forceinline__ BlockLoad(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /******************************************************************/

    /**
     * \brief Load a linear segment of items from memory.
     *
     * \param[in]  block_itr  Iterator to the first item of the segment
     * \param[out] items      Destination for the items of the calling thread
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD])
    {
        InternalLoad loader(temp_storage, linear_tid);
        loader.Load(block_itr, items);
    }


    /**
     * \brief Load a linear segment of items from memory, guarded by range.
     *
     * \param[in]     block_itr    Iterator to the first item of the segment
     * \param[in,out] items        Destination for the items of the calling thread
     * \param[in]     valid_items  Number of readable items in the segment
     */
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD],
        int             valid_items)
    {
        InternalLoad loader(temp_storage, linear_tid);
        loader.Load(block_itr, items, valid_items);
    }


    /**
     * \brief Load a linear segment of items from memory, guarded by range,
     *        with a fall-back assignment of out-of-bounds elements.
     *
     * \param[in]  block_itr    Iterator to the first item of the segment
     * \param[out] items        Destination for the items of the calling thread
     * \param[in]  valid_items  Number of readable items in the segment
     * \param[in]  oob_default  Value assigned to elements that are out of range
     */
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
        InputIteratorT  block_itr,
        InputT          (&items)[ITEMS_PER_THREAD],
        int             valid_items,
        DefaultT        oob_default)
    {
        InternalLoad loader(temp_storage, linear_tid);
        loader.Load(block_itr, items, valid_items, oob_default);
    }

};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


block_exchange.cuh

Vector
Sparse Matrix implementation stub object when OpenFPM is compiled with no linear algebra support.
Definition Vector.hpp:40

cub::BlockLoad
The BlockLoad class provides collective data movement methods for loading a linear segment of items f...
Definition block_load.cuh:641

cub::BlockRadixRank
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
Definition block_radix_rank.cuh:98

cub::CacheModifiedInputIterator
A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
Definition cache_modified_input_iterator.cuh:108

cub::CacheModifiedInputIterator::ptr
ValueType * ptr
Wrapped native pointer.
Definition cache_modified_input_iterator.cuh:134

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::linear_tid
int linear_tid
Linear thread-id.
Definition block_load.cuh:728

cub::LoadDirectBlockedVectorized
__device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a blocked arrangement across the thread block.
Definition block_load.cuh:227

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::temp_storage
_TempStorage & temp_storage
Thread reference to shared storage.
Definition block_load.cuh:968

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range (skips vectorization)
Definition block_load.cuh:779

cub::BlockLoad::PrivateStorage
__device__ __forceinline__ _TempStorage & PrivateStorage()
Internal storage allocator.
Definition block_load.cuh:1035

cub::BlockLoad::temp_storage
_TempStorage & temp_storage
Thread reference to shared storage.
Definition block_load.cuh:1047

cub::BlockLoad::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::temp_storage
_TempStorage & temp_storage
Thread reference to shared storage.
Definition block_load.cuh:818

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Definition block_load.cuh:909

cub::BlockLoad::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
Definition block_load.cuh:855

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::temp_storage
_TempStorage & temp_storage
Thread reference to shared storage.
Definition block_load.cuh:893

cub::BlockLoad::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
Definition block_load.cuh:706

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
Definition block_load.cuh:994

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load
__device__ __forceinline__ void Load(CacheModifiedInputIterator< MODIFIER, ValueType, OffsetT > block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorizat...
Definition block_load.cuh:761

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::linear_tid
int linear_tid
Linear thread-id.
Definition block_load.cuh:896

cub::BlockLoad::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::LoadInternal
__device__ __forceinline__ LoadInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
Definition block_load.cuh:824

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::LoadInternal
__device__ __forceinline__ LoadInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
Definition block_load.cuh:899

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::LoadInternal
__device__ __forceinline__ LoadInternal(TempStorage &, int linear_tid)
Constructor.
Definition block_load.cuh:731

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
Definition block_load.cuh:931

cub::BlockLoad::BlockLoad
__device__ __forceinline__ BlockLoad()
Collective constructor using a private static allocation of shared memory as temporary storage.
Definition block_load.cuh:1066

cub::InternalLoadDirectBlockedVectorized
__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
Definition block_load.cuh:162

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::TempStorage
NullType TempStorage
Shared memory storage layout type.
Definition block_load.cuh:725

cub::LoadDirectStriped
__device__ __forceinline__ void LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a striped arrangement across the thread block.
Definition block_load.cuh:258

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
Definition block_load.cuh:919

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
Definition block_load.cuh:1006

cub::BlockLoad::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::LoadInternal
__device__ __forceinline__ LoadInternal(TempStorage &, int linear_tid)
Constructor.
Definition block_load.cuh:678

cub::LoadDirectWarpStriped
__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a warp-striped arrangement across the thread block.
Definition block_load.cuh:362

cub::BlockLoad::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-b...
Definition block_load.cuh:1224

cub::BlockLoad::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
Definition block_load.cuh:696

cub::BlockLoadAlgorithm
BlockLoadAlgorithm
cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment...
Definition block_load.cuh:474

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load
__device__ __forceinline__ void Load(_InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorizati...
Definition block_load.cuh:770

cub::BlockLoad::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
Definition block_load.cuh:844

cub::BlockLoad::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Definition block_load.cuh:834

cub::LoadDirectBlocked
__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items into a blocked arrangement across the thread block.
Definition block_load.cuh:76

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load
__device__ __forceinline__ void Load(const InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorizat...
Definition block_load.cuh:749

cub::BlockLoad::linear_tid
int linear_tid
Linear thread-id.
Definition block_load.cuh:1050

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::LoadInternal
__device__ __forceinline__ LoadInternal(TempStorage &temp_storage, int linear_tid)
Constructor.
Definition block_load.cuh:974

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load
__device__ __forceinline__ void Load(InputT *block_ptr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization).
Definition block_load.cuh:740

cub::CacheLoadModifier
CacheLoadModifier
Enumeration of cache modifiers for memory load operations.
Definition thread_load.cuh:63

cub::BlockLoad::InternalLoad
LoadInternal< ALGORITHM, 0 > InternalLoad
Internal load implementation to use.
Definition block_load.cuh:1023

cub::BlockLoad::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::linear_tid
int linear_tid
Linear thread-id.
Definition block_load.cuh:675

cub::BlockLoad::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Definition block_load.cuh:687

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Definition block_load.cuh:984

cub::BlockLoad::_TempStorage
InternalLoad::TempStorage _TempStorage
Shared memory storage layout type.
Definition block_load.cuh:1027

cub::BlockLoad::BlockLoad
__device__ __forceinline__ BlockLoad(TempStorage &temp_storage)
Collective constructor using the specified memory allocation as temporary storage.
Definition block_load.cuh:1076

cub::BlockLoad::LoadInternal< BLOCK_LOAD_DIRECT, DUMMY >::TempStorage
NullType TempStorage
Shared memory storage layout type.
Definition block_load.cuh:672

cub::BlockLoad::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
Load a linear segment of items from memory.
Definition block_load.cuh:1130

cub::BlockLoad::LoadInternal< BLOCK_LOAD_VECTORIZE, DUMMY >::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
Definition block_load.cuh:789

cub::BlockLoad::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::linear_tid
int linear_tid
Linear thread-id.
Definition block_load.cuh:971

cub::BlockLoad::Load
__device__ __forceinline__ void Load(InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
Load a linear segment of items from memory, guarded by range.
Definition block_load.cuh:1176

cub::BlockLoad::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::linear_tid
int linear_tid
Linear thread-id.
Definition block_load.cuh:821

cub::BLOCK_LOAD_DIRECT
@ BLOCK_LOAD_DIRECT
Definition block_load.cuh:485

cub::BLOCK_LOAD_VECTORIZE
@ BLOCK_LOAD_VECTORIZE
Definition block_load.cuh:505

cub::BLOCK_LOAD_WARP_TRANSPOSE
@ BLOCK_LOAD_WARP_TRANSPOSE
Definition block_load.cuh:541

cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
@ BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
Definition block_load.cuh:562

cub::BLOCK_LOAD_TRANSPOSE
@ BLOCK_LOAD_TRANSPOSE
Definition block_load.cuh:520

cub::BlockLoad::BLOCK_THREADS
@ BLOCK_THREADS
The thread block size in threads.
Definition block_load.cuh:652

CUB_STATIC_ASSERT
#define CUB_STATIC_ASSERT(cond, msg)
Static assert.
Definition util_macro.cuh:97

cub::RowMajorTid
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::OffsetT
OffsetT OffsetT
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:75

cub::BlockExchange::TempStorage
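The operations exposed by BlockExchange require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse.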
Definition block_exchange.cuh:165

cub::BlockLoad::LoadInternal
Load helper.
Definition block_load.cuh:662

cub::BlockLoad::TempStorage
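The operations exposed by BlockLoad require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse.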
Definition block_load.cuh:1055

cub::CubVector
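Exposes a member typedef Type that names the corresponding CUDA vector type if one exists. Otherwise Type refers to the CubVector structure itself, which will wrap the corresponding x, y, etc. vector fields.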
Definition util_type.cuh:454

cub::NullType
A simple "NULL" marker type.
Definition util_type.cuh:257

cub::Uninitialized
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition util_type.cuh:635

CUB_PTX_ARCH
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
Definition util_arch.cuh:53