OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_store.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * Operations for writing linear segments of data from the CUDA thread block
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */

/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/
//@{
/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[ITEM] = items[ITEM];
    }
}

/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);

    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
        {
            thread_itr[ITEM] = items[ITEM];
        }
    }
}

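/*
 * Illustrative usage sketch (not part of the original header): a hypothetical
 * kernel that writes four items per thread with StoreDirectBlocked. The kernel
 * name, item count, and buffer arguments are assumptions chosen for the example.
 *
 * \code
 * __global__ void ExampleStoreBlockedKernel(int *d_out, int num_items)
 * {
 *     // Each thread owns 4 consecutive items in registers (blocked arrangement)
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x * 4 + i;
 *
 *     // Unguarded store of the full tile ...
 *     cub::StoreDirectBlocked(threadIdx.x, d_out, items);
 *
 *     // ... or a guarded store when the segment may be partially full
 *     cub::StoreDirectBlocked(threadIdx.x, d_out, items, num_items);
 * }
 * \endcode
 */
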
/**
 * \brief Store a blocked arrangement of items across a thread block into a linear segment of items (attempts vectorization).
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int                 linear_tid,
    T                   *block_ptr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias global pointer
    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));

    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}

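/*
 * Illustrative usage sketch (not part of the original header): vectorized
 * stores require a raw pointer that is suitably aligned for the vector type,
 * so the hypothetical kernel below passes the native pointer d_out directly.
 *
 * \code
 * __global__ void ExampleStoreVectorizedKernel(int *d_out)
 * {
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x * 4 + i;
 *
 *     // With 4 ints per thread this resolves to one int4 store per thread
 *     cub::StoreDirectBlockedVectorized(threadIdx.x, d_out, items);
 * }
 * \endcode
 */
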
//@}  end member group
/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
    }
}

/**
 * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    OutputIteratorT thread_itr = block_itr + linear_tid;

    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
        {
            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
        }
    }
}

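/*
 * Illustrative usage sketch (not part of the original header): a hypothetical
 * 128-thread block storing a striped arrangement, where thread i owns output
 * items i, i+128, i+256, ... The block size must be passed explicitly as the
 * BLOCK_THREADS template argument.
 *
 * \code
 * __global__ void ExampleStoreStripedKernel(int *d_out, int num_items)
 * {
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x + i * 128;
 *
 *     // Guarded striped store of at most num_items elements
 *     cub::StoreDirectStriped<128>(threadIdx.x, d_out, items, num_items);
 * }
 * \endcode
 */
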
//@}  end member group
/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/
//@{

/**
 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}

/**
 * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIteratorT>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIteratorT     block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    OutputIteratorT thread_itr = block_itr + warp_offset + tid;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
        }
    }
}

//@}  end member group

/** @} */       // end group UtilIo

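/*
 * Illustrative usage sketch (not part of the original header): in a
 * warp-striped arrangement each warp owns a contiguous chunk of
 * WARP_THREADS * ITEMS_PER_THREAD output items, and lane l writes items
 * l, l+32, l+64, ... within its chunk (assuming 32-thread warps). The kernel
 * name and initialization are assumptions chosen for the example.
 *
 * \code
 * __global__ void ExampleStoreWarpStripedKernel(int *d_out)
 * {
 *     int items[4];
 *     int lane = threadIdx.x & 31;
 *     int warp = threadIdx.x >> 5;
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = warp * 128 + lane + i * 32;   // matches the output slot each value lands in
 *
 *     cub::StoreDirectWarpStriped(threadIdx.x, d_out, items);
 * }
 * \endcode
 */
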
//-----------------------------------------------------------------------------
// Generic BlockStore abstraction
//-----------------------------------------------------------------------------

/**
 * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
 */
enum BlockStoreAlgorithm
{
    /// Store a blocked arrangement directly to memory
    BLOCK_STORE_DIRECT,

    /// Store a blocked arrangement directly to memory using CUDA vector types (native pointers only)
    BLOCK_STORE_VECTORIZE,

    /// Transpose the blocked arrangement into a striped arrangement in shared memory, then store it directly
    BLOCK_STORE_TRANSPOSE,

    /// Transpose the blocked arrangement into a warp-striped arrangement in shared memory, then store it directly
    BLOCK_STORE_WARP_TRANSPOSE,

    /// Same as BLOCK_STORE_WARP_TRANSPOSE, but using a time-sliced exchange that needs less shared memory
    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
};

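/*
 * Illustrative sketch (not part of the original header): the algorithm is
 * normally chosen through the ALGORITHM template parameter of BlockStore,
 * e.g. a 128-thread block storing 4 ints per thread via shared-memory
 * transposition:
 *
 * \code
 * typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 * \endcode
 */
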
/**
 * \brief The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.
 */
template <
    typename                T,
    int                     BLOCK_DIM_X,
    int                     ITEMS_PER_THREAD,
    BlockStoreAlgorithm     ALGORITHM = BLOCK_STORE_DIRECT,
    int                     BLOCK_DIM_Y = 1,
    int                     BLOCK_DIM_Z = 1,
    int                     PTX_ARCH = CUB_PTX_ARCH>
class BlockStore
{
private:
    /******************************************************************************
     * Constants and typed definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Store helper
    template <BlockStoreAlgorithm _POLICY, int DUMMY>
    struct StoreInternal;

    /**
     * BLOCK_STORE_DIRECT specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };

    /**
     * BLOCK_STORE_VECTORIZE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &/*temp_storage*/,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
        __device__ __forceinline__ void Store(
            T                   *block_ptr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };

    /**
     * BLOCK_STORE_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /**
     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /**
     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys (time-sliced variant)
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        struct _TempStorage : BlockExchange::TempStorage
        {
            /// Temporary storage for partially-full block guard
            volatile int valid_items;
        };

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        template <typename OutputIteratorT>
        __device__ __forceinline__ void Store(
            OutputIteratorT     block_itr,
            T                   (&items)[ITEMS_PER_THREAD],
            int                 valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            if (linear_tid == 0)
                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
            CTA_SYNC();
            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
        }
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal store implementation to use
    typedef StoreInternal<ALGORITHM, 0> InternalStore;

    /// Shared memory storage layout type
    typedef typename InternalStore::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// \smemstorage{BlockStore}
    struct TempStorage : Uninitialized<_TempStorage> {};

    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/
    //@{

    /**
     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
     */
    __device__ __forceinline__ BlockStore()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /**
     * \brief Collective constructor using the specified memory allocation as temporary storage.
     */
    __device__ __forceinline__ BlockStore(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    //@}  end member group
    /******************************************************************//**
     * \name Data movement
     *********************************************************************/
    //@{

    /**
     * \brief Store items into a linear segment of memory.
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,
        T                   (&items)[ITEMS_PER_THREAD])
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
    }

    /**
     * \brief Store items into a linear segment of memory, guarded by range.
     */
    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(
        OutputIteratorT     block_itr,
        T                   (&items)[ITEMS_PER_THREAD],
        int                 valid_items)
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
    }

    //@}  end member group
};

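/*
 * Illustrative usage sketch (not part of the original header): a hypothetical
 * kernel that couples BlockStore with its TempStorage. The block size, items
 * per thread, and the guarded-store call are assumptions chosen for the example.
 *
 * \code
 * __global__ void ExampleBlockStoreKernel(int *d_out, int valid_items)
 * {
 *     // Specialize BlockStore for 128 threads owning 4 ints each
 *     typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 *
 *     // Shared memory for the collective
 *     __shared__ typename BlockStoreT::TempStorage temp_storage;
 *
 *     // Blocked arrangement of items held in registers
 *     int items[4];
 *     for (int i = 0; i < 4; ++i)
 *         items[i] = threadIdx.x * 4 + i;
 *
 *     // Guarded store of at most valid_items elements
 *     BlockStoreT(temp_storage).Store(d_out, items, valid_items);
 * }
 * \endcode
 */
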
}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)