40#include "util_macro.cuh"
54#ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Aliases a single untyped chunk of temporary device storage into multiple
 * 256-byte-aligned sub-allocations.
 *
 * Two-phase usage (the standard CUB temp-storage protocol):
 *   1. Call with \p d_temp_storage == NULL: only \p temp_storage_bytes is
 *      written (total bytes required) and cudaSuccess is returned.
 *   2. Call again with a device allocation of at least that size: each
 *      \p allocations[i] is pointed at an aligned region of
 *      \p allocation_sizes[i] bytes inside \p d_temp_storage.
 *
 * \param d_temp_storage      [in] Device-accessible allocation of temporary
 *                            storage (NULL triggers the size query).
 * \param temp_storage_bytes  [in,out] Size in bytes of the \p d_temp_storage
 *                            allocation.
 * \param allocations         [in,out] Pointers to the device sub-allocations.
 * \param allocation_sizes    [in] Sizes in bytes of the sub-allocations needed.
 *
 * \return cudaSuccess, or cudaErrorInvalidValue (via CubDebug) when the
 *         provided storage is too small.
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,
    size_t  &temp_storage_bytes,
    void*   (&allocations)[ALLOCATIONS],
    size_t  (&allocation_sizes)[ALLOCATIONS])
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Exclusive prefix sum over the allocation requests, each request
    // rounded up to a multiple of ALIGN_BYTES.
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Extra slack so the base pointer itself can be rounded up to alignment.
    bytes_needed += ALIGN_BYTES - 1;

    // Size-query phase: report the requirement and return.
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Caller-provided storage must cover the computed requirement.
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Align the base pointer, then carve out the sub-allocations.
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
/**
 * Retrieves the PTX version that will be used on the current device
 * (major * 100 + minor * 10).
 *
 * \param ptx_version  [out] Receives the PTX version.
 *
 * \return cudaSuccess on success; cudaErrorInvalidConfiguration when the
 *         CUDA runtime API is unavailable in device code; otherwise the
 *         error reported by cudaFuncGetAttributes (via CubDebug).
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    // Local struct whose only purpose is to force EmptyKernel<void> to be
    // instantiated so its attributes can be queried below.
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Force EmptyKernel<void> to be generated if this class is used
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };

#ifndef CUB_RUNTIME_ENABLED
    (void)ptx_version;

    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    // Device pass: the PTX version is known at compile time.
    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    // Host pass: query the attributes of the empty kernel.
    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;

    } while (0);

    return error;

#endif
}
/**
 * Retrieves the SM version of the given device
 * (major * 100 + minor * 10).
 *
 * \param sm_version      [out] Receives the SM version.
 * \param device_ordinal  [in] Ordinal of the device to query.
 *
 * \return cudaSuccess on success; cudaErrorInvalidConfiguration when the
 *         CUDA runtime API is unavailable in device code; otherwise the
 *         error reported by cudaDeviceGetAttribute (via CubDebug).
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(
    int &sm_version,
    int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)sm_version;
    (void)device_ordinal;

    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Query major/minor compute capability and fold into one integer.
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;

    } while (0);

    return error;

#endif
}
193#ifndef DOXYGEN_SHOULD_SKIP_THIS
/**
 * Synchronizes the given stream.
 *
 * On the host pass this synchronizes only \p stream; in device code a
 * specific stream cannot be synchronized, so the whole device is
 * synchronized instead (when the runtime API is available).
 *
 * \param stream  [in] Stream to synchronize.
 *
 * \return The error status of the underlying synchronization call, or
 *         cudaErrorInvalidConfiguration when no runtime API is available.
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    // Host pass: synchronize just the requested stream.
    return cudaStreamSynchronize(stream);
#elif defined(CUB_RUNTIME_ENABLED)
    (void)stream;
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#else
    (void)stream;
    // Device can't even determine streams
    return cudaErrorInvalidConfiguration;
#endif
}
/**
 * Computes the maximum SM occupancy, in thread blocks, for executing the
 * given kernel function pointer \p kernel_ptr on the current device with
 * \p block_threads threads per thread block.
 *
 * \param max_sm_occupancy   [out] Receives the maximum number of thread
 *                           blocks that can reside on a multiprocessor.
 * \param kernel_ptr         [in] Kernel function pointer to query.
 * \param block_threads      [in] Number of threads per thread block.
 * \param dynamic_smem_bytes [in] Dynamically-allocated shared memory per
 *                           block, in bytes (default 0).
 *
 * \return The status of cudaOccupancyMaxActiveBlocksPerMultiprocessor, or
 *         cudaErrorInvalidConfiguration (via CubDebug) when the runtime API
 *         is unavailable.
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,
    KernelPtr   kernel_ptr,
    int         block_threads,
    int         dynamic_smem_bytes = 0)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)dynamic_smem_bytes;
    (void)block_threads;
    (void)kernel_ptr;
    (void)max_sm_occupancy;

    // CUDA API calls are not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    return cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes);

#endif
}
/**
 * Kernel launch configuration: thread-block shape derived from an agent
 * policy, plus the resulting SM occupancy for a given kernel.
 */
struct KernelConfig
{
    int block_threads;      ///< Threads per thread block
    int items_per_thread;   ///< Items processed per thread
    int tile_size;          ///< block_threads * items_per_thread
    int sm_occupancy;       ///< Max resident blocks per SM for the kernel

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    /// Initializes the configuration from \p AgentPolicyT's compile-time
    /// constants and queries occupancy for \p kernel_ptr.
    /// \return The status reported by MaxSmOccupancy.
    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads       = AgentPolicyT::BLOCK_THREADS;
        items_per_thread    = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size           = block_threads * items_per_thread;
        cudaError_t retval  = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
        return retval;
    }
};
/**
 * Helper for dispatching into a policy chain.
 *
 * A chain is a linked list of tuning policies, newest PTX version first.
 * Invoke() walks toward older policies until it finds the first one whose
 * PTX_VERSION does not exceed the runtime \p ptx_version, then dispatches
 * \p op specialized on that policy.
 */
template <
    int         PTX_VERSION,
    typename    PolicyT,
    typename    PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass: this policy if the target
    /// PTX architecture is new enough, otherwise defer to the predecessor.
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in
    /// the chain of adequate PTX version.
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT &op)
    {
        if (ptx_version < PTX_VERSION) {
            // Too new for this device: recurse toward an older policy.
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};
/**
 * Helper for dispatching into a policy chain (end-of-chain specialization).
 *
 * Selected when the previous-policy link refers back to the policy itself,
 * terminating the recursion: this policy is used unconditionally.
 */
template <
    int         PTX_VERSION,
    typename    PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass.
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in
    /// the chain of adequate PTX version.
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) {
        return op.template Invoke<PolicyT>();
    }
};
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
__global__ void EmptyKernel(void)
__host__ __device__ __forceinline__ cudaError_t AliasTemporaries(void *d_temp_storage, size_t &temp_storage_bytes, void *(&allocations)[ALLOCATIONS], size_t(&allocation_sizes)[ALLOCATIONS])
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
Retrieves the SM version (major * 100 + minor * 10)
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy(int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel_ptr on the current device with block_threads threads per thread block.
#define CubDebug(e)
Debug macro.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
Optional outer namespace(s)
PolicyT ActivePolicy
The policy for the active compiler pass.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Invoke(int, FunctorT &op)
Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version.
Helper for dispatching into a policy chain.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Invoke(int ptx_version, FunctorT &op)
Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version.
If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy
The policy for the active compiler pass.
Type selection (IF ? ThenType : ElseType)
#define CUB_PTX_ARCH
CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).