OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
util_allocator.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

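/******************************************************************************
 * Usage sketch (illustrative only; d_temp, num_bytes, and stream are
 * placeholder names, not part of this header):
 *
 *     cub::CachingDeviceAllocator allocator;   // default: 512B..2MB bins
 *     void *d_temp = NULL;
 *     allocator.DeviceAllocate(&d_temp, num_bytes, stream);
 *     // ... launch kernels that use d_temp on `stream` ...
 *     allocator.DeviceFree(d_temp);            // returns the block to the cache
 ******************************************************************************/
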
#pragma once

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include <set>
#include <map>

#include "host/mutex.cuh"
#include <math.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations.
 */
struct CachingDeviceAllocator
{

    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int) -1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t) -1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor
    {
        void*           d_ptr;              // Device pointer
        size_t          bytes;              // Size of allocation in bytes
        unsigned int    bin;                // Bin enumeration
        int             device;             // Device ordinal
        cudaStream_t    associated_stream;  // Associated stream
        cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed

        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
        BlockDescriptor(int device) :
            d_ptr(NULL),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.d_ptr < b.d_ptr);
            else
                return (a.device < b.device);
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.bytes < b.bytes);
            else
                return (a.device < b.device);
        }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    class TotalBytes {
    public:
        size_t free;
        size_t live;
        TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of cached bytes cached by each device
    typedef std::map<int, TotalBytes> GpuCachedBytes;


    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // multiply the result by the current base
            }
            base = base * base;             // square the base
            exp = exp >> 1;                 // divide the exponent in half
        }
        return retval;
    }
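
    // For instance (sanity check of the square-and-multiply loop above):
    // IntPow(8, 3) == 512, since 8^3 = 512; IntPow(2, 10) == 1024.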

    /**
     * Round up to the nearest power-of
     */
    void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        if (value * base < value)
        {
            // Overflow
            power = sizeof(size_t) * 8;
            rounded_bytes = size_t(0) - 1;
            return;
        }

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }
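
    // For instance: NearestPowerOf(power, rounded_bytes, 8, 300) leaves
    // power == 3 and rounded_bytes == 512, the smallest power of 8 that
    // can hold 300 bytes.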

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    cub::Mutex      mutex;              /// Mutex for thread-safety

    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
    unsigned int    min_bin;            /// Minimum bin enumeration
    unsigned int    max_bin;            /// Maximum bin enumeration

    size_t          min_bin_bytes;      /// Minimum bin size
    size_t          max_bin_bytes;      /// Maximum bin size
    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device

    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when the destructor is called
    bool            debug;              /// Whether or not to print (de)allocation events to stdout

    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingDeviceAllocator(
        unsigned int    bin_growth,                         ///< Geometric growth factor for bin-sizes
        unsigned int    min_bin          = 1,               ///< Minimum bin (default is bin_growth^1)
        unsigned int    max_bin          = INVALID_BIN,     ///< Maximum bin (default is no max bin)
        size_t          max_cached_bytes = INVALID_SIZE,    ///< Maximum aggregate cached bytes per device (default is no limit)
        bool            skip_cleanup     = false,           ///< Whether or not to skip a call to FreeAllCached() when the destructor is called
        bool            debug            = false)           ///< Whether or not to print (de)allocation events to stdout
    :
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}

    /**
     * \brief Default constructor.
     */
    CachingDeviceAllocator(
        bool skip_cleanup = false,
        bool debug = false)
    :
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}
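
    // Worked example of the defaults above: with bin_growth = 8, min_bin = 3,
    // and max_bin = 7, the bins are 8^3 = 512B, 4KB, 32KB, 256KB, and
    // 8^7 = 2MB, and max_cached_bytes = (2MB * 3) - 1 = 6,291,455 bytes
    // per device.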

    /**
     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
        // Lock
        mutex.Lock();

        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);

        this->max_cached_bytes = max_cached_bytes;

        // Unlock
        mutex.Unlock();

        return cudaSuccess;
    }
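
    // Note: as implemented above, changing this ceiling does not itself free
    // any cached or live blocks; it only affects whether subsequent
    // DeviceFree() calls re-cache a returned block (see the threshold test
    // in DeviceFree() below).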

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     *
     * Once freed, the allocation becomes available for reuse within the
     * \p active_stream with which it was associated during allocation, and it
     * becomes available for reuse within other streams when all prior work
     * submitted to \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        int             device,             ///< [in] Device on which to place the allocation
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        *d_ptr = NULL;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            device = entrypoint_device;
        }

        // Create a block descriptor for the requested allocation
        bool found = false;
        BlockDescriptor search_key(device);
        search_key.associated_stream = active_stream;
        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

        if (search_key.bin > max_bin)
        {
            // Bin is greater than our maximum bin: allocate the request
            // exactly and give out-of-bounds bin.  It will not be cached
            // for reuse when returned.
            search_key.bin = INVALID_BIN;
            search_key.bytes = bytes;
        }
        else
        {
            // Search for a suitable cached allocation: lock
            mutex.Lock();

            if (search_key.bin < min_bin)
            {
                // Bin is less than minimum bin: round up
                search_key.bin = min_bin;
                search_key.bytes = min_bin_bytes;
            }

            // Iterate through the range of cached blocks on the same device in the same bin
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            while ((block_itr != cached_blocks.end())
                    && (block_itr->device == device)
                    && (block_itr->bin == search_key.bin))
            {
                // To prevent races with reusing blocks returned by the host but still
                // in use by the device, only consider cached blocks that are
                // either (from the active stream) or (from an idle stream)
                if ((active_stream == block_itr->associated_stream) ||
                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
                {
                    // Reuse existing cache block.  Insert into live blocks.
                    found = true;
                    search_key = *block_itr;
                    search_key.associated_stream = active_stream;
                    live_blocks.insert(search_key);

                    // Remove from free blocks
                    cached_bytes[device].free -= search_key.bytes;
                    cached_bytes[device].live += search_key.bytes;

                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
                                       device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);

                    cached_blocks.erase(block_itr);

                    break;
                }
                block_itr++;
            }

            // Done searching: unlock
            mutex.Unlock();
        }

        // Allocate the block if necessary
        if (!found)
        {
            // Set runtime's current device to specified device (entrypoint may not be set)
            if (device != entrypoint_device)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
                if (CubDebug(error = cudaSetDevice(device))) return error;
            }

            // Attempt to allocate
            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
            {
                // The allocation attempt failed: free all cached blocks on device and retry
                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                                   device, (long long) search_key.bytes, (long long) search_key.associated_stream);

                error = cudaSuccess;    // Reset the error we will return
                cudaGetLastError();     // Reset CUDART's error

                // Lock
                mutex.Lock();

                // Iterate the range of free blocks on the same device
                BlockDescriptor free_key(device);
                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
                {
                    // No need to worry about synchronization with the device: cudaFree is
                    // blocking and will synchronize across all kernels executing
                    // on the current device

                    // Free device memory and destroy stream event.
                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

                    // Reduce balance and erase entry
                    cached_bytes[device].free -= block_itr->bytes;

                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                                       device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

                    // Erase and advance in one step: erase() invalidates the
                    // iterator, so use the successor it returns
                    block_itr = cached_blocks.erase(block_itr);
                }

                // Unlock
                mutex.Unlock();

                // Return under error
                if (error) return error;

                // Try to allocate again
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
            }

            // Create ready event
            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
                return error;

            // Insert into live blocks
            mutex.Lock();
            live_blocks.insert(search_key);
            cached_bytes[device].live += search_key.bytes;
            mutex.Unlock();

            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
                               device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

            // Attempt to revert back to previous device if necessary
            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
            {
                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
            }
        }

        // Copy device pointer to output parameter
        *d_ptr = search_key.d_ptr;

        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                           (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

        return error;
    }

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     *
     * Once freed, the allocation becomes available for reuse within the
     * \p active_stream with which it was associated during allocation, and it
     * becomes available for reuse within other streams when all prior work
     * submitted to \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    }
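
    // Illustrative sequence (stream names are placeholders): a block freed on
    // stream s1 may be handed back immediately to new requests on s1, but a
    // request on stream s2 only receives it once s1 has progressed past the
    // recorded ready_event (checked with cudaEventQuery in DeviceAllocate
    // above, recorded with cudaEventRecord in DeviceFree below).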

    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        int     device,
        void*   d_ptr)
    {
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
                return error;
            device = entrypoint_device;
        }

        // Lock
        mutex.Lock();

        // Find corresponding block descriptor
        bool recached = false;
        BlockDescriptor search_key(d_ptr, device);
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr != live_blocks.end())
        {
            // Remove from live blocks
            search_key = *block_itr;
            live_blocks.erase(block_itr);
            cached_bytes[device].live -= search_key.bytes;

            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
            {
                // Insert returned allocation into free blocks
                recached = true;
                cached_blocks.insert(search_key);
                cached_bytes[device].free += search_key.bytes;

                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                                   device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
                                   (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
            }
        }

        // Unlock
        mutex.Unlock();

        // First set to specified device (entrypoint may not be set)
        if (device != entrypoint_device)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            if (CubDebug(error = cudaSetDevice(device))) return error;
        }

        if (recached)
        {
            // Insert the ready event in the associated stream (must have current device set properly)
            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
        }
        else
        {
            // Free the allocation from the runtime and cleanup the event.
            if (CubDebug(error = cudaFree(d_ptr))) return error;
            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                               device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
        }

        // Reset device
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        void* d_ptr)
    {
        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
    }
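
    // For example (placeholder names): allocating on device 1 from a host
    // thread whose current device is 0; the allocator switches devices for
    // the call and then restores the caller's device:
    //
    //     allocator.DeviceAllocate(1, &d_buf, bytes);  // d_buf lives on device 1
    //     allocator.DeviceFree(1, d_buf);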

    /**
     * \brief Frees all cached device allocations on all devices.
     */
    cudaError_t FreeAllCached()
    {
        cudaError_t error = cudaSuccess;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        int current_device = INVALID_DEVICE_ORDINAL;

        mutex.Lock();

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

            // Reduce balance and erase entry
            cached_bytes[current_device].free -= begin->bytes;

            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                               current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);

            cached_blocks.erase(begin);
        }

        mutex.Unlock();

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

    /**
     * \brief Destructor.
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};


// end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)