114#ifndef DOXYGEN_SHOULD_SKIP_THIS
132 cudaStream_t associated_stream;
133 cudaEvent_t ready_event;
141 associated_stream(0),
151 associated_stream(0),
158 if (a.device == b.device)
159 return (a.d_ptr < b.d_ptr);
161 return (a.device < b.device);
167 if (a.device == b.device)
168 return (a.bytes < b.bytes);
170 return (a.device < b.device);
205 unsigned int retval = 1;
209 retval = retval * base;
223 size_t &rounded_bytes,
230 if (value * base < value)
233 power =
sizeof(size_t) * 8;
234 rounded_bytes = size_t(0) - 1;
238 while (rounded_bytes < value)
240 rounded_bytes *= base;
361 cudaStream_t active_stream = 0)
365 cudaError_t error = cudaSuccess;
369 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
370 device = entrypoint_device;
376 search_key.associated_stream = active_stream;
385 search_key.bytes = bytes;
400 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(search_key);
402 && (block_itr->device == device)
403 && (block_itr->bin == search_key.bin))
408 if ((active_stream == block_itr->associated_stream) ||
409 (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
413 search_key = *block_itr;
414 search_key.associated_stream = active_stream;
421 if (
debug)
_CubLog(
"\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
422 device, search_key.d_ptr, (
long long) search_key.bytes, (
long long) search_key.associated_stream, (
long long) block_itr->associated_stream);
439 if (device != entrypoint_device)
441 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
442 if (
CubDebug(error = cudaSetDevice(device)))
return error;
446 if (
CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
449 if (
debug)
_CubLog(
"\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
450 device, (
long long) search_key.bytes, (
long long) search_key.associated_stream);
460 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(free_key);
462 while ((block_itr !=
cached_blocks.end()) && (block_itr->device == device))
469 if (
CubDebug(error = cudaFree(block_itr->d_ptr)))
break;
470 if (
CubDebug(error = cudaEventDestroy(block_itr->ready_event)))
break;
475 if (
debug)
_CubLog(
"\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
487 if (error)
return error;
490 if (
CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)))
return error;
494 if (
CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
503 if (
debug)
_CubLog(
"\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
504 device, search_key.d_ptr, (
long long) search_key.bytes, (
long long) search_key.associated_stream);
509 if (
CubDebug(error = cudaSetDevice(entrypoint_device)))
return error;
514 *d_ptr = search_key.d_ptr;
516 if (
debug)
_CubLog(
"\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
533 cudaStream_t active_stream = 0)
551 cudaError_t error = cudaSuccess;
555 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
557 device = entrypoint_device;
564 bool recached =
false;
566 BusyBlocks::iterator block_itr =
live_blocks.find(search_key);
570 search_key = *block_itr;
582 if (
debug)
_CubLog(
"\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
583 device, (
long long) search_key.bytes, (
long long) search_key.associated_stream, (
long long)
cached_blocks.size(),
592 if (device != entrypoint_device)
594 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
595 if (
CubDebug(error = cudaSetDevice(device)))
return error;
601 if (
CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
return error;
606 if (
CubDebug(error = cudaFree(d_ptr)))
return error;
607 if (
CubDebug(error = cudaEventDestroy(search_key.ready_event)))
return error;
609 if (
debug)
_CubLog(
"\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
616 if (
CubDebug(error = cudaSetDevice(entrypoint_device)))
return error;
642 cudaError_t error = cudaSuccess;
656 if (
CubDebug(error = cudaGetDevice(&entrypoint_device)))
break;
660 if (begin->device != current_device)
662 if (
CubDebug(error = cudaSetDevice(begin->device)))
break;
663 current_device = begin->device;
667 if (
CubDebug(error = cudaFree(begin->d_ptr)))
break;
668 if (
CubDebug(error = cudaEventDestroy(begin->ready_event)))
break;
673 if (
debug)
_CubLog(
"\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
684 if (
CubDebug(error = cudaSetDevice(entrypoint_device)))
return error;
#define _CubLog(format,...)
Log macro for printf statements.
#define CubDebug(e)
Debug macro.
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
A simple caching allocator for device memory allocations.
unsigned int min_bin
Minimum bin enumeration.
CachingDeviceAllocator(unsigned int bin_growth, unsigned int min_bin=1, unsigned int max_bin=INVALID_BIN, size_t max_cached_bytes=INVALID_SIZE, bool skip_cleanup=false, bool debug=false)
Constructor.
size_t max_bin_bytes
Maximum bin size.
unsigned int bin_growth
Geometric growth factor for bin-sizes.
cudaError_t SetMaxCachedBytes(size_t max_cached_bytes)
Sets the limit on the number of bytes this allocator is allowed to cache per device.
virtual ~CachingDeviceAllocator()
Destructor.
size_t min_bin_bytes
Minimum bin size.
std::map< int, TotalBytes > GpuCachedBytes
Map type of device ordinals to the number of cached bytes cached by each device.
static unsigned int IntPow(unsigned int base, unsigned int exp)
size_t max_cached_bytes
Maximum aggregate cached bytes per device.
BusyBlocks live_blocks
Set of live device allocations currently in use.
cudaError_t DeviceFree(int device, void *d_ptr)
Frees a live allocation of device memory on the specified device, returning it to the allocator.
static const size_t INVALID_SIZE
Invalid size.
std::multiset< BlockDescriptor, Compare > CachedBlocks
Set type for cached blocks (ordered by size)
bool(* Compare)(const BlockDescriptor &, const BlockDescriptor &)
BlockDescriptor comparator function interface.
static const unsigned int INVALID_BIN
Out-of-bounds bin.
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
GpuCachedBytes cached_bytes
Map of device ordinal to aggregate cached bytes on that device.
cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream=0)
Provides a suitable allocation of device memory for the given size on the current device.
unsigned int max_bin
Maximum bin enumeration.
std::multiset< BlockDescriptor, Compare > BusyBlocks
Set type for live blocks (ordered by ptr)
static const int INVALID_DEVICE_ORDINAL
Invalid device ordinal.
bool debug
Whether or not to print (de)allocation events to stdout.
cudaError_t DeviceFree(void *d_ptr)
Frees a live allocation of device memory on the current device, returning it to the allocator.
cudaError_t DeviceAllocate(int device, void **d_ptr, size_t bytes, cudaStream_t active_stream=0)
Provides a suitable allocation of device memory for the given size on the specified device.
CachingDeviceAllocator(bool skip_cleanup=false, bool debug=false)
Default constructor.
const bool skip_cleanup
Whether or not to skip a call to FreeAllCached() when the destructor is called. (The CUDA runtime may have already shut down for statically declared allocators.)
CachedBlocks cached_blocks
Set of cached device allocations available for reuse.
cudaError_t FreeAllCached()
Frees all cached device allocations on all devices.
__forceinline__ void Unlock()
__forceinline__ void Lock()