OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
util_allocator.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#pragma once

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include <set>
#include <map>

#include "host/mutex.cuh"
#include <math.h>
/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations. The
 * allocator is thread-safe and capable of managing cached device allocations
 * on multiple devices.
 */
struct CachingDeviceAllocator
{

    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int) -1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t) -1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /// Descriptor for device memory allocations
    struct BlockDescriptor
    {
        void*           d_ptr;              // Device pointer
        size_t          bytes;              // Size of allocation in bytes
        unsigned int    bin;                // Bin enumeration
        int             device;             // Device ordinal
        cudaStream_t    associated_stream;  // Associated stream
        cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed

        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
        BlockDescriptor(int device) :
            d_ptr(NULL),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.d_ptr < b.d_ptr);
            else
                return (a.device < b.device);
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.bytes < b.bytes);
            else
                return (a.device < b.device);
        }
    };
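
    // The two comparators above define the orderings of the two multisets
    // declared below: SizeCompare keeps cached blocks sorted by (device, bytes)
    // so that lower_bound() can find the smallest cached block that satisfies
    // a request, while PtrCompare keeps live blocks sorted by (device, pointer)
    // so that DeviceFree() can look a block up by the address the caller
    // hands back.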

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    // Aggregate cached and live bytes for a device
    class TotalBytes
    {
    public:
        size_t free;
        size_t live;
        TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of cached bytes cached by each device
    typedef std::map<int, TotalBytes> GpuCachedBytes;

    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // multiply the result by the current base
            }
            base = base * base;             // square the base
            exp = exp >> 1;                 // divide the exponent in half
        }
        return retval;
    }
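
    // For instance, with the default growth factor of 8: IntPow(8, 3) == 512
    // and IntPow(8, 7) == 2097152. The loop is exponentiation by squaring:
    // the base is squared once per bit of the exponent, and the result picks
    // up the current base whenever the exponent's low bit is set.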

    /**
     * Round up to the nearest power of the given base (host use)
     */
    void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        if (value * base < value)
        {
            // Overflow
            power = sizeof(size_t) * 8;
            rounded_bytes = size_t(0) - 1;
            return;
        }

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }
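
    // Worked example: NearestPowerOf(power, rounded_bytes, 8, 300) yields
    // power = 3 and rounded_bytes = 512, since 8^2 = 64 < 300 <= 8^3 = 512.
    // The guard above saturates the result when value * base wraps around
    // size_t.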

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    cub::Mutex      mutex;              /// Mutex for thread-safety

    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
    unsigned int    min_bin;            /// Minimum bin enumeration
    unsigned int    max_bin;            /// Maximum bin enumeration

    size_t          min_bin_bytes;      /// Minimum bin size
    size_t          max_bin_bytes;      /// Maximum bin size
    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device

    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when the destructor is called. (The CUDA runtime may have already shut down.)
    bool            debug;              /// Whether or not to print (de)allocation events to stdout

    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /// Constructor.
    CachingDeviceAllocator(
        unsigned int    bin_growth,
        unsigned int    min_bin             = 1,
        unsigned int    max_bin             = INVALID_BIN,
        size_t          max_cached_bytes    = INVALID_SIZE,
        bool            skip_cleanup        = false,
        bool            debug               = false)
    :
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}
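
    // A hypothetical configuration: CachingDeviceAllocator alloc(4, 5, 10)
    // caches bins of 4^5 = 1KB through 4^10 = 1MB and, because
    // max_cached_bytes defaults to INVALID_SIZE, places no ceiling on the
    // aggregate bytes cached per device.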

    /// Default constructor.
    CachingDeviceAllocator(
        bool skip_cleanup = false,
        bool debug = false)
    :
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}
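
    // These defaults delineate five cached bin sizes (512B, 4KB, 32KB, 256KB,
    // and 2MB, i.e. 8^3 through 8^7) and cap the cache at (3 * 2MB) - 1
    // bytes, just under 6MB, per device.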

    /**
     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
        // Lock
        mutex.Lock();

        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);

        this->max_cached_bytes = max_cached_bytes;

        // Unlock
        mutex.Unlock();

        return cudaSuccess;
    }
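
    // Note that lowering the limit does not trim the cache: already-cached
    // blocks stay cached, and the new ceiling is only consulted by later
    // DeviceFree() calls when deciding whether a returned block may be
    // recycled.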

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     */
    cudaError_t DeviceAllocate(
        int             device,
        void            **d_ptr,
        size_t          bytes,
        cudaStream_t    active_stream = 0)
    {
        *d_ptr = NULL;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            device = entrypoint_device;
        }

        // Create a block descriptor for the requested allocation
        bool found = false;
        BlockDescriptor search_key(device);
        search_key.associated_stream = active_stream;
        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

        if (search_key.bin > max_bin)
        {
            // Bin is greater than our maximum bin: allocate the request
            // exactly and give out-of-bounds bin.  It will not be cached
            // for reuse when returned.
            search_key.bin   = INVALID_BIN;
            search_key.bytes = bytes;
        }
        else
        {
            // Search for a suitable cached allocation: lock
            mutex.Lock();

            if (search_key.bin < min_bin)
            {
                // Bin is less than minimum bin: round up
                search_key.bin   = min_bin;
                search_key.bytes = min_bin_bytes;
            }

            // Iterate through the range of cached blocks on the same device in the same bin
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            while ((block_itr != cached_blocks.end())
                    && (block_itr->device == device)
                    && (block_itr->bin == search_key.bin))
            {
                // To prevent races with reusing blocks returned by the host but still
                // in use by the device, only consider cached blocks that are either
                // from the active stream or from an idle stream
                if ((active_stream == block_itr->associated_stream) ||
                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
                {
                    // Reuse existing cache block.  Insert into live blocks.
                    found = true;
                    search_key = *block_itr;
                    search_key.associated_stream = active_stream;
                    live_blocks.insert(search_key);

                    // Remove from free blocks
                    cached_bytes[device].free -= search_key.bytes;
                    cached_bytes[device].live += search_key.bytes;

                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);

                    cached_blocks.erase(block_itr);

                    break;
                }
                block_itr++;
            }

            // Done searching: unlock
            mutex.Unlock();
        }

        // Allocate the block if necessary
        if (!found)
        {
            // Set runtime's current device to specified device (entrypoint may not be set)
            if (device != entrypoint_device)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
                if (CubDebug(error = cudaSetDevice(device))) return error;
            }

            // Attempt to allocate
            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
            {
                // The allocation attempt failed: free all cached blocks on device and retry
                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream);

                error = cudaSuccess;    // Reset the error we will return
                cudaGetLastError();     // Reset CUDART's error

                // Lock
                mutex.Lock();

                // Iterate the range of free blocks on the same device
                BlockDescriptor free_key(device);
                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
                {
                    // No need to worry about synchronization with the device: cudaFree is
                    // blocking and will synchronize across all kernels executing
                    // on the current device

                    // Free device memory and destroy stream event.
                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

                    // Reduce balance
                    cached_bytes[device].free -= block_itr->bytes;

                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

                    // Erase the entry and advance; multiset::erase returns the
                    // iterator following the removed element (incrementing the
                    // erased iterator itself would be undefined behavior)
                    block_itr = cached_blocks.erase(block_itr);
                }

                // Unlock
                mutex.Unlock();

                // Return under error
                if (error) return error;

                // Try to allocate again
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
            }

            // Create ready event
            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
                return error;

            // Insert into live blocks
            mutex.Lock();
            live_blocks.insert(search_key);
            cached_bytes[device].live += search_key.bytes;
            mutex.Unlock();

            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
                device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

            // Attempt to revert back to previous device if necessary
            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
            {
                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
            }
        }

        // Copy device pointer to output parameter
        *d_ptr = search_key.d_ptr;

        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

        return error;
    }
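
    // Two illustrative requests under the default configuration: 300 bytes
    // rounds up to the 8^3 = 512B bin and is eligible for caching, whereas a
    // 3MB request exceeds the 2MB top bin, is cudaMalloc'ed at its exact size
    // with INVALID_BIN, and is freed rather than cached when returned.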

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     */
    cudaError_t DeviceAllocate(
        void            **d_ptr,
        size_t          bytes,
        cudaStream_t    active_stream = 0)
    {
        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    }

    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        int             device,
        void*           d_ptr)
    {
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
                return error;
            device = entrypoint_device;
        }

        // Lock
        mutex.Lock();

        // Find corresponding block descriptor
        bool recached = false;
        BlockDescriptor search_key(d_ptr, device);
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr != live_blocks.end())
        {
            // Remove from live blocks
            search_key = *block_itr;
            live_blocks.erase(block_itr);
            cached_bytes[device].live -= search_key.bytes;

            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
            {
                // Insert returned allocation into free blocks
                recached = true;
                cached_blocks.insert(search_key);
                cached_bytes[device].free += search_key.bytes;

                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
            }
        }

        // Unlock
        mutex.Unlock();

        // First set to specified device (entrypoint may not be set)
        if (device != entrypoint_device)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            if (CubDebug(error = cudaSetDevice(device))) return error;
        }

        if (recached)
        {
            // Insert the ready event in the associated stream (must have current device set properly)
            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
        }
        else
        {
            // Free the allocation from the runtime and cleanup the event.
            if (CubDebug(error = cudaFree(d_ptr))) return error;
            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
        }

        // Reset device
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }
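
    // Note that returning a cache-able block does not call cudaFree: instead,
    // ready_event is recorded on the block's associated stream, and it is this
    // event that later allows DeviceAllocate() to hand the block to a
    // different stream only once all prior work on the original stream has
    // drained.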

    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        void* d_ptr)
    {
        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
    }

    /**
     * \brief Frees all cached device allocations on all devices.
     */
    cudaError_t FreeAllCached()
    {
        cudaError_t error = cudaSuccess;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        int current_device = INVALID_DEVICE_ORDINAL;

        mutex.Lock();

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

            // Reduce balance and erase entry
            cached_bytes[current_device].free -= begin->bytes;

            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);

            cached_blocks.erase(begin);
        }

        mutex.Unlock();

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }
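
    // A minimal teardown sketch (assuming a non-static allocator instance):
    // call FreeAllCached() explicitly before tearing down the CUDA context so
    // that the cudaFree calls above run against a live runtime; the destructor
    // below does the same unless skip_cleanup was requested.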

    /**
     * \brief Destructor.
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};


/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
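
// A minimal usage sketch (the pointer and stream names below are
// illustrative, not part of this header):
//
//     cub::CachingDeviceAllocator allocator;          // default bins: 512B..2MB
//
//     cudaStream_t stream;
//     cudaStreamCreate(&stream);
//
//     void *d_scratch = NULL;
//
//     // First request cudaMallocs a 4KB block (1024 rounds up to bin 8^4)
//     allocator.DeviceAllocate(&d_scratch, 1024, stream);
//     // ... launch kernels on `stream` that use d_scratch ...
//     allocator.DeviceFree(d_scratch);                // recycled into the 4KB bin
//
//     // A same-bin request on the same stream reuses the cached block
//     // without another trip through the CUDA runtime
//     allocator.DeviceAllocate(&d_scratch, 2048, stream);
//     allocator.DeviceFree(d_scratch);
//
//     allocator.FreeAllCached();                      // or rely on the destructor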