OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
util_allocator.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

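/******************************************************************************
 * Usage sketch (illustrative only; d_temp, num_bytes, and stream are
 * placeholder names, not part of this header):
 *
 *     cub::CachingDeviceAllocator allocator;   // default: 512B..2MB bins
 *     void *d_temp = NULL;
 *     allocator.DeviceAllocate(&d_temp, num_bytes, stream);
 *     // ... launch kernels that use d_temp on `stream` ...
 *     allocator.DeviceFree(d_temp);            // returns the block to the cache
 ******************************************************************************/
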
#pragma once

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include <set>
#include <map>

#include "host/mutex.cuh"
#include <math.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations.
 */
struct CachingDeviceAllocator
{

    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int) -1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t) -1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor
    {
        void*           d_ptr;              // Device pointer
        size_t          bytes;              // Size of allocation in bytes
        unsigned int    bin;                // Bin enumeration
        int             device;             // Device ordinal
        cudaStream_t    associated_stream;  // Associated stream
        cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed

        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
        BlockDescriptor(int device) :
            d_ptr(NULL),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.d_ptr < b.d_ptr);
            else
                return (a.device < b.device);
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.bytes < b.bytes);
            else
                return (a.device < b.device);
        }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    class TotalBytes {
    public:
        size_t free;
        size_t live;
        TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of cached bytes cached by each device
    typedef std::map<int, TotalBytes> GpuCachedBytes;


    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // multiply the result by the current base
            }
            base = base * base;             // square the base
            exp = exp >> 1;                 // divide the exponent in half
        }
        return retval;
    }
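
    // For instance (sanity check of the square-and-multiply loop above):
    // IntPow(8, 3) == 512, since 8^3 = 512; IntPow(2, 10) == 1024.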

    /**
     * Round up to the nearest power-of
     */
    void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        if (value * base < value)
        {
            // Overflow
            power = sizeof(size_t) * 8;
            rounded_bytes = size_t(0) - 1;
            return;
        }

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }
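
    // For instance: NearestPowerOf(power, rounded_bytes, 8, 300) leaves
    // power == 3 and rounded_bytes == 512, the smallest power of 8 that
    // can hold 300 bytes.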

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    cub::Mutex      mutex;              /// Mutex for thread-safety

    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
    unsigned int    min_bin;            /// Minimum bin enumeration
    unsigned int    max_bin;            /// Maximum bin enumeration

    size_t          min_bin_bytes;      /// Minimum bin size
    size_t          max_bin_bytes;      /// Maximum bin size
    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device

    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when the destructor is called
    bool            debug;              /// Whether or not to print (de)allocation events to stdout

    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingDeviceAllocator(
        unsigned int    bin_growth,                         ///< Geometric growth factor for bin-sizes
        unsigned int    min_bin          = 1,               ///< Minimum bin (default is bin_growth^1)
        unsigned int    max_bin          = INVALID_BIN,     ///< Maximum bin (default is no max bin)
        size_t          max_cached_bytes = INVALID_SIZE,    ///< Maximum aggregate cached bytes per device (default is no limit)
        bool            skip_cleanup     = false,           ///< Whether or not to skip a call to FreeAllCached() when the destructor is called
        bool            debug            = false)           ///< Whether or not to print (de)allocation events to stdout
    :
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}

    /**
     * \brief Default constructor.
     */
    CachingDeviceAllocator(
        bool skip_cleanup = false,
        bool debug = false)
    :
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}
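
    // Worked example of the defaults above: with bin_growth = 8, min_bin = 3,
    // and max_bin = 7, the bins are 8^3 = 512B, 4KB, 32KB, 256KB, and
    // 8^7 = 2MB, and max_cached_bytes = (2MB * 3) - 1 = 6,291,455 bytes
    // per device.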

    /**
     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
        // Lock
        mutex.Lock();

        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);

        this->max_cached_bytes = max_cached_bytes;

        // Unlock
        mutex.Unlock();

        return cudaSuccess;
    }
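
    // Note: as implemented above, changing this ceiling does not itself free
    // any cached or live blocks; it only affects whether subsequent
    // DeviceFree() calls re-cache a returned block (see the threshold test
    // in DeviceFree() below).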

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     *
     * Once freed, the allocation becomes available for reuse within the
     * \p active_stream with which it was associated during allocation, and it
     * becomes available for reuse within other streams when all prior work
     * submitted to \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        int             device,             ///< [in] Device on which to place the allocation
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        *d_ptr = NULL;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            device = entrypoint_device;
        }

        // Create a block descriptor for the requested allocation
        bool found = false;
        BlockDescriptor search_key(device);
        search_key.associated_stream = active_stream;
        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

        if (search_key.bin > max_bin)
        {
            // Bin is greater than our maximum bin: allocate the request
            // exactly and give out-of-bounds bin.  It will not be cached
            // for reuse when returned.
            search_key.bin = INVALID_BIN;
            search_key.bytes = bytes;
        }
        else
        {
            // Search for a suitable cached allocation: lock
            mutex.Lock();

            if (search_key.bin < min_bin)
            {
                // Bin is less than minimum bin: round up
                search_key.bin = min_bin;
                search_key.bytes = min_bin_bytes;
            }

            // Iterate through the range of cached blocks on the same device in the same bin
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            while ((block_itr != cached_blocks.end())
                    && (block_itr->device == device)
                    && (block_itr->bin == search_key.bin))
            {
                // To prevent races with reusing blocks returned by the host but still
                // in use by the device, only consider cached blocks that are
                // either (from the active stream) or (from an idle stream)
                if ((active_stream == block_itr->associated_stream) ||
                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
                {
                    // Reuse existing cache block.  Insert into live blocks.
                    found = true;
                    search_key = *block_itr;
                    search_key.associated_stream = active_stream;
                    live_blocks.insert(search_key);

                    // Remove from free blocks
                    cached_bytes[device].free -= search_key.bytes;
                    cached_bytes[device].live += search_key.bytes;

                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
                                       device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);

                    cached_blocks.erase(block_itr);

                    break;
                }
                block_itr++;
            }

            // Done searching: unlock
            mutex.Unlock();
        }

        // Allocate the block if necessary
        if (!found)
        {
            // Set runtime's current device to specified device (entrypoint may not be set)
            if (device != entrypoint_device)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
                if (CubDebug(error = cudaSetDevice(device))) return error;
            }

            // Attempt to allocate
            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
            {
                // The allocation attempt failed: free all cached blocks on device and retry
                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                                   device, (long long) search_key.bytes, (long long) search_key.associated_stream);

                error = cudaSuccess;    // Reset the error we will return
                cudaGetLastError();     // Reset CUDART's error

                // Lock
                mutex.Lock();

                // Iterate the range of free blocks on the same device
                BlockDescriptor free_key(device);
                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
                {
                    // No need to worry about synchronization with the device: cudaFree is
                    // blocking and will synchronize across all kernels executing
                    // on the current device

                    // Free device memory and destroy stream event.
                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

                    // Reduce balance and erase entry
                    cached_bytes[device].free -= block_itr->bytes;

                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                                       device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

                    // Erase and advance in one step: erase() invalidates the
                    // iterator, so use the successor it returns
                    block_itr = cached_blocks.erase(block_itr);
                }

                // Unlock
                mutex.Unlock();

                // Return under error
                if (error) return error;

                // Try to allocate again
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
            }

            // Create ready event
            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
                return error;

            // Insert into live blocks
            mutex.Lock();
            live_blocks.insert(search_key);
            cached_bytes[device].live += search_key.bytes;
            mutex.Unlock();

            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
                               device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

            // Attempt to revert back to previous device if necessary
            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
            {
                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
            }
        }

        // Copy device pointer to output parameter
        *d_ptr = search_key.d_ptr;

        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                           (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

        return error;
    }

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     *
     * Once freed, the allocation becomes available for reuse within the
     * \p active_stream with which it was associated during allocation, and it
     * becomes available for reuse within other streams when all prior work
     * submitted to \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    }
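
    // Illustrative sequence (stream names are placeholders): a block freed on
    // stream s1 may be handed back immediately to new requests on s1, but a
    // request on stream s2 only receives it once s1 has progressed past the
    // recorded ready_event (checked with cudaEventQuery in DeviceAllocate
    // above, recorded with cudaEventRecord in DeviceFree below).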

    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        int     device,
        void*   d_ptr)
    {
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
                return error;
            device = entrypoint_device;
        }

        // Lock
        mutex.Lock();

        // Find corresponding block descriptor
        bool recached = false;
        BlockDescriptor search_key(d_ptr, device);
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr != live_blocks.end())
        {
            // Remove from live blocks
            search_key = *block_itr;
            live_blocks.erase(block_itr);
            cached_bytes[device].live -= search_key.bytes;

            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
            {
                // Insert returned allocation into free blocks
                recached = true;
                cached_blocks.insert(search_key);
                cached_bytes[device].free += search_key.bytes;

                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                                   device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
                                   (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
            }
        }

        // Unlock
        mutex.Unlock();

        // First set to specified device (entrypoint may not be set)
        if (device != entrypoint_device)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            if (CubDebug(error = cudaSetDevice(device))) return error;
        }

        if (recached)
        {
            // Insert the ready event in the associated stream (must have current device set properly)
            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
        }
        else
        {
            // Free the allocation from the runtime and cleanup the event.
            if (CubDebug(error = cudaFree(d_ptr))) return error;
            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                               device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
        }

        // Reset device
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     */
    cudaError_t DeviceFree(
        void* d_ptr)
    {
        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
    }
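
    // For example (placeholder names): allocating on device 1 from a host
    // thread whose current device is 0; the allocator switches devices for
    // the call and then restores the caller's device:
    //
    //     allocator.DeviceAllocate(1, &d_buf, bytes);  // d_buf lives on device 1
    //     allocator.DeviceFree(1, d_buf);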

    /**
     * \brief Frees all cached device allocations on all devices.
     */
    cudaError_t FreeAllCached()
    {
        cudaError_t error = cudaSuccess;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        int current_device = INVALID_DEVICE_ORDINAL;

        mutex.Lock();

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

            // Reduce balance and erase entry
            cached_bytes[current_device].free -= begin->bytes;

            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                               current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);

            cached_blocks.erase(begin);
        }

        mutex.Unlock();

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

    /**
     * \brief Destructor.
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};


// end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)