OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
grid_barrier.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../util_debug.cuh"
37 #include "../util_namespace.cuh"
38 #include "../thread/thread_load.cuh"
39 
41 CUB_NS_PREFIX
42 
44 namespace cub {
45 
46 
57 {
58 protected :
59 
60  typedef unsigned int SyncFlag;
61 
62  // Counters in global device memory
63  SyncFlag* d_sync;
64 
65 public:
66 
70  GridBarrier() : d_sync(NULL) {}
71 
72 
76  __device__ __forceinline__ void Sync() const
77  {
78  volatile SyncFlag *d_vol_sync = d_sync;
79 
80  // Threadfence and syncthreads to make sure global writes are visible before
81  // thread-0 reports in with its sync counter
82  __threadfence();
83  CTA_SYNC();
84 
85  if (blockIdx.x == 0)
86  {
87  // Report in ourselves
88  if (threadIdx.x == 0)
89  {
90  d_vol_sync[blockIdx.x] = 1;
91  }
92 
93  CTA_SYNC();
94 
95  // Wait for everyone else to report in
96  for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
97  {
98  while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
99  {
100  __threadfence_block();
101  }
102  }
103 
104  CTA_SYNC();
105 
106  // Let everyone know it's safe to proceed
107  for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
108  {
109  d_vol_sync[peer_block] = 0;
110  }
111  }
112  else
113  {
114  if (threadIdx.x == 0)
115  {
116  // Report in
117  d_vol_sync[blockIdx.x] = 1;
118 
119  // Wait for acknowledgment
120  while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
121  {
122  __threadfence_block();
123  }
124  }
125 
126  CTA_SYNC();
127  }
128  }
129 };
130 
131 
139 {
140 protected:
141 
142  // Number of bytes backed by d_sync
143  size_t sync_bytes;
144 
145 public:
146 
150  GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
151 
152 
156  cudaError_t HostReset()
157  {
158  cudaError_t retval = cudaSuccess;
159  if (d_sync)
160  {
161  CubDebug(retval = cudaFree(d_sync));
162  d_sync = NULL;
163  }
164  sync_bytes = 0;
165  return retval;
166  }
167 
168 
173  {
174  HostReset();
175  }
176 
177 
182  cudaError_t Setup(int sweep_grid_size)
183  {
184  cudaError_t retval = cudaSuccess;
185  do {
186  size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
187  if (new_sync_bytes > sync_bytes)
188  {
189  if (d_sync)
190  {
191  if (CubDebug(retval = cudaFree(d_sync))) break;
192  }
193 
194  sync_bytes = new_sync_bytes;
195 
196  // Allocate and initialize to zero
197  if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
198  if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
199  }
200  } while (0);
201 
202  return retval;
203  }
204 };
205 
206  // end group GridModule
208 
209 } // CUB namespace
210 CUB_NS_POSTFIX // Optional outer namespace(s)
211 
GridBarrier implements a software global barrier among thread blocks within a CUDA grid.
Optional outer namespace(s)
cudaError_t Setup(int sweep_grid_size)
CTA_SYNC()
Definition: util_ptx.cuh:255
GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storag...
__device__ __forceinline__ void Sync() const
#define CubDebug(e)
Debug macro.
Definition: util_debug.cuh:94