36#include "../util_debug.cuh"
37#include "../util_namespace.cuh"
38#include "../thread/thread_load.cuh"
60 typedef unsigned int SyncFlag;
76 __device__ __forceinline__
void Sync()
const
78 volatile SyncFlag *d_vol_sync = d_sync;
90 d_vol_sync[blockIdx.x] = 1;
96 for (
int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
98 while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
100 __threadfence_block();
107 for (
int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
109 d_vol_sync[peer_block] = 0;
114 if (threadIdx.x == 0)
117 d_vol_sync[blockIdx.x] = 1;
120 while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
122 __threadfence_block();
158 cudaError_t retval = cudaSuccess;
161 CubDebug(retval = cudaFree(d_sync));
182 cudaError_t
Setup(
int sweep_grid_size)
184 cudaError_t retval = cudaSuccess;
186 size_t new_sync_bytes = sweep_grid_size *
sizeof(SyncFlag);
187 if (new_sync_bytes > sync_bytes)
191 if (
CubDebug(retval = cudaFree(d_sync)))
break;
194 sync_bytes = new_sync_bytes;
197 if (
CubDebug(retval = cudaMalloc((
void**) &d_sync, sync_bytes)))
break;
198 if (
CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes)))
break;
GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storag...
cudaError_t Setup(int sweep_grid_size)
virtual ~GridBarrierLifetime()
GridBarrier implements a software global barrier among thread blocks within a CUDA grid.
__device__ __forceinline__ void Sync() const
#define CubDebug(e)
Debug macro.
Optional outer namespace(s)