#define VCLUSTER_PERF_REPORT
#define SYNC_BEFORE_TAKE_TIME
#define ENABLE_GRID_DIST_ID_PERF_STATS

#include "Decomposition/Distribution/BoxDistribution.hpp"
#include "util/cuda_launch.hpp"
#include "Grid/grid_dist_id.hpp"
#include "data_type/aggregate.hpp"

constexpr int U_next = 2;
constexpr int V_next = 3;
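// The SparseGridType typedef below takes a decomposition type Dec as its last template
// parameter. A minimal sketch of how Dec can be built on the BoxDistribution header
// included above; the exact CartDecomposition parameter list used by the original
// example is an assumption.
typedef CartDecomposition<3,float,CudaMemory,memory_traits_inte,BoxDistribution<3,float>> Dec;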
typedef sgrid_dist_id_gpu<3,float,aggregate<float>,CudaMemory,Dec> SparseGridType;
timer t;   // CPU wall-clock timer; start/stop placement assumed from the print below
t.start();

for (int i = 0 ; i < 10 ; i++)
{
    grid.addPoints([] __device__ (int i, int j, int k)
                   {
                       // insertion predicate: keep every candidate point
                       // (body assumed, not shown in this fragment)
                       return true;
                   },
                   [] __device__ (InsertBlockT & data, int i, int j, int k)
                   {
                       data.template get<U>() = 1.0;
                   });

    grid.template flush<smax_<U>>(flush_type::FLUSH_ON_DEVICE);
}

t.stop();
std::cout << "Time populate: " << t.getwct() << std::endl;

cudaDeviceSynchronize();
timer t2;   // second population phase, timed the same way (placement assumed)
t2.start();

grid.addPoints([] __device__ (int i, int j, int k)
               {
                   // insertion predicate (body assumed, not shown in this fragment)
                   return true;
               },
               [] __device__ (InsertBlockT & data, int i, int j, int k)
               {
                   data.template get<U>() = 5.0;
               });

grid.template flush<sRight_<U>>(flush_type::FLUSH_ON_DEVICE);

t2.stop();
std::cout << "Time populate: " << t2.getwct() << std::endl;
int main(int argc, char* argv[])
{
    openfpm_init(&argc,&argv);
    size_t sz[3] = {512,512,512};
#ifdef TEST_RUN
    size_t timeSteps = 300;    // shorter run when compiled as a test (guard name assumed)
#else
    size_t timeSteps = 15000;
#endif
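    // The SparseGridType constructor below needs the simulation Box, a Ghost layer and
    // the boundary conditions. A minimal sketch of the usual setup in the OpenFPM
    // sparse-grid examples; the extents, ghost width and periodicity chosen here are
    // assumptions, not taken from the original example.
    Box<3,float> domain({0.0,0.0,0.0},{2.5,2.5,2.5});
    Ghost<3,long int> g(1);
    periodicity<3> bc = {PERIODIC,PERIODIC,PERIODIC};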
    SparseGridType grid(sz,domain,g,bc);
    float spacing[3] = {grid.spacing(0),grid.spacing(1),grid.spacing(2)};
    grid.deviceToHost<U>();
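    // The time-stepping loop over timeSteps and the output of the result are not part
    // of this fragment. As with every OpenFPM program, the example has to shut the
    // library down before returning; the exact position of this call is an assumption.
    openfpm_finalize();
}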