This example show the classical SPH Dam break simulation with load balancing and dynamic load balancing. The main difference with SPH with Dynamic load Balancing is that here we use GPUs and 1.2 Millions particles.
On SPH we have the necessity to remove particles that go out of bound. OpenFPM provide the function remove_marked .
where vd is the vector_dist_gpu red is the property that mark which particle must be removed. We mark the particle to be removed in the function kernel We check if the particle go out of the region of interest or their density go critically far from the rest density
When we want to launch a kernel "my_kernel" on CUDA we in general use the Nvidia CUDA syntax
my_kernel<<<wthr,thr>>>(arguments ... )
Where wthr is the number of workgroups and thr is the number of threads in a workgroup and arguments... are the arguments to pass to the kernel. Equivalently we can launch a kernel with the macro CUDA_LAUNCH_DIM3(my_kernel,wthr,thr,arguments...) or CUDA_LAUNCH(my_kernel,ite,arguments) where ite has been taken using getDomainIteratorGPU. There are several advantage on using CUDA_LAUNCH. The first advantage in using the macro is enabling SE_CLASS1 all kernel launch become synchronous and an error check is performed before continue to the next kernel making debugging easier. Another feature is the possibility to run CUDA code on CPU without a GPU. compiling with "CUDA_ON_CPU=1 make" (Note openfpm must be compiled with GPU support (-g) or with CUDA_ON_CPU support (-c "... --enable_cuda_on_cpu"). You can compile this example on CPU. You do not have to change a single line of code for this example. (Check the video to see this feature in action). All the openfpm GPU example and CUDA example can run on CPU if they use CUDA_LAUNCH as macro. We are planning to support AMD GPUs as well using this system.
#ifdef __NVCC__
#define PRINT_STACKTRACE
#define OPENMPI
#define SCAN_WITH_CUB
#define SORT_WITH_CUB
#include "Vector/vector_dist.hpp"
#include <math.h>
#include "Draw/DrawParticles.hpp"
typedef float real_number;
#define BOUNDARY 0
#define FLUID 1
const real_number dp = 0.00425;
real_number h_swl = 0.0;
const real_number coeff_sound = 20.0;
const real_number gamma_ = 7.0;
const real_number H = 0.00736121593217;
const real_number Eta2 = 0.01 * H*H;
const real_number FourH2 = 4.0 * H*H;
const real_number visco = 0.1;
real_number cbar = 0.0;
const real_number MassFluid = 0.0000767656;
const real_number MassBound = 0.0000767656;
#ifdef TEST_RUN
const real_number t_end = 0.001;
#else
const real_number t_end = 1.5;
#endif
const real_number gravity = 9.81;
const real_number rho_zero = 1000.0;
real_number B = 0.0;
const real_number CFLnumber = 0.2;
const real_number DtMin = 0.00001;
const real_number RhoMin = 700.0;
const real_number RhoMax = 1300.0;
real_number max_fluid_height = 0.0;
const size_t type = 0;
const int rho = 1;
const int rho_prev = 2;
const int Pressure = 3;
const int drho = 4;
const int force = 5;
const int velocity = 6;
const int velocity_prev = 7;
const int red2 = 9;
typedef vector_dist_gpu<3,real_number,aggregate<unsigned int,real_number, real_number, real_number, real_number, real_number[3], real_number[3], real_number[3], real_number, real_number>>
particles;
{
template<
typename Decomposition,
typename vector>
inline void addComputation(
Decomposition & dec,
vector & vd,
size_t v,
size_t p)
{
if (vd.template getProp<type>(p) == FLUID)
dec.addComputationCost(v,4);
else
dec.addComputationCost(v,3);
}
template<
typename Decomposition>
inline void applyModel(
Decomposition & dec,
size_t v)
{
dec.setSubSubDomainComputationCost(v, dec.getSubSubDomainComputationCost(v) * dec.getSubSubDomainComputationCost(v));
}
real_number distributionTol()
{
return 1.01;
}
};
template<typename vd_type>
__global__ void EqState_gpu(vd_type vd, real_number B)
{
auto a = GET_PARTICLE(vd);
real_number rho_a = vd.template getProp<rho>(a);
real_number rho_frac = rho_a / rho_zero;
vd.template getProp<Pressure>(a) = B*( rho_frac*rho_frac*rho_frac*rho_frac*rho_frac*rho_frac*rho_frac - 1.0);
}
{
auto it = vd.getDomainIteratorGPU();
CUDA_LAUNCH(EqState_gpu,it,vd.toKernel(),B);
}
const real_number a2 = 1.0/M_PI/H/H/H;
inline __device__ __host__ real_number Wab(real_number r)
{
r /= H;
if (r < 1.0)
return (1.0 - 3.0/2.0*r*r + 3.0/4.0*r*r*r)*a2;
else if (r < 2.0)
return (1.0/4.0*(2.0 - r*r)*(2.0 - r*r)*(2.0 - r*r))*a2;
else
return 0.0;
}
const real_number c1 = -3.0/M_PI/H/H/H/H;
const real_number d1 = 9.0/4.0/M_PI/H/H/H/H;
const real_number c2 = -3.0/4.0/M_PI/H/H/H/H;
const real_number a2_4 = 0.25*a2;
real_number W_dap = 0.0;
{
const real_number qq=r/H;
real_number qq2 = qq * qq;
real_number fac1 = (c1*qq + d1*qq2)/r;
real_number b1 = (qq < 1.0f)?1.0f:0.0f;
real_number wqq = (2.0f - qq);
real_number fac2 = c2 * wqq * wqq / r;
real_number b2 = (qq >= 1.0f && qq < 2.0f)?1.0f:0.0f;
real_number factor = (b1*fac1 + b2*fac2);
DW.
get(0) = factor * dx.
get(0);
DW.
get(1) = factor * dx.
get(1);
DW.
get(2) = factor * dx.
get(2);
}
inline __device__ __host__ real_number Tensile(real_number r, real_number rhoa, real_number rhob, real_number prs1, real_number prs2, real_number W_dap)
{
const real_number qq=r/H;
real_number wab;
if(r>H)
{
real_number wqq1=2.0f-qq;
real_number wqq2=wqq1*wqq1;
wab=a2_4*(wqq2*wqq1);
}
else
{
real_number wqq2=qq*qq;
real_number wqq3=wqq2*qq;
wab=a2*(1.0f-1.5f*wqq2+0.75f*wqq3);
}
real_number fab=wab*W_dap;
fab*=fab; fab*=fab;
const real_number tensilp1=(prs1/(rhoa*rhoa))*(prs1>0.0f? 0.01f: -0.2f);
const real_number tensilp2=(prs2/(rhob*rhob))*(prs2>0.0f? 0.01f: -0.2f);
return (fab*(tensilp1+tensilp2));
}
inline __device__ __host__ real_number Pi(
const Point<3,real_number> & dr, real_number rr2,
Point<3,real_number> & dv, real_number rhoa, real_number rhob, real_number massb, real_number cbar, real_number & visc)
{
const real_number dot_rr2 = dot/(rr2+Eta2);
visc=(dot_rr2 < visc)?visc:dot_rr2;
if(dot < 0)
{
const float amubar=H*dot_rr2;
const float robar=(rhoa+rhob)*0.5f;
const float pi_visc=(-visco*cbar*amubar/robar);
return pi_visc;
}
else
return 0.0f;
}
template<typename particles_type, typename NN_type>
__global__
void calc_forces_gpu(
particles_type vd, NN_type NN, real_number W_dap, real_number cbar)
{
unsigned int a;
GET_PARTICLE_SORT(a,NN);
real_number max_visc = 0.0f;
unsigned int typea = vd.template getProp<type>(a);
real_number rhoa = vd.template getProp<rho>(a);
real_number Pa = vd.template getProp<Pressure>(a);
force_.
get(2) = -gravity;
real_number drho_ = 0.0f;
auto Np = NN.getNNIteratorBox(NN.getCell(xa));
while (Np.isNext() == true)
{
auto b = Np.get_sort();
if (a == b) {++Np; continue;};
unsigned int typeb = vd.template getProp<type>(b);
real_number massb = (typeb == FLUID)?MassFluid:MassBound;
real_number Pb = vd.template getProp<Pressure>(b);
real_number rhob = vd.template getProp<rho>(b);
real_number r2 = norm2(dr);
if (r2 < FourH2 && r2 >= 1e-16)
{
real_number r = sqrtf(r2);
DWab(dr,DW,r);
real_number factor = - massb*((Pa + Pb) / (rhoa * rhob) + Tensile(r,rhoa,rhob,Pa,Pb,W_dap) + Pi(dr,r2,v_rel,rhoa,rhob,massb,cbar,max_visc));
factor = (typea == BOUNDARY && typeb == BOUNDARY)?0.0f:factor;
force_.
get(0) += factor * DW.
get(0);
force_.
get(1) += factor * DW.
get(1);
force_.
get(2) += factor * DW.
get(2);
real_number scal = massb*(v_rel.
get(0)*DW.
get(0)+v_rel.
get(1)*DW.
get(1)+v_rel.
get(2)*DW.
get(2));
scal = (typea == BOUNDARY && typeb == BOUNDARY)?0.0f:scal;
drho_ += scal;
}
++Np;
}
vd.template getProp<red>(a) = max_visc;
vd.template getProp<force>(a)[0] = force_.
get(0);
vd.template getProp<force>(a)[1] = force_.
get(1);
vd.template getProp<force>(a)[2] = force_.
get(2);
vd.template getProp<drho>(a) = drho_;
}
template<
typename CellList>
inline void calc_forces(
particles & vd,
CellList & NN, real_number & max_visc,
size_t cnt)
{
auto part = vd.getDomainIteratorGPU(96);
CUDA_LAUNCH(calc_forces_gpu,part,vd.toKernel_sorted(),NN.toKernel(),W_dap,cbar);
vd.merge_sort<force,drho,
red>(NN);
max_visc = reduce_local<red,_max_>(vd);
}
template<typename vector_type>
__global__
void max_acceleration_and_velocity_gpu(
vector_type vd)
{
auto a = GET_PARTICLE(vd);
vd.template getProp<red>(a) = norm(acc);
vd.template getProp<red2>(a) = norm(vel);
}
void max_acceleration_and_velocity(
particles & vd, real_number & max_acc, real_number & max_vel)
{
auto part = vd.getDomainIteratorGPU();
CUDA_LAUNCH(max_acceleration_and_velocity_gpu,part,vd.toKernel());
max_acc = reduce_local<red,_max_>(vd);
max_vel = reduce_local<red2,_max_>(vd);
}
real_number calc_deltaT(
particles & vd, real_number ViscDtMax)
{
real_number Maxacc = 0.0;
real_number Maxvel = 0.0;
max_acceleration_and_velocity(vd,Maxacc,Maxvel);
const real_number dt_f = (Maxacc)?sqrt(H/Maxacc):std::numeric_limits<float>::max();
const real_number dt_cv = H/(std::max(cbar,Maxvel*10.f) + H*ViscDtMax);
real_number dt=real_number(CFLnumber)*std::min(dt_f,dt_cv);
if(dt<real_number(DtMin))
{dt=real_number(DtMin);}
return dt;
}
template<typename vector_dist_type>
__global__ void verlet_int_gpu(vector_dist_type vd, real_number dt, real_number dt2, real_number dt205)
{
auto a = GET_PARTICLE(vd);
if (vd.template getProp<type>(a) == BOUNDARY)
{
real_number rhop = vd.template getProp<rho>(a);
vd.template getProp<velocity>(a)[0] = 0.0;
vd.template getProp<velocity>(a)[1] = 0.0;
vd.template getProp<velocity>(a)[2] = 0.0;
real_number rhonew = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
vd.template getProp<rho>(a) = (rhonew < rho_zero)?rho_zero:rhonew;
vd.template getProp<rho_prev>(a) = rhop;
vd.template getProp<red>(a) = 0;
return;
}
real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
vd.getPos(a)[0] += dx;
vd.getPos(a)[1] += dy;
vd.getPos(a)[2] += dz;
real_number velX = vd.template getProp<velocity>(a)[0];
real_number velY = vd.template getProp<velocity>(a)[1];
real_number velZ = vd.template getProp<velocity>(a)[2];
real_number rhop = vd.template getProp<rho>(a);
vd.template getProp<velocity>(a)[0] = vd.template getProp<velocity_prev>(a)[0] + vd.template getProp<force>(a)[0]*dt2;
vd.template getProp<velocity>(a)[1] = vd.template getProp<velocity_prev>(a)[1] + vd.template getProp<force>(a)[1]*dt2;
vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity_prev>(a)[2] + vd.template getProp<force>(a)[2]*dt2;
vd.template getProp<rho>(a) = vd.template getProp<rho_prev>(a) + dt2*vd.template getProp<drho>(a);
if (vd.getPos(a)[0] < 0.0 || vd.getPos(a)[1] < 0.0 || vd.getPos(a)[2] < 0.0 ||
vd.getPos(a)[0] > 1.61 || vd.getPos(a)[1] > 0.68 || vd.getPos(a)[2] > 0.50 ||
vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
{vd.template getProp<red>(a) = 1;}
else
{vd.template getProp<red>(a) = 0;}
vd.template getProp<velocity_prev>(a)[0] = velX;
vd.template getProp<velocity_prev>(a)[1] = velY;
vd.template getProp<velocity_prev>(a)[2] = velZ;
vd.template getProp<rho_prev>(a) = rhop;
}
size_t cnt = 0;
void verlet_int(
particles & vd, real_number dt)
{
auto part = vd.getDomainIteratorGPU();
real_number dt205 = dt*dt*0.5;
real_number dt2 = dt*2.0;
CUDA_LAUNCH(verlet_int_gpu,part,vd.toKernel(),dt,dt2,dt205);
remove_marked<red>(vd);
cnt++;
}
template<typename vector_type>
__global__
void euler_int_gpu(
vector_type vd,real_number dt, real_number dt205)
{
auto a = GET_PARTICLE(vd);
if (vd.template getProp<type>(a) == BOUNDARY)
{
real_number rhop = vd.template getProp<rho>(a);
vd.template getProp<velocity>(a)[0] = 0.0;
vd.template getProp<velocity>(a)[1] = 0.0;
vd.template getProp<velocity>(a)[2] = 0.0;
real_number rhonew = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
vd.template getProp<rho>(a) = (rhonew < rho_zero)?rho_zero:rhonew;
vd.template getProp<rho_prev>(a) = rhop;
vd.template getProp<red>(a) = 0;
return;
}
real_number dx = vd.template getProp<velocity>(a)[0]*dt + vd.template getProp<force>(a)[0]*dt205;
real_number dy = vd.template getProp<velocity>(a)[1]*dt + vd.template getProp<force>(a)[1]*dt205;
real_number dz = vd.template getProp<velocity>(a)[2]*dt + vd.template getProp<force>(a)[2]*dt205;
real_number velX = vd.template getProp<velocity>(a)[0];
real_number velY = vd.template getProp<velocity>(a)[1];
real_number velZ = vd.template getProp<velocity>(a)[2];
real_number rhop = vd.template getProp<rho>(a);
vd.template getProp<velocity>(a)[0] = vd.template getProp<velocity>(a)[0] + vd.template getProp<force>(a)[0]*dt;
vd.template getProp<velocity>(a)[1] = vd.template getProp<velocity>(a)[1] + vd.template getProp<force>(a)[1]*dt;
vd.template getProp<velocity>(a)[2] = vd.template getProp<velocity>(a)[2] + vd.template getProp<force>(a)[2]*dt;
vd.template getProp<rho>(a) = vd.template getProp<rho>(a) + dt*vd.template getProp<drho>(a);
vd.template getProp<rho>(a) < RhoMin || vd.template getProp<rho>(a) > RhoMax)
{vd.template getProp<red>(a) = 1;}
else
{vd.template getProp<red>(a) = 0;}
vd.template getProp<velocity_prev>(a)[0] = velX;
vd.template getProp<velocity_prev>(a)[1] = velY;
vd.template getProp<velocity_prev>(a)[2] = velZ;
vd.template getProp<rho_prev>(a) = rhop;
}
void euler_int(
particles & vd, real_number dt)
{
auto part = vd.getDomainIteratorGPU();
real_number dt205 = dt*dt*0.5;
CUDA_LAUNCH(euler_int_gpu,part,vd.toKernel(),dt,dt205);
remove_marked<red>(vd);
cnt++;
}
template<typename vector_type, typename NN_type>
{
real_number tot_ker = 0.0;
auto itg = NN.getNNIteratorBox(NN.getCell(xp));
while (itg.isNext())
{
auto q = itg.get_sort();
if (vd.template getProp<type>(q) != FLUID)
{
++itg;
continue;
}
real_number r = sqrt(norm2(xp - xq));
real_number ker = Wab(r) * (MassFluid / rho_zero);
tot_ker += ker;
*press_tmp += vd.template getProp<Pressure>(q) * ker;
++itg;
}
if (tot_ker == 0.0)
{*press_tmp = 0.0;}
else
{*press_tmp = 1.0 / tot_ker * *press_tmp;}
}
template<typename Vector, typename CellList>
inline void sensor_pressure(
Vector & vd,
{
press_t.add();
for (size_t i = 0 ; i < probes.size() ; i++)
{
real_number press_tmp;
if (vd.getDecomposition().isLocal(probes.get(i)) == true)
{
CUDA_LAUNCH_DIM3(sensor_pressure_gpu,1,1,vd.toKernel_sorted(),NN.toKernel(),probes.
get(i),(real_number *)press_tmp_.toKernel());
vd.merge<Pressure>(NN);
press_tmp_.deviceToHost();
press_tmp = *(real_number *)press_tmp_.getPointer();
}
press_t.last().add(press_tmp);
}
}
int main(int argc, char* argv[])
{
openfpm_init(&argc,&argv);
#if !defined(CUDA_ON_CPU) && !defined(__HIP__)
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
#endif
probes.add({0.8779f,0.3f,0.02f});
probes.add({0.754f,0.31f,0.02f});
size_t sz[3] = {413,179,133};
W_dap = 1.0/Wab(H/1.5);
size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
Box<3,real_number> fluid_box({dp/2.0f,dp/2.0f,dp/2.0f},{0.4f+dp/2.0f,0.67f-dp/2.0f,0.3f+dp/2.0f});
max_fluid_height = fluid_it.getBoxMargins().
getHigh(2);
h_swl = fluid_it.getBoxMargins().
getHigh(2) - fluid_it.getBoxMargins().
getLow(2);
B = (coeff_sound)*(coeff_sound)*gravity*h_swl*rho_zero / gamma_;
cbar = coeff_sound * sqrt(gravity * h_swl);
while (fluid_it.isNext())
{
vd.add();
vd.getLastPos()[0] = fluid_it.get().get(0);
vd.getLastPos()[1] = fluid_it.get().get(1);
vd.getLastPos()[2] = fluid_it.get().get(2);
vd.template getLastProp<type>() = FLUID;
vd.template getLastProp<Pressure>() = rho_zero * gravity * (max_fluid_height - fluid_it.get().get(2));
vd.template getLastProp<rho>() = pow(vd.template getLastProp<Pressure>() / B + 1, 1.0/gamma_) * rho_zero;
vd.template getLastProp<rho_prev>() = vd.template getLastProp<rho>();
vd.template getLastProp<velocity>()[0] = 0.0;
vd.template getLastProp<velocity>()[1] = 0.0;
vd.template getLastProp<velocity>()[2] = 0.0;
vd.template getLastProp<velocity_prev>()[0] = 0.0;
vd.template getLastProp<velocity_prev>()[1] = 0.0;
vd.template getLastProp<velocity_prev>()[2] = 0.0;
++fluid_it;
}
Box<3,real_number> recipient1({0.0f,0.0f,0.0f},{1.6f+dp/2.0f,0.67f+dp/2.0f,0.4f+dp/2.0f});
Box<3,real_number> obstacle1({0.9f,0.24f-dp/2.0f,0.0f},{1.02f+dp/2.0f,0.36f,0.45f+dp/2.0f});
Box<3,real_number> obstacle2({0.9f+dp,0.24f+dp/2.0f,0.0f},{1.02f-dp/2.0f,0.36f-dp,0.45f-dp/2.0f});
holes.add(recipient2);
holes.add(obstacle1);
while (bound_box.isNext())
{
vd.add();
vd.getLastPos()[0] = bound_box.get().get(0);
vd.getLastPos()[1] = bound_box.get().get(1);
vd.getLastPos()[2] = bound_box.get().get(2);
vd.template getLastProp<type>() = BOUNDARY;
vd.template getLastProp<rho>() = rho_zero;
vd.template getLastProp<rho_prev>() = rho_zero;
vd.template getLastProp<velocity>()[0] = 0.0;
vd.template getLastProp<velocity>()[1] = 0.0;
vd.template getLastProp<velocity>()[2] = 0.0;
vd.template getLastProp<velocity_prev>()[0] = 0.0;
vd.template getLastProp<velocity_prev>()[1] = 0.0;
vd.template getLastProp<velocity_prev>()[2] = 0.0;
++bound_box;
}
while (obstacle_box.isNext())
{
vd.add();
vd.getLastPos()[0] = obstacle_box.get().get(0);
vd.getLastPos()[1] = obstacle_box.get().get(1);
vd.getLastPos()[2] = obstacle_box.get().get(2);
vd.template getLastProp<type>() = BOUNDARY;
vd.template getLastProp<rho>() = rho_zero;
vd.template getLastProp<rho_prev>() = rho_zero;
vd.template getLastProp<velocity>()[0] = 0.0;
vd.template getLastProp<velocity>()[1] = 0.0;
vd.template getLastProp<velocity>()[2] = 0.0;
vd.template getLastProp<velocity_prev>()[0] = 0.0;
vd.template getLastProp<velocity_prev>()[1] = 0.0;
vd.template getLastProp<velocity_prev>()[2] = 0.0;
++obstacle_box;
}
vd.map();
vd.addComputationCosts(md);
vd.getDecomposition().decompose();
vd.map();
vd.hostToDevicePos();
vd.template hostToDeviceProp<type,rho,rho_prev,Pressure,velocity,velocity_prev>();
vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
auto NN = vd.getCellListGPU(2*H / 2.0);
size_t write = 0;
size_t it = 0;
size_t it_reb = 0;
real_number t = 0.0;
while (t <= t_end)
{
it_reb++;
if (it_reb == 300)
{
vd.map(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.template deviceToHostProp<type>();
it_reb = 0;
vd.addComputationCosts(md);
vd.getDecomposition().decompose();
{std::cout << "REBALANCED " << it_reb << std::endl;}
}
vd.map(RUN_ON_DEVICE);
EqState(vd);
real_number max_visc = 0.0;
vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
calc_forces(vd,NN,max_visc,cnt);
real_number dt = calc_deltaT(vd,max_visc);
it++;
if (it < 40)
verlet_int(vd,dt);
else
{
euler_int(vd,dt);
it = 0;
}
t += dt;
if (write < t*10)
{
vd.map(RUN_ON_DEVICE);
vd.ghost_get<type,rho,Pressure,velocity>(RUN_ON_DEVICE);
vd.updateCellList(NN);
std::cout << "OUTPUT " << dt << std::endl;
vd.deviceToHostPos();
vd.deviceToHostProp<type,rho,rho_prev,Pressure,drho,force,velocity,velocity_prev,
red,red2>();
auto ito = vd.getDomainIterator();
while(ito.isNext())
{
auto p = ito.get();
vd_out.add();
vd_out.getLastPos()[0] = vd.getPos(p)[0];
vd_out.getLastPos()[1] = vd.getPos(p)[1];
vd_out.getLastPos()[2] = vd.getPos(p)[2];
vd_out.template getLastProp<0>() = vd.template getProp<type>(p);
vd_out.template getLastProp<1>()[0] = vd.template getProp<velocity>(p)[0];
vd_out.template getLastProp<1>()[1] = vd.template getProp<velocity>(p)[1];
vd_out.template getLastProp<1>()[2] = vd.template getProp<velocity>(p)[2];
++ito;
}
vd_out.write_frame("Particles",write,VTK_WRITER | FORMAT_BINARY);
write++;
{std::cout <<
"TIME: " << t <<
" write " << it_time.
getwct() <<
" " << it_reb <<
" " << cnt <<
" Max visc: " << max_visc <<
" " << vd.size_local() << std::endl;}
}
else
{
{std::cout <<
"TIME: " << t <<
" " << it_time.
getwct() <<
" " << it_reb <<
" " << cnt <<
" Max visc: " << max_visc <<
" " << vd.size_local() << std::endl;}
}
}
std::cout <<
"Time to complete: " << tot_sim.
getwct() <<
" seconds" << std::endl;
openfpm_finalize();
}
#else
int main(int argc, char* argv[])
{
return 0;
}
#endif
This class represent an N-dimensional box.
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
__device__ __host__ T getHigh(int i) const
get the high interval of the box
Class for FAST cell list implementation.
auto get(size_t cell, size_t ele) -> decltype(this->Mem_type::get(cell, ele))
Get an element in the cell.
This class define the domain decomposition interface.
static PointIteratorSkin< dim, T, typename vd_type::Decomposition_type > DrawSkin(vd_type &vd, size_t(&sz)[dim], Box< dim, T > &domain, Box< dim, T > &sub_A, Box< dim, T > &sub_B)
Draw particles in a box B excluding the area of a second box A (B - A)
static PointIterator< dim, T, typename vd_type::Decomposition_type > DrawBox(vd_type &vd, size_t(&sz)[dim], Box< dim, T > &domain, Box< dim, T > &sub)
Draw particles in a box.
This class implement the point shape in an N-dimensional space.
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
void execute()
Execute all the requests.
void sum(T &num)
Sum the numbers across all processors and get the result.
size_t getProcessUnitID()
Get the process unit id.
void max(T &num)
Get the maximum number across all processors (or reduction with infinity norm)
Implementation of VCluster class.
Sparse Matrix implementation stub object when OpenFPM is compiled with no linear algebra support.
Implementation of 1-D std::vector like structure.
Class for cpu time benchmarking.
void stop()
Stop the timer.
void start()
Start the timer.
double getwct()
Return the elapsed real time.
void updateCellList(CellL &cell_list, bool no_se3=false, cl_construct_opt opt=cl_construct_opt::Full)
Update a cell list using the stored particles.
auto getPos(vect_dist_key_dx vec_key) -> decltype(v_pos.template get< 0 >(vec_key.getKey()))
Get the position of an element.
Model for Dynamic load balancing.
temporal buffer for reductions