OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
main.cu
#ifdef __NVCC__

#define OPENMPI

#define SCAN_WITH_CUB // <------ MODERNGPU is broken on RTX, use the CUB library for scan
//#define EXTERNAL_SET_GPU // <----- In case you want to distribute the GPUs differently from the default

#include "Vector/vector_dist.hpp"

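// GPU kernel, one thread per particle: fill the scalar, vector and tensor properties from the particle
// position, then shift the position by 0.01 in x and y. GET_PARTICLE computes the particle index from the
// thread/block indices and returns early for threads that fall beyond the last particle.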
template<typename vector_type>
__global__ void translate_fill_prop(vector_type vd)
{
	auto p = GET_PARTICLE(vd);

	vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1];

	vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
	vd.template getProp<1>(p)[1] = vd.getPos(p)[1];

	vd.template getProp<2>(p)[0][0] = vd.getPos(p)[0];
	vd.template getProp<2>(p)[0][1] = vd.getPos(p)[1];
	vd.template getProp<2>(p)[1][0] = vd.getPos(p)[0] + vd.getPos(p)[1];
	vd.template getProp<2>(p)[1][1] = vd.getPos(p)[1] - vd.getPos(p)[0];

	vd.getPos(p)[0] += 0.01f;
	vd.getPos(p)[1] += 0.01f;
}

int main(int argc, char* argv[])
{
	// OpenFPM GPU distribution
	//
	// By default OpenFPM selects GPU 0 for process 0, GPU 1 for process 1, and so on. The multi-node case works the same
	// way: each node hosts a group of processes, and that group is distributed across the GPUs available on that node.
	//
	// If you want to override this behaviour, #define EXTERNAL_SET_GPU at the very beginning of the program and call
	// cudaSetDevice to select the GPU for that particular process before openfpm_init.
	// Note: to get the process number, call MPI_Init and then use MPI_Comm_rank. VCluster is not available before openfpm_init.
	// A code snippet in case we want to skip GPU 0:
	// MPI_Init(&argc,&argv);
	// int rank;
	// MPI_Comm_rank(MPI_COMM_WORLD,&rank);
	// cudaSetDevice(1+rank);

	// initialize the library
	openfpm_init(&argc,&argv);

	// Here we define our domain: a 2D box with extents from 0.0 to 1.0 in x and y
	Box<2,float> domain({0.0,0.0},{1.0,1.0});

	// Here we define the boundary conditions of our problem
	size_t bc[2]={PERIODIC,PERIODIC};

	// extended boundary around the domain, and the processor domain
	Ghost<2,float> g(0.05);

	// The distributed vector on the GPU: each particle carries a scalar, a vector (float[2]) and a tensor (float[2][2]),
	// matching the properties written by translate_fill_prop (the initial particle count, 100, is just an example value)
	vector_dist_gpu<2,float,aggregate<float,float[2],float[2][2]>> vd(100,domain,bc,g);

	// the scalar is the element at position 0 in the aggregate
	const int scalar = 0;

	// the vector is the element at position 1 in the aggregate
	const int vector = 1;

	// the tensor is the element at position 2 in the aggregate
	const int tensor = 2;

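	// Give each particle a random initial position on the host, iterating over the particles stored locally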
	auto it = vd.getDomainIterator();

	while (it.isNext())
	{
		auto key = it.get();

		// we define x, assign a random position between 0.0 and 1.0
		vd.getPos(key)[0] = (float)rand() / RAND_MAX;

		// we define y, assign a random position between 0.0 and 1.0
		vd.getPos(key)[1] = (float)rand() / RAND_MAX;

		// next particle
		++it;
	}

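	// map() moves every particle that does not belong to this processor to the processor that owns it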
	vd.map();

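	// The particles were created and initialized on the host, so copy positions and all three properties to the device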
	vd.hostToDevicePos();
	vd.template hostToDeviceProp<scalar,vector,tensor>();

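	// Launch the kernel on the GPU. CUDA_LAUNCH wraps the triple-chevron launch shown in the commented line below,
	// using the thread/block configuration computed by getDomainIteratorGPU()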
	auto ite = vd.getDomainIteratorGPU();
	// translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
	CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

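	// Copy positions and properties back to the host so they can be written to a file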
	vd.deviceToHostPos();
	vd.deviceToHostProp<0,1,2>();

	// We write to a file
	vd.write("output");

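	// Main loop: the data stays on the GPU. map() and ghost_get() are called with RUN_ON_DEVICE so redistribution
	// and ghost exchange work directly on the device buffers; data is copied back to the host only every 10
	// iterations, when a frame is written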
	for (int j = 0 ; j < 100 ; j++)
	{
		auto ite = vd.getDomainIteratorGPU();
		// translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
		CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

		vd.map(RUN_ON_DEVICE);
		vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);

		if ( j % 10 == 0)
		{
			// offload to host
			vd.deviceToHostPos();
			vd.template deviceToHostProp<0,1,2>();

			// write
			vd.write_frame("output_f",j);
		}
	}
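	// Check whether OpenFPM detected CUDA-aware MPI, i.e. whether MPI can send and receive GPU buffers directly (RDMA)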
	bool active = is_mpi_rdma_cuda_active();

	std::cout << "Is MPI rdma active on CUDA " << active << std::endl;

	openfpm_finalize();
}

#else

int main(int argc, char* argv[])
{
	return 0;
}

#endif