OpenFPM 5.2.0
Project that contains the implementation of distributed structures
main.cu
#ifdef __NVCC__

#define OPENMPI

//#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default

#include "Vector/vector_dist.hpp"

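// Kernel: for each particle p, fill the scalar, vector and tensor properties from the
// particle position, then shift the particle by 0.01 in x and y.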
template<typename vector_type>
__global__ void translate_fill_prop(vector_type vd)
{
    auto p = GET_PARTICLE(vd);

    vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1];

    vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
    vd.template getProp<1>(p)[1] = vd.getPos(p)[1];

    vd.template getProp<2>(p)[0][0] = vd.getPos(p)[0];
    vd.template getProp<2>(p)[0][1] = vd.getPos(p)[1];
    vd.template getProp<2>(p)[1][0] = vd.getPos(p)[0] + vd.getPos(p)[1];
    vd.template getProp<2>(p)[1][1] = vd.getPos(p)[1] - vd.getPos(p)[0];

    vd.getPos(p)[0] += 0.01f;
    vd.getPos(p)[1] += 0.01f;
}

int main(int argc, char* argv[])
{
    // OpenFPM GPU distribution

    // By default OpenFPM selects GPU 0 for process 0, GPU 1 for process 1, and so on. The multi-node
    // case works the same way: each node hosts a group of processes, and that group is distributed
    // across the GPUs available on that node.

    // If you want to override this behaviour, #define EXTERNAL_SET_GPU at the very beginning of the
    // program and call cudaSetDevice to select the GPU for that particular process before openfpm_init.
    // Note: to get the process rank, call MPI_Init and then MPI_Comm_rank; VCluster is not available
    // before openfpm_init. A code snippet in case we want to skip GPU 0:
    // MPI_Init(&argc,&argv);
    // int rank;
    // MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    // cudaSetDevice(1+rank);

    // initialize the library
    openfpm_init(&argc,&argv);

    // Here we define our domain: a 2D box with intervals from 0.0 to 1.0 in x and y
    Box<2,float> domain({0.0,0.0},{1.0,1.0});

    // Here we define the boundary conditions of our problem
    size_t bc[2]={PERIODIC,PERIODIC};

    // extended boundary around the domain, and the processor domain
    Ghost<2,float> g(0.05);

    // distributed vector of 100 particles in 2D, each carrying a scalar, a vector and a tensor property
    vector_dist_gpu<2,float,aggregate<float,float[2],float[2][2]>> vd(100,domain,bc,g);

    // the scalar is the element at position 0 in the aggregate
    const int scalar = 0;

    // the vector is the element at position 1 in the aggregate
    const int vector = 1;

    // the tensor is the element at position 2 in the aggregate
    const int tensor = 2;

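    // Iterate over all the particles assigned to this processor and give each one a random
    // position between 0.0 and 1.0; at this stage positions live in host memory.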
    auto it = vd.getDomainIterator();

    while (it.isNext())
    {
        auto key = it.get();

        // we define x, assign a random position between 0.0 and 1.0
        vd.getPos(key)[0] = (float)rand() / RAND_MAX;

        // we define y, assign a random position between 0.0 and 1.0
        vd.getPos(key)[1] = (float)rand() / RAND_MAX;

        // next particle
        ++it;
    }

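    // Redistribute the particles: map() moves every particle that does not belong to the
    // local processor to the processor that owns it.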
    vd.map();

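    // Offload the particle positions and the scalar, vector and tensor properties from the
    // host to the device.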
    vd.hostToDevicePos();
    vd.template hostToDeviceProp<scalar,vector,tensor>();

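    // Launch the kernel on the GPU: getDomainIteratorGPU() provides the launch configuration,
    // and CUDA_LAUNCH wraps the raw call shown in the commented-out line below.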
    auto ite = vd.getDomainIteratorGPU();
    // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
    CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

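    // Copy positions and properties back to the host so they can be written to a file.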
    vd.deviceToHostPos();
    vd.deviceToHostProp<0,1,2>();

    // We write on a file
    vd.write("output");

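    // Time loop: the kernel, the particle redistribution (map) and the ghost exchange all run
    // on the device (RUN_ON_DEVICE); data is copied back to the host only every 10 iterations,
    // when a frame is written.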
    for (int j = 0 ; j < 100 ; j++)
    {
        auto ite = vd.getDomainIteratorGPU();
        // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
        CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

        vd.map(RUN_ON_DEVICE);
        vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);

        if (j % 10 == 0)
        {
            // offload to host
            vd.deviceToHostPos();
            vd.template deviceToHostProp<0,1,2>();

            // write
            vd.write_frame("output_f",j);
        }
    }

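    // Check whether CUDA-aware MPI (RDMA on GPU memory) is active for this run.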
    bool active = is_mpi_rdma_cuda_active();

    std::cout << "Is MPI rdma active on CUDA " << active << std::endl;

    openfpm_finalize();
}

#else

int main(int argc, char* argv[])
{
    return 0;
}

#endif
Classes and member functions referenced in this example:

Box: This class represents an N-dimensional box. Defined in Box.hpp:60.
Ghost: Defined in Ghost.hpp:40.
vector_dist: Distributed vector.
vect_dist_key_dx get(): Get the actual key.
auto getPos(vect_dist_key_dx vec_key): Get the position of an element.
vector_dist_iterator getDomainIterator() const: Get an iterator that traverses the particles in the domain.
void map(size_t opt=NONE): Move all the particles that do not belong to the local processor to the respective processor.
void hostToDevicePos(): Move the memory from the host to the device.
void deviceToHostPos(): Move the memory from the device to host memory.
void deviceToHostProp(): Move the memory from the device to host memory.
bool write(std::string out, int opt=VTK_WRITER): Output particle position and properties.
bool write_frame(std::string out, size_t iteration, int opt=VTK_WRITER): Output particle position and properties.