OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
vector_dist_gpu_unit_tests.cu
1 #define BOOST_TEST_DYN_LINK
2 #include "config.h"
3 #include <boost/test/unit_test.hpp>
4 #include "VCluster/VCluster.hpp"
5 #include <Vector/vector_dist.hpp>
6 #include "Vector/tests/vector_dist_util_unit_tests.hpp"
7 
8 #define SUB_UNIT_FACTOR 1024
9 
/*! \brief Kernel: translate every particle by a fixed offset
 *
 * Each thread handles one particle and adds 0.05 to every coordinate.
 *
 * \tparam dim dimensionality of the particle positions
 * \tparam vector_dist_type kernel-side distributed vector type
 */
template<unsigned int dim , typename vector_dist_type>
__global__ void move_parts_gpu_test(vector_dist_type vd)
{
	auto key = GET_PARTICLE(vd);

#pragma unroll
	for (int d = 0 ; d < dim ; d++)
	{vd.getPos(key)[d] += 0.05;}
}
21 
22 BOOST_AUTO_TEST_SUITE( vector_dist_gpu_test )
23 
24 void print_test(std::string test, size_t sz)
25 {
26  if (create_vcluster().getProcessUnitID() == 0)
27  std::cout << test << " " << sz << "\n";
28 }
29 
30 
/*! \brief Kernel: fill properties 0 and 1 from the particle position
 *
 * prop 0 = x + y + z ; prop 1 = {x+y, x+z, y+z}. The host side later
 * verifies these same expressions after device-to-host transfers.
 */
__global__ void initialize_props(vector_dist_ker<3, float, aggregate<float, float [3], float[3]>> vd)
{
	auto key = GET_PARTICLE(vd);

	float x = vd.getPos(key)[0];
	float y = vd.getPos(key)[1];
	float z = vd.getPos(key)[2];

	vd.template getProp<0>(key) = x + y + z;

	vd.template getProp<1>(key)[0] = x + y;
	vd.template getProp<1>(key)[1] = x + z;
	vd.template getProp<1>(key)[2] = y + z;
}
41 
/*! \brief Kernel: compute a pairwise "force" on each particle twice
 *
 * The same neighborhood interaction is accumulated once through the
 * sorted particle view (stored in prop 1) and once through the plain
 * view (stored in prop 2), so the host can cross-check both access
 * paths against a CPU reference.
 *
 * \param vd plain (unsorted) particle view
 * \param vd_sort sorted particle view
 * \param cl GPU cell list providing the neighborhood iterator
 * \param rank calling processor rank (unused, kept for debugging)
 */
template<typename T,typename CellList_type>
__global__ void calculate_force(vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vd,
				vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vd_sort,
				CellList_type cl,
				int rank)
{
	auto p = GET_PARTICLE(vd);

	Point<3,T> xp = vd.getPos(p);

	Point<3,T> f_sorted({0.0,0.0,0.0});
	Point<3,T> f_plain({0.0,0.0,0.0});

	auto nnIt = cl.getNNIterator(cl.getCell(xp));

	while (nnIt.isNext())
	{
		auto qs = nnIt.get_sort();
		auto qp = nnIt.get();

		// skip self-interaction (plain index space)
		if (qp == p) {++nnIt; continue;}

		Point<3,T> xq_s = vd_sort.getPos(qs);
		Point<3,T> xq_p = vd.getPos(qp);

		Point<3,T> rs = xq_s - xp;
		Point<3,T> rp = xq_p - xp;

		// normalize; guard against coincident particles (norm ~ 0)
		if (rs.norm() > 1e-6)
		{
			rs /= rs.norm();
			f_sorted += vd_sort.template getProp<0>(qs)*rs;
		}
		if (rp.norm() > 1e-6)
		{
			rp /= rp.norm();
			f_plain += vd.template getProp<0>(qp)*rp;
		}

		++nnIt;
	}

	vd.template getProp<1>(p)[0] = f_sorted.get(0);
	vd.template getProp<1>(p)[1] = f_sorted.get(1);
	vd.template getProp<1>(p)[2] = f_sorted.get(2);

	vd.template getProp<2>(p)[0] = f_plain.get(0);
	vd.template getProp<2>(p)[1] = f_plain.get(1);
	vd.template getProp<2>(p)[2] = f_plain.get(2);
}
94 
/*! \brief Kernel: same pairwise "force" as calculate_force, fully sorted path
 *
 * Both the particle index and every neighbor are taken from the sorted
 * index space (GET_PARTICLE_SORT / get_sort); the result goes into
 * prop 1 and is later merged back via merge_sort on the host side.
 */
template<typename T, typename CellList_type>
__global__ void calculate_force_full_sort(vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vd,
					  CellList_type cl, int rank)
{
	unsigned int p;
	GET_PARTICLE_SORT(p,cl);

	Point<3,T> xp = vd.getPos(p);

	Point<3,T> acc({0.0,0.0,0.0});

	auto nnIt = cl.getNNIterator(cl.getCell(xp));

	while (nnIt.isNext())
	{
		auto q = nnIt.get_sort();

		// skip self-interaction (sorted index space)
		if (q != p)
		{
			Point<3,T> xq = vd.getPos(q);
			Point<3,T> dr = xq - xp;

			// normalize; guard against coincident particles (norm ~ 0)
			if (dr.norm() > 1e-6)
			{
				dr /= dr.norm();

				acc += vd.template getProp<0>(q)*dr;
			}
		}

		++nnIt;
	}

	vd.template getProp<1>(p)[0] = acc.get(0);
	vd.template getProp<1>(p)[1] = acc.get(1);
	vd.template getProp<1>(p)[2] = acc.get(2);
}
134 
135 template<typename CellList_type, typename vector_type>
136 bool check_force(CellList_type & NN_cpu, vector_type & vd)
137 {
138  typedef typename vector_type::stype St;
139 
140  auto it6 = vd.getDomainIterator();
141 
142  bool match = true;
143 
144  while (it6.isNext())
145  {
146  auto p = it6.get();
147 
148  Point<3,St> xp = vd.getPos(p);
149 
150  // Calculate on CPU
151 
152  Point<3,St> force({0.0,0.0,0.0});
153 
154  auto NNc = NN_cpu.getNNIterator(NN_cpu.getCell(xp));
155 
156  while (NNc.isNext())
157  {
158  auto q = NNc.get();
159 
160  if (q == p.getKey()) {++NNc; continue;}
161 
162  Point<3,St> xq_2 = vd.getPos(q);
163  Point<3,St> r2 = xq_2 - xp;
164 
165  // Normalize
166 
167  if (r2.norm() > 1e-6)
168  {
169  r2 /= r2.norm();
170  force += vd.template getProp<0>(q)*r2;
171  }
172 
173  ++NNc;
174  }
175 
176  match &= fabs(vd.template getProp<1>(p)[0] - vd.template getProp<2>(p)[0]) < 0.0003;
177  match &= fabs(vd.template getProp<1>(p)[1] - vd.template getProp<2>(p)[1]) < 0.0003;
178  match &= fabs(vd.template getProp<1>(p)[2] - vd.template getProp<2>(p)[2]) < 0.0003;
179 
180  match &= fabs(vd.template getProp<1>(p)[0] - force.get(0)) < 0.0003;
181  match &= fabs(vd.template getProp<1>(p)[1] - force.get(1)) < 0.0003;
182  match &= fabs(vd.template getProp<1>(p)[2] - force.get(2)) < 0.0003;
183 
184  if (match == false)
185  {
186  std::cout << "ERROR: " << vd.template getProp<1>(p)[0] << " " << vd.template getProp<2>(p)[0] << std::endl;
187  std::cout << "ERROR: " << vd.template getProp<1>(p)[1] << " " << vd.template getProp<2>(p)[1] << std::endl;
188  std::cout << "ERROR: " << vd.template getProp<1>(p)[2] << " " << vd.template getProp<2>(p)[2] << std::endl;
189 
190  std::cout << p.getKey() << " ERROR2: " << vd.template getProp<1>(p)[0] << " " << force.get(0) << std::endl;
191  std::cout << p.getKey() << " ERROR2: " << vd.template getProp<1>(p)[1] << " " << force.get(1) << std::endl;
192  std::cout << p.getKey() << " ERROR2: " << vd.template getProp<1>(p)[2] << " " << force.get(2) << std::endl;
193 
194 
195  break;
196  }
197 
198  ++it6;
199  }
200 
201  return match;
202 }
203 
/*! \brief Check that a CPU map() + ghost_get() leaves consistent property
 *         values on domain AND ghost particles.
 *
 * Every property is initialized as a deterministic function of the
 * particle position, so after communication the same relations can be
 * re-evaluated on ghost particles.
 */
BOOST_AUTO_TEST_CASE( vector_dist_gpu_ghost_get )
{
	auto & v_cl = create_vcluster();

	// skip on very large communicators
	if (v_cl.size() > 16)
	{return;}

	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});

	// set the ghost based on the radius cut off (make just a little bit smaller than the spacing)
	Ghost<3,float> g(0.1);

	// Boundary conditions
	size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};

	// NOTE(review): the declaration of `vd` is not visible in this view of
	// the file (one line is missing here) — presumably a vector_dist_gpu
	// constructed over domain/bc/g; confirm against the repository.

	auto it = vd.getDomainIterator();

	// random positions; every property is a function of the position
	while (it.isNext())
	{
		auto p = it.get();

		vd.getPos(p)[0] = (float)rand() / (float)RAND_MAX;
		vd.getPos(p)[1] = (float)rand() / (float)RAND_MAX;
		vd.getPos(p)[2] = (float)rand() / (float)RAND_MAX;

		vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];

		vd.template getProp<1>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
		vd.template getProp<1>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
		vd.template getProp<1>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];

		vd.template getProp<2>(p)[0] = vd.getPos(p)[0] + 3.0*vd.getPos(p)[1];
		vd.template getProp<2>(p)[1] = vd.getPos(p)[0] + 3.0*vd.getPos(p)[2];
		vd.template getProp<2>(p)[2] = vd.getPos(p)[1] + 3.0*vd.getPos(p)[2];


		++it;
	}

	// Ok we redistribute the particles (CPU based)
	vd.map();

	vd.template ghost_get<0,1,2>();

	// Now we check that the ghost contains the correct information

	bool check = true;

	auto itg = vd.getDomainAndGhostIterator();

	while (itg.isNext())
	{
		auto p = itg.get();

		check &= (vd.template getProp<0>(p) == vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2]);

		check &= (vd.template getProp<1>(p)[0] == vd.getPos(p)[0] + vd.getPos(p)[1]);
		check &= (vd.template getProp<1>(p)[1] == vd.getPos(p)[0] + vd.getPos(p)[2]);
		check &= (vd.template getProp<1>(p)[2] == vd.getPos(p)[1] + vd.getPos(p)[2]);

		check &= (vd.template getProp<2>(p)[0] == vd.getPos(p)[0] + 3.0*vd.getPos(p)[1]);
		check &= (vd.template getProp<2>(p)[1] == vd.getPos(p)[0] + 3.0*vd.getPos(p)[2]);
		check &= (vd.template getProp<2>(p)[2] == vd.getPos(p)[1] + 3.0*vd.getPos(p)[2]);

		++itg;
	}

	// NOTE(review): `check` is accumulated above but never asserted — a
	// BOOST_REQUIRE_EQUAL(check,true) appears to be missing here.

	size_t tot_s = vd.size_local_with_ghost();

	v_cl.sum(tot_s);
	v_cl.execute();

	// We check that we check something
	BOOST_REQUIRE(tot_s > 1000);
}
281 
282 template<typename vector_type, typename CellList_type, typename CellList_type_cpu>
283 void check_cell_list_cpu_and_gpu(vector_type & vd, CellList_type & NN, CellList_type_cpu & NN_cpu)
284 {
285  auto it5 = vd.getDomainIteratorGPU(32);
286 
287  CUDA_LAUNCH((calculate_force<typename vector_type::stype,decltype(NN.toKernel())>),it5,vd.toKernel(),vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
288 
289  vd.template deviceToHostProp<1,2>();
290 
291  bool test = check_force(NN_cpu,vd);
292  BOOST_REQUIRE_EQUAL(test,true);
293 
294  // We reset the property 1 on device
295 
296  auto rst = vd.getDomainIterator();
297 
298  while (rst.isNext())
299  {
300  auto p = rst.get();
301 
302  vd.template getProp<1>(p)[0] = 0.0;
303  vd.template getProp<1>(p)[1] = 0.0;
304  vd.template getProp<1>(p)[2] = 0.0;
305 
306  ++rst;
307  }
308 
309  vd.template hostToDeviceProp<1>();
310 
311  // We do exactly the same test as before, but now we completely use the sorted version
312 
313  CUDA_LAUNCH((calculate_force_full_sort<typename vector_type::stype,decltype(NN.toKernel())>),it5,vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
314 
315  vd.template merge_sort<1>(NN);
316  vd.template deviceToHostProp<1>();
317 
318  test = check_force(NN_cpu,vd);
319  BOOST_REQUIRE_EQUAL(test,true);
320 }
321 
/*! \brief End-to-end GPU test: map, property initialization on device,
 *         ghost_get, and cell-list based force validation.
 *
 * \tparam CellList_type GPU cell-list implementation under test
 */
template<typename CellList_type>
void vector_dist_gpu_test_impl()
{
	auto & v_cl = create_vcluster();

	// skip on very large communicators
	if (v_cl.size() > 16)
	{return;}

	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});

	// set the ghost based on the radius cut off (make just a little bit smaller than the spacing)
	Ghost<3,float> g(0.1);

	// Boundary conditions
	size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};

	// NOTE(review): the declaration of `vd` is missing from this view of the
	// file; the BOOST_REQUIRE_EQUAL(size_l,10000) below implies a global
	// total of 10000 particles — confirm against the repository.

	srand(55067*create_vcluster().rank());

	auto it = vd.getDomainIterator();

	while (it.isNext())
	{
		auto p = it.get();

		int x = rand();
		int y = rand();
		int z = rand();

		vd.getPos(p)[0] = (float)x / (float)RAND_MAX;
		vd.getPos(p)[1] = (float)y / (float)RAND_MAX;
		vd.getPos(p)[2] = (float)z / (float)RAND_MAX;

		// NOTE(review): xp is unused — dead local, candidate for removal
		Point<3,float> xp = vd.getPos(p);

		++it;
	}

	// Ok we redistribute the particles (CPU based)
	vd.map();

	size_t size_l = vd.size_local();

	v_cl.sum(size_l);
	v_cl.execute();

	BOOST_REQUIRE_EQUAL(size_l,10000);


	auto & ct = vd.getDecomposition();

	// after map() every particle must be local to its processor
	bool noOut = true;
	size_t cnt = 0;

	auto it2 = vd.getDomainIterator();

	while (it2.isNext())
	{
		auto p = it2.get();

		noOut &= ct.isLocal(vd.getPos(p));

		cnt++;
		++it2;
	}

	BOOST_REQUIRE_EQUAL(noOut,true);
	BOOST_REQUIRE_EQUAL(cnt,vd.size_local());

	// now we offload all the properties

	auto it3 = vd.getDomainIteratorGPU();

	// offload to device
	vd.hostToDevicePos();

	CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel());

	// now we check what we initialized

	vd.deviceToHostProp<0,1>();

	auto it4 = vd.getDomainIterator();

	while (it4.isNext())
	{
		auto p = it4.get();

		// same expressions initialize_props computed on the device
		BOOST_REQUIRE_CLOSE(vd.template getProp<0>(p),vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2],0.01);

		BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[0],vd.getPos(p)[0] + vd.getPos(p)[1],0.01);
		BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[1],vd.getPos(p)[0] + vd.getPos(p)[2],0.01);
		BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[2],vd.getPos(p)[1] + vd.getPos(p)[2],0.01);

		++it4;
	}

	// here we do a ghost_get
	vd.ghost_get<0>();

	// Double ghost get to check crashes
	vd.ghost_get<0>();

	// we re-offload what we received
	vd.hostToDevicePos();
	vd.template hostToDeviceProp<0>();

	auto NN = vd.template getCellListGPU<CellList_type>(0.1);
	auto NN_cpu = vd.getCellList(0.1);
	check_cell_list_cpu_and_gpu(vd,NN,NN_cpu);

	// re-run after clearing and rebuilding the GPU cell list in place
	auto NN_up = vd.template getCellListGPU<CellList_type>(0.1);

	NN_up.clear();
	vd.updateCellList(NN_up);
	check_cell_list_cpu_and_gpu(vd,NN_up,NN_cpu);
}
440 
/*! \brief Check that make_sort / make_sort_from reorder particles without
 *         corrupting them, using cell occupancy counts as the invariant.
 *
 * \tparam CellList_type GPU cell-list implementation under test
 */
template<typename CellList_type>
void vector_dist_gpu_make_sort_test_impl()
{
	auto & v_cl = create_vcluster();

	// skip on very large communicators
	if (v_cl.size() > 16)
	{return;}

	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});

	// set the ghost based on the radius cut off (make just a little bit smaller than the spacing)
	Ghost<3,float> g(0.1);

	// Boundary conditions
	size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};

	// NOTE(review): the declaration of `vd` is missing from this view of the
	// file — confirm its construction against the repository.

	srand(55067*create_vcluster().rank());

	auto it = vd.getDomainIterator();

	while (it.isNext())
	{
		auto p = it.get();

		int x = rand();
		int y = rand();
		int z = rand();

		vd.getPos(p)[0] = (float)x / (float)RAND_MAX;
		vd.getPos(p)[1] = (float)y / (float)RAND_MAX;
		vd.getPos(p)[2] = (float)z / (float)RAND_MAX;

		++it;
	}

	vd.hostToDevicePos();

	// Ok we redistribute the particles
	vd.map(RUN_ON_DEVICE);

	auto it3 = vd.getDomainIteratorGPU();

	CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel());

	// Here we check make sort does not mess-up particles we use a Cell-List to check that
	// the two cell-list constructed are identical

	vd.deviceToHostPos();

	auto NN_cpu1 = vd.getCellList(0.1);
	auto NN = vd.template getCellListGPU<CellList_type>(0.1);
	vd.make_sort(NN);

	vd.deviceToHostPos();

	auto NN_cpu2 = vd.getCellList(0.1);

	// here we compare the two cell-lists: every cell must keep the same
	// number of elements before and after make_sort

	bool match = true;
	for (size_t i = 0 ; i < NN_cpu1.getNCells() ; i++)
	{
		match &= NN_cpu1.getNelements(i) == NN_cpu2.getNelements(i);
	}

	BOOST_REQUIRE_EQUAL(match,true);

	// In this second step we check that we can use make_sort_from to check we can sort particles even
	// when ghost are filled

	// Here we do a make sort
	NN = vd.template getCellListGPU<CellList_type>(0.1);
	vd.make_sort(NN);

	// NOTE(review): the lines declaring/filling `tmp_pos` (a snapshot of the
	// positions) are missing from this view of the file — confirm against
	// the repository.

	vd.deviceToHostPos();
	tmp_pos.template deviceToHost<0>();

	// here we do a ghost_get
	vd.ghost_get<0>(RUN_ON_DEVICE);

	// Here we do a make sort
	NN = vd.template getCellListGPU<CellList_type>(0.1);

	CUDA_CHECK()

	vd.make_sort_from(NN);

	// Check

	tmp_pos.deviceToHost<0>();
	vd.deviceToHostPos();

	match = true;
	for (size_t i = 0 ; i < vd.size_local() ; i++)
	{
		Point<3,float> p1 = vd.getPos(i);
		Point<3,float> p2 = tmp_pos.template get<0>(i);

		// They must be in the same cell
		auto c1 = NN.getCell(p1);
		// NOTE(review): c2 is computed from p1 as well, so c1 == c2 always
		// holds and the comparison is vacuous — this likely should be
		// NN.getCell(p2).
		auto c2 = NN.getCell(p1);

		match &= c1 == c2;
	}

	BOOST_REQUIRE_EQUAL(match,true);
}
553 
554 
// Run the make_sort test with the sparse GPU cell-list implementation
BOOST_AUTO_TEST_CASE(vector_dist_gpu_make_sort_sparse)
{
	vector_dist_gpu_make_sort_test_impl<CELLLIST_GPU_SPARSE<3,float>>();
}
559 
// Run the make_sort test with the dense GPU cell-list implementation
BOOST_AUTO_TEST_CASE(vector_dist_gpu_make_sort)
{
	vector_dist_gpu_make_sort_test_impl<CellList_gpu<3,float,CudaMemory,shift_only<3, float>>>();
}
564 
// Run the end-to-end GPU test with the dense GPU cell-list implementation
BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
{
	vector_dist_gpu_test_impl<CellList_gpu<3,float,CudaMemory,shift_only<3, float>>>();
}
569 
// Run the end-to-end GPU test with the sparse GPU cell-list implementation
BOOST_AUTO_TEST_CASE( vector_dist_gpu_test_sparse)
{
	vector_dist_gpu_test_impl<CELLLIST_GPU_SPARSE<3,float>>();
}
574 
/*! \brief GPU map()/ghost_get() stress test, cross-checked against a CPU
 *         vector_dist that receives a copy of the same particles.
 *
 * \tparam St scalar type (float or double)
 */
template<typename St>
void vdist_calc_gpu_test()
{
	auto & v_cl = create_vcluster();

	// skip on very large communicators
	if (v_cl.size() > 16)
	{return;}

	Box<3,St> domain({0.0,0.0,0.0},{1.0,1.0,1.0});

	// set the ghost based on the radius cut off (make just a little bit smaller than the spacing)
	Ghost<3,St> g(0.1);

	// Boundary conditions
	size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};

	// NOTE(review): several lines are missing from this view of the file
	// here, including the declaration of `vd` — confirm its construction
	// against the repository.

	srand(v_cl.rank()*10000);
	auto it = vd.getDomainIterator();

	// random positions; each property is a deterministic function of the
	// position so it can be re-verified after device-side communication
	while (it.isNext())
	{
		auto p = it.get();

		vd.getPos(p)[0] = (St)rand() / (float)RAND_MAX;
		vd.getPos(p)[1] = (St)rand() / (float)RAND_MAX;
		vd.getPos(p)[2] = (St)rand() / (float)RAND_MAX;

		vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];

		vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
		vd.template getProp<1>(p)[1] = vd.getPos(p)[1];
		vd.template getProp<1>(p)[2] = vd.getPos(p)[2];

		vd.template getProp<2>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
		vd.template getProp<2>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
		vd.template getProp<2>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];

		++it;
	}

	// move on device
	vd.hostToDevicePos();
	vd.template hostToDeviceProp<0,1,2>();

	// Ok we redistribute the particles (GPU based)
	vd.map(RUN_ON_DEVICE);

	// NOTE(review): one line is missing from this view of the file here.

	vd.deviceToHostPos();
	vd.template deviceToHostProp<0,1,2>();

	// Reset the host part

	auto it3 = vd.getDomainIterator();

	while (it3.isNext())
	{
		auto p = it3.get();

		vd.getPos(p)[0] = 1.0;
		vd.getPos(p)[1] = 1.0;
		vd.getPos(p)[2] = 1.0;

		vd.template getProp<0>(p) = 0.0;

		// NOTE(review): the six assignments below all overwrite prop<0>
		// again — they look like a copy-paste slip for
		// getProp<1>(p)[0..2] and getProp<2>(p)[0..2], which are
		// consequently never reset on the host.
		vd.template getProp<0>(p) = 0.0;
		vd.template getProp<0>(p) = 0.0;
		vd.template getProp<0>(p) = 0.0;

		vd.template getProp<0>(p) = 0.0;
		vd.template getProp<0>(p) = 0.0;
		vd.template getProp<0>(p) = 0.0;

		++it3;
	}

	// we move from Device to CPU

	vd.deviceToHostPos();
	vd.template deviceToHostProp<0,1,2>();

	// Check

	auto it2 = vd.getDomainIterator();

	bool match = true;
	while (it2.isNext())
	{
		auto p = it2.get();

		match &= vd.template getProp<0>(p) == vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];

		match &= vd.template getProp<1>(p)[0] == vd.getPos(p)[0];
		match &= vd.template getProp<1>(p)[1] == vd.getPos(p)[1];
		match &= vd.template getProp<1>(p)[2] == vd.getPos(p)[2];

		match &= vd.template getProp<2>(p)[0] == vd.getPos(p)[0] + vd.getPos(p)[1];
		match &= vd.template getProp<2>(p)[1] == vd.getPos(p)[0] + vd.getPos(p)[2];
		match &= vd.template getProp<2>(p)[2] == vd.getPos(p)[1] + vd.getPos(p)[2];

		++it2;
	}

	BOOST_REQUIRE_EQUAL(match,true);

	// count local particles

	size_t l_cnt = 0;
	size_t nl_cnt = 0;
	size_t n_out = 0;

	// Domain + ghost box
	Box<3,St> dom_ext = domain;
	dom_ext.enlarge(g);

	auto it5 = vd.getDomainIterator();
	count_local_n_local<3>(vd,it5,bc,domain,dom_ext,l_cnt,nl_cnt,n_out);

	// no particle may be outside the processor domain after map()
	BOOST_REQUIRE_EQUAL(n_out,0);
	BOOST_REQUIRE_EQUAL(l_cnt,vd.size_local());

	// we do 10 gpu steps (using a cpu vector to check that map and ghost get work as expected)

	for (size_t i = 0 ; i < 10 ; i++)
	{
		vd.map(RUN_ON_DEVICE);

		vd.deviceToHostPos();
		vd.template deviceToHostProp<0,1,2>();

		// To test we copy on a cpu distributed vector and we do a map

		vector_dist<3,St,aggregate<St,St[3],St[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);

		auto itc = vd.getDomainIterator();

		while (itc.isNext())
		{
			auto p = itc.get();

			vd_cpu.add();

			vd_cpu.getLastPos()[0] = vd.getPos(p)[0];
			vd_cpu.getLastPos()[1] = vd.getPos(p)[1];
			vd_cpu.getLastPos()[2] = vd.getPos(p)[2];

			vd_cpu.template getLastProp<0>() = vd.template getProp<0>(p);

			vd_cpu.template getLastProp<1>()[0] = vd.template getProp<1>(p)[0];
			vd_cpu.template getLastProp<1>()[1] = vd.template getProp<1>(p)[1];
			vd_cpu.template getLastProp<1>()[2] = vd.template getProp<1>(p)[2];

			vd_cpu.template getLastProp<2>()[0] = vd.template getProp<2>(p)[0];
			vd_cpu.template getLastProp<2>()[1] = vd.template getProp<2>(p)[1];
			vd_cpu.template getLastProp<2>()[2] = vd.template getProp<2>(p)[2];

			++itc;
		}

		vd_cpu.template ghost_get<0,1,2>();

		// NOTE(review): one line is missing from this view of the file here.

		vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);

		// NOTE(review): one line is missing from this view of the file here.

		vd.deviceToHostPos();
		vd.template deviceToHostProp<0,1,2>();

		match = true;

		// Particle on the gpu ghost and cpu ghost are not ordered in the same way so we have to reorder

		// helper record sorted lexicographically by position so the two
		// ghost layers can be compared element by element
		struct part
		{
			Point<3,St> xp;


			St prp0;
			St prp1[3];
			St prp2[3];

			bool operator<(const part & tmp) const
			{
				if (xp.get(0) < tmp.xp.get(0))
				{return true;}
				else if (xp.get(0) > tmp.xp.get(0))
				{return false;}

				if (xp.get(1) < tmp.xp.get(1))
				{return true;}
				else if (xp.get(1) > tmp.xp.get(1))
				{return false;}

				if (xp.get(2) < tmp.xp.get(2))
				{return true;}
				else if (xp.get(2) > tmp.xp.get(2))
				{return false;}

				return false;
			}
		};

		openfpm::vector<part> cpu_sort;
		openfpm::vector<part> gpu_sort;

		// ghost size = total with ghost minus local
		cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local());
		gpu_sort.resize(vd.size_local_with_ghost() - vd.size_local());

		BOOST_REQUIRE_EQUAL(cpu_sort.size(),gpu_sort.size());

		size_t cnt = 0;

		auto itc2 = vd.getGhostIterator();
		while (itc2.isNext())
		{
			auto p = itc2.get();

			cpu_sort.get(cnt).xp.get(0) = vd_cpu.getPos(p)[0];
			gpu_sort.get(cnt).xp.get(0) = vd.getPos(p)[0];
			cpu_sort.get(cnt).xp.get(1) = vd_cpu.getPos(p)[1];
			gpu_sort.get(cnt).xp.get(1) = vd.getPos(p)[1];
			cpu_sort.get(cnt).xp.get(2) = vd_cpu.getPos(p)[2];
			gpu_sort.get(cnt).xp.get(2) = vd.getPos(p)[2];

			cpu_sort.get(cnt).prp0 = vd_cpu.template getProp<0>(p);
			gpu_sort.get(cnt).prp0 = vd.template getProp<0>(p);

			cpu_sort.get(cnt).prp1[0] = vd_cpu.template getProp<1>(p)[0];
			gpu_sort.get(cnt).prp1[0] = vd.template getProp<1>(p)[0];
			cpu_sort.get(cnt).prp1[1] = vd_cpu.template getProp<1>(p)[1];
			gpu_sort.get(cnt).prp1[1] = vd.template getProp<1>(p)[1];
			cpu_sort.get(cnt).prp1[2] = vd_cpu.template getProp<1>(p)[2];
			gpu_sort.get(cnt).prp1[2] = vd.template getProp<1>(p)[2];

			cpu_sort.get(cnt).prp2[0] = vd_cpu.template getProp<2>(p)[0];
			gpu_sort.get(cnt).prp2[0] = vd.template getProp<2>(p)[0];
			cpu_sort.get(cnt).prp2[1] = vd_cpu.template getProp<2>(p)[1];
			gpu_sort.get(cnt).prp2[1] = vd.template getProp<2>(p)[1];
			cpu_sort.get(cnt).prp2[2] = vd_cpu.template getProp<2>(p)[2];
			gpu_sort.get(cnt).prp2[2] = vd.template getProp<2>(p)[2];

			++cnt;
			++itc2;
		}

		cpu_sort.sort();
		gpu_sort.sort();

		// after sorting the two ghost layers must agree element by element
		for (size_t i = 0 ; i < cpu_sort.size() ; i++)
		{
			match &= cpu_sort.get(i).xp.get(0) == gpu_sort.get(i).xp.get(0);
			match &= cpu_sort.get(i).xp.get(1) == gpu_sort.get(i).xp.get(1);
			match &= cpu_sort.get(i).xp.get(2) == gpu_sort.get(i).xp.get(2);

			match &= cpu_sort.get(i).prp0 == gpu_sort.get(i).prp0;
			match &= cpu_sort.get(i).prp1[0] == gpu_sort.get(i).prp1[0];
			match &= cpu_sort.get(i).prp1[1] == gpu_sort.get(i).prp1[1];
			match &= cpu_sort.get(i).prp1[2] == gpu_sort.get(i).prp1[2];

			match &= cpu_sort.get(i).prp2[0] == gpu_sort.get(i).prp2[0];
			match &= cpu_sort.get(i).prp2[1] == gpu_sort.get(i).prp2[1];
			match &= cpu_sort.get(i).prp2[2] == gpu_sort.get(i).prp2[2];
		}

		BOOST_REQUIRE_EQUAL(match,true);

		// move particles on gpu

		auto ite = vd.getDomainIteratorGPU();
		CUDA_LAUNCH_DIM3((move_parts_gpu_test<3,decltype(vd.toKernel())>),ite.wthr,ite.thr,vd.toKernel());
	}
}
858 
// Run the GPU map/ghost_get stress test for both scalar types
BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
{
	vdist_calc_gpu_test<float>();
	vdist_calc_gpu_test<double>();
}
864 
865 BOOST_AUTO_TEST_CASE(vector_dist_reduce)
866 {
867  auto & v_cl = create_vcluster();
868 
869  if (v_cl.size() > 16)
870  {return;}
871 
872  Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
873 
874  // set the ghost based on the radius cut off (make just a little bit smaller than the spacing)
875  Ghost<3,float> g(0.1);
876 
877  // Boundary conditions
878  size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
879 
880  vector_dist_gpu<3,float,aggregate<float,double,int,size_t>> vd(5000*v_cl.size(),domain,bc,g);
881 
882  auto it = vd.getDomainIterator();
883 
884  float fc = 1.0;
885  double dc = 1.0;
886  int ic = 1.0;
887  size_t sc = 1.0;
888 
889  while(it.isNext())
890  {
891  auto p = it.get();
892 
893  vd.template getProp<0>(p) = fc;
894  vd.template getProp<1>(p) = dc;
895  vd.template getProp<2>(p) = ic;
896  vd.template getProp<3>(p) = sc;
897 
898  fc += 1.0;
899  dc += 1.0;
900  ic += 1;
901  sc += 1;
902 
903  ++it;
904  }
905 
906  vd.template hostToDeviceProp<0,1,2,3>();
907 
908  float redf = reduce_local<0,_add_>(vd);
909  double redd = reduce_local<1,_add_>(vd);
910  int redi = reduce_local<2,_add_>(vd);
911  size_t reds = reduce_local<3,_add_>(vd);
912 
913  BOOST_REQUIRE_EQUAL(redf,(vd.size_local()+1.0)*(vd.size_local())/2.0);
914  BOOST_REQUIRE_EQUAL(redd,(vd.size_local()+1.0)*(vd.size_local())/2.0);
915  BOOST_REQUIRE_EQUAL(redi,(vd.size_local()+1)*(vd.size_local())/2);
916  BOOST_REQUIRE_EQUAL(reds,(vd.size_local()+1)*(vd.size_local())/2);
917 
918  float redf2 = reduce_local<0,_max_>(vd);
919  double redd2 = reduce_local<1,_max_>(vd);
920  int redi2 = reduce_local<2,_max_>(vd);
921  size_t reds2 = reduce_local<3,_max_>(vd);
922 
923  BOOST_REQUIRE_EQUAL(redf2,vd.size_local());
924  BOOST_REQUIRE_EQUAL(redd2,vd.size_local());
925  BOOST_REQUIRE_EQUAL(redi2,vd.size_local());
926  BOOST_REQUIRE_EQUAL(reds2,vd.size_local());
927 }
928 
/*! \brief Dynamic load balancing test on GPU: particles start clustered on
 *         rank 0, the decomposition is rebalanced, and the processor loads
 *         are required to stay close while particles drift through the
 *         periodic domain.
 *
 * \param k number of particles rank 0 seeds
 * \param r_cut interaction cut-off radius
 */
template<typename CellList_type>
void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut)
{
	std::random_device r;

	// seeded per-rank only (random_device contribution commented out) so
	// runs are reproducible across processes
	std::seed_seq seed2{/*r() +*/ create_vcluster().rank(),
			/*r() +*/ create_vcluster().rank(),
			/*r() +*/ create_vcluster().rank(),
			/*r() +*/ create_vcluster().rank(),
			/*r() +*/ create_vcluster().rank(),
			/*r() +*/ create_vcluster().rank(),
			/*r() +*/ create_vcluster().rank(),
			/*r() +*/ create_vcluster().rank()};
	std::mt19937 e2(seed2);

	// NOTE(review): the definition of `vector_type` is missing from this
	// view of the file — confirm against the repository.

	Vcluster<> & v_cl = create_vcluster();

	// skip on large communicators
	if (v_cl.getProcessingUnits() > 8)
		return;

	std::uniform_real_distribution<double> unif(0.0,0.3);

	Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
	Ghost<3,double> g(0.1);
	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

	vector_type vd(0,domain,bc,g,DEC_GRAN(2048));

	// Only processor 0 initially adds particles on a corner of the domain

	if (v_cl.getProcessUnitID() == 0)
	{
		for(size_t i = 0 ; i < k ; i++)
		{
			vd.add();

			vd.getLastPos()[0] = unif(e2);
			vd.getLastPos()[1] = unif(e2);
			vd.getLastPos()[2] = unif(e2);
		}
	}

	// Move to GPU
	vd.hostToDevicePos();
	vd.template hostToDeviceProp<0>();

	vd.map(RUN_ON_DEVICE);
	vd.template ghost_get<>(RUN_ON_DEVICE);

	// now move to CPU

	vd.deviceToHostPos();
	vd.template deviceToHostProp<0>();

	// Get the neighborhood of each particle

	auto VV = vd.getVerlet(r_cut);

	// store the number of neighborhood for each particle in prop 0 so it
	// can be re-verified after redistributions

	auto it = vd.getDomainIterator();

	while (it.isNext())
	{
		auto p = it.get();

		vd.template getProp<0>(p) = VV.getNNPart(p.getKey());

		++it;
	}

	// Move to GPU
	vd.template hostToDeviceProp<0>();

	ModelSquare md;
	md.factor = 10;
	vd.addComputationCosts(md);
	vd.getDecomposition().decompose();
	vd.map(RUN_ON_DEVICE);

	vd.deviceToHostPos();
	// Move info to CPU for addComputationcosts

	vd.addComputationCosts(md);

	// NOTE(review): the declaration of `loads` is missing from this view of
	// the file — confirm against the repository.
	size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
	v_cl.allGather(load,loads);
	v_cl.execute();

	// after decompose() the processor loads must stay within 7%
	for (size_t i = 0 ; i < loads.size() ; i++)
	{
		double load_f = load;
		double load_fc = loads.get(i);

		BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
	}

	BOOST_REQUIRE(vd.size_local() != 0);

	Point<3,double> v({1.0,1.0,1.0});

	for (size_t i = 0 ; i < 25 ; i++)
	{
		// move particles to CPU and move the particles by 0.09

		vd.deviceToHostPos();

		auto it = vd.getDomainIterator();

		while (it.isNext())
		{
			auto p = it.get();

			vd.getPos(p)[0] += v.get(0) * 0.09;
			vd.getPos(p)[1] += v.get(1) * 0.09;
			vd.getPos(p)[2] += v.get(2) * 0.09;

			++it;
		}

		//Back to GPU
		vd.hostToDevicePos();
		vd.map(RUN_ON_DEVICE);
		vd.template ghost_get<0>(RUN_ON_DEVICE);

		vd.deviceToHostPos();
		vd.template deviceToHostProp<0,1,2>();

		// Check calc forces
		auto NN_gpu = vd.template getCellListGPU<CellList_type>(r_cut);
		auto NN_cpu = vd.getCellList(r_cut);
		check_cell_list_cpu_and_gpu(vd,NN_gpu,NN_cpu);

		// neighborhood counts must survive the redistribution
		auto VV2 = vd.getVerlet(r_cut);

		auto it2 = vd.getDomainIterator();

		bool match = true;
		while (it2.isNext())
		{
			auto p = it2.get();

			match &= vd.template getProp<0>(p) == VV2.getNNPart(p.getKey());

			++it2;
		}

		BOOST_REQUIRE_EQUAL(match,true);

		ModelSquare md;
		vd.addComputationCosts(md);
		vd.getDecomposition().redecompose(200);
		vd.map(RUN_ON_DEVICE);

		BOOST_REQUIRE(vd.size_local() != 0);

		vd.template ghost_get<0>(RUN_ON_DEVICE);
		vd.deviceToHostPos();
		vd.template deviceToHostProp<0>();

		vd.addComputationCosts(md);

		// NOTE(review): the declaration of `loads` is missing from this view
		// of the file — confirm against the repository.
		size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
		v_cl.allGather(load,loads);
		v_cl.execute();

		// looser tolerance under ASAN (it perturbs timing/costs)
		for (size_t i = 0 ; i < loads.size() ; i++)
		{
			double load_f = load;
			double load_fc = loads.get(i);

#ifdef ENABLE_ASAN
			BOOST_REQUIRE_CLOSE(load_f,load_fc,30.0);
#else
			BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
#endif
		}
	}
}
1112 
template<typename CellList_type>
void vector_dist_dlb_on_cuda_impl_async(size_t k,double r_cut)
{
    // Dynamic-load-balancing (DLB) test driven through the asynchronous
    // ghost exchange path: Ighost_get() + ghost_wait() instead of the
    // blocking ghost_get().
    //
    // k     : number of particles created (all on processor 0 initially,
    //         deliberately unbalanced so the decomposition has work to do)
    // r_cut : interaction cut-off used for the Verlet and cell lists
    //
    // NOTE(review): this excerpt references `vector_type` and `loads`,
    // whose declarations are not visible in this view — presumably a
    // vector_dist_gpu typedef and an openfpm::vector<size_t>; confirm
    // against the full file.

    // Per-rank random seeding so every processor draws a different stream
    std::random_device r;

    std::seed_seq seed2{r() + create_vcluster().rank(),
                        r() + create_vcluster().rank(),
                        r() + create_vcluster().rank(),
                        r() + create_vcluster().rank(),
                        r() + create_vcluster().rank(),
                        r() + create_vcluster().rank(),
                        r() + create_vcluster().rank(),
                        r() + create_vcluster().rank()};
    std::mt19937 e2(seed2);

    Vcluster<> & v_cl = create_vcluster();

    // The load-balance tolerances below are tuned for small runs only
    if (v_cl.getProcessingUnits() > 8)
        return;

    std::uniform_real_distribution<double> unif(0.0,0.3);

    Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
    Ghost<3,double> g(0.1);
    size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

    vector_type vd(0,domain,bc,g,DEC_GRAN(2048));

    // Only processor 0 initialy add particles on a corner of a domain

    if (v_cl.getProcessUnitID() == 0)
    {
        for(size_t i = 0 ; i < k ; i++)
        {
            vd.add();

            vd.getLastPos()[0] = unif(e2);
            vd.getLastPos()[1] = unif(e2);
            vd.getLastPos()[2] = unif(e2);
        }
    }

    // Move to GPU, then distribute and ghost-exchange asynchronously
    vd.hostToDevicePos();
    vd.template hostToDeviceProp<0>();

    vd.map(RUN_ON_DEVICE);
    vd.template Ighost_get<>(RUN_ON_DEVICE);
    vd.template ghost_wait<>(RUN_ON_DEVICE);

    // now move to CPU

    vd.deviceToHostPos();
    vd.template deviceToHostProp<0>();

    // Get the neighborhood of each particles

    auto VV = vd.getVerlet(r_cut);

    // store the number of neighborhood for each particles in property 0;
    // used later as an invariant that map + ghost exchange must preserve

    auto it = vd.getDomainIterator();

    while (it.isNext())
    {
        auto p = it.get();

        vd.template getProp<0>(p) = VV.getNNPart(p.getKey());

        ++it;
    }

    // Move to GPU
    vd.template hostToDeviceProp<0>();

    // First decomposition based on the (unbalanced) computation cost
    ModelSquare md;
    md.factor = 10;
    vd.addComputationCosts(md);
    vd.getDecomposition().decompose();
    vd.map(RUN_ON_DEVICE);

    vd.deviceToHostPos();
    // Move info to CPU for addComputationcosts

    vd.addComputationCosts(md);

    // After decomposing, every rank must carry a comparable load share
    size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
    v_cl.allGather(load,loads);
    v_cl.execute();

    for (size_t i = 0 ; i < loads.size() ; i++)
    {
        double load_f = load;
        double load_fc = loads.get(i);

        BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
    }

    BOOST_REQUIRE(vd.size_local() != 0);

    Point<3,double> v({1.0,1.0,1.0});

    // Advect all particles 25 times, re-balancing at every step
    for (size_t i = 0 ; i < 25 ; i++)
    {
        // move particles to CPU and move the particles by 0.09 along v

        vd.deviceToHostPos();

        auto it = vd.getDomainIterator();

        while (it.isNext())
        {
            auto p = it.get();

            vd.getPos(p)[0] += v.get(0) * 0.09;
            vd.getPos(p)[1] += v.get(1) * 0.09;
            vd.getPos(p)[2] += v.get(2) * 0.09;

            ++it;
        }

        // Back to GPU; redistribute and asynchronously refresh ghosts
        vd.hostToDevicePos();
        vd.map(RUN_ON_DEVICE);
        vd.template Ighost_get<0>(RUN_ON_DEVICE);
        vd.template ghost_wait<0>(RUN_ON_DEVICE);
        vd.deviceToHostPos();
        vd.template deviceToHostProp<0,1,2>();

        // Check calc forces: the GPU cell-list must agree with the CPU one
        auto NN_gpu = vd.template getCellListGPU<CellList_type>(r_cut);
        auto NN_cpu = vd.getCellList(r_cut);
        check_cell_list_cpu_and_gpu(vd,NN_gpu,NN_cpu);

        // The stored neighbor count (prop 0) must have survived map/ghost
        auto VV2 = vd.getVerlet(r_cut);

        auto it2 = vd.getDomainIterator();

        bool match = true;
        while (it2.isNext())
        {
            auto p = it2.get();

            match &= vd.template getProp<0>(p) == VV2.getNNPart(p.getKey());

            ++it2;
        }

        BOOST_REQUIRE_EQUAL(match,true);

        // Re-balance with the incremental redecompose
        ModelSquare md;
        vd.addComputationCosts(md);
        vd.getDecomposition().redecompose(200);
        vd.map(RUN_ON_DEVICE);

        BOOST_REQUIRE(vd.size_local() != 0);

//      vd.template ghost_get<0>(RUN_ON_DEVICE);
//      vd.template ghost_get<0>(RUN_ON_DEVICE);
        vd.template Ighost_get<0>(RUN_ON_DEVICE);
        vd.template ghost_wait<0>(RUN_ON_DEVICE);
        vd.deviceToHostPos();
        vd.template deviceToHostProp<0>();

        vd.addComputationCosts(md);

        // Load must remain balanced after every redecomposition
        size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
        v_cl.allGather(load,loads);
        v_cl.execute();

        for (size_t i = 0 ; i < loads.size() ; i++)
        {
            double load_f = load;
            double load_fc = loads.get(i);

            BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
        }
    }
}
1296 
BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_async)
{
    // Asynchronous-ghost variant of the DLB test, dense GPU cell-list
    using cell_list_t = CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>;
    vector_dist_dlb_on_cuda_impl_async<cell_list_t>(50000,0.01);
}
1301 
BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda)
{
    // Baseline DLB test with the dense GPU cell-list
    using cell_list_t = CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>;
    vector_dist_dlb_on_cuda_impl<cell_list_t>(50000,0.01);
}
1306 
BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_sparse)
{
    // Same DLB test but with the sparse GPU cell-list implementation
    using cell_list_t = CELLLIST_GPU_SPARSE<3,double>;
    vector_dist_dlb_on_cuda_impl<cell_list_t>(50000,0.01);
}
1311 
BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda2)
{
    // Heavier run: only meaningful with at least 4 ranks and a real GPU build
    if (create_vcluster().size() <= 3)
        return;

#ifndef CUDA_ON_CPU
    using cell_list_t = CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>;
    vector_dist_dlb_on_cuda_impl<cell_list_t>(1000000,0.01);
#endif
}
1321 
BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda3)
{
    // Very large run: requires 8 or more ranks and a real GPU build
    if (create_vcluster().size() < 8)
        return;

#ifndef CUDA_ON_CPU
    using cell_list_t = CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>;
    vector_dist_dlb_on_cuda_impl<cell_list_t>(15000000,0.005);
#endif
}
1331 
1332 
BOOST_AUTO_TEST_CASE(vector_dist_keep_prop_on_cuda)
{
    // Check that ghost_get(... | KEEP_PROPERTIES) preserves ghost property
    // values across repeated exchanges: even iterations move particles and
    // re-balance; odd iterations increment every property by 1 and verify
    // each ghost carries base + <initial pattern>.
    //
    // FIX: property 2 initialization used "= 6000,0" (comma operator) in
    // place of "= 6000.0"; same stored value, but an obvious typo.
    //
    // NOTE(review): `vector_type` and `loads` are declared on lines not
    // visible in this excerpt (presumably a vector_dist_gpu typedef and an
    // openfpm::vector<size_t>); confirm against the full file.

    Vcluster<> & v_cl = create_vcluster();

    if (v_cl.getProcessingUnits() > 8)
        return;

    Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
    Ghost<3,double> g(0.1);
    size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

    vector_type vd(0,domain,bc,g,DEC_GRAN(2048));

    // Only processor 0 initialy add particles on a corner of a domain

    if (v_cl.getProcessUnitID() == 0)
    {
        for(size_t i = 0 ; i < 50000 ; i++)
        {
            vd.add();

            vd.getLastPos()[0] = ((double)rand())/RAND_MAX * 0.3;
            vd.getLastPos()[1] = ((double)rand())/RAND_MAX * 0.3;
            vd.getLastPos()[2] = ((double)rand())/RAND_MAX * 0.3;
        }
    }

    // Move to GPU
    vd.hostToDevicePos();
    vd.template hostToDeviceProp<0>();

    vd.map(RUN_ON_DEVICE);
    vd.template ghost_get<>(RUN_ON_DEVICE);

    // now move to CPU

    vd.deviceToHostPos();
    vd.template deviceToHostProp<0>();

    // Initialize every property with a distinctive base pattern so the odd
    // iterations below can verify the ghost values exactly

    auto it = vd.getDomainIterator();

    while (it.isNext())
    {
        auto p = it.get();

        vd.template getProp<0>(p) = 0.0;

        vd.template getProp<1>(p)[0] = 1000.0;
        vd.template getProp<1>(p)[1] = 2000.0;
        vd.template getProp<1>(p)[2] = 3000.0;

        vd.template getProp<2>(p)[0][0] = 6000.0;   // was: 6000,0 (typo)
        vd.template getProp<2>(p)[0][1] = 7000.0;
        vd.template getProp<2>(p)[0][2] = 8000.0;
        vd.template getProp<2>(p)[1][0] = 9000.0;
        vd.template getProp<2>(p)[1][1] = 10000.0;
        vd.template getProp<2>(p)[1][2] = 11000.0;
        vd.template getProp<2>(p)[2][0] = 12000.0;
        vd.template getProp<2>(p)[2][1] = 13000.0;
        vd.template getProp<2>(p)[2][2] = 14000.0;

        ++it;
    }

    // Move to GPU
    vd.template hostToDeviceProp<0,1,2>();

    // First decomposition based on the (unbalanced) computation cost
    ModelSquare md;
    md.factor = 10;
    vd.addComputationCosts(md);
    vd.getDecomposition().decompose();
    vd.map(RUN_ON_DEVICE);

    vd.deviceToHostPos();
    // Move info to CPU for addComputationcosts

    vd.addComputationCosts(md);

    // After decomposing, every rank must carry a comparable load share
    size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
    v_cl.allGather(load,loads);
    v_cl.execute();

    for (size_t i = 0 ; i < loads.size() ; i++)
    {
        double load_f = load;
        double load_fc = loads.get(i);

        BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
    }

    BOOST_REQUIRE(vd.size_local() != 0);

    Point<3,double> v({1.0,1.0,1.0});

    // Number of completed increment rounds (odd iterations)
    int base = 0;

    for (size_t i = 0 ; i < 25 ; i++)
    {
        if (i % 2 == 0)
        {
            // move particles to CPU and move the particles by 0.09 along v

            vd.deviceToHostPos();

            auto it = vd.getDomainIterator();

            while (it.isNext())
            {
                auto p = it.get();

                vd.getPos(p)[0] += v.get(0) * 0.09;
                vd.getPos(p)[1] += v.get(1) * 0.09;
                vd.getPos(p)[2] += v.get(2) * 0.09;

                ++it;
            }

            // Back to GPU
            vd.hostToDevicePos();
            vd.map(RUN_ON_DEVICE);
            vd.template ghost_get<>(RUN_ON_DEVICE);
            vd.deviceToHostPos();
            vd.template deviceToHostProp<0,1,2>();

            // Re-balance with the incremental redecompose
            ModelSquare md;
            vd.addComputationCosts(md);
            vd.getDecomposition().redecompose(200);
            vd.map(RUN_ON_DEVICE);

            BOOST_REQUIRE(vd.size_local() != 0);

            vd.template ghost_get<0>(RUN_ON_DEVICE);
            vd.deviceToHostPos();
            vd.template deviceToHostProp<0,1,2>();

            vd.addComputationCosts(md);

            // Load must remain balanced after every redecomposition
            size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
            v_cl.allGather(load,loads);
            v_cl.execute();

            for (size_t i = 0 ; i < loads.size() ; i++)
            {
                double load_f = load;
                double load_fc = loads.get(i);

                BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
            }
        }
        else
        {
            // Increment every property on the host ...
            vd.template deviceToHostProp<0,1,2>();

            auto it2 = vd.getDomainIterator();

            bool match = true;
            while (it2.isNext())
            {
                auto p = it2.get();

                vd.template getProp<0>(p) += 1;

                vd.template getProp<1>(p)[0] += 1.0;
                vd.template getProp<1>(p)[1] += 1.0;
                vd.template getProp<1>(p)[2] += 1.0;

                vd.template getProp<2>(p)[0][0] += 1.0;
                vd.template getProp<2>(p)[0][1] += 1.0;
                vd.template getProp<2>(p)[0][2] += 1.0;
                vd.template getProp<2>(p)[1][0] += 1.0;
                vd.template getProp<2>(p)[1][1] += 1.0;
                vd.template getProp<2>(p)[1][2] += 1.0;
                vd.template getProp<2>(p)[2][0] += 1.0;
                vd.template getProp<2>(p)[2][1] += 1.0;
                vd.template getProp<2>(p)[2][2] += 1.0;

                ++it2;
            }

            vd.template hostToDeviceProp<0,1,2>();

            ++base;

            // ... then exchange with KEEP_PROPERTIES and re-check
            vd.template ghost_get<0,1,2>(RUN_ON_DEVICE | KEEP_PROPERTIES);
            vd.template deviceToHostProp<0,1,2>();

            // Check that the ghost contain the correct information

            auto itg = vd.getGhostIterator();

            while (itg.isNext())
            {
                auto p = itg.get();

                match &= vd.template getProp<0>(p) == base;

                match &= vd.template getProp<1>(p)[0] == base + 1000.0;
                match &= vd.template getProp<1>(p)[1] == base + 2000.0;
                match &= vd.template getProp<1>(p)[2] == base + 3000.0;

                match &= vd.template getProp<2>(p)[0][0] == base + 6000.0;
                match &= vd.template getProp<2>(p)[0][1] == base + 7000.0;
                match &= vd.template getProp<2>(p)[0][2] == base + 8000.0;
                match &= vd.template getProp<2>(p)[1][0] == base + 9000.0;
                match &= vd.template getProp<2>(p)[1][1] == base + 10000.0;
                match &= vd.template getProp<2>(p)[1][2] == base + 11000.0;
                match &= vd.template getProp<2>(p)[2][0] == base + 12000.0;
                match &= vd.template getProp<2>(p)[2][1] == base + 13000.0;
                match &= vd.template getProp<2>(p)[2][2] == base + 14000.0;

                ++itg;
            }

            BOOST_REQUIRE_EQUAL(match,true);
        }
    }
}
1557 
1559 {
1560  __device__ static bool check(int c)
1561  {
1562  return c == 1;
1563  }
1564 };
1565 
BOOST_AUTO_TEST_CASE(vector_dist_get_index_set)
{
    // Check get_indexes_by_type: after a GPU cell-list construction forces a
    // particle reorder, collect the indices of all particles whose int
    // property 0 satisfies the type_is_one predicate, and verify that every
    // collected index really points to a 1 in the sorted property vector.
    Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
    Ghost<3,double> g(0.1);
    size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

    if (create_vcluster().size() >= 16)
    {return;}

    Vcluster<> & v_cl = create_vcluster();

    vector_dist_gpu<3,double,aggregate<int,double>> vdg(10000,domain,bc,g,DEC_GRAN(128));

    auto it = vdg.getDomainIterator();

    while (it.isNext())
    {
        auto p = it.get();

        // Random positions in [0,1]^3
        vdg.getPos(p)[0] = (double)rand() / RAND_MAX;
        vdg.getPos(p)[1] = (double)rand() / RAND_MAX;
        vdg.getPos(p)[2] = (double)rand() / RAND_MAX;

        // prop 0: roughly half the particles become 0 and half become 1
        // (a value of 2 is possible only when rand() == RAND_MAX)
        vdg.template getProp<0>(p) = (int)((double)rand() / RAND_MAX / 0.5);

        vdg.template getProp<1>(p) = (double)rand() / RAND_MAX;

        ++it;
    }

    vdg.map();

    vdg.hostToDeviceProp<0,1>();
    vdg.hostToDevicePos();

    auto cl = vdg.getCellListGPU(0.1);

    // than we get a cell-list to force reorder

    // NOTE(review): `ids` (the output index buffer) is declared on a line
    // not visible in this excerpt — confirm its type against the full file
    get_indexes_by_type<0,type_is_one>(vdg.getPropVectorSort(),ids,vdg.size_local(),v_cl.getmgpuContext());

    // test: every returned index must point to a particle with prop 0 == 1

    ids.template deviceToHost<0>();

    auto & vs = vdg.getPropVectorSort();
    vs.template deviceToHost<0>();

    bool match = true;

    for (int i = 0 ; i < ids.size() ; i++)
    {
        if (vs.template get<0>(ids.template get<0>(i)) != 1)
        {match = false;}
    }

    BOOST_REQUIRE_EQUAL(match,true);
}
1626 
BOOST_AUTO_TEST_CASE(vector_dist_compare_host_device)
{
    // Exercise compareHostAndDevicePos / compareHostAndDeviceProp<N>.
    // For the position buffer and each property, the same three-step
    // pattern is applied:
    //   1) fresh upload            -> comparison must succeed,
    //   2) corrupt the host copy   -> comparison must fail,
    //   3) re-upload, then write the same value again on the host
    //      (host and device now agree) -> comparison must succeed.
    // Arguments to the compare calls are (rtol, atol) style tolerances.
    Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
    Ghost<3,double> g(0.1);
    size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

    if (create_vcluster().size() >= 16)
    {return;}

    vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg(10000,domain,bc,g,DEC_GRAN(128));

    // Fill positions and all properties with random values
    auto it = vdg.getDomainIterator();

    while (it.isNext())
    {
        auto p = it.get();

        vdg.getPos(p)[0] = (double)rand() / RAND_MAX;
        vdg.getPos(p)[1] = (double)rand() / RAND_MAX;
        vdg.getPos(p)[2] = (double)rand() / RAND_MAX;

        vdg.template getProp<0>(p) = (double)rand() / RAND_MAX;

        vdg.template getProp<1>(p)[0] = (double)rand() / RAND_MAX;
        vdg.template getProp<1>(p)[1] = (double)rand() / RAND_MAX;
        vdg.template getProp<1>(p)[2] = (double)rand() / RAND_MAX;

        vdg.template getProp<2>(p)[0][0] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[0][1] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[0][2] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[1][0] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[1][1] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[1][2] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[2][0] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[2][1] = (double)rand() / RAND_MAX;
        vdg.template getProp<2>(p)[2][2] = (double)rand() / RAND_MAX;

        ++it;
    }

    vdg.map();

    vdg.hostToDeviceProp<0,1,2>();
    vdg.hostToDevicePos();

    // --- positions ---
    bool test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);

    // corrupt one host position: host and device now differ
    vdg.getPos(100)[0] = 0.99999999;

    test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,false);

    // upload the corrupted value, rewrite the same value on the host:
    // host and device agree again
    vdg.hostToDevicePos();
    vdg.getPos(100)[0] = 0.99999999;

    test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);

    // --- property 1 (double[3]) ---
    test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);

    vdg.getProp<1>(103)[0] = 0.99999999;

    test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,false);

    vdg.hostToDeviceProp<1>();
    vdg.getProp<1>(103)[0] = 0.99999999;

    test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);

    // --- property 0 (scalar double) ---
    test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);

    vdg.getProp<0>(105) = 0.99999999;

    test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,false);

    vdg.hostToDeviceProp<0>();
    vdg.getProp<0>(105) = 0.99999999;

    test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);

    // --- property 2 (double[3][3]) ---
    test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);

    vdg.getProp<2>(108)[1][2] = 0.99999999;

    test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,false);

    vdg.hostToDeviceProp<2>();
    vdg.getProp<2>(108)[1][2] = 0.99999999;

    test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
    BOOST_REQUIRE_EQUAL(test,true);
}
1737 
template<typename vector_dist_type>
__global__ void assign_to_ghost(vector_dist_type vds)
{
    // One thread per particle slot (domain + ghost); flat 1D index with a
    // tail guard for the partially filled last block.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid >= vds.size())
    {return;}

    // Encode the slot index into every property component so the host side
    // can verify that both domain and ghost particles were written:
    //   prop<0>        = 1000 + tid
    //   prop<1>[l]     = 2000 + 1000*l + tid
    //   prop<2>[j][l]  = 10000*(j+1) + 2000 + 1000*l + tid
    vds.template getProp<0>(tid) = 1000.0 + tid;

    for (int l = 0 ; l < 3 ; l++)
    {vds.template getProp<1>(tid)[l] = 2000.0 + 1000.0*l + tid;}

    for (int j = 0 ; j < 3 ; j++)
    {
        for (int l = 0 ; l < 3 ; l++)
        {vds.template getProp<2>(tid)[j][l] = 10000.0*(j+1) + 2000.0 + 1000.0*l + tid;}
    }
}
1762 
BOOST_AUTO_TEST_CASE(vector_dist_domain_and_ghost_test)
{
    // Verify that getDomainAndGhostIteratorGPU() covers every particle slot:
    // the assign_to_ghost kernel writes an index-derived pattern into all
    // properties over the domain+ghost range, and the host re-reads the
    // buffers and checks the very same pattern.
    Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
    Ghost<3,double> g(0.1);
    size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

    if (create_vcluster().size() >= 16)
    {return;}

    vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg(10000,domain,bc,g,DEC_GRAN(128));

    // Fill all properties on the device, then pull them back to the host
    auto ite = vdg.getDomainAndGhostIteratorGPU();

    CUDA_LAUNCH(assign_to_ghost,ite,vdg.toKernel());

    vdg.template deviceToHostProp<0,1,2>();

    auto it = vdg.getDomainAndGhostIterator();

    bool check = true;

    while (it.isNext())
    {
        auto k = it.get();
        double idx = k.getKey();

        // Same encoding the kernel wrote:
        //   prop<0>       = 1000 + idx
        //   prop<1>[l]    = 2000 + 1000*l + idx
        //   prop<2>[j][l] = 10000*(j+1) + 2000 + 1000*l + idx
        check &= (vdg.template getProp<0>(k) == 1000.0 + idx);

        for (int l = 0 ; l < 3 ; l++)
        {check &= (vdg.template getProp<1>(k)[l] == 2000.0 + 1000.0*l + idx);}

        for (int j = 0 ; j < 3 ; j++)
        {
            for (int l = 0 ; l < 3 ; l++)
            {check &= (vdg.template getProp<2>(k)[j][l] == 10000.0*(j+1) + 2000.0 + 1000.0*l + idx);}
        }

        ++it;
    }

    BOOST_REQUIRE_EQUAL(check,true);
}
1811 
template<typename vT>
__global__ void launch_overflow(vT vs, vT vs2)
{
    // Deliberately read element 57 of vs2, which the caller sizes to 0:
    // with SE_CLASS1 enabled this access must trip the bounds check.
    // The value itself is discarded — only the access matters.
    (void)vs2.template getProp<1>(57)[0];
}
1817 
BOOST_AUTO_TEST_CASE(vector_dist_overflow_se_class1)
{
    // SE_CLASS1 error-detection test: launch a kernel (launch_overflow)
    // that reads element 57 of a size-0 vector and check that the error is
    // reported/caught rather than silently corrupting memory. The banner
    // lines mark expected error output in the test log.
    Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
    Ghost<3,double> g(0.1);
    size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

    if (create_vcluster().size() >= 16)
    {return;}

    std::cout << "****** TEST ERROR MESSAGE BEGIN ********" << std::endl;

    // NOTE(review): `vdg` (first kernel argument) is declared on a line not
    // visible in this excerpt; only vdg2, the empty vector whose element 57
    // the kernel reads, is declared here — confirm against the full file.
    vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg2(0,domain,bc,g,DEC_GRAN(128));

    // Reserve capacity without resizing, so the logical size stays at 0
    vdg.setCapacity(100);

    // Minimal launch configuration: 1 block of 1 thread
    ite_gpu<1> ite;

    ite.wthr.x = 1;
    ite.wthr.y = 1;
    ite.wthr.z = 1;
    ite.thr.x = 1;
    ite.thr.y = 1;
    ite.thr.z = 1;

    try
    {
        CUDA_LAUNCH(launch_overflow,ite,vdg.toKernel(),vdg2.toKernel());
    }
    catch(...)
    {
        // SE_CLASS1 builds surface the out-of-bounds access as an exception
        std::cout << "SE_CLASS1 Catch" << std::endl;
    };

    std::cout << "****** TEST ERROR MESSAGE END ********" << std::endl;
}
1855 
1856 
1857 
BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
{
    // Test ghost_put<add_atomic_,...>(RUN_ON_DEVICE): particles sit on a
    // regular k^3 grid, so after scattering a radial kernel into the
    // neighborhood of every particle and folding the ghost contributions
    // back, properties 0 and 2 must be (nearly) identical on all particles.
    // The first pass sends both properties in one ghost_put, the second
    // pass resets and sends them in two separate ghost_put calls.
    //
    // FIX: in the second verification loop `constant3` was read from
    // property <0> instead of <2>, making the property-2 check vacuous
    // (it duplicated `constant2`). It now reads getPropRead<2>, matching
    // the first verification loop.

// This example require float atomic, until C++20 we are out of luck
#ifndef CUDIFY_USE_OPENMP

    Vcluster<> & v_cl = create_vcluster();

    // ~25^3 particles per processor, reduced to a per-axis grid size
    long int k = 25*25*25*create_vcluster().getProcessingUnits();
    k = std::pow(k, 1/3.);

    if (v_cl.getProcessingUnits() > 48)
        return;

    print_test("Testing 3D periodic ghost put GPU k=",k);
    BOOST_TEST_CHECKPOINT( "Testing 3D periodic ghost put k=" << k );

    long int big_step = k / 30;
    big_step = (big_step == 0)?1:big_step;
    long int small_step = 21;

    // 3D test, shrinking the grid each round
    for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
    {
        float r_cut = 1.3 / k;
        float r_g = 1.5 / k;     // ghost slightly larger than the cut-off

        Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});

        // Boundary conditions
        size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};

        // ghost
        Ghost<3,float> ghost(r_g);

        typedef aggregate<float,float,float> part_prop;

        // Distributed vector
        vector_dist_gpu<3,float, part_prop > vd(0,box,bc,ghost);

        // Place one particle on every node of a k x k x k grid
        auto it = vd.getGridIterator({(size_t)k,(size_t)k,(size_t)k});

        while (it.isNext())
        {
            auto key = it.get();

            vd.add();

            vd.getLastPosWrite()[0] = key.get(0)*it.getSpacing(0);
            vd.getLastPosWrite()[1] = key.get(1)*it.getSpacing(1);
            vd.getLastPosWrite()[2] = key.get(2)*it.getSpacing(2);

            // Accumulators for the scattered kernel start at zero

            vd.getLastPropWrite<0>() = 0.0;

            vd.getLastPropWrite<2>() = 0.0;

            ++it;
        }

        vd.map();

        vd.hostToDevicePos();
        vd.template hostToDeviceProp<0,2>();
        // sync the ghost
        vd.ghost_get<0,2>(RUN_ON_DEVICE);
        vd.template deviceToHostProp<0,2>();
        vd.deviceToHostPos();

        // --- pass 1: both properties in a single ghost_put ---
        {
            auto NN = vd.getCellList(r_cut);
            float a = 1.0f*k*k;

            // run trough all the particles + ghost

            auto it2 = vd.getDomainIterator();

            while (it2.isNext())
            {
                // particle p
                auto p = it2.get();
                Point<3,float> xp = vd.getPos(p);

                // Get an iterator over the neighborhood particles of p
                auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp));

                // For each neighborhood particle scatter a radial kernel
                while (Np.isNext())
                {
                    auto q = Np.get();
                    Point<3,float> xq = vd.getPosRead(q);

                    float dist = xp.distance(xq);

                    if (dist < r_cut)
                    {
                        vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
                        vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut) / 2;
                    }

                    ++Np;
                }

                ++it2;
            }

            // Fold ghost contributions back atomically on the device
            vd.hostToDevicePos();
            vd.template hostToDeviceProp<0,2>();
            vd.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE);
            vd.template deviceToHostProp<0,2>();
            vd.deviceToHostPos();

            // By grid symmetry every particle must accumulate the same value
            bool ret = true;
            auto it3 = vd.getDomainIterator();

            float constant = vd.getProp<0>(it3.get());
            float constanta = vd.getProp<2>(it3.get());
            float eps = 0.001;

            while (it3.isNext())
            {
                float constant2 = vd.getProp<0>(it3.get());
                float constant3 = vd.getProp<2>(it3.get());
                if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
                {
                    Point<3,float> p = vd.getPosRead(it3.get());

                    std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << v_cl.getProcessUnitID() << std::endl;
                    ret = false;
                    break;
                }

                ++it3;
            }
            BOOST_REQUIRE_EQUAL(ret,true);
        }

        // Reset the accumulators (domain + ghost) for the second pass
        auto itp = vd.getDomainAndGhostIterator();
        while (itp.isNext())
        {
            auto key = itp.get();

            vd.getPropWrite<0>(key) = 0.0;
            vd.getPropWrite<2>(key) = 0.0;

            ++itp;
        }

        // --- pass 2: each property in its own ghost_put ---
        {
            auto NN = vd.getCellList(r_cut);
            float a = 1.0f*k*k;

            // run trough all the particles + ghost

            auto it2 = vd.getDomainIterator();

            while (it2.isNext())
            {
                // particle p
                auto p = it2.get();
                Point<3,float> xp = vd.getPosRead(p);

                // Get an iterator over the neighborhood particles of p
                auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp));

                // For each neighborhood particle scatter a radial kernel
                while (Np.isNext())
                {
                    auto q = Np.get();
                    Point<3,float> xq = vd.getPosRead(q);

                    float dist = xp.distance(xq);

                    if (dist < r_cut)
                    {
                        vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
                        vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut);
                    }

                    ++Np;
                }

                ++it2;
            }

            vd.hostToDevicePos();
            vd.template hostToDeviceProp<0,2>();
            vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
            vd.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE);
            vd.template deviceToHostProp<0,2>();
            vd.deviceToHostPos();

            bool ret = true;
            auto it3 = vd.getDomainIterator();

            float constant = vd.getPropRead<0>(it3.get());
            float constanta = vd.getPropRead<2>(it3.get());
            float eps = 0.001;

            while (it3.isNext())
            {
                float constant2 = vd.getPropRead<0>(it3.get());
                float constant3 = vd.getPropRead<2>(it3.get());   // fixed: was getPropRead<0>
                if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
                {
                    Point<3,float> p = vd.getPosRead(it3.get());

                    std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << v_cl.getProcessUnitID() << std::endl;
                    ret = false;
                    break;
                }

                ++it3;
            }
            BOOST_REQUIRE_EQUAL(ret,true);
        }
    }

#endif
}
2079 
2080 BOOST_AUTO_TEST_SUITE_END()
auto getLastPropWrite() -> decltype(v_prp.template get< id >(0))
Get the property of the last element.
auto getLastPosWrite() -> decltype(v_pos.template get< 0 >(0))
Get the position of the last element.
Decomposition & getDecomposition()
Get the decomposition.
size_t getProcessUnitID()
Get the process unit id.
void deviceToHostProp()
Move the memory from the device to host memory.
St stype
space type
size_t size_local_with_ghost() const
return the local size of the vector
bool allGather(T &send, openfpm::vector< T, Mem, gr > &v)
Gather the data from all processors.
auto getPos(vect_dist_key_dx vec_key) -> decltype(v_pos.template get< 0 >(vec_key.getKey()))
Get the position of an element.
VerletL getVerlet(St r_cut)
for each particle get the verlet list
auto getProp(vect_dist_key_dx vec_key) -> decltype(v_prp.template get< id >(vec_key.getKey()))
Get the property of an element.
This class implement the point shape in an N-dimensional space.
Definition: Point.hpp:27
size_t size()
Stub size.
Definition: map_vector.hpp:211
vector_dist_iterator getGhostIterator() const
Get the iterator across the position of the ghost particles.
vector_dist_iterator getDomainAndGhostIterator() const
Get an iterator that traverse the particles in the domain.
void addComputationCosts(const self &vd, Model md=Model())
Add the computation cost on the decomposition coming from the particles.
std::string toString() const
Return the string with the point coordinate.
Definition: Point.hpp:398
CellL getCellList(St r_cut, bool no_se3=false)
Construct a cell list starting from the stored particles.
Definition: Ghost.hpp:39
__device__ __host__ T norm()
norm of the vector
Definition: Point.hpp:231
Implementation of VCluster class.
Definition: VCluster.hpp:58
void execute()
Execute all the requests.
const vector_dist_prop & getPropVector() const
return the property vector of all the particles
auto getLastPos() -> decltype(v_pos.template get< 0 >(0))
Get the position of the last element.
void map(size_t opt=NONE)
It move all the particles that does not belong to the local processor to the respective processor.
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
Definition: Point.hpp:172
void hostToDevicePos()
Move the memory from the device to host memory.
grid_dist_id_iterator_dec< Decomposition > getGridIterator(const size_t(&sz)[dim])
size_t size_local() const
return the local size of the vector
void enlarge(const Box< dim, T > &gh)
Enlarge the box with ghost margin.
Definition: Box.hpp:823
auto getPropWrite(vect_dist_key_dx vec_key) -> decltype(v_prp.template get< id >(vec_key.getKey()))
Get the property of an element.
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
size_t getProcessingUnits()
Get the total number of processors.
void ghost_get(size_t opt=WITH_POSITION)
It synchronize the properties and position of the ghost particles.
void add()
Add local particle.
Linear model.
Definition: LB_Model.hpp:47
mgpu::ofp_context_t & getmgpuContext(bool iw=true)
If nvidia cuda is activated return a mgpu context.
void updateCellList(CellL &cell_list, bool no_se3=false, cl_construct_opt opt=cl_construct_opt::Full)
Update a cell list using the stored particles.
__device__ __host__ T distance(const Point< dim, T > &q) const
It calculate the distance between 2 points.
Definition: Point.hpp:250
Distributed vector.
void deviceToHostPos()
Move the memory from the device to host memory.
auto getPropRead(vect_dist_key_dx vec_key) const -> decltype(v_prp.template get< id >(vec_key.getKey()))
Get the property of an element.
vect_dist_key_dx get()
Get the actual key.
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
const vector_dist_pos & getPosVector() const
return the position vector of all the particles
vector_dist_iterator getDomainIterator() const
Get an iterator that traverse the particles in the domain.
auto getPosRead(vect_dist_key_dx vec_key) const -> decltype(v_pos.template get< 0 >(vec_key.getKey()))
Get the position of an element.