OpenFPM 5.2.0
Project that contains the implementation of distributed structures
vector_dist_gpu_unit_tests.cu
1 #define BOOST_TEST_DYN_LINK
2 #include "config.h"
3 #include <boost/test/unit_test.hpp>
4 #include "VCluster/VCluster.hpp"
5 #include <Vector/vector_dist.hpp>
6 #include "Vector/tests/vector_dist_util_unit_tests.hpp"
7 
8 #define SUB_UNIT_FACTOR 1024
9 
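// Kernel that shifts every particle by 0.05 along each of the dim axes.
// GET_PARTICLE maps the CUDA thread to a particle index and returns early
// for threads beyond the end of the local particle set.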
10 template<unsigned int dim , typename vector_dist_type>
11 __global__ void move_parts_gpu_test(vector_dist_type vecDist)
12 {
13  auto p = GET_PARTICLE(vecDist);
14 
15 #pragma unroll
16  for (int i = 0 ; i < dim ; i++)
17  {
18  vecDist.getPos(p)[i] += 0.05;
19  }
20 }
21 
22 BOOST_AUTO_TEST_SUITE( vector_dist_gpu_test )
23 
24 void print_test(std::string test, size_t sz)
25 {
26  if (create_vcluster().getProcessUnitID() == 0)
27  std::cout << test << " " << sz << "\n";
28 }
29 
30 
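// Kernel that fills prop<0> with the sum of the three coordinates and
// prop<1>/prop<2> with pairwise coordinate sums; the host later checks
// these patterns to verify the device writes and the transfers back.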
31 __global__ void initialize_props(vector_dist_ker<3, float, aggregate<float, float [3], float[3]>> vecDist)
32 {
33  auto p = GET_PARTICLE(vecDist);
34 
35  vecDist.template getProp<0>(p) = vecDist.getPos(p)[0] + vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
36 
37  vecDist.template getProp<1>(p)[0] = vecDist.getPos(p)[0] + vecDist.getPos(p)[1];
38  vecDist.template getProp<1>(p)[1] = vecDist.getPos(p)[0] + vecDist.getPos(p)[2];
39  vecDist.template getProp<1>(p)[2] = vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
40 
41  vecDist.template getProp<2>(p)[0] = vecDist.getPos(p)[0] + vecDist.getPos(p)[1];
42  vecDist.template getProp<2>(p)[1] = vecDist.getPos(p)[0] + vecDist.getPos(p)[2];
43  vecDist.template getProp<2>(p)[2] = vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
44 }
45 
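// Toy pairwise "force": for each particle p, iterate the neighboring cells
// and accumulate prop<0>(q) * (xq - xp)/|xq - xp| over all neighbors q,
// writing the result into prop<1>(p). check_force() below recomputes the
// same quantity on the CPU.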
46 template<typename T,typename CellList_type>
47 __global__ void calculate_force(
48  vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vecDist,
49  CellList_type cellList,
50  int rank)
51 {
52  size_t p = GET_PARTICLE(vecDist);
53 
54  Point<3,T> xp = vecDist.getPos(p);
55 
56  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
57 
58  Point<3,T> force({0.0,0.0,0.0});
59 
60  while (it.isNext())
61  {
62  auto q = it.get();
63 
64  if (q == p) {++it; continue;}
65 
66  Point<3,T> xq = vecDist.getPos(q);
67  Point<3,T> r = xq - xp;
68 
69  if (r.norm() > 1e-6)
70  {
71  r /= r.norm();
72  force += vecDist.template getProp<0>(q)*r;
73  }
74 
75  ++it;
76  }
77 
78  vecDist.template getProp<1>(p)[0] = force.get(0);
79  vecDist.template getProp<1>(p)[1] = force.get(1);
80  vecDist.template getProp<1>(p)[2] = force.get(2);
81 }
82 
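// Same toy force, but for a cell-list built with CL_GPU_REORDER:
// GET_PARTICLE_SORT yields the particle index in the cell-list's reordered
// layout (hence get_sort() for neighbors), so the caller must invoke
// restoreOrder() after the launch.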
83 template<typename T,typename CellList_type>
84 __global__ void calculate_force_sort(
85  vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vecDistSort,
86  CellList_type cellList,
87  int rank)
88 {
89  size_t p; GET_PARTICLE_SORT(p, cellList);
90 
91  Point<3,T> xp = vecDistSort.getPos(p);
92  Point<3,T> force({0.0,0.0,0.0});
93 
94  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
95 
96  while (it.isNext())
97  {
98  auto q = it.get_sort();
99 
100  if (q == p) {++it; continue;}
101 
102  Point<3,T> xq = vecDistSort.getPos(q);
103  Point<3,T> r = xq - xp;
104 
105  // Normalize
106 
107  if (r.norm() > 1e-6)
108  {
109  r /= r.norm();
110  force += vecDistSort.template getProp<0>(q)*r;
111  }
112 
113  ++it;
114  }
115 
116  vecDistSort.template getProp<1>(p)[0] = force.get(0);
117  vecDistSort.template getProp<1>(p)[1] = force.get(1);
118  vecDistSort.template getProp<1>(p)[2] = force.get(2);
119 }
120 
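// CPU reference check: recompute the toy force with a host cell-list and
// compare it against the GPU result stored in prop<1>, within a 3e-4
// absolute tolerance per component.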
121 template<typename CellList_type, typename vector_type>
122 bool check_force(CellList_type & cellList, vector_type & vecDist)
123 {
124  typedef typename vector_type::stype St;
125 
126  auto it6 = vecDist.getDomainIterator();
127 
128  bool match = true;
129 
130  while (it6.isNext())
131  {
132  auto p = it6.get();
133 
134  Point<3,St> xp = vecDist.getPos(p);
135 
136  // Calculate on CPU
137 
138  Point<3,St> force({0.0,0.0,0.0});
139 
140  auto NNc = cellList.getNNIteratorBox(cellList.getCell(xp));
141 
142  while (NNc.isNext())
143  {
144  auto q = NNc.get();
145 
146  if (q == p.getKey()) {++NNc; continue;}
147 
148  Point<3,St> xq_2 = vecDist.getPos(q);
149  Point<3,St> r2 = xq_2 - xp;
150 
151  // Normalize
152 
153  if (r2.norm() > 1e-6)
154  {
155  r2 /= r2.norm();
156  force += vecDist.template getProp<0>(q)*r2;
157  }
158 
159  ++NNc;
160  }
161 
162  match &= fabs(vecDist.template getProp<1>(p)[0] - force.get(0)) < 0.0003;
163  match &= fabs(vecDist.template getProp<1>(p)[1] - force.get(1)) < 0.0003;
164  match &= fabs(vecDist.template getProp<1>(p)[2] - force.get(2)) < 0.0003;
165 
166  if (match == false)
167  {
168  std::cout << p.getKey() << " ERROR: " << vecDist.template getProp<1>(p)[0] << " " << force.get(0) << std::endl;
169  // std::cout << p.getKey() << " ERROR: " << vecDist.template getProp<1>(p)[1] << " " << force.get(1) << std::endl;
170  // std::cout << p.getKey() << " ERROR: " << vecDist.template getProp<1>(p)[2] << " " << force.get(2) << std::endl;
171 
172  // break;
173  }
174 
175  ++it6;
176  }
177 
178  return match;
179 }
180 
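// Verifies that a CPU map() followed by ghost_get<0,1,2>() leaves every
// domain and ghost particle with properties consistent with its position.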
181 BOOST_AUTO_TEST_CASE( vector_dist_gpu_ghost_get )
182 {
183  auto & vCluster = create_vcluster();
184 
185  if (vCluster.size() > 16)
186  {return;}
187 
188  Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
189 
190  // set the ghost based on the cut-off radius (made just a little smaller than the spacing)
191  Ghost<3,float> g(0.1);
192 
193  // Boundary conditions
194  size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
195 
196  vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vecDist(1000,domain,bc,g); // particle count assumed
197 
198  auto it = vecDist.getDomainIterator();
199 
200  while (it.isNext())
201  {
202  auto p = it.get();
203 
204  vecDist.getPos(p)[0] = (float)rand() / (float)RAND_MAX;
205  vecDist.getPos(p)[1] = (float)rand() / (float)RAND_MAX;
206  vecDist.getPos(p)[2] = (float)rand() / (float)RAND_MAX;
207 
208  vecDist.template getProp<0>(p) = vecDist.getPos(p)[0] + vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
209 
210  vecDist.template getProp<1>(p)[0] = vecDist.getPos(p)[0] + vecDist.getPos(p)[1];
211  vecDist.template getProp<1>(p)[1] = vecDist.getPos(p)[0] + vecDist.getPos(p)[2];
212  vecDist.template getProp<1>(p)[2] = vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
213 
214  vecDist.template getProp<2>(p)[0] = vecDist.getPos(p)[0] + 3.0*vecDist.getPos(p)[1];
215  vecDist.template getProp<2>(p)[1] = vecDist.getPos(p)[0] + 3.0*vecDist.getPos(p)[2];
216  vecDist.template getProp<2>(p)[2] = vecDist.getPos(p)[1] + 3.0*vecDist.getPos(p)[2];
217 
218 
219  ++it;
220  }
221 
222  // Ok we redistribute the particles (CPU based)
223  vecDist.map();
224 
225  vecDist.template ghost_get<0,1,2>();
226 
227  // Now we check that the ghosts contain the correct information
228 
229  bool check = true;
230 
231  auto itg = vecDist.getDomainAndGhostIterator();
232 
233  while (itg.isNext())
234  {
235  auto p = itg.get();
236 
237  check &= (vecDist.template getProp<0>(p) == vecDist.getPos(p)[0] + vecDist.getPos(p)[1] + vecDist.getPos(p)[2]);
238 
239  check &= (vecDist.template getProp<1>(p)[0] == vecDist.getPos(p)[0] + vecDist.getPos(p)[1]);
240  check &= (vecDist.template getProp<1>(p)[1] == vecDist.getPos(p)[0] + vecDist.getPos(p)[2]);
241  check &= (vecDist.template getProp<1>(p)[2] == vecDist.getPos(p)[1] + vecDist.getPos(p)[2]);
242 
243  check &= (vecDist.template getProp<2>(p)[0] == vecDist.getPos(p)[0] + 3.0*vecDist.getPos(p)[1]);
244  check &= (vecDist.template getProp<2>(p)[1] == vecDist.getPos(p)[0] + 3.0*vecDist.getPos(p)[2]);
245  check &= (vecDist.template getProp<2>(p)[2] == vecDist.getPos(p)[1] + 3.0*vecDist.getPos(p)[2]);
246 
247  ++itg;
248  }
249 
250  size_t tot_s = vecDist.size_local_with_ghost();
251 
252  vCluster.sum(tot_s);
253  vCluster.execute();
254 
255  // Sanity check that the test actually checked a meaningful number of particles
256  BOOST_REQUIRE(tot_s > 1000);
257 }
258 
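// Launches the unsorted force kernel on the GPU cell-list and validates
// the result against a CPU cell-list via check_force().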
259 template<typename vector_type, typename CellList_type, typename CellList_type_cpu>
260 void compareCellListCpuGpu(vector_type & vecDist, CellList_type & NN, CellList_type_cpu & cellList)
261 {
262  const auto it5 = vecDist.getDomainIteratorGPU(32);
263 
264  CUDA_LAUNCH((calculate_force<typename vector_type::stype,decltype(NN.toKernel())>),
265  it5,
266  vecDist.toKernel(),
267  NN.toKernel(),
268  (int)create_vcluster().rank()
269  );
270 
271  vecDist.template deviceToHostProp<1>();
272 
273  bool test = check_force(cellList,vecDist);
274  BOOST_REQUIRE_EQUAL(test,true);
275 }
276 
277 template<typename vector_type, typename CellList_type, typename CellList_type_cpu>
278 void compareCellListCpuGpuSorted(vector_type & vecDistSort, CellList_type & cellListGPU, CellList_type_cpu & cellList)
279 {
280  const auto it5 = vecDistSort.getDomainIteratorGPU(32);
281 
282  CUDA_LAUNCH((calculate_force_sort<typename vector_type::stype,decltype(cellListGPU.toKernel())>),
283  it5,
284  vecDistSort.toKernel(),
285  cellListGPU.toKernel(),
286  (int)create_vcluster().rank()
287  );
288 
289  vecDistSort.template restoreOrder<1>(cellListGPU);
290 
291  vecDistSort.template deviceToHostProp<1>();
292  vecDistSort.deviceToHostPos();
293 
294  bool test = check_force(cellList,vecDistSort);
295  BOOST_REQUIRE_EQUAL(test,true);
296 }
297 
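// Core GPU test: random particles, a CPU map(), host/device transfers,
// the initialize_props kernel, a double ghost_get (to check for crashes),
// and finally a CPU-vs-GPU cell-list force comparison (sorted or unsorted).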
298 template<typename CellList_type, bool sorted>
299 void vector_dist_gpu_test_impl()
300 {
301  auto & vCluster = create_vcluster();
302 
303  if (vCluster.size() > 16)
304  {return;}
305 
306  Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
307 
308  // set the ghost based on the cut-off radius (made just a little smaller than the spacing)
309  Ghost<3,float> g(0.1);
310 
311  // Boundary conditions
312  size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
313 
314  vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vecDist(10000,domain,bc,g);
315 
316  srand(55067*create_vcluster().rank());
317 
318  auto it = vecDist.getDomainIterator();
319 
320  while (it.isNext())
321  {
322  auto p = it.get();
323 
324  int x = rand();
325  int y = rand();
326  int z = rand();
327 
328  vecDist.getPos(p)[0] = (float)x / (float)RAND_MAX;
329  vecDist.getPos(p)[1] = (float)y / (float)RAND_MAX;
330  vecDist.getPos(p)[2] = (float)z / (float)RAND_MAX;
331 
332  Point<3,float> xp = vecDist.getPos(p);
333 
334  ++it;
335  }
336 
337  // Ok we redistribute the particles (CPU based)
338  vecDist.map();
339 
340  size_t size_l = vecDist.size_local();
341 
342  vCluster.sum(size_l);
343  vCluster.execute();
344 
345  BOOST_REQUIRE_EQUAL(size_l,10000);
346 
347  auto & dec = vecDist.getDecomposition();
348 
349  bool noOut = true;
350  size_t cnt = 0;
351 
352  auto it2 = vecDist.getDomainIterator();
353 
354  while (it2.isNext())
355  {
356  auto p = it2.get();
357 
358  noOut &= dec.isLocal(vecDist.getPos(p));
359 
360  cnt++;
361  ++it2;
362  }
363 
364  BOOST_REQUIRE_EQUAL(noOut,true);
365  BOOST_REQUIRE_EQUAL(cnt,vecDist.size_local());
366 
367  // now we offload all the properties
368 
369  const auto it3 = vecDist.getDomainIteratorGPU();
370 
371  // offload to device
372  vecDist.hostToDevicePos();
373 
374  CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vecDist.toKernel());
375 
376  // now we check what we initialized
377 
378  vecDist.deviceToHostProp<0,1>();
379 
380  auto it4 = vecDist.getDomainIterator();
381 
382  while (it4.isNext())
383  {
384  auto p = it4.get();
385 
386  BOOST_REQUIRE_CLOSE(vecDist.template getProp<0>(p),vecDist.getPos(p)[0] + vecDist.getPos(p)[1] + vecDist.getPos(p)[2],0.01);
387 
388  BOOST_REQUIRE_CLOSE(vecDist.template getProp<1>(p)[0],vecDist.getPos(p)[0] + vecDist.getPos(p)[1],0.01);
389  BOOST_REQUIRE_CLOSE(vecDist.template getProp<1>(p)[1],vecDist.getPos(p)[0] + vecDist.getPos(p)[2],0.01);
390  BOOST_REQUIRE_CLOSE(vecDist.template getProp<1>(p)[2],vecDist.getPos(p)[1] + vecDist.getPos(p)[2],0.01);
391 
392  ++it4;
393  }
394 
395  // here we do a ghost_get
396  vecDist.ghost_get<0>();
397 
398  // Double ghost get to check crashes
399  vecDist.ghost_get<0>();
400 
401  // we re-offload what we received
402  vecDist.hostToDevicePos();
403  vecDist.template hostToDeviceProp<0>();
404 
405  size_t opt = (sorted)? CL_GPU_REORDER : 0;
406  auto cellList = vecDist.getCellList(0.1);
407  auto cellListGPU = vecDist.template getCellListGPU<CellList_type>(0.1, opt | CL_NON_SYMMETRIC);
408 
409  if (sorted)
410  {
411  vecDist.hostToDevicePos();
412  vecDist.template hostToDeviceProp<0>();
413 
414  vecDist.template updateCellListGPU<0>(cellListGPU);
415  compareCellListCpuGpuSorted(vecDist,cellListGPU,cellList);
416  }
417 
418  else
419  {
420  vecDist.updateCellListGPU(cellListGPU);
421  compareCellListCpuGpu(vecDist,cellListGPU,cellList);
422  }
423 }
424 
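// Exercises CL_GPU_REORDER: the particle set is repeatedly reordered while
// building the GPU cell-list and then restored; positions, properties and
// cell assignments must come back unchanged.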
425 template<typename CellList_type>
426 void vector_dist_gpu_make_sort_test_impl()
427 {
428  auto & vCluster = create_vcluster();
429 
430  if (vCluster.size() > 16)
431  {return;}
432 
433  Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
434 
435  // set the ghost based on the cut-off radius (made just a little smaller than the spacing)
436  Ghost<3,float> g(0.1);
437 
438  // Boundary conditions
439  size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
440 
441  vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vecDist(10000,domain,bc,g); // particle count assumed
442 
443  srand(55067*create_vcluster().rank());
444 
445  auto it = vecDist.getDomainIterator();
446 
447  while (it.isNext())
448  {
449  auto p = it.get();
450 
451  int x = rand();
452  int y = rand();
453  int z = rand();
454 
455  vecDist.getPos(p)[0] = (float)x / (float)RAND_MAX;
456  vecDist.getPos(p)[1] = (float)y / (float)RAND_MAX;
457  vecDist.getPos(p)[2] = (float)z / (float)RAND_MAX;
458 
459  ++it;
460  }
461 
462  vecDist.hostToDevicePos();
463 
464  // redistribute the particles
465  vecDist.map(RUN_ON_DEVICE);
466 
467  auto it3 = vecDist.getDomainIteratorGPU();
468 
469  CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vecDist.toKernel());
470 
471  vecDist.template deviceToHostProp<0,1,2>();
472 
473  // Here we check that CL_GPU_REORDER correctly restores the particle order;
474  // we use a cell-list to check that the two cell-lists constructed are identical
475 
476  vecDist.deviceToHostPos();
477 
478  auto tmpPos = vecDist.getPosVector(); // snapshot of the positions (restored line; exact form assumed)
479  auto tmpPrp = vecDist.getPropVector(); // snapshot of the properties (restored line; exact form assumed)
480 
481  auto NN_cpu1 = vecDist.getCellList(0.1);
482  auto NN = vecDist.template getCellListGPU<CellList_type>(0.1, CL_NON_SYMMETRIC | CL_GPU_REORDER);
483 
484  vecDist.hostToDevicePos();
485  vecDist.template hostToDeviceProp<0,1,2>();
486 
487  vecDist.template updateCellListGPU<0,1,2>(NN);
488  vecDist.template restoreOrder<0,1,2>(NN);
489 
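// Repeat with CL_GPU_SKIP_CONSTRUCT_ON_STATIC_DOMAIN set: as the flag name
// suggests, cell-list construction is skipped on a static domain, but the
// reorder/restore cycle must still round-trip the data identically.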
490  NN.setOpt(NN.getOpt() | CL_GPU_SKIP_CONSTRUCT_ON_STATIC_DOMAIN);
491  vecDist.template updateCellListGPU<0,1,2>(NN);
492  vecDist.template restoreOrder<0,1,2>(NN);
493  NN.setOpt(NN.getOpt() ^ CL_GPU_SKIP_CONSTRUCT_ON_STATIC_DOMAIN);
494 
495  vecDist.deviceToHostPos();
496  vecDist.template deviceToHostProp<0,1,2>();
497 
498  vecDist.hostToDevicePos();
499  vecDist.template hostToDeviceProp<0>();
500 
501  vecDist.template updateCellListGPU<0>(NN);
502  vecDist.template restoreOrder<0>(NN);
503 
504  vecDist.deviceToHostPos();
505  vecDist.template deviceToHostProp<0>();
506 
507  vecDist.hostToDevicePos();
508  vecDist.template hostToDeviceProp<1>();
509 
510  vecDist.template updateCellListGPU<1>(NN);
511  vecDist.template restoreOrder<1>(NN);
512 
513  vecDist.deviceToHostPos();
514  vecDist.template deviceToHostProp<1>();
515 
516  vecDist.hostToDevicePos();
517  vecDist.template hostToDeviceProp<2>();
518 
519  vecDist.template updateCellListGPU<2>(NN);
520  vecDist.template restoreOrder<2>(NN);
521 
522  vecDist.deviceToHostPos();
523  vecDist.template deviceToHostProp<2>();
524 
525  auto NN_cpu2 = vecDist.getCellList(0.1);
526 
527  // compare two cell-lists
528  bool match = true;
529  for (size_t i = 0 ; i < NN_cpu1.getNCells() ; i++)
530  {
531  match &= NN_cpu1.getNelements(i) == NN_cpu2.getNelements(i);
532  }
533  BOOST_REQUIRE_EQUAL(match,true);
534 
535  match = true;
536  for (size_t i = 0 ; i < vecDist.size_local() ; i++)
537  {
538  Point<3,float> p1 = vecDist.getPos(i);
539  Point<3,float> p2 = tmpPos.template get<0>(i);
540 
541  // They must be in the same cell
542  auto c1 = NN.getCell(p1);
543  auto c2 = NN.getCell(p2);
544 
545  match &= c1 == c2;
546  }
547  BOOST_REQUIRE_EQUAL(match,true);
548 
549  match = true;
550  for (size_t i = 0 ; i < vecDist.size_local() ; i++)
551  {
552  for (int j = 0; j < 3; ++j)
553  match &= fabs(vecDist.getPos(i)[j] - tmpPos.template get<0>(i)[j]) < 0.0003;
554 
555  match &= fabs(vecDist.getProp<0>(i) - tmpPrp.template get<0>(i)) < 0.0003;
556 
557  for (int j = 0; j < 3; ++j)
558  match &= fabs(vecDist.getProp<1>(i)[j] - tmpPrp.template get<1>(i)[j]) < 0.0003;
559 
560  for (int j = 0; j < 3; ++j)
561  match &= fabs(vecDist.getProp<2>(i)[j] - tmpPrp.template get<2>(i)[j]) < 0.0003;
562  }
563 
564  BOOST_REQUIRE_EQUAL(match,true);
565 
566  CUDA_CHECK();
567 }
568 
569 
570 BOOST_AUTO_TEST_CASE(vector_dist_gpu_make_sort_sparse)
571 {
572  vector_dist_gpu_make_sort_test_impl<CELLLIST_GPU_SPARSE<3,float>>();
573 }
574 
575 BOOST_AUTO_TEST_CASE(vector_dist_gpu_make_sort)
576 {
577  vector_dist_gpu_make_sort_test_impl<CellList_gpu<3,float,CudaMemory,shift_only<3, float>>>();
578 }
579 
580 BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
581 {
582  vector_dist_gpu_test_impl<CellList_gpu<3,float,CudaMemory,shift_only<3, float>>, false>();
583 }
584 
585 BOOST_AUTO_TEST_CASE( vector_dist_gpu_test_sorted)
586 {
587  vector_dist_gpu_test_impl<CellList_gpu<3,float,CudaMemory,shift_only<3, float>>, true>();
588 }
589 
590 BOOST_AUTO_TEST_CASE( vector_dist_gpu_test_sparse)
591 {
592  vector_dist_gpu_test_impl<CELLLIST_GPU_SPARSE<3,float>, false>();
593 }
594 
595 BOOST_AUTO_TEST_CASE( vector_dist_gpu_test_sparse_sorted)
596 {
597  vector_dist_gpu_test_impl<CELLLIST_GPU_SPARSE<3,float>, true>();
598 }
599 
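// Device-side map()/ghost_get() test: particles are redistributed with
// RUN_ON_DEVICE and the resulting ghost layer is compared, after a
// canonical sort, against the ghost produced by an equivalent CPU vector.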
600 template<typename St>
601 void vdist_calc_gpu_test()
602 {
603  auto & vCluster = create_vcluster();
604 
605  if (vCluster.size() > 16)
606  {return;}
607 
608  Box<3,St> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
609 
610  // set the ghost based on the cut-off radius (made just a little smaller than the spacing)
611  Ghost<3,St> g(0.1);
612 
613  // Boundary conditions
614  size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
615 
617 
618  vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vecDist(1000,domain,bc,g);
619 
621 
623 
624  srand(vCluster.rank()*10000);
625  auto it = vecDist.getDomainIterator();
626 
627  while (it.isNext())
628  {
629  auto p = it.get();
630 
631  vecDist.getPos(p)[0] = (St)rand() / (float)RAND_MAX;
632  vecDist.getPos(p)[1] = (St)rand() / (float)RAND_MAX;
633  vecDist.getPos(p)[2] = (St)rand() / (float)RAND_MAX;
634 
635  vecDist.template getProp<0>(p) = vecDist.getPos(p)[0] + vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
636 
637  vecDist.template getProp<1>(p)[0] = vecDist.getPos(p)[0];
638  vecDist.template getProp<1>(p)[1] = vecDist.getPos(p)[1];
639  vecDist.template getProp<1>(p)[2] = vecDist.getPos(p)[2];
640 
641  vecDist.template getProp<2>(p)[0] = vecDist.getPos(p)[0] + vecDist.getPos(p)[1];
642  vecDist.template getProp<2>(p)[1] = vecDist.getPos(p)[0] + vecDist.getPos(p)[2];
643  vecDist.template getProp<2>(p)[2] = vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
644 
645  ++it;
646  }
647 
648  // move on device
649  vecDist.hostToDevicePos();
650  vecDist.template hostToDeviceProp<0,1,2>();
651 
652  // Ok we redistribute the particles (GPU based)
653  vecDist.map(RUN_ON_DEVICE);
654 
656 
657  vecDist.deviceToHostPos();
658  vecDist.template deviceToHostProp<0,1,2>();
659 
660  // Reset the host part
661 
662  auto it3 = vecDist.getDomainIterator();
663 
664  while (it3.isNext())
665  {
666  auto p = it3.get();
667 
668  vecDist.getPos(p)[0] = 1.0;
669  vecDist.getPos(p)[1] = 1.0;
670  vecDist.getPos(p)[2] = 1.0;
671 
672  vecDist.template getProp<0>(p) = 0.0;
673 
674  vecDist.template getProp<1>(p)[0] = 0.0;
675  vecDist.template getProp<1>(p)[1] = 0.0;
676  vecDist.template getProp<1>(p)[2] = 0.0;
677 
678  vecDist.template getProp<2>(p)[0] = 0.0;
679  vecDist.template getProp<2>(p)[1] = 0.0;
680  vecDist.template getProp<2>(p)[2] = 0.0;
681 
682  ++it3;
683  }
684 
685  // we move from Device to CPU
686 
687  vecDist.deviceToHostPos();
688  vecDist.template deviceToHostProp<0,1,2>();
689 
690  // Check
691 
692  auto it2 = vecDist.getDomainIterator();
693 
694  bool match = true;
695  while (it2.isNext())
696  {
697  auto p = it2.get();
698 
699  match &= vecDist.template getProp<0>(p) == vecDist.getPos(p)[0] + vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
700 
701  match &= vecDist.template getProp<1>(p)[0] == vecDist.getPos(p)[0];
702  match &= vecDist.template getProp<1>(p)[1] == vecDist.getPos(p)[1];
703  match &= vecDist.template getProp<1>(p)[2] == vecDist.getPos(p)[2];
704 
705  match &= vecDist.template getProp<2>(p)[0] == vecDist.getPos(p)[0] + vecDist.getPos(p)[1];
706  match &= vecDist.template getProp<2>(p)[1] == vecDist.getPos(p)[0] + vecDist.getPos(p)[2];
707  match &= vecDist.template getProp<2>(p)[2] == vecDist.getPos(p)[1] + vecDist.getPos(p)[2];
708 
709  ++it2;
710  }
711 
712  BOOST_REQUIRE_EQUAL(match,true);
713 
714  // count local particles
715 
716  size_t l_cnt = 0;
717  size_t nl_cnt = 0;
718  size_t n_out = 0;
719 
720  // Domain + ghost box
721  Box<3,St> dom_ext = domain;
722  dom_ext.enlarge(g);
723 
724  auto it5 = vecDist.getDomainIterator();
725  count_local_n_local<3>(vecDist,it5,bc,domain,dom_ext,l_cnt,nl_cnt,n_out);
726 
727  BOOST_REQUIRE_EQUAL(n_out,0);
728  BOOST_REQUIRE_EQUAL(l_cnt,vecDist.size_local());
729 
730  // we do 10 gpu steps (using a cpu vector to check that map and ghost_get work as expected)
731 
732  for (size_t i = 0 ; i < 10 ; i++)
733  {
734  vecDist.map(RUN_ON_DEVICE);
735 
736  vecDist.deviceToHostPos();
737  vecDist.template deviceToHostProp<0,1,2>();
738 
739  // To test, we copy into a CPU distributed vector and do a map
740 
741  vector_dist<3,St,aggregate<St,St[3],St[3]>> vd_cpu(vecDist.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);
742 
743  auto itc = vecDist.getDomainIterator();
744 
745  while (itc.isNext())
746  {
747  auto p = itc.get();
748 
749  vd_cpu.add();
750 
751  vd_cpu.getLastPos()[0] = vecDist.getPos(p)[0];
752  vd_cpu.getLastPos()[1] = vecDist.getPos(p)[1];
753  vd_cpu.getLastPos()[2] = vecDist.getPos(p)[2];
754 
755  vd_cpu.template getLastProp<0>() = vecDist.template getProp<0>(p);
756 
757  vd_cpu.template getLastProp<1>()[0] = vecDist.template getProp<1>(p)[0];
758  vd_cpu.template getLastProp<1>()[1] = vecDist.template getProp<1>(p)[1];
759  vd_cpu.template getLastProp<1>()[2] = vecDist.template getProp<1>(p)[2];
760 
761  vd_cpu.template getLastProp<2>()[0] = vecDist.template getProp<2>(p)[0];
762  vd_cpu.template getLastProp<2>()[1] = vecDist.template getProp<2>(p)[1];
763  vd_cpu.template getLastProp<2>()[2] = vecDist.template getProp<2>(p)[2];
764 
765  ++itc;
766  }
767 
768  vd_cpu.template ghost_get<0,1,2>();
769 
771 
772  vecDist.template ghost_get<0,1,2>(RUN_ON_DEVICE);
773 
775 
776  vecDist.deviceToHostPos();
777  vecDist.template deviceToHostProp<0,1,2>();
778 
779  match = true;
780 
781  // Particles in the GPU ghost and the CPU ghost are not ordered in the same way, so we have to reorder
782 
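// Lexicographic comparator on the position, used to sort the CPU and GPU
// ghost particles into a canonical order before comparing them.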
783  struct part
784  {
785  Point<3,St> xp;
786 
787 
788  St prp0;
789  St prp1[3];
790  St prp2[3];
791 
792  bool operator<(const part & tmp) const
793  {
794  if (xp.get(0) < tmp.xp.get(0))
795  {return true;}
796  else if (xp.get(0) > tmp.xp.get(0))
797  {return false;}
798 
799  if (xp.get(1) < tmp.xp.get(1))
800  {return true;}
801  else if (xp.get(1) > tmp.xp.get(1))
802  {return false;}
803 
804  if (xp.get(2) < tmp.xp.get(2))
805  {return true;}
806  else if (xp.get(2) > tmp.xp.get(2))
807  {return false;}
808 
809  return false;
810  }
811  };
812 
813  openfpm::vector<part> cpu_sort;
814  openfpm::vector<part> gpu_sort;
815 
816  cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local());
817  gpu_sort.resize(vecDist.size_local_with_ghost() - vecDist.size_local());
818 
819  BOOST_REQUIRE_EQUAL(cpu_sort.size(),gpu_sort.size());
820 
821  size_t cnt = 0;
822 
823  auto itc2 = vecDist.getGhostIterator();
824  while (itc2.isNext())
825  {
826  auto p = itc2.get();
827 
828  cpu_sort.get(cnt).xp.get(0) = vd_cpu.getPos(p)[0];
829  gpu_sort.get(cnt).xp.get(0) = vecDist.getPos(p)[0];
830  cpu_sort.get(cnt).xp.get(1) = vd_cpu.getPos(p)[1];
831  gpu_sort.get(cnt).xp.get(1) = vecDist.getPos(p)[1];
832  cpu_sort.get(cnt).xp.get(2) = vd_cpu.getPos(p)[2];
833  gpu_sort.get(cnt).xp.get(2) = vecDist.getPos(p)[2];
834 
835  cpu_sort.get(cnt).prp0 = vd_cpu.template getProp<0>(p);
836  gpu_sort.get(cnt).prp0 = vecDist.template getProp<0>(p);
837 
838  cpu_sort.get(cnt).prp1[0] = vd_cpu.template getProp<1>(p)[0];
839  gpu_sort.get(cnt).prp1[0] = vecDist.template getProp<1>(p)[0];
840  cpu_sort.get(cnt).prp1[1] = vd_cpu.template getProp<1>(p)[1];
841  gpu_sort.get(cnt).prp1[1] = vecDist.template getProp<1>(p)[1];
842  cpu_sort.get(cnt).prp1[2] = vd_cpu.template getProp<1>(p)[2];
843  gpu_sort.get(cnt).prp1[2] = vecDist.template getProp<1>(p)[2];
844 
845  cpu_sort.get(cnt).prp2[0] = vd_cpu.template getProp<2>(p)[0];
846  gpu_sort.get(cnt).prp2[0] = vecDist.template getProp<2>(p)[0];
847  cpu_sort.get(cnt).prp2[1] = vd_cpu.template getProp<2>(p)[1];
848  gpu_sort.get(cnt).prp2[1] = vecDist.template getProp<2>(p)[1];
849  cpu_sort.get(cnt).prp2[2] = vd_cpu.template getProp<2>(p)[2];
850  gpu_sort.get(cnt).prp2[2] = vecDist.template getProp<2>(p)[2];
851 
852  ++cnt;
853  ++itc2;
854  }
855 
856  cpu_sort.sort();
857  gpu_sort.sort();
858 
859  for (size_t i = 0 ; i < cpu_sort.size() ; i++)
860  {
861  match &= cpu_sort.get(i).xp.get(0) == gpu_sort.get(i).xp.get(0);
862  match &= cpu_sort.get(i).xp.get(1) == gpu_sort.get(i).xp.get(1);
863  match &= cpu_sort.get(i).xp.get(2) == gpu_sort.get(i).xp.get(2);
864 
865  match &= cpu_sort.get(i).prp0 == gpu_sort.get(i).prp0;
866  match &= cpu_sort.get(i).prp1[0] == gpu_sort.get(i).prp1[0];
867  match &= cpu_sort.get(i).prp1[1] == gpu_sort.get(i).prp1[1];
868  match &= cpu_sort.get(i).prp1[2] == gpu_sort.get(i).prp1[2];
869 
870  match &= cpu_sort.get(i).prp2[0] == gpu_sort.get(i).prp2[0];
871  match &= cpu_sort.get(i).prp2[1] == gpu_sort.get(i).prp2[1];
872  match &= cpu_sort.get(i).prp2[2] == gpu_sort.get(i).prp2[2];
873  }
874 
875  BOOST_REQUIRE_EQUAL(match,true);
876 
877  // move particles on gpu
878 
879  auto ite = vecDist.getDomainIteratorGPU();
880  CUDA_LAUNCH_DIM3((move_parts_gpu_test<3,decltype(vecDist.toKernel())>),ite.wthr,ite.thr,vecDist.toKernel());
881  }
882 }
883 
884 BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
885 {
886  vdist_calc_gpu_test<float>();
887  vdist_calc_gpu_test<double>();
888 }
889 
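// Fills four properties with the sequence 1,2,3,... and checks reduce_local
// on the device with _add_ (arithmetic series n(n+1)/2) and _max_ (n).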
890 BOOST_AUTO_TEST_CASE(vector_dist_reduce)
891 {
892  auto & vCluster = create_vcluster();
893 
894  if (vCluster.size() > 16)
895  {return;}
896 
897  Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
898 
899  // set the ghost based on the cut-off radius (made just a little smaller than the spacing)
900  Ghost<3,float> g(0.1);
901 
902  // Boundary conditions
903  size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
904 
905  vector_dist_gpu<3,float,aggregate<float,double,int,size_t>> vecDist(5000*vCluster.size(),domain,bc,g);
906 
907  auto it = vecDist.getDomainIterator();
908 
909  float fc = 1.0;
910  double dc = 1.0;
911  int ic = 1;
912  size_t sc = 1;
913 
914  while(it.isNext())
915  {
916  auto p = it.get();
917 
918  vecDist.template getProp<0>(p) = fc;
919  vecDist.template getProp<1>(p) = dc;
920  vecDist.template getProp<2>(p) = ic;
921  vecDist.template getProp<3>(p) = sc;
922 
923  fc += 1.0;
924  dc += 1.0;
925  ic += 1;
926  sc += 1;
927 
928  ++it;
929  }
930 
931  vecDist.template hostToDeviceProp<0,1,2,3>();
932 
933  float redf = reduce_local<0,_add_>(vecDist);
934  double redd = reduce_local<1,_add_>(vecDist);
935  int redi = reduce_local<2,_add_>(vecDist);
936  size_t reds = reduce_local<3,_add_>(vecDist);
937 
938  BOOST_REQUIRE_EQUAL(redf,(vecDist.size_local()+1.0)*(vecDist.size_local())/2.0);
939  BOOST_REQUIRE_EQUAL(redd,(vecDist.size_local()+1.0)*(vecDist.size_local())/2.0);
940  BOOST_REQUIRE_EQUAL(redi,(vecDist.size_local()+1)*(vecDist.size_local())/2);
941  BOOST_REQUIRE_EQUAL(reds,(vecDist.size_local()+1)*(vecDist.size_local())/2);
942 
943  float redf2 = reduce_local<0,_max_>(vecDist);
944  double redd2 = reduce_local<1,_max_>(vecDist);
945  int redi2 = reduce_local<2,_max_>(vecDist);
946  size_t reds2 = reduce_local<3,_max_>(vecDist);
947 
948  BOOST_REQUIRE_EQUAL(redf2,vecDist.size_local());
949  BOOST_REQUIRE_EQUAL(redd2,vecDist.size_local());
950  BOOST_REQUIRE_EQUAL(redi2,vecDist.size_local());
951  BOOST_REQUIRE_EQUAL(reds2,vecDist.size_local());
952 }
953 
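// Dynamic load balancing on GPU: all particles start in one corner on
// processor 0, the domain is decomposed with ModelSquare, and after each
// move/redecompose step every processor's load must stay close to the mean
// while the CPU and GPU force calculations keep matching.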
954 template<typename CellList_type, bool sorted>
955 void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut)
956 {
957  std::random_device r;
958 
959  std::seed_seq seed2{/*r() +*/ create_vcluster().rank(),
960  /*r() +*/ create_vcluster().rank(),
961  /*r() +*/ create_vcluster().rank(),
962  /*r() +*/ create_vcluster().rank(),
963  /*r() +*/ create_vcluster().rank(),
964  /*r() +*/ create_vcluster().rank(),
965  /*r() +*/ create_vcluster().rank(),
966  /*r() +*/ create_vcluster().rank()};
967  std::mt19937 e2(seed2);
968 
969  typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3]>> vector_type; // type inferred from usage below
970 
971  Vcluster<> & vCluster = create_vcluster();
972 
973  if (vCluster.getProcessingUnits() > 8)
974  return;
975 
976  std::uniform_real_distribution<double> unif(0.0,0.3);
977 
978  Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
979  Ghost<3,double> g(0.1);
980  size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
981 
982  vector_type vecDist(0,domain,bc,g,DEC_GRAN(2048));
983 
984  // Only processor 0 initially adds particles in a corner of the domain
985 
986  if (vCluster.getProcessUnitID() == 0)
987  {
988  for(size_t i = 0 ; i < k ; i++)
989  {
990  vecDist.add();
991 
992  vecDist.getLastPos()[0] = unif(e2);
993  vecDist.getLastPos()[1] = unif(e2);
994  vecDist.getLastPos()[2] = unif(e2);
995  }
996  }
997 
998  // Move to GPU
999  vecDist.hostToDevicePos();
1000  vecDist.template hostToDeviceProp<0>();
1001 
1002  vecDist.map(RUN_ON_DEVICE);
1003  vecDist.template ghost_get<>(RUN_ON_DEVICE);
1004 
1005  // now move to CPU
1006 
1007  vecDist.deviceToHostPos();
1008  vecDist.template deviceToHostProp<0>();
1009 
1010  // Get the neighborhood of each particle
1011 
1012  auto VV = vecDist.getVerlet(r_cut);
1013 
1014  // store the number of neighbors of each particle
1015 
1016  auto it = vecDist.getDomainIterator();
1017 
1018  while (it.isNext())
1019  {
1020  auto p = it.get();
1021 
1022  vecDist.template getProp<0>(p) = VV.getNNPart(p.getKey());
1023 
1024  ++it;
1025  }
1026 
1027  // Move to GPU
1028  vecDist.template hostToDeviceProp<0>();
1029 
1030  ModelSquare md;
1031  md.factor = 10;
1032  vecDist.addComputationCosts(md);
1033  vecDist.getDecomposition().decompose();
1034  vecDist.map(RUN_ON_DEVICE);
1035 
1036  vecDist.deviceToHostPos();
1037  // Move info to CPU for addComputationCosts
1038 
1039  vecDist.addComputationCosts(md);
1040 
1041  openfpm::vector<size_t> loads;
1042  size_t load = vecDist.getDecomposition().getDistribution().getProcessorLoad();
1043  vCluster.allGather(load,loads);
1044  vCluster.execute();
1045 
1046  for (size_t i = 0 ; i < loads.size() ; i++)
1047  {
1048  double load_f = load;
1049  double load_fc = loads.get(i);
1050 
1051  BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
1052  }
1053 
1054  BOOST_REQUIRE(vecDist.size_local() != 0);
1055 
1056  Point<3,double> v({1.0,1.0,1.0});
1057 
1058  for (size_t i = 0 ; i < 25 ; i++)
1059  {
1060  // move the particles to the CPU and shift them by 0.09 along each axis
1061 
1062  vecDist.deviceToHostPos();
1063 
1064  auto it = vecDist.getDomainIterator();
1065 
1066  while (it.isNext())
1067  {
1068  auto p = it.get();
1069 
1070  vecDist.getPos(p)[0] += v.get(0) * 0.09;
1071  vecDist.getPos(p)[1] += v.get(1) * 0.09;
1072  vecDist.getPos(p)[2] += v.get(2) * 0.09;
1073 
1074  ++it;
1075  }
1076 
1077  //Back to GPU
1078  vecDist.hostToDevicePos();
1079  vecDist.map(RUN_ON_DEVICE);
1080  vecDist.template ghost_get<0>(RUN_ON_DEVICE);
1081 
1082  vecDist.deviceToHostPos();
1083  vecDist.template deviceToHostProp<0,1,2>();
1084 
1085  // Check calc forces
1086  size_t opt = (sorted)? CL_GPU_REORDER : 0;
1087  auto cellList = vecDist.getCellList(r_cut);
1088  auto cellListGPU = vecDist.template getCellListGPU<CellList_type>(r_cut, opt | CL_NON_SYMMETRIC);
1089 
1090  if (sorted)
1091  {
1092  vecDist.hostToDevicePos();
1093  vecDist.template hostToDeviceProp<0>();
1094 
1095  vecDist.template updateCellListGPU<0>(cellListGPU);
1096  compareCellListCpuGpuSorted(vecDist,cellListGPU,cellList);
1097  }
1098 
1099  else
1100  {
1101  vecDist.updateCellListGPU(cellListGPU);
1102  compareCellListCpuGpu(vecDist,cellListGPU,cellList);
1103  }
1104 
1105  auto VV2 = vecDist.getVerlet(r_cut);
1106 
1107  auto it2 = vecDist.getDomainIterator();
1108 
1109  bool match = true;
1110  while (it2.isNext())
1111  {
1112  auto p = it2.get();
1113 
1114  match &= vecDist.template getProp<0>(p) == VV2.getNNPart(p.getKey());
1115 
1116  ++it2;
1117  }
1118 
1119  BOOST_REQUIRE_EQUAL(match,true);
1120 
1121  ModelSquare md;
1122  vecDist.addComputationCosts(md);
1123  vecDist.getDecomposition().redecompose(200);
1124  vecDist.map(RUN_ON_DEVICE);
1125 
1126  BOOST_REQUIRE(vecDist.size_local() != 0);
1127 
1128  vecDist.template ghost_get<0>(RUN_ON_DEVICE);
1129  vecDist.deviceToHostPos();
1130  vecDist.template deviceToHostProp<0>();
1131 
1132  vecDist.addComputationCosts(md);
1133 
1134  openfpm::vector<size_t> loads;
1135  size_t load = vecDist.getDecomposition().getDistribution().getProcessorLoad();
1136  vCluster.allGather(load,loads);
1137  vCluster.execute();
1138 
1139  for (size_t i = 0 ; i < loads.size() ; i++)
1140  {
1141  double load_f = load;
1142  double load_fc = loads.get(i);
1143 
1144 #ifdef ENABLE_ASAN
1145  BOOST_REQUIRE_CLOSE(load_f,load_fc,30.0);
1146 #else
1147  BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
1148 #endif
1149  }
1150  }
1151 }
1152 
1153 template<typename CellList_type, bool sorted>
1154 void vector_dist_dlb_on_cuda_impl_async(size_t k,double r_cut)
1155 {
1156  std::random_device r;
1157 
1158  std::seed_seq seed2{r() + create_vcluster().rank(),
1159  r() + create_vcluster().rank(),
1160  r() + create_vcluster().rank(),
1161  r() + create_vcluster().rank(),
1162  r() + create_vcluster().rank(),
1163  r() + create_vcluster().rank(),
1164  r() + create_vcluster().rank(),
1165  r() + create_vcluster().rank()};
1166  std::mt19937 e2(seed2);
1167 
1168  typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3]>> vector_type; // type inferred from usage below
1169 
1170  Vcluster<> & vCluster = create_vcluster();
1171 
1172  if (vCluster.getProcessingUnits() > 8)
1173  return;
1174 
1175  std::uniform_real_distribution<double> unif(0.0,0.3);
1176 
1177  Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1178  Ghost<3,double> g(0.1);
1179  size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1180 
1181  vector_type vecDist(0,domain,bc,g,DEC_GRAN(2048));
1182 
1183  // Only processor 0 initially adds particles in a corner of the domain
1184 
1185  if (vCluster.getProcessUnitID() == 0)
1186  {
1187  for(size_t i = 0 ; i < k ; i++)
1188  {
1189  vecDist.add();
1190 
1191  vecDist.getLastPos()[0] = unif(e2);
1192  vecDist.getLastPos()[1] = unif(e2);
1193  vecDist.getLastPos()[2] = unif(e2);
1194  }
1195  }
1196 
1197  // Move to GPU
1198  vecDist.hostToDevicePos();
1199  vecDist.template hostToDeviceProp<0>();
1200 
1201  vecDist.map(RUN_ON_DEVICE);
1202  vecDist.template Ighost_get<>(RUN_ON_DEVICE);
1203  vecDist.template ghost_wait<>(RUN_ON_DEVICE);
1204 
1205  // now move to CPU
1206 
1207  vecDist.deviceToHostPos();
1208  vecDist.template deviceToHostProp<0>();
1209 
1210  // Get the neighborhood of each particle
1211 
1212  auto VV = vecDist.getVerlet(r_cut);
1213 
1214  // store the number of neighbors of each particle
1215 
1216  auto it = vecDist.getDomainIterator();
1217 
1218  while (it.isNext())
1219  {
1220  auto p = it.get();
1221 
1222  vecDist.template getProp<0>(p) = VV.getNNPart(p.getKey());
1223 
1224  ++it;
1225  }
1226 
1227  // Move to GPU
1228  vecDist.template hostToDeviceProp<0>();
1229 
1230  ModelSquare md;
1231  md.factor = 10;
1232  vecDist.addComputationCosts(md);
1233  vecDist.getDecomposition().decompose();
1234  vecDist.map(RUN_ON_DEVICE);
1235 
1236  vecDist.deviceToHostPos();
1237  // Move info to CPU for addComputationCosts
1238 
1239  vecDist.addComputationCosts(md);
1240 
1241  openfpm::vector<size_t> loads;
1242  size_t load = vecDist.getDecomposition().getDistribution().getProcessorLoad();
1243  vCluster.allGather(load,loads);
1244  vCluster.execute();
1245 
1246  for (size_t i = 0 ; i < loads.size() ; i++)
1247  {
1248  double load_f = load;
1249  double load_fc = loads.get(i);
1250 
1251  BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
1252  }
1253 
1254  BOOST_REQUIRE(vecDist.size_local() != 0);
1255 
1256  Point<3,double> v({1.0,1.0,1.0});
1257 
1258  for (size_t i = 0 ; i < 25 ; i++)
1259  {
1260  // move the particles to the CPU and shift them by 0.09 along each axis
1261 
1262  vecDist.deviceToHostPos();
1263 
1264  auto it = vecDist.getDomainIterator();
1265 
1266  while (it.isNext())
1267  {
1268  auto p = it.get();
1269 
1270  vecDist.getPos(p)[0] += v.get(0) * 0.09;
1271  vecDist.getPos(p)[1] += v.get(1) * 0.09;
1272  vecDist.getPos(p)[2] += v.get(2) * 0.09;
1273 
1274  ++it;
1275  }
1276 
1277  // Back to GPU
1278  vecDist.hostToDevicePos();
1279  vecDist.map(RUN_ON_DEVICE);
1280  vecDist.template Ighost_get<0>(RUN_ON_DEVICE);
1281  vecDist.template ghost_wait<0>(RUN_ON_DEVICE);
1282  vecDist.deviceToHostPos();
1283  vecDist.template deviceToHostProp<0,1,2>();
1284 
1285  // Check calc forces
1286  size_t opt = (sorted)? CL_GPU_REORDER : 0;
1287  auto cellList = vecDist.getCellList(r_cut);
1288  auto cellListGPU = vecDist.template getCellListGPU<CellList_type>(r_cut, opt | CL_NON_SYMMETRIC);
1289 
1290  if (sorted)
1291  {
1292  vecDist.hostToDevicePos();
1293  vecDist.template hostToDeviceProp<0>();
1294 
1295  vecDist.template updateCellListGPU<0>(cellListGPU);
1296  compareCellListCpuGpuSorted(vecDist,cellListGPU,cellList);
1297  }
1298 
1299  else
1300  {
1301  vecDist.updateCellListGPU(cellListGPU);
1302  compareCellListCpuGpu(vecDist,cellListGPU,cellList);
1303  }
1304 
1305  auto VV2 = vecDist.getVerlet(r_cut);
1306 
1307  auto it2 = vecDist.getDomainIterator();
1308 
1309  bool match = true;
1310  while (it2.isNext())
1311  {
1312  auto p = it2.get();
1313 
1314  match &= vecDist.template getProp<0>(p) == VV2.getNNPart(p.getKey());
1315 
1316  ++it2;
1317  }
1318 
1319  BOOST_REQUIRE_EQUAL(match,true);
1320 
1321  ModelSquare md;
1322  vecDist.addComputationCosts(md);
1323  vecDist.getDecomposition().redecompose(200);
1324  vecDist.map(RUN_ON_DEVICE);
1325 
1326  BOOST_REQUIRE(vecDist.size_local() != 0);
1327 
1328  vecDist.template Ighost_get<0>(RUN_ON_DEVICE);
1329  vecDist.template ghost_wait<0>(RUN_ON_DEVICE);
1330  vecDist.deviceToHostPos();
1331  vecDist.template deviceToHostProp<0>();
1332 
1333  vecDist.addComputationCosts(md);
1334 
1335  openfpm::vector<size_t> loads;
1336  size_t load = vecDist.getDecomposition().getDistribution().getProcessorLoad();
1337  vCluster.allGather(load,loads);
1338  vCluster.execute();
1339 
1340  for (size_t i = 0 ; i < loads.size() ; i++)
1341  {
1342  double load_f = load;
1343  double load_fc = loads.get(i);
1344 
1345  BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
1346  }
1347  }
1348 }
1349 
1350 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_async)
1351 {
1352  vector_dist_dlb_on_cuda_impl_async<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,false>, false>(50000,0.01);
1353 }
1354 
1355 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_async_sorted)
1356 {
1357  vector_dist_dlb_on_cuda_impl_async<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,false>, true>(50000,0.01);
1358 }
1359 
1360 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda)
1361 {
1362  vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,false>, false>(50000,0.01);
1363 }
1364 
1365 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_sorted)
1366 {
1367  vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,false>, true>(50000,0.01);
1368 }
1369 
1370 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_sparse)
1371 {
1372  vector_dist_dlb_on_cuda_impl<CELLLIST_GPU_SPARSE<3,double>, false>(50000,0.01);
1373 }
1374 
1375 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_sparse_sorted)
1376 {
1377  vector_dist_dlb_on_cuda_impl<CELLLIST_GPU_SPARSE<3,double>, true>(50000,0.01);
1378 }
1379 
1380 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda2)
1381 {
1382  if (create_vcluster().size() <= 3)
1383  {return;}
1384 
1385  #ifndef CUDA_ON_CPU
1386  vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,false>, false>(1000000,0.01);
1387  #endif
1388 }
1389 
1390 BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda3)
1391 {
1392  if (create_vcluster().size() < 8)
1393  {return;}
1394 
1395  #ifndef CUDA_ON_CPU
1396  vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,false>, false>(15000000,0.005);
1397  #endif
1398 }
1399 
1400 
1401 BOOST_AUTO_TEST_CASE(vector_dist_keep_prop_on_cuda)
1402 {
1403  typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vector_type; // type inferred from usage below
1404 
1405  Vcluster<> & vCluster = create_vcluster();
1406 
1407  if (vCluster.getProcessingUnits() > 8)
1408  return;
1409 
1410  Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1411  Ghost<3,double> g(0.1);
1412  size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1413 
1414  vector_type vecDist(0,domain,bc,g,DEC_GRAN(2048));
1415 
1416  // Only processor 0 initially adds particles in a corner of the domain
1417 
1418  if (vCluster.getProcessUnitID() == 0)
1419  {
1420  for(size_t i = 0 ; i < 50000 ; i++)
1421  {
1422  vecDist.add();
1423 
1424  vecDist.getLastPos()[0] = ((double)rand())/RAND_MAX * 0.3;
1425  vecDist.getLastPos()[1] = ((double)rand())/RAND_MAX * 0.3;
1426  vecDist.getLastPos()[2] = ((double)rand())/RAND_MAX * 0.3;
1427  }
1428  }
1429 
1430  // Move to GPU
1431  vecDist.hostToDevicePos();
1432  vecDist.template hostToDeviceProp<0>();
1433 
1434  vecDist.map(RUN_ON_DEVICE);
1435  vecDist.template ghost_get<>(RUN_ON_DEVICE);
1436 
1437  // now move to CPU
1438 
1439  vecDist.deviceToHostPos();
1440  vecDist.template deviceToHostProp<0>();
1441 
1442 
1443  // initialize the properties with known constant patterns
1444 
1445  auto it = vecDist.getDomainIterator();
1446 
1447  while (it.isNext())
1448  {
1449  auto p = it.get();
1450 
1451  vecDist.template getProp<0>(p) = 0.0;
1452 
1453  vecDist.template getProp<1>(p)[0] = 1000.0;
1454  vecDist.template getProp<1>(p)[1] = 2000.0;
1455  vecDist.template getProp<1>(p)[2] = 3000.0;
1456 
1457  vecDist.template getProp<2>(p)[0][0] = 6000.0;
1458  vecDist.template getProp<2>(p)[0][1] = 7000.0;
1459  vecDist.template getProp<2>(p)[0][2] = 8000.0;
1460  vecDist.template getProp<2>(p)[1][0] = 9000.0;
1461  vecDist.template getProp<2>(p)[1][1] = 10000.0;
1462  vecDist.template getProp<2>(p)[1][2] = 11000.0;
1463  vecDist.template getProp<2>(p)[2][0] = 12000.0;
1464  vecDist.template getProp<2>(p)[2][1] = 13000.0;
1465  vecDist.template getProp<2>(p)[2][2] = 14000.0;
1466 
1467  ++it;
1468  }
1469 
1470  // Move to GPU
1471  vecDist.template hostToDeviceProp<0,1,2>();
1472 
1473  ModelSquare md;
1474  md.factor = 10;
1475  vecDist.addComputationCosts(md);
1476  vecDist.getDecomposition().decompose();
1477  vecDist.map(RUN_ON_DEVICE);
1478 
1479  vecDist.deviceToHostPos();
1480  // Move info to CPU for addComputationCosts
1481 
1482  vecDist.addComputationCosts(md);
1483 
1484  openfpm::vector<size_t> loads;
1485  size_t load = vecDist.getDecomposition().getDistribution().getProcessorLoad();
1486  vCluster.allGather(load,loads);
1487  vCluster.execute();
1488 
1489  for (size_t i = 0 ; i < loads.size() ; i++)
1490  {
1491  double load_f = load;
1492  double load_fc = loads.get(i);
1493 
1494  BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
1495  }
1496 
1497  BOOST_REQUIRE(vecDist.size_local() != 0);
1498 
1499  Point<3,double> v({1.0,1.0,1.0});
1500 
1501  int base = 0;
1502 
1503  for (size_t i = 0 ; i < 25 ; i++)
1504  {
1505  if (i % 2 == 0)
1506  {
1507  // move the particles to the CPU and shift them by 0.09 along each axis
1508 
1509  vecDist.deviceToHostPos();
1510 
1511  auto it = vecDist.getDomainIterator();
1512 
1513  while (it.isNext())
1514  {
1515  auto p = it.get();
1516 
1517  vecDist.getPos(p)[0] += v.get(0) * 0.09;
1518  vecDist.getPos(p)[1] += v.get(1) * 0.09;
1519  vecDist.getPos(p)[2] += v.get(2) * 0.09;
1520 
1521  ++it;
1522  }
1523 
1524  //Back to GPU
1525  vecDist.hostToDevicePos();
1526  vecDist.map(RUN_ON_DEVICE);
1527  vecDist.template ghost_get<>(RUN_ON_DEVICE);
1528  vecDist.deviceToHostPos();
1529  vecDist.template deviceToHostProp<0,1,2>();
1530 
1531  ModelSquare md;
1532  vecDist.addComputationCosts(md);
1533  vecDist.getDecomposition().redecompose(200);
1534  vecDist.map(RUN_ON_DEVICE);
1535 
1536  BOOST_REQUIRE(vecDist.size_local() != 0);
1537 
1538  vecDist.template ghost_get<0>(RUN_ON_DEVICE);
1539  vecDist.deviceToHostPos();
1540  vecDist.template deviceToHostProp<0,1,2>();
1541 
1542  vecDist.addComputationCosts(md);
1543 
1544  openfpm::vector<size_t> loads;
1545  size_t load = vecDist.getDecomposition().getDistribution().getProcessorLoad();
1546  vCluster.allGather(load,loads);
1547  vCluster.execute();
1548 
1549  for (size_t i = 0 ; i < loads.size() ; i++)
1550  {
1551  double load_f = load;
1552  double load_fc = loads.get(i);
1553 
1554  BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
1555  }
1556  }
1557  else
1558  {
1559  vecDist.template deviceToHostProp<0,1,2>();
1560 
1561  auto it2 = vecDist.getDomainIterator();
1562 
1563  bool match = true;
1564  while (it2.isNext())
1565  {
1566  auto p = it2.get();
1567 
1568  vecDist.template getProp<0>(p) += 1;
1569 
1570  vecDist.template getProp<1>(p)[0] += 1.0;
1571  vecDist.template getProp<1>(p)[1] += 1.0;
1572  vecDist.template getProp<1>(p)[2] += 1.0;
1573 
1574  vecDist.template getProp<2>(p)[0][0] += 1.0;
1575  vecDist.template getProp<2>(p)[0][1] += 1.0;
1576  vecDist.template getProp<2>(p)[0][2] += 1.0;
1577  vecDist.template getProp<2>(p)[1][0] += 1.0;
1578  vecDist.template getProp<2>(p)[1][1] += 1.0;
1579  vecDist.template getProp<2>(p)[1][2] += 1.0;
1580  vecDist.template getProp<2>(p)[2][0] += 1.0;
1581  vecDist.template getProp<2>(p)[2][1] += 1.0;
1582  vecDist.template getProp<2>(p)[2][2] += 1.0;
1583 
1584  ++it2;
1585  }
1586 
1587  vecDist.template hostToDeviceProp<0,1,2>();
1588 
1589  ++base;
1590 
1591  vecDist.template ghost_get<0,1,2>(RUN_ON_DEVICE | KEEP_PROPERTIES);
1592  vecDist.template deviceToHostProp<0,1,2>();
1593 
1594  // Check that the ghost contain the correct information
1595 
1596  auto itg = vecDist.getGhostIterator();
1597 
1598  while (itg.isNext())
1599  {
1600  auto p = itg.get();
1601 
1602  match &= vecDist.template getProp<0>(p) == base;
1603 
1604  match &= vecDist.template getProp<1>(p)[0] == base + 1000.0;
1605  match &= vecDist.template getProp<1>(p)[1] == base + 2000.0;
1606  match &= vecDist.template getProp<1>(p)[2] == base + 3000.0;
1607 
1608  match &= vecDist.template getProp<2>(p)[0][0] == base + 6000.0;
1609  match &= vecDist.template getProp<2>(p)[0][1] == base + 7000.0;
1610  match &= vecDist.template getProp<2>(p)[0][2] == base + 8000.0;
1611  match &= vecDist.template getProp<2>(p)[1][0] == base + 9000.0;
1612  match &= vecDist.template getProp<2>(p)[1][1] == base + 10000.0;
1613  match &= vecDist.template getProp<2>(p)[1][2] == base + 11000.0;
1614  match &= vecDist.template getProp<2>(p)[2][0] == base + 12000.0;
1615  match &= vecDist.template getProp<2>(p)[2][1] == base + 13000.0;
1616  match &= vecDist.template getProp<2>(p)[2][2] == base + 14000.0;
1617 
1618  ++itg;
1619  }
1620 
1621  BOOST_REQUIRE_EQUAL(match,true);
1622  }
1623  }
1624 }
1625 
1626 struct type_is_one
1627 {
1628  __device__ static bool check(int c)
1629  {
1630  return c == 1;
1631  }
1632 };
1633 
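// get_indexes_by_type test: collect the indices of all particles whose
// prop<0> satisfies type_is_one::check (i.e. equals 1) and verify the
// selection on the host.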
1634 BOOST_AUTO_TEST_CASE(vector_dist_get_index_set)
1635 {
1636  Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1637  Ghost<3,double> g(0.1);
1638  size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1639 
1640  if (create_vcluster().size() >= 16)
1641  {return;}
1642 
1643  Vcluster<> & vCluster = create_vcluster();
1644 
1645  vector_dist_gpu<3,double,aggregate<int,double>> vdg(10000,domain,bc,g,DEC_GRAN(128));
1646 
1647  auto it = vdg.getDomainIterator();
1648 
1649  while (it.isNext())
1650  {
1651  auto p = it.get();
1652 
1653  vdg.getPos(p)[0] = (double)rand() / RAND_MAX;
1654  vdg.getPos(p)[1] = (double)rand() / RAND_MAX;
1655  vdg.getPos(p)[2] = (double)rand() / RAND_MAX;
1656 
1657  vdg.template getProp<0>(p) = (int)((double)rand() / RAND_MAX / 0.5);
1658 
1659  vdg.template getProp<1>(p) = (double)rand() / RAND_MAX;
1660 
1661  ++it;
1662  }
1663 
1664  vdg.map();
1665 
1666  vdg.hostToDeviceProp<0,1>();
1667  vdg.hostToDevicePos();
1668 
1669  auto cl = vdg.getCellListGPU(0.1);
1670  vdg.updateCellListGPU(cl);
1671 
1672  // then we get a cell-list to force a reorder
1673 
1674  openfpm::vector_gpu<aggregate<unsigned int>> ids; // output index buffer (element type assumed)
1675 
1676  get_indexes_by_type<0,type_is_one>(vdg.getPropVector(),ids,vdg.size_local(),vCluster.getGpuContext());
1677 
1678  // test
1679 
1680  ids.template deviceToHost<0>();
1681 
1682  auto & vs = vdg.getPropVector();
1683  vs.template deviceToHost<0>();
1684 
1685  bool match = true;
1686 
1687  for (int i = 0 ; i < ids.size() ; i++)
1688  {
1689  if (vs.template get<0>(ids.template get<0>(i)) != 1)
1690  {match = false;}
1691  }
1692 
1693  BOOST_REQUIRE_EQUAL(match,true);
1694 }
1695 
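// compareHostAndDevice* test: host and device copies start in sync, a
// host-side modification must be detected as a mismatch, and once the
// modified value has been uploaded the comparison must succeed again.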
1696 BOOST_AUTO_TEST_CASE(vector_dist_compare_host_device)
1697 {
1698  Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1699  Ghost<3,double> g(0.1);
1700  size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1701 
1702  if (create_vcluster().size() >= 16)
1703  {return;}
1704 
1705  vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg(10000,domain,bc,g,DEC_GRAN(128));
1706 
1707  auto it = vdg.getDomainIterator();
1708 
1709  while (it.isNext())
1710  {
1711  auto p = it.get();
1712 
1713  vdg.getPos(p)[0] = (double)rand() / RAND_MAX;
1714  vdg.getPos(p)[1] = (double)rand() / RAND_MAX;
1715  vdg.getPos(p)[2] = (double)rand() / RAND_MAX;
1716 
1717  vdg.template getProp<0>(p) = (double)rand() / RAND_MAX;
1718 
1719  vdg.template getProp<1>(p)[0] = (double)rand() / RAND_MAX;
1720  vdg.template getProp<1>(p)[1] = (double)rand() / RAND_MAX;
1721  vdg.template getProp<1>(p)[2] = (double)rand() / RAND_MAX;
1722 
1723  vdg.template getProp<2>(p)[0][0] = (double)rand() / RAND_MAX;
1724  vdg.template getProp<2>(p)[0][1] = (double)rand() / RAND_MAX;
1725  vdg.template getProp<2>(p)[0][2] = (double)rand() / RAND_MAX;
1726  vdg.template getProp<2>(p)[1][0] = (double)rand() / RAND_MAX;
1727  vdg.template getProp<2>(p)[1][1] = (double)rand() / RAND_MAX;
1728  vdg.template getProp<2>(p)[1][2] = (double)rand() / RAND_MAX;
1729  vdg.template getProp<2>(p)[2][0] = (double)rand() / RAND_MAX;
1730  vdg.template getProp<2>(p)[2][1] = (double)rand() / RAND_MAX;
1731  vdg.template getProp<2>(p)[2][2] = (double)rand() / RAND_MAX;
1732 
1733  ++it;
1734  }
1735 
1736  vdg.map();
1737 
1738  vdg.hostToDeviceProp<0,1,2>();
1739  vdg.hostToDevicePos();
1740 
1741  bool test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
1742  BOOST_REQUIRE_EQUAL(test,true);
1743 
1744  vdg.getPos(100)[0] = 0.99999999;
1745 
1746  test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
1747  BOOST_REQUIRE_EQUAL(test,false);
1748 
1749  vdg.hostToDevicePos();
1750  vdg.getPos(100)[0] = 0.99999999;
1751 
1752  test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
1753  BOOST_REQUIRE_EQUAL(test,true);
1754 
1756 
1757  test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
1758  BOOST_REQUIRE_EQUAL(test,true);
1759 
1760  vdg.getProp<1>(103)[0] = 0.99999999;
1761 
1762  test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
1763  BOOST_REQUIRE_EQUAL(test,false);
1764 
1765  vdg.hostToDeviceProp<1>();
1766  vdg.getProp<1>(103)[0] = 0.99999999;
1767 
1768  test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
1769  BOOST_REQUIRE_EQUAL(test,true);
1770 
1772 
1773 
1774  test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
1775  BOOST_REQUIRE_EQUAL(test,true);
1776 
1777  vdg.getProp<0>(105) = 0.99999999;
1778 
1779  test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
1780  BOOST_REQUIRE_EQUAL(test,false);
1781 
1782  vdg.hostToDeviceProp<0>();
1783  vdg.getProp<0>(105) = 0.99999999;
1784 
1785  test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
1786  BOOST_REQUIRE_EQUAL(test,true);
1787 
1788 
1790 
1791 
1792  test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
1793  BOOST_REQUIRE_EQUAL(test,true);
1794 
1795  vdg.getProp<2>(108)[1][2] = 0.99999999;
1796 
1797  test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
1798  BOOST_REQUIRE_EQUAL(test,false);
1799 
1800  vdg.hostToDeviceProp<2>();
1801  vdg.getProp<2>(108)[1][2] = 0.99999999;
1802 
1803  test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
1804  BOOST_REQUIRE_EQUAL(test,true);
1805 }
1806 
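// Kernel that writes an index-dependent pattern into every element of the
// domain + ghost range, so the host can verify that
// getDomainAndGhostIteratorGPU covers the whole range.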
1807 template<typename vector_dist_type>
1808 __global__ void assign_to_ghost(vector_dist_type vds)
1809 {
1810  int i = threadIdx.x + blockIdx.x * blockDim.x;
1811 
1812  if (i >= vds.size()) {return;}
1813 
1814  vds.template getProp<0>(i) = 1000.0 + i;
1815 
1816  vds.template getProp<1>(i)[0] = 2000.0 + i;
1817  vds.template getProp<1>(i)[1] = 3000.0 + i;
1818  vds.template getProp<1>(i)[2] = 4000.0 + i;
1819 
1820  vds.template getProp<2>(i)[0][0] = 12000.0 + i;
1821  vds.template getProp<2>(i)[0][1] = 13000.0 + i;
1822  vds.template getProp<2>(i)[0][2] = 14000.0 + i;
1823  vds.template getProp<2>(i)[1][0] = 22000.0 + i;
1824  vds.template getProp<2>(i)[1][1] = 23000.0 + i;
1825  vds.template getProp<2>(i)[1][2] = 24000.0 + i;
1826  vds.template getProp<2>(i)[2][0] = 32000.0 + i;
1827  vds.template getProp<2>(i)[2][1] = 33000.0 + i;
1828  vds.template getProp<2>(i)[2][2] = 34000.0 + i;
1829 
1830 }
1831 
1832 BOOST_AUTO_TEST_CASE(vector_dist_domain_and_ghost_test)
1833 {
1834  Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1835  Ghost<3,double> g(0.1);
1836  size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1837 
1838  if (create_vcluster().size() >= 16)
1839  {return;}
1840 
1841  vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg(10000,domain,bc,g,DEC_GRAN(128));
1842 
1843  auto ite = vdg.getDomainAndGhostIteratorGPU();
1844 
1845  CUDA_LAUNCH(assign_to_ghost,ite,vdg.toKernel());
1846 
1847  vdg.template deviceToHostProp<0,1,2>();
1848 
1849 
1850  auto it = vdg.getDomainAndGhostIterator();
1851 
1852  bool check = true;
1853 
1854  while (it.isNext())
1855  {
1856  auto k = it.get();
1857 
1858  check &= vdg.template getProp<0>(k) == 1000.0 + k.getKey();
1859 
1860  check &= vdg.template getProp<1>(k)[0] == 2000.0 + k.getKey();
1861  check &= vdg.template getProp<1>(k)[1] == 3000.0 + k.getKey();
1862  check &= vdg.template getProp<1>(k)[2] == 4000.0 + k.getKey();
1863 
1864  check &= vdg.template getProp<2>(k)[0][0] == 12000.0 + k.getKey();
1865  check &= vdg.template getProp<2>(k)[0][1] == 13000.0 + k.getKey();
1866  check &= vdg.template getProp<2>(k)[0][2] == 14000.0 + k.getKey();
1867  check &= vdg.template getProp<2>(k)[1][0] == 22000.0 + k.getKey();
1868  check &= vdg.template getProp<2>(k)[1][1] == 23000.0 + k.getKey();
1869  check &= vdg.template getProp<2>(k)[1][2] == 24000.0 + k.getKey();
1870  check &= vdg.template getProp<2>(k)[2][0] == 32000.0 + k.getKey();
1871  check &= vdg.template getProp<2>(k)[2][1] == 33000.0 + k.getKey();
1872  check &= vdg.template getProp<2>(k)[2][2] == 34000.0 + k.getKey();
1873 
1874  ++it;
1875  }
1876 
1877 
1878  BOOST_REQUIRE_EQUAL(check,true);
1879 }
1880 
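// Kernel used to trigger an out-of-bounds access: vs2 is an empty vector,
// so reading element 57 must be caught by the SE_CLASS1 run-time checks.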
1881 template<typename vT>
1882 __global__ void launch_overflow(vT vs, vT vs2)
1883 {
1884  vs2.template getProp<1>(57)[0];
1885 }
1886 
1887 BOOST_AUTO_TEST_CASE(vector_dist_overflow_se_class1)
1888 {
1889  Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1890  Ghost<3,double> g(0.1);
1891  size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1892 
1893  if (create_vcluster().size() >= 16)
1894  {return;}
1895 
1896  std::cout << "****** TEST ERROR MESSAGE BEGIN ********" << std::endl;
1897 
1898  vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg(0,domain,bc,g,DEC_GRAN(128));
1899  vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg2(0,domain,bc,g,DEC_GRAN(128));
1900 
1901 
1902  vdg.setCapacity(100);
1903 
1904  ite_gpu<1> ite;
1905 
1906  ite.wthr.x = 1;
1907  ite.wthr.y = 1;
1908  ite.wthr.z = 1;
1909  ite.thr.x = 1;
1910  ite.thr.y = 1;
1911  ite.thr.z = 1;
1912 
1913  try
1914  {
1915  CUDA_LAUNCH(launch_overflow,ite,vdg.toKernel(),vdg2.toKernel());
1916  }
1917  catch(...)
1918  {
1919  std::cout << "SE_CLASS1 Catch" << std::endl;
1920  };
1921 
1922  std::cout << "****** TEST ERROR MESSAGE END ********" << std::endl;
1923 }
1924 
1925 
1926 
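// ghost_put test: each particle scatters a kernel-weighted contribution to
// its neighbors, ghost_put<add_atomic_,0,2> folds the ghost contributions
// back onto the owners, and on a uniform grid every particle must end up
// with (nearly) the same value.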
1927 BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
1928 {
1929 
1930 // This example requires float atomics; until C++20 we are out of luck
1931 #ifndef CUDIFY_USE_OPENMP
1932 
1933  Vcluster<> & vCluster = create_vcluster();
1934 
1935  long int k = 25*25*25*create_vcluster().getProcessingUnits();
1936  k = std::pow(k, 1/3.);
1937 
1938  if (vCluster.getProcessingUnits() > 48)
1939  return;
1940 
1941  print_test("Testing 3D periodic ghost put GPU k=",k);
1942  BOOST_TEST_CHECKPOINT( "Testing 3D periodic ghost put k=" << k );
1943 
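 // Shrink k by large steps first, then by small steps close to the lower bound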
1944  long int big_step = k / 30;
1945  big_step = (big_step == 0)?1:big_step;
1946  long int small_step = 21;
1947 
1948  // 3D test
1949  for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
1950  {
1951  float r_cut = 1.3 / k;
1952  float r_g = 1.5 / k;
1953 
1954  Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
1955 
1956  // Boundary conditions
1957  size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
1958 
1959  // ghost
1960  Ghost<3,float> ghost(r_g);
1961 
1962  typedef aggregate<float,float,float> part_prop;
1963 
1964  // Distributed vector
1965  vector_dist_gpu<3,float, part_prop > vecDist(0,box,bc,ghost);
1966 
1967  auto it = vecDist.getGridIterator({(size_t)k,(size_t)k,(size_t)k});
1968 
1969  while (it.isNext())
1970  {
1971  auto key = it.get();
1972 
1973  vecDist.add();
1974 
1975  vecDist.getLastPosWrite()[0] = key.get(0)*it.getSpacing(0);
1976  vecDist.getLastPosWrite()[1] = key.get(1)*it.getSpacing(1);
1977  vecDist.getLastPosWrite()[2] = key.get(2)*it.getSpacing(2);
1978 
1979  // Initialize to zero the properties that will be accumulated below
1980 
1981  vecDist.getLastPropWrite<0>() = 0.0;
1982 
1983  vecDist.getLastPropWrite<2>() = 0.0;
1984 
1985  ++it;
1986  }
1987 
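 // Redistribute the particles to the processors that own their sub-domains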
1988  vecDist.map();
1989 
1990  vecDist.hostToDevicePos();
1991  vecDist.template hostToDeviceProp<0,2>();
1992  // sync the ghost
1993  vecDist.ghost_get<0,2>(RUN_ON_DEVICE);
1994  vecDist.template deviceToHostProp<0,2>();
1995  vecDist.deviceToHostPos();
1996 
1997  {
1998  auto NN = vecDist.getCellList(r_cut);
1999  float a = 1.0f*k*k;
2000 
2001  // run through all the domain particles; writes to neighbors also reach the ghosts
2002 
2003  auto it2 = vecDist.getDomainIterator();
2004 
2005  while (it2.isNext())
2006  {
2007  // particle p
2008  auto p = it2.get();
2009  Point<3,float> xp = vecDist.getPos(p);
2010 
2011  // Get an iterator over the neighborhood particles of p
2012  auto Np = NN.getNNIteratorBox(NN.getCell(xp));
2013 
2014  // For each neighborhood particle ...
2015  while (Np.isNext())
2016  {
2017  auto q = Np.get();
2018  Point<3,float> xq = vecDist.getPosRead(q);
2019 
2020  float dist = xp.distance(xq);
2021 
2022  if (dist < r_cut)
2023  {
2024  vecDist.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
2025  vecDist.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut) / 2;
2026  }
2027 
2028  ++Np;
2029  }
2030 
2031  ++it2;
2032  }
2033 
2034  vecDist.hostToDevicePos();
2035  vecDist.template hostToDeviceProp<0,2>();
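 // ghost_put with add_atomic_ sends the values accumulated on ghost particles back to the processors owning the real particles and adds them atomically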
2036  vecDist.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE);
2037  vecDist.template deviceToHostProp<0,2>();
2038  vecDist.deviceToHostPos();
2039 
2040  bool ret = true;
2041  auto it3 = vecDist.getDomainIterator();
2042 
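 // On a periodic uniform grid every particle should end up with the same total; take the first particle as the reference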
2043  float constant = vecDist.getProp<0>(it3.get());
2044  float constanta = vecDist.getProp<2>(it3.get());
2045  float eps = 0.001;
2046 
2047  while (it3.isNext())
2048  {
2049  float constant2 = vecDist.getProp<0>(it3.get());
2050  float constant3 = vecDist.getProp<2>(it3.get());
2051  if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
2052  {
2053  Point<3,float> p = vecDist.getPosRead(it3.get());
2054 
2055  std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << vCluster.getProcessUnitID() << std::endl;
2056  ret = false;
2057  break;
2058  }
2059 
2060  ++it3;
2061  }
2062  BOOST_REQUIRE_EQUAL(ret,true);
2063  }
2064 
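 // Reset the accumulated properties on domain and ghost particles before the second pass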
2065  auto itp = vecDist.getDomainAndGhostIterator();
2066  while (itp.isNext())
2067  {
2068  auto key = itp.get();
2069 
2070  vecDist.getPropWrite<0>(key) = 0.0;
2071  vecDist.getPropWrite<2>(key) = 0.0;
2072 
2073  ++itp;
2074  }
2075 
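 // Second pass: same deposition loop, but prop<2> now receives the full (not halved) contribution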
2076  {
2077  auto NN = vecDist.getCellList(r_cut);
2078  float a = 1.0f*k*k;
2079 
2080  // run through all the domain particles; writes to neighbors also reach the ghosts
2081 
2082  auto it2 = vecDist.getDomainIterator();
2083 
2084  while (it2.isNext())
2085  {
2086  // particle p
2087  auto p = it2.get();
2088  Point<3,float> xp = vecDist.getPosRead(p);
2089 
2090  // Get an iterator over the neighborhood particles of p
2091  auto Np = NN.getNNIteratorBox(NN.getCell(xp));
2092 
2093  // For each neighborhood particle ...
2094  while (Np.isNext())
2095  {
2096  auto q = Np.get();
2097  Point<3,float> xq = vecDist.getPosRead(q);
2098 
2099  float dist = xp.distance(xq);
2100 
2101  if (dist < r_cut)
2102  {
2103  vecDist.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
2104  vecDist.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut);
2105  }
2106 
2107  ++Np;
2108  }
2109 
2110  ++it2;
2111  }
2112 
2113  vecDist.hostToDevicePos();
2114  vecDist.template hostToDeviceProp<0,2>();
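 // Put the two properties separately to exercise single-property ghost_put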
2115  vecDist.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
2116  vecDist.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE);
2117  vecDist.template deviceToHostProp<0,2>();
2118  vecDist.deviceToHostPos();
2119 
2120  bool ret = true;
2121  auto it3 = vecDist.getDomainIterator();
2122 
2123  float constant = vecDist.getPropRead<0>(it3.get());
2124  float constanta = vecDist.getPropRead<2>(it3.get());
2125  float eps = 0.001;
2126 
2127  while (it3.isNext())
2128  {
2129  float constant2 = vecDist.getPropRead<0>(it3.get());
2130  float constant3 = vecDist.getPropRead<2>(it3.get());
2131  if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
2132  {
2133  Point<3,float> p = vecDist.getPosRead(it3.get());
2134 
2135  std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << vCluster.getProcessUnitID() << std::endl;
2136  ret = false;
2137  break;
2138  }
2139 
2140  ++it3;
2141  }
2142  BOOST_REQUIRE_EQUAL(ret,true);
2143  }
2144  }
2145 
2146 #endif
2147 }
2148 
2149 BOOST_AUTO_TEST_SUITE_END()