OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
vector_dist_gpu_unit_tests.cu
1#define BOOST_TEST_DYN_LINK
2#include "config.h"
3#include <boost/test/unit_test.hpp>
4#include "VCluster/VCluster.hpp"
5#include <Vector/vector_dist.hpp>
6#include "Vector/tests/vector_dist_util_unit_tests.hpp"
7
8#define SUB_UNIT_FACTOR 1024
9
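// The kernel below uses one thread per particle: GET_PARTICLE computes the
// particle index from blockIdx/threadIdx and returns early for threads past
// vd.size(), so the body needs no explicit bounds check.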
10template<unsigned int dim , typename vector_dist_type>
11__global__ void move_parts_gpu_test(vector_dist_type vd)
12{
13 auto p = GET_PARTICLE(vd);
14
15#pragma unroll
16 for (int i = 0 ; i < dim ; i++)
17 {
18 vd.getPos(p)[i] += 0.05;
19 }
20}
21
22BOOST_AUTO_TEST_SUITE( vector_dist_gpu_test )
23
24void print_test(std::string test, size_t sz)
25{
26 if (create_vcluster().getProcessUnitID() == 0)
27 std::cout << test << " " << sz << "\n";
28}
29
30
31__global__ void initialize_props(vector_dist_ker<3, float, aggregate<float, float [3], float[3]>> vd)
32{
33 auto p = GET_PARTICLE(vd);
34
35 vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
36
37 vd.template getProp<1>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
38 vd.template getProp<1>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
39 vd.template getProp<1>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];
40}
41
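// calculate_force computes the same force twice: once through the sorted
// particle view (it.get_sort() indexing vd_sort) and once through the plain
// view (it.get() indexing vd); the results land in prop<1> and prop<2> so
// check_force can verify on the host that the two paths agree.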
42template<typename T,typename CellList_type>
43__global__ void calculate_force(vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vd,
44 vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vd_sort,
45 CellList_type cl,
46 int rank)
47{
48 auto p = GET_PARTICLE(vd);
49
50 Point<3,T> xp = vd.getPos(p);
51
52 auto it = cl.getNNIterator(cl.getCell(xp));
53
54 Point<3,T> force1({0.0,0.0,0.0});
55 Point<3,T> force2({0.0,0.0,0.0});
56
57 while (it.isNext())
58 {
59 auto q1 = it.get_sort();
60 auto q2 = it.get();
61
62 if (q2 == p) {++it; continue;}
63
64 Point<3,T> xq_1 = vd_sort.getPos(q1);
65 Point<3,T> xq_2 = vd.getPos(q2);
66
67 Point<3,T> r1 = xq_1 - xp;
68 Point<3,T> r2 = xq_2 - xp;
69
70 // Normalize
71
72 if (r1.norm() > 1e-6)
73 {
74 r1 /= r1.norm();
75 force1 += vd_sort.template getProp<0>(q1)*r1;
76 }
77 if (r2.norm() > 1e-6)
78 {
79 r2 /= r2.norm();
80 force2 += vd.template getProp<0>(q2)*r2;
81 }
82
83 ++it;
84 }
85
86 vd.template getProp<1>(p)[0] = force1.get(0);
87 vd.template getProp<1>(p)[1] = force1.get(1);
88 vd.template getProp<1>(p)[2] = force1.get(2);
89
90 vd.template getProp<2>(p)[0] = force2.get(0);
91 vd.template getProp<2>(p)[1] = force2.get(1);
92 vd.template getProp<2>(p)[2] = force2.get(2);
93}
94
95template<typename T, typename CellList_type>
96__global__ void calculate_force_full_sort(vector_dist_ker<3, T, aggregate<T, T[3], T [3]>> vd,
97 CellList_type cl, int rank)
98{
99 unsigned int p;
100 GET_PARTICLE_SORT(p,cl);
101
102 Point<3,T> xp = vd.getPos(p);
103
104 auto it = cl.getNNIterator(cl.getCell(xp));
105
106 Point<3,T> force1({0.0,0.0,0.0});
107
108 while (it.isNext())
109 {
110 auto q1 = it.get_sort();
111
112 if (q1 == p) {++it; continue;}
113
114 Point<3,T> xq_1 = vd.getPos(q1);
115
116 Point<3,T> r1 = xq_1 - xp;
117
118 // Normalize
119
120 if (r1.norm() > 1e-6)
121 {
122 r1 /= r1.norm();
123
124 force1 += vd.template getProp<0>(q1)*r1;
125 }
126
127 ++it;
128 }
129
130 vd.template getProp<1>(p)[0] = force1.get(0);
131 vd.template getProp<1>(p)[1] = force1.get(1);
132 vd.template getProp<1>(p)[2] = force1.get(2);
133}
134
135template<typename CellList_type, typename vector_type>
136bool check_force(CellList_type & NN_cpu, vector_type & vd)
137{
138 typedef typename vector_type::stype St;
139
140 auto it6 = vd.getDomainIterator();
141
142 bool match = true;
143
144 while (it6.isNext())
145 {
146 auto p = it6.get();
147
148 Point<3,St> xp = vd.getPos(p);
149
150 // Calculate on CPU
151
152 Point<3,St> force({0.0,0.0,0.0});
153
154 auto NNc = NN_cpu.getNNIterator(NN_cpu.getCell(xp));
155
156 while (NNc.isNext())
157 {
158 auto q = NNc.get();
159
160 if (q == p.getKey()) {++NNc; continue;}
161
162 Point<3,St> xq_2 = vd.getPos(q);
163 Point<3,St> r2 = xq_2 - xp;
164
165 // Normalize
166
167 if (r2.norm() > 1e-6)
168 {
169 r2 /= r2.norm();
170 force += vd.template getProp<0>(q)*r2;
171 }
172
173 ++NNc;
174 }
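 // the GPU kernels accumulate the force contributions in a different order
 // than this CPU loop, so compare with an absolute tolerance rather than
 // exact equality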
175
176 match &= fabs(vd.template getProp<1>(p)[0] - vd.template getProp<2>(p)[0]) < 0.0003;
177 match &= fabs(vd.template getProp<1>(p)[1] - vd.template getProp<2>(p)[1]) < 0.0003;
178 match &= fabs(vd.template getProp<1>(p)[2] - vd.template getProp<2>(p)[2]) < 0.0003;
179
180 match &= fabs(vd.template getProp<1>(p)[0] - force.get(0)) < 0.0003;
181 match &= fabs(vd.template getProp<1>(p)[1] - force.get(1)) < 0.0003;
182 match &= fabs(vd.template getProp<1>(p)[2] - force.get(2)) < 0.0003;
183
184 if (match == false)
185 {
186 std::cout << "ERROR: " << vd.template getProp<1>(p)[0] << " " << vd.template getProp<2>(p)[0] << std::endl;
187 std::cout << "ERROR: " << vd.template getProp<1>(p)[1] << " " << vd.template getProp<2>(p)[1] << std::endl;
188 std::cout << "ERROR: " << vd.template getProp<1>(p)[2] << " " << vd.template getProp<2>(p)[2] << std::endl;
189
190 std::cout << p.getKey() << " ERROR2: " << vd.template getProp<1>(p)[0] << " " << force.get(0) << std::endl;
191 std::cout << p.getKey() << " ERROR2: " << vd.template getProp<1>(p)[1] << " " << force.get(1) << std::endl;
192 std::cout << p.getKey() << " ERROR2: " << vd.template getProp<1>(p)[2] << " " << force.get(2) << std::endl;
193
194
195 break;
196 }
197
198 ++it6;
199 }
200
201 return match;
202}
203
204BOOST_AUTO_TEST_CASE( vector_dist_gpu_ghost_get )
205{
206 auto & v_cl = create_vcluster();
207
208 if (v_cl.size() > 16)
209 {return;}
210
211 Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
212
 213 // set the ghost based on the cut-off radius (made just a little bit smaller than the spacing)
214 Ghost<3,float> g(0.1);
215
216 // Boundary conditions
217 size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
218
 219 vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(1000,domain,bc,g); // type inferred from the property usage below; particle count assumed
 220
221 auto it = vd.getDomainIterator();
222
223 while (it.isNext())
224 {
225 auto p = it.get();
226
227 vd.getPos(p)[0] = (float)rand() / (float)RAND_MAX;
228 vd.getPos(p)[1] = (float)rand() / (float)RAND_MAX;
229 vd.getPos(p)[2] = (float)rand() / (float)RAND_MAX;
230
231 vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
232
233 vd.template getProp<1>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
234 vd.template getProp<1>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
235 vd.template getProp<1>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];
236
237 vd.template getProp<2>(p)[0] = vd.getPos(p)[0] + 3.0*vd.getPos(p)[1];
238 vd.template getProp<2>(p)[1] = vd.getPos(p)[0] + 3.0*vd.getPos(p)[2];
239 vd.template getProp<2>(p)[2] = vd.getPos(p)[1] + 3.0*vd.getPos(p)[2];
240
241
242 ++it;
243 }
244
245 // Ok we redistribute the particles (CPU based)
246 vd.map();
247
248 vd.template ghost_get<0,1,2>();
249
 250 // Now we check that the ghost contains the correct information
251
252 bool check = true;
253
254 auto itg = vd.getDomainAndGhostIterator();
255
256 while (itg.isNext())
257 {
258 auto p = itg.get();
259
260 check &= (vd.template getProp<0>(p) == vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2]);
261
262 check &= (vd.template getProp<1>(p)[0] == vd.getPos(p)[0] + vd.getPos(p)[1]);
263 check &= (vd.template getProp<1>(p)[1] == vd.getPos(p)[0] + vd.getPos(p)[2]);
264 check &= (vd.template getProp<1>(p)[2] == vd.getPos(p)[1] + vd.getPos(p)[2]);
265
266 check &= (vd.template getProp<2>(p)[0] == vd.getPos(p)[0] + 3.0*vd.getPos(p)[1]);
267 check &= (vd.template getProp<2>(p)[1] == vd.getPos(p)[0] + 3.0*vd.getPos(p)[2]);
268 check &= (vd.template getProp<2>(p)[2] == vd.getPos(p)[1] + 3.0*vd.getPos(p)[2]);
269
270 ++itg;
271 }
272
273 size_t tot_s = vd.size_local_with_ghost();
274
275 v_cl.sum(tot_s);
276 v_cl.execute();
277
 278 // Sanity check: make sure the test actually processed a meaningful number of particles
279 BOOST_REQUIRE(tot_s > 1000);
280}
281
282template<typename vector_type, typename CellList_type, typename CellList_type_cpu>
283void check_cell_list_cpu_and_gpu(vector_type & vd, CellList_type & NN, CellList_type_cpu & NN_cpu)
284{
285 const auto it5 = vd.getDomainIteratorGPU(32);
286
287 CUDA_LAUNCH((calculate_force<typename vector_type::stype,decltype(NN.toKernel())>),it5,vd.toKernel(),vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
288
289 vd.template deviceToHostProp<1,2>();
290
291 bool test = check_force(NN_cpu,vd);
292 BOOST_REQUIRE_EQUAL(test,true);
293
 294 // We reset property 1 on the host and copy it back to the device
295
296 auto rst = vd.getDomainIterator();
297
298 while (rst.isNext())
299 {
300 auto p = rst.get();
301
302 vd.template getProp<1>(p)[0] = 0.0;
303 vd.template getProp<1>(p)[1] = 0.0;
304 vd.template getProp<1>(p)[2] = 0.0;
305
306 ++rst;
307 }
308
309 vd.template hostToDeviceProp<1>();
310
311 // We do exactly the same test as before, but now we completely use the sorted version
312
313 CUDA_LAUNCH((calculate_force_full_sort<typename vector_type::stype,decltype(NN.toKernel())>),it5,vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
314
315 vd.template merge_sort<1>(NN);
316 vd.template deviceToHostProp<1>();
317
318 test = check_force(NN_cpu,vd);
319 BOOST_REQUIRE_EQUAL(test,true);
320}
321
322template<typename CellList_type>
323void vector_dist_gpu_test_impl()
324{
325 auto & v_cl = create_vcluster();
326
327 if (v_cl.size() > 16)
328 {return;}
329
330 Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
331
 332 // set the ghost based on the cut-off radius (made just a little bit smaller than the spacing)
333 Ghost<3,float> g(0.1);
334
335 // Boundary conditions
336 size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
337
 338 vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(10000,domain,bc,g); // type inferred from initialize_props; count matches the size_l check below
 339
340 srand(55067*create_vcluster().rank());
341
342 auto it = vd.getDomainIterator();
343
344 while (it.isNext())
345 {
346 auto p = it.get();
347
348 int x = rand();
349 int y = rand();
350 int z = rand();
351
352 vd.getPos(p)[0] = (float)x / (float)RAND_MAX;
353 vd.getPos(p)[1] = (float)y / (float)RAND_MAX;
354 vd.getPos(p)[2] = (float)z / (float)RAND_MAX;
355
356 Point<3,float> xp = vd.getPos(p);
357
358 ++it;
359 }
360
361 // Ok we redistribute the particles (CPU based)
362 vd.map();
363
364 size_t size_l = vd.size_local();
365
366 v_cl.sum(size_l);
367 v_cl.execute();
368
369 BOOST_REQUIRE_EQUAL(size_l,10000);
370
371
372 auto & ct = vd.getDecomposition();
373
374 bool noOut = true;
375 size_t cnt = 0;
376
377 auto it2 = vd.getDomainIterator();
378
379 while (it2.isNext())
380 {
381 auto p = it2.get();
382
383 noOut &= ct.isLocal(vd.getPos(p));
384
385 cnt++;
386 ++it2;
387 }
388
389 BOOST_REQUIRE_EQUAL(noOut,true);
390 BOOST_REQUIRE_EQUAL(cnt,vd.size_local());
391
 392 // now we offload the positions and initialize the properties on the device
393
394 const auto it3 = vd.getDomainIteratorGPU();
395
396 // offload to device
397 vd.hostToDevicePos();
398
399 CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel());
400
401 // now we check what we initialized
402
403 vd.deviceToHostProp<0,1>();
404
405 auto it4 = vd.getDomainIterator();
406
407 while (it4.isNext())
408 {
409 auto p = it4.get();
410
411 BOOST_REQUIRE_CLOSE(vd.template getProp<0>(p),vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2],0.01);
412
413 BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[0],vd.getPos(p)[0] + vd.getPos(p)[1],0.01);
414 BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[1],vd.getPos(p)[0] + vd.getPos(p)[2],0.01);
415 BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[2],vd.getPos(p)[1] + vd.getPos(p)[2],0.01);
416
417 ++it4;
418 }
419
420 // here we do a ghost_get
421 vd.ghost_get<0>();
422
 423 // Double ghost_get to check that a repeated call does not crash
424 vd.ghost_get<0>();
425
426 // we re-offload what we received
427 vd.hostToDevicePos();
428 vd.template hostToDeviceProp<0>();
429
430 auto NN = vd.template getCellListGPU<CellList_type>(0.1);
431 auto NN_cpu = vd.getCellList(0.1);
432 check_cell_list_cpu_and_gpu(vd,NN,NN_cpu);
433
434 auto NN_up = vd.template getCellListGPU<CellList_type>(0.1);
435
436 NN_up.clear();
437 vd.updateCellList(NN_up);
438 check_cell_list_cpu_and_gpu(vd,NN_up,NN_cpu);
439}
440
441template<typename CellList_type>
442void vector_dist_gpu_make_sort_test_impl()
443{
444 auto & v_cl = create_vcluster();
445
446 if (v_cl.size() > 16)
447 {return;}
448
449 Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
450
 451 // set the ghost based on the cut-off radius (made just a little bit smaller than the spacing)
452 Ghost<3,float> g(0.1);
453
454 // Boundary conditions
455 size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
456
 457 vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(10000,domain,bc,g); // type inferred from initialize_props; particle count assumed
 458
459 srand(55067*create_vcluster().rank());
460
461 auto it = vd.getDomainIterator();
462
463 while (it.isNext())
464 {
465 auto p = it.get();
466
467 int x = rand();
468 int y = rand();
469 int z = rand();
470
471 vd.getPos(p)[0] = (float)x / (float)RAND_MAX;
472 vd.getPos(p)[1] = (float)y / (float)RAND_MAX;
473 vd.getPos(p)[2] = (float)z / (float)RAND_MAX;
474
475 ++it;
476 }
477
478 vd.hostToDevicePos();
479
480 // Ok we redistribute the particles
481 vd.map(RUN_ON_DEVICE);
482
483 auto it3 = vd.getDomainIteratorGPU();
484
485 CUDA_LAUNCH_DIM3(initialize_props,it3.wthr,it3.thr,vd.toKernel());
486
 487 // Here we check that make_sort does not mess up the particles; we use a cell-list
 488 // to verify that the two cell-lists constructed before and after are identical
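 // make_sort reorders the particle buffers in place into cell-list order:
 // particle indices change, but the cell each particle falls into must not,
 // which is what comparing the two CPU cell-lists verifies.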
489
490 vd.deviceToHostPos();
491
492 auto NN_cpu1 = vd.getCellList(0.1);
493 auto NN = vd.template getCellListGPU<CellList_type>(0.1);
494 vd.make_sort(NN);
495
496 vd.deviceToHostPos();
497
498 auto NN_cpu2 = vd.getCellList(0.1);
499
500 // here we compare the two cell-lists
501
502 bool match = true;
503 for (size_t i = 0 ; i < NN_cpu1.getNCells() ; i++)
504 {
505 match &= NN_cpu1.getNelements(i) == NN_cpu2.getNelements(i);
506 }
507
508 BOOST_REQUIRE_EQUAL(match,true);
509
 510 // In this second step we check that we can use make_sort_from to sort the particles even
 511 // when the ghost is filled
512
 513 // Here we do a make_sort
514 NN = vd.template getCellListGPU<CellList_type>(0.1);
515 vd.make_sort(NN);
516
 517 // keep a copy of the reordered positions to compare after make_sort_from
 518 auto tmp_pos = vd.getPosVector(); // assumed form of the elided declaration
 519
520 vd.deviceToHostPos();
521 tmp_pos.template deviceToHost<0>();
522
523 // here we do a ghost_get
524 vd.ghost_get<0>(RUN_ON_DEVICE);
525
 526 // Here we build a new cell-list and do a make_sort_from
527 NN = vd.template getCellListGPU<CellList_type>(0.1);
528
529 CUDA_CHECK()
530
531 vd.make_sort_from(NN);
532
533 // Check
534
535 tmp_pos.deviceToHost<0>();
536 vd.deviceToHostPos();
537
538 match = true;
539 for (size_t i = 0 ; i < vd.size_local() ; i++)
540 {
541 Point<3,float> p1 = vd.getPos(i);
542 Point<3,float> p2 = tmp_pos.template get<0>(i);
543
544 // They must be in the same cell
545 auto c1 = NN.getCell(p1);
 546 auto c2 = NN.getCell(p2);
547
548 match &= c1 == c2;
549 }
550
551 BOOST_REQUIRE_EQUAL(match,true);
552}
553
554
555BOOST_AUTO_TEST_CASE(vector_dist_gpu_make_sort_sparse)
556{
557 vector_dist_gpu_make_sort_test_impl<CELLLIST_GPU_SPARSE<3,float>>();
558}
559
560BOOST_AUTO_TEST_CASE(vector_dist_gpu_make_sort)
561{
562 vector_dist_gpu_make_sort_test_impl<CellList_gpu<3,float,CudaMemory,shift_only<3, float>>>();
563}
564
565BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
566{
567 vector_dist_gpu_test_impl<CellList_gpu<3,float,CudaMemory,shift_only<3, float>>>();
568}
569
570BOOST_AUTO_TEST_CASE( vector_dist_gpu_test_sparse)
571{
572 vector_dist_gpu_test_impl<CELLLIST_GPU_SPARSE<3,float>>();
573}
574
575template<typename St>
576void vdist_calc_gpu_test()
577{
578 auto & v_cl = create_vcluster();
579
580 if (v_cl.size() > 16)
581 {return;}
582
583 Box<3,St> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
584
 585 // set the ghost based on the cut-off radius (made just a little bit smaller than the spacing)
586 Ghost<3,St> g(0.1);
587
588 // Boundary conditions
589 size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
590
 591 vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vd(1000,domain,bc,g); // type inferred from the property usage below; particle count assumed
 592
594
596
598
599 srand(v_cl.rank()*10000);
600 auto it = vd.getDomainIterator();
601
602 while (it.isNext())
603 {
604 auto p = it.get();
605
606 vd.getPos(p)[0] = (St)rand() / (float)RAND_MAX;
607 vd.getPos(p)[1] = (St)rand() / (float)RAND_MAX;
608 vd.getPos(p)[2] = (St)rand() / (float)RAND_MAX;
609
610 vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
611
612 vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
613 vd.template getProp<1>(p)[1] = vd.getPos(p)[1];
614 vd.template getProp<1>(p)[2] = vd.getPos(p)[2];
615
616 vd.template getProp<2>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
617 vd.template getProp<2>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
618 vd.template getProp<2>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];
619
620 ++it;
621 }
622
623 // move on device
624 vd.hostToDevicePos();
625 vd.template hostToDeviceProp<0,1,2>();
626
627 // Ok we redistribute the particles (GPU based)
628 vd.map(RUN_ON_DEVICE);
629
631
632 vd.deviceToHostPos();
633 vd.template deviceToHostProp<0,1,2>();
634
635 // Reset the host part
636
637 auto it3 = vd.getDomainIterator();
638
639 while (it3.isNext())
640 {
641 auto p = it3.get();
642
643 vd.getPos(p)[0] = 1.0;
644 vd.getPos(p)[1] = 1.0;
645 vd.getPos(p)[2] = 1.0;
646
647 vd.template getProp<0>(p) = 0.0;
648
 649 vd.template getProp<1>(p)[0] = 0.0;
 650 vd.template getProp<1>(p)[1] = 0.0;
 651 vd.template getProp<1>(p)[2] = 0.0;
 652
 653 vd.template getProp<2>(p)[0] = 0.0;
 654 vd.template getProp<2>(p)[1] = 0.0;
 655 vd.template getProp<2>(p)[2] = 0.0;
656
657 ++it3;
658 }
659
660 // we move from Device to CPU
661
662 vd.deviceToHostPos();
663 vd.template deviceToHostProp<0,1,2>();
664
665 // Check
666
667 auto it2 = vd.getDomainIterator();
668
669 bool match = true;
670 while (it2.isNext())
671 {
672 auto p = it2.get();
673
674 match &= vd.template getProp<0>(p) == vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
675
676 match &= vd.template getProp<1>(p)[0] == vd.getPos(p)[0];
677 match &= vd.template getProp<1>(p)[1] == vd.getPos(p)[1];
678 match &= vd.template getProp<1>(p)[2] == vd.getPos(p)[2];
679
680 match &= vd.template getProp<2>(p)[0] == vd.getPos(p)[0] + vd.getPos(p)[1];
681 match &= vd.template getProp<2>(p)[1] == vd.getPos(p)[0] + vd.getPos(p)[2];
682 match &= vd.template getProp<2>(p)[2] == vd.getPos(p)[1] + vd.getPos(p)[2];
683
684 ++it2;
685 }
686
687 BOOST_REQUIRE_EQUAL(match,true);
688
689 // count local particles
690
691 size_t l_cnt = 0;
692 size_t nl_cnt = 0;
693 size_t n_out = 0;
694
695 // Domain + ghost box
696 Box<3,St> dom_ext = domain;
697 dom_ext.enlarge(g);
698
699 auto it5 = vd.getDomainIterator();
700 count_local_n_local<3>(vd,it5,bc,domain,dom_ext,l_cnt,nl_cnt,n_out);
701
702 BOOST_REQUIRE_EQUAL(n_out,0);
703 BOOST_REQUIRE_EQUAL(l_cnt,vd.size_local());
704
 705 // we do 10 gpu steps (using a cpu vector to check that map and ghost_get work as expected)
706
707 for (size_t i = 0 ; i < 10 ; i++)
708 {
709 vd.map(RUN_ON_DEVICE);
710
711 vd.deviceToHostPos();
712 vd.template deviceToHostProp<0,1,2>();
713
 714 // To test, we copy the particles into a CPU distributed vector and run the same ghost_get there
715
716 vector_dist<3,St,aggregate<St,St[3],St[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);
717
718 auto itc = vd.getDomainIterator();
719
720 while (itc.isNext())
721 {
722 auto p = itc.get();
723
724 vd_cpu.add();
725
726 vd_cpu.getLastPos()[0] = vd.getPos(p)[0];
727 vd_cpu.getLastPos()[1] = vd.getPos(p)[1];
728 vd_cpu.getLastPos()[2] = vd.getPos(p)[2];
729
730 vd_cpu.template getLastProp<0>() = vd.template getProp<0>(p);
731
732 vd_cpu.template getLastProp<1>()[0] = vd.template getProp<1>(p)[0];
733 vd_cpu.template getLastProp<1>()[1] = vd.template getProp<1>(p)[1];
734 vd_cpu.template getLastProp<1>()[2] = vd.template getProp<1>(p)[2];
735
736 vd_cpu.template getLastProp<2>()[0] = vd.template getProp<2>(p)[0];
737 vd_cpu.template getLastProp<2>()[1] = vd.template getProp<2>(p)[1];
738 vd_cpu.template getLastProp<2>()[2] = vd.template getProp<2>(p)[2];
739
740 ++itc;
741 }
742
743 vd_cpu.template ghost_get<0,1,2>();
744
746
747 vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);
748
750
751 vd.deviceToHostPos();
752 vd.template deviceToHostProp<0,1,2>();
753
754 match = true;
755
 756 // Particles in the GPU ghost and the CPU ghost are not ordered the same way, so we reorder before comparing
757
758 struct part
759 {
760 Point<3,St> xp;
761
762
763 St prp0;
764 St prp1[3];
765 St prp2[3];
766
767 bool operator<(const part & tmp) const
768 {
769 if (xp.get(0) < tmp.xp.get(0))
770 {return true;}
771 else if (xp.get(0) > tmp.xp.get(0))
772 {return false;}
773
774 if (xp.get(1) < tmp.xp.get(1))
775 {return true;}
776 else if (xp.get(1) > tmp.xp.get(1))
777 {return false;}
778
779 if (xp.get(2) < tmp.xp.get(2))
780 {return true;}
781 else if (xp.get(2) > tmp.xp.get(2))
782 {return false;}
783
784 return false;
785 }
786 };
787
788 openfpm::vector<part> cpu_sort;
789 openfpm::vector<part> gpu_sort;
790
791 cpu_sort.resize(vd_cpu.size_local_with_ghost() - vd_cpu.size_local());
792 gpu_sort.resize(vd.size_local_with_ghost() - vd.size_local());
793
794 BOOST_REQUIRE_EQUAL(cpu_sort.size(),gpu_sort.size());
795
796 size_t cnt = 0;
797
798 auto itc2 = vd.getGhostIterator();
799 while (itc2.isNext())
800 {
801 auto p = itc2.get();
802
803 cpu_sort.get(cnt).xp.get(0) = vd_cpu.getPos(p)[0];
804 gpu_sort.get(cnt).xp.get(0) = vd.getPos(p)[0];
805 cpu_sort.get(cnt).xp.get(1) = vd_cpu.getPos(p)[1];
806 gpu_sort.get(cnt).xp.get(1) = vd.getPos(p)[1];
807 cpu_sort.get(cnt).xp.get(2) = vd_cpu.getPos(p)[2];
808 gpu_sort.get(cnt).xp.get(2) = vd.getPos(p)[2];
809
810 cpu_sort.get(cnt).prp0 = vd_cpu.template getProp<0>(p);
811 gpu_sort.get(cnt).prp0 = vd.template getProp<0>(p);
812
813 cpu_sort.get(cnt).prp1[0] = vd_cpu.template getProp<1>(p)[0];
814 gpu_sort.get(cnt).prp1[0] = vd.template getProp<1>(p)[0];
815 cpu_sort.get(cnt).prp1[1] = vd_cpu.template getProp<1>(p)[1];
816 gpu_sort.get(cnt).prp1[1] = vd.template getProp<1>(p)[1];
817 cpu_sort.get(cnt).prp1[2] = vd_cpu.template getProp<1>(p)[2];
818 gpu_sort.get(cnt).prp1[2] = vd.template getProp<1>(p)[2];
819
820 cpu_sort.get(cnt).prp2[0] = vd_cpu.template getProp<2>(p)[0];
821 gpu_sort.get(cnt).prp2[0] = vd.template getProp<2>(p)[0];
822 cpu_sort.get(cnt).prp2[1] = vd_cpu.template getProp<2>(p)[1];
823 gpu_sort.get(cnt).prp2[1] = vd.template getProp<2>(p)[1];
824 cpu_sort.get(cnt).prp2[2] = vd_cpu.template getProp<2>(p)[2];
825 gpu_sort.get(cnt).prp2[2] = vd.template getProp<2>(p)[2];
826
827 ++cnt;
828 ++itc2;
829 }
830
831 cpu_sort.sort();
832 gpu_sort.sort();
833
834 for (size_t i = 0 ; i < cpu_sort.size() ; i++)
835 {
836 match &= cpu_sort.get(i).xp.get(0) == gpu_sort.get(i).xp.get(0);
837 match &= cpu_sort.get(i).xp.get(1) == gpu_sort.get(i).xp.get(1);
838 match &= cpu_sort.get(i).xp.get(2) == gpu_sort.get(i).xp.get(2);
839
840 match &= cpu_sort.get(i).prp0 == gpu_sort.get(i).prp0;
841 match &= cpu_sort.get(i).prp1[0] == gpu_sort.get(i).prp1[0];
842 match &= cpu_sort.get(i).prp1[1] == gpu_sort.get(i).prp1[1];
843 match &= cpu_sort.get(i).prp1[2] == gpu_sort.get(i).prp1[2];
844
845 match &= cpu_sort.get(i).prp2[0] == gpu_sort.get(i).prp2[0];
846 match &= cpu_sort.get(i).prp2[1] == gpu_sort.get(i).prp2[1];
847 match &= cpu_sort.get(i).prp2[2] == gpu_sort.get(i).prp2[2];
848 }
849
850 BOOST_REQUIRE_EQUAL(match,true);
851
852 // move particles on gpu
853
854 auto ite = vd.getDomainIteratorGPU();
855 CUDA_LAUNCH_DIM3((move_parts_gpu_test<3,decltype(vd.toKernel())>),ite.wthr,ite.thr,vd.toKernel());
856 }
857}
858
859BOOST_AUTO_TEST_CASE( vector_dist_map_on_gpu_test)
860{
861 vdist_calc_gpu_test<float>();
862 vdist_calc_gpu_test<double>();
863}
864
865BOOST_AUTO_TEST_CASE(vector_dist_reduce)
866{
867 auto & v_cl = create_vcluster();
868
869 if (v_cl.size() > 16)
870 {return;}
871
872 Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
873
 874 // set the ghost based on the cut-off radius (made just a little bit smaller than the spacing)
875 Ghost<3,float> g(0.1);
876
877 // Boundary conditions
878 size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
879
880 vector_dist_gpu<3,float,aggregate<float,double,int,size_t>> vd(5000*v_cl.size(),domain,bc,g);
881
882 auto it = vd.getDomainIterator();
883
884 float fc = 1.0;
885 double dc = 1.0;
886 int ic = 1.0;
887 size_t sc = 1.0;
888
889 while(it.isNext())
890 {
891 auto p = it.get();
892
893 vd.template getProp<0>(p) = fc;
894 vd.template getProp<1>(p) = dc;
895 vd.template getProp<2>(p) = ic;
896 vd.template getProp<3>(p) = sc;
897
898 fc += 1.0;
899 dc += 1.0;
900 ic += 1;
901 sc += 1;
902
903 ++it;
904 }
905
906 vd.template hostToDeviceProp<0,1,2,3>();
907
908 float redf = reduce_local<0,_add_>(vd);
909 double redd = reduce_local<1,_add_>(vd);
910 int redi = reduce_local<2,_add_>(vd);
911 size_t reds = reduce_local<3,_add_>(vd);
912
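 // properties were filled with 1,2,...,n, so the _add_ reduction must give
 // the Gauss sum n*(n+1)/2 with n = size_local(), and the _max_ reduction n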
913 BOOST_REQUIRE_EQUAL(redf,(vd.size_local()+1.0)*(vd.size_local())/2.0);
914 BOOST_REQUIRE_EQUAL(redd,(vd.size_local()+1.0)*(vd.size_local())/2.0);
915 BOOST_REQUIRE_EQUAL(redi,(vd.size_local()+1)*(vd.size_local())/2);
916 BOOST_REQUIRE_EQUAL(reds,(vd.size_local()+1)*(vd.size_local())/2);
917
918 float redf2 = reduce_local<0,_max_>(vd);
919 double redd2 = reduce_local<1,_max_>(vd);
920 int redi2 = reduce_local<2,_max_>(vd);
921 size_t reds2 = reduce_local<3,_max_>(vd);
922
923 BOOST_REQUIRE_EQUAL(redf2,vd.size_local());
924 BOOST_REQUIRE_EQUAL(redd2,vd.size_local());
925 BOOST_REQUIRE_EQUAL(redi2,vd.size_local());
926 BOOST_REQUIRE_EQUAL(reds2,vd.size_local());
927}
928
929template<typename CellList_type>
930void vector_dist_dlb_on_cuda_impl(size_t k,double r_cut)
931{
932 std::random_device r;
933
934 std::seed_seq seed2{/*r() +*/ create_vcluster().rank(),
935 /*r() +*/ create_vcluster().rank(),
936 /*r() +*/ create_vcluster().rank(),
937 /*r() +*/ create_vcluster().rank(),
938 /*r() +*/ create_vcluster().rank(),
939 /*r() +*/ create_vcluster().rank(),
940 /*r() +*/ create_vcluster().rank(),
941 /*r() +*/ create_vcluster().rank()};
942 std::mt19937 e2(seed2);
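 // r() is deliberately commented out: seeding only with the rank keeps the
 // particle configuration reproducible across runs while still being
 // different on every process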
943
 944 typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3]>> vector_type; // inferred from the check_cell_list_cpu_and_gpu calls below
 945
946 Vcluster<> & v_cl = create_vcluster();
947
948 if (v_cl.getProcessingUnits() > 8)
949 return;
950
951 std::uniform_real_distribution<double> unif(0.0,0.3);
952
953 Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
954 Ghost<3,double> g(0.1);
955 size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
956
957 vector_type vd(0,domain,bc,g,DEC_GRAN(2048));
958
 959 // Only processor 0 initially adds particles in a corner of the domain
960
961 if (v_cl.getProcessUnitID() == 0)
962 {
963 for(size_t i = 0 ; i < k ; i++)
964 {
965 vd.add();
966
967 vd.getLastPos()[0] = unif(e2);
968 vd.getLastPos()[1] = unif(e2);
969 vd.getLastPos()[2] = unif(e2);
970 }
971 }
972
973 // Move to GPU
974 vd.hostToDevicePos();
975 vd.template hostToDeviceProp<0>();
976
977 vd.map(RUN_ON_DEVICE);
978 vd.template ghost_get<>(RUN_ON_DEVICE);
979
980 // now move to CPU
981
982 vd.deviceToHostPos();
983 vd.template deviceToHostProp<0>();
984
 985 // Get the neighborhood of each particle
986
987 auto VV = vd.getVerlet(r_cut);
988
 989 // store the number of neighbors of each particle
990
991 auto it = vd.getDomainIterator();
992
993 while (it.isNext())
994 {
995 auto p = it.get();
996
997 vd.template getProp<0>(p) = VV.getNNPart(p.getKey());
998
999 ++it;
1000 }
1001
1002 // Move to GPU
1003 vd.template hostToDeviceProp<0>();
1004
1005 ModelSquare md;
1006 md.factor = 10;
1007 vd.addComputationCosts(md);
1008 vd.getDecomposition().decompose();
1009 vd.map(RUN_ON_DEVICE);
1010
1011 vd.deviceToHostPos();
 1012 // Move info to CPU for addComputationCosts
1013
1014 vd.addComputationCosts(md);
1015
 1016 openfpm::vector<size_t> loads;
 1017 size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
1018 v_cl.allGather(load,loads);
1019 v_cl.execute();
1020
1021 for (size_t i = 0 ; i < loads.size() ; i++)
1022 {
1023 double load_f = load;
1024 double load_fc = loads.get(i);
1025
1026 BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
1027 }
1028
1029 BOOST_REQUIRE(vd.size_local() != 0);
1030
1031 Point<3,double> v({1.0,1.0,1.0});
1032
1033 for (size_t i = 0 ; i < 25 ; i++)
1034 {
 1035 // move the particles to the CPU and shift them by 0.09 along each axis
1036
1037 vd.deviceToHostPos();
1038
1039 auto it = vd.getDomainIterator();
1040
1041 while (it.isNext())
1042 {
1043 auto p = it.get();
1044
1045 vd.getPos(p)[0] += v.get(0) * 0.09;
1046 vd.getPos(p)[1] += v.get(1) * 0.09;
1047 vd.getPos(p)[2] += v.get(2) * 0.09;
1048
1049 ++it;
1050 }
1051
1052 //Back to GPU
1053 vd.hostToDevicePos();
1054 vd.map(RUN_ON_DEVICE);
1055 vd.template ghost_get<0>(RUN_ON_DEVICE);
1056
1057 vd.deviceToHostPos();
1058 vd.template deviceToHostProp<0,1,2>();
1059
1060 // Check calc forces
1061 auto NN_gpu = vd.template getCellListGPU<CellList_type>(r_cut);
1062 auto NN_cpu = vd.getCellList(r_cut);
1063 check_cell_list_cpu_and_gpu(vd,NN_gpu,NN_cpu);
1064
1065 auto VV2 = vd.getVerlet(r_cut);
1066
1067 auto it2 = vd.getDomainIterator();
1068
1069 bool match = true;
1070 while (it2.isNext())
1071 {
1072 auto p = it2.get();
1073
1074 match &= vd.template getProp<0>(p) == VV2.getNNPart(p.getKey());
1075
1076 ++it2;
1077 }
1078
1079 BOOST_REQUIRE_EQUAL(match,true);
1080
1081 ModelSquare md;
1082 vd.addComputationCosts(md);
1083 vd.getDecomposition().redecompose(200);
1084 vd.map(RUN_ON_DEVICE);
1085
1086 BOOST_REQUIRE(vd.size_local() != 0);
1087
1088 vd.template ghost_get<0>(RUN_ON_DEVICE);
1089 vd.deviceToHostPos();
1090 vd.template deviceToHostProp<0>();
1091
1092 vd.addComputationCosts(md);
1093
 1094 openfpm::vector<size_t> loads;
 1095 size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
1096 v_cl.allGather(load,loads);
1097 v_cl.execute();
1098
1099 for (size_t i = 0 ; i < loads.size() ; i++)
1100 {
1101 double load_f = load;
1102 double load_fc = loads.get(i);
1103
1104#ifdef ENABLE_ASAN
1105 BOOST_REQUIRE_CLOSE(load_f,load_fc,30.0);
1106#else
1107 BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
1108#endif
1109 }
1110 }
1111}
1112
1113template<typename CellList_type>
1114void vector_dist_dlb_on_cuda_impl_async(size_t k,double r_cut)
1115{
1116 std::random_device r;
1117
1118 std::seed_seq seed2{r() + create_vcluster().rank(),
1119 r() + create_vcluster().rank(),
1120 r() + create_vcluster().rank(),
1121 r() + create_vcluster().rank(),
1122 r() + create_vcluster().rank(),
1123 r() + create_vcluster().rank(),
1124 r() + create_vcluster().rank(),
1125 r() + create_vcluster().rank()};
1126 std::mt19937 e2(seed2);
1127
 1128 typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3]>> vector_type; // inferred from the check_cell_list_cpu_and_gpu calls below
 1129
1130 Vcluster<> & v_cl = create_vcluster();
1131
1132 if (v_cl.getProcessingUnits() > 8)
1133 return;
1134
1135 std::uniform_real_distribution<double> unif(0.0,0.3);
1136
1137 Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1138 Ghost<3,double> g(0.1);
1139 size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1140
1141 vector_type vd(0,domain,bc,g,DEC_GRAN(2048));
1142
 1143 // Only processor 0 initially adds particles in a corner of the domain
1144
1145 if (v_cl.getProcessUnitID() == 0)
1146 {
1147 for(size_t i = 0 ; i < k ; i++)
1148 {
1149 vd.add();
1150
1151 vd.getLastPos()[0] = unif(e2);
1152 vd.getLastPos()[1] = unif(e2);
1153 vd.getLastPos()[2] = unif(e2);
1154 }
1155 }
1156
1157 // Move to GPU
1158 vd.hostToDevicePos();
1159 vd.template hostToDeviceProp<0>();
1160
1161 vd.map(RUN_ON_DEVICE);
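 // asynchronous ghost exchange: Ighost_get starts the communication and
 // ghost_wait blocks until it has completed (the two calls together are
 // equivalent to a plain ghost_get)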
1162 vd.template Ighost_get<>(RUN_ON_DEVICE);
1163 vd.template ghost_wait<>(RUN_ON_DEVICE);
1164
1165 // now move to CPU
1166
1167 vd.deviceToHostPos();
1168 vd.template deviceToHostProp<0>();
1169
 1170 // Get the neighborhood of each particle
1171
1172 auto VV = vd.getVerlet(r_cut);
1173
 1174 // store the number of neighbors of each particle
1175
1176 auto it = vd.getDomainIterator();
1177
1178 while (it.isNext())
1179 {
1180 auto p = it.get();
1181
1182 vd.template getProp<0>(p) = VV.getNNPart(p.getKey());
1183
1184 ++it;
1185 }
1186
1187 // Move to GPU
1188 vd.template hostToDeviceProp<0>();
1189
1190 ModelSquare md;
1191 md.factor = 10;
1192 vd.addComputationCosts(md);
1193 vd.getDecomposition().decompose();
1194 vd.map(RUN_ON_DEVICE);
1195
1196 vd.deviceToHostPos();
 1197 // Move info to CPU for addComputationCosts
1198
1199 vd.addComputationCosts(md);
1200
 1201 openfpm::vector<size_t> loads;
 1202 size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
1203 v_cl.allGather(load,loads);
1204 v_cl.execute();
1205
1206 for (size_t i = 0 ; i < loads.size() ; i++)
1207 {
1208 double load_f = load;
1209 double load_fc = loads.get(i);
1210
1211 BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
1212 }
1213
1214 BOOST_REQUIRE(vd.size_local() != 0);
1215
1216 Point<3,double> v({1.0,1.0,1.0});
1217
1218 for (size_t i = 0 ; i < 25 ; i++)
1219 {
 1220 // move the particles to the CPU and shift them by 0.09 along each axis
1221
1222 vd.deviceToHostPos();
1223
1224 auto it = vd.getDomainIterator();
1225
1226 while (it.isNext())
1227 {
1228 auto p = it.get();
1229
1230 vd.getPos(p)[0] += v.get(0) * 0.09;
1231 vd.getPos(p)[1] += v.get(1) * 0.09;
1232 vd.getPos(p)[2] += v.get(2) * 0.09;
1233
1234 ++it;
1235 }
1236
1237 // Back to GPU
1238 vd.hostToDevicePos();
1239 vd.map(RUN_ON_DEVICE);
1240 vd.template Ighost_get<0>(RUN_ON_DEVICE);
1241 vd.template ghost_wait<0>(RUN_ON_DEVICE);
1242 vd.deviceToHostPos();
1243 vd.template deviceToHostProp<0,1,2>();
1244
1245 // Check calc forces
1246 auto NN_gpu = vd.template getCellListGPU<CellList_type>(r_cut);
1247 auto NN_cpu = vd.getCellList(r_cut);
1248 check_cell_list_cpu_and_gpu(vd,NN_gpu,NN_cpu);
1249
1250 auto VV2 = vd.getVerlet(r_cut);
1251
1252 auto it2 = vd.getDomainIterator();
1253
1254 bool match = true;
1255 while (it2.isNext())
1256 {
1257 auto p = it2.get();
1258
1259 match &= vd.template getProp<0>(p) == VV2.getNNPart(p.getKey());
1260
1261 ++it2;
1262 }
1263
1264 BOOST_REQUIRE_EQUAL(match,true);
1265
1266 ModelSquare md;
1267 vd.addComputationCosts(md);
1268 vd.getDecomposition().redecompose(200);
1269 vd.map(RUN_ON_DEVICE);
1270
1271 BOOST_REQUIRE(vd.size_local() != 0);
1272
1273// vd.template ghost_get<0>(RUN_ON_DEVICE);
1274// vd.template ghost_get<0>(RUN_ON_DEVICE);
1275 vd.template Ighost_get<0>(RUN_ON_DEVICE);
1276 vd.template ghost_wait<0>(RUN_ON_DEVICE);
1277 vd.deviceToHostPos();
1278 vd.template deviceToHostProp<0>();
1279
1280 vd.addComputationCosts(md);
1281
 1282 openfpm::vector<size_t> loads;
 1283 size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
1284 v_cl.allGather(load,loads);
1285 v_cl.execute();
1286
1287 for (size_t i = 0 ; i < loads.size() ; i++)
1288 {
1289 double load_f = load;
1290 double load_fc = loads.get(i);
1291
1292 BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
1293 }
1294 }
1295}
1296
1297BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_async)
1298{
1299 vector_dist_dlb_on_cuda_impl_async<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>>(50000,0.01);
1300}
1301
1302BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda)
1303{
1304 vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>>(50000,0.01);
1305}
1306
1307BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda_sparse)
1308{
1309 vector_dist_dlb_on_cuda_impl<CELLLIST_GPU_SPARSE<3,double>>(50000,0.01);
1310}
1311
1312BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda2)
1313{
1314 if (create_vcluster().size() <= 3)
 1315 {return;}
1316
1317 #ifndef CUDA_ON_CPU
1318 vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>>(1000000,0.01);
1319 #endif
1320}
1321
1322BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda3)
1323{
1324 if (create_vcluster().size() < 8)
1325 {return;}
1326
1327 #ifndef CUDA_ON_CPU
1328 vector_dist_dlb_on_cuda_impl<CellList_gpu<3,double,CudaMemory,shift_only<3,double>,unsigned int,int,false>>(15000000,0.005);
1329 #endif
1330}
1331
1332
1333BOOST_AUTO_TEST_CASE(vector_dist_keep_prop_on_cuda)
1334{
 1335 typedef vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vector_type; // inferred from the getProp usage below
 1336
1337 Vcluster<> & v_cl = create_vcluster();
1338
1339 if (v_cl.getProcessingUnits() > 8)
1340 return;
1341
1342 Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1343 Ghost<3,double> g(0.1);
1344 size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1345
1346 vector_type vd(0,domain,bc,g,DEC_GRAN(2048));
1347
 1348 // Only processor 0 initially adds particles in a corner of the domain
1349
1350 if (v_cl.getProcessUnitID() == 0)
1351 {
1352 for(size_t i = 0 ; i < 50000 ; i++)
1353 {
1354 vd.add();
1355
1356 vd.getLastPos()[0] = ((double)rand())/RAND_MAX * 0.3;
1357 vd.getLastPos()[1] = ((double)rand())/RAND_MAX * 0.3;
1358 vd.getLastPos()[2] = ((double)rand())/RAND_MAX * 0.3;
1359 }
1360 }
1361
1362 // Move to GPU
1363 vd.hostToDevicePos();
1364 vd.template hostToDeviceProp<0>();
1365
1366 vd.map(RUN_ON_DEVICE);
1367 vd.template ghost_get<>(RUN_ON_DEVICE);
1368
1369 // now move to CPU
1370
1371 vd.deviceToHostPos();
1372 vd.template deviceToHostProp<0>();
1373
1374
 1375 // store the number of neighbors of each particle
1376
1377 auto it = vd.getDomainIterator();
1378
1379 while (it.isNext())
1380 {
1381 auto p = it.get();
1382
1383 vd.template getProp<0>(p) = 0.0;
1384
1385 vd.template getProp<1>(p)[0] = 1000.0;
1386 vd.template getProp<1>(p)[1] = 2000.0;
1387 vd.template getProp<1>(p)[2] = 3000.0;
1388
 1389 vd.template getProp<2>(p)[0][0] = 6000.0;
1390 vd.template getProp<2>(p)[0][1] = 7000.0;
1391 vd.template getProp<2>(p)[0][2] = 8000.0;
1392 vd.template getProp<2>(p)[1][0] = 9000.0;
1393 vd.template getProp<2>(p)[1][1] = 10000.0;
1394 vd.template getProp<2>(p)[1][2] = 11000.0;
1395 vd.template getProp<2>(p)[2][0] = 12000.0;
1396 vd.template getProp<2>(p)[2][1] = 13000.0;
1397 vd.template getProp<2>(p)[2][2] = 14000.0;
1398
1399 ++it;
1400 }
1401
1402 // Move to GPU
1403 vd.template hostToDeviceProp<0,1,2>();
1404
1405 ModelSquare md;
1406 md.factor = 10;
1407 vd.addComputationCosts(md);
1408 vd.getDecomposition().decompose();
1409 vd.map(RUN_ON_DEVICE);
1410
1411 vd.deviceToHostPos();
 1412 // Move info to CPU for addComputationCosts
1413
1414 vd.addComputationCosts(md);
1415
 1416 openfpm::vector<size_t> loads;
 1417 size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
1418 v_cl.allGather(load,loads);
1419 v_cl.execute();
1420
1421 for (size_t i = 0 ; i < loads.size() ; i++)
1422 {
1423 double load_f = load;
1424 double load_fc = loads.get(i);
1425
1426 BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
1427 }
1428
1429 BOOST_REQUIRE(vd.size_local() != 0);
1430
1431 Point<3,double> v({1.0,1.0,1.0});
1432
1433 int base = 0;
1434
1435 for (size_t i = 0 ; i < 25 ; i++)
1436 {
1437 if (i % 2 == 0)
1438 {
 1439 // move the particles to the CPU and shift them by 0.09 along each axis
1440
1441 vd.deviceToHostPos();
1442
1443 auto it = vd.getDomainIterator();
1444
1445 while (it.isNext())
1446 {
1447 auto p = it.get();
1448
1449 vd.getPos(p)[0] += v.get(0) * 0.09;
1450 vd.getPos(p)[1] += v.get(1) * 0.09;
1451 vd.getPos(p)[2] += v.get(2) * 0.09;
1452
1453 ++it;
1454 }
1455
1456 //Back to GPU
1457 vd.hostToDevicePos();
1458 vd.map(RUN_ON_DEVICE);
1459 vd.template ghost_get<>(RUN_ON_DEVICE);
1460 vd.deviceToHostPos();
1461 vd.template deviceToHostProp<0,1,2>();
1462
1463 ModelSquare md;
1464 vd.addComputationCosts(md);
1465 vd.getDecomposition().redecompose(200);
1466 vd.map(RUN_ON_DEVICE);
1467
1468 BOOST_REQUIRE(vd.size_local() != 0);
1469
1470 vd.template ghost_get<0>(RUN_ON_DEVICE);
1471 vd.deviceToHostPos();
1472 vd.template deviceToHostProp<0,1,2>();
1473
1474 vd.addComputationCosts(md);
1475
 1476 openfpm::vector<size_t> loads;
 1477 size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
1478 v_cl.allGather(load,loads);
1479 v_cl.execute();
1480
1481 for (size_t i = 0 ; i < loads.size() ; i++)
1482 {
1483 double load_f = load;
1484 double load_fc = loads.get(i);
1485
1486 BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
1487 }
1488 }
1489 else
1490 {
1491 vd.template deviceToHostProp<0,1,2>();
1492
1493 auto it2 = vd.getDomainIterator();
1494
1495 bool match = true;
1496 while (it2.isNext())
1497 {
1498 auto p = it2.get();
1499
1500 vd.template getProp<0>(p) += 1;
1501
1502 vd.template getProp<1>(p)[0] += 1.0;
1503 vd.template getProp<1>(p)[1] += 1.0;
1504 vd.template getProp<1>(p)[2] += 1.0;
1505
1506 vd.template getProp<2>(p)[0][0] += 1.0;
1507 vd.template getProp<2>(p)[0][1] += 1.0;
1508 vd.template getProp<2>(p)[0][2] += 1.0;
1509 vd.template getProp<2>(p)[1][0] += 1.0;
1510 vd.template getProp<2>(p)[1][1] += 1.0;
1511 vd.template getProp<2>(p)[1][2] += 1.0;
1512 vd.template getProp<2>(p)[2][0] += 1.0;
1513 vd.template getProp<2>(p)[2][1] += 1.0;
1514 vd.template getProp<2>(p)[2][2] += 1.0;
1515
1516 ++it2;
1517 }
1518
1519 vd.template hostToDeviceProp<0,1,2>();
1520
1521 ++base;
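 // ghost_get with KEEP_PROPERTIES refreshes the ghost re-using the previously
 // computed exchange pattern (assumed semantics); the ghost loop below checks
 // that the incremented values actually arrive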
1522
1523 vd.template ghost_get<0,1,2>(RUN_ON_DEVICE | KEEP_PROPERTIES);
1524 vd.template deviceToHostProp<0,1,2>();
1525
 1526 // Check that the ghost contains the correct information
1527
1528 auto itg = vd.getGhostIterator();
1529
1530 while (itg.isNext())
1531 {
1532 auto p = itg.get();
1533
1534 match &= vd.template getProp<0>(p) == base;
1535
1536 match &= vd.template getProp<1>(p)[0] == base + 1000.0;
1537 match &= vd.template getProp<1>(p)[1] == base + 2000.0;
1538 match &= vd.template getProp<1>(p)[2] == base + 3000.0;
1539
1540 match &= vd.template getProp<2>(p)[0][0] == base + 6000.0;
1541 match &= vd.template getProp<2>(p)[0][1] == base + 7000.0;
1542 match &= vd.template getProp<2>(p)[0][2] == base + 8000.0;
1543 match &= vd.template getProp<2>(p)[1][0] == base + 9000.0;
1544 match &= vd.template getProp<2>(p)[1][1] == base + 10000.0;
1545 match &= vd.template getProp<2>(p)[1][2] == base + 11000.0;
1546 match &= vd.template getProp<2>(p)[2][0] == base + 12000.0;
1547 match &= vd.template getProp<2>(p)[2][1] == base + 13000.0;
1548 match &= vd.template getProp<2>(p)[2][2] == base + 14000.0;
1549
1550 ++itg;
1551 }
1552
1553 BOOST_REQUIRE_EQUAL(match,true);
1554 }
1555 }
1556}
1557
 1558 struct type_is_one
 1559{
1560 __device__ static bool check(int c)
1561 {
1562 return c == 1;
1563 }
1564};
1565
1566BOOST_AUTO_TEST_CASE(vector_dist_get_index_set)
1567{
1568 Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1569 Ghost<3,double> g(0.1);
1570 size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1571
1572 if (create_vcluster().size() >= 16)
1573 {return;}
1574
1575 Vcluster<> & v_cl = create_vcluster();
1576
1577 vector_dist_gpu<3,double,aggregate<int,double>> vdg(10000,domain,bc,g,DEC_GRAN(128));
1578
1579 auto it = vdg.getDomainIterator();
1580
1581 while (it.isNext())
1582 {
1583 auto p = it.get();
1584
1585 vdg.getPos(p)[0] = (double)rand() / RAND_MAX;
1586 vdg.getPos(p)[1] = (double)rand() / RAND_MAX;
1587 vdg.getPos(p)[2] = (double)rand() / RAND_MAX;
1588
1589 vdg.template getProp<0>(p) = (int)((double)rand() / RAND_MAX / 0.5);
1590
1591 vdg.template getProp<1>(p) = (double)rand() / RAND_MAX;
1592
1593 ++it;
1594 }
1595
1596 vdg.map();
1597
1598 vdg.hostToDeviceProp<0,1>();
1599 vdg.hostToDevicePos();
1600
1601 auto cl = vdg.getCellListGPU(0.1);
1602
 1603 // then we get a cell-list to force a reorder
1604
 1605 openfpm::vector_gpu<aggregate<unsigned int>> ids; // element type assumed from the get_indexes_by_type usage
 1606
1607 get_indexes_by_type<0,type_is_one>(vdg.getPropVectorSort(),ids,vdg.size_local(),v_cl.getgpuContext());
1608
1609 // test
1610
1611 ids.template deviceToHost<0>();
1612
1613 auto & vs = vdg.getPropVectorSort();
1614 vs.template deviceToHost<0>();
1615
1616 bool match = true;
1617
1618 for (int i = 0 ; i < ids.size() ; i++)
1619 {
1620 if (vs.template get<0>(ids.template get<0>(i)) != 1)
1621 {match = false;}
1622 }
1623
1624 BOOST_REQUIRE_EQUAL(match,true);
1625}
1626
1627BOOST_AUTO_TEST_CASE(vector_dist_compare_host_device)
1628{
1629 Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1630 Ghost<3,double> g(0.1);
1631 size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1632
1633 if (create_vcluster().size() >= 16)
1634 {return;}
1635
1636 vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg(10000,domain,bc,g,DEC_GRAN(128));
1637
1638 auto it = vdg.getDomainIterator();
1639
1640 while (it.isNext())
1641 {
1642 auto p = it.get();
1643
1644 vdg.getPos(p)[0] = (double)rand() / RAND_MAX;
1645 vdg.getPos(p)[1] = (double)rand() / RAND_MAX;
1646 vdg.getPos(p)[2] = (double)rand() / RAND_MAX;
1647
1648 vdg.template getProp<0>(p) = (double)rand() / RAND_MAX;
1649
1650 vdg.template getProp<1>(p)[0] = (double)rand() / RAND_MAX;
1651 vdg.template getProp<1>(p)[1] = (double)rand() / RAND_MAX;
1652 vdg.template getProp<1>(p)[2] = (double)rand() / RAND_MAX;
1653
1654 vdg.template getProp<2>(p)[0][0] = (double)rand() / RAND_MAX;
1655 vdg.template getProp<2>(p)[0][1] = (double)rand() / RAND_MAX;
1656 vdg.template getProp<2>(p)[0][2] = (double)rand() / RAND_MAX;
1657 vdg.template getProp<2>(p)[1][0] = (double)rand() / RAND_MAX;
1658 vdg.template getProp<2>(p)[1][1] = (double)rand() / RAND_MAX;
1659 vdg.template getProp<2>(p)[1][2] = (double)rand() / RAND_MAX;
1660 vdg.template getProp<2>(p)[2][0] = (double)rand() / RAND_MAX;
1661 vdg.template getProp<2>(p)[2][1] = (double)rand() / RAND_MAX;
1662 vdg.template getProp<2>(p)[2][2] = (double)rand() / RAND_MAX;
1663
1664 ++it;
1665 }
1666
1667 vdg.map();
1668
1669 vdg.hostToDeviceProp<0,1,2>();
1670 vdg.hostToDevicePos();
1671
1672 bool test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
1673 BOOST_REQUIRE_EQUAL(test,true);
1674
1675 vdg.getPos(100)[0] = 0.99999999;
1676
1677 test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
1678 BOOST_REQUIRE_EQUAL(test,false);
1679
1680 vdg.hostToDevicePos();
1681 vdg.getPos(100)[0] = 0.99999999;
1682
1683 test = vdg.compareHostAndDevicePos(0.00001,0.00000001);
1684 BOOST_REQUIRE_EQUAL(test,true);
1685
1687
1688 test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
1689 BOOST_REQUIRE_EQUAL(test,true);
1690
1691 vdg.getProp<1>(103)[0] = 0.99999999;
1692
1693 test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
1694 BOOST_REQUIRE_EQUAL(test,false);
1695
1696 vdg.hostToDeviceProp<1>();
1697 vdg.getProp<1>(103)[0] = 0.99999999;
1698
1699 test = vdg.compareHostAndDeviceProp<1>(0.00001,0.00000001);
1700 BOOST_REQUIRE_EQUAL(test,true);
1701
1703
1704
1705 test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
1706 BOOST_REQUIRE_EQUAL(test,true);
1707
1708 vdg.getProp<0>(105) = 0.99999999;
1709
1710 test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
1711 BOOST_REQUIRE_EQUAL(test,false);
1712
1713 vdg.hostToDeviceProp<0>();
1714 vdg.getProp<0>(105) = 0.99999999;
1715
1716 test = vdg.compareHostAndDeviceProp<0>(0.00001,0.00000001);
1717 BOOST_REQUIRE_EQUAL(test,true);
1718
1719
1721
1722
1723 test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
1724 BOOST_REQUIRE_EQUAL(test,true);
1725
1726 vdg.getProp<2>(108)[1][2] = 0.99999999;
1727
1728 test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
1729 BOOST_REQUIRE_EQUAL(test,false);
1730
1731 vdg.hostToDeviceProp<2>();
1732 vdg.getProp<2>(108)[1][2] = 0.99999999;
1733
1734 test = vdg.compareHostAndDeviceProp<2>(0.00001,0.00000001);
1735 BOOST_REQUIRE_EQUAL(test,true);
1736}
1737
1738template<typename vector_dist_type>
1739__global__ void assign_to_ghost(vector_dist_type vds)
1740{
1741 int i = threadIdx.x + blockIdx.x * blockDim.x;
1742
1743 if (i >= vds.size()) {return;}
1744
1745 vds.template getProp<0>(i) = 1000.0 + i;
1746
1747 vds.template getProp<1>(i)[0] = 2000.0 + i;
1748 vds.template getProp<1>(i)[1] = 3000.0 + i;
1749 vds.template getProp<1>(i)[2] = 4000.0 + i;
1750
1751 vds.template getProp<2>(i)[0][0] = 12000.0 + i;
1752 vds.template getProp<2>(i)[0][1] = 13000.0 + i;
1753 vds.template getProp<2>(i)[0][2] = 14000.0 + i;
1754 vds.template getProp<2>(i)[1][0] = 22000.0 + i;
1755 vds.template getProp<2>(i)[1][1] = 23000.0 + i;
1756 vds.template getProp<2>(i)[1][2] = 24000.0 + i;
1757 vds.template getProp<2>(i)[2][0] = 32000.0 + i;
1758 vds.template getProp<2>(i)[2][1] = 33000.0 + i;
1759 vds.template getProp<2>(i)[2][2] = 34000.0 + i;
1760
1761}
1762
1763BOOST_AUTO_TEST_CASE(vector_dist_domain_and_ghost_test)
1764{
1765 Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1766 Ghost<3,double> g(0.1);
1767 size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1768
1769 if (create_vcluster().size() >= 16)
1770 {return;}
1771
1772 vector_dist_gpu<3,double,aggregate<double,double[3],double[3][3]>> vdg(10000,domain,bc,g,DEC_GRAN(128));
1773
1774 auto ite = vdg.getDomainAndGhostIteratorGPU();
1775
1776 CUDA_LAUNCH(assign_to_ghost,ite,vdg.toKernel());
1777
1778 vdg.template deviceToHostProp<0,1,2>();
1779
1780
1781 auto it = vdg.getDomainAndGhostIterator();
1782
1783 bool check = true;
1784
1785 while (it.isNext())
1786 {
1787 auto k = it.get();
1788
1789 check &= vdg.template getProp<0>(k) == 1000.0 + k.getKey();
1790
1791 check &= vdg.template getProp<1>(k)[0] == 2000.0 + k.getKey();
1792 check &= vdg.template getProp<1>(k)[1] == 3000.0 + k.getKey();
1793 check &= vdg.template getProp<1>(k)[2] == 4000.0 + k.getKey();
1794
1795 check &= vdg.template getProp<2>(k)[0][0] == 12000.0 + k.getKey();
1796 check &= vdg.template getProp<2>(k)[0][1] == 13000.0 + k.getKey();
1797 check &= vdg.template getProp<2>(k)[0][2] == 14000.0 + k.getKey();
1798 check &= vdg.template getProp<2>(k)[1][0] == 22000.0 + k.getKey();
1799 check &= vdg.template getProp<2>(k)[1][1] == 23000.0 + k.getKey();
1800 check &= vdg.template getProp<2>(k)[1][2] == 24000.0 + k.getKey();
1801 check &= vdg.template getProp<2>(k)[2][0] == 32000.0 + k.getKey();
1802 check &= vdg.template getProp<2>(k)[2][1] == 33000.0 + k.getKey();
1803 check &= vdg.template getProp<2>(k)[2][2] == 34000.0 + k.getKey();
1804
1805 ++it;
1806 }
1807
1808
1809 BOOST_REQUIRE_EQUAL(check,true);
1810}
1811
1812template<typename vT>
1813__global__ void launch_overflow(vT vs, vT vs2)
1814{
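 // intentional out-of-range access: element 57 is past the size of vs2, so
 // with SE_CLASS1 enabled the device-side bounds check is expected to fire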
1815 vs2.template getProp<1>(57)[0];
1816}
1817
1818BOOST_AUTO_TEST_CASE(vector_dist_overflow_se_class1)
1819{
1820 Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
1821 Ghost<3,double> g(0.1);
1822 size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
1823
1824 if (create_vcluster().size() >= 16)
1825 {return;}
1826
1827 std::cout << "****** TEST ERROR MESSAGE BEGIN ********" << std::endl;
1828
 1829 vector_dist_gpu<3,double,aggregate<double,double[3]>> vdg(0,domain,bc,g); // template arguments and constructor assumed from the kernel usage
 1830 vector_dist_gpu<3,double,aggregate<double,double[3]>> vdg2(0,domain,bc,g); // template arguments and constructor assumed from the kernel usage
 1831
1832
1833 vdg.setCapacity(100);
1834
1835 ite_gpu<1> ite;
1836
1837 ite.wthr.x = 1;
1838 ite.wthr.y = 1;
1839 ite.wthr.z = 1;
1840 ite.thr.x = 1;
1841 ite.thr.y = 1;
1842 ite.thr.z = 1;
1843
1844 try
1845 {
1846 CUDA_LAUNCH(launch_overflow,ite,vdg.toKernel(),vdg2.toKernel());
1847 }
1848 catch(...)
1849 {
1850 std::cout << "SE_CLASS1 Catch" << std::endl;
1851 };
1852
1853 std::cout << "****** TEST ERROR MESSAGE END ********" << std::endl;
1854}
1855
1856
1857
1858BOOST_AUTO_TEST_CASE( vector_dist_ghost_put_gpu )
1859{
1860
 1861// This example requires float atomics; until C++20 we are out of luck
1862#ifndef CUDIFY_USE_OPENMP
1863
1864 Vcluster<> & v_cl = create_vcluster();
1865
1866 long int k = 25*25*25*create_vcluster().getProcessingUnits();
1867 k = std::pow(k, 1/3.);
1868
1869 if (v_cl.getProcessingUnits() > 48)
1870 return;
1871
1872 print_test("Testing 3D periodic ghost put GPU k=",k);
1873 BOOST_TEST_CHECKPOINT( "Testing 3D periodic ghost put k=" << k );
1874
1875 long int big_step = k / 30;
1876 big_step = (big_step == 0)?1:big_step;
1877 long int small_step = 21;
1878
1879 // 3D test
1880 for ( ; k >= 2 ; k-= (k > 2*big_step)?big_step:small_step )
1881 {
1882 float r_cut = 1.3 / k;
1883 float r_g = 1.5 / k;
1884
1885 Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
1886
1887 // Boundary conditions
1888 size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
1889
1890 // ghost
1891 Ghost<3,float> ghost(r_g);
1892
1893 typedef aggregate<float,float,float> part_prop;
1894
1895 // Distributed vector
1896 vector_dist_gpu<3,float, part_prop > vd(0,box,bc,ghost);
1897
1898 auto it = vd.getGridIterator({(size_t)k,(size_t)k,(size_t)k});
1899
1900 while (it.isNext())
1901 {
1902 auto key = it.get();
1903
1904 vd.add();
1905
1906 vd.getLastPosWrite()[0] = key.get(0)*it.getSpacing(0);
1907 vd.getLastPosWrite()[1] = key.get(1)*it.getSpacing(1);
1908 vd.getLastPosWrite()[2] = key.get(2)*it.getSpacing(2);
1909
 1910 // Initialize the accumulation properties to zero
1911
1912 vd.getLastPropWrite<0>() = 0.0;
1913
1914 vd.getLastPropWrite<2>() = 0.0;
1915
1916 ++it;
1917 }
1918
1919 vd.map();
1920
1921 vd.hostToDevicePos();
1922 vd.template hostToDeviceProp<0,2>();
1923 // sync the ghost
1924 vd.ghost_get<0,2>(RUN_ON_DEVICE);
1925 vd.template deviceToHostProp<0,2>();
1926 vd.deviceToHostPos();
1927
1928 {
1929 auto NN = vd.getCellList(r_cut);
1930 float a = 1.0f*k*k;
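 // the particles sit on a uniform periodic grid, so every particle has an
 // identical neighborhood and must accumulate exactly the same value; the
 // check below therefore compares every particle against the first one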
1931
 1932 // run through all the particles + ghost
1933
1934 auto it2 = vd.getDomainIterator();
1935
1936 while (it2.isNext())
1937 {
1938 // particle p
1939 auto p = it2.get();
1940 Point<3,float> xp = vd.getPos(p);
1941
1942 // Get an iterator over the neighborhood particles of p
1943 auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp));
1944
1945 // For each neighborhood particle ...
1946 while (Np.isNext())
1947 {
1948 auto q = Np.get();
1949 Point<3,float> xq = vd.getPosRead(q);
1950
1951 float dist = xp.distance(xq);
1952
1953 if (dist < r_cut)
1954 {
1955 vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
1956 vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut) / 2;
1957 }
1958
1959 ++Np;
1960 }
1961
1962 ++it2;
1963 }
1964
1965 vd.hostToDevicePos();
1966 vd.template hostToDeviceProp<0,2>();
1967 vd.template ghost_put<add_atomic_,0,2>(RUN_ON_DEVICE);
1968 vd.template deviceToHostProp<0,2>();
1969 vd.deviceToHostPos();
1970
1971 bool ret = true;
1972 auto it3 = vd.getDomainIterator();
1973
1974 float constant = vd.getProp<0>(it3.get());
1975 float constanta = vd.getProp<2>(it3.get());
1976 float eps = 0.001;
1977
1978 while (it3.isNext())
1979 {
1980 float constant2 = vd.getProp<0>(it3.get());
1981 float constant3 = vd.getProp<2>(it3.get());
1982 if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
1983 {
1984 Point<3,float> p = vd.getPosRead(it3.get());
1985
1986 std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << v_cl.getProcessUnitID() << std::endl;
1987 ret = false;
1988 break;
1989 }
1990
1991 ++it3;
1992 }
1993 BOOST_REQUIRE_EQUAL(ret,true);
1994 }
1995
1996 auto itp = vd.getDomainAndGhostIterator();
1997 while (itp.isNext())
1998 {
1999 auto key = itp.get();
2000
2001 vd.getPropWrite<0>(key) = 0.0;
2002 vd.getPropWrite<2>(key) = 0.0;
2003
2004 ++itp;
2005 }
2006
2007 {
2008 auto NN = vd.getCellList(r_cut);
2009 float a = 1.0f*k*k;
2010
 2011 // run through all the particles + ghost
2012
2013 auto it2 = vd.getDomainIterator();
2014
2015 while (it2.isNext())
2016 {
2017 // particle p
2018 auto p = it2.get();
2019 Point<3,float> xp = vd.getPosRead(p);
2020
2021 // Get an iterator over the neighborhood particles of p
2022 auto Np = NN.getNNIterator<NO_CHECK>(NN.getCell(xp));
2023
2024 // For each neighborhood particle ...
2025 while (Np.isNext())
2026 {
2027 auto q = Np.get();
2028 Point<3,float> xq = vd.getPosRead(q);
2029
2030 float dist = xp.distance(xq);
2031
2032 if (dist < r_cut)
2033 {
2034 vd.getPropWrite<0>(q) += a*(-dist*dist+r_cut*r_cut);
2035 vd.getPropWrite<2>(q) += a*(-dist*dist+r_cut*r_cut);
2036 }
2037
2038 ++Np;
2039 }
2040
2041 ++it2;
2042 }
2043
2044 vd.hostToDevicePos();
2045 vd.template hostToDeviceProp<0,2>();
2046 vd.template ghost_put<add_atomic_,0>(RUN_ON_DEVICE);
2047 vd.template ghost_put<add_atomic_,2>(RUN_ON_DEVICE);
2048 vd.template deviceToHostProp<0,2>();
2049 vd.deviceToHostPos();
2050
2051 bool ret = true;
2052 auto it3 = vd.getDomainIterator();
2053
2054 float constant = vd.getPropRead<0>(it3.get());
2055 float constanta = vd.getPropRead<2>(it3.get());
2056 float eps = 0.001;
2057
2058 while (it3.isNext())
2059 {
2060 float constant2 = vd.getPropRead<0>(it3.get());
 2061 float constant3 = vd.getPropRead<2>(it3.get());
2062 if (fabs(constant - constant2)/constant > eps || fabs(constanta - constant3)/constanta > eps)
2063 {
2064 Point<3,float> p = vd.getPosRead(it3.get());
2065
2066 std::cout << p.toString() << " " << constant2 << "/" << constant << "/" << constant3 << " " << v_cl.getProcessUnitID() << std::endl;
2067 ret = false;
2068 break;
2069 }
2070
2071 ++it3;
2072 }
2073 BOOST_REQUIRE_EQUAL(ret,true);
2074 }
2075 }
2076
2077#endif
2078}
2079
2080BOOST_AUTO_TEST_SUITE_END()