OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
CellList_gpu_test.cu
1 /*
2  * CellList_gpu_test.cpp
3  *
4  * Created on: Jun 13, 2018
5  * Author: i-bird
6  */
7 
8 #define BOOST_GPU_ENABLED __host__ __device__
9 #include "util/cuda_launch.hpp"
10 #include "config.h"
11 #define BOOST_TEST_DYN_LINK
12 #include <boost/test/unit_test.hpp>
13 
14 #include "util/cuda_util.hpp"
15 #include "cuda/CellList_gpu.hpp"
16 #include "CellList.hpp"
17 #include "util/boost/boost_array_openfpm.hpp"
18 #include "Point_test.hpp"
19 #include "util/cuda_util.hpp"
20 
21 BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
22 
23 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
// Checks the GPU "subindex" kernel: for every particle it must write the
// linearized index of the containing cell into part_ids and count the
// particles per cell in cl_n (17^3 grid, cell side 0.1, identity transform).
// NOTE(review): this listing elides the declarations of pl, cl_n, part_ids,
// div, spacing and off (original lines 26-27, 31 and 57-59) -- confirm their
// types against the repository source.
24 void test_sub_index()
25 {
28 
29  // fill with some particles
30 
32 
33  // create 10 particles: cell corners, the origin and a duplicate of cell {2,2,2}
34 
35  Point<dim,T> p1({0.2,0.2,0.2});
36  Point<dim,T> p2({0.9,0.2,0.2});
37  Point<dim,T> p3({0.2,0.9,0.2});
38  Point<dim,T> p4({0.2,0.2,0.9});
39  Point<dim,T> p5({0.9,0.9,0.2});
40  Point<dim,T> p6({0.9,0.2,0.9});
41  Point<dim,T> p7({0.2,0.9,0.9});
42  Point<dim,T> p8({0.9,0.9,0.9});
43  Point<dim,T> p9({0.0,0.0,0.0});
44  Point<dim,T> p10({0.205,0.205,0.205});
45 
46  pl.add(p1);
47  pl.add(p2);
48  pl.add(p3);
49  pl.add(p4);
50  pl.add(p5);
51  pl.add(p6);
52  pl.add(p7);
53  pl.add(p8);
54  pl.add(p9);
55  pl.add(p10);
56 
60 
// 17 cells per dimension, spacing 0.1, no cell-index offset
61  for (size_t i = 0 ; i < dim ; i++)
62  {
63  div[i] = 17;
64  spacing[i] = 0.1;
65  off[i] = 0;
66  }
67 
68  cl_n.resize(17*17*17);
69  cl_n.template fill<0>(0);
70  //CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
71 
72  part_ids.resize(pl.size());
73 
74  size_t sz[3] = {17,17,17};
75  grid_sm<3,void> gr(sz);
76 
77  auto ite = pl.getGPUIterator();
78 
79  pl.template hostToDevice<0>();
80 
81  Matrix<dim,T> mt;
82  Point<dim,T> pt;
83 
// identity transform: particle positions are used as-is
84  no_transform_only<dim,T> t(mt,pt);
85 
86  CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
87  spacing,
88  off,
89  t,
90  pl.capacity(),
91  pl.size(),
92  part_ids.capacity(),
93  0,
94  static_cast<T *>(pl.template getDeviceBuffer<0>()),
95  static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
96  static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
97 
98  cl_n.template deviceToHost<0>();
99  part_ids.template deviceToHost<0>();
100 
101  // each particle must carry the linearized index of the cell it falls into
102 
103  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
104  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
105  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
106  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
107  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
108  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
109  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
110  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
111  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
112  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));
113 
// per-cell counters: cell {2,2,2} holds p1 and p10, every other listed cell one particle
114  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
115  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
116  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
117  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
118  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
119  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
120  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
121  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
122  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
123 }
124 
125 
126 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
// Same check as test_sub_index but with a shift transform: all particles are
// translated by {-0.3,-0.3,-0.3} and a shift_only transform with the opposite
// offset must map them back to the same cells as before.
// NOTE(review): the declarations of pl, cl_n, part_ids, div, spacing and off
// were elided in this listing (original lines 129-130, 134 and 178-180) --
// confirm against the repository source.
127 void test_sub_index2()
128 {
131 
132  // fill with some particles
133 
135 
136  // Make the test harder to pass: force pl and part_ids to have different capacities
137  pl.resize(256);
138  pl.resize(0);
139 
140  // create 10 particles, then shift them by pt
141 
142  Point<dim,T> p1({0.2,0.2,0.2});
143  Point<dim,T> p2({0.9,0.2,0.2});
144  Point<dim,T> p3({0.2,0.9,0.2});
145  Point<dim,T> p4({0.2,0.2,0.9});
146  Point<dim,T> p5({0.9,0.9,0.2});
147  Point<dim,T> p6({0.9,0.2,0.9});
148  Point<dim,T> p7({0.2,0.9,0.9});
149  Point<dim,T> p8({0.9,0.9,0.9});
150  Point<dim,T> p9({0.0,0.0,0.0});
151  Point<dim,T> p10({0.205,0.205,0.205});
152 
153  Point<dim,T> pt({-0.3,-0.3,-0.3});
154 
155  p1 += pt;
156  p2 += pt;
157  p3 += pt;
158  p4 += pt;
159  p5 += pt;
160  p6 += pt;
161  p7 += pt;
162  p8 += pt;
163  p9 += pt;
164  p10 += pt;
165 
166 
167  pl.add(p1);
168  pl.add(p2);
169  pl.add(p3);
170  pl.add(p4);
171  pl.add(p5);
172  pl.add(p6);
173  pl.add(p7);
174  pl.add(p8);
175  pl.add(p9);
176  pl.add(p10);
177 
181 
182  for (size_t i = 0 ; i < dim ; i++)
183  {
184  div[i] = 17;
185  spacing[i] = 0.1;
186  off[i] = 0;
187  }
188 
189  cl_n.resize(17*17*17);
190  cl_n.template fill<0>(0);
191 // CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
192 
193  part_ids.resize(pl.size());
194 
195  size_t sz[3] = {17,17,17};
196  grid_sm<3,void> gr(sz);
197 
198  auto ite = pl.getGPUIterator();
199 
200  pl.template hostToDevice<0>();
201 
202  Matrix<dim,T> mt;
203 
// shift transform undoes the {-0.3,...} translation applied above
204  shift_only<dim,T> t(mt,pt);
205 
206  CUDA_LAUNCH_DIM3((subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
207  spacing,
208  off,
209  t,
210  pl.capacity(),
211  pl.size(),
212  part_ids.capacity(),
213  0,
214  static_cast<T *>(pl.template getDeviceBuffer<0>()),
215  static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
216  static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));
217 
218  cl_n.template deviceToHost<0>();
219  part_ids.template deviceToHost<0>();
220 
221  // the cells must match the un-shifted configuration of test_sub_index
222 
223  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(0)[0],gr.LinId({2,2,2}));
224  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(1)[0],gr.LinId({9,2,2}));
225  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(2)[0],gr.LinId({2,9,2}));
226  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(3)[0],gr.LinId({2,2,9}));
227  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(4)[0],gr.LinId({9,9,2}));
228  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(5)[0],gr.LinId({9,2,9}));
229  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(6)[0],gr.LinId({2,9,9}));
230  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(7)[0],gr.LinId({9,9,9}));
231  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(8)[0],gr.LinId({0,0,0}));
232  BOOST_REQUIRE_EQUAL(part_ids.template get<0>(9)[0],gr.LinId({2,2,2}));
233 
234  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
235  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
236  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
237  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
238  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
239  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
240  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
241  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
242  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
243 }
244 
245 template<unsigned int dim, typename T>
// Creates n_part particles with uniformly random positions in [0,1)^3 and
// registers each one in the CPU reference cell list cl.
// NOTE(review): the particle-vector parameter (pl) was lost in this listing
// (original line 247 is elided) -- confirm its type against the repository.
246 void create_n_part(int n_part,
248  CellList<dim,T, Mem_fast<>> & cl)
249 {
250  pl.resize(n_part);
251 
252  auto it = pl.getIterator();
253 
254  while(it.isNext())
255  {
256  auto p = it.get();
257 
// random position; rand()/RAND_MAX gives a value in [0,1]
258  pl.template get<0>(p)[0] = (double)rand()/RAND_MAX;
259  pl.template get<0>(p)[1] = (double)rand()/RAND_MAX;
260  pl.template get<0>(p)[2] = (double)rand()/RAND_MAX;
261 
262  Point<dim,T> xp;
263  xp.get(0) = pl.template get<0>(p)[0];
264  xp.get(1) = pl.template get<0>(p)[1];
265  xp.get(2) = pl.template get<0>(p)[2];
266 
// insert the particle id in the CPU cell list used as ground truth
267  size_t c = cl.getCell(xp);
268  cl.addCell(c,p);
269 
270  ++it;
271  }
272 }
273 
274 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
// From the CPU reference cell list, builds the "starts" array (exclusive
// prefix of per-cell particle counts), the per-particle (cell,slot) pairs in
// part_ids, and the cell-ordered particle ids in cells.
// NOTE(review): the starts and cells output parameters and the declaration of
// the grid iterator itg were lost in this listing (original lines 279, 281
// and 289 are elided) -- confirm against the repository source.
275 void create_starts_and_parts_ids(CellList<dim,T, Mem_fast<>> & cl,
276  grid_sm<dim,void> & gr,
277  size_t n_part,
278  size_t n_cell,
280  openfpm::vector<aggregate<ids_type[2]>,CudaMemory,memory_traits_inte> & part_ids,
282 {
283  // Construct starts and part_ids
284 
285  part_ids.resize(n_part);
286  starts.resize(n_cell);
287  cells.resize(n_part);
288 
290 
// running offset of the first particle of the current cell
291  size_t start = 0;
292 
293  while (itg.isNext())
294  {
295  auto cell = itg.get();
296 
297  size_t clin = gr.LinId(cell);
298 
299  for (size_t j = 0 ; j < cl.getNelements(clin) ; j++)
300  {
301  size_t p_id = cl.get(clin,j);
302 
// part_ids records, per particle: [0] its cell, [1] its slot within the cell
303  part_ids.template get<0>(p_id)[0] = clin;
304 
305  part_ids.template get<0>(p_id)[1] = j;
306 
307  cells.template get<0>(start+j) = p_id;
308  }
309  starts.template get<0>(clin) = start;
310  start += cl.getNelements(clin);
311 
312  ++itg;
313  }
314 }
315 
316 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
// Checks the GPU "fill_cells" kernel: given starts (per-cell offsets) and
// part_ids (per-particle cell/slot), the kernel must produce the
// cell-ordered particle-id array cells identical to the CPU reference.
// NOTE(review): the declarations of pl, starts, part_ids, cells, cells_out,
// div_c and off were elided in this listing (original lines 321-324, 332-333
// and 350) -- confirm against the repository source.
317 void test_fill_cell()
318 {
319 #ifndef MAKE_CELLLIST_DETERMINISTIC
320 
325 
326 
327  // CellList to check the result
328 
329  Box<dim,T> domain;
330 
331  typename openfpm::array<T,dim> spacing;
334 
335  size_t div_host[dim];
336 
// 10 cells per dimension on the unit cube; tot = total number of cells
337  size_t tot = 1;
338  for (size_t i = 0 ; i < dim ; i++)
339  {
340  div_host[i] = 10;
341  div_c[i] = 10;
342  tot *= div_host[i];
343  spacing[i] = 0.1;
344  domain.setLow(i,0.0);
345  domain.setHigh(i,1.0);
346  off[i] = 0;
347  }
348 
349  CellList<dim,T, Mem_fast<>> cl(domain,div_host,0);
351 
352  create_n_part(5000,pl,cl);
353 
354  grid_sm<dim,void> gr(div_host);
355 
356  create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);
357 
// sanity check of the host-built starts/part_ids against the CPU cell list
358  bool check = true;
359  cells.resize(pl.size());
360  for (size_t i = 0 ; i < gr.size() - 1 ; i++)
361  {
362  size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);
363 
364  check &= (tot_p == cl.getNelements(i));
365 
// NOTE(review): key is computed but never used in this loop
366  grid_key_dx<dim> key = gr.InvLinId(i);
367 
368  // Now we check the ids
369 
370  for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
371  {
372  size_t p_id = cl.get(i,j);
373 
374  check &= part_ids.template get<0>(p_id)[0] == i;
375  }
376  }
377 
378  BOOST_REQUIRE(check == true);
379 
380  auto itgg = part_ids.getGPUIterator();
381 
382  starts.template hostToDevice<0>();
383  part_ids.template hostToDevice<0>();
384 
385  // Here we test fill cell
386  CUDA_LAUNCH_DIM3((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg.wthr,itgg.thr,0,
387  div_c,
388  off,
389  part_ids.size(),
390  part_ids.capacity(),
391  0,
392  static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
393  static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
394  static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
395 
396  cells.template deviceToHost<0>();
397 
// the GPU-produced cell-ordered ids must match the CPU cell-list contents
398  for (size_t i = 0 ; i < gr.size() - 1 ; i++)
399  {
400  size_t tot_p = starts.template get<0>(i+1) - starts.template get<0>(i);
401 
402  check &= (tot_p == cl.getNelements(i));
403 
404  grid_key_dx<dim> key = gr.InvLinId(i);
405 
406  // Now we check the ids
407 
408  for (size_t j = 0 ; j < cl.getNelements(i) ; j++)
409  {
410  size_t p_id = cl.get(i,j);
411 
412  size_t p_id2 = cells.template get<0>(starts.template get<0>(i) + j);
413 
414  check &= (p_id == p_id2);
415  }
416  }
417 
418  BOOST_REQUIRE(check == true);
419 
420 #endif
421 }
422 
/*! \brief Device-side setup for test_cell_count_n.
 *
 * Inserts ten cell indices into the sparse vector sv: a 2x2x2 cluster of
 * cells ({5,5,5}..{6,6,6}), one detached cell at {7,7,7} and one isolated
 * cell at {9,9,9}. Each entry stores its own linearized index as value.
 * Meant to be launched with a single thread (<<<1,1>>>).
 */
template<typename sparse_vector_type>
__global__ void construct_cells(sparse_vector_type sv, grid_sm<3,void> gs)
{
	// open the block-insert session on the sparse vector
	sv.init();

	// 2x2x2 cluster
	grid_key_dx<3> c1({5,5,5});
	grid_key_dx<3> c2({5,5,6});
	grid_key_dx<3> c3({5,6,5});
	grid_key_dx<3> c4({5,6,6});
	grid_key_dx<3> c5({6,5,5});
	grid_key_dx<3> c6({6,5,6});
	grid_key_dx<3> c7({6,6,5});
	grid_key_dx<3> c8({6,6,6});
	// cell adjacent to the cluster corner
	grid_key_dx<3> c9({7,7,7});
	// isolated cell
	grid_key_dx<3> c10({9,9,9});

	sv.template insert<0>(gs.LinId(c1)) = gs.LinId(c1);
	sv.template insert<0>(gs.LinId(c2)) = gs.LinId(c2);
	sv.template insert<0>(gs.LinId(c3)) = gs.LinId(c3);
	sv.template insert<0>(gs.LinId(c4)) = gs.LinId(c4);
	sv.template insert<0>(gs.LinId(c5)) = gs.LinId(c5);
	sv.template insert<0>(gs.LinId(c6)) = gs.LinId(c6);
	sv.template insert<0>(gs.LinId(c7)) = gs.LinId(c7);
	sv.template insert<0>(gs.LinId(c8)) = gs.LinId(c8);
	sv.template insert<0>(gs.LinId(c9)) = gs.LinId(c9);
	sv.template insert<0>(gs.LinId(c10)) = gs.LinId(c10);

	// close the insert session
	sv.flush_block_insert();
}
454 
// Checks count_nn_cells / fill_nn_cells on a sparse cell set: construct_cells
// inserts ten cells on the device; for each inserted cell the kernels must
// count, then list, which of its 27-stencil neighbors exist in the set.
// NOTE(review): the declarations of vs (sparse vector), cells_nn and
// cell_nn_list were lost in this listing (original lines 457-458 and 516 are
// elided) -- confirm against the repository source.
455 void test_cell_count_n()
456 {
459  openfpm::vector_gpu<aggregate<int>> cells_nn_test;
460 
461  vs.template setBackground<0>(-1);
462 
463  vs.setGPUInsertBuffer(1,32);
464 
465  size_t sz[] = {17,17,17};
466  grid_sm<3,void> gs(sz);
467 
468  CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.toKernel(),gs);
469 
470  mgpu::ofp_context_t ctx;
471 
472  vs.flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_DEVICE);
473 
474  cells_nn.resize(11);
475  cells_nn.fill<0>(0);
476 
// build the 27 linearized offsets of the 3x3x3 stencil, relative to its center
477  grid_key_dx<3> start({0,0,0});
478  grid_key_dx<3> stop({2,2,2});
479  grid_key_dx<3> middle({1,1,1});
480 
481  int mid = gs.LinId(middle);
482 
483  grid_key_dx_iterator_sub<3> it(gs,start,stop);
484 
485  while (it.isNext())
486  {
487  auto p = it.get();
488 
489  cells_nn_test.add();
490  cells_nn_test.get<0>(cells_nn_test.size()-1) = (int)gs.LinId(p) - mid;
491 
492  ++it;
493  }
494 
495  cells_nn_test.template hostToDevice<0>();
496 
497  auto itgg = vs.getGPUIterator();
498  CUDA_LAUNCH((count_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());
499 
500  cells_nn.deviceToHost<0>();
501 
// expected neighbor counts: the 8 cluster cells see 8 existing neighbors each
// (cell 7 sees 9 because {7,7,7} touches the cluster corner), {7,7,7} sees 2,
// {9,9,9} only itself
502  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(0),8);
503  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(1),8);
504  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(2),8);
505  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(3),8);
506  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(4),8);
507  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(5),8);
508  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(6),8);
509  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(7),9);
510  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(8),2);
511  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(9),1);
512 
513  // now we scan (in-place exclusive scan turns counts into offsets)
514  openfpm::scan((unsigned int *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (unsigned int *)cells_nn.template getDeviceBuffer<0>() , ctx);
517  cell_nn_list.resize(7*8 + 9 + 2 + 1);
518 
519  CUDA_LAUNCH((fill_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);
520 
521  cell_nn_list.deviceToHost<0>();
522 
  // 8 NN (the linearized ids of the 8 cluster cells on the 17^3 grid)
524  for (size_t i = 0 ; i < 7 ; i++)
525  {
526  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+0),1535);
527  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+1),1536);
528  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+2),1552);
529  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+3),1553);
530  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+4),1824);
531  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+5),1825);
532  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+6),1841);
533  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+7),1842);
534  }
535 
536  // 9 NN (cell {6,6,6}: the cluster plus {7,7,7} = 2149)
537  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+0),1535);
538  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+1),1536);
539  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+2),1552);
540  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+3),1553);
541  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+4),1824);
542  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+5),1825);
543  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+6),1841);
544  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+7),1842);
545  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+8),2149);
546 
547  // 2 NN (cell {7,7,7}: {6,6,6} and itself)
548  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9),1842);
549  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+1),2149);
550 
551  // 1 NN (isolated cell {9,9,9}: only itself)
552  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+2),2763);
553 }
554 
// Entry point: neighbor-cell counting/listing on a sparse cell set
// (see test_cell_count_n above).
555 BOOST_AUTO_TEST_CASE( test_count_nn_cells )
556 {
557  std::cout << "Test cell count nn" << std::endl;
558 
559  test_cell_count_n();
560 }
561 
// Entry point: runs the subindex kernel tests with and without a shift
// transform (float positions, int counters, unsigned char cell ids).
562 BOOST_AUTO_TEST_CASE( test_subindex_funcs )
563 {
564  std::cout << "Test cell list GPU base func" << "\n";
565 
566  test_sub_index<3,float,int,unsigned char>();
567  test_sub_index2<3,float,int,unsigned char>();
568 
569  std::cout << "End cell list GPU" << "\n";
570 
571  // Test the cell list
572 }
573 
// Entry point: checks the fill_cells kernel against the CPU reference
// (see test_fill_cell above).
574 BOOST_AUTO_TEST_CASE ( test_cell_fill )
575 {
576  std::cout << "Test GPU fill cells" << "\n";
577 
578  test_fill_cell<3,float,unsigned int, unsigned char>();
579 
580  std::cout << "End GPU fill cells" << "\n";
581 }
582 
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
// Checks the GPU "reorder_parts" kernel: n_part random particles are
// reordered cell-by-cell; the reordered properties and both sorted<->unsorted
// permutation maps must agree with the CPU reference cell list.
//
// NOTE(review): the declarations of pl, pl_out, parts_prp, parts_prp_out,
// sort_to_not_sort, non_sort_to_sort, starts, part_ids and cells_out were
// lost in this listing (original lines 586-595, 602 and 618-619 are elided)
// -- confirm their types against the repository source.
void test_reorder_parts(size_t n_part)
{
	// Create n_part particles and the CPU reference cell list

	// CellList to check the result

	Box<dim,T> domain;

	typename boost::array_openfpm<T,dim> spacing;

	size_t div_host[dim];

	size_t tot = 1;
	for (size_t i = 0 ; i < dim ; i++)
	{
		div_host[i] = 10;
		// NOTE(review): other tests in this file use div_c[i] = 10 here;
		// verify that the {1,10,100} progression is intended
		div_c[i] = tot;
		tot *= div_host[i];
		spacing[i] = 0.1;
		domain.setLow(i,0.0);
		domain.setHigh(i,1.0);
	}

	CellList<dim,T, Mem_fast<>> cl(domain,div_host,0);

	create_n_part(n_part,pl,cl);
	parts_prp.resize(n_part);
	parts_prp_out.resize(n_part);
	pl_out.resize(n_part);
	sort_to_not_sort.resize(n_part);
	non_sort_to_sort.resize(n_part);

	// give every property a distinct, particle-dependent value so any
	// reordering mistake is detectable
	auto p_it = parts_prp.getIterator();
	while (p_it.isNext())
	{
		auto p = p_it.get();

		parts_prp.template get<0>(p) = 10000 + p;
		parts_prp.template get<1>(p) = 20000 + p;

		parts_prp.template get<2>(p)[0] = 30000 + p;
		parts_prp.template get<2>(p)[1] = 40000 + p;
		parts_prp.template get<2>(p)[2] = 50000 + p;

		parts_prp.template get<3>(p)[0][0] = 60000 + p;
		parts_prp.template get<3>(p)[0][1] = 70000 + p;
		parts_prp.template get<3>(p)[0][2] = 80000 + p;
		parts_prp.template get<3>(p)[1][0] = 90000 + p;
		parts_prp.template get<3>(p)[1][1] = 100000 + p;
		parts_prp.template get<3>(p)[1][2] = 110000 + p;
		parts_prp.template get<3>(p)[2][0] = 120000 + p;
		parts_prp.template get<3>(p)[2][1] = 130000 + p;
		// BUG FIX: this line previously wrote [0][2] again, clobbering the
		// 80000 value and leaving [2][2] uninitialized
		parts_prp.template get<3>(p)[2][2] = 140000 + p;

		++p_it;
	}

	grid_sm<dim,void> gr(div_host);

	create_starts_and_parts_ids(cl,gr,pl.size(),tot,starts,part_ids,cells_out);

	cells_out.template hostToDevice<0>();

	// (a duplicate, unused pl.getGPUIterator() call was removed here)
	auto ite = pl.getGPUIterator();

	parts_prp.template hostToDevice<0,1,2,3>();

	// Here we test the reorder kernel
	CUDA_LAUNCH_DIM3((reorder_parts<decltype(parts_prp.toKernel()),
			decltype(pl.toKernel()),
			decltype(sort_to_not_sort.toKernel()),
			cnt_type,
			shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
					parts_prp.toKernel(),
					parts_prp_out.toKernel(),
					pl.toKernel(),
					pl_out.toKernel(),
					sort_to_not_sort.toKernel(),
					non_sort_to_sort.toKernel(),
					static_cast<cnt_type *>(cells_out.template getDeviceBuffer<0>()));

	bool check = true;
	parts_prp_out.template deviceToHost<0>();
	sort_to_not_sort.template deviceToHost<0>();
	non_sort_to_sort.template deviceToHost<0>();

	// walk the cells in order; st is the position in the sorted output
	size_t st = 0;
	for (size_t i = 0 ; i < tot ; i++)
	{
		size_t n = cl.getNelements(i);

		for (size_t j = 0 ; j < n ; j++)
		{
			size_t p = cl.get(i,j);

			check &= parts_prp_out.template get<0>(st) == parts_prp.template get<0>(p);
			check &= sort_to_not_sort.template get<0>(st) == p;
			check &= non_sort_to_sort.template get<0>(p) == st;

			st++;
		}
	}

	BOOST_REQUIRE_EQUAL(check,true);
}
705 
// Entry point: reorder_parts kernel check with 5000 random particles
// (see test_reorder_parts above).
706 BOOST_AUTO_TEST_CASE ( test_reorder_particles )
707 {
708  std::cout << "Test GPU reorder" << "\n";
709 
710  test_reorder_parts<3,float,unsigned int, unsigned char>(5000);
711 
712  std::cout << "End GPU reorder" << "\n";
713 }
714 
// End-to-end check of CellS::construct(): 9 fixed particles are built on the
// host, the GPU cell list reorders positions and properties, and the sorted
// output plus both permutation maps are compared against hand-computed values.
// NOTE(review): the declarations of pl, pl_out, pl_prp, pl_prp_out and
// pl_correct were lost in this listing (original lines 730-736 and 801-803
// are elided) -- confirm against the repository source.
715 template<unsigned int dim, typename T, typename CellS> void Test_cell_gpu(SpaceBox<dim,T> & box)
716 {
717  //Space where is living the Cell list
718  //SpaceBox<dim,T> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
719 
720  // Subdivisions
721  size_t div[dim] = {16,16,16};
722 
  // Origin (NOTE(review): org is not used in the visible code)
724  Point<dim,T> org({0.0,0.0,0.0});
725 
726  // id Cell list
727  CellS cl2(box,div);
728 
729  // vector of particles
730 
733 
736 
737 
  // create 9 particles (cell corners plus the origin)
739 
740  Point<dim,double> p1({0.2,0.2,0.2});
741  Point<dim,double> p2({0.9,0.2,0.2});
742  Point<dim,double> p3({0.2,0.9,0.2});
743  Point<dim,double> p4({0.2,0.2,0.9});
744  Point<dim,double> p5({0.9,0.9,0.2});
745  Point<dim,double> p6({0.9,0.2,0.9});
746  Point<dim,double> p7({0.2,0.9,0.9});
747  Point<dim,double> p8({0.9,0.9,0.9});
748  Point<dim,double> p9({0.0,0.0,0.0});
749 
750  pl.add(p1);
751  pl.add(p2);
752  pl.add(p3);
753  pl.add(p4);
754  pl.add(p5);
755  pl.add(p6);
756  pl.add(p7);
757  pl.add(p8);
758  pl.add(p9);
759 
760  pl_prp.resize(pl.size());
761  pl_prp_out.resize(pl.size());
762  pl_out.resize(pl.size());
763 
// properties are position-derived with distinct offsets, so the reordering
// of each property component can be verified independently
764  for (size_t i = 0 ; i < pl.size() ; i++)
765  {
766  pl_prp.template get<0>(i) = pl.template get<0>(i)[0];
767 
768  pl_prp.template get<1>(i)[0] = pl.template get<0>(i)[0]+100.0;
769  pl_prp.template get<1>(i)[1] = pl.template get<0>(i)[1]+100.0;
770  pl_prp.template get<1>(i)[2] = pl.template get<0>(i)[2]+100.0;
771 
772  pl_prp.template get<2>(i)[0][0] = pl.template get<0>(i)[0]+1000.0;
773  pl_prp.template get<2>(i)[0][1] = pl.template get<0>(i)[1]+1000.0;
774  pl_prp.template get<2>(i)[0][2] = pl.template get<0>(i)[2]+1000.0;
775 
776  pl_prp.template get<2>(i)[1][0] = pl.template get<0>(i)[0]+2000.0;
777  pl_prp.template get<2>(i)[1][1] = pl.template get<0>(i)[1]+3000.0;
778  pl_prp.template get<2>(i)[1][2] = pl.template get<0>(i)[2]+4000.0;
779 
780  pl_prp.template get<2>(i)[2][0] = pl.template get<0>(i)[0]+5000.0;
781  pl_prp.template get<2>(i)[2][1] = pl.template get<0>(i)[1]+6000.0;
782  pl_prp.template get<2>(i)[2][2] = pl.template get<0>(i)[2]+7000.0;
783  }
784 
785  pl_prp.resize(pl.size());
786  pl_prp_out.resize(pl.size());
787 
788  pl.template hostToDevice<0>();
789  pl_prp.template hostToDevice<0,1,2>();
790 
791  // create an mgpu context
792  mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
793  cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context);
794 
795  // Check
796 
797  pl_prp_out.deviceToHost<0>();
798  pl_prp_out.deviceToHost<1>();
799  pl_prp_out.deviceToHost<2>();
800 
802 
804 
// expected cell order of the 9 particles after the GPU sort
805  pl_correct.add(p9);
806  pl_correct.add(p1);
807  pl_correct.add(p2);
808  pl_correct.add(p3);
809  pl_correct.add(p5);
810  pl_correct.add(p4);
811  pl_correct.add(p6);
812  pl_correct.add(p7);
813  pl_correct.add(p8);
814 
815  for (size_t i = 0 ; i < pl_correct.size() ; i++)
816  {
817  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<0>(i),(float)pl_correct.template get<0>(i)[0]);
818  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[0],(float)(pl_correct.template get<0>(i)[0]+100.0));
819  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[1],(float)(pl_correct.template get<0>(i)[1]+100.0));
820  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<1>(i)[2],(float)(pl_correct.template get<0>(i)[2]+100.0));
821  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][0],(float)(pl_correct.template get<0>(i)[0] + 1000.0));
822  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][1],(float)(pl_correct.template get<0>(i)[1] + 1000.0));
823  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[0][2],(float)(pl_correct.template get<0>(i)[2] + 1000.0));
824  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][0],(float)(pl_correct.template get<0>(i)[0] + 2000.0));
825  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][1],(float)(pl_correct.template get<0>(i)[1] + 3000.0));
826  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[1][2],(float)(pl_correct.template get<0>(i)[2] + 4000.0));
827  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][0],(float)(pl_correct.template get<0>(i)[0] + 5000.0));
828  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][1],(float)(pl_correct.template get<0>(i)[1] + 6000.0));
829  BOOST_REQUIRE_EQUAL(pl_prp_out.template get<2>(i)[2][2],(float)(pl_correct.template get<0>(i)[2] + 7000.0));
830  }
831 
832  // Check the sort to non sort buffer
833 
834  auto & vsrt = cl2.getSortToNonSort();
835  vsrt.template deviceToHost<0>();
836 
837  BOOST_REQUIRE_EQUAL(vsrt.size(),9);
838 
839  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(0),8);
840  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(1),0);
841  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(2),1);
842  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(3),2);
843  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(4),4);
844  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(5),3);
845  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(6),5);
846  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(7),6);
847  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(8),7);
848 
// inverse permutation must be consistent with vsrt above
849  auto & vnsrt = cl2.getNonSortToSort();
850 
851  BOOST_REQUIRE_EQUAL(vnsrt.size(),9);
852 
853  // Move to CPU
854 
855  vnsrt.template deviceToHost<0>();
856 
857  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(8),0);
858  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(0),1);
859  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(1),2);
860  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(2),3);
861  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(4),4);
862  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(3),5);
863  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(5),6);
864  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(6),7);
865  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(7),8);
866 }
867 
868 
// Entry point: dense GPU cell-list construction/reorder on the unit box
// (see Test_cell_gpu above).
BOOST_AUTO_TEST_CASE( CellList_gpu_use)
{
	std::cout << "Test cell list GPU" << "\n";

	SpaceBox<3,double> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	// (removed an unused second SpaceBox spanning [-1,1]^3 that was never
	// passed to the test)

	Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory>>(box);

	std::cout << "End cell list GPU" << "\n";

	// Test the cell list
}
882 
// Entry point: same construction/reorder test as CellList_gpu_use but with
// the sparse GPU cell-list specialization.
BOOST_AUTO_TEST_CASE( CellList_gpu_use_sparse )
{
	std::cout << "Test cell list GPU sparse" << "\n";

	SpaceBox<3,double> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	// (removed an unused second SpaceBox spanning [-1,1]^3 that was never
	// passed to the test)

	Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory,no_transform_only<3,double>,unsigned int,int,true>> (box);

	std::cout << "End cell list GPU sparse" << "\n";

	// Test the cell list
}
896 
897 template<unsigned int dim, typename vector_ps, typename vector_pr>
// Fills vd_pos with n random positions inside box (the extent is shrunk by
// 0.0001 so points stay strictly below the upper bound) and appends one
// property record per particle with value i % 3.
// NOTE(review): the declaration of the local point p was lost in this listing
// (original line 902 is elided) -- confirm against the repository source.
898 void fill_random_parts(Box<dim,float> & box, vector_ps & vd_pos, vector_pr & vd_prp, size_t n)
899 {
900  for (size_t i = 0 ; i < n ; i++)
901  {
903 
904  p.get(0) = ((box.getHigh(0) - box.getLow(0) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(0);
905  p.get(1) = ((box.getHigh(1) - box.getLow(1) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(1);
906  p.get(2) = ((box.getHigh(2) - box.getLow(2) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(2);
907 
908  vd_pos.add(p);
909  vd_prp.add();
910  vd_prp.last().template get<0>() = i % 3;
911  }
912 }
913 
914 
/*! \brief Count neighbor candidates per particle through the symmetric NN
 * iterator, accumulating atomically.
 *
 * One thread per particle. The count contribution for the neighbor with
 * sorted id q is added at its non-sorted index (translated via s_t_ns), so
 * several threads can hit the same slot -> atomicAdd is required.
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_number(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// tail guard: the launch grid may be larger than the particle set
	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);

	for (auto nn_it = cl.getNNIterator(cl.getCell(xp)) ; nn_it.isNext() ; ++nn_it)
	{
		auto q = nn_it.get_sort();
		auto q_ns = nn_it.get();	// value unused; read kept to mirror the original access pattern
		(void)q_ns;

		int out_idx = s_t_ns.template get<0>(q);

		atomicAdd(&vn.template get<0>(out_idx), 1);
	}
}
938 
/*! \brief Non-atomic variant of calc_force_number: each thread counts the
 * neighbors of its own particle into its own slot vn[tid], so no atomics
 * are needed.
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_number_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// tail guard
	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);

	for (auto nn_it = cl.getNNIterator(cl.getCell(xp)) ; nn_it.isNext() ; ++nn_it)
	{
		auto q = nn_it.get_sort();
		auto q_ns = nn_it.get();	// value unused; read kept to mirror the original access pattern
		int s1 = s_t_ns.template get<0>(q);	// value unused; read kept to mirror the original access pattern
		(void)q_ns; (void)s1;

		// each thread owns slot tid, so a plain increment is safe
		++vn.template get<0>(tid);
	}
}
962 
/*! \brief Count neighbor candidates per particle using the box NN iterator,
 * accumulating atomically at the neighbor's non-sorted index.
 *
 * \param start offset added to the flat thread id, allowing the particle
 *        range to be processed in chunks across several launches.
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_number_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x + start;

	// tail guard
	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);

	for (auto nn_it = cl.getNNIteratorBox(cl.getCell(xp)) ; nn_it.isNext() ; ++nn_it)
	{
		auto q = nn_it.get_sort();

		int out_idx = s_t_ns.template get<0>(q);

		atomicAdd(&vn.template get<0>(out_idx), 1);
	}
}
985 
/*! \brief Non-atomic variant of calc_force_number_box: each thread counts its
 * own particle's neighbors into vn[tid].
 *
 * \param start offset added to the flat thread id (chunked processing).
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_number_box_noato(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn, unsigned int start)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x + start;

	// tail guard
	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);

	for (auto nn_it = cl.getNNIteratorBox(cl.getCell(xp)) ; nn_it.isNext() ; ++nn_it)
	{
		auto q = nn_it.get_sort();	// value unused; read kept to mirror the original access pattern
		(void)q;

		// each thread owns slot tid, so a plain increment is safe
		++vn.template get<0>(tid);
	}
}
1006 
/*! \brief Count neighbor candidates per particle using the radius NN
 * iterator, accumulating atomically at the neighbor's non-sorted index.
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_number_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type vn)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// tail guard
	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);

	for (auto nn_it = cl.getNNIteratorRadius(cl.getCell(xp)) ; nn_it.isNext() ; ++nn_it)
	{
		auto q = nn_it.get_sort();

		int out_idx = s_t_ns.template get<0>(q);

		atomicAdd(&vn.template get<0>(out_idx), 1);
	}
}
1029 
/*! \brief Write each particle's neighbor list (non-sorted ids) using the box
 * NN iterator.
 *
 * v_nscan holds the scanned (prefix-summed) neighbor counts, so each thread
 * writes into its own disjoint slice of v_list -- no atomics needed.
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_list_box(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan ,vector_n_type v_list)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// tail guard
	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);
	// begin of this particle's slice in the flat neighbor list
	int write_pos = v_nscan.template get<0>(tid);

	for (auto nn_it = cl.getNNIteratorBox(cl.getCell(xp)) ; nn_it.isNext() ; ++nn_it)
	{
		auto q = nn_it.get_sort();

		v_list.template get<0>(write_pos) = s_t_ns.template get<0>(q);

		++write_pos;
	}
}
1054 
/*! \brief Write each particle's neighbor list (non-sorted ids) using the
 * symmetric NN iterator.
 *
 * v_nscan holds the scanned neighbor counts, so each thread writes into its
 * own disjoint slice of v_list -- no atomics needed.
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_list(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan ,vector_n_type v_list)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// tail guard
	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);
	// begin of this particle's slice in the flat neighbor list
	int write_pos = v_nscan.template get<0>(tid);

	for (auto nn_it = cl.getNNIterator(cl.getCell(xp)) ; nn_it.isNext() ; ++nn_it)
	{
		auto q = nn_it.get_sort();

		v_list.template get<0>(write_pos) = s_t_ns.template get<0>(q);

		++write_pos;
	}
}
1079 
/*! \brief Fill neighbor lists for the second half of a split cell-list
 *
 * Same as calc_force_list_box, but the write offset is the sum of the
 * global scan and a partial scan (the counts already produced by the
 * first split), so both splits write into the same flat list without
 * overlapping.
 *
 * \param pos particle positions (property 0)
 * \param s_t_ns sorted-to-non-sorted index map
 * \param cl cell-list kernel view
 * \param v_nscan scanned total neighbor counts
 * \param v_nscan_part scanned counts of the first split (extra offset)
 * \param v_list output neighbor-id list
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_list_box_partial(vector_pos pos,
					    vector_ns s_t_ns,
					    CellList_type cl,
					    vector_n_type v_nscan,
					    vector_n_type v_nscan_part,
					    vector_n_type v_list)
{
	unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);

	// offset = global scan + entries already written by the first split
	int cursor = v_nscan.template get<0>(tid) + v_nscan_part.template get<0>(tid);

	auto nnIt = cl.getNNIteratorBox(cl.getCell(xp));

	while (nnIt.isNext())
	{
		auto qs = nnIt.get_sort();

		v_list.template get<0>(cursor) = s_t_ns.template get<0>(qs);
		cursor++;

		++nnIt;
	}
}
1109 
/*! \brief Fill the per-particle neighbor list using the radius NN iterator
 *
 * One thread per particle. Neighbor ids (non-sorted) are written
 * contiguously into v_list starting at the offset given by the
 * exclusive scan v_nscan.
 *
 * \param pos particle positions (property 0)
 * \param s_t_ns sorted-to-non-sorted index map
 * \param cl cell-list kernel view
 * \param v_nscan scanned neighbor counts (write offsets)
 * \param v_list output neighbor-id list
 */
template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
__global__ void calc_force_list_rad(vector_pos pos, vector_ns s_t_ns, CellList_type cl, vector_n_type v_nscan ,vector_n_type v_list)
{
	unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	if (tid >= pos.size())
	{return;}

	Point<3,float> xp = pos.template get<0>(tid);

	// write cursor into the flat neighbor list
	int cursor = v_nscan.template get<0>(tid);

	auto nnIt = cl.getNNIteratorRadius(cl.getCell(xp));

	while (nnIt.isNext())
	{
		auto qs = nnIt.get_sort();

		v_list.template get<0>(cursor) = s_t_ns.template get<0>(qs);
		cursor++;

		++nnIt;
	}
}
1134 
/*! \brief Test dispatcher selecting how neighbors are enumerated
 *
 * Primary template: plain fixed-stencil neighborhood (getNNIterator /
 * calc_force_number / calc_force_list). set_radius is a no-op here.
 *
 * NOTE(review): the line declaring this struct was dropped during source
 * extraction; the name `execute_cl_test` is restored from the call sites
 * (execute_cl_test<impl>::set_radius / calc_num / calc_list / getNN).
 */
template<unsigned int impl>
struct execute_cl_test
{
	//! No-op: the plain-stencil implementation does not use a radius
	template<typename CellS, typename Cells_cpu_type, typename T>
	static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
	{
	}

	//! Launch the neighbor-count kernel (plain NN iterator)
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
	static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
	{
		auto ite = pl.getGPUIterator();

		CUDA_LAUNCH((calc_force_number),ite,pl.toKernel(),s_t_ns.toKernel(),
					cl2.toKernel(),n_out.toKernel());
	}

	//! Launch the neighbor-list kernel (plain NN iterator)
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
	static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2,n_out_scan_type & n_out_scan, nn_list_type & nn_list)
	{
		auto ite = pl.getGPUIterator();

		CUDA_LAUNCH((calc_force_list),ite,pl.toKernel(),
					s_t_ns.toKernel(),
					cl2.toKernel(),
					n_out_scan.toKernel(),
					nn_list.toKernel());
	}

	//! CPU-side reference iterator matching the GPU enumeration
	template<typename NN_type>
	static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIterator(cell))
	{
		return nn.getNNIterator(cell);
	}
};
1170 
/*! \brief impl = 1: radius-based neighbor enumeration
 *
 * Uses getNNIteratorRadius and the *_rad kernels; set_radius configures
 * the search radius on both the GPU and the CPU reference cell-list.
 *
 * NOTE(review): the `struct execute_cl_test<1>` declaration line was
 * dropped during source extraction and is restored here (impl 1 is the
 * value used by the radius test case below).
 */
template<>
struct execute_cl_test<1>
{
	//! Set the search radius on both GPU and CPU cell-lists
	template<typename CellS, typename Cells_cpu_type, typename T>
	static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
	{
		cl2.setRadius(radius);
		cl_cpu.setRadius(radius);
	}

	//! Launch the radius-based neighbor-count kernel
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
	static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
	{
		auto ite = pl.getGPUIterator();

		CUDA_LAUNCH((calc_force_number_rad<decltype(pl.toKernel()),
						   decltype(s_t_ns.toKernel()),
						   decltype(cl2.toKernel()),
						   decltype(n_out.toKernel())>),
					ite,pl.toKernel(),
					s_t_ns.toKernel(),
					cl2.toKernel(),
					n_out.toKernel());
	}

	//! Launch the radius-based neighbor-list kernel
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
	static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
	{
		auto ite = pl.getGPUIterator();

		CUDA_LAUNCH((calc_force_list_rad<decltype(pl.toKernel()),
						 decltype(s_t_ns.toKernel()),
						 decltype(cl2.toKernel()),
						 decltype(nn_list.toKernel())>),
					ite,pl.toKernel(),
					s_t_ns.toKernel(),
					cl2.toKernel(),
					n_out_scan.toKernel(),
					nn_list.toKernel());
	}

	//! CPU-side reference iterator matching the GPU enumeration
	template<typename NN_type>
	static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
	{
		return nn.getNNIteratorRadius(cell);
	}
};
1218 
/*! \brief impl = 2: box-stencil neighbor enumeration
 *
 * Uses getNNIteratorBox on the GPU side (the *_box kernels) while the
 * CPU reference still iterates by radius; also provides the split
 * variants calc_num_noato / calc_list_partial used by the split test.
 *
 * NOTE(review): the `struct execute_cl_test<2>` declaration line was
 * dropped during source extraction and is restored here (impl 2 is the
 * value used by the box test cases below).
 */
template<>
struct execute_cl_test<2>
{
	//! Set the search radius on both GPU and CPU cell-lists
	template<typename CellS, typename Cells_cpu_type, typename T>
	static void set_radius(CellS & cl2, Cells_cpu_type & cl_cpu, T & radius)
	{
		cl2.setRadius(radius);
		cl_cpu.setRadius(radius);
	}

	//! Neighbor count without atomics (iterates over sorted particles)
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
	static void calc_num_noato(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
	{
		auto ite = s_t_ns.getGPUIterator();

		CUDA_LAUNCH((calc_force_number_box_noato<decltype(pl.toKernel()),
							 decltype(s_t_ns.toKernel()),
							 decltype(cl2.toKernel()),
							 decltype(n_out.toKernel())>),
					ite,pl.toKernel(),
					s_t_ns.toKernel(),
					cl2.toKernel(),
					n_out.toKernel(),
					start);
	}

	//! Launch the box-stencil neighbor-count kernel
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_type>
	static void calc_num(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_type & n_out, unsigned int start)
	{
		auto ite = s_t_ns.getGPUIterator();

		CUDA_LAUNCH((calc_force_number_box<decltype(pl.toKernel()),
						   decltype(s_t_ns.toKernel()),
						   decltype(cl2.toKernel()),
						   decltype(n_out.toKernel())>),
					ite,
					pl.toKernel(),
					s_t_ns.toKernel(),
					cl2.toKernel(),
					n_out.toKernel(),
					start);
	}

	//! Launch the box-stencil neighbor-list kernel
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
	static void calc_list(pl_type & pl, s_t_ns_type & s_t_ns, cl2_type & cl2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
	{
		auto ite = s_t_ns.getGPUIterator();

		CUDA_LAUNCH((calc_force_list_box<decltype(pl.toKernel()),
						 decltype(s_t_ns.toKernel()),
						 decltype(cl2.toKernel()),
						 decltype(nn_list.toKernel())>),
					ite,pl.toKernel(),
					s_t_ns.toKernel(),
					cl2.toKernel(),
					n_out_scan.toKernel(),
					nn_list.toKernel());
	}

	//! Neighbor-list kernel for the second half of a split cell-list
	template<typename pl_type, typename s_t_ns_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
	static void calc_list_partial(pl_type & pl,
				      s_t_ns_type & s_t_ns,
				      cl2_type & cl2,
				      n_out_scan_type & n_out_scan,
				      n_out_scan_type & n_out_scan_partial,
				      nn_list_type & nn_list)
	{
		auto ite = s_t_ns.getGPUIterator();

		CUDA_LAUNCH((calc_force_list_box_partial),ite,pl.toKernel(),
					s_t_ns.toKernel(),
					cl2.toKernel(),
					n_out_scan.toKernel(),
					n_out_scan_partial.toKernel(),
					nn_list.toKernel());
	}

	//! CPU-side reference iterator (radius) used to validate the GPU lists
	template<typename NN_type>
	static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
	{
		return nn.getNNIteratorRadius(cell);
	}
};
1302 
/*! \brief Build a GPU cell-list, compute neighbor counts and lists, and
 *         validate both against an equivalent CPU cell-list.
 *
 * \param box simulation domain
 * \param npart number of random particles to generate
 * \param div cell subdivisions per dimension
 * \param box_nn extent of the box stencil (forwarded to setBoxNN)
 *
 * The impl template parameter selects the neighbor-enumeration strategy
 * via execute_cl_test<impl>.
 */
template<unsigned int dim, typename T, typename CellS, int impl>
void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim],int box_nn = 1)
{
	// Origin
	Point<dim,T> org({0.0,0.0,0.0});

	// id Cell list (GPU) and an equivalent CPU cell-list used as reference
	CellS cl2(box,div,2);

	CellList<dim,T,Mem_fast<>,shift<dim,T>> cl_cpu(box,div,2);

	cl2.setBoxNN(box_nn);

	// radius = 2 cells
	T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
	execute_cl_test<impl>::set_radius(cl2,cl_cpu,radius);

	// vector of particles
	// NOTE(review): these declarations were dropped during source
	// extraction; restored following the pattern used by the (commented)
	// performance test in this file.
	openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl;
	openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl_out;

	openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp;
	openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp_out;

	openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out;

	// create random particles

	fill_random_parts<3>(box,pl,pl_prp,npart);

	pl_prp_out.resize(pl.size());
	pl_out.resize(pl.size());
	n_out.resize(pl.size()+1);
	n_out.fill<0>(0);

	pl_prp.resize(pl.size());
	pl_prp_out.resize(pl.size());

	pl.template hostToDevice<0>();
	pl_prp.template hostToDevice<0,1>();

	// Construct an equivalent CPU cell-list

	auto it2 = pl.getIterator();

	while (it2.isNext())
	{
		auto p = it2.get();

		Point<3,T> xp = pl.get(p);

		cl_cpu.add(xp,p);

		++it2;
	}

	// first half of the particles are "domain" particles
	size_t g_m = pl.size() / 2;

	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
	cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);

	auto & s_t_ns = cl2.getSortToNonSort();

	pl.template hostToDevice<0>();

	execute_cl_test<impl>::calc_num(pl,s_t_ns,cl2,n_out,0);

	// Domain particles: every domain sort-id must map below g_m

	auto & gdsi = cl2.getDomainSortIds();
	gdsi.template deviceToHost<0>();
	s_t_ns.template deviceToHost<0>();

	bool match = true;
	for (size_t i = 0 ; i < g_m ; i++)
	{
		unsigned int p = gdsi.template get<0>(i);

		match &= (s_t_ns.template get<0>(p) < g_m);
	}

	BOOST_REQUIRE_EQUAL(match,true);

	// Check: GPU neighbor counts must match the CPU reference

	n_out.deviceToHost<0>();

	{
		bool check = true;
		auto it = pl.getIterator();

		while(it.isNext())
		{
			auto p = it.get();

			Point<dim,T> xp = pl.get(p);

			// Get NN iterator on the CPU reference list
			auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp));

			size_t n_ele = 0;
			while (NN_it.isNext())
			{
				auto q = NN_it.get();

				n_ele++;

				++NN_it;
			}

			check &= n_ele == n_out.template get<0>(p);

			if (check == false)
			{
				std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;
				break;
			}

			++it;
		}
		BOOST_REQUIRE_EQUAL(check,true);
	}

	// now we scan the buffer to get per-particle write offsets

	// NOTE(review): these two declarations were dropped during source
	// extraction; restored (same element type as n_out).
	openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out_scan;
	openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> nn_list;

	n_out_scan.resize(pl.size()+1);

	openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);
	n_out_scan.template deviceToHost<0>();

	// no neighbors at all: nothing further to test
	if (n_out_scan.template get<0>(pl.size()) == 0)
	{return;}

	nn_list.resize(n_out_scan.template get<0>(pl.size()));

	// Now for each particle we construct the list

	pl.template hostToDevice<0>();

	execute_cl_test<impl>::calc_list(pl,s_t_ns,cl2,n_out_scan,nn_list);

	nn_list.template deviceToHost<0>();

	// Check: GPU neighbor lists must match the CPU reference (as sets)

	n_out.deviceToHost<0>();

	{
		bool check = true;
		auto it = pl.getIterator();

		while(it.isNext())
		{
			auto p = it.get();

			Point<dim,T> xp = pl.get(p);

			// CPU reference neighbor list
			openfpm::vector<int> cpu_list;

			auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp));

			while (NN_it.isNext())
			{
				auto q = NN_it.get();

				cpu_list.add(q);

				++NN_it;
			}

			// GPU neighbor list for particle p
			openfpm::vector<int> gpu_list;

			for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
			{
				gpu_list.add(nn_list.template get<0>(i));
			}

			// sort both vectors: CPU and GPU enumerate in different orders

			cpu_list.sort();
			gpu_list.sort();

			for (size_t j = 0 ; j < cpu_list.size() ; j++)
			{check &= cpu_list.get(j) == gpu_list.get(j);}

			++it;
		}

		BOOST_REQUIRE_EQUAL(check,true);
	}
}
1504 
/*! \brief Same as Test_cell_gpu_force, but the GPU cell-list is built in
 *         two splits (first and second half of the particle set) and the
 *         results of both are merged and validated against the CPU list.
 *
 * \param box simulation domain
 * \param npart number of random particles to generate
 * \param div cell subdivisions per dimension
 * \param box_nn extent of the box stencil (forwarded to setBoxNN)
 */
template<unsigned int dim, typename T, typename CellS, int impl>
void Test_cell_gpu_force_split(SpaceBox<dim,T> & box, size_t npart, const size_t (& div)[dim],int box_nn = 1)
{
	// Origin
	Point<dim,T> org({0.0,0.0,0.0});

	// two GPU cell-lists, one per particle split
	CellS cl2_split1(box,div,2);
	CellS cl2_split2(box,div,2);

	CellList<dim,T,Mem_fast<>,shift<dim,T>> cl_cpu(box,div,2);

	cl2_split1.setBoxNN(box_nn);
	cl2_split2.setBoxNN(box_nn);

	// radius = 2 cells
	T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
	execute_cl_test<impl>::set_radius(cl2_split1,cl_cpu,radius);
	execute_cl_test<impl>::set_radius(cl2_split2,cl_cpu,radius);

	// vector of particles
	// NOTE(review): these declarations were dropped during source
	// extraction; restored following the pattern used by the (commented)
	// performance test in this file.
	openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl;
	openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl_out;

	openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp;
	openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp_out;

	openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out;
	openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out_partial;

	// create random particles

	fill_random_parts<3>(box,pl,pl_prp,npart);

	pl_prp_out.resize(pl.size());
	pl_out.resize(pl.size());
	n_out.resize(pl.size()+1);
	n_out.fill<0>(0);

	pl_prp.resize(pl.size());
	pl_prp_out.resize(pl.size());

	pl.template hostToDevice<0>();
	pl_prp.template hostToDevice<0,1>();

	// Construct an equivalent CPU cell-list

	auto it2 = pl.getIterator();

	while (it2.isNext())
	{
		auto p = it2.get();

		Point<3,T> xp = pl.get(p);

		cl_cpu.add(xp,p);

		++it2;
	}

	// first half of the particles are "domain" particles
	size_t g_m = pl.size() / 2;

	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
	cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
	cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
	auto & s_t_ns_s1 = cl2_split1.getSortToNonSort();
	auto & s_t_ns_s2 = cl2_split2.getSortToNonSort();

	// count neighbors from split 1, snapshot the partial counts, then add split 2
	execute_cl_test<impl>::calc_num_noato(pl,s_t_ns_s1,cl2_split1,n_out,0);
	n_out_partial = n_out;
	execute_cl_test<impl>::calc_num_noato(pl,s_t_ns_s2,cl2_split2,n_out,0);

	// Domain particles: every domain sort-id of split 1 must map below g_m

	auto & gdsi_s1 = cl2_split1.getDomainSortIds();
	gdsi_s1.template deviceToHost<0>();
	s_t_ns_s1.template deviceToHost<0>();

	bool match = true;
	for (size_t i = 0 ; i < g_m ; i++)
	{
		unsigned int p = gdsi_s1.template get<0>(i);

		match &= (s_t_ns_s1.template get<0>(p) < g_m);
	}

	BOOST_REQUIRE_EQUAL(match,true);

	// Check: merged neighbor counts of the first half must match the CPU reference

	n_out.deviceToHost<0>();

	{
		bool check = true;
		auto it = pl.getIteratorTo(pl.size()/2-1);

		while(it.isNext())
		{
			auto p = it.get();

			Point<dim,T> xp = pl.get(p);

			// Get NN iterator on the CPU reference list
			auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp));

			size_t n_ele = 0;
			while (NN_it.isNext())
			{
				auto q = NN_it.get();

				n_ele++;

				++NN_it;
			}

			check &= n_ele == n_out.template get<0>(p);

			if (check == false)
			{
				std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;
				break;
			}

			++it;
		}
		BOOST_REQUIRE_EQUAL(check,true);
	}

	// now we scan the buffer to get per-particle write offsets

	// NOTE(review): these two declarations were dropped during source
	// extraction; restored (same element type as n_out).
	openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out_scan;
	openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> nn_list;

	n_out_scan.resize(n_out.size());

	openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);

	n_out_scan.template deviceToHost<0>();

	// no neighbors at all: nothing further to test
	if (n_out_scan.template get<0>(pl.size()) == 0)
	{return;}

	nn_list.resize(n_out_scan.template get<0>(pl.size()));

	// Now for each particle we construct the list: split 1 writes at the
	// scanned offsets, split 2 continues after split 1's partial counts

	pl.template hostToDevice<0>();

	execute_cl_test<impl>::calc_list(pl,s_t_ns_s1,cl2_split1,n_out_scan,nn_list);
	execute_cl_test<impl>::calc_list_partial(pl,s_t_ns_s2,cl2_split2,n_out_scan,n_out_partial,nn_list);

	nn_list.template deviceToHost<0>();

	// Check: merged neighbor lists of the first half must match the CPU reference

	n_out.deviceToHost<0>();

	{
		bool check = true;
		auto it = pl.getIteratorTo(pl.size()/2-1);

		while(it.isNext())
		{
			auto p = it.get();

			Point<dim,T> xp = pl.get(p);

			// CPU reference neighbor list
			openfpm::vector<int> cpu_list;

			auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp));

			while (NN_it.isNext())
			{
				auto q = NN_it.get();

				cpu_list.add(q);

				++NN_it;
			}

			// GPU neighbor list for particle p
			openfpm::vector<int> gpu_list;

			for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
			{
				gpu_list.add(nn_list.template get<0>(i));
			}

			// sort both vectors: CPU and GPU enumerate in different orders

#ifndef MAKE_CELLLIST_DETERMINISTIC

			cpu_list.sort();
			gpu_list.sort();

#endif

			for (size_t j = 0 ; j < cpu_list.size() ; j++)
			{check &= cpu_list.get(j) == gpu_list.get(j);}

			if (check == false)
			{
				std::cout << "NPARTS: " << npart << std::endl;

				for (size_t j = 0 ; j < cpu_list.size() ; j++)
				{std::cout << cpu_list.get(j) << " " << gpu_list.get(j) << std::endl;}

				break;
			}

			++it;
		}

		BOOST_REQUIRE_EQUAL(check,true);
	}
}
1726 
//! Run the force test with the box-stencil implementation (impl = 2)
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box)
{
	std::cout << "Test cell list GPU" << "\n";

	SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});

	typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl_gpu_type;

	// small and larger particle counts, unit domain
	Test_cell_gpu_force<3,float,cl_gpu_type,2>(box,1000,{32,32,32});
	Test_cell_gpu_force<3,float,cl_gpu_type,2>(box,10000,{32,32,32});

	// same, on a domain with a negative origin
	Test_cell_gpu_force<3,float,cl_gpu_type,2>(box2,1000,{32,32,32});
	Test_cell_gpu_force<3,float,cl_gpu_type,2>(box2,10000,{32,32,32});

	std::cout << "End cell list GPU" << "\n";

	// Test the cell list
}
1744 
//! Run the split (two-part construction) force test with impl = 2
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split)
{
	std::cout << "Test cell list GPU split" << "\n";

	SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});

	typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl_gpu_type;

	// small and larger particle counts, unit domain
	Test_cell_gpu_force_split<3,float,cl_gpu_type,2>(box,1000,{32,32,32});
	Test_cell_gpu_force_split<3,float,cl_gpu_type,2>(box,10000,{32,32,32});

	// same, on a domain with a negative origin
	Test_cell_gpu_force_split<3,float,cl_gpu_type,2>(box2,1000,{32,32,32});
	Test_cell_gpu_force_split<3,float,cl_gpu_type,2>(box2,10000,{32,32,32});

	std::cout << "End cell list GPU split" << "\n";

	// Test the cell list
}
1762 
1763 /*BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split_test_performance)
1764 {
1765  typedef float T;
1766  constexpr int dim = 3;
1767  typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>> CellS;
1768 
1769  std::cout << "Performance" << "\n";
1770 
1771  SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1772  size_t div[] = {64,64,64};
1773 
1774  // Origin
1775  Point<dim,T> org({0.0,0.0,0.0});
1776 
1777  // id Cell list
1778  CellS cl2_split1(box,div,2);
1779  CellS cl2_split2(box,div,2);
1780 
1781  cl2_split1.setBoxNN(2);
1782  cl2_split2.setBoxNN(2);
1783 
1784  T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
1785 
1786  // vector of particles
1787 
1788  openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl;
1789  openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl_out;
1790 
1791  openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp;
1792  openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> pl_prp_out;
1793 
1794  openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out;
1795  openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out_partial;
1796 
1797  // create random particles
1798 
1799  fill_random_parts<3>(box,pl,pl_prp,1400000);
1800 
1801  pl_prp_out.resize(pl.size());
1802  pl_out.resize(pl.size());
1803  n_out.resize(pl.size()+1);
1804  n_out.fill<0>(0);
1805 
1806  pl_prp.resize(pl.size());
1807  pl_prp_out.resize(pl.size());
1808 
1809  pl.hostToDevice<0>();
1810  pl_prp.hostToDevice<0,1>();
1811 
1812  size_t g_m = pl.size() / 2;
1813 
1814  mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
1815 
1816  cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
1817  cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
1818 
1819  cudaDeviceSynchronize();
1820 
1821  timer t;
1822  t.start();
1823 
1824  cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
1825  cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
1826 
1827  t.stop();
1828  std::cout << "Time: " << t.getwct() << std::endl;
1829 
1830  cudaDeviceSynchronize();
1831 
1832  cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size());
1833 
1834  cudaDeviceSynchronize();
1835 
1836  timer t2;
1837  t2.start();
1838 
1839  cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size());
1840 
1841  t2.stop();
1842  std::cout << "Time: " << t2.getwct() << std::endl;
1843 
1844  std::cout << "Performance" << "\n";
1845 
1846  // Test the cell list
1847 }*/
1848 
1849 #if 0
1850 
//! Disabled (#if 0): force test with the sparse cell-list variant, impl = 2
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_sparse)
{
	std::cout << "Test cell list GPU" << "\n";

	SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});

	typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true> cl_sparse_type;

	// box_nn = 2, unit domain
	Test_cell_gpu_force<3,float,cl_sparse_type,2>(box,1000,{32,32,32},2);
	Test_cell_gpu_force<3,float,cl_sparse_type,2>(box,10000,{32,32,32},2);

	// box_nn = 2, domain with a negative origin
	Test_cell_gpu_force<3,float,cl_sparse_type,2>(box2,1000,{32,32,32},2);
	Test_cell_gpu_force<3,float,cl_sparse_type,2>(box2,10000,{32,32,32},2);

	std::cout << "End cell list GPU" << "\n";

	// Test the cell list
}
1868 
1869 #endif
1870 
//! Run the force test with the radius-based implementation (impl = 1)
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_radius)
{
	std::cout << "Test cell list GPU" << "\n";

	SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});

	typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl_gpu_type;

	// small and larger particle counts, unit domain
	Test_cell_gpu_force<3,float,cl_gpu_type,1>(box,1000,{32,32,32});
	Test_cell_gpu_force<3,float,cl_gpu_type,1>(box,10000,{32,32,32});

	// same, on a domain with a negative origin
	Test_cell_gpu_force<3,float,cl_gpu_type,1>(box2,1000,{32,32,32});
	Test_cell_gpu_force<3,float,cl_gpu_type,1>(box2,10000,{32,32,32});

	std::cout << "End cell list GPU" << "\n";

	// Test the cell list
}
1888 
1889 #if 0
1890 
//! Disabled (#if 0): force test with the plain-stencil implementation (impl = 0)
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force)
{
	std::cout << "Test cell list GPU" << "\n";

	SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});

	typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl_gpu_type;

	// small and larger particle counts, unit domain
	Test_cell_gpu_force<3,float,cl_gpu_type,0>(box,1000,{16,16,16});
	Test_cell_gpu_force<3,float,cl_gpu_type,0>(box,10000,{16,16,16});

	// same, on a domain with a negative origin
	Test_cell_gpu_force<3,float,cl_gpu_type,0>(box2,1000,{16,16,16});
	Test_cell_gpu_force<3,float,cl_gpu_type,0>(box2,10000,{16,16,16});

	std::cout << "End cell list GPU" << "\n";

	// Test the cell list
}
1908 
//! Disabled (#if 0): force test with the sparse cell-list variant, impl = 0
BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_sparse)
{
	std::cout << "Test cell list GPU force sparse" << "\n";

	SpaceBox<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	SpaceBox<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});

	typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>,unsigned int,int,true> cl_sparse_type;

	// small and larger particle counts, unit domain
	Test_cell_gpu_force<3,float,cl_sparse_type,0>(box,1000,{16,16,16});
	Test_cell_gpu_force<3,float,cl_sparse_type,0>(box,10000,{16,16,16});

	// same, on a domain with a negative origin
	Test_cell_gpu_force<3,float,cl_sparse_type,0>(box2,1000,{16,16,16});
	Test_cell_gpu_force<3,float,cl_sparse_type,0>(box2,10000,{16,16,16});

	std::cout << "End cell list GPU force sparse" << "\n";

	// Test the cell list
}
1926 
1927 #endif
1928 
/*! \brief For each particle, store the population of its cell
 *
 * One thread per particle; threads past parts.size() exit immediately.
 *
 * \param cl cell-list kernel view
 * \param parts particle positions (property 0)
 * \param output per-particle cell population (property 0)
 */
template<typename CellList_type, typename Vector_type, typename Vector_out>
__global__ void cl_offload_gpu(CellList_type cl, Vector_type parts, Vector_out output)
{
	unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	if (tid >= parts.size())
	{return;}

	// number of elements stored in the cell containing this particle
	Point<3,float> xp = parts.template get<0>(tid);
	output.template get<0>(tid) = cl.getNelements(cl.getCell(xp));
}
1940 
/*! \brief For each particle, copy the contents of its cell into a flat list
 *
 * One thread per particle. The elements of the particle's cell are
 * written contiguously into `list` starting at the offset scan[p].
 *
 * \param cl cell-list kernel view
 * \param parts particle positions (property 0)
 * \param scan per-particle write offsets (property 0)
 * \param list flat output list (property 0)
 */
template<typename CellList_type, typename Vector_type, typename Vector_scan_type, typename Vector_list_type>
__global__ void cl_offload_gpu_list(CellList_type cl, Vector_type parts, Vector_scan_type scan, Vector_list_type list)
{
	unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	if (tid >= parts.size())
	{return;}

	Point<3,float> xp = parts.template get<0>(tid);

	int cell = cl.getCell(xp);
	int n_ele = cl.getNelements(cell);
	int base = scan.template get<0>(tid);

	// copy every element of the cell into the flat list
	for (int j = 0 ; j < n_ele ; j++)
	{list.template get<0>(base+j) = cl.get(cell,j);}

}
1960 
1961 #if 0
1962 
// NOTE(review): this test is disabled (#if 0) and lost several declaration
// lines during source extraction (the cell-list `cl1`, the vectors `v`,
// `os`, `os_scan`, `os_list`); the remaining code is kept unchanged.
// It offloads a CPU-built cell list to the GPU and checks per-cell counts
// and contents against the host-side cell list.
BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
{
	std::cout << "Test cell list offload gpu" << "\n";

	// Subdivisions
	size_t div[3] = {10,10,10};

	// grid info
	grid_sm<3,void> g_info(div);

	Box<3,float> box({-1.0,-1.0,-1.0},{1.0,1.0,1.0});

	// CellS = CellListM<dim,T,8>
	// NOTE(review): the declaration of the cell list `cl1` was dropped here

	// NOTE(review): the declarations of the vectors `v` and `os` were dropped here
	v.resize(10000);
	os.resize(v.size());

	// random positions in [-1,1]^3, inserted into the CPU cell list
	for (size_t i = 0 ; i < v.size() ; i++)
	{
		v.template get<0>(i)[0] = 2.0 * (float)rand() / RAND_MAX - 1.0;
		v.template get<0>(i)[1] = 2.0 * (float)rand() / RAND_MAX - 1.0;
		v.template get<0>(i)[2] = 2.0 * (float)rand() / RAND_MAX - 1.0;

		Point<3,float> xp = v.template get<0>(i);

		cl1.add(xp,i);
	}

	auto ite = v.getGPUIterator();

	cl1.hostToDevice();
	v.hostToDevice<0>();

	CUDA_LAUNCH_DIM3((cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())>),ite.wthr,ite.thr,cl1.toKernel(),v.toKernel(),os.toKernel());

	os.deviceToHost<0>();

	// per-particle cell population must agree with the host cell list
	bool match = true;
	for (size_t i = 0 ; i < os.size() ; i++)
	{
		Point<3,float> xp = v.template get<0>(i);

		match &= os.template get<0>(i) == cl1.getNelements(cl1.getCell(xp));
	}

	BOOST_REQUIRE_EQUAL(match,true);

	// now we scan the vector out

	// NOTE(review): the declaration of `os_scan` was dropped here
	os_scan.resize(v.size());

	mgpu::ofp_context_t ctx;
	openfpm::scan((int *)os.template getDeviceBuffer<0>(),os.size(),(int *)os_scan.template getDeviceBuffer<0>(),ctx);

	os_scan.deviceToHost<0>();
	os.deviceToHost<0>(os.size()-1,os.size()-1);
	// total list length = last scan value + last count (exclusive scan)
	size_t size_list = os_scan.template get<0>(os_scan.size()-1) + os.template get<0>(os.size()-1);

	// NOTE(review): the declaration of `os_list` was dropped here
	os_list.resize(size_list);

	CUDA_LAUNCH_DIM3((cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
			decltype(os_scan.toKernel()),decltype(os_list.toKernel())>),ite.wthr,ite.thr,
			cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());

	os_list.deviceToHost<0>();

	// flat list contents must agree element-by-element with the host cell list
	match = true;
	for (size_t i = 0 ; i < os.size() ; i++)
	{
		Point<3,float> xp = v.template get<0>(i);

		for (size_t j = 0 ; j < cl1.getNelements(cl1.getCell(xp)) ; j++)
		{
			match &= os_list.template get<0>(os_scan.template get<0>(i)+j) == cl1.get(cl1.getCell(xp),j);
		}
	}

	BOOST_REQUIRE_EQUAL(match,true);

	std::cout << "End cell list offload gpu" << "\n";

	// Test the cell list
}
2051 
2052 #endif
2053 
//! Check that swapping two GPU cell-lists preserves cell contents:
//! cl2 and cl4 are built identically, cl3 takes cl2's data via swap,
//! then cl3 and cl4 must hold the same elements cell by cell.
BOOST_AUTO_TEST_CASE( CellList_swap_test )
{
	size_t npart = 4096;

	Box<3,float> box({-1.0,-1.0,-1.0},{1.0,1.0,1.0});

	// Subdivisions
	size_t div[3] = {10,10,10};

	// Origin
	Point<3,float> org({0.0,0.0,0.0});

	// id Cell list
	CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl2(box,div,2);
	CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl3(box,div,2);
	CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl4(box,div,2);

	// vector of particles
	// NOTE(review): these declarations were dropped during source
	// extraction; restored following the pattern used by the (commented)
	// performance test in this file.
	openfpm::vector<Point<3,float>,CudaMemory,typename memory_traits_inte<Point<3,float>>::type,memory_traits_inte> pl;
	openfpm::vector<Point<3,float>,CudaMemory,typename memory_traits_inte<Point<3,float>>::type,memory_traits_inte> pl_out;

	openfpm::vector<aggregate<float,float[3]>,CudaMemory,typename memory_traits_inte<aggregate<float,float[3]>>::type,memory_traits_inte> pl_prp;
	openfpm::vector<aggregate<float,float[3]>,CudaMemory,typename memory_traits_inte<aggregate<float,float[3]>>::type,memory_traits_inte> pl_prp_out;

	// create random particles

	fill_random_parts<3>(box,pl,pl_prp,npart);

	pl_prp_out.resize(pl.size());
	pl_out.resize(pl.size());

	pl_prp.resize(pl.size());
	pl_prp_out.resize(pl.size());

	pl.template hostToDevice<0>();
	pl_prp.template hostToDevice<0,1>();

	size_t g_m = pl.size() / 2;

	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
	cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
	cl4.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);

	cl3.swap(cl2);

	// move device to host

	cl3.debug_deviceToHost();
	cl4.debug_deviceToHost();

	BOOST_REQUIRE_EQUAL(cl3.getNCells(),cl4.getNCells());

	// NOTE(review): the declarations of `s1` and `s2` were dropped during
	// source extraction; restored as plain element-id vectors.
	openfpm::vector<size_t> s1;
	openfpm::vector<size_t> s2;

	bool check = true;
	for (size_t i = 0 ; i < cl3.getNCells() ; i++)
	{
		check &= cl3.getNelements(i) == cl4.getNelements(i);

		for (size_t j = 0 ; j < cl3.getNelements(i) ; j++)
		{
			s1.add(cl3.get(i,j));
			s2.add(cl4.get(i,j));
		}

		// compare the two cells as sets (element order may differ)
		s1.sort();
		s2.sort();

		for (size_t j = 0 ; j < s1.size() ; j++)
		{
			check &= s1.get(j) == s2.get(j);
		}
	}

	BOOST_REQUIRE_EQUAL(check,true);

}
2134 
2135 BOOST_AUTO_TEST_SUITE_END()
2136 
This class represent an N-dimensional box.
Definition: SpaceBox.hpp:26
__device__ __host__ grid_key_dx< N > InvLinId(mem_id id) const
Construct.
Definition: grid_sm.hpp:589
__device__ __host__ size_t size() const
Return the size of the grid.
Definition: grid_sm.hpp:637
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
Definition: Box.hpp:556
void setGPUInsertBuffer(int nblock, int nslot)
set the gpu insert buffer for every block
This class implement the point shape in an N-dimensional space.
Definition: Point.hpp:27
size_t size()
Stub size.
Definition: map_vector.hpp:211
Transform the boost::fusion::vector into memory specification (memory_traits)
Definition: memory_conf.hpp:83
__device__ __host__ void setHigh(int i, T val)
set the high interval of the box
Definition: Box.hpp:544
mem_id LinId(const grid_key_dx< N, ids_type > &gk, const char sum_id[N]) const
Linearization of the grid_key_dx with a specified shift.
Definition: grid_sm.hpp:434
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
Definition: Point.hpp:172
__device__ __host__ void setLow(int i, T val)
set the low interval of the box
Definition: Box.hpp:533
This class implement an NxN (dense) matrix.
Definition: Matrix.hpp:32
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
This class represent an N-dimensional box.
Definition: Box.hpp:60
vector_sparse_gpu_ker< T, Ti, layout_base > toKernel()
toKernel function transform this structure into one that can be used on GPU
Declaration grid_sm.
Definition: grid_sm.hpp:147
void flush(mgpu::ofp_context_t &context, flush_type opt=FLUSH_ON_HOST)
merge the added element to the main data array
Declaration grid_key_dx_iterator_sub.
Definition: grid_sm.hpp:156
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:214
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202
Class for FAST cell list implementation.
Definition: CellList.hpp:356
__device__ __host__ T getHigh(int i) const
get the high interval of the box
Definition: Box.hpp:567
No transformation.