OpenFPM 5.2.0
Project that contains the implementation of distributed structures
CellList_gpu_test.cu
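This page lists the unit tests for the GPU cell list (CellList_gpu): computing cell indices on the device, reordering particles by cell, counting and listing neighbours (box and radius neighbourhoods, dense and sparse storage), constructing the list over two halves of a particle set, and swapping cell lists. The tests share the pattern sketched below; the sketch is a reading aid assembled from calls that appear later in this file, with the particle vectors vPos/vPrp and the ghostMarker assumed to be set up elsewhere, so the simplified setup is an assumption rather than the exact API.

 // Minimal usage sketch (assumption: vPos/vPrp are OpenFPM GPU vectors already filled,
 // ghostMarker separates domain particles from ghost particles)
 Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
 size_t div[3] = {8,8,8};
 CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cellList(box,div,2);

 vPos.template hostToDevice<0>();
 vPrp.template hostToDevice<0,1>();
 gpu::ofp_context_t gpuContext(gpu::gpu_context_opt::no_print_props);
 cellList.construct(vPos,vPrp,gpuContext,ghostMarker,0,vPos.size());

 // Inside a __global__ kernel the cell list is passed as cellList.toKernel() and queried as:
 //   auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
 //   while (it.isNext()) { auto q = it.get_sort(); /* index into the cell-sorted buffer */ ++it; }
 // getSortToNonSort() maps a sorted index q back to the original particle index.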
1 /*
2  * CellList_gpu_test.cu
3  *
4  * Created on: Jun 13, 2018
5  * Author: i-bird
6  */
7 
8 #include "util/cuda_util.hpp"
9 #include "config.h"
10 #define BOOST_TEST_DYN_LINK
11 #include <boost/test/unit_test.hpp>
12 
13 #include "NN/CellList/cuda/CellList_gpu.hpp"
14 #include "NN/CellList/CellList.hpp"
15 #include "util/boost/boost_array_openfpm.hpp"
16 #include "Point_test.hpp"
17 #include "util/cuda_util.hpp"
18 
19 BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
20 
21 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
22 void test_sub_index()
23 {
26 
27  // fill with some particles
28 
30 
31  // create 10 particles
32 
33  Point<dim,T> p1({0.2,0.2,0.2});
34  Point<dim,T> p2({0.9,0.2,0.2});
35  Point<dim,T> p3({0.2,0.9,0.2});
36  Point<dim,T> p4({0.2,0.2,0.9});
37  Point<dim,T> p5({0.9,0.9,0.2});
38  Point<dim,T> p6({0.9,0.2,0.9});
39  Point<dim,T> p7({0.2,0.9,0.9});
40  Point<dim,T> p8({0.9,0.9,0.9});
41  Point<dim,T> p9({0.0,0.0,0.0});
42  Point<dim,T> p10({0.205,0.205,0.205});
43 
44  vPos.add(p1);
45  vPos.add(p2);
46  vPos.add(p3);
47  vPos.add(p4);
48  vPos.add(p5);
49  vPos.add(p6);
50  vPos.add(p7);
51  vPos.add(p8);
52  vPos.add(p9);
53  vPos.add(p10);
54 
55  openfpm::array<T,dim> spacing;
58 
59  for (size_t i = 0 ; i < dim ; i++)
60  {
61  div[i] = 17;
62  spacing[i] = 0.1;
63  off[i] = 0;
64  }
65 
66  cl_n.resize(17*17*17);
67  cl_n.template fill<0>(0);
68  //CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(unsigned int)));
69 
70  cellIndex_LocalIndex.resize(vPos.size());
71 
72  size_t sz[3] = {17,17,17};
73  grid_sm<3,void> gr(sz);
74 
75  auto ite = vPos.getGPUIterator();
76 
77  vPos.template hostToDevice<0>();
78 
79  Matrix<dim,T> mt;
80  Point<dim,T> pt;
81 
82  no_transform_only<dim,T> t(mt,pt);
83 
84 
85  CUDA_LAUNCH_DIM3((fill_cellIndex_LocalIndex<dim,T,ids_type,no_transform_only<dim,T>>),ite.wthr,ite.thr,div,
86  spacing,
87  off,
88  t,
89  vPos.size(),
90  (size_t)0,
91  vPos.toKernel(),
92  cl_n.toKernel(),
93  cellIndex_LocalIndex.toKernel());
94 
95  cl_n.template deviceToHost<0>();
96  cellIndex_LocalIndex.template deviceToHost<0>();
97 
98  // every cell that received particles must report a non-zero count
99 
100  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(0)[0],gr.LinId({2,2,2}));
101  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(1)[0],gr.LinId({9,2,2}));
102  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(2)[0],gr.LinId({2,9,2}));
103  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(3)[0],gr.LinId({2,2,9}));
104  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(4)[0],gr.LinId({9,9,2}));
105  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(5)[0],gr.LinId({9,2,9}));
106  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(6)[0],gr.LinId({2,9,9}));
107  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(7)[0],gr.LinId({9,9,9}));
108  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(8)[0],gr.LinId({0,0,0}));
109  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(9)[0],gr.LinId({2,2,2}));
110 
111  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
112  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
113  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
114  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
115  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
116  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
117  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
118  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
119  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
120 }
121 
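For reference, grid_sm<3,void>::LinId linearizes a key {x,y,z} of the 17x17x17 grid as x + 17*y + 17*17*z (the count test further below relies on the same mapping, e.g. {5,5,5} -> 1535). With spacing 0.1 a particle at (0.2,0.2,0.2) therefore falls in cell {2,2,2}, linear id 2 + 34 + 578 = 614, and since both p1 and p10 land there that cell's counter is expected to be 2 while every other occupied cell holds exactly one particle.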
122 
123 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
124 void test_sub_index2()
125 {
128 
129  // fill with some particles
130 
132 
133  // Make the test harder to pass by giving vPos and cellIndex_LocalIndex different capacities
134  vPos.resize(256);
135  vPos.resize(0);
136 
137  // create 10 particles
138 
139  Point<dim,T> p1({0.2,0.2,0.2});
140  Point<dim,T> p2({0.9,0.2,0.2});
141  Point<dim,T> p3({0.2,0.9,0.2});
142  Point<dim,T> p4({0.2,0.2,0.9});
143  Point<dim,T> p5({0.9,0.9,0.2});
144  Point<dim,T> p6({0.9,0.2,0.9});
145  Point<dim,T> p7({0.2,0.9,0.9});
146  Point<dim,T> p8({0.9,0.9,0.9});
147  Point<dim,T> p9({0.0,0.0,0.0});
148  Point<dim,T> p10({0.205,0.205,0.205});
149 
150  Point<dim,T> pt({-0.3,-0.3,-0.3});
151 
152  p1 += pt;
153  p2 += pt;
154  p3 += pt;
155  p4 += pt;
156  p5 += pt;
157  p6 += pt;
158  p7 += pt;
159  p8 += pt;
160  p9 += pt;
161  p10 += pt;
162 
163 
164  vPos.add(p1);
165  vPos.add(p2);
166  vPos.add(p3);
167  vPos.add(p4);
168  vPos.add(p5);
169  vPos.add(p6);
170  vPos.add(p7);
171  vPos.add(p8);
172  vPos.add(p9);
173  vPos.add(p10);
174 
175  openfpm::array<T,dim> spacing;
178 
179  for (size_t i = 0 ; i < dim ; i++)
180  {
181  div[i] = 17;
182  spacing[i] = 0.1;
183  off[i] = 0;
184  }
185 
186  cl_n.resize(17*17*17);
187  cl_n.template fill<0>(0);
188 // CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(unsigned int)));
189 
190  cellIndex_LocalIndex.resize(vPos.size());
191 
192  size_t sz[3] = {17,17,17};
193  grid_sm<3,void> gr(sz);
194 
195  auto ite = vPos.getGPUIterator();
196 
197  vPos.template hostToDevice<0>();
198 
199  Matrix<dim,T> mt;
200 
201  shift_only<dim,T> t(mt,pt);
202 
203 
204  CUDA_LAUNCH_DIM3((fill_cellIndex_LocalIndex<dim,T,ids_type,shift_only<dim,T>>),ite.wthr,ite.thr,div,
205  spacing,
206  off,
207  t,
208  vPos.size(),
209  (size_t)0,
210  vPos.toKernel(),
211  cl_n.toKernel(),
212  cellIndex_LocalIndex.toKernel());
213 
214  cl_n.template deviceToHost<0>();
215  cellIndex_LocalIndex.template deviceToHost<0>();
216 
217  // every cell that received particles must report a non-zero count
218 
219  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(0)[0],gr.LinId({2,2,2}));
220  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(1)[0],gr.LinId({9,2,2}));
221  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(2)[0],gr.LinId({2,9,2}));
222  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(3)[0],gr.LinId({2,2,9}));
223  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(4)[0],gr.LinId({9,9,2}));
224  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(5)[0],gr.LinId({9,2,9}));
225  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(6)[0],gr.LinId({2,9,9}));
226  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(7)[0],gr.LinId({9,9,9}));
227  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(8)[0],gr.LinId({0,0,0}));
228  BOOST_REQUIRE_EQUAL(cellIndex_LocalIndex.template get<0>(9)[0],gr.LinId({2,2,2}));
229 
230  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,2})),2);
231  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,2})),1);
232  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,2})),1);
233  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,2,9})),1);
234  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,2})),1);
235  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,2,9})),1);
236  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({2,9,9})),1);
237  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({9,9,9})),1);
238  BOOST_REQUIRE_EQUAL(cl_n.template get<0>(gr.LinId({0,0,0})),1);
239 }
240 
241 template<unsigned int dim, typename T>
242 void create_n_part(int n_part,
244  CellList<dim,T, Mem_fast<>> & cellList)
245 {
246  vPos.resize(n_part);
247 
248  auto it = vPos.getIterator();
249 
250  while(it.isNext())
251  {
252  auto p = it.get();
253 
254  vPos.template get<0>(p)[0] = (double)rand()/RAND_MAX;
255  vPos.template get<0>(p)[1] = (double)rand()/RAND_MAX;
256  vPos.template get<0>(p)[2] = (double)rand()/RAND_MAX;
257 
258  Point<dim,T> xp;
259  xp.get(0) = vPos.template get<0>(p)[0];
260  xp.get(1) = vPos.template get<0>(p)[1];
261  xp.get(2) = vPos.template get<0>(p)[2];
262 
263  size_t c = cellList.getCell(xp);
264  cellList.addCell(c,p);
265 
266  ++it;
267  }
268 }
269 
270 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
271 void create_starts_and_parts_ids(CellList<dim,T, Mem_fast<>> & cellList,
272  grid_sm<dim,void> & gr,
273  size_t n_part,
274  size_t n_cell,
276  openfpm::vector<aggregate<ids_type[2]>,CudaMemory,memory_traits_inte> & cellIndex_LocalIndex,
278 {
279  // Construct starts and cellIndex_LocalIndex
280 
281  cellIndex_LocalIndex.resize(n_part);
282  starts.resize(n_cell);
283  cells.resize(n_part);
284 
286 
287  size_t start = 0;
288 
289  while (itg.isNext())
290  {
291  auto cell = itg.get();
292 
293  size_t clin = gr.LinId(cell);
294 
295  for (size_t j = 0 ; j < cellList.getNelements(clin) ; j++)
296  {
297  size_t p_id = cellList.get(clin,j);
298 
299  cellIndex_LocalIndex.template get<0>(p_id)[0] = clin;
300 
301  cellIndex_LocalIndex.template get<0>(p_id)[1] = j;
302 
303  cells.template get<0>(start+j) = p_id;
304  }
305  starts.template get<0>(clin) = start;
306  start += cellList.getNelements(clin);
307 
308  ++itg;
309  }
310 }
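This helper builds the CPU reference layout used by the reorder test: walking the cells in linear-id order it stores, for every particle, its (cell id, local index) pair in cellIndex_LocalIndex, the offset at which each cell's particles start in starts, and the particle ids laid out cell after cell in cells, i.e. exactly the order a cell-sorted GPU buffer is expected to reproduce.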
311 
312 template<typename sparse_vector_type>
313 __global__ void construct_cells(sparse_vector_type sv, grid_sm<3,void> gs)
314 {
315  sv.init();
316 
317  grid_key_dx<3> key1({5,5,5});
318  grid_key_dx<3> key2({5,5,6});
319  grid_key_dx<3> key3({5,6,5});
320  grid_key_dx<3> key4({5,6,6});
321  grid_key_dx<3> key5({6,5,5});
322  grid_key_dx<3> key6({6,5,6});
323  grid_key_dx<3> key7({6,6,5});
324  grid_key_dx<3> key8({6,6,6});
325 
326  grid_key_dx<3> key9({7,7,7});
327 
328  grid_key_dx<3> key10({9,9,9});
329 
330  sv.template insert<0>(gs.LinId(key1)) = gs.LinId(key1);
331  sv.template insert<0>(gs.LinId(key2)) = gs.LinId(key2);
332  sv.template insert<0>(gs.LinId(key3)) = gs.LinId(key3);
333  sv.template insert<0>(gs.LinId(key4)) = gs.LinId(key4);
334  sv.template insert<0>(gs.LinId(key5)) = gs.LinId(key5);
335  sv.template insert<0>(gs.LinId(key6)) = gs.LinId(key6);
336  sv.template insert<0>(gs.LinId(key7)) = gs.LinId(key7);
337  sv.template insert<0>(gs.LinId(key8)) = gs.LinId(key8);
338  sv.template insert<0>(gs.LinId(key9)) = gs.LinId(key9);
339  sv.template insert<0>(gs.LinId(key10)) = gs.LinId(key10);
340 
341  sv.flush_block_insert();
342 }
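construct_cells runs as a single GPU thread and exercises the block-insert protocol of the sparse vector: init() opens the per-block insert buffer, insert<0>(id) stages an element keyed (and here also valued) by the linear cell id, and flush_block_insert() closes the block; the staged elements only become visible after the host-side flush<sadd_<0>>(gpuContext,flush_type::FLUSH_ON_DEVICE) call in the test below.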
343 
344 void test_cell_count_n()
345 {
348  openfpm::vector_gpu<aggregate<int>> cells_nn_test;
349 
350  vs.template setBackground<0>(-1);
351 
352  vs.setGPUInsertBuffer(1,32);
353 
354  size_t sz[] = {17,17,17};
355  grid_sm<3,void> gs(sz);
356 
357  CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.toKernel(),gs);
358 
359  gpu::ofp_context_t gpuContext;
360 
361  vs.flush<sadd_<0>>(gpuContext,flush_type::FLUSH_ON_DEVICE);
362 
363  cells_nn.resize(11);
364  cells_nn.fill<0>(0);
365 
366  grid_key_dx<3> start({0,0,0});
367  grid_key_dx<3> stop({2,2,2});
368  grid_key_dx<3> middle({1,1,1});
369 
370  int mid = gs.LinId(middle);
371 
372  grid_key_dx_iterator_sub<3> it(gs,start,stop);
373 
374  while (it.isNext())
375  {
376  auto p = it.get();
377 
378  cells_nn_test.add();
379  cells_nn_test.get<0>(cells_nn_test.size()-1) = (int)gs.LinId(p) - mid;
380 
381  ++it;
382  }
383 
384  cells_nn_test.template hostToDevice<0>();
385 
386  auto itgg = vs.getGPUIterator();
387  CUDA_LAUNCH((countNonEmptyNeighborCells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());
388 
389  cells_nn.deviceToHost<0>();
390 
391  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(0),8);
392  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(1),8);
393  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(2),8);
394  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(3),8);
395  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(4),8);
396  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(5),8);
397  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(6),8);
398  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(7),9);
399  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(8),2);
400  BOOST_REQUIRE_EQUAL(cells_nn.template get<0>(9),1);
401 
402  // now we scan
403  openfpm::scan((unsigned int *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (unsigned int *)cells_nn.template getDeviceBuffer<0>() , gpuContext);
404 
406  cell_nn_list.resize(7*8 + 9 + 2 + 1);
407 
408  CUDA_LAUNCH((fillNeighborCellList),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);
409 
410  cell_nn_list.deviceToHost<0>();
411 
412  // 8 NN
413  for (size_t i = 0 ; i < 7 ; i++)
414  {
415  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+0),1535);
416  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+1),1536);
417  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+2),1552);
418  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+3),1553);
419  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+4),1824);
420  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+5),1825);
421  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+6),1841);
422  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*i+7),1842);
423  }
424 
425  // 9 NN
426  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+0),1535);
427  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+1),1536);
428  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+2),1552);
429  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+3),1553);
430  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+4),1824);
431  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+5),1825);
432  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+6),1841);
433  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+7),1842);
434  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+8),2149);
435 
436  // 2 NN
437  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9),1842);
438  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+1),2149);
439 
440  // 1 NN
441  BOOST_REQUIRE_EQUAL(cell_nn_list.template get<0>(8*7+9+2),2763);
442 }
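The expected counts follow from the geometry of the inserted cells: the ten occupied cells are the 2x2x2 block {5,6}^3 plus {7,7,7} and {9,9,9}, and after the flush they appear in increasing linear id (1535 = {5,5,5}, ..., 1842 = {6,6,6}, 2149 = {7,7,7}, 2763 = {9,9,9}). The first seven block cells see exactly the 8 block cells inside their 3x3x3 neighbourhood, {6,6,6} additionally sees {7,7,7} (9), {7,7,7} sees only {6,6,6} and itself (2), and {9,9,9} sees only itself (1); the scan of these counts then gives the offsets at which fillNeighborCellList writes the neighbour ids checked above.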
443 
444 BOOST_AUTO_TEST_CASE( test_count_nn_cells )
445 {
446  std::cout << "Test cell count nn" << std::endl;
447 
448  test_cell_count_n();
449 }
450 
451 BOOST_AUTO_TEST_CASE( test_subindex_funcs )
452 {
453  std::cout << "Test cell list GPU base func" << "\n";
454 
455  test_sub_index<3,float,int,unsigned char>();
456  test_sub_index2<3,float,int,unsigned char>();
457 
458  std::cout << "End cell list GPU" << "\n";
459 
460  // Test the cell list
461 }
462 
463 template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
464 void test_reorder_parts(size_t n_part)
465 {
466  // Create n_part random particles
473 
476 
477  // CellList to check the result
478 
479  Box<dim,T> domain;
480 
481  typename boost::array_openfpm<T,dim> spacing;
483 
484  size_t div_host[dim];
485 
486  size_t tot = 1;
487  for (size_t i = 0 ; i < dim ; i++)
488  {
489  div_host[i] = 10;
490  div_c[i] = tot;
491  tot *= div_host[i];
492  spacing[i] = 0.1;
493  domain.setLow(i,0.0);
494  domain.setHigh(i,1.0);
495  }
496 
497  CellList<dim,T, Mem_fast<>> cellList(domain,div_host,0);
499 
500  create_n_part(n_part,vPos,cellList);
501  vPrp.resize(n_part);
502  vPrpReorder.resize(n_part);
503  sortToNonSort.resize(n_part);
504  NonSortToSort.resize(n_part);
505 
506  auto p_it = vPrp.getIterator();
507  while (p_it.isNext())
508  {
509  auto p = p_it.get();
510 
511  vPrp.template get<0>(p) = 10000 + p;
512  vPrp.template get<1>(p) = 20000 + p;
513 
514  vPrp.template get<2>(p)[0] = 30000 + p;
515  vPrp.template get<2>(p)[1] = 40000 + p;
516  vPrp.template get<2>(p)[2] = 50000 + p;
517 
518  vPrp.template get<3>(p)[0][0] = 60000 + p;
519  vPrp.template get<3>(p)[0][1] = 70000 + p;
520  vPrp.template get<3>(p)[0][2] = 80000 + p;
521  vPrp.template get<3>(p)[1][0] = 90000 + p;
522  vPrp.template get<3>(p)[1][1] = 100000 + p;
523  vPrp.template get<3>(p)[1][2] = 110000 + p;
524  vPrp.template get<3>(p)[2][0] = 120000 + p;
525  vPrp.template get<3>(p)[2][1] = 130000 + p;
526  vPrp.template get<3>(p)[0][2] = 140000 + p;
527 
528  ++p_it;
529  }
530 
531  grid_sm<dim,void> gr(div_host);
532 
533  create_starts_and_parts_ids(cellList,gr,vPos.size(),tot,starts,cellIndex_LocalIndex,cells_out);
534 
535 
536  auto itgg = vPos.getGPUIterator();
537 
538  cells_out.template hostToDevice<0>();
539 
540  auto ite = vPos.getGPUIterator();
541 
542  vPrp.template hostToDevice<0,1,2,3>();
543 
544  CUDA_LAUNCH_DIM3((constructSortUnsortBidirectMap),
545  ite.wthr,ite.thr,
546  sortToNonSort.toKernel(),
547  NonSortToSort.toKernel(),
548  cells_out.toKernel()
549  );
550 
551  CUDA_LAUNCH_DIM3(
552  (reorderParticlesPrp<
553  decltype(vPrp.toKernel()),
554  decltype(NonSortToSort.toKernel()),
555  0>),
556  ite.wthr,ite.thr,
557  vPrp.toKernel(),
558  vPrpReorder.toKernel(),
559  NonSortToSort.toKernel(),
560  (size_t)0
561  );
562 
563  bool check = true;
564  vPrpReorder.template deviceToHost<0>();
565  sortToNonSort.template deviceToHost<0>();
566  NonSortToSort.template deviceToHost<0>();
567 
568  size_t st = 0;
569  for (size_t i = 0 ; i < tot ; i++)
570  {
571  size_t n = cellList.getNelements(i);
572 
573  for (size_t j = 0 ; j < n ; j++)
574  {
575  size_t p = cellList.get(i,j);
576 
577  check &= vPrpReorder.template get<0>(st) == vPrp.template get<0>(p);
578  check &= sortToNonSort.template get<0>(st) == p;
579  check &= NonSortToSort.template get<0>(p) == st;
580 
581  st++;
582  }
583  }
584 
585 
586  BOOST_REQUIRE_EQUAL(check,true);
587 }
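test_reorder_parts cross-checks the GPU reorder kernels against the CPU cell list: create_starts_and_parts_ids provides the cell-sorted particle ids (cells_out), constructSortUnsortBidirectMap derives the sorted-to-original and original-to-sorted index maps from them, reorderParticlesPrp gathers property 0 into the sorted layout, and the final loop walks the CPU cells in order to verify the reordered property values and both index maps.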
588 
589 BOOST_AUTO_TEST_CASE ( test_reorder_particles )
590 {
591  std::cout << "Test GPU reorder" << "\n";
592 
593  test_reorder_parts<3,float,unsigned int, unsigned char>(5000);
594 
595  std::cout << "End GPU reorder" << "\n";
596 }
597 
598 template<unsigned int dim, typename T, typename CellS> void Test_cell_gpu(Box<dim,T> & box)
599 {
600  //Space where is living the Cell list
601  //Box<dim,T> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
602 
603  // Subdivisions
604  size_t div[dim] = {16,16,16};
605 
606  // Origin
607  Point<dim,T> org({0.0,0.0,0.0});
608 
609  // id Cell list
610  CellS cellList2(box,div);
611  cellList2.setOpt(CL_NON_SYMMETRIC | CL_GPU_REORDER_PROPERTY | CL_GPU_RESTORE_PROPERTY);
612 
613  // vector of particles
614 
617 
618  // create 9 particles
619 
620  Point<dim,double> p1({0.2,0.2,0.2});
621  Point<dim,double> p2({0.9,0.2,0.2});
622  Point<dim,double> p3({0.2,0.9,0.2});
623  Point<dim,double> p4({0.2,0.2,0.9});
624  Point<dim,double> p5({0.9,0.9,0.2});
625  Point<dim,double> p6({0.9,0.2,0.9});
626  Point<dim,double> p7({0.2,0.9,0.9});
627  Point<dim,double> p8({0.9,0.9,0.9});
628  Point<dim,double> p9({0.0,0.0,0.0});
629 
630  vPos.add(p1);
631  vPos.add(p2);
632  vPos.add(p3);
633  vPos.add(p4);
634  vPos.add(p5);
635  vPos.add(p6);
636  vPos.add(p7);
637  vPos.add(p8);
638  vPos.add(p9);
639 
640  vPrp.resize(vPos.size());
641 
642  for (size_t i = 0 ; i < vPos.size() ; i++)
643  {
644  vPrp.template get<0>(i) = vPos.template get<0>(i)[0];
645 
646  vPrp.template get<1>(i)[0] = vPos.template get<0>(i)[0]+100.0;
647  vPrp.template get<1>(i)[1] = vPos.template get<0>(i)[1]+100.0;
648  vPrp.template get<1>(i)[2] = vPos.template get<0>(i)[2]+100.0;
649 
650  vPrp.template get<2>(i)[0][0] = vPos.template get<0>(i)[0]+1000.0;
651  vPrp.template get<2>(i)[0][1] = vPos.template get<0>(i)[1]+1000.0;
652  vPrp.template get<2>(i)[0][2] = vPos.template get<0>(i)[2]+1000.0;
653 
654  vPrp.template get<2>(i)[1][0] = vPos.template get<0>(i)[0]+2000.0;
655  vPrp.template get<2>(i)[1][1] = vPos.template get<0>(i)[1]+3000.0;
656  vPrp.template get<2>(i)[1][2] = vPos.template get<0>(i)[2]+4000.0;
657 
658  vPrp.template get<2>(i)[2][0] = vPos.template get<0>(i)[0]+5000.0;
659  vPrp.template get<2>(i)[2][1] = vPos.template get<0>(i)[1]+6000.0;
660  vPrp.template get<2>(i)[2][2] = vPos.template get<0>(i)[2]+7000.0;
661  }
662 
663  vPrp.resize(vPos.size());
664 
665  vPos.template hostToDevice<0>();
666  vPrp.template hostToDevice<0,1,2>();
667 
670 
671  // create a gpu context
672  gpu::ofp_context_t gpuContext(gpu::gpu_context_opt::no_print_props);
673 
674  cellList2.template construct<decltype(vPos), decltype(vPrp), 0, 1, 2>(
675  vPos,
676  vPrp,
677  vPosReorder,
678  vPrpReorder,
679  gpuContext,
680  vPos.size(),
681  0,
682  vPos.size()
683  );
684 
685  vPrpReorder.template deviceToHost<0,1,2>();
686 
688 
690 
691  pl_correct.add(p9);
692  pl_correct.add(p1);
693  pl_correct.add(p2);
694  pl_correct.add(p3);
695  pl_correct.add(p5);
696  pl_correct.add(p4);
697  pl_correct.add(p6);
698  pl_correct.add(p7);
699  pl_correct.add(p8);
700 
701  for (size_t i = 0 ; i < pl_correct.size() ; i++)
702  {
703  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<0>(i),(float)pl_correct.template get<0>(i)[0]);
704  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<1>(i)[0],(float)(pl_correct.template get<0>(i)[0]+100.0));
705  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<1>(i)[1],(float)(pl_correct.template get<0>(i)[1]+100.0));
706  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<1>(i)[2],(float)(pl_correct.template get<0>(i)[2]+100.0));
707  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[0][0],(float)(pl_correct.template get<0>(i)[0] + 1000.0));
708  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[0][1],(float)(pl_correct.template get<0>(i)[1] + 1000.0));
709  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[0][2],(float)(pl_correct.template get<0>(i)[2] + 1000.0));
710  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[1][0],(float)(pl_correct.template get<0>(i)[0] + 2000.0));
711  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[1][1],(float)(pl_correct.template get<0>(i)[1] + 3000.0));
712  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[1][2],(float)(pl_correct.template get<0>(i)[2] + 4000.0));
713  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[2][0],(float)(pl_correct.template get<0>(i)[0] + 5000.0));
714  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[2][1],(float)(pl_correct.template get<0>(i)[1] + 6000.0));
715  BOOST_REQUIRE_EQUAL(vPrpReorder.template get<2>(i)[2][2],(float)(pl_correct.template get<0>(i)[2] + 7000.0));
716  }
717 
718  // Check the sort to non sort buffer
719 
720  auto & vsrt = cellList2.getSortToNonSort();
721  vsrt.template deviceToHost<0>();
722 
723  BOOST_REQUIRE_EQUAL(vsrt.size(),9);
724 
725  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(0),8);
726  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(1),0);
727  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(2),1);
728  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(3),2);
729  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(4),4);
730  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(5),3);
731  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(6),5);
732  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(7),6);
733  BOOST_REQUIRE_EQUAL(vsrt.template get<0>(8),7);
734 
735  auto & vnsrt = cellList2.getNonSortToSort();
736 
737  BOOST_REQUIRE_EQUAL(vnsrt.size(),9);
738 
739  // Move to CPU
740 
741  vnsrt.template deviceToHost<0>();
742 
743  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(8),0);
744  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(0),1);
745  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(1),2);
746  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(2),3);
747  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(4),4);
748  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(3),5);
749  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(5),6);
750  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(6),7);
751  BOOST_REQUIRE_EQUAL(vnsrt.template get<0>(7),8);
752 }
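The expected ordering is simply increasing cell id: on the 16^3 grid p9 at the origin occupies cell {0,0,0} and therefore comes first after construction, followed by p1, p2, p3, p5, p4, p6, p7, p8 in the order of their linear cell ids, which is what pl_correct encodes. getSortToNonSort() maps a position in the reordered buffer back to the original particle index (position 0 holds particle 8, i.e. p9) and getNonSortToSort() is its inverse, which is what the two blocks of assertions check.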
753 
754 
755 BOOST_AUTO_TEST_CASE( CellList_gpu_use)
756 {
757  std::cout << "Test cell list GPU" << "\n";
758 
759  Box<3,double> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
760  Box<3,double> box2({-1.0f,-1.0f,-1.0f},{1.0f,1.0f,1.0f});
761 
762  Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory>>(box);
763 
764  std::cout << "End cell list GPU" << "\n";
765 
766  // Test the cell list
767 }
768 
769 BOOST_AUTO_TEST_CASE( CellList_gpu_use_sparse )
770 {
771  std::cout << "Test cell list GPU sparse" << "\n";
772 
773  Box<3,double> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
774  Box<3,double> box2({-1.0f,-1.0f,-1.0f},{1.0f,1.0f,1.0f});
775 
776  Test_cell_gpu<3,double,CellList_gpu<3,double,CudaMemory,no_transform_only<3,double>,true>> (box);
777 
778  std::cout << "End cell list GPU sparse" << "\n";
779 
780  // Test the cell list
781 }
782 
783 template<unsigned int dim, typename vector_ps, typename vector_pr>
784 void fill_random_parts(Box<dim,float> & box, vector_ps & vd_pos, vector_pr & vd_prp, size_t n)
785 {
786  for (size_t i = 0 ; i < n ; i++)
787  {
788  Point<dim,float> p;
789 
790  p.get(0) = ((box.getHigh(0) - box.getLow(0) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(0);
791  p.get(1) = ((box.getHigh(1) - box.getLow(1) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(1);
792  p.get(2) = ((box.getHigh(2) - box.getLow(2) - 0.0001)*(float)rand()/RAND_MAX) + box.getLow(2);
793 
794  vd_pos.add(p);
795  vd_prp.add();
796  vd_prp.last().template get<0>() = i % 3;
797  }
798 }
799 
800 
801 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
802 __global__ void calc_force_number(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out)
803 {
804  int p = threadIdx.x + blockIdx.x * blockDim.x;
805 
806  if (p >= pos.size()) return;
807 
808  Point<3,float> xp = pos.template get<0>(p);
809 
810  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
811 
812  while (it.isNext())
813  {
814  auto q = it.get_sort();
815  auto q_ns = it.get();
816 
817  int s1 = sortToNonSort.template get<0>(q);
818 
819  atomicAdd(&n_out.template get<0>(s1), 1);
820 
821  ++it;
822  }
823 }
824 
825 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
826 __global__ void calc_force_number_noato(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out)
827 {
828  int p = threadIdx.x + blockIdx.x * blockDim.x;
829 
830  if (p >= pos.size()) return;
831 
832  Point<3,float> xp = pos.template get<0>(p);
833 
834  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
835 
836  while (it.isNext())
837  {
838  auto q = it.get_sort();
839  auto q_ns = it.get();
840 
841  int s1 = sortToNonSort.template get<0>(q);
842 
843  ++n_out.template get<0>(p);
844 
845  ++it;
846  }
847 }
848 
849 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
850 __global__ void calc_force_number_box(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out, unsigned int start)
851 {
852  int p = threadIdx.x + blockIdx.x * blockDim.x + start;
853 
854  if (p >= pos.size()) return;
855 
856  Point<3,float> xp = pos.template get<0>(p);
857 
858  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
859 
860  while (it.isNext())
861  {
862  auto q = it.get_sort();
863 
864  int s1 = sortToNonSort.template get<0>(q);
865 
866  atomicAdd(&n_out.template get<0>(s1), 1);
867 
868  ++it;
869  }
870 }
871 
872 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
873 __global__ void calc_force_number_box_noato(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out, unsigned int start)
874 {
875  int p = threadIdx.x + blockIdx.x * blockDim.x + start;
876 
877  if (p >= pos.size()) return;
878 
879  Point<3,float> xp = pos.template get<0>(p);
880 
881  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
882 
883  while (it.isNext())
884  {
885  auto q = it.get_sort();
886 
887  ++n_out.template get<0>(p);
888 
889  ++it;
890  }
891 }
892 
893 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
894 __global__ void calc_force_number_rad(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type n_out)
895 {
896  int p = threadIdx.x + blockIdx.x * blockDim.x;
897 
898  if (p >= pos.size()) return;
899 
900  Point<3,float> xp = pos.template get<0>(p);
901 
902  auto it = cellList.getNNIteratorRadius(cellList.getCell(xp));
903 
904  while (it.isNext())
905  {
906  auto q = it.get_sort();
907 
908  int s1 = sortToNonSort.template get<0>(q);
909 
910  atomicAdd(&n_out.template get<0>(s1), 1);
911 
912  ++it;
913  }
914 }
915 
916 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
917 __global__ void calc_force_list_box(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type v_nscan ,vector_n_type v_list)
918 {
919  int p = threadIdx.x + blockIdx.x * blockDim.x;
920 
921  if (p >= pos.size()) return;
922 
923  Point<3,float> xp = pos.template get<0>(p);
924  int start_list = v_nscan.template get<0>(p);
925 
926  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
927 
928  while (it.isNext())
929  {
930  auto q = it.get_sort();
931 
932  int s1 = sortToNonSort.template get<0>(q);
933 
934  v_list.template get<0>(start_list) = s1;
935 
936  ++start_list;
937  ++it;
938  }
939 }
940 
941 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
942 __global__ void calc_force_list(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type v_nscan ,vector_n_type v_list)
943 {
944  int p = threadIdx.x + blockIdx.x * blockDim.x;
945 
946  if (p >= pos.size()) return;
947 
948  Point<3,float> xp = pos.template get<0>(p);
949  int start_list = v_nscan.template get<0>(p);
950 
951  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
952 
953  while (it.isNext())
954  {
955  auto q = it.get_sort();
956 
957  int s1 = sortToNonSort.template get<0>(q);
958 
959  v_list.template get<0>(start_list) = s1;
960 
961  ++start_list;
962  ++it;
963  }
964 }
965 
966 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
967 __global__ void calc_force_list_box_partial(vector_pos pos,
968  vector_ns sortToNonSort,
969  CellList_type cellList,
970  vector_n_type v_nscan,
971  vector_n_type v_nscan_part,
972  vector_n_type v_list)
973 {
974  int p = threadIdx.x + blockIdx.x * blockDim.x;
975 
976  if (p >= pos.size()) return;
977 
978  Point<3,float> xp = pos.template get<0>(p);
979  int start_list = v_nscan.template get<0>(p) + v_nscan_part.template get<0>(p);
980 
981  auto it = cellList.getNNIteratorBox(cellList.getCell(xp));
982 
983  while (it.isNext())
984  {
985  auto q = it.get_sort();
986 
987  int s1 = sortToNonSort.template get<0>(q);
988 
989  v_list.template get<0>(start_list) = s1;
990 
991  ++start_list;
992  ++it;
993  }
994 }
995 
996 template<typename vector_pos, typename vector_ns, typename CellList_type,typename vector_n_type>
997 __global__ void calc_force_list_rad(vector_pos pos, vector_ns sortToNonSort, CellList_type cellList, vector_n_type v_nscan ,vector_n_type v_list)
998 {
999  int p = threadIdx.x + blockIdx.x * blockDim.x;
1000 
1001  if (p >= pos.size()) return;
1002 
1003  Point<3,float> xp = pos.template get<0>(p);
1004  int start_list = v_nscan.template get<0>(p);
1005 
1006  auto it = cellList.getNNIteratorRadius(cellList.getCell(xp));
1007 
1008  while (it.isNext())
1009  {
1010  auto q = it.get_sort();
1011 
1012  int s1 = sortToNonSort.template get<0>(q);
1013 
1014  v_list.template get<0>(start_list) = s1;
1015 
1016  ++start_list;
1017  ++it;
1018  }
1019 }
1020 
1021 template<unsigned int impl>
1022 struct execute_cl_test
1023 {
1024  template<typename CellS, typename Cells_cpu_type, typename T>
1025  static void set_radius(CellS & cellList2, Cells_cpu_type & cl_cpu, T & radius)
1026  {
1027  }
1028 
1029  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_type>
1030  static void calc_num(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out, unsigned int start)
1031  {
1032  auto ite = vPos.getGPUIterator();
1033 
1034  CUDA_LAUNCH((calc_force_number),ite,vPos.toKernel(),
1035  sortToNonSort.toKernel(),
1036  cellList2.toKernel(),
1037  n_out.toKernel()
1038  );
1039  }
1040 
1041  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1042  static void calc_list(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2,n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1043  {
1044  auto ite = vPos.getGPUIterator();
1045 
1046  CUDA_LAUNCH((calc_force_list),ite,vPos.toKernel(),
1047  sortToNonSort.toKernel(),
1048  cellList2.toKernel(),
1049  n_out_scan.toKernel(),
1050  nn_list.toKernel()
1051  );
1052  }
1053 
1054  template<typename NN_type>
1055  static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorBox(cell))
1056  {
1057  return nn.getNNIteratorBox(cell);
1058  }
1059 };
1060 
1061 template<>
1062 struct execute_cl_test<1>
1063 {
1064  template<typename CellS, typename Cells_cpu_type, typename T>
1065  static void set_radius(CellS & cellList2, Cells_cpu_type & cl_cpu, T & radius)
1066  {
1067  cellList2.setRadius(radius);
1068  cl_cpu.setRadius(radius);
1069  }
1070 
1071  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_type>
1072  static void calc_num(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out, unsigned int start)
1073  {
1074  auto ite = vPos.getGPUIterator();
1075 
1076  CUDA_LAUNCH((calc_force_number_rad<decltype(vPos.toKernel()),
1077  decltype(sortToNonSort.toKernel()),
1078  decltype(cellList2.toKernel()),
1079  decltype(n_out.toKernel())>),
1080  ite,vPos.toKernel(),
1081  sortToNonSort.toKernel(),
1082  cellList2.toKernel(),
1083  n_out.toKernel());
1084  }
1085 
1086  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1087  static void calc_list(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1088  {
1089  auto ite = vPos.getGPUIterator();
1090 
1091  CUDA_LAUNCH((calc_force_list_rad<decltype(vPos.toKernel()),
1092  decltype(sortToNonSort.toKernel()),
1093  decltype(cellList2.toKernel()),
1094  decltype(nn_list.toKernel())>),
1095  ite,vPos.toKernel(),
1096  sortToNonSort.toKernel(),
1097  cellList2.toKernel(),
1098  n_out_scan.toKernel(),
1099  nn_list.toKernel());
1100  }
1101 
1102  template<typename NN_type>
1103  static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
1104  {
1105  return nn.getNNIteratorRadius(cell);
1106  }
1107 };
1108 
1109 template<>
1110 struct execute_cl_test<2>
1111 {
1112  template<typename CellS, typename Cells_cpu_type, typename T>
1113  static void set_radius(CellS & cellList2, Cells_cpu_type & cl_cpu, T & radius)
1114  {
1115  cellList2.setRadius(radius);
1116  cl_cpu.setRadius(radius);
1117  }
1118 
1119  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_type>
1120  static void calc_num_noato(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out, unsigned int start)
1121  {
1122  auto ite = sortToNonSort.getGPUIterator();
1123 
1124  CUDA_LAUNCH((calc_force_number_box_noato<decltype(vPos.toKernel()),
1125  decltype(sortToNonSort.toKernel()),
1126  decltype(cellList2.toKernel()),
1127  decltype(n_out.toKernel())>),
1128  ite,vPos.toKernel(),
1129  sortToNonSort.toKernel(),
1130  cellList2.toKernel(),
1131  n_out.toKernel(),
1132  start);
1133  }
1134 
1135  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_type>
1136  static void calc_num(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_type & n_out, unsigned int start)
1137  {
1138  auto ite = sortToNonSort.getGPUIterator();
1139 
1140  CUDA_LAUNCH((calc_force_number_box<decltype(vPos.toKernel()),
1141  decltype(sortToNonSort.toKernel()),
1142  decltype(cellList2.toKernel()),
1143  decltype(n_out.toKernel())>),
1144  ite,
1145  vPos.toKernel(),
1146  sortToNonSort.toKernel(),
1147  cellList2.toKernel(),
1148  n_out.toKernel(),
1149  start);
1150  }
1151 
1152  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1153  static void calc_list(pl_type & vPos, sortToNonSort_type & sortToNonSort, cl2_type & cellList2, n_out_scan_type & n_out_scan, nn_list_type & nn_list)
1154  {
1155  auto ite = sortToNonSort.getGPUIterator();
1156 
1157  CUDA_LAUNCH((calc_force_list_box<decltype(vPos.toKernel()),
1158  decltype(sortToNonSort.toKernel()),
1159  decltype(cellList2.toKernel()),
1160  decltype(nn_list.toKernel())>),
1161  ite,vPos.toKernel(),
1162  sortToNonSort.toKernel(),
1163  cellList2.toKernel(),
1164  n_out_scan.toKernel(),
1165  nn_list.toKernel());
1166  }
1167 
1168  template<typename pl_type, typename sortToNonSort_type, typename cl2_type, typename n_out_scan_type, typename nn_list_type>
1169  static void calc_list_partial(pl_type & vPos,
1170  sortToNonSort_type & sortToNonSort,
1171  cl2_type & cellList2,
1172  n_out_scan_type & n_out_scan,
1173  n_out_scan_type & n_out_scan_partial,
1174  nn_list_type & nn_list)
1175  {
1176  auto ite = sortToNonSort.getGPUIterator();
1177 
1178  CUDA_LAUNCH((calc_force_list_box_partial),ite,vPos.toKernel(),
1179  sortToNonSort.toKernel(),
1180  cellList2.toKernel(),
1181  n_out_scan.toKernel(),
1182  n_out_scan_partial.toKernel(),
1183  nn_list.toKernel());
1184  }
1185 
1186  template<typename NN_type>
1187  static auto getNN(NN_type & nn, size_t cell) -> decltype(nn.getNNIteratorRadius(cell))
1188  {
1189  return nn.getNNIteratorRadius(cell);
1190  }
1191 };
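execute_cl_test selects how the GPU neighbour data is produced and how the CPU reference is iterated. The primary template (impl 0, used only by the disabled tests further below) launches calc_force_number/calc_force_list over the particle positions and compares against getNNIteratorBox on the CPU list. The impl 1 specialization switches both sides to a fixed radius (setRadius, calc_force_number_rad/calc_force_list_rad, getNNIteratorRadius). The impl 2 specialization launches the *_box kernels over the sorted-id buffer, adds a non-atomic counting variant and a partial-offset list variant used by the split test, and compares against the CPU radius iterator; the tests set the radius to two cell spacings, consistent with the setBoxNN(2) box neighbourhood used on the GPU side.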
1192 
1193 template<unsigned int dim, typename T, typename CellS, int impl>
1194 void Test_cell_gpu_force(Box<dim,T> & box, size_t npart, const size_t (& div)[dim],int box_nn = 2)
1195 {
1196  // Origin
1197  Point<dim,T> org({0.0,0.0,0.0});
1198 
1199  // id Cell list
1200  CellS cellList2(box,div,2);
1201 
1202  CellList<dim,T,Mem_fast<>,shift<dim,T>> cl_cpu(box,div,2);
1203 
1204  cellList2.setBoxNN(box_nn);
1205 
1206  T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
1207  execute_cl_test<impl>::set_radius(cellList2,cl_cpu,radius);
1208 
1209  // vector of particles
1210 
1213 
1215 
1216  // create random particles
1217 
1218  fill_random_parts<3>(box,vPos,vPrp,npart);
1219 
1220  n_out.resize(vPos.size()+1);
1221  n_out.fill<0>(0);
1222 
1223  vPrp.resize(vPos.size());
1224 
1225  vPos.template hostToDevice<0>();
1226  vPrp.template hostToDevice<0,1>();
1227 
1228  // Construct an equivalent CPU cell-list
1229 
1230  // construct
1231 
1232  auto it2 = vPos.getIterator();
1233 
1234  while (it2.isNext())
1235  {
1236  auto p = it2.get();
1237 
1238  Point<3,T> xp = vPos.get(p);
1239 
1240  cl_cpu.add(xp,p);
1241 
1242  ++it2;
1243  }
1244 
1245  size_t ghostMarker = vPos.size() / 2;
1246 
1247  gpu::ofp_context_t gpuContext(gpu::gpu_context_opt::no_print_props);
1248  cellList2.construct(vPos, vPrp, gpuContext, ghostMarker, 0, vPos.size());
1249 
1250  auto & sortToNonSort = cellList2.getSortToNonSort();
1251 
1252  vPos.template hostToDevice<0>();
1253 
1254  execute_cl_test<impl>::calc_num(vPos,sortToNonSort,cellList2,n_out,0);
1255 
1256  // Domain particles
1257 
1258  auto & gdsi = cellList2.getDomainSortIds();
1259  gdsi.template deviceToHost<0>();
1260  sortToNonSort.template deviceToHost<0>();
1261 
1262  bool match = true;
1263  for (size_t i = 0 ; i < ghostMarker ; i++)
1264  {
1265  unsigned int p = gdsi.template get<0>(i);
1266 
1267  match &= (sortToNonSort.template get<0>(p) < ghostMarker);
1268  }
1269 
1270  BOOST_REQUIRE_EQUAL(match,true);
1271 
1272  // Check
1273 
1274  n_out.deviceToHost<0>();
1275 
1276  {
1277  bool check = true;
1278  auto it = vPos.getIterator();
1279 
1280  while(it.isNext())
1281  {
1282  auto p = it.get();
1283 
1284  Point<dim,T> xp = vPos.get(p);
1285 
1286  // Get NN iterator
1287 
1288  auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIteratorBox(cl_cpu.getCell(xp))*/;
1289 
1290  size_t n_ele = 0;
1291  while (NN_it.isNext())
1292  {
1293  auto q = NN_it.get();
1294 
1295  n_ele++;
1296 
1297  ++NN_it;
1298  }
1299 
1300  check &= n_ele == n_out.template get<0>(p);
1301 
1302  if (check == false)
1303  {
1304  std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;
1305  break;
1306  }
1307 
1308  ++it;
1309  }
1310  BOOST_REQUIRE_EQUAL(check,true);
1311  }
1312 
1313  // now we scan the buffer
1314 
1317 
1318  n_out_scan.resize(vPos.size()+1);
1319 
1320  openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),gpuContext);
1321  n_out_scan.template deviceToHost<0>();
1322 
1323  if (n_out_scan.template get<0>(vPos.size()) == 0)
1324  {return;}
1325 
1326  nn_list.resize(n_out_scan.template get<0>(vPos.size()));
1327 
1328  // Now for each particle we construct the list
1329 
1330  vPos.template hostToDevice<0>();
1331 
1332  execute_cl_test<impl>::calc_list(vPos,sortToNonSort,cellList2,n_out_scan,nn_list);
1333 
1334  nn_list.template deviceToHost<0>();
1335 
1336  // Check
1337 
1338  n_out.deviceToHost<0>();
1339 
1340  {
1341  bool check = true;
1342  auto it = vPos.getIterator();
1343 
1344  while(it.isNext())
1345  {
1346  auto p = it.get();
1347 
1348  Point<dim,T> xp = vPos.get(p);
1349 
1350  // Get NN iterator
1351 
1352  openfpm::vector<int> cpu_list;
1353 
1354  auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIteratorBox(cl_cpu.getCell(xp));*/
1355 
1356  while (NN_it.isNext())
1357  {
1358  auto q = NN_it.get();
1359 
1360  cpu_list.add(q);
1361 
1362  ++NN_it;
1363  }
1364 
1365  openfpm::vector<int> gpu_list;
1366 
1367  for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
1368  {
1369  gpu_list.add(nn_list.template get<0>(i));
1370  }
1371 
1372  // sort both vectors
1373 
1374  cpu_list.sort();
1375  gpu_list.sort();
1376 
1377  for (size_t j = 0 ; j < cpu_list.size() ; j++)
1378  {check &= cpu_list.get(j) == gpu_list.get(j);}
1379 
1380  ++it;
1381  }
1382 
1383  BOOST_REQUIRE_EQUAL(check,true);
1384 
1385  }
1386 }
1387 
1388 template<unsigned int dim, typename T, typename CellS, int impl>
1389 void Test_cell_gpu_force_split(Box<dim,T> & box, size_t npart, const size_t (& div)[dim],int box_nn = 2)
1390 {
1391  // Origin
1392  Point<dim,T> org({0.0,0.0,0.0});
1393 
1394  // id Cell list
1395  CellS cl2_split1(box,div,2);
1396  CellS cl2_split2(box,div,2);
1397 
1398  CellList<dim,T,Mem_fast<>,shift<dim,T>> cl_cpu(box,div,2);
1399 
1400  cl2_split1.setBoxNN(box_nn);
1401  cl2_split2.setBoxNN(box_nn);
1402 
1403  T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
1404  execute_cl_test<impl>::set_radius(cl2_split1,cl_cpu,radius);
1405  execute_cl_test<impl>::set_radius(cl2_split2,cl_cpu,radius);
1406 
1407  // vector of particles
1408 
1411 
1414 
1415  // create random particles
1416 
1417  fill_random_parts<3>(box,vPos,vPrp,npart);
1418 
1419  n_out.resize(vPos.size()+1);
1420  n_out.fill<0>(0);
1421 
1422  vPrp.resize(vPos.size());
1423 
1424  vPos.template hostToDevice<0>();
1425  vPrp.template hostToDevice<0,1>();
1426 
1427  // Construct an equivalent CPU cell-list
1428 
1429  // construct
1430 
1431  auto it2 = vPos.getIterator();
1432 
1433  while (it2.isNext())
1434  {
1435  auto p = it2.get();
1436 
1437  Point<3,T> xp = vPos.get(p);
1438 
1439  cl_cpu.add(xp,p);
1440 
1441  ++it2;
1442  }
1443 
1444  size_t ghostMarker = vPos.size() / 2;
1445 
1446  gpu::ofp_context_t gpuContext(gpu::gpu_context_opt::no_print_props);
1447  cl2_split1.construct(vPos,vPrp,gpuContext,ghostMarker,0,vPos.size()/2);
1448  cl2_split2.construct(vPos,vPrp,gpuContext,ghostMarker,vPos.size()/2,vPos.size());
1449  auto & sortToNonSort_s1 = cl2_split1.getSortToNonSort();
1450  auto & sortToNonSort_s2 = cl2_split2.getSortToNonSort();
1451 
1452  execute_cl_test<impl>::calc_num_noato(vPos,sortToNonSort_s1,cl2_split1,n_out,0);
1453  n_out_partial = n_out;
1454  execute_cl_test<impl>::calc_num_noato(vPos,sortToNonSort_s2,cl2_split2,n_out,0);
1455 
1456  // Domain particles
1457 
1458  auto & gdsi_s1 = cl2_split1.getDomainSortIds();
1459  gdsi_s1.template deviceToHost<0>();
1460  sortToNonSort_s1.template deviceToHost<0>();
1461 
1462  bool match = true;
1463  for (size_t i = 0 ; i < ghostMarker ; i++)
1464  {
1465  unsigned int p = gdsi_s1.template get<0>(i);
1466 
1467  match &= (sortToNonSort_s1.template get<0>(p) < ghostMarker);
1468  }
1469 
1470  BOOST_REQUIRE_EQUAL(match,true);
1471 
1472  // Check
1473 
1474  n_out.deviceToHost<0>();
1475 
1476  {
1477  bool check = true;
1478  auto it = vPos.getIteratorTo(vPos.size()/2-1);
1479 
1480  while(it.isNext())
1481  {
1482  auto p = it.get();
1483 
1484  Point<dim,T> xp = vPos.get(p);
1485 
1486  // Get NN iterator
1487 
1488  auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIteratorBox(cl_cpu.getCell(xp))*/;
1489 
1490  size_t n_ele = 0;
1491  while (NN_it.isNext())
1492  {
1493  auto q = NN_it.get();
1494 
1495  n_ele++;
1496 
1497  ++NN_it;
1498  }
1499 
1500  check &= n_ele == n_out.template get<0>(p);
1501 
1502  if (check == false)
1503  {
1504  std::cout << p << " " << n_ele << " " << n_out.template get<0>(p) << " " << check << std::endl;
1505  break;
1506  }
1507 
1508  ++it;
1509  }
1510  BOOST_REQUIRE_EQUAL(check,true);
1511  }
1512 
1513  // now we scan the buffer
1514 
1517 
1518  n_out_scan.resize(n_out.size());
1519 
1520  openfpm::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),gpuContext);
1521 
1522  n_out_scan.template deviceToHost<0>();
1523 
1524  if (n_out_scan.template get<0>(vPos.size()) == 0)
1525  {return;}
1526 
1527  nn_list.resize(n_out_scan.template get<0>(vPos.size()));
1528 
1529  // Now for each particle we construct the list
1530 
1531  vPos.template hostToDevice<0>();
1532 
1533  execute_cl_test<impl>::calc_list(vPos,sortToNonSort_s1,cl2_split1,n_out_scan,nn_list);
1534  execute_cl_test<impl>::calc_list_partial(vPos,sortToNonSort_s2,cl2_split2,n_out_scan,n_out_partial,nn_list);
1535 
1536  nn_list.template deviceToHost<0>();
1537 
1538  // Check
1539 
1540  n_out.deviceToHost<0>();
1541 
1542  {
1543  bool check = true;
1544  auto it = vPos.getIteratorTo(vPos.size()/2-1);
1545 
1546  while(it.isNext())
1547  {
1548  auto p = it.get();
1549 
1550  Point<dim,T> xp = vPos.get(p);
1551 
1552  // Get NN iterator
1553 
1554  openfpm::vector<int> cpu_list;
1555 
1556  auto NN_it = execute_cl_test<impl>::getNN(cl_cpu,cl_cpu.getCell(xp)); /*cl_cpu.getNNIteratorBox(cl_cpu.getCell(xp));*/
1557 
1558  while (NN_it.isNext())
1559  {
1560  auto q = NN_it.get();
1561 
1562  cpu_list.add(q);
1563 
1564  ++NN_it;
1565  }
1566 
1567  openfpm::vector<int> gpu_list;
1568 
1569  for (size_t i = n_out_scan.template get<0>(p) ; i < n_out_scan.template get<0>(p+1) ; i++)
1570  {
1571  gpu_list.add(nn_list.template get<0>(i));
1572  }
1573 
1574  // sort both vectors
1575 
1576  cpu_list.sort();
1577  gpu_list.sort();
1578 
1579  for (size_t j = 0 ; j < cpu_list.size() ; j++)
1580  {check &= cpu_list.get(j) == gpu_list.get(j);}
1581 
1582  if (check == false)
1583  {
1584  std::cout << "NPARTS: " << npart << std::endl;
1585 
1586  for (size_t j = 0 ; j < cpu_list.size() ; j++)
1587  {std::cout << cpu_list.get(j) << " " << gpu_list.get(j) << std::endl;}
1588 
1589  break;
1590  }
1591 
1592  ++it;
1593  }
1594 
1595  BOOST_REQUIRE_EQUAL(check,true);
1596 
1597  }
1598 }
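The split variant constructs one cell list per half of the particle set and checks that the combined result matches the single CPU list: the non-atomic counting kernel is run once per half into the same n_out buffer (so the counts accumulate), with n_out_partial keeping a copy of the first half's contribution; a single scan of n_out then gives each particle's segment in nn_list, calc_list fills the first part of every segment, and calc_list_partial writes the second half's entries starting at the offset given by n_out_partial.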
1599 
1600 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box)
1601 {
1602  std::cout << "Test cell list GPU" << "\n";
1603 
1604  Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1605  Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1606 
1607  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{8,8,8});
1608  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{8,8,8});
1609 
1610  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{8,8,8});
1611  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{8,8,8});
1612 
1613  std::cout << "End cell list GPU" << "\n";
1614 
1615  // Test the cell list
1616 }
1617 
1618 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split)
1619 {
1620  std::cout << "Test cell list GPU split" << "\n";
1621 
1622  Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1623  Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1624 
1625  Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,1000,{32,32,32});
1626  Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box,10000,{32,32,32});
1627 
1628  Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,1000,{32,32,32});
1629  Test_cell_gpu_force_split<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,2>(box2,10000,{32,32,32});
1630 
1631  std::cout << "End cell list GPU split" << "\n";
1632 
1633  // Test the cell list
1634 }
1635 
1636 /*BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split_test_performance)
1637 {
1638  typedef float T;
1639  constexpr int dim = 3;
1640  typedef CellList_gpu<3,float,CudaMemory,shift_only<3,float>> CellS;
1641 
1642  std::cout << "Performance" << "\n";
1643 
1644  Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1645  size_t div[] = {64,64,64};
1646 
1647  // Origin
1648  Point<dim,T> org({0.0,0.0,0.0});
1649 
1650  // id Cell list
1651  CellS cl2_split1(box,div,2);
1652  CellS cl2_split2(box,div,2);
1653 
1654  cl2_split1.setBoxNN(2);
1655  cl2_split2.setBoxNN(2);
1656 
1657  T radius = (box.getHigh(0) - box.getLow(0))/div[0] * 2.0;
1658 
1659  // vector of particles
1660 
1661  openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> vPos;
1662  openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> vPosReorder;
1663 
1664  openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> vPrp;
1665  openfpm::vector<aggregate<T,T[3]>,CudaMemory,typename memory_traits_inte<aggregate<T,T[3]>>::type,memory_traits_inte> vPrpReorder;
1666 
1667  openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out;
1668  openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out_partial;
1669 
1670  // create random particles
1671 
1672  fill_random_parts<3>(box,vPos,vPrp,1400000);
1673 
1674  vPrpReorder.resize(vPos.size());
1675  vPosReorder.resize(vPos.size());
1676  n_out.resize(vPos.size()+1);
1677  n_out.fill<0>(0);
1678 
1679  vPrp.resize(vPos.size());
1680  vPrpReorder.resize(vPos.size());
1681 
1682  vPos.hostToDevice<0>();
1683  vPrp.hostToDevice<0,1>();
1684 
1685  size_t ghostMarker = vPos.size() / 2;
1686 
1687  gpu::ofp_context_t gpuContext(gpu::gpu_context_opt::no_print_props);
1688 
1689  cl2_split1.construct(vPos,vPrp,gpuContext,ghostMarker,0,vPos.size()/2);
1690  cl2_split2.construct(vPos,vPrp,gpuContext,ghostMarker,vPos.size()/2,vPos.size());
1691 
1692  cudaDeviceSynchronize();
1693 
1694  timer t;
1695  t.start();
1696 
1697  cl2_split1.construct(vPos,vPrp,gpuContext,ghostMarker,0,vPos.size()/2);
1698  cl2_split2.construct(vPos,vPrp,gpuContext,ghostMarker,vPos.size()/2,vPos.size());
1699 
1700  t.stop();
1701  std::cout << "Time: " << t.getwct() << std::endl;
1702 
1703  cudaDeviceSynchronize();
1704 
1705  cl2_split1.construct(vPos,vPrp,gpuContext,ghostMarker,0,vPos.size());
1706 
1707  cudaDeviceSynchronize();
1708 
1709  timer t2;
1710  t2.start();
1711 
1712  cl2_split1.construct(vPos,vPrp,gpuContext,ghostMarker,0,vPos.size());
1713 
1714  t2.stop();
1715  std::cout << "Time: " << t2.getwct() << std::endl;
1716 
1717  std::cout << "Performance" << "\n";
1718 
1719  // Test the cell list
1720 }*/
1721 
1722 
1723 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_sparse)
1724 {
1725  std::cout << "Test cell list GPU" << "\n";
1726 
1727  Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1728  Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1729 
1730  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,2>(box,1000,{32,32,32},2);
1731  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,2>(box,10000,{32,32,32},2);
1732 
1733  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,2>(box2,1000,{32,32,32},2);
1734  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,2>(box2,10000,{32,32,32},2);
1735 
1736  std::cout << "End cell list GPU" << "\n";
1737 
1738  // Test the cell list
1739 }
1740 
1741 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_radius)
1742 {
1743  std::cout << "Test cell list GPU" << "\n";
1744 
1745  Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1746  Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1747 
1748  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,1000,{32,32,32});
1749  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box,10000,{32,32,32});
1750 
1751  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,1000,{32,32,32});
1752  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,1>(box2,10000,{32,32,32});
1753 
1754  std::cout << "End cell list GPU" << "\n";
1755 
1756  // Test the cell list
1757 }
1758 
1759 #if 0
1760 
1761 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force)
1762 {
1763  std::cout << "Test cell list GPU" << "\n";
1764 
1765  Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1766  Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1767 
1768  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,1000,{16,16,16});
1769  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box,10000,{16,16,16});
1770 
1771  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,1000,{16,16,16});
1772  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>>,0>(box2,10000,{16,16,16});
1773 
1774  std::cout << "End cell list GPU" << "\n";
1775 
1776  // Test the cell list
1777 }
1778 
1779 BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_sparse)
1780 {
1781  std::cout << "Test cell list GPU force sparse" << "\n";
1782 
1783  Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
1784  Box<3,float> box2({-0.3f,-0.3f,-0.3f},{1.0f,1.0f,1.0f});
1785 
1786  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,0>(box,1000,{16,16,16});
1787  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,0>(box,10000,{16,16,16});
1788 
1789  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,0>(box2,1000,{16,16,16});
1790  Test_cell_gpu_force<3,float,CellList_gpu<3,float,CudaMemory,shift_only<3,float>,true>,0>(box2,10000,{16,16,16});
1791 
1792  std::cout << "End cell list GPU force sparse" << "\n";
1793 
1794  // Test the cell list
1795 }
1796 
1797 #endif
1798 
1799 template<typename CellList_type, typename Vector_type, typename Vector_out>
1800 __global__ void cl_offload_gpu(CellList_type cellList, Vector_type parts, Vector_out output)
1801 {
1802  int p = threadIdx.x + blockIdx.x * blockDim.x;
1803 
1804  if (p >= parts.size()) return;
1805 
1806  Point<3,float> xp = parts.template get<0>(p);
1807 
1808  output.template get<0>(p) = cellList.getNelements(cellList.getCell(xp));
1809 }
1810 
1811 template<typename CellList_type, typename Vector_type, typename Vector_scan_type, typename Vector_list_type>
1812 __global__ void cl_offload_gpu_list(CellList_type cellList, Vector_type parts, Vector_scan_type scan, Vector_list_type list)
1813 {
1814  int p = threadIdx.x + blockIdx.x * blockDim.x;
1815 
1816  if (p >= parts.size()) return;
1817 
1818  Point<3,float> xp = parts.template get<0>(p);
1819 
1820  int id = cellList.getCell(xp);
1821  int n_ele = cellList.getNelements(id);
1822  int start = scan.template get<0>(p);
1823 
1824  for (int j = 0 ; j < n_ele ; j++)
1825  {
1826  list.template get<0>(start+j) = cellList.get(id,j);
1827  }
1828 
1829 }
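These two kernels back the (currently disabled) CPU cell-list offload test below: the first writes, for every particle, the occupancy of its cell into the output vector, and the second uses the scanned occupancies as start offsets to copy each cell's particle ids into a flat list.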
1830 
1831 #if 0
1832 
1833 BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
1834 {
1835  std::cout << "Test cell list offload gpu" << "\n";
1836 
1837  // Subdivisions
1838  size_t div[3] = {10,10,10};
1839 
1840  // grid info
1841  grid_sm<3,void> g_info(div);
1842 
1843  Box<3,float> box({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
1844 
1845  // CellS = CellListM<dim,T,8>
1847 
1850  v.resize(10000);
1851  os.resize(v.size());
1852 
1853  for (size_t i = 0 ; i < v.size() ; i++)
1854  {
1855  v.template get<0>(i)[0] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1856  v.template get<0>(i)[1] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1857  v.template get<0>(i)[2] = 2.0 * (float)rand() / RAND_MAX - 1.0;
1858 
1859  Point<3,float> xp = v.template get<0>(i);
1860 
1861  cl1.add(xp,i);
1862  }
1863 
1864  auto ite = v.getGPUIterator();
1865 
1866  cl1.hostToDevice();
1867  v.hostToDevice<0>();
1868 
1869  CUDA_LAUNCH_DIM3((cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())>),ite.wthr,ite.thr,cl1.toKernel(),v.toKernel(),os.toKernel());
1870 
1871  os.deviceToHost<0>();
1872 
1873  bool match = true;
1874  for (size_t i = 0 ; i < os.size() ; i++)
1875  {
1876  Point<3,float> xp = v.template get<0>(i);
1877 
1878  match &= os.template get<0>(i) == cl1.getNelements(cl1.getCell(xp));
1879  }
1880 
1881  BOOST_REQUIRE_EQUAL(match,true);
1882 
1883  // now we scan the vector out
1884 
1886  os_scan.resize(v.size());
1887 
1888  gpu::ofp_context_t gpuContext;
1889  openfpm::scan((int *)os.template getDeviceBuffer<0>(),os.size(),(int *)os_scan.template getDeviceBuffer<0>(),gpuContext);
1890 
1891  os_scan.deviceToHost<0>();
1892  os.deviceToHost<0>(os.size()-1,os.size()-1);
1893  size_t size_list = os_scan.template get<0>(os_scan.size()-1) + os.template get<0>(os.size()-1);
1894 
1896  os_list.resize(size_list);
1897 
1898  CUDA_LAUNCH_DIM3((cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
1899  decltype(os_scan.toKernel()),decltype(os_list.toKernel())>),ite.wthr,ite.thr,
1900  cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());
1901 
1902  os_list.deviceToHost<0>();
1903 
1904  match = true;
1905  for (size_t i = 0 ; i < os.size() ; i++)
1906  {
1907  Point<3,float> xp = v.template get<0>(i);
1908 
1909  for (size_t j = 0 ; j < cl1.getNelements(cl1.getCell(xp)) ; j++)
1910  {
1911  match &= os_list.template get<0>(os_scan.template get<0>(i)+j) == cl1.get(cl1.getCell(xp),j);
1912  }
1913  }
1914 
1915  BOOST_REQUIRE_EQUAL(match,true);
1916 
1917  std::cout << "End cell list offload gpu" << "\n";
1918 
1919  // Test the cell list
1920 }
1921 
1922 #endif
1923 
1924 BOOST_AUTO_TEST_CASE( CellList_swap_test )
1925 {
1926  size_t npart = 4096;
1927 
1928  Box<3,float> box({-1.0,-1.0,-1.0},{1.0,1.0,1.0});
1929 
1930  // Subdivisions
1931  size_t div[3] = {10,10,10};
1932 
1933  // Origin
1934  Point<3,float> org({0.0,0.0,0.0});
1935 
1936  // id Cell list
1937  CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cellList2(box,div,2);
1938  CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cellList3(box,div,2);
1939  CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cellList4(box,div,2);
1940 
1941  // vector of particles
1942 
1945 
1946  // create random particles
1947 
1948  fill_random_parts<3>(box,vPos,vPrp,npart);
1949 
1950  vPrp.resize(vPos.size());
1951 
1952  vPos.template hostToDevice<0>();
1953  vPrp.template hostToDevice<0,1>();
1954 
1955  size_t ghostMarker = vPos.size() / 2;
1956 
1957  gpu::ofp_context_t gpuContext(gpu::gpu_context_opt::no_print_props);
1958  cellList2.construct(vPos,vPrp,gpuContext,ghostMarker);
1959  cellList4.construct(vPos,vPrp,gpuContext,ghostMarker);
1960 
1961  cellList3.swap(cellList2);
1962 
1963  // move device to host
1964 
1965  cellList3.debug_deviceToHost();
1966  cellList4.debug_deviceToHost();
1967 
1968  BOOST_REQUIRE_EQUAL(cellList3.getNCells(),cellList4.getNCells());
1969 
1972 
1973  bool check = true;
1974  for (size_t i = 0 ; i < cellList3.getNCells() ; i++)
1975  {
1976  check &= cellList3.getNelements(i) == cellList4.getNelements(i);
1977 
1978  for (size_t j = 0 ; j < cellList3.getNelements(i) ; j++)
1979  {
1980  s1.add(cellList3.get(i,j));
1981  s2.add(cellList4.get(i,j));
1982  }
1983 
1984  s1.sort();
1985  s2.sort();
1986 
1987  for (size_t j = 0 ; j < s1.size() ; j++)
1988  {
1989  check &= s1.get(j) == s2.get(j);
1990  }
1991  }
1992 
1993  BOOST_REQUIRE_EQUAL(check,true);
1994 
1996 }
1997 
1998 BOOST_AUTO_TEST_SUITE_END()
1999 