OpenFPM  5.2.0
Project that contains the implementation of distributed structures
CellList_gpu.hpp
1 
2 #ifndef OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
3 #define OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
4 
5 #include "config.h"
6 
7 #ifdef CUDA_GPU
8 
9 #include "Vector/map_vector_sparse.hpp"
10 #include "NN/CellList/CellDecomposer.hpp"
11 #include "Vector/map_vector.hpp"
12 #include "NN/CellList/cuda/Cuda_cell_list_util_func.hpp"
13 #include "NN/CellList/cuda/CellList_gpu_ker.cuh"
14 #include "util/cuda_util.hpp"
15 #include "NN/CellList/CellList_util.hpp"
16 #include "NN/CellList/CellList.hpp"
17 #include "util/cuda/scan_ofp.cuh"
18 
19 
20 template<unsigned int dim,
21  typename T,
22  typename Memory = CudaMemory,
23  typename transform_type = no_transform_only<dim,T>,
24  bool is_sparse = false>
25 class CellList_gpu;
26 
27 template<unsigned int dim, typename T, typename Memory, typename transform_type>
28 class CellList_gpu<dim,T,Memory,transform_type,false> : public CellDecomposer_sm<dim,T,transform_type>
29 {
30 public:
31  typedef int ids_type;
32 
33 private:
36 
38  openfpm::vector_gpu<aggregate<unsigned int>> cellIndexLocalIndexToUnsorted;
39 
41  openfpm::vector_gpu<aggregate<unsigned int>> numPartInCellPrefixSum;
42 
45 
47  openfpm::vector_gpu<aggregate<unsigned int>> sortedToUnsortedIndex;
48 
50  openfpm::vector_gpu<aggregate<unsigned int>> sortedToSortedIndexNoGhost;
51 
53  openfpm::vector_gpu<aggregate<unsigned int>> unsortedToSortedIndex;
54 
56  openfpm::vector_gpu<aggregate<unsigned int>> isSortedDomainOrGhost;
57 
59  size_t boxNeighborNumber;
60 
62  openfpm::vector_gpu<aggregate<int>> boxNeighborCellOffset;
63 
65  openfpm::vector_gpu<aggregate<int>> boxNeighborCellOffsetSym;
66 
68  openfpm::array<T,dim> unitCellP2;
69 
72 
75 
77  openfpm::vector_gpu<aggregate<int>> rcutNeighborCellOffset;
78 
82  size_t nDecRefRedec;
83 
85  gpu::ofp_context_t* gpuContext;
86 
88  size_t opt;
89 
91  void InitializeStructures(
92  const size_t (& div)[dim],
93  size_t tot_n_cell,
94  size_t pad)
95  {
96  for (size_t i = 0 ; i < dim ; i++)
97  {
98  numCellDim[i] = div[i];
99  unitCellP2[i] = this->getCellBox().getP2().get(i);
100  cellPadDim[i] = pad;
101  }
102 
103  numPartInCell.resize(tot_n_cell+1);
104 
105  boxNeighborNumber = 1;
106  constructNeighborCellOffset(boxNeighborNumber);
107  }
108 
109  void constructNeighborCellOffset(size_t boxNeighborNumber)
110  {
111 
112  NNcalc_box(boxNeighborNumber,boxNeighborCellOffset,this->getGrid());
113  NNcalc_boxSym(boxNeighborNumber,boxNeighborCellOffsetSym,this->getGrid());
114 
115  boxNeighborCellOffset.template hostToDevice<0>();
116  boxNeighborCellOffsetSym.template hostToDevice<0>();
117  }
118 
	/*! \brief Build the list of sorted indexes that belong to domain (non-ghost) particles
	 *
	 * Marks every sorted particle as domain or ghost, prefix-sums the marks in
	 * place, and compacts the domain positions into sortedToSortedIndexNoGhost.
	 *
	 * \param gpuContext gpu context used by the scan
	 * \param start first particle index
	 * \param stop one-past-last particle index
	 * \param ghostMarker unsorted indexes >= ghostMarker are ghost particles
	 */
	void constructSortedToSortedIndexNoGhost(
		gpu::ofp_context_t& gpuContext,
		size_t start,
		size_t stop,
		size_t ghostMarker)
	{
#ifdef __NVCC__
		// one extra element so the scan ends with the total domain count
		isSortedDomainOrGhost.resize(stop-start+1);

		auto ite = isSortedDomainOrGhost.getGPUIterator();

		// flag each sorted slot using its unsorted index vs ghostMarker
		CUDA_LAUNCH((mark_domain_particles),ite,
			sortedToUnsortedIndex.toKernel(),
			isSortedDomainOrGhost.toKernel(),
			ghostMarker
		);

		// in-place scan: each entry becomes the output slot of that particle
		openfpm::scan(
			(unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),
			isSortedDomainOrGhost.size(),
			(unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),
			gpuContext
		);

		// copy back only the last element: the number of domain particles
		isSortedDomainOrGhost.template deviceToHost<0>(isSortedDomainOrGhost.size()-1,isSortedDomainOrGhost.size()-1);
		auto sz = isSortedDomainOrGhost.template get<0>(isSortedDomainOrGhost.size()-1);

		sortedToSortedIndexNoGhost.resize(sz);

		// compact the domain particle positions using the scanned offsets
		CUDA_LAUNCH((collect_domain_ghost_ids),ite,
			isSortedDomainOrGhost.toKernel(),
			sortedToSortedIndexNoGhost.toKernel()
		);
#endif
	}
159 
	/*! \brief Build the dense cell list from the particle positions
	 *
	 * Counts particles per cell, prefix-sums the counts, scatters every
	 * particle to its cell slot, and builds the sorted<->unsorted index maps
	 * plus the domain-only (non-ghost) sorted index list.
	 *
	 * \param vPos particle positions
	 * \param vPrp particle properties (only its size is read, to size the maps)
	 * \param gpuContext gpu context used by the scan
	 * \param ghostMarker unsorted indexes >= ghostMarker are ghost particles
	 * \param start first particle to insert
	 * \param stop one-past-last particle ((size_t)-1 means vPos.size())
	 */
	template<typename vector, typename vector_prp>
	void construct_dense(
		vector & vPos,
		vector_prp & vPrp,
		gpu::ofp_context_t& gpuContext,
		size_t ghostMarker,
		size_t start = 0,
		size_t stop = -1)
	{
#ifdef __NVCC__
		this->gpuContext = &gpuContext;
		this->ghostMarker = ghostMarker;
		if (stop == (size_t)-1) stop = vPos.size();

		// NOTE(review): stop-start-1 underflows when stop == start; presumably
		// the empty case is caught by the wthr.x/stop checks below -- confirm
		auto ite_gpu = vPos.getGPUIteratorTo(stop-start-1);

		// cellListGrid.size() returns total size of the grid
		// one extra slot so the scan can produce the total as its last element
		numPartInCell.resize(this->cellListGrid.size()+1);
		numPartInCell.template fill<0>(0);

		cellIndex_LocalIndex.resize(stop - start);

		if (ite_gpu.wthr.x == 0 || vPos.size() == 0 || stop == 0)
		{
			// no particles
			numPartInCellPrefixSum.resize(numPartInCell.size());
			numPartInCellPrefixSum.template fill<0>(0);
			return;
		}

		// per particle: compute its cell, bump that cell's counter and record
		// (cell index, local index within the cell)
		CUDA_LAUNCH((fill_cellIndex_LocalIndex<dim,T,ids_type>),ite_gpu,
			numCellDim,
			unitCellP2,
			cellPadDim,
			this->getTransform(),
			stop,
			start,
			vPos.toKernel(),
			numPartInCell.toKernel(),
			cellIndex_LocalIndex.toKernel()
		);

		// prefix-sum the per-cell counters -> start offset of each cell
		numPartInCellPrefixSum.resize(numPartInCell.size());
		openfpm::scan(
			(unsigned int *)numPartInCell.template getDeviceBuffer<0>(),
			numPartInCell.size(),
			(unsigned int *)numPartInCellPrefixSum.template getDeviceBuffer<0>(),
			gpuContext
		);

		// scatter each particle to its slot (cell start + local index)
		cellIndexLocalIndexToUnsorted.resize(stop-start);
		auto itgg = cellIndex_LocalIndex.getGPUIterator();

		CUDA_LAUNCH((fill_cells),itgg,
			numPartInCellPrefixSum.toKernel(),
			cellIndex_LocalIndex.toKernel(),
			cellIndexLocalIndexToUnsorted.toKernel(),
			start
		);

		// build both directions of the sorted <-> unsorted index map
		sortedToUnsortedIndex.resize(stop-start);
		unsortedToSortedIndex.resize(vPrp.size());

		CUDA_LAUNCH((constructSortUnsortBidirectMap),
			vPrp.getGPUIteratorTo(stop-start,64),
			sortedToUnsortedIndex.toKernel(),
			unsortedToSortedIndex.toKernel(),
			cellIndexLocalIndexToUnsorted.toKernel()
		);

		constructSortedToSortedIndexNoGhost(gpuContext,start,stop,ghostMarker);
#else

		std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " you are calling CellList_gpu.construct() this function is suppose must be compiled with NVCC compiler, but it look like has been compiled by the standard system compiler" << std::endl;

#endif
	}
251 
252 public:
253 
255  typedef int yes_is_gpu_celllist;
256 
257  // typedefs needed for toKernel_transform
258 
259  static const unsigned int dim_ = dim;
260  typedef T stype_;
261  typedef ids_type ids_type_;
262  typedef transform_type transform_type_;
263  typedef boost::mpl::bool_<false> is_sparse_;
264 
265  // end of typedefs needed for toKernel_transform
266 
	/*! \brief Copy constructor
	 *
	 * \param clg cell list to copy
	 */
	CellList_gpu(const CellList_gpu<dim,T,Memory,transform_type> & clg)
	{
		this->operator=(clg);
	}

	/*! \brief Move constructor
	 *
	 * \param clg cell list to move from
	 */
	CellList_gpu(CellList_gpu<dim,T,Memory,transform_type> && clg)
	{
		this->operator=(std::move(clg));
	}

	//! Default constructor: non-symmetric cell list, grid not yet initialized
	CellList_gpu() : opt(CL_NON_SYMMETRIC) {}

	/*! \brief Construct and initialize the cell list
	 *
	 * \param box domain box
	 * \param div number of cells in each dimension
	 * \param pad number of padding cells (stored per dimension in cellPadDim)
	 */
	CellList_gpu(
		const Box<dim,T> & box,
		const size_t (&div)[dim],
		const size_t pad = 1)
	: opt(CL_NON_SYMMETRIC)
	{
		Initialize(box,div,pad);
	}
301 
	/*! \brief Set the extent of the box neighborhood (in cells) and rebuild the offsets
	 *
	 * \param n_NN number of neighbor cell layers
	 */
	void setBoxNN(size_t n_NN)
	{
		boxNeighborNumber = n_NN;
		constructNeighborCellOffset(n_NN);
	}

	/*! \brief Get the current box neighborhood extent
	 *
	 * \return the neighborhood extent in cells
	 */
	inline size_t getBoxNN() const
	{
		return boxNeighborNumber;
	}

	//! Rebuild the neighbor cell offsets with the current boxNeighborNumber
	void resetBoxNN()
	{
		constructNeighborCellOffset(boxNeighborNumber);
	}
317 
	/*! \brief Initialize the cell decomposer and the internal structures
	 *
	 * \param box domain box
	 * \param div number of cells in each dimension
	 * \param pad number of padding cells
	 */
	void Initialize(
		const Box<dim,T> & box,
		const size_t (&div)[dim],
		const size_t pad = 1)
	{
		Matrix<dim,T> mat;
		CellDecomposer_sm<dim,T,transform_type>::setDimensions(box,div, mat, pad);

		// create the array that stores the number of particles in each cell and set it to 0
		InitializeStructures(this->cellListGrid.getSize(),this->cellListGrid.size(),pad);
	}
337 
339  getSortToNonSort() {
340  return sortedToUnsortedIndex;
341  }
342 
344  getNonSortToSort() {
345  return unsortedToSortedIndex;
346  }
347 
349  getDomainSortIds() {
350  return sortedToSortedIndexNoGhost;
351  }
352 
353 
	/*! \brief Use a cutoff-radius neighborhood instead of the full box
	 *
	 * Computes the cell offsets reachable within the given radius and uploads
	 * them to the device (used by the radius-based neighbor iteration).
	 *
	 * \param radius cutoff radius
	 */
	void setRadius(T radius)
	{
		NNcalc_rad(radius,rcutNeighborCellOffset,this->getCellBox(),this->getGrid());

		rcutNeighborCellOffset.template hostToDevice<0>();
	}
365 
381  template<typename vector, typename vector_prp>
382  void construct(
383  vector & vPos,
384  vector_prp & vPrp,
385  gpu::ofp_context_t& gpuContext,
386  size_t ghostMarker = 0,
387  size_t start = 0,
388  size_t stop = -1)
389  {
390 #ifdef __NVCC__
391  if (opt & CL_SYMMETRIC) {
392  std::cout << __FILE__ << ":" << __LINE__ << " symmetric cell list on GPU is not implemented. (And will never be, race conditions make them non suitable for GPU)" << std::endl;
393  }
394 
395  else if (opt & CL_LOCAL_SYMMETRIC) {
396  std::cout << __FILE__ << ":" << __LINE__ << " local symmetric cell list on GPU is not implemented" << std::endl;
397  }
398 
399  else if (opt & CL_NON_SYMMETRIC) {
400  construct_dense(vPos,vPrp,gpuContext,ghostMarker,start,stop);
401  }
402 #else
403  std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " you are calling CellList_gpu.construct() this function is suppose must be compiled with NVCC compiler, but it look like has been compiled by the standard system compiler" << std::endl;
404 #endif
405  }
406 
	/*! \brief Construct the cell list and additionally write reordered
	 *         (cell-sorted) copies of positions/properties
	 *
	 * After the dense construction, positions and the listed properties are
	 * gathered into vPosReorder/vPrpReorder using unsortedToSortedIndex.
	 *
	 * \tparam prp properties to reorder (CL_GPU_REORDER_PROPERTY has effect
	 *         only when at least one is listed)
	 *
	 * \param vPos particle positions (input)
	 * \param vPrp particle properties (input)
	 * \param vPosReorder reordered positions (output)
	 * \param vPrpReorder reordered properties (output)
	 * \param gpuContext gpu context
	 * \param ghostMarker unsorted indexes >= ghostMarker are ghost particles
	 * \param start first particle
	 * \param stop one-past-last particle ((size_t)-1 means vPos.size())
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct(
		vector & vPos,
		vector_prp & vPrp,
		vector & vPosReorder,
		vector_prp & vPrpReorder,
		gpu::ofp_context_t& gpuContext,
		size_t ghostMarker = 0,
		size_t start = 0,
		size_t stop = -1)
	{
#ifdef __NVCC__
		if (opt & CL_SYMMETRIC) {
			std::cout << __FILE__ << ":" << __LINE__ << " symmetric cell list on GPU is not implemented. (And will never be, race conditions make them non suitable for GPU)" << std::endl;
		}

		else if (opt & CL_LOCAL_SYMMETRIC) {
			std::cout << __FILE__ << ":" << __LINE__ << " local symmetric cell list on GPU is not implemented" << std::endl;
		}

		else if (opt & CL_NON_SYMMETRIC) {

			// a static domain lets the caller reuse the previous construction
			if (!(opt & CL_GPU_SKIP_CONSTRUCT_ON_STATIC_DOMAIN))
				construct_dense(vPos,vPrp,gpuContext,ghostMarker,start,stop);

			if (stop == (size_t)-1) stop = vPos.size();

			// gather positions into cell-sorted order
			if (opt & CL_GPU_REORDER_POSITION) {
				CUDA_LAUNCH((reorderParticlesPos),
					vPos.getGPUIteratorTo(stop-start,64),
					vPos.toKernel(),
					vPosReorder.toKernel(),
					unsortedToSortedIndex.toKernel(),
					start
				);
			}

			// gather the selected properties into cell-sorted order
			if (opt & CL_GPU_REORDER_PROPERTY && sizeof...(prp)) {
				CUDA_LAUNCH(
					(reorderParticlesPrp<
						decltype(vPrp.toKernel()),
						decltype(unsortedToSortedIndex.toKernel()),
						prp...>),
					vPrp.getGPUIteratorTo(stop-start,64),
					vPrp.toKernel(),
					vPrpReorder.toKernel(),
					unsortedToSortedIndex.toKernel(),
					start
				);
			}
		}
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " you are calling CellList_gpu.construct() this function is suppose must be compiled with NVCC compiler, but it look like has been compiled by the standard system compiler" << std::endl;
#endif
	}
481 
483  {
485  numPartInCellPrefixSum.toKernel(),
486  sortedToUnsortedIndex.toKernel(),
487  sortedToSortedIndexNoGhost.toKernel(),
488  rcutNeighborCellOffset.toKernel(),
489  boxNeighborCellOffset.toKernel(),
490  boxNeighborCellOffsetSym.toKernel(),
491  unitCellP2,
492  numCellDim,
493  cellPadDim,
494  this->getTransform(),
495  ghostMarker,
496  this->cellListSpaceBox,
497  this->cellListGrid,
498  this->cellShift
499  );
500 };
501 
	/*! \brief Release the construction buffers built by construct()
	 *
	 * Note: the neighborhood offset tables and the remaining index maps are
	 * intentionally kept.
	 */
	void clear()
	{
		numPartInCell.clear();
		cellIndexLocalIndexToUnsorted.clear();
		numPartInCellPrefixSum.clear();
		cellIndex_LocalIndex.clear();
		sortedToUnsortedIndex.clear();
	}
514 
516 
	//! marker separating domain particles from ghost particles
	//! (used by mark_domain_particles on the unsorted indexes)
	size_t ghostMarker = 0;

	/*! \brief Get the ghost marker
	 *
	 * \return the ghost marker
	 */
	inline size_t getGhostMarker()
	{
		return ghostMarker;
	}

	/*! \brief Set the ghost marker
	 *
	 * \param ghostMarker first unsorted index belonging to the ghost
	 */
	inline void setGhostMarker(size_t ghostMarker)
	{
		this->ghostMarker = ghostMarker;
	}

	/*! \brief Store the decomposition counter
	 *
	 * NOTE(review): opaque id set/read by callers (e.g. to detect a stale
	 * decomposition) -- semantics defined outside this class
	 *
	 * \param nDecRefRedec decomposition counter
	 */
	void set_ndec(size_t nDecRefRedec)
	{
		this->nDecRefRedec = nDecRefRedec;
	}

	/*! \brief Get the decomposition counter
	 *
	 * \return the decomposition counter
	 */
	size_t get_ndec() const
	{
		return nDecRefRedec;
	}
561 
563 
	//! Copy the internal cell arrays from device to host (debug/inspection only)
	void debug_deviceToHost()
	{
		numPartInCell.template deviceToHost<0>();
		cellIndexLocalIndexToUnsorted.template deviceToHost<0>();
		numPartInCellPrefixSum.template deviceToHost<0>();
	}

	/*! \brief Get the number of entries of the per-cell counter array
	 *
	 * \note InitializeStructures/construct_dense size this array to the number
	 *       of cells plus one (extra scan slot)
	 *
	 * \return the size of the counter array
	 */
	size_t getNCells()
	{
		return numPartInCell.size();
	}

	/*! \brief Get the number of particles in the given cell
	 *
	 * NOTE(review): host-side read -- presumably requires a prior
	 * debug_deviceToHost() to be meaningful; confirm with callers
	 *
	 * \param i cell index
	 *
	 * \return number of particles in cell i
	 */
	size_t getNelements(size_t i)
	{
		return numPartInCell.template get<0>(i);
	}
593 
	/*! \brief Get the unsorted particle index of the ele-th particle in a cell (host-side)
	 *
	 * NOTE(review): reads host copies -- presumably requires a prior
	 * debug_deviceToHost(); confirm with callers
	 *
	 * \param cell cell index
	 * \param ele element within the cell
	 *
	 * \return the unsorted particle index
	 */
	inline auto get(size_t cell, size_t ele) -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele))
	{
		return cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele);
	}

	/*! \brief Get the unsorted particle index of the ele-th particle in a cell (host-side, const)
	 *
	 * \param cell cell index
	 * \param ele element within the cell
	 *
	 * \return the unsorted particle index
	 */
	inline auto get(size_t cell, size_t ele) const -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele))
	{
		return cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele);
	}
622 
628  void swap(CellList_gpu<dim,T,Memory,transform_type,false> & clg)
629  {
630  ((CellDecomposer_sm<dim,T,transform_type> *)this)->swap(clg);
631  numPartInCell.swap(clg.numPartInCell);
632  cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
633  numPartInCellPrefixSum.swap(clg.numPartInCellPrefixSum);
634  cellIndex_LocalIndex.swap(clg.cellIndex_LocalIndex);
635  sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
636  sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
637  unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);
638  boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
639  boxNeighborCellOffsetSym.swap(clg.boxNeighborCellOffsetSym);
640  rcutNeighborCellOffset.swap(clg.rcutNeighborCellOffset);
641 
642  unitCellP2.swap(clg.unitCellP2);
643  numCellDim.swap(clg.numCellDim);
644  cellPadDim.swap(clg.cellPadDim);
645 
646  size_t g_m_tmp = ghostMarker;
647  ghostMarker = clg.ghostMarker;
648  clg.ghostMarker = g_m_tmp;
649 
650  size_t n_dec_tmp = nDecRefRedec;
651  nDecRefRedec = clg.nDecRefRedec;
652  clg.nDecRefRedec = n_dec_tmp;
653 
654  size_t optTmp = opt;
655  opt = clg.opt;
656  clg.opt = optTmp;
657  }
658 
	/*! \brief Copy assignment: deep-copies the decomposer state and every array
	 *
	 * \param clg cell list to copy
	 *
	 * \return *this
	 */
	CellList_gpu<dim,T,Memory,transform_type,false> &
	operator=(const CellList_gpu<dim,T,Memory,transform_type,false> & clg)
	{
		*static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this) = *static_cast<const CellDecomposer_sm<dim,T,transform_type> *>(&clg);
		numPartInCell = clg.numPartInCell;
		cellIndexLocalIndexToUnsorted = clg.cellIndexLocalIndexToUnsorted;
		numPartInCellPrefixSum = clg.numPartInCellPrefixSum;
		cellIndex_LocalIndex = clg.cellIndex_LocalIndex;
		sortedToUnsortedIndex = clg.sortedToUnsortedIndex;
		sortedToSortedIndexNoGhost = clg.sortedToSortedIndexNoGhost;
		unsortedToSortedIndex = clg.unsortedToSortedIndex;
		boxNeighborCellOffset = clg.boxNeighborCellOffset;
		boxNeighborCellOffsetSym= clg.boxNeighborCellOffsetSym;
		rcutNeighborCellOffset = clg.rcutNeighborCellOffset;

		// scalar/geometry state
		unitCellP2 = clg.unitCellP2;
		numCellDim = clg.numCellDim;
		cellPadDim = clg.cellPadDim;
		ghostMarker = clg.ghostMarker;
		nDecRefRedec = clg.nDecRefRedec;
		opt = clg.opt;
		boxNeighborNumber = clg.boxNeighborNumber;

		return *this;
	}
684 
	/*! \brief Move assignment: swaps the buffers with the source and copies the
	 *         scalar state
	 *
	 * \param clg cell list to move from
	 *
	 * \return *this
	 */
	CellList_gpu<dim,T,Memory,transform_type> &
	operator=(CellList_gpu<dim,T,Memory,transform_type> && clg)
	{
		static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this)->swap(*static_cast<CellDecomposer_sm<dim,T,transform_type> *>(&clg));
		numPartInCell.swap(clg.numPartInCell);
		cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
		numPartInCellPrefixSum.swap(clg.numPartInCellPrefixSum);
		cellIndex_LocalIndex.swap(clg.cellIndex_LocalIndex);
		sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
		sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
		unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);
		boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
		boxNeighborCellOffsetSym.swap(clg.boxNeighborCellOffsetSym);
		rcutNeighborCellOffset.swap(clg.rcutNeighborCellOffset);

		// small fixed-size arrays and scalars are copied, not swapped
		unitCellP2 = clg.unitCellP2;
		numCellDim = clg.numCellDim;
		cellPadDim = clg.cellPadDim;
		ghostMarker = clg.ghostMarker;
		nDecRefRedec = clg.nDecRefRedec;
		opt = clg.opt;
		boxNeighborNumber = clg.boxNeighborNumber;

		return *this;
	}
710 
	/*! \brief Scatter reordered (cell-sorted) particles back to their original order
	 *
	 * Inverse of the reordering construct(): uses sortedToUnsortedIndex to
	 * write positions/properties back to their unsorted slots.
	 *
	 * \tparam prp properties to restore (CL_GPU_RESTORE_PROPERTY has effect
	 *         only when at least one is listed)
	 *
	 * \param vPosReordered cell-sorted positions (input)
	 * \param vPrpReordered cell-sorted properties (input)
	 * \param vPos positions in original order (output)
	 * \param vPrp properties in original order (output)
	 * \param start first particle
	 * \param stop one-past-last particle ((size_t)-1 means vPosReordered.size())
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void restoreOrder(
		vector & vPosReordered,
		vector_prp & vPrpReordered,
		vector & vPos,
		vector_prp & vPrp,
		size_t start = 0,
		size_t stop = -1)
	{
	#ifdef __NVCC__
		if (stop == (size_t)-1) stop = vPosReordered.size();

		if (opt & CL_GPU_RESTORE_POSITION) {
			CUDA_LAUNCH((reorderParticlesPos),
				vPosReordered.getGPUIteratorTo(stop-start,64),
				vPosReordered.toKernel(),
				vPos.toKernel(),
				sortedToUnsortedIndex.toKernel(),
				start
			);
		}

		if (opt & CL_GPU_RESTORE_PROPERTY && sizeof...(prp)) {
			CUDA_LAUNCH(
				(reorderParticlesPrp<
					decltype(vPrpReordered.toKernel()),
					decltype(sortedToUnsortedIndex.toKernel()),
					prp...>),
				vPrpReordered.getGPUIteratorTo(stop-start,64),
				vPrpReordered.toKernel(),
				vPrp.toKernel(),
				sortedToUnsortedIndex.toKernel(),
				start
			);
		}
	#endif
	}
763 
	/*! \brief Get the runtime option flags (CL_* bitmask)
	 *
	 * \return the option bitmask
	 */
	inline size_t getOpt() const
	{
		return opt;
	}

	/*! \brief Set the runtime option flags (CL_* bitmask)
	 *
	 * \param opt option bitmask
	 */
	void setOpt(size_t opt)
	{
		this->opt = opt;
	}
784 };
785 
786 
787 template<unsigned int dim, typename T, typename Memory, typename transform_type>
788 class CellList_gpu<dim,T,Memory,transform_type,true> : public CellDecomposer_sm<dim,T,transform_type>
789 {
790 public:
791  typedef int ids_type;
792 
793 private:
795  openfpm::vector_gpu<aggregate<unsigned int>> cellIndexLocalIndexToUnsorted;
796 
799 
801  openfpm::vector_sparse_gpu<aggregate<unsigned int>> vecSparseCellIndex_PartIndex;
802 
804  openfpm::vector_gpu<aggregate<unsigned int>> nonEmptyNeighborCellCount;
805 
808 
810  size_t boxNeighborNumber;
811 
813  openfpm::vector_gpu<aggregate<int>> boxNeighborCellOffset;
814 
816  openfpm::vector_gpu<aggregate<unsigned int>> sortedToUnsortedIndex;
817 
819  openfpm::vector_gpu<aggregate<unsigned int>> sortedToSortedIndexNoGhost;
820 
822  openfpm::vector_gpu<aggregate<unsigned int>> unsortedToSortedIndex;
823 
825  openfpm::vector_gpu<aggregate<unsigned int>> isSortedDomainOrGhost;
826 
828  openfpm::array<T,dim> unitCellP2;
829 
831  openfpm::array<ids_type,dim> numCellDim;
832 
834  openfpm::array<ids_type,dim> cellPadDim;
835 
838  size_t nDecRefRedec;
839 
841  gpu::ofp_context_t* gpuContext;
842 
844  size_t opt;
845 
	/*! \brief Cache the grid geometry and build the default neighborhood
	 *
	 * \param div number of cells in each dimension
	 * \param tot_n_cell total number of cells (unused here; kept for symmetry
	 *        with the dense specialization)
	 * \param pad number of padding cells
	 */
	void InitializeStructures(
		const size_t (& div)[dim],
		size_t tot_n_cell,
		size_t pad)
	{
		// cache the per-dimension geometry used by the cell-index kernels
		for (size_t i = 0 ; i < dim ; i++)
		{
			numCellDim[i] = div[i];
			unitCellP2[i] = this->getCellBox().getP2().get(i);
			cellPadDim[i] = pad;
		}

		// default neighborhood: one layer of surrounding cells
		boxNeighborNumber = 1;
		constructNeighborCellOffset(boxNeighborNumber);
	}
862 
	/*! \brief Build the box-neighborhood cell offsets and upload them to the device
	 *
	 * \param boxNeighborNumber extent of the neighborhood in cells
	 */
	void constructNeighborCellOffset(size_t boxNeighborNumber)
	{
		NNcalc_box(boxNeighborNumber,boxNeighborCellOffset,this->getGrid());

		boxNeighborCellOffset.template hostToDevice<0>();
	}
869 
	/*! \brief Build the sparse cell list from the particle positions
	 *
	 * Computes each particle's cell, inserts (cell, particle) pairs into a
	 * sparse vector, flushes it to obtain per-cell start offsets, then builds
	 * the neighbor-cell range list and the sorted<->unsorted index maps.
	 *
	 * \param vPos particle positions
	 * \param vPrp particle properties (only its size is read, to size the maps)
	 * \param gpuContext gpu context used by the flush/scan
	 * \param ghostMarker unsorted indexes >= ghostMarker are ghost particles
	 * \param start first particle to insert
	 * \param stop one-past-last particle ((size_t)-1 means vPos.size())
	 */
	template<typename vector, typename vector_prp>
	void construct_sparse(
		vector & vPos,
		vector_prp & vPrp,
		gpu::ofp_context_t& gpuContext,
		size_t ghostMarker,
		size_t start = 0,
		size_t stop = -1)
	{
#ifdef __NVCC__
		this->gpuContext = &gpuContext;
		this->ghostMarker = ghostMarker;
		if (stop == (size_t)-1) stop = vPos.size();

		cellIndex.resize(stop - start);
		cellIndex.template fill<0>(0);

		auto ite_gpu = vPos.getGPUIteratorTo(stop-start,1024);

		if (ite_gpu.wthr.x == 0 || vPos.size() == 0 || stop == 0)
			return;

		// per particle: compute its cell index
		// NOTE(review): vPos.size() is passed as the stop bound here while the
		// dense variant passes stop -- confirm this is intended when stop < vPos.size()
		CUDA_LAUNCH((fill_cellIndex<dim,T,ids_type>),ite_gpu,
			numCellDim,
			unitCellP2,
			cellPadDim,
			this->getTransform(),
			vPos.size(),
			start,
			vPos.toKernel(),
			cellIndex.toKernel()
		);

		cellIndexLocalIndexToUnsorted.resize(stop-start);

		// (unsigned int)-1 marks empty slots in the sparse map
		vecSparseCellIndex_PartIndex.clear();
		vecSparseCellIndex_PartIndex.template setBackground<0>((unsigned int)-1);
		vecSparseCellIndex_PartIndex.setGPUInsertBuffer(ite_gpu.wthr.x,ite_gpu.thr.x);

		// insert one (cell index, particle index) pair per particle
		CUDA_LAUNCH((fill_vsCellIndex_PartIndex),ite_gpu,
			vecSparseCellIndex_PartIndex.toKernel(),
			cellIndex.toKernel()
		);

		// flush_vd<sstart_<0>> returns the cumulative prefix for cell indexes
		vecSparseCellIndex_PartIndex.template flush_vd<sstart_<0>>(cellIndexLocalIndexToUnsorted,gpuContext,FLUSH_ON_DEVICE);

		nonEmptyNeighborCellCount.resize(vecSparseCellIndex_PartIndex.size()+1);
		nonEmptyNeighborCellCount.template fill<0>(0);

		// for every particle increase the counter for every non-zero neighbor cell
		auto itgg = vecSparseCellIndex_PartIndex.getGPUIterator();
		CUDA_LAUNCH((countNonEmptyNeighborCells),itgg,
			vecSparseCellIndex_PartIndex.toKernel(),
			nonEmptyNeighborCellCount.toKernel(),
			boxNeighborCellOffset.toKernel()
		);

		// get total number of non-empty neighboring cells
		openfpm::scan(
			(unsigned int *)nonEmptyNeighborCellCount.template getDeviceBuffer<0>(),
			nonEmptyNeighborCellCount.size(),
			(unsigned int *)nonEmptyNeighborCellCount.template getDeviceBuffer<0>(),
			gpuContext
		);

		// copy back only the last element of the scan: the total
		nonEmptyNeighborCellCount.template deviceToHost<0>(nonEmptyNeighborCellCount.size()-1, nonEmptyNeighborCellCount.size()-1);
		size_t totalNeighborCellCount = nonEmptyNeighborCellCount.template get<0>(nonEmptyNeighborCellCount.size()-1);

		// build the [from, to) particle ranges of every non-empty neighbor cell
		neighborPartIndexFrom_To.resize(totalNeighborCellCount);
		CUDA_LAUNCH((fillNeighborCellList),itgg,
			vecSparseCellIndex_PartIndex.toKernel(),
			nonEmptyNeighborCellCount.toKernel(),
			boxNeighborCellOffset.toKernel(),
			neighborPartIndexFrom_To.toKernel(),
			(typename decltype(vecSparseCellIndex_PartIndex.toKernel())::index_type)cellIndexLocalIndexToUnsorted.size()
		);

		// build both directions of the sorted <-> unsorted index map
		sortedToUnsortedIndex.resize(stop-start);
		unsortedToSortedIndex.resize(vPrp.size());

		// NOTE(review): ite is unused; the launch below builds its own iterator
		auto ite = vPos.getGPUIteratorTo(stop-start,64);

		CUDA_LAUNCH((constructSortUnsortBidirectMap),
			vPrp.getGPUIteratorTo(stop-start,64),
			sortedToUnsortedIndex.toKernel(),
			unsortedToSortedIndex.toKernel(),
			cellIndexLocalIndexToUnsorted.toKernel()
		);

		constructSortedToSortedIndexNoGhost(gpuContext,start,stop,ghostMarker);
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " you are calling CellList_gpu.construct() this function is suppose must be compiled with NVCC compiler, but it look like has been compiled by the standard system compiler" << std::endl;
#endif
	}
969 
	/*! \brief Build the list of sorted indexes that belong to domain (non-ghost) particles
	 *
	 * Marks every sorted particle as domain or ghost, prefix-sums the marks in
	 * place, and compacts the domain positions into sortedToSortedIndexNoGhost.
	 *
	 * \param gpuContext gpu context used by the scan
	 * \param start first particle index
	 * \param stop one-past-last particle index
	 * \param ghostMarker unsorted indexes >= ghostMarker are ghost particles
	 */
	void constructSortedToSortedIndexNoGhost(
		gpu::ofp_context_t& gpuContext,
		size_t start,
		size_t stop,
		size_t ghostMarker)
	{
#ifdef __NVCC__
		// one extra element so the scan ends with the total domain count
		isSortedDomainOrGhost.resize(stop-start+1);
		auto ite = isSortedDomainOrGhost.getGPUIterator();

		// flag each sorted slot using its unsorted index vs ghostMarker
		CUDA_LAUNCH((mark_domain_particles),ite,
			sortedToUnsortedIndex.toKernel(),
			isSortedDomainOrGhost.toKernel(),
			ghostMarker
		);

		// in-place scan: each entry becomes the output slot of that particle
		openfpm::scan(
			(unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),
			isSortedDomainOrGhost.size(),
			(unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),
			gpuContext
		);

		// copy back only the last element: the number of domain particles
		isSortedDomainOrGhost.template deviceToHost<0>(isSortedDomainOrGhost.size()-1,isSortedDomainOrGhost.size()-1);
		auto totalParticleNoGhostCount = isSortedDomainOrGhost.template get<0>(isSortedDomainOrGhost.size()-1);

		sortedToSortedIndexNoGhost.resize(totalParticleNoGhostCount);

		// compact the domain particle positions using the scanned offsets
		CUDA_LAUNCH((collect_domain_ghost_ids),ite,
			isSortedDomainOrGhost.toKernel(),
			sortedToSortedIndexNoGhost.toKernel()
		);
#endif
	}
1009 
1010 public:
1011 
1013  typedef int yes_is_gpu_celllist;
1014 
1015  // typedefs needed for toKernel_transform
1016 
1017  static const unsigned int dim_ = dim;
1018  typedef T stype_;
1019  typedef ids_type ids_type_;
1020  typedef transform_type transform_type_;
1021  typedef boost::mpl::bool_<true> is_sparse_;
1022 
1023  // end of typedefs needed for toKernel_transform
1024 
	/*! \brief Copy constructor
	 *
	 * \param clg cell list to copy
	 */
	CellList_gpu(const CellList_gpu<dim,T,Memory,transform_type> & clg)
	{
		this->operator=(clg);
	}

	/*! \brief Move constructor
	 *
	 * \param clg cell list to move from
	 */
	CellList_gpu(CellList_gpu<dim,T,Memory,transform_type> && clg)
	{
		this->operator=(std::move(clg));
	}

	//! Default constructor: non-symmetric cell list, grid not yet initialized
	CellList_gpu() : opt(CL_NON_SYMMETRIC) {}

	/*! \brief Construct and initialize the cell list
	 *
	 * \param box domain box
	 * \param div number of cells in each dimension
	 * \param pad number of padding cells (stored per dimension in cellPadDim)
	 */
	CellList_gpu(
		const Box<dim,T> & box,
		const size_t (&div)[dim],
		const size_t pad = 1)
	: opt(CL_NON_SYMMETRIC)
	{
		Initialize(box,div,pad);
	}
1059 
	/*! \brief Set the extent of the box neighborhood (in cells) and rebuild the offsets
	 *
	 * \param n_NN number of neighbor cell layers
	 */
	void setBoxNN(unsigned int n_NN)
	{
		boxNeighborNumber = n_NN;
		constructNeighborCellOffset(n_NN);
	}

	/*! \brief Get the current box neighborhood extent
	 *
	 * \return the neighborhood extent in cells
	 */
	inline size_t getBoxNN() const
	{
		return boxNeighborNumber;
	}

	//! Rebuild the neighbor cell offsets with the current boxNeighborNumber
	void resetBoxNN()
	{
		constructNeighborCellOffset(boxNeighborNumber);
	}
1075 
	/*! \brief Initialize the cell decomposer and the internal structures
	 *
	 * \param box domain box
	 * \param div number of cells in each dimension
	 * \param pad number of padding cells
	 */
	void Initialize(
		const Box<dim,T> & box,
		const size_t (&div)[dim],
		const size_t pad = 1)
	{
		Matrix<dim,T> mat;
		CellDecomposer_sm<dim,T,transform_type>::setDimensions(box, div, mat, pad);

		// cache the grid geometry and build the default neighborhood offsets
		InitializeStructures(this->cellListGrid.getSize(),this->cellListGrid.size(),pad);
	}
1095 
1097  getSortToNonSort() {
1098  return sortedToUnsortedIndex;
1099  }
1100 
1102  getNonSortToSort() {
1103  return unsortedToSortedIndex;
1104  }
1105 
1107  getDomainSortIds() {
1108  return sortedToSortedIndexNoGhost;
1109  }
1110 
1111 
	/*! \brief Radius-based neighborhoods are not available for the sparse cell list
	 *
	 * Prints a warning and leaves the neighborhood unchanged.
	 *
	 * \param radius cutoff radius (ignored)
	 */
	void setRadius(T radius)
	{
		std::cerr << "setRadius() is supported by dense cell list only!\n";
	}
1121 
	/*! \brief Construct the sparse cell list from scratch
	 *
	 * Only CL_NON_SYMMETRIC is implemented on GPU; the symmetric variants
	 * print a diagnostic and do nothing.
	 *
	 * \param vPos particle positions
	 * \param vPrp particle properties
	 * \param gpuContext gpu context
	 * \param ghostMarker unsorted indexes >= ghostMarker are ghost particles
	 * \param start first particle to insert
	 * \param stop one-past-last particle ((size_t)-1 means vPos.size())
	 */
	template<typename vector, typename vector_prp>
	void construct(
		vector & vPos,
		vector_prp & vPrp,
		gpu::ofp_context_t& gpuContext,
		size_t ghostMarker = 0,
		size_t start = 0,
		size_t stop = -1)
	{
#ifdef __NVCC__
		if (opt & CL_SYMMETRIC) {
			std::cout << __FILE__ << ":" << __LINE__ << " symmetric cell list on GPU is not implemented. (And will never be, race conditions make them non suitable for GPU)" << std::endl;
		}

		else if (opt & CL_LOCAL_SYMMETRIC) {
			std::cout << __FILE__ << ":" << __LINE__ << " local symmetric cell list on GPU is not implemented" << std::endl;
		}

		else if (opt & CL_NON_SYMMETRIC) {
			construct_sparse(vPos,vPrp,gpuContext,ghostMarker, start, stop);
		}
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " you are calling CellList_gpu.construct() this function is suppose must be compiled with NVCC compiler, but it look like has been compiled by the standard system compiler" << std::endl;
#endif
	}
1162 
	/*! \brief Construct the sparse cell list and additionally write reordered
	 *         (cell-sorted) copies of positions/properties
	 *
	 * \tparam prp properties to reorder (CL_GPU_REORDER_PROPERTY has effect
	 *         only when at least one is listed)
	 *
	 * \param vPos particle positions (input)
	 * \param vPrp particle properties (input)
	 * \param vPosReorder reordered positions (output)
	 * \param vPrpReorder reordered properties (output)
	 * \param gpuContext gpu context
	 * \param ghostMarker unsorted indexes >= ghostMarker are ghost particles
	 * \param start first particle
	 * \param stop one-past-last particle ((size_t)-1 means vPos.size())
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct(
		vector & vPos,
		vector_prp & vPrp,
		vector & vPosReorder,
		vector_prp & vPrpReorder,
		gpu::ofp_context_t& gpuContext,
		size_t ghostMarker = 0,
		size_t start = 0,
		size_t stop = -1)
	{
#ifdef __NVCC__
		if (opt & CL_SYMMETRIC) {
			std::cout << __FILE__ << ":" << __LINE__ << " symmetric cell list on GPU is not implemented. (And will never be, race conditions make them non suitable for GPU)" << std::endl;
		}

		else if (opt & CL_LOCAL_SYMMETRIC) {
			std::cout << __FILE__ << ":" << __LINE__ << " local symmetric cell list on GPU is not implemented" << std::endl;
		}

		else if (opt & CL_NON_SYMMETRIC) {
			// a static domain lets the caller reuse the previous construction
			if (!(opt & CL_GPU_SKIP_CONSTRUCT_ON_STATIC_DOMAIN))
				construct_sparse(vPos,vPrp,gpuContext,ghostMarker, start, stop);

			if (stop == (size_t)-1) stop = vPos.size();

			// gather positions into cell-sorted order
			if (opt & CL_GPU_REORDER_POSITION) {
				CUDA_LAUNCH((reorderParticlesPos),
					vPos.getGPUIteratorTo(stop-start,64),
					vPos.toKernel(),
					vPosReorder.toKernel(),
					unsortedToSortedIndex.toKernel(),
					start
				);
			}

			// gather the selected properties into cell-sorted order
			if (opt & CL_GPU_REORDER_PROPERTY && sizeof...(prp)) {
				CUDA_LAUNCH(
					(reorderParticlesPrp<
						decltype(vPrp.toKernel()),
						decltype(unsortedToSortedIndex.toKernel()),
						prp...>),
					vPrp.getGPUIteratorTo(stop-start,64),
					vPrp.toKernel(),
					vPrpReorder.toKernel(),
					unsortedToSortedIndex.toKernel(),
					start
				);
			}
		}
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__ << " you are calling CellList_gpu.construct() this function is suppose must be compiled with NVCC compiler, but it look like has been compiled by the standard system compiler" << std::endl;
#endif
	}
1236 
1238  {
1240  nonEmptyNeighborCellCount.toKernel(),
1241  neighborPartIndexFrom_To.toKernel(),
1242  vecSparseCellIndex_PartIndex.toKernel(),
1243  sortedToUnsortedIndex.toKernel(),
1244  sortedToSortedIndexNoGhost.toKernel(),
1245  unitCellP2,
1246  numCellDim,
1247  cellPadDim,
1248  this->getTransform(),
1249  ghostMarker,
1250  this->cellListSpaceBox,
1251  this->cellListGrid,
1252  this->cellShift
1253  );
1254  }
1255 
	/*! \brief Release the construction buffers built by construct()
	 *
	 * Note: the sparse map and the neighbor range list are intentionally kept.
	 */
	void clear()
	{
		cellIndexLocalIndexToUnsorted.clear();
		cellIndex.clear();
		sortedToUnsortedIndex.clear();
	}
1266 
1268 
	//! marker separating domain particles from ghost particles
	//! (used by mark_domain_particles on the unsorted indexes)
	size_t ghostMarker = 0;

	/*! \brief Get the ghost marker
	 *
	 * \return the ghost marker
	 */
	inline size_t getGhostMarker()
	{
		return ghostMarker;
	}

	/*! \brief Set the ghost marker
	 *
	 * \param ghostMarker first unsorted index belonging to the ghost
	 */
	inline void setGhostMarker(size_t ghostMarker)
	{
		this->ghostMarker = ghostMarker;
	}

	/*! \brief Store the decomposition counter
	 *
	 * NOTE(review): opaque id set/read by callers (e.g. to detect a stale
	 * decomposition) -- semantics defined outside this class
	 *
	 * \param nDecRefRedec decomposition counter
	 */
	void set_ndec(size_t nDecRefRedec)
	{
		this->nDecRefRedec = nDecRefRedec;
	}

	/*! \brief Get the decomposition counter
	 *
	 * \return the decomposition counter
	 */
	size_t get_ndec() const
	{
		return nDecRefRedec;
	}
1313 
1315 
	//! Copy the internal cell arrays from device to host (debug/inspection only)
	void debug_deviceToHost()
	{
		cellIndexLocalIndexToUnsorted.template deviceToHost<0>();
		cellIndex.template deviceToHost<0>();
	}

	/*! \brief Get the unsorted particle index of the ele-th particle in a cell (host-side)
	 *
	 * NOTE(review): reads host copies -- presumably requires a prior
	 * debug_deviceToHost(); confirm with callers
	 *
	 * \param cell cell index
	 * \param ele element within the cell
	 *
	 * \return the unsorted particle index
	 */
	inline auto get(size_t cell, size_t ele) -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele))
	{
		return cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele);
	}

	/*! \brief Get the unsorted particle index of the ele-th particle in a cell (host-side, const)
	 *
	 * \param cell cell index
	 * \param ele element within the cell
	 *
	 * \return the unsorted particle index
	 */
	inline auto get(size_t cell, size_t ele) const -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele))
	{
		return cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele);
	}
1354 
	/*! \brief Swap the content of this cell list with another one
	 *
	 * \param clg cell list to swap with
	 */
	void swap(CellList_gpu<dim,T,Memory,transform_type,true> & clg)
	{
		((CellDecomposer_sm<dim,T,transform_type> *)this)->swap(clg);
		cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
		cellIndex.swap(clg.cellIndex);
		vecSparseCellIndex_PartIndex.swap(clg.vecSparseCellIndex_PartIndex);
		nonEmptyNeighborCellCount.swap(clg.nonEmptyNeighborCellCount);
		neighborPartIndexFrom_To.swap(clg.neighborPartIndexFrom_To);
		boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
		sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
		sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
		unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);

		unitCellP2.swap(clg.unitCellP2);
		numCellDim.swap(clg.numCellDim);
		cellPadDim.swap(clg.cellPadDim);

		// scalars are exchanged through temporaries
		size_t g_m_tmp = ghostMarker;
		ghostMarker = clg.ghostMarker;
		clg.ghostMarker = g_m_tmp;

		size_t n_dec_tmp = nDecRefRedec;
		nDecRefRedec = clg.nDecRefRedec;
		clg.nDecRefRedec = n_dec_tmp;

		size_t optTmp = opt;
		opt = clg.opt;
		clg.opt = optTmp;

		int boxNN_tmp = boxNeighborNumber;
		boxNeighborNumber = clg.boxNeighborNumber;
		clg.boxNeighborNumber = boxNN_tmp;
	}
1393 
1394  CellList_gpu<dim,T,Memory,transform_type,true> &
1395  operator=(const CellList_gpu<dim,T,Memory,transform_type,true> & clg)
1396  {
1397  *static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this) = *static_cast<const CellDecomposer_sm<dim,T,transform_type> *>(&clg);
1398  cellIndexLocalIndexToUnsorted = clg.cellIndexLocalIndexToUnsorted;
1399  cellIndex = clg.cellIndex;
1400  vecSparseCellIndex_PartIndex = clg.vecSparseCellIndex_PartIndex;
1401  nonEmptyNeighborCellCount = clg.nonEmptyNeighborCellCount;
1402  neighborPartIndexFrom_To = clg.neighborPartIndexFrom_To;
1403  boxNeighborCellOffset = clg.boxNeighborCellOffset;
1404  sortedToUnsortedIndex = clg.sortedToUnsortedIndex;
1405  sortedToSortedIndexNoGhost = clg.sortedToSortedIndexNoGhost;
1406  unsortedToSortedIndex = clg.unsortedToSortedIndex;
1407 
1408  unitCellP2 = clg.unitCellP2;
1409  numCellDim = clg.numCellDim;
1410  cellPadDim = clg.cellPadDim;
1411  ghostMarker = clg.ghostMarker;
1412  nDecRefRedec = clg.nDecRefRedec;
1413  opt = clg.opt;
1414 
1415  boxNeighborNumber = clg.boxNeighborNumber;
1416 
1417  return *this;
1418  }
1419 
1420  CellList_gpu<dim,T,Memory,transform_type> &
1421  operator=(CellList_gpu<dim,T,Memory,transform_type> && clg)
1422  {
1423  static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this)->swap(*static_cast<CellDecomposer_sm<dim,T,transform_type> *>(&clg));
1424  cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
1425  cellIndex.swap(clg.cellIndex);
1426  vecSparseCellIndex_PartIndex.swap(clg.vecSparseCellIndex_PartIndex);
1427  nonEmptyNeighborCellCount.swap(clg.nonEmptyNeighborCellCount);
1428  neighborPartIndexFrom_To.swap(clg.neighborPartIndexFrom_To);
1429  boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
1430  sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
1431  sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
1432  unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);
1433 
1434  unitCellP2 = clg.unitCellP2;
1435  numCellDim = clg.numCellDim;
1436  cellPadDim = clg.cellPadDim;
1437  ghostMarker = clg.ghostMarker;
1438  nDecRefRedec = clg.nDecRefRedec;
1439  opt = clg.opt;
1440 
1441  boxNeighborNumber = clg.boxNeighborNumber;
1442 
1443  return *this;
1444  }
1445 
1461  template<typename vector, typename vector_prp, unsigned int ... prp>
1462  void restoreOrder(
1463  vector & vPosReordered,
1464  vector_prp & vPrpReordered,
1465  vector & vPos,
1466  vector_prp & vPrp,
1467  size_t start = 0,
1468  size_t stop = -1)
1469  {
1470  #ifdef __NVCC__
1471  if (stop == (size_t)-1) stop = vPosReordered.size();
1472 
1473  if (opt & CL_GPU_RESTORE_POSITION) {
1474  CUDA_LAUNCH((reorderParticlesPos),
1475  vPosReordered.getGPUIteratorTo(stop-start,64),
1476  vPosReordered.toKernel(),
1477  vPos.toKernel(),
1478  sortedToUnsortedIndex.toKernel(),
1479  start
1480  );
1481  }
1482 
1483  if (opt & CL_GPU_RESTORE_PROPERTY && sizeof...(prp)) {
1484  CUDA_LAUNCH(
1485  (reorderParticlesPrp<
1486  decltype(vPrpReordered.toKernel()),
1487  decltype(sortedToUnsortedIndex.toKernel()),
1488  prp...>),
1489  vPrpReordered.getGPUIteratorTo(stop-start,64),
1490  vPrpReordered.toKernel(),
1491  vPrp.toKernel(),
1492  sortedToUnsortedIndex.toKernel(),
1493  start
1494  );
1495  }
1496  #endif
1497  }
1498 
1505  inline size_t getOpt() const
1506  {
1507  return opt;
1508  }
1509 
1515  void setOpt(size_t opt)
1516  {
1517  this->opt = opt;
1518  }
1519 };
1520 
// Transformation node for vector_distributed used by the toKernel_transform algorithm:
// maps a host-side cell list type T to its GPU-kernel counterpart
template<template <typename> class layout_base, typename T>
struct toKernel_transform<layout_base,T,4>
{
	using type = CellList_gpu_ker<T::dim_,
	                              typename T::stype_,
	                              typename T::ids_type_,
	                              typename T::transform_type_,
	                              T::is_sparse_::value>;
};
1531 
1532 #endif
1533 
1534 #endif /* OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ */
This class represent an N-dimensional box.
Definition: Box.hpp:60
This class implement an NxN (dense) matrix.
Definition: Matrix.hpp:33
No transformation.
vector_sparse_gpu_ker< T, Ti, layout_base > toKernel()
toKernel function transform this structure into one that can be used on GPU
void clear()
Clear all from all the elements.
size_t size()
Return how many element you have in this map.
void setGPUInsertBuffer(int nblock, int nslot)
set the gpu insert buffer for every block
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:204
size_t size()
Stub size.
Definition: map_vector.hpp:212