OpenFPM  5.2.0
Project that contains the implementation of distributed structures
SparseGridGpu.hpp
1 //
2 // Created by tommaso on 6/06/19.
3 //
4 
5 #ifndef OPENFPM_PDATA_SPARSEGRIDGPU_HPP
6 #define OPENFPM_PDATA_SPARSEGRIDGPU_HPP
7 
8 constexpr int BLOCK_SIZE_STENCIL = 128;
9 
10 #include "config.h"
11 #include "util/cuda_util.hpp"
12 #include <cstdlib>
13 #include <SparseGridGpu/BlockMapGpu.hpp>
14 #include <Grid/iterators/grid_skin_iterator.hpp>
15 #include <Grid/Geometry/grid_smb.hpp>
16 #include "SparseGridGpu_ker.cuh"
17 #include "SparseGridGpu_kernels.cuh"
18 #include "Iterators/SparseGridGpu_iterator_sub.hpp"
19 #include "Grid/Geometry/grid_zmb.hpp"
20 #include "util/stat/common_statistics.hpp"
21 #include "Iterators/SparseGridGpu_iterator.hpp"
22 #include "Space/Shape/Box.hpp"
23 
24 #if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
25 #include "VTKWriter/VTKWriter.hpp"
26 #endif
27 
28 constexpr int NO_ITERATOR_INIT = 0;
29 
30 // todo: Move all the following utils into some proper file inside TemplateUtils
31 
32 enum tag_boundaries
33 {
34  NO_CALCULATE_EXISTING_POINTS,
35  CALCULATE_EXISTING_POINTS
36 };
37 
38 template<unsigned int dim>
39 struct default_edge
40 {
41  typedef boost::mpl::int_<2> type;
42 };
43 
44 
45 template<>
46 struct default_edge<1>
47 {
48  typedef boost::mpl::int_<256> type;
49  typedef boost::mpl::int_<256> tb;
50 };
51 
52 template<>
53 struct default_edge<2>
54 {
55  typedef boost::mpl::int_<16> type;
56  typedef boost::mpl::int_<256> tb;
57 };
58 
59 template<>
60 struct default_edge<3>
61 {
62  typedef boost::mpl::int_<8> type;
63  typedef boost::mpl::int_<512> tb;
64 };
65 
66 template<typename T>
68 {
69  typedef T type;
70 };
71 
72 template<typename T, unsigned int dim, unsigned int blockEdgeSize>
73 struct process_data_block
74 {
76 };
77 
78 template<typename T, unsigned int dim, unsigned int blockEdgeSize, unsigned int N1>
79 struct process_data_block<T[N1],dim,blockEdgeSize>
80 {
82 };
83 
84 template<unsigned int dim, unsigned int blockEdgeSize, typename ... aggr_list>
85 struct aggregate_transform_datablock_impl
86 {
88 };
89 
90 template<unsigned int dim, unsigned int blockEdgeSize, typename aggr>
91 struct aggregate_convert
92 {
93 };
94 
95 template<unsigned int dim, unsigned int blockEdgeSize, typename ... types>
96 struct aggregate_convert<dim,blockEdgeSize,aggregate<types ...>>
97 {
98  typedef typename aggregate_transform_datablock_impl<dim,blockEdgeSize,types ...>::type type;
99 };
100 
101 template<typename aggr>
102 struct aggregate_add
103 {
104 };
105 
106 template<typename ... types>
107 struct aggregate_add<aggregate<types ...>>
108 {
109  typedef aggregate<types ..., unsigned char> type;
110 };
111 
113 
114 template<typename enc_type>
115 class encap_data_block
116 {
117  int offset;
118  enc_type enc;
119 
120  public:
121 
122  encap_data_block(int offset,const enc_type & enc)
123  :offset(offset),enc(enc)
124  {}
125 
126  encap_data_block operator=(const encap_data_block<enc_type> & enc)
127  {
129 
130  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,enc_type::T_type::max_prop> >(cp);
131 
132  return *this;
133  }
134 
135  template<unsigned int p>
136  auto get() -> decltype(enc.template get<p>()[offset])
137  {
138  return enc.template get<p>()[offset];
139  }
140 
141  template<unsigned int p>
142  auto get() const -> decltype(enc.template get<p>()[offset])
143  {
144  return enc.template get<p>()[offset];
145  }
146 };
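/* Usage sketch (the names grid and key below are illustrative): get_o() further down in this
 * file returns an encap_data_block, which couples a whole data chunk with the offset of one
 * point inside it, so that single point can be accessed property by property.
 *
 * \code
 * auto pnt = grid.get_o(key);               // one point of a chunk
 * float v = pnt.template get<0>();          // property 0 at that point's offset
 * \endcode
 */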
147 
149 
150 enum StencilMode
151 {
152  STENCIL_MODE_INPLACE = 1,
153  STENCIL_MODE_INPLACE_NO_SHARED = 3
154 };
155 
160 template<typename SGridGpu, unsigned int prp, unsigned int stencil_size>
162 {
164  stencil_size,
166  SGridGpu::device_grid_type::dims> type;
167 };
168 
169 #include "encap_num.hpp"
170 
175 template<typename SGridGpu>
177 {
179 };
180 
185 template<unsigned int dim>
186 struct NNfull_is_padding_impl
187 {
188  __device__ inline static bool is_padding()
189  {
190  printf("NNfull_is_padding_impl with dim: %d not implemented yet \n",dim);
191 
192  return false;
193  }
194 };
195 
200 template<>
201 struct NNfull_is_padding_impl<3>
202 {
203  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
204  __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
205  {
206  bool isPadding_ = false;
207  for (int i = 0 ; i < 3 ; i++)
208  {
209  for (int j = 0 ; j < 3 ; j++)
210  {
211  for (int k = 0 ; k < 3 ; k++)
212  {
213  grid_key_dx<3,int> key;
214 
215  key.set_d(0,i-1);
216  key.set_d(1,j-1);
217  key.set_d(2,k-1);
218 
219  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
220  typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
221  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
222  if (isPadding_) break;
223  }
224  }
225  }
226  return isPadding_;
227  }
228 };
229 
230 
235 template<>
236 struct NNfull_is_padding_impl<2>
237 {
238  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
239  __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
240  {
241  bool isPadding_ = false;
242  for (int i = 0 ; i < 3 ; i++)
243  {
244  for (int j = 0 ; j < 3 ; j++)
245  {
246  grid_key_dx<2,int> key;
247 
248  key.set_d(0,i-1);
249  key.set_d(1,j-1);
250 
251  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
252  typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
253  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
254  if (isPadding_) break;
255  }
256  }
257  return isPadding_;
258  }
259 };
260 
261 template<unsigned int dim>
262 struct NNFull
263 {
264  static const int nNN = IntPow<3, dim>::value;
265 
266  template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
267  __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
268  blockMap_type & blockMap,
269  SparseGrid_type & sparseGrid,
270  const unsigned int offset)
271  {
272  // point to the background element
273  int neighbourPos = blockMap.size();
274  if (offset < nNN && offset != nNN / 2)
275  {
276  int cnt = offset;
277  for (int i = 0 ; i < dim ; i++)
278  {
279  int dPos = cnt % 3;
280  cnt /= 3;
281  blockCoord.set_d(i, blockCoord.get(i) + dPos - 1);
282  }
283 
284  neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id;
285  }
286  return neighbourPos;
287  }
288 
289  template<typename indexT, unsigned int blockEdgeSize, typename coordType>
290  __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
291  {
292  // linearize the coord
293 
294  indexT neighbourNum = 0;
295 
296  indexT accu = 1;
297  for(int i = 0 ; i < dim ; i++)
298  {
299  int c = static_cast<int>(coord.get(i)) - static_cast<int>(stencilSupportRadius);
300  if (c < 0)
301  {
302  neighbourNum += 0;
303  }
304  else if (c >= blockEdgeSize)
305  {
306  neighbourNum += 2*accu;
307  }
308  else
309  {
310  neighbourNum += accu;
311  }
312  accu *= 3;
313  }
314 
315  return neighbourNum;
316  }
317 
318 
319  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
320  __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
321  {
322  return NNfull_is_padding_impl<3>::template is_padding(sparseGrid,coord,enlargedBlock);
323  }
324 
335  template<unsigned int blockEdgeSize, typename indexT2>
336  __device__ static inline bool getNNindex_offset(grid_key_dx<dim,indexT2> & coord, unsigned int & NN_index, unsigned int & offset_nn)
337  {
338  bool out = false;
339  NN_index = 0;
340  offset_nn = 0;
341 
342  int cnt = 1;
343  int cnt_off = 1;
344  for (unsigned int i = 0 ; i < dim ; i++)
345  {
346  int p = 1 - ((int)(coord.get(i) < 0)) + ((int)(coord.get(i) >= (int)blockEdgeSize));
347 
348  NN_index += p*cnt;
349 
350  offset_nn += (coord.get(i) + (1 - p)*(int)blockEdgeSize)*cnt_off;
351 
352  cnt *= 3;
353  cnt_off *= blockEdgeSize;
354 
355  out |= (p != 1);
356  }
357 
358  return out;
359  }
360 };
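/* Worked example for getNNindex_offset with dim = 2 and blockEdgeSize = 16: the in-block
 * coordinate (-1,5) leaves the current chunk along x, so p = (0,1), NN_index = 0*1 + 1*3 = 3
 * (the (-1,0) neighbour in the 3^dim numbering) and offset_nn = 15*1 + 5*16 = 95, i.e. the
 * local point (15,5) inside that neighbour chunk; the function returns true because the
 * coordinate is not inside the current chunk.
 */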
361 
362 
363 
364 template<unsigned int nNN_, unsigned int nLoop_>
365 struct ct_par
366 {
367  static const unsigned int nNN = nNN_;
368  static const unsigned int nLoop = nLoop_;
369 };
370 
371 template<typename copy_type>
372 struct copy_prop_to_vector_block_impl
373 {
374  template<typename T, typename dst_type, typename src_type>
375  static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
376  {
377  dst.template get<T::value>() = src.template get<T::value>()[bPos];
378  }
379 };
380 
381 template<typename copy_type,unsigned int N1>
382 struct copy_prop_to_vector_block_impl<copy_type[N1]>
383 {
384  template<typename T, typename dst_type, typename src_type>
385  static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
386  {
387  for (int i = 0 ; i < N1 ; i++)
388  {
389  dst.template get<T::value>()[i] = src.template get<T::value>()[i][bPos];
390  }
391  }
392 };
393 
394 template<typename Tsrc,typename Tdst>
395 class copy_prop_to_vector_block
396 {
398  Tsrc src;
399 
401  Tdst dst;
402 
403  size_t pos;
404 
405  unsigned int bPos;
406 
407 public:
408 
409  copy_prop_to_vector_block(Tsrc src, Tdst dst,size_t pos, size_t bPos)
410  :src(src),dst(dst),pos(pos),bPos(bPos)
411  {}
412 
414  template<typename T>
415  inline void operator()(T& t) const
416  {
417  typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;
418 
419  copy_prop_to_vector_block_impl<copy_rtype>::template copy<T>(src,dst,bPos);
420 
421  //meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>()[bPos],dst.template get<T::value>());
422  }
423 
424 };
425 
426 
427 
428 template<typename AggregateT, unsigned int n_it, unsigned int ... prp>
429 struct data_ptr_fill
430 {
431  typedef typename to_boost_vmpl<prp...>::type vprp;
432 
434  void * base_ptr;
435 
436  mutable size_t tot = 0;
437 
438  size_t i;
439 
440  size_t sz = 0;
441 
442  arr_arr_ptr<n_it,sizeof...(prp)> & arrs;
443 
444 public:
445 
446  data_ptr_fill(void * base_ptr,size_t i, arr_arr_ptr<n_it,sizeof...(prp)> & arrs, size_t sz)
447  :base_ptr(base_ptr),i(i),sz(sz),arrs(arrs)
448  {}
449 
451  template<typename T>
452  inline void operator()(T& t) const
453  {
454  typedef typename boost::mpl::at<vprp,T>::type prp_cp;
455 
456  // Remove the reference from the type to copy
457  typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
458 
459  arrs.ptr[i][T::value] = (void *)((((unsigned char *)base_ptr) + tot));
460 
461  tot += sz * sizeof(pack_type);
462  }
463 
464 };
465 
466 template<typename SparseGridType>
467 struct sparse_grid_section
468 {
469  SparseGridType * grd;
470 
471  // Source box
472  Box<SparseGridType::dims,size_t> src;
473 
474  // Destination box
475  Box<SparseGridType::dims,size_t> dst;
476 
477  sparse_grid_section(SparseGridType & grd,const Box<SparseGridType::dims,size_t> & src, const Box<SparseGridType::dims,size_t> & dst)
478  :grd(&grd),src(src),dst(dst)
479  {}
480 };
481 
482 
483 template<unsigned int dim,
484  typename AggregateT,
485  unsigned int blockEdgeSize = default_edge<dim>::type::value,
486  unsigned int threadBlockSize = default_edge<dim>::tb::value,
487  typename indexT=long int,
488  template<typename> class layout_base=memory_traits_inte,
489  typename linearizer = grid_smb<dim, blockEdgeSize, indexT>>
490 class SparseGridGpu : public BlockMapGpu<
491  typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
492  threadBlockSize, indexT, layout_base>
493 {
494 public:
495 
496  static constexpr unsigned int dims = dim;
497 
498 private:
499 
500  typedef BlockMapGpu<
501  typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
502  threadBlockSize, indexT, layout_base> BMG;
503 
504  const static unsigned char PADDING_BIT = 1;
505  typedef typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type AggregateBlockT;
506  linearizer gridGeometry;
507  grid_sm<dim, int> extendedBlockGeometry;
508  grid_sm<dim, int> gridSize;
509  unsigned int stencilSupportRadius;
510  unsigned int ghostLayerSize;
511  int req_index;
512  int req_index_swp;
513  int req_index_swp_r;
514 
515  AggregateT bck;
516 
518 
519  // Queue of remove sections
521 
522  // Queue of copy sections
524 
525  CudaMemory mem;
526 
527  // pointers for unpack
528  openfpm::vector<void *> index_ptrs;
529  openfpm::vector<void *> index_ptrs_swp;
530  openfpm::vector<void *> index_ptrs_swp_r;
531  openfpm::vector<void *> scan_ptrs;
532  openfpm::vector<void *> scan_ptrs_swp;
533  openfpm::vector<void *> scan_ptrs_swp_r;
534  openfpm::vector<void *> data_ptrs;
535  openfpm::vector<void *> data_ptrs_swp;
536  openfpm::vector<void *> data_ptrs_swp_r;
537  openfpm::vector<void *> offset_ptrs;
538  openfpm::vector<void *> offset_ptrs_swp;
539  openfpm::vector<void *> offset_ptrs_swp_r;
540  openfpm::vector<void *> mask_ptrs;
541  openfpm::vector<void *> mask_ptrs_swp;
542  openfpm::vector<void *> mask_ptrs_swp_r;
543 
544  // pointers for copyRemove
545  openfpm::vector<void *> offset_ptrs_cp;
546  openfpm::vector<void *> offset_ptrs_cp_swp;
547  openfpm::vector<void *> offset_ptrs_cp_swp_r;
548  openfpm::vector<void *> scan_ptrs_cp;
549  openfpm::vector<void *> scan_ptrs_cp_swp;
550  openfpm::vector<void *> scan_ptrs_cp_swp_r;
551  openfpm::vector<void *> data_base_ptr_cp;
552  openfpm::vector<void *> data_base_ptr_cp_swp;
553  openfpm::vector<void *> data_base_ptr_cp_swp_r;
554  openfpm::vector<int> n_cnk_cp;
555  openfpm::vector<int> n_cnk_cp_swp;
556  openfpm::vector<int> n_cnk_cp_swp_r;
557  openfpm::vector<int> n_pnt_cp;
558  openfpm::vector<int> n_pnt_cp_swp;
559  openfpm::vector<int> n_pnt_cp_swp_r;
560  openfpm::vector<int> n_shifts_cp;
561  openfpm::vector<int> n_shift_cp_swp;
562  openfpm::vector<int> n_shift_cp_swp_r;
563  typedef typename aggregate_convert<dim,blockEdgeSize,aggregate<int>>::type convertAggr;
564 
565  // Map to convert blocks from misaligned chunks
566  openfpm::vector_gpu<convertAggr> convert_blk;
567  openfpm::vector_gpu<convertAggr> convert_blk_swp;
568  openfpm::vector_gpu<convertAggr> convert_blk_swp_r;
569  openfpm::vector<Box<dim,size_t>> box_cp;
570  openfpm::vector<Box<dim,size_t>> box_cp_swp;
571  openfpm::vector<Box<dim,size_t>> box_cp_swp_r;
572 
578 
583 
590 
592 
597 
600 
603 
606 
608  mutable openfpm::vector_gpu<aggregate<int>> new_map;
609  mutable openfpm::vector_gpu<aggregate<int>> new_map_swp;
610  mutable openfpm::vector_gpu<aggregate<int>> new_map_swp_r;
611 
613  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs;
614  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp;
615  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp_r;
616 
620  mutable int index_size_swp = -1;
621  mutable int index_size_swp_r = -1;
622 
625 
628 
631 
634 
637 
640 
643 
644  bool findNN = false;
645 
646  inline void swap_internal_remote()
647  {
648  n_cnk_cp_swp_r.swap(n_cnk_cp);
649  n_pnt_cp_swp_r.swap(n_pnt_cp);
650  n_shift_cp_swp_r.swap(n_shifts_cp);
651  convert_blk_swp_r.swap(convert_blk);
652  box_cp_swp_r.swap(box_cp);
653  new_map_swp_r.swap(new_map);
654  }
655 
656  inline void swap_internal_local()
657  {
658  offset_ptrs_cp_swp.swap(offset_ptrs_cp);
659  scan_ptrs_cp_swp.swap(scan_ptrs_cp);
660  data_base_ptr_cp_swp.swap(data_base_ptr_cp);
661  n_cnk_cp_swp.swap(n_cnk_cp);
662  n_pnt_cp_swp.swap(n_pnt_cp);
663  n_shift_cp_swp.swap(n_shifts_cp);
664  convert_blk_swp.swap(convert_blk);
665  box_cp_swp.swap(box_cp);
666  new_map_swp.swap(new_map);
667  }
668 
669  inline void swap_local_pack()
670  {
671  index_ptrs_swp.swap(index_ptrs);
672  scan_ptrs_swp.swap(scan_ptrs);
673  data_ptrs_swp.swap(data_ptrs);
674  offset_ptrs_swp.swap(offset_ptrs);
675  mask_ptrs_swp.swap(mask_ptrs);
676 
677  e_points_swp.swap(e_points);
678  pack_output_swp.swap(pack_output);
679  tmp_swp.swap(tmp);
680 
681  pack_subs_swp.swap(pack_subs);
682  index_size_swp = private_get_index_array().size();
683  }
684 
685  inline void swap_remote_pack()
686  {
687  index_ptrs_swp_r.swap(index_ptrs);
688  scan_ptrs_swp_r.swap(scan_ptrs);
689  data_ptrs_swp_r.swap(data_ptrs);
690  offset_ptrs_swp_r.swap(offset_ptrs);
691  mask_ptrs_swp_r.swap(mask_ptrs);
692 
693  e_points_swp_r.swap(e_points);
694  pack_output_swp_r.swap(pack_output);
695  tmp_swp_r.swap(tmp);
696 
697  pack_subs_swp_r.swap(pack_subs);
698  //req_index_swp_r = req_index;
699  index_size_swp_r = private_get_index_array().size();
700  }
701 
702 protected:
703  static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
704  typedef AggregateBlockT AggregateInternalT;
705 
706 public:
707 
709  typedef int yes_i_am_grid;
710 
711  static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
712 
713  typedef linearizer grid_info;
714 
715  typedef linearizer linearizer_type;
716 
717  template<typename Tfunc> using layout_mfunc = memory_traits_inte<Tfunc>;
718 
720 
721  typedef indexT indexT_;
722 
723  typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
724 
730  inline size_t size() const
731  {
732  return this->countExistingElements();
733  }
734 
740  template <typename stencil = no_stencil>
742  {
744  }
745 
752  {
753  return SparseGridGpu_iterator<dim,self>(std::declval<self>());
754  }
755 
756  template<typename dim3T>
757  inline static int dim3SizeToInt(dim3T d)
758  {
759  return d.x * d.y * d.z;
760  }
761 
762  inline static int dim3SizeToInt(size_t d)
763  {
764  return d;
765  }
766 
767  inline static int dim3SizeToInt(unsigned int d)
768  {
769  return d;
770  }
771 
772  template<typename ... v_reduce>
773  void flush(gpu::ofp_context_t& gpuContext, flush_type opt = FLUSH_ON_HOST)
774  {
775  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
776  ::template flush<v_reduce ...>(gpuContext, opt);
777 
778  findNN = false;
779  }
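/* Typical insert/flush cycle, as a sketch (the kernel name insert_values and the reduction
 * functor smax_ are illustrative assumptions): points inserted on the device become visible
 * only after flush(), which merges the insert buffer with the given reduction operators and
 * invalidates the cached neighbourhood information (findNN).
 *
 * \code
 * grid.setGPUInsertBuffer(nBlocks, nSlots);
 * CUDA_LAUNCH(insert_values, ite, grid.toKernel());   // calls insert<0>(...) on the device
 * grid.template flush<smax_<0>>(gpuContext, flush_type::FLUSH_ON_DEVICE);
 * \endcode
 */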
780 
781 
782 
783  void saveUnpackVariableIfNotKeepGeometry(int opt, bool is_unpack_remote)
784  {
785  if (is_unpack_remote == true)
786  {swap_internal_remote();}
787 
788  if (is_unpack_remote == false)
789  {swap_internal_local();}
790  }
791 
792  void RestoreUnpackVariableIfKeepGeometry(int opt, bool is_unpack_remote)
793  {
794  if (opt & KEEP_GEOMETRY && is_unpack_remote == true)
795  {swap_internal_remote();}
796 
797  if (opt & KEEP_GEOMETRY && is_unpack_remote == false)
798  {swap_internal_local();}
799  }
800 
801 
802  void savePackVariableIfNotKeepGeometry(int opt, bool is_pack_remote)
803  {
804  if (is_pack_remote == false)
805  {
806  swap_local_pack();
807  req_index_swp = req_index;
808  }
809 
810  if (is_pack_remote == true)
811  {
812  swap_remote_pack();
813  req_index_swp_r = req_index;
814  }
815  }
816 
817  void RestorePackVariableIfKeepGeometry(int opt, bool is_pack_remote)
818  {
819  if (opt & KEEP_GEOMETRY && is_pack_remote == false)
820  {
821  swap_local_pack();
822  req_index = req_index_swp;
823  }
824 
825  if (opt & KEEP_GEOMETRY && is_pack_remote == true)
826  {
827  swap_remote_pack();
828  req_index = req_index_swp_r;
829  }
830  }
831 
832  template<unsigned int n_it>
833  void calculatePackingPointsFromBoxes(int opt,size_t tot_pnt)
834  {
835  if (!(opt & KEEP_GEOMETRY))
836  {
837  auto & indexBuffer = private_get_index_array();
838  auto & dataBuffer = private_get_data_array();
839 
840  e_points.resize(tot_pnt);
841  pack_output.resize(tot_pnt);
842 
843  ite_gpu<1> ite;
844 
845  ite.wthr.x = indexBuffer.size();
846  ite.wthr.y = 1;
847  ite.wthr.z = 1;
848  ite.thr.x = getBlockSize();
849  ite.thr.y = 1;
850  ite.thr.z = 1;
851 
852  // Launch a kernel that counts the number of elements in each chunk
853  CUDA_LAUNCH((SparseGridGpuKernels::get_exist_points_with_boxes<dim,
854  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
855  n_it,
856  indexT>),
857  ite,
858  indexBuffer.toKernel(),
859  pack_subs.toKernel(),
860  gridGeometry,
861  dataBuffer.toKernel(),
862  pack_output.toKernel(),
863  tmp.toKernel(),
864  scan_it.toKernel(),
865  e_points.toKernel());
866  }
867  }
868 
869 private:
870 
871  void computeSizeOfGhostLayer()
872  {
873  unsigned int term1 = 1;
874  for (int i = 0; i < dim; ++i)
875  {
876  term1 *= blockEdgeSize + 2 * stencilSupportRadius;
877  }
878  unsigned int term2 = 1;
879  for (int i = 0; i < dim; ++i)
880  {
881  term2 *= blockEdgeSize;
882  }
883  ghostLayerSize = term1 - term2;
884  }
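/* Worked example: the ghost layer is the shell of the enlarged block that is not covered by the
 * block itself, ghostLayerSize = (blockEdgeSize + 2*stencilSupportRadius)^dim - blockEdgeSize^dim.
 * For dim = 3, blockEdgeSize = 8 and stencilSupportRadius = 1 this gives 10^3 - 8^3 = 488 entries.
 */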
885 
886  void allocateGhostLayerMapping()
887  {
888  ghostLayerToThreadsMapping.resize(ghostLayerSize);
889  }
890 
891  template<typename stencil_type>
892  void computeGhostLayerMapping()
893  {
894  size_t dimensions[dim],
895  origin[dim],
896  innerDomainBegin[dim], innerDomainEnd[dim],
897  outerBoxBegin[dim], outerBoxEnd[dim],
898  bc[dim];
899  for (int i = 0; i < dim; ++i)
900  {
901  dimensions[i] = blockEdgeSize + 2 * stencilSupportRadius;
902  origin[i] = 0;
903  innerDomainBegin[i] = stencilSupportRadius - 1;
904  innerDomainEnd[i] = dimensions[i] - stencilSupportRadius;
905  outerBoxBegin[i] = origin[i];
906  outerBoxEnd[i] = dimensions[i];
907  bc[i] = NON_PERIODIC;
908  }
909  grid_sm<dim, void> enlargedGrid;
910  enlargedGrid.setDimensions(dimensions);
911  Box<dim, size_t> outerBox(outerBoxBegin, outerBoxEnd);
912  Box<dim, size_t> innerBox(innerDomainBegin, innerDomainEnd);
913 
914  grid_skin_iterator_bc<dim> gsi(enlargedGrid, innerBox, outerBox, bc);
915 
916  unsigned int i = 0;
917  while (gsi.isNext())
918  {
919  auto coord = gsi.get();
920  assert(i < ghostLayerSize);
921  mem_id linId = enlargedGrid.LinId(coord);
922  // Mapping
923  ghostLayerToThreadsMapping.template get<gt>(i) = linId;
924  // Now compute the neighbour position to use
925  ghostLayerToThreadsMapping.template get<nt>(i) = stencil_type::template getNNskin<indexT,blockEdgeSize>(coord,stencilSupportRadius);
926  //
927  ++i;
928  ++gsi;
929  }
930  assert(i == ghostLayerSize);
931 
932  ghostLayerToThreadsMapping.template hostToDevice<gt,nt>();
933  }
934 
935  void initialize(const size_t (& res)[dim])
936  {
937  gridGeometry = linearizer(res);
938 
939  computeSizeOfGhostLayer();
940  allocateGhostLayerMapping();
941  computeGhostLayerMapping<NNStar<dim>>();
942 
943  size_t extBlockDims[dim];
944  for (int d=0; d<dim; ++d)
945  {
946  extBlockDims[d] = blockEdgeSize + 2*stencilSupportRadius;
947  }
948  extendedBlockGeometry.setDimensions(extBlockDims);
949 
950  gridSize.setDimensions(res);
951  }
952 
953 
954  template <typename stencil, typename... Args>
955  void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode,Args... args)
956  {
957  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
958  auto & indexBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
959  auto & dataBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
960 
961  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
962  unsigned int numScalars = indexBuffer_.size() * dataChunkSize;
963 
964  if (numScalars == 0) return;
965 
966  // NOTE: Here we want to work only on one data chunk per block!
967  constexpr unsigned int chunksPerBlock = 1;
968  const unsigned int localThreadBlockSize = dataChunkSize * chunksPerBlock;
969  const unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
970  ? numScalars / localThreadBlockSize
971  : 1 + numScalars / localThreadBlockSize;
972 
973  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * chunksPerBlock)>::value; // todo: This works only for stencilSupportSize==1
974 
975 #ifdef CUDIFY_USE_CUDA
976 
977 
978  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
979  <dim,
980  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
981  stencil>),
982  threadGridSize, localThreadBlockSize,
983  box,
984  indexBuffer_.toKernel(),
985  dataBuffer_.toKernel(),
986  this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
987  args...);
988 
989 #else
990 
991  auto bx = box;
992  auto indexBuffer = indexBuffer_.toKernel();
993  auto dataBuffer = dataBuffer_.toKernel();
994  auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();
995 
997 
998  auto lamb = [=] __device__ () mutable
999  {
1000  constexpr unsigned int pIndex = 0;
1001 
1002  typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
1003  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1004 
1005  typedef typename decltype(dataBuffer)::value_type AggregateT_;
1006  typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
1007  typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
1008  constexpr unsigned int blockSize = MaskBlockT::size;
1009 
1010  // NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
1011  // since we will be loading also neighbouring elements!) (beware curse of dimensionality...)
1012  const unsigned int dataBlockPos = blockIdx.x;
1013  const unsigned int offset = threadIdx.x;
1014 
1015  if (dataBlockPos >= indexBuffer.size())
1016  {
1017  return;
1018  }
1019 
1020  auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
1021 
1022  // todo: Add management of RED-BLACK stencil application! :)
1023  const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1024  grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1025 
1026  unsigned char curMask;
1027 
1028  if (offset < blockSize)
1029  {
1030  // Read local mask to register
1031  curMask = dataBlockLoad.template get<pMask>()[offset];
1032  for (int i = 0 ; i < dim ; i++)
1033  {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
1034  }
1035 
1036  openfpm::sparse_index<unsigned int> sdataBlockPos;
1037  sdataBlockPos.id = dataBlockPos;
1038 
1039  stencil::stencil(
1040  sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1041  curMask, args...);
1042  };
1043 
1044  CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize,lamb);
1045 
1046 #endif
1047 
1048  }
1049 
1050 
1051  template <typename stencil, typename... Args>
1052  void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
1053  {
1054  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
1055  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
1056  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1057 
1058  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
1059  unsigned int numScalars = indexBuffer.size() * dataChunkSize;
1060 
1061  if (numScalars == 0) return;
1062 
1063  auto ite = e_points.getGPUIterator(BLOCK_SIZE_STENCIL);
1064 
1065  CUDA_LAUNCH((SparseGridGpuKernels::applyStencilInPlaceNoShared
1066  <dim,
1067  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1068  stencil>),
1069  ite,
1070  box,
1071  indexBuffer.toKernel(),
1072  dataBuffer.toKernel(),
1073  this->template toKernelNN<stencil::stencil_type::nNN, 0>(),
1074  args...);
1075  }
1076 
1077  template<typename ids_type>
1078  void fill_chunks_boxes(openfpm::vector<Box<dim,double>> & chunks_box, ids_type & chunk_ids, Point<dim,double> & spacing, Point<dim,double> & offset)
1079  {
1080  for (int i = 0 ; i < chunk_ids.size() ; i++)
1081  {
1082  Box<dim,double> box;
1083 
1084  auto c_pos = gridGeometry.InvLinId(chunk_ids.template get<0>(i)*blockSize);
1085 
1086  for (int j = 0 ; j < dim ; j++)
1087  {
1088  box.setLow(j,c_pos.get(j) * spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1089  box.setHigh(j,(c_pos.get(j) + blockEdgeSize)*spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1090  }
1091 
1092  chunks_box.add(box);
1093  }
1094  }
1095 
1096  template<typename MemType, unsigned int ... prp>
1097  void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, gpu::ofp_context_t& gpuContext, int opt)
1098  {
1099  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1100  {
1101  // Convert the packed chunk ids
1102 
1103  prAlloc_prp->reset();
1104  Unpack_stat ups;
1105 
1106  for (size_t i = 0 ; i < copySect.size() ; i++)
1107  {
1108  auto sub_it = this->getIterator(copySect.get(i).dst.getKP1(),copySect.get(i).dst.getKP2(),NO_ITERATOR_INIT);
1109 
1110  copySect.get(i).grd->template addAndConvertPackedChunkToTmp<prp ...>(*prAlloc_prp,sub_it,ups,gpuContext);
1111  }
1112  }
1113  }
1114 
1115 
1116  template<unsigned int ... prp>
1117  void removeCopyToFinalize_phase1(gpu::ofp_context_t& gpuContext, int opt)
1118  {
1119  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1120  {removePoints(gpuContext);}
1121  }
1122 
1123  template<unsigned int ... prp>
1124  void removeCopyToFinalize_phase2(gpu::ofp_context_t& gpuContext, int opt)
1125  {
1126  // Pack information
1127  Pack_stat sts;
1128 
1129  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1130  {
1131  this->packReset();
1132 
1133  size_t req = 0;
1134  // First we count the points to copy (as source)
1135 
1136  for (size_t i = 0 ; i < copySect.size() ; i++)
1137  {
1138  auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1139 
1140  this->packRequest(sub_it,req);
1141  }
1142 
1143  this->template packCalculate<prp...>(req,gpuContext);
1144 
1145  mem.resize(req);
1146 
1147  // Create an object of preallocated memory for properties
1148  prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1149  prAlloc_prp->incRef();
1150 
1151  for (size_t i = 0 ; i < copySect.size() ; i++)
1152  {
1153  auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1154 
1155  this->pack<prp ...>(*prAlloc_prp,sub_it,sts);
1156  }
1157  }
1158  else
1159  {
1160  size_t req = mem.size();
1161 
1162  // Create an object of preallocated memory for properties
1163  prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1164  prAlloc_prp->incRef();
1165  }
1166 
1167  this->template packFinalize<prp ...>(*prAlloc_prp,sts,opt,false);
1168 
1169  preUnpack<CudaMemory,prp ...>(prAlloc_prp,gpuContext,opt);
1170 
1171  prAlloc_prp->decRef();
1172  delete prAlloc_prp;
1173  }
1174 
1175  template<unsigned int ... prp>
1176  void removeCopyToFinalize_phase3(gpu::ofp_context_t& gpuContext, int opt, bool is_unpack_remote)
1177  {
1178  ite_gpu<1> ite;
1179 
1180  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1181  {
1182  if (tmp2.size() == 0)
1183  {return;}
1184 
1185  // Fill the add buffer from tmp and then flush
1186 
1187  setGPUInsertBuffer(tmp2.size(),1ul);
1188 
1189  auto & add_buff = this->blockMap.private_get_vct_add_index();
1190  add_buff.swap(tmp2);
1191 
1192  auto & nadd_buff = this->blockMap.private_get_vct_nadd_index();
1193  ite = nadd_buff.getGPUIterator();
1194  CUDA_LAUNCH(SparseGridGpuKernels::set_one,ite,nadd_buff.toKernel());
1195 
1196  int sz_b = this->private_get_index_array().size();
1197 
1198  this->template flush<sLeft_<prp>...>(gpuContext,flush_type::FLUSH_ON_DEVICE);
1199 
1200  // get the map of the new inserted elements
1201 
1202  auto & m_map = this->getMergeIndexMapVector();
1203  auto & a_map = this->getMappingVector();
1204  auto & o_map = this->getSegmentToOutMap();
1205  auto & segments_data = this->getSegmentToMergeIndexMap();
1206 
1207  new_map.resize(a_map.size(),0);
1208 
1209  // construct new to old map
1210 
1211  ite = segments_data.getGPUIterator();
1212 
1213  if (ite.nblocks() != 0)
1214  CUDA_LAUNCH(SparseGridGpuKernels::construct_new_chunk_map<1>,ite,new_map.toKernel(),a_map.toKernel(),m_map.toKernel(),o_map.toKernel(),segments_data.toKernel(),sz_b);
1215 
1216  convert_blk.template hostToDevice<0>();
1217  }
1218  else
1219  {
1220  ite.wthr.x = 1;
1221  ite.wthr.y = 1;
1222  ite.wthr.z = 1;
1223 
1224  ite.thr.x = 1;
1225  ite.thr.y = 1;
1226  ite.thr.z = 1;
1227  }
1228 
1229  // Restore
1230  RestoreUnpackVariableIfKeepGeometry(opt,is_unpack_remote);
1231 
1232  // for each packed chunk
1233 
1234  size_t n_accu_cnk = 0;
1235  for (size_t i = 0 ; i < n_cnk_cp.size() ; i++)
1236  {
1237  arr_arr_ptr<1,sizeof...(prp)> data;
1238  size_t n_pnt = n_pnt_cp.get(i);
1239 
1240  void * data_base_ptr = data_base_ptr_cp.get(i);
1241  data_ptr_fill<AggregateT,1,prp...> dpf(data_base_ptr,0,data,n_pnt);
1242  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1243 
1244  ite.wthr.x = n_cnk_cp.get(i);
1245  ite.wthr.y = 1;
1246  ite.wthr.z = 1;
1247 
1248  // calculate best number of threads
1249  Box<dim,size_t> ub = box_cp.get(i);
1250 
1251  ite.thr.x = 1;
1252  for (int j = 0 ; j < dim ; j++)
1253  {
1254  size_t l = ub.getHigh(j) - ub.getLow(j) + 1;
1255 
1256  if (l >= blockEdgeSize)
1257  {ite.thr.x *= blockEdgeSize;}
1258  else
1259  {ite.thr.x *= l;}
1260  }
1261  ite.thr.y = 1;
1262  ite.thr.z = 1;
1263 
1264  // copy to new (1 block for each packed chunk)
1265  if (ite.nblocks() != 0 && ite.thr.x != 0)
1266  {
1267  auto & chunks = private_get_data_array();
1268 
1269  CUDA_LAUNCH((SparseGridGpuKernels::copy_packed_data_to_chunks<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1270  AggregateT,decltype(convert_blk.toKernel()),decltype(new_map.toKernel()),
1271  decltype(data),decltype(chunks.toKernel()),prp... >),ite,
1272  (unsigned int *)scan_ptrs_cp.get(i),
1273  (unsigned short int *)offset_ptrs_cp.get(i),
1274  convert_blk.toKernel(),
1275  new_map.toKernel(),
1276  data,
1277  chunks.toKernel(),
1278  (unsigned int)n_cnk_cp.get(i),
1279  (unsigned int)n_shifts_cp.get(i),
1280  (unsigned int)n_pnt_cp.get(i),
1281  (unsigned int)i,
1282  (unsigned int)n_accu_cnk);
1283  }
1284 
1285  n_accu_cnk += n_cnk_cp.get(i)*n_shifts_cp.get(i);
1286  }
1287 
1288  // Save
1289  saveUnpackVariableIfNotKeepGeometry(opt,is_unpack_remote);
1290  }
1291 
1292  template<unsigned int n_it, unsigned int ... prp>
1293  void pack_sg_implement(ExtPreAlloc<CudaMemory> & mem,
1294  Pack_stat & sts,
1295  int opt,
1296  bool is_pack_remote)
1297  {
1298  arr_ptr<n_it> index_ptr;
1299  arr_arr_ptr<n_it,sizeof...(prp)> data_ptr;
1300  arr_ptr<n_it> scan_ptr;
1301  arr_ptr<n_it> offset_ptr;
1302  arr_ptr<n_it> mask_ptr;
1304 
1305  auto & indexBuffer = private_get_index_array();
1306  auto & dataBuffer = private_get_data_array();
1307 
1308  if (req_index != pack_subs.size())
1309  {std::cerr << __FILE__ << ":" << __LINE__ << " error: the number of packing requests differs from the number of packed objects " << req_index << " " << pack_subs.size() << std::endl;}
1310 
1311  size_t tot_pnt = 0;
1312  size_t tot_cnk = 0;
1313 
1314  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1315  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1316 
1317  // Calculate total points
1318 
1319  for (size_t i = 0 ; i < pack_subs.size() ; i++)
1320  {
1321  size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
1322  sar.sa[i] = n_pnt;
1323  tot_pnt += n_pnt;
1324  }
1325 
1326  // CUDA requires aligned access: here we assume 8-byte alignment and ensure
1327  // 8-byte alignment after the cycle
1328  for (size_t i = 0 ; i < pack_subs.size() ; i++)
1329  {
1330  size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
1331 
1332  // fill index_ptr data_ptr scan_ptr
1333  index_ptr.ptr[i] = index_ptrs.get(i);
1334  scan_ptr.ptr[i] = scan_ptrs.get(i);
1335 
1336  // for all properties fill the data pointer
1337 
1338  data_ptr_fill<AggregateT,n_it,prp...> dpf(data_ptrs.get(i),i,data_ptr,tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1));
1339  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1340 
1341  offset_ptr.ptr[i] = offset_ptrs.get(i);
1342  mask_ptr.ptr[i] = mask_ptrs.get(i);
1343 
1344  tot_cnk += n_cnk;
1345  }
1346 
1347  ite_gpu<1> ite;
1348 
1349  if (tot_pnt != 0)
1350  {
1351  calculatePackingPointsFromBoxes<n_it>(opt,tot_pnt);
1352 
1353  ite = e_points.getGPUIterator();
1354 
1355  // Here we copy the array of property pointers into a CudaMemory array
1356 
1357  CudaMemory mem;
1358  mem.allocate(sizeof(data_ptr));
1359 
1360  // copy
1361  arr_arr_ptr<n_it,sizeof...(prp)> * arr_data = (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getPointer();
1362 
1363  for(int i = 0 ; i < n_it ; i++)
1364  {
1365  for (int j = 0 ; j < sizeof...(prp) ; j++)
1366  {
1367  arr_data->ptr[i][j] = data_ptr.ptr[i][j];
1368  }
1369  }
1370 
1371  mem.hostToDevice();
1372 
1373  CUDA_LAUNCH((SparseGridGpuKernels::pack_data<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1374  AggregateT,
1375  n_it,
1376  sizeof...(prp),
1377  indexT,
1378  decltype(e_points.toKernel()),
1379  decltype(pack_output.toKernel()),
1380  decltype(indexBuffer.toKernel()),
1381  decltype(dataBuffer.toKernel()),
1382  decltype(tmp.toKernel()),
1383  self::blockSize,
1384  prp ...>),
1385  ite,
1386  e_points.toKernel(),
1387  dataBuffer.toKernel(),
1388  indexBuffer.toKernel(),
1389  tmp.toKernel(),
1390  pack_output.toKernel(),
1391  index_ptr,
1392  scan_ptr,
1393  (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getDevicePointer(),
1394  offset_ptr,
1395  mask_ptr,
1396  sar);
1397  }
1398 
1399  ite.wthr.x = 1;
1400  ite.wthr.y = 1;
1401  ite.wthr.z = 1;
1402  ite.thr.x = pack_subs.size();
1403  ite.thr.y = 1;
1404  ite.thr.z = 1;
1405 
1406  if (pack_subs.size() != 0)
1407  {CUDA_LAUNCH(SparseGridGpuKernels::last_scan_point,ite,scan_ptr,tmp.toKernel(),(unsigned int)indexBuffer.size()+1,(unsigned int)pack_subs.size());}
1408  }
1409 
1410 
1420  template<unsigned int ... prp, typename S2>
1421  void addAndConvertPackedChunkToTmp(ExtPreAlloc<S2> & mem,
1422  SparseGridGpu_iterator_sub<dim,self> & sub_it,
1423  Unpack_stat & ps,
1424  gpu::ofp_context_t& gpuContext)
1425  {
1426  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1427  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1428 
1429  // First get the number of chunks
1430 
1431  size_t n_cnk;
1432 
1433  // Unpack the number of chunks
1434  mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
1435  Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
1436 
1437  grid_key_dx<dim,int> origPack_pnt;
1438  grid_key_dx<dim,int> origPack_cnk;
1439  size_t sz[dim];
1440 
1441  // Unpack origin of the chunk indexing
1442  for (int i = 0 ; i < dim ; i++)
1443  {
1444  int tmp;
1445  Unpacker<int,S2>::unpack(mem,tmp,ps);
1446  origPack_cnk.set_d(i,((int)(tmp / blockEdgeSize))*blockEdgeSize);
1447  origPack_pnt.set_d(i,tmp);
1448  }
1449 
1450  for (int i = 0 ; i < dim ; i++)
1451  {
1452  int tmp;
1453  Unpacker<int,S2>::unpack(mem,tmp,ps);
1454  sz[i] = tmp;
1455  }
1456 
1457  size_t actual_offset = n_cnk*sizeof(indexT);
1458  // get the id pointers
1459  indexT * ids = (indexT *)((unsigned char *)mem.getDevicePointer() + ps.getOffset());
1460  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
1461 
1462  mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
1463  ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
1464 
1465 
1466 
1467  // Unpack number of points
1468  // calculate the number of total points
1469  size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
1470  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
1471 
1472  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
1473 
1474  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
1475 
1476  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
1477 
1478  offset_ptrs_cp.add(offsets);
1479  scan_ptrs_cp.add(scan);
1480  n_cnk_cp.add(n_cnk);
1481  n_pnt_cp.add(n_pnt);
1482  data_base_ptr_cp.add(data_base_ptr);
1483 
1484  Box<dim,size_t> bx;
1485 
1486  for (int i = 0 ; i < dim ; i++)
1487  {
1488  bx.setLow(i,sub_it.getStart().get(i));
1489  bx.setHigh(i,sub_it.getStop().get(i));
1490  }
1491 
1492  box_cp.add(bx);
1493 
1494  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
1495 
1496  if (n_cnk != 0)
1497  {
1498  shifts.clear();
1499 
1500  int n_shift = 1;
1501  shifts.add();
1502 
1503  for (int i = 0 ; i < dim ; i++)
1504  {shifts.last().template get<0>()[i] = 0;}
1505 
1506  for (int i = 0 ; i < dim ; i++)
1507  {
1508  int op_q = origPack_pnt.get(i) % blockEdgeSize;
1509  int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1510  int quot = abs(ou_q - op_q) % blockEdgeSize;
1511  int squot = openfpm::math::sgn(ou_q - op_q);
1512  if (quot != 0)
1513  {
1514  n_shift *= 2;
1515 
1516  int sz = shifts.size();
1517  for (int j = 0 ; j < sz ; j++)
1518  {
1519  shifts.add();
1520  for (int k = 0 ; k < dim ; k++)
1521  {
1522  shifts.last().template get<0>()[k] = shifts.template get<0>(j)[k] + ((i == k)?squot:0);
1523  }
1524  }
1525  }
1526  }
1527 
1528  shifts.template hostToDevice<0>();
1529 
1530  linearizer gridGeoPack(sz);
1531 
1532  int bs = 0;
1533  size_t sz[1] = {n_cnk};
1534  grid_sm<1,void> g(sz);
1535  auto ite = g.getGPUIterator();
1536 
1537  grid_key_dx<dim,int> sz_g;
1538  grid_key_dx<dim,int> origUnpack_cnk;
1539 
1540  for (int i = 0 ; i < dim ; i++)
1541  {
1542  sz_g.set_d(i,gridGeometry.getSize()[i]);
1543  origUnpack_cnk.set_d(i,(int)(sub_it.getStart().get(i) / blockEdgeSize)*blockEdgeSize);
1544  }
1545 
1546  bs = tmp2.size();
1547  tmp2.resize(tmp2.size() + n_cnk * shifts.size());
1548 
1549  n_shifts_cp.add(shifts.size());
1550 
1551  switch (shifts.size())
1552  {
1553  case 1:
1554  // Calculate for each chunk the indexes where they should go + active points
1555  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,1,indexT>),ite,ids,
1556  (int)n_cnk,
1557  gridGeoPack,origPack_cnk,
1558  gridGeometry,origUnpack_cnk,
1559  tmp2.toKernel(),
1560  shifts.toKernel(),
1561  sz_g,
1562  bs);
1563  break;
1564  case 2:
1565  // Calculate for each chunk the indexes where they should go + active points
1566  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,2,indexT>),ite,ids,
1567  (int)n_cnk,
1568  gridGeoPack,origPack_cnk,
1569  gridGeometry,origUnpack_cnk,
1570  tmp2.toKernel(),
1571  shifts.toKernel(),
1572  sz_g,
1573  bs);
1574  break;
1575  case 4:
1576  // Calculate for each chunk the indexes where they should go + active points
1577  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,4,indexT>),ite,ids,
1578  (int)n_cnk,
1579  gridGeoPack,origPack_cnk,
1580  gridGeometry,origUnpack_cnk,
1581  tmp2.toKernel(),
1582  shifts.toKernel(),
1583  sz_g,
1584  bs);
1585  break;
1586  case 8:
1587  // Calculate for each chunk the indexes where they should go + active points
1588  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,8,indexT>),ite,ids,
1589  (int)n_cnk,
1590  gridGeoPack,origPack_cnk,
1591  gridGeometry,origUnpack_cnk,
1592  tmp2.toKernel(),
1593  shifts.toKernel(),
1594  sz_g,
1595  bs);
1596  break;
1597  }
1598 
1599  convertChunkIds(offsets,origPack_pnt,sub_it);
1600  }
1601  else
1602  {
1603  convert_blk.add();
1604  n_shifts_cp.add(0);
1605  }
1606 
1607  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
1608 
1609  ps.addOffset(actual_offset);
1610  }
1611 
1616  template<typename origPackType, typename IteratorType>
1617  void convertChunkIds(short int * offset, origPackType & origPack, IteratorType & sub_it)
1618  {
1619  int quot_diff[dim];
1620  for (int i = 0 ; i < dim ; i++)
1621  {
1622  int op_q = origPack.get(i) % blockEdgeSize;
1623  int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1624  int quot = abs(ou_q - op_q) % blockEdgeSize;
1625  quot_diff[i] = openfpm::math::sgn(ou_q - op_q)*quot;
1626  }
1627 
1628  convert_blk.add();
1629 
1630  // Create conversion block
1631 
1632  for (int j = 0 ; j < this->blockSize ; j++)
1633  {
1634  int offset = 0;
1635  int bpos = 0;
1636  int bp_c = 1;
1637  int pos = 0;
1638  int pos_c = 1;
1639 
1640  int x = j;
1641  for (int i = 0 ; i < dim ; i++)
1642  {
1643  int c = x % blockEdgeSize;
1644 
1645  if (quot_diff[i] + c < 0)
1646  {
1647  offset += pos_c*(quot_diff[i] + c + blockEdgeSize);
1648  bpos += bp_c*1;
1649  }
1650  else if (quot_diff[i] + c >= blockEdgeSize)
1651  {
1652  offset += pos_c*(quot_diff[i] + c - blockEdgeSize);
1653  bpos += bp_c*1;
1654  }
1655  else
1656  {
1657  offset += pos_c*(quot_diff[i] + c);
1658  }
1659 
1660  pos += pos_c*c;
1661  pos_c *= blockEdgeSize;
1662  bp_c *= (quot_diff[i] != 0)?2:1;
1663  x /= blockEdgeSize;
1664  }
1665 
1666  convert_blk.template get<0>(convert_blk.size()-1)[pos] = (bpos << 16) + offset;
1667  }
1668  }
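/* Sketch of the encoding produced above: for every one of the blockSize positions of a source
 * chunk, the low 16 bits of the convert_blk entry store the destination offset inside the shifted
 * chunk, while the bits from 16 upward select which of the (up to 2^dim) shifted destination
 * chunks receives the point, as derived from quot_diff.
 */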
1669 
1670 public:
1671 
1672  typedef AggregateT value_type;
1673 
1674  typedef self device_grid_type;
1675 
1676  SparseGridGpu()
1677  :stencilSupportRadius(1)
1678  {};
1679 
1685  void resize(size_t (& res)[dim])
1686  {
1687  initialize(res);
1688  }
1689 
1694  SparseGridGpu(const size_t (& res)[dim], unsigned int stencilSupportRadius = 1)
1695  :stencilSupportRadius(stencilSupportRadius)
1696  {
1697  initialize(res);
1698  };
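/* Construction sketch: the grid is defined over a 512^3 index space with the default stencil
 * support radius of 1; storage is sparse, so memory is used only for the chunks that are
 * actually inserted.
 *
 * \code
 * size_t sz[3] = {512,512,512};
 * SparseGridGpu<3, aggregate<float,float[3]>> grid(sz);
 * \endcode
 */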
1699 
1704  SparseGridGpu(linearizer & gridGeometry, unsigned int stencilSupportRadius = 1)
1705  : gridGeometry(gridGeometry),
1706  stencilSupportRadius(stencilSupportRadius)
1707  {
1708  // convert to size_t
1709  size_t sz_st[dim];
1710 
1711  for (int i = 0 ; i < dim ; i++) {sz_st[i] = gridGeometry.getSize()[i];}
1712 
1713  initialize(sz_st);
1714  };
1715 
1717  <
1718  dim,
1719  blockEdgeSize,
1720  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1721  ct_par<0,1>,
1722  indexT,
1723  layout_base,
1724  decltype(extendedBlockGeometry),
1725  linearizer,
1726  AggregateT
1727  > toKernel()
1728  {
1730  <
1731  dim,
1732  blockEdgeSize,
1733  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1734  ct_par<0,1>,
1735  indexT,
1736  layout_base,
1737  decltype(extendedBlockGeometry),
1738  linearizer,
1739  AggregateT
1740  > toKer(
1742  gridGeometry,
1743  extendedBlockGeometry,
1744  stencilSupportRadius,
1745  ghostLayerToThreadsMapping.toKernel(),
1746  nn_blocks.toKernel(),
1747  e_points.toKernel(),
1748  ghostLayerSize,
1749  bck);
1750  return toKer;
1751  }
1752 
1753  template<unsigned int nNN, unsigned int nLoop>
1755  <
1756  dim,
1757  blockEdgeSize,
1758  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1760  indexT,
1761  layout_base,
1762  decltype(extendedBlockGeometry),
1763  linearizer,
1764  AggregateT
1765  > toKernelNN()
1766  {
1768  <
1769  dim,
1770  blockEdgeSize,
1771  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1773  indexT,
1774  layout_base,
1775  decltype(extendedBlockGeometry),
1776  linearizer,
1777  AggregateT
1778  > toKer(
1780  gridGeometry,
1781  extendedBlockGeometry,
1782  stencilSupportRadius,
1783  ghostLayerToThreadsMapping.toKernel(),
1784  nn_blocks.toKernel(),
1785  e_points.toKernel(),
1786  ghostLayerSize,
1787  bck);
1788  return toKer;
1789  }
1790 
1794  void clear()
1795  {
1796  BMG::clear();
1797  }
1798 
1799  /* \brief Does nothing
1800  *
1801  */
1802  void setMemory()
1803  {}
1804 
1805  auto insertBlockFlush(size_t block) -> decltype(BMG::insertBlockFlush(block))
1806  {
1807  return BMG::insertBlockFlush(block);
1808  }
1809 
1815  linearizer & getGrid()
1816  {
1817  return gridGeometry;
1818  }
1819 
1825  template<typename stencil_type>
1826  void setNNType()
1827  {
1828  computeGhostLayerMapping<stencil_type>();
1829  }
1830 
1831 
1832  constexpr static unsigned int getBlockEdgeSize()
1833  {
1834  return blockEdgeSize;
1835  }
1836 
1837  constexpr unsigned int getBlockSize() const
1838  {
1839  return blockSize;
1840  }
1841 
1842  // Geometry
1843  template<typename CoordT>
1844  inline size_t getLinId(CoordT &coord)
1845  {
1846  return gridGeometry.LinId(coord);
1847  }
1848 
1849  inline grid_key_dx<dim, int> getCoord(size_t linId) const
1850  {
1851  return gridGeometry.InvLinId(linId);
1852  }
1853 
1854  inline ite_gpu<dim> getGridGPUIterator(const grid_key_dx<dim, int> & start, const grid_key_dx<dim, int> & stop, size_t n_thr = threadBlockSize)
1855  {
1856  return gridSize.getGPUIterator(start,stop,n_thr);
1857  }
1858 
1868  template<typename CoordT>
1870  {
1871  base_key k(*this,0,0);
1872 
1873  const auto & blockMap = this->private_get_blockMap();
1874 
1875  auto glid = gridGeometry.LinId(coord);
1876 
1877  auto bid = glid / blockSize;
1878  auto lid = glid % blockSize;
1879 
1880  auto key = blockMap.get_sparse(bid);
1881 
1882  k.set_cnk_pos_id(key.id);
1883  k.set_data_id(lid);
1884 
1885  return k;
1886  }
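/* Host-side sketch: get_sparse() resolves the chunk position and the in-chunk offset once,
 * so the returned key can be reused for several property accesses on the same point.
 *
 * \code
 * grid_key_dx<3,int> c;
 * c.set_d(0,10); c.set_d(1,10); c.set_d(2,10);
 * auto k = grid.get_sparse(c);
 * float v = grid.template get<0>(k);
 * \endcode
 */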
1887 
1897  template<unsigned int p, typename CoordT>
1898  auto get(const grid_key_dx<dim,CoordT> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1899  {
1900  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template get<p>(gridGeometry.LinId(coord));
1901  }
1902 
1912  template<unsigned int p>
1913  auto get(const sparse_grid_gpu_index<self> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1914  {
1915  return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1916  }
1917 
1923  auto private_get_data_array() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer()) &
1924  {
1925  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1926  }
1927 
1933  auto private_get_data_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer())
1934  {
1935  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1936  }
1937 
1945  template<typename CoordT>
1946  auto get_o(const grid_key_dx<dim,CoordT> & coord) const -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::get(0))>::type >
1947  {
1948  int offset;
1949  indexT lin;
1950  gridGeometry.LinId(coord,lin,offset);
1951 
1952  return encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::get(0))>::type >(offset,BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::get(lin));
1953  }
1954 
1962  auto get_o(const sparse_grid_gpu_index<self> & coord) const -> encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >
1963  {
1964  return encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >(coord.get_data_id(),private_get_data_array().get(coord.get_cnk_pos_id()));
1965  }
1966 
1972  bool isSkipLabellingPossible()
1973  {
1974  return (index_size_swp_r == private_get_index_array().size()) && (index_size_swp == private_get_index_array().size());
1975  }
1976 
1986  template<unsigned int p>
1987  auto get(const sparse_grid_gpu_index<self> & coord) -> ScalarTypeOf<AggregateBlockT, p> &
1988  {
1989  return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1990  }
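/* Host-side sketch (assumes the data of interest has already been moved to the host, e.g.
 * with a deviceToHost call): iterate the existing points and update property 0 in place.
 *
 * \code
 * auto it = grid.getIterator();
 * while (it.isNext())
 * {
 * 	auto p = it.get();
 * 	grid.template get<0>(p) += 1.0f;
 * 	++it;
 * }
 * \endcode
 */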
1991 
1999  unsigned char getFlag(const sparse_grid_gpu_index<self> & coord) const
2000  {
2001  return private_get_data_array().template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(coord.get_cnk_pos_id())[coord.get_data_id()];
2002  }
2003 
2004  template<unsigned int p, typename CoordT>
2005  auto insert(const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2006  {
2007  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template insert<p>(gridGeometry.LinId(coord));
2008  }
2009 
2010  template<typename CoordT>
2011  auto insert_o(const CoordT &coord) -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insert_o(0))>::type >
2012  {
2013  indexT ind;
2014  int offset;
2015  gridGeometry.LinId(coord,ind,offset);
2016 
2018  }
2019 
2026  void construct_link(self & grid_up, self & grid_dw, gpu::ofp_context_t& gpuContext)
2027  {
2028 /* // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2029  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2030  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2031 
2032  ite_gpu<1> ite;
2033 
2034  ite.wthr.x = indexBuffer.size();
2035  ite.wthr.y = 1;
2036  ite.wthr.z = 1;
2037 
2038  ite.thr.x = getBlockSize();
2039  ite.thr.y = 1;
2040  ite.thr.z = 1;
2041 
2042  openfpm::vector_gpu<aggregate<unsigned int>> output;
2043  output.resize(indexBuffer.size() + 1);
2044 
2045  CUDA_LAUNCH((SparseGridGpuKernels::link_construct<dim,
2046  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2047  blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel());
2048 
2049 
2050  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),gpuContext);
2051 
2052  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2053 
2054  unsigned int np_lup = output.template get<0>(output.size()-1);
2055 
2056  links_up.resize(np_lup);
2057 
2058  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert<dim,
2059  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2060  blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel(),links_up.toKernel());
2061 
2062 */
2063  }
2064 
2071  {
2072  return link_dw_scan;
2073  }
2074 
2081  {
2082  return link_dw;
2083  }
2084 
2091  {
2092  return link_up_scan;
2093  }
2094 
2101  {
2102  return link_up;
2103  }
2104 
2113  void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, gpu::ofp_context_t& gpuContext)
2114  {
2115  Box<dim,int> db = db_;
2116 
2117  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2118  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2119  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2120 
2121  // Count padding points
2122 
2123  // First we count the padding points
2124  ite_gpu<1> ite;
2125 
2126  ite.wthr.x = indexBuffer.size();
2127  ite.wthr.y = 1;
2128  ite.wthr.z = 1;
2129 
2130  ite.thr.x = getBlockSize();
2131  ite.thr.y = 1;
2132  ite.thr.z = 1;
2133 
2134  openfpm::vector_gpu<aggregate<unsigned int>> output;
2135  output.resize(indexBuffer.size()+1);
2136 
2137  output.fill<0>(0);
2138 
2139  CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2140  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2141  blockSize>),ite,this->toKernel(),output.toKernel(),db);
2142 
2143 
2144 
2145  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),gpuContext);
2146 
2147  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2148  unsigned int padding_points = output.template get<0>(output.size()-1);
2149 
2150  // get the padding points
2151 
2153  pd_points.resize(padding_points);
2154 
2155  CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2156 
2157  // Count number of link down for padding points
2158 
2159  // Calculate ghost
2160 
2161  link_dw_scan.resize(padding_points+1);
2162  link_dw_scan.fill<0>(0);
2163 
2164  ite = link_dw_scan.getGPUIterator();
2165 
2166  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_dw_count<dim,
2167  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2168  blockSize>),
2169  ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),p_dw);
2170 
2171  openfpm::scan((unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),link_dw_scan.size(),(unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),gpuContext);
2172 
2173  link_dw_scan.template deviceToHost<0>(link_dw_scan.size()-1,link_dw_scan.size()-1);
2174 
2175  size_t np_ldw = link_dw_scan.template get<0>(link_dw_scan.size()-1);
2176 
2177  link_dw.resize(np_ldw);
2178 
2179  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_dw<dim,
2180  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2181  blockSize>),ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),link_dw.toKernel(),p_dw);
2182 
2183  link_dw_scan.resize(link_dw_scan.size()-1);
2184  }
2185 
2191  void construct_link_up(self & grid_up, const Box<dim,int> & db_, Point<dim,int> p_up, gpu::ofp_context_t& gpuContext)
2192  {
2193  Box<dim,int> db = db_;
2194 
2195  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2196  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2197  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2198 
2199  // Count padding points
2200 
2201  // First we count the padding points
2202  ite_gpu<1> ite;
2203 
2204  ite.wthr.x = indexBuffer.size();
2205  ite.wthr.y = 1;
2206  ite.wthr.z = 1;
2207 
2208  ite.thr.x = getBlockSize();
2209  ite.thr.y = 1;
2210  ite.thr.z = 1;
2211 
2212  openfpm::vector_gpu<aggregate<unsigned int>> output;
2213  output.resize(indexBuffer.size()+1);
2214 
2215  output.fill<0>(0);
2216 
2217  CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2218  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2219  blockSize>),ite,this->toKernel(),output.toKernel(),db);
2220 
2221 
2222 
2223  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),gpuContext);
2224 
2225  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2226  unsigned int padding_points = output.template get<0>(output.size()-1);
2227 
2228  // get the padding points
2229 
2231  pd_points.resize(padding_points);
2232 
2233  CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2234 
2235  // Count number of link down for padding points
2236 
2237  // Calculate ghost
2238 
2239  link_up_scan.resize(padding_points+1);
2240  link_up_scan.fill<0>(0);
2241 
2242  ite = link_up_scan.getGPUIterator();
2243 
2244  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_up_count<dim,
2245  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2246  blockSize>),
2247  ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),p_up);
2248 
2249  openfpm::scan((unsigned int *)link_up_scan.template getDeviceBuffer<0>(),link_up_scan.size(),(unsigned int *)link_up_scan.template getDeviceBuffer<0>(),gpuContext);
2250 
2251  link_up_scan.template deviceToHost<0>(link_up_scan.size()-1,link_up_scan.size()-1);
2252 
2253  size_t np_lup = link_up_scan.template get<0>(link_up_scan.size()-1);
2254 
2255  link_up.resize(np_lup);
2256 
2257  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_up<dim,
2259  blockSize>),ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),link_up.toKernel(),p_up);
2260 
2261  link_up_scan.resize(link_up_scan.size()-1);
2262  }
2263 
2270  template<typename dim3T>
2271  void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
2272  {
2275  dim3SizeToInt(nBlock),
2276  dim3SizeToInt(nSlot)
2277  );
2278  }
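// Typical device-side insertion flow (a sketch following the usual OpenFPM sparse-grid examples;
// insert_values is a hypothetical user __global__ kernel and the flush functor/flags are assumptions):
//
//   SparseGridGpu<3, aggregate<float>> grid(sz);
//   gpu::ofp_context_t gpuContext;
//   grid.setGPUInsertBuffer(nBlocks, nSlots);                     // reserve the temporary insert buffer
//   CUDA_LAUNCH_DIM3(insert_values, nBlocks, nThreads, grid.toKernel());
//   grid.template flush<smax_<0>>(gpuContext, flush_type::FLUSH_ON_DEVICE);
//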
2279 
2285  void preFlush()
2286  {
2288  }
2289 
2290  template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
2291  void tagBoundaries(gpu::ofp_context_t& gpuContext, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
2292  {
2293  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2296 
2297  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
2298  unsigned int numScalars = indexBuffer.size() * dataChunkSize;
2299 
2300  if (numScalars == 0) return;
2301  if (findNN == false)
2302  {
2303  findNeighbours<stencil_type>();
2304  findNN = true;
2305  }
2306 
2307  // NOTE: Here we want to work only on one data chunk per block!
2308 
2309  unsigned int localThreadBlockSize = dataChunkSize;
2310  unsigned int threadGridSize = numScalars % dataChunkSize == 0
2311  ? numScalars / dataChunkSize
2312  : 1 + numScalars / dataChunkSize;
2313 
2314  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * 1)>::value; // todo: This works only for stencilSupportSize==1
2315 // constexpr unsigned int nLoop = IntPow<blockEdgeSize + 2, dim>::value; // todo: This works only for stencilSupportSize==1
2316 
2317  if (stencilSupportRadius == 1)
2318  {
2319  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2320  dim,
2321  1,
2323  stencil_type,
2324  checker_type>),
2325  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2326  }
2327  else if (stencilSupportRadius == 2)
2328  {
2329  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2330  dim,
2331  2,
2333  stencil_type,
2334  checker_type>),
2335  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2336  }
2337  else if (stencilSupportRadius == 0)
2338  {
2339  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2340  dim,
2341  0,
2343  stencil_type,
2344  checker_type>),
2345  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2346  }
2347  else
2348  {
2349  // todo: this should be a hard error, the call is silently ignored otherwise
2350  std::cout << __FILE__ << ":" << __LINE__ << " error: stencilSupportRadius is supported only up to 2, passed: " << stencilSupportRadius << std::endl;
2351 
2352  }
2353 
2354  if (opt == tag_boundaries::CALCULATE_EXISTING_POINTS)
2355  {
2356  // first we calculate the existing points
2358 
2359  block_points.resize(indexBuffer.size() + 1);
2360  block_points.template get<0>(block_points.size()-1) = 0;
2361  block_points.template hostToDevice<0>(block_points.size()-1,block_points.size()-1);
2362 
2363  ite_gpu<1> ite;
2364 
2365  ite.wthr.x = indexBuffer.size();
2366  ite.wthr.y = 1;
2367  ite.wthr.z = 1;
2368  ite.thr.x = getBlockSize();
2369  ite.thr.y = 1;
2370  ite.thr.z = 1;
2371 
2372  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
2373  ite,
2374  dataBuffer.toKernel(),
2375  block_points.toKernel());
2376 
2377  // then we scan
2378  openfpm::scan((indexT *)block_points.template getDeviceBuffer<0>(),block_points.size(),(indexT *)block_points.template getDeviceBuffer<0>(),gpuContext);
2379 
2380  // Get the total number of points
2381  block_points.template deviceToHost<0>(block_points.size()-1,block_points.size()-1);
2382  size_t tot = block_points.template get<0>(block_points.size()-1);
2383  e_points.resize(tot);
2384 
2385  // we fill e_points
2386  CUDA_LAUNCH((SparseGridGpuKernels::fill_e_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,
2387  dataBuffer.toKernel(),
2388  block_points.toKernel(),
2389  e_points.toKernel());
2390 
2391  }
2392 
2393  cudaDeviceSynchronize();
2394  }
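// Usage sketch (assumption, not part of the header): once the grid has been flushed, boundary tagging
// is a single call; findNeighbours() is triggered lazily by tagBoundaries() when it has not run yet.
//
//   grid.template tagBoundaries<NNStar<3>>(gpuContext);           // only mark the padding points
//   grid.template tagBoundaries<NNStar<3>>(gpuContext, No_check(),
//                     tag_boundaries::CALCULATE_EXISTING_POINTS); // additionally build e_points
//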
2395 
2396  template<typename NNtype = NNStar<dim>>
2397  void findNeighbours()
2398  {
2399  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2401 
2402  const unsigned int numBlocks = indexBuffer.size();
2403  const unsigned int numScalars = numBlocks * NNtype::nNN;
2404  nn_blocks.resize(numScalars);
2405 
2406  if (numScalars == 0) return;
2407 
2408  // NOTE: Here we want to work only on one data chunk per block!
2409 
2410  unsigned int localThreadBlockSize = NNtype::nNN;
2411 
2412  unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
2413  ? numScalars / localThreadBlockSize
2414  : 1 + numScalars / localThreadBlockSize;
2415 
2416  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::findNeighbours<dim,NNtype>),
2417  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
2418 
2419  findNN = true;
2420  }
2421 
2422  size_t countExistingElements() const
2423  {
2424  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2427 
2429  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2430  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2431  constexpr unsigned int blockSize = MaskBlockT::size;
2432  const auto bufferSize = indexBuffer.size();
2433 
2434  size_t numExistingElements = 0;
2435 
2436  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2437  {
2438  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2439  for (size_t elementId=0; elementId<blockSize; ++elementId)
2440  {
2441  const auto curMask = dataBlock.template get<pMask>()[elementId];
2442 
2443  if (this->exist(curMask))
2444  {
2445  ++numExistingElements;
2446  }
2447  }
2448  }
2449 
2450  return numExistingElements;
2451  }
2452 
2453  size_t countBoundaryElements()
2454  {
2455  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2458 
2460  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2461  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2462  constexpr unsigned int blockSize = MaskBlockT::size;
2463  const auto bufferSize = indexBuffer.size();
2464 
2465  size_t numBoundaryElements = 0;
2466 
2467  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2468  {
2469  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2470  for (size_t elementId=0; elementId<blockSize; ++elementId)
2471  {
2472  const auto curMask = dataBlock.template get<pMask>()[elementId];
2473 
2474  if (this->exist(curMask) && this->isPadding(curMask))
2475  {
2476  ++numBoundaryElements;
2477  }
2478  }
2479  }
2480 
2481  return numBoundaryElements;
2482  }
2483 
2484  // Also count mean+stdDev of occupancy of existing blocks
2485  void measureBlockOccupancyMemory(double &mean, double &deviation)
2486  {
2487  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2490 
2492  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2493  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2494  constexpr unsigned int blockSize = MaskBlockT::size;
2495  const auto bufferSize = indexBuffer.size();
2496 
2497  openfpm::vector<double> measures;
2498 
2499  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2500  {
2501  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2502  size_t numElementsInBlock = 0;
2503  for (size_t elementId=0; elementId<blockSize; ++elementId)
2504  {
2505  const auto curMask = dataBlock.template get<pMask>()[elementId];
2506 
2507  if (this->exist(curMask))
2508  {
2509  ++numElementsInBlock;
2510  }
2511  }
2512  double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2513  measures.add(blockOccupancy);
2514  }
2515 
2516  standard_deviation(measures, mean, deviation);
2517  }
2518 
2519  // Also count mean+stdDev of occupancy of existing blocks
2520  void measureBlockOccupancy(double &mean, double &deviation)
2521  {
2522  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2525 
2527  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2528  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2529  constexpr unsigned int blockSize = MaskBlockT::size;
2530  const auto bufferSize = indexBuffer.size();
2531 
2532  openfpm::vector<double> measures;
2533 
2534  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2535  {
2536  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2537  size_t numElementsInBlock = 0;
2538  for (size_t elementId=0; elementId<blockSize; ++elementId)
2539  {
2540  const auto curMask = dataBlock.template get<pMask>()[elementId];
2541 
2542  if (this->exist(curMask) && !this->isPadding(curMask))
2543  {
2544  ++numElementsInBlock;
2545  }
2546  }
2547  double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2548  measures.add(blockOccupancy);
2549  }
2550 
2551  standard_deviation(measures, mean, deviation);
2552  }
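// Both statistics are computed on the host-side buffers; a short usage sketch:
//
//   double mean = 0.0, dev = 0.0;
//   grid.measureBlockOccupancyMemory(mean, dev);   // occupancy including padding points
//   grid.measureBlockOccupancy(mean, dev);         // occupancy of real (non-padding) points only
//   std::cout << "block occupancy " << mean << " +/- " << dev << std::endl;
//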
2553 
2568  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2569  void conv_cross(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2570  {
2571  Box<dim,int> box;
2572 
2573  for (int i = 0 ; i < dim ; i++)
2574  {
2575  box.setLow(i,start.get(i));
2576  box.setHigh(i,stop.get(i));
2577  }
2578 
2579  applyStencils< SparseGridGpuKernels::stencil_cross_func<dim,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2580  }
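// Call-site sketch for conv_cross (an assumption, not from this header): the __device__ lambda's
// exact parameter list is dictated by SparseGridGpuKernels::stencil_cross_func, so only the grid-side
// calling convention is shown; property 0 is read, property 1 is written, stencil_size is 1.
//
//   grid_key_dx<3> start;
//   grid_key_dx<3> stop;
//   for (int i = 0; i < 3; i++) {start.set_d(i,1); stop.set_d(i,sz[i]-2);}
//   grid.template conv_cross<0,1,1>(start, stop,
//        [] __device__ (auto & u, auto & nbr) {return u; /* e.g. a Laplacian-style update */});
//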
2581 
2582 
2587  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2588  void conv(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2589  {
2590  Box<dim,int> box;
2591 
2592  for (int i = 0 ; i < dim ; i++)
2593  {
2594  box.setLow(i,start.get(i));
2595  box.setHigh(i,stop.get(i));
2596  }
2597 
2598  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2599 
2600  applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2601  }
2602 
2607  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2608  void conv_cross_b(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2609  {
2610  Box<dim,int> box;
2611 
2612  for (int i = 0 ; i < dim ; i++)
2613  {
2614  box.setLow(i,start.get(i));
2615  box.setHigh(i,stop.get(i));
2616  }
2617 
2618  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2619 
2620  applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2621  }
2622 
2627  template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2628  void conv2_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2629  {
2630  Box<dim,int> box;
2631 
2632  for (int i = 0 ; i < dim ; i++)
2633  {
2634  box.setLow(i,start.get(i));
2635  box.setHigh(i,stop.get(i));
2636  }
2637 
2638  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2639 
2640  applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2641  }
2642 
2647  template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_src3,
2648  unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int prop_dst3,
2649  unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2650  void conv3_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2651  {
2652  Box<dim,int> box;
2653 
2654  for (int i = 0 ; i < dim ; i++)
2655  {
2656  box.setLow(i,start.get(i));
2657  box.setHigh(i,stop.get(i));
2658  }
2659 
2660  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2661 
2662  applyStencils< SparseGridGpuKernels::stencil_func_conv3_b<dim,nLoop,prop_src1,prop_src2,prop_src3,prop_dst1,prop_dst2,prop_dst3,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2663  }
2664 
2669  template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2670  void conv2(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2671  {
2672  Box<dim,int> box;
2673 
2674  for (int i = 0 ; i < dim ; i++)
2675  {
2676  box.setLow(i,start.get(i));
2677  box.setHigh(i,stop.get(i));
2678  }
2679 
2680  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2681 
2682  applyStencils< SparseGridGpuKernels::stencil_func_conv2<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2683  }
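// conv2 reads two source properties and writes two destination properties in a single sweep over the
// grid, saving one pass compared to two separate conv calls. Sketch of the call structure only (the
// functor signature is fixed by SparseGridGpuKernels::stencil_func_conv2 and is assumed here):
//
//   grid.template conv2<0,1,2,3,1>(start, stop, two_field_functor /* __device__ lambda */);
//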
2684 
2690  Box<dim,int> getBox()
2691  {
2692  Box<dim,int> b;
2693 
2694  for (int i = 0 ; i < dim ; i++)
2695  {
2696  b.setLow(i,0);
2697  b.setHigh(i,gridGeometry.getSize()[i]);
2698  }
2699 
2700  return b;
2701  }
2702 
2703  // todo: Move implementations into a functor for compile-time choice of the stencil mode
2704  template<typename stencil, typename... Args>
2705  void applyStencils(const Box<dim,int> & box, StencilMode mode, Args... args)
2706  {
2707  if (findNN == false)
2708  {
2709  findNeighbours<typename stencil::stencil_type>();
2710  findNN = true;
2711  }
2712 
2713  // Apply the given stencil to all elements that are not boundary-tagged.
2714  // This function launches an internal __global__ kernel over all existing blocks; that kernel
2715  // checks which elements exist and are not padding, and on those it calls the user-provided
2716  // __device__ functor. The stencil mode essentially chooses how the block passed to the user
2717  // functor as storeBlock is loaded: in insert mode the block is obtained through an insert (plus
2718  // the required auxiliary calls), while in in-place mode it is read directly from the data buffer.
2719  switch (mode)
2720  {
2721  case STENCIL_MODE_INPLACE:
2722  applyStencilInPlace<stencil>(box,mode,args...);
2723  break;
2724  case STENCIL_MODE_INPLACE_NO_SHARED:
2725  applyStencilInPlaceNoShared<stencil>(box,mode,args...);
2726  break;
2727  }
2728  }
2729  template<typename stencil1, typename stencil2, typename ... otherStencils, typename... Args>
2730  void applyStencils(Box<dim,int> box, StencilMode mode, Args... args)
2731  {
2732  applyStencils<stencil1>(box,mode, args...);
2733  applyStencils<stencil2, otherStencils ...>(box,mode, args...);
2734  }
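// The variadic overload applies the listed stencils back to back on the same box and mode, e.g.
// (sketch, StencilA/StencilB are hypothetical stencil functor types):
//
//   grid.template applyStencils<StencilA, StencilB>(grid.getBox(), STENCIL_MODE_INPLACE);
//
// which is equivalent to two consecutive single-stencil calls with identical arguments.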
2735 
2736  template<typename BitMaskT>
2737  inline static bool isPadding(BitMaskT &bitMask)
2738  {
2740  ::getBit(bitMask, PADDING_BIT);
2741  }
2742 
2743  template<typename BitMaskT>
2744  inline static void setPadding(BitMaskT &bitMask)
2745  {
2747  ::setBit(bitMask, PADDING_BIT);
2748  }
2749 
2750  template<typename BitMaskT>
2751  inline static void unsetPadding(BitMaskT &bitMask)
2752  {
2754  ::unsetBit(bitMask, PADDING_BIT);
2755  }
2756 
2764  template<typename CoordT>
2765  inline size_t getBlockLinId(const CoordT & blockCoord) const
2766  {
2767  return gridGeometry.BlockLinId(blockCoord);
2768  }
2769 
2780  template<unsigned int p>
2781  auto insertFlush(const sparse_grid_gpu_index<self> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2782  {
2784 
2785  indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
2786  indexT local_id = coord.get_data_id();
2787 
2789 
2791  block_data.template get<BMG::pMask>()[local_id] = 1;
2792 
2793  return block_data.template get<p>()[local_id];
2794  }
2795 
2806  template<typename CoordT>
2807  auto insertBlockFlush(const grid_key_dx<dim,CoordT> & coord, indexT & local_id) -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insertBlockFlush(0))
2808  {
2809  auto lin = gridGeometry.LinId(coord);
2810  indexT block_id = lin / blockSize;
2811  local_id = lin % blockSize;
2812 
2814 
2816  block_data.template get<BMG::pMask>()[local_id] = 1;
2817 
2818  return block_data;
2819  }
2820 
2831  template<unsigned int p, typename CoordT>
2832  auto insertFlush(const grid_key_dx<dim,CoordT> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2833  {
2834  // Linearized block_id
2835  auto lin = gridGeometry.LinId(coord);
2836  indexT block_id = lin / blockSize;
2837  indexT local_id = lin % blockSize;
2838 
2840 
2842  block_data.template get<BMG::pMask>()[local_id] = 1;
2843 
2844  return block_data.template get<p>()[local_id];
2845  }
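// Host-side initialisation sketch (assumption): insertFlush() inserts one point and flushes it
// immediately, which is convenient on the host but much slower than the buffered device insert path.
//
//   grid_key_dx<3> key;
//   for (int i = 0; i < 3; i++) {key.set_d(i,5);}
//   grid.template insertFlush<0>(key) = 1.0f;   // set property 0 at the point (5,5,5)
//   grid.template hostToDevice<0>();            // assumed: push the data to the device before use
//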
2846 
2847  template<unsigned int p>
2848  void print_vct_add_data()
2849  {
2850  typedef BlockMapGpu<
2852  threadBlockSize, indexT, layout_base> BMG;
2853 
2854  auto & bM = BMG::blockMap.private_get_vct_add_data();
2855  auto & vI = BMG::blockMap.private_get_vct_add_index();
2856  bM.template deviceToHost<p>();
2857  vI.template deviceToHost<0>();
2858 
2859  std::cout << "vct_add_data: " << std::endl;
2860 
2861  for (size_t i = 0 ; i < bM.size() ; i++)
2862  {
2863  std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
2864  for (size_t j = 0 ; j < blockSize ; j++)
2865  {
2866  std::cout << (int)bM.template get<p>(i)[j] << " ";
2867  }
2868 
2869  std::cout << std::endl;
2870  }
2871  }
2872 
2878  template<unsigned int p>
2879  void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
2880  {
2881  meta_copy<typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>::meta_copy_(backgroundValue,bck.template get<p>());
2882 
2883  BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
2884  }
2885 
2887 
2888  //Functions to check if the packing object is complex
2889  static bool pack()
2890  {
2891  return true;
2892  }
2893 
2894  //Functions to check if the packing object is complex
2895  static bool packRequest()
2896  {
2897  return true;
2898  }
2899 
2904  template<int ... prp> inline
2905  void packRequest(size_t & req) const
2906  {
2907  // To fill
2910 
2911  indexBuffer.template packRequest<prp ...>(req);
2912  dataBuffer.template packRequest<prp ...>(req);
2913 
2914  Packer<decltype(gridGeometry),HeapMemory>::packRequest(req);
2915  }
2916 
2926  template<int ... prp> void pack(ExtPreAlloc<HeapMemory> & mem,
2927  Pack_stat & sts) const
2928  {
2931 
2932  // To fill
2933  indexBuffer.template pack<prp ...>(mem,sts);
2934  dataBuffer.template pack<prp ...>(mem,sts);
2935 
2936  Packer<decltype(gridGeometry),HeapMemory>::pack(mem,gridGeometry,sts);
2937  }
2938 
2948  template<int ... prp> void unpack(ExtPreAlloc<HeapMemory> & mem,
2949  Unpack_stat & ps)
2950  {
2953 
2954  // To fill
2955  indexBuffer.template unpack<prp ...>(mem,ps);
2956  dataBuffer.template unpack<prp ...>(mem,ps);
2957 
2958  Unpacker<decltype(gridGeometry),HeapMemory>::unpack(mem,gridGeometry,ps);
2959  }
2960 
2961 
2971  template<int ... prp> void unpack(ExtPreAlloc<CudaMemory> & mem,
2972  Unpack_stat & ps)
2973  {
2974  if (mem.size() != 0)
2975  {std::cout << __FILE__ << ":" << __LINE__ << " not implemented: " << std::endl;}
2976  }
2977 
2983  template<int ... prp> inline
2984  void packRequest(size_t & req, gpu::ofp_context_t& gpuContext) const
2985  {
2986  ite_gpu<1> ite;
2987 
2988  auto & indexBuffer = private_get_index_array();
2989  auto & dataBuffer = private_get_data_array();
2990 
2991  ite.wthr.x = indexBuffer.size();
2992  ite.wthr.y = 1;
2993  ite.wthr.z = 1;
2994  ite.thr.x = getBlockSize();
2995  ite.thr.y = 1;
2996  ite.thr.z = 1;
2997 
2998  tmp.resize(indexBuffer.size() + 1);
2999 
3000  // Launch a kernel that counts the number of elements in each chunk
3001  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
3002  ite,
3003  dataBuffer.toKernel(),
3004  tmp.toKernel());
3005 
3006  openfpm::scan((indexT *)tmp. template getDeviceBuffer<0>(),
3007  tmp.size(), (indexT *)tmp. template getDeviceBuffer<0>(), gpuContext);
3008 
3009  tmp.template deviceToHost<0>(tmp.size()-1,tmp.size()-1);
3010 
3011  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3012 
3013  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof... (prp)>>(spq);
3014 
3015  size_t n_pnt = tmp.template get<0>(tmp.size()-1);
3016 
3017 
3018  // 4 bytes per chunk of data; we use a short int to pack the offset
3019  // for each counter
3020  req = sizeof(indexT) + // bytes required to pack the number of chunks
3021  sizeof(indexT)*indexBuffer.size() + // bytes required to pack the chunk indexes
3022  sizeof(indexT)*tmp.size() + // bytes required to pack the scan of the chunk points
3023  n_pnt*(spq.point_size + sizeof(short int) + sizeof(unsigned char)); // bytes required to pack data + offset positions
3024  }
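// Worked example of the request size (illustrative numbers only): with indexT = long int (8 bytes),
// 100 chunks, 3000 existing points and a single packed property of 4 bytes (spq.point_size == 4):
//
//   req = 8                    // number of chunks
//       + 8 * 100              // chunk indexes
//       + 8 * 101              // scan of the per-chunk counters (tmp has indexBuffer.size()+1 entries)
//       + 3000 * (4 + 2 + 1)   // data + short offset + mask byte for every point
//       = 22616 bytes
//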
3025 
3040  template<int ... prp> inline
3041  void packRequest(SparseGridGpu_iterator_sub<dim,self> & sub_it,
3042  size_t & req) const
3043  {
3044  pack_subs.add();
3045 
3046  for (int i = 0 ; i < dim ; i++)
3047  {
3048  pack_subs.template get<0>(pack_subs.size()-1)[i] = sub_it.getStart().get(i);
3049  pack_subs.template get<1>(pack_subs.size()-1)[i] = sub_it.getStop().get(i);
3050  }
3051  }
3052 
3057  void packReset()
3058  {
3059  pack_subs.clear();
3060 
3061  index_ptrs.clear();
3062  scan_ptrs.clear();
3063  data_ptrs.clear();
3064  offset_ptrs.clear();
3065  mask_ptrs.clear();
3066 
3067  req_index = 0;
3068  }
3069 
3076  template<int ... prp> inline
3077  void packCalculate(size_t & req, gpu::ofp_context_t& gpuContext)
3078  {
3079  ite_gpu<1> ite;
3080  pack_subs.template hostToDevice<0,1>();
3081 
3082  auto & indexBuffer = private_get_index_array();
3083  auto & dataBuffer = private_get_data_array();
3084 
3085  ite.wthr.x = indexBuffer.size();
3086  ite.wthr.y = 1;
3087  ite.wthr.z = 1;
3088  ite.thr.x = getBlockSize();
3089  ite.thr.y = 1;
3090  ite.thr.z = 1;
3091 
3092  tmp.resize((indexBuffer.size() + 1)*pack_subs.size());
3093 
3094  if (indexBuffer.size() != 0)
3095  {
3096  if (pack_subs.size() <= 32)
3097  {
3098  // Launch a kernel that counts the number of elements in each chunk
3099  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3101  32,
3102  indexT>),
3103  ite,
3104  indexBuffer.toKernel(),
3105  pack_subs.toKernel(),
3106  gridGeometry,
3107  dataBuffer.toKernel(),
3108  tmp.toKernel(),
3109  (unsigned int)indexBuffer.size() + 1);
3110  }
3111  else if (pack_subs.size() <= 64)
3112  {
3113  // Launch a kernel that counts the number of elements in each chunk
3114  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3116  64,
3117  indexT>),
3118  ite,
3119  indexBuffer.toKernel(),
3120  pack_subs.toKernel(),
3121  gridGeometry,
3122  dataBuffer.toKernel(),
3123  tmp.toKernel(),
3124  (unsigned int)indexBuffer.size() + 1);
3125  }
3126  else if (pack_subs.size() <= 96)
3127  {
3128  // Launch a kernel that counts the number of elements in each chunk
3129  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3131  96,
3132  indexT>),
3133  ite,
3134  indexBuffer.toKernel(),
3135  pack_subs.toKernel(),
3136  gridGeometry,
3137  dataBuffer.toKernel(),
3138  tmp.toKernel(),
3139  (unsigned int)indexBuffer.size() + 1);
3140  }
3141  else if (pack_subs.size() <= 128)
3142  {
3143  // Launch a kernel that counts the number of elements in each chunk
3144  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3146  128,
3147  indexT>),
3148  ite,
3149  indexBuffer.toKernel(),
3150  pack_subs.toKernel(),
3151  gridGeometry,
3152  dataBuffer.toKernel(),
3153  tmp.toKernel(),
3154  (unsigned int)indexBuffer.size() + 1);
3155  }
3156  else
3157  {
3158  std::cout << __FILE__ << ":" << __LINE__ << " error no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;
3159  }
3160  }
3161 
3162  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3163 
3164  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3165 
3166  scan_it.resize(pack_subs.size());
3167 
3168  // scan all
3169  for (size_t i = 0 ; i < pack_subs.size() ; i++)
3170  {
3171  size_t n_pnt = 0;
3172  size_t n_cnk = 0;
3173 
3174  tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3175  tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3176 
3177  // put a zero at the end
3178  tmp.template hostToDevice<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3179  tmp.template hostToDevice<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3180 
3181  openfpm::scan(((indexT *)tmp. template getDeviceBuffer<0>()) + i*(indexBuffer.size() + 1),
3182  indexBuffer.size() + 1, (indexT *)tmp. template getDeviceBuffer<0>() + i*(indexBuffer.size() + 1), gpuContext);
3183 
3184  openfpm::scan(((unsigned int *)tmp. template getDeviceBuffer<1>()) + i*(indexBuffer.size() + 1),
3185  indexBuffer.size() + 1, (unsigned int *)tmp. template getDeviceBuffer<1>() + i*(indexBuffer.size() + 1), gpuContext);
3186 
3187  tmp.template deviceToHost<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3188  tmp.template deviceToHost<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3189 
3190  scan_it.template get<0>(i) = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3191 
3192  n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3193  n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3194 
3195  req += sizeof(size_t) + // bytes required to pack the number of chunks packed
3196  2*dim*sizeof(int) + // starting point + size of the packed region
3197  sizeof(indexT)*n_cnk + // bytes required to pack the chunk indexes
3198  align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) + // bytes required to pack the scan of the chunk points
3199  align_number(sizeof(indexT),n_pnt*(spq.point_size)) + // bytes required to pack the data
3200  align_number(sizeof(indexT),n_pnt*sizeof(short int)) + // bytes required to pack the offsets
3201  align_number(sizeof(indexT),n_pnt*sizeof(unsigned char)); // bytes required to pack the masks
3202  }
3203 
3204  scan_it.template hostToDevice<0>();
3205 
3206  openfpm::scan((indexT *)scan_it. template getDeviceBuffer<0>(),
3207  scan_it.size(), (indexT *)scan_it. template getDeviceBuffer<0>(), gpuContext);
3208  }
3209 
3215  auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
3216  {
3217  return this->blockMap.getMappingVector();
3218  }
3219 
3225  auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
3226  {
3227  return this->blockMap.getMergeIndexMapVector();
3228  }
3229 
3244  template<int ... prp> void pack(ExtPreAlloc<CudaMemory> & mem,
3245  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3246  Pack_stat & sts)
3247  {
3248  unsigned int i = req_index;
3249 
3250  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3251  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3252 
3253  auto & indexBuffer = private_get_index_array();
3254  auto & dataBuffer = private_get_data_array();
3255 
3256  size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3257  size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3258 
3259  Packer<size_t,CudaMemory>::pack(mem,n_cnk,sts);
3260  mem.hostToDevice(mem.getOffset(),mem.getOffset()+sizeof(size_t));
3261 
3262  size_t offset1 = mem.getOffsetEnd();
3263 
3264  grid_key_dx<dim> key = sub_it.getStart();
3265 
3266  for (int i = 0 ; i < dim ; i++)
3267  {Packer<int,CudaMemory>::pack(mem,key.get(i),sts);}
3268 
3269  for (int i = 0 ; i < dim ; i++)
3270  {Packer<int,CudaMemory>::pack(mem,(int)gridGeometry.getSize()[i],sts);}
3271 
3272  mem.hostToDevice(offset1,offset1+2*dim*sizeof(int));
3273 
3274  // chunk indexes
3275  mem.allocate(n_cnk*sizeof(indexT));
3276  index_ptrs.add(mem.getDevicePointer());
3277 
3278  // chunk point scan
3279  mem.allocate( align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) );
3280  scan_ptrs.add(mem.getDevicePointer());
3281 
3282  // chunk data
3283  mem.allocate( align_number(sizeof(indexT),n_pnt*(spq.point_size)) );
3284  data_ptrs.add(mem.getDevicePointer());
3285 
3286  // space for offsets
3287  mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(short int) ) );
3288  offset_ptrs.add(mem.getDevicePointer());
3289 
3290  // space for masks
3291  mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(unsigned char) ) );
3292  mask_ptrs.add(mem.getDevicePointer());
3293 
3294  req_index++;
3295  }
3296 
3297 
3314  template<unsigned int ... prp>
3315  void removeCopyToFinalize(gpu::ofp_context_t& gpuContext, int opt)
3316  {
3317  if ((opt & 0x3) == rem_copy_opt::PHASE1)
3318  {
3319  this->template removeCopyToFinalize_phase1<prp ...>(gpuContext,opt);
3320  }
3321  else if ((opt & 0x3) == rem_copy_opt::PHASE2)
3322  {
3323  this->template removeCopyToFinalize_phase2<prp ...>(gpuContext,opt);
3324  }
3325  else
3326  {
3327  this->template removeCopyToFinalize_phase3<prp ...>(gpuContext,opt,false);
3328  }
3329  }
3330 
3343  template<int ... prp> void packFinalize(ExtPreAlloc<CudaMemory> & mem,
3344  Pack_stat & sts,
3345  int opt = 0,
3346  bool is_pack_remote = false)
3347  {
3348 
3349  RestorePackVariableIfKeepGeometry(opt,is_pack_remote);
3350 
3351  if (pack_subs.size() <= 32)
3352  {
3353  pack_sg_implement<32,prp...>(mem,sts,opt,is_pack_remote);
3354  }
3355  else if (pack_subs.size() <= 64)
3356  {
3357  pack_sg_implement<64, prp...>(mem,sts,opt,is_pack_remote);
3358  }
3359  else if (pack_subs.size() <= 80)
3360  {
3361  pack_sg_implement<80, prp...>(mem,sts,opt,is_pack_remote);
3362  }
3363  else
3364  {
3365  std::cout << __FILE__ << ":" << __LINE__ << " error: no implementation of packFinalize available, create a new case for " << pack_subs.size() << std::endl;
3366  }
3367 
3368  savePackVariableIfNotKeepGeometry(opt,is_pack_remote);
3369  }
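// The sub-region packing is a multi-step protocol; a hedged sketch of the expected call order (this is
// how the distributed layer is assumed to drive the class, the ExtPreAlloc construction is also assumed):
//
//   grid.packReset();                                     // clear the previous pack state
//   size_t req = 0;
//   grid.template packRequest<0,1>(sub_it, req);          // one call per sub-iterator to pack
//   grid.template packCalculate<0,1>(req, gpuContext);    // GPU pass: count points, accumulate req
//   CudaMemory cmem; cmem.allocate(req);
//   ExtPreAlloc<CudaMemory> mem(req, cmem);
//   Pack_stat sts;
//   grid.template pack<0,1>(mem, sub_it, sts);            // one call per sub-iterator, same order
//   grid.template packFinalize<0,1>(mem, sts);            // fill all the reserved buffers in one shot
//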
3370 
3377  {
3378  rem_sects.clear();
3379 
3380  auto & vad = BMG::blockMap.private_get_vct_add_data();
3381  auto & vai = BMG::blockMap.private_get_vct_add_index();
3382 
3383  vad.clear();
3384  vai.clear();
3385 
3386  // Clear variables
3387  offset_ptrs_cp.clear();
3388  scan_ptrs_cp.clear();
3389  n_cnk_cp.clear();
3390  n_pnt_cp.clear();
3391  data_base_ptr_cp.clear();
3392  box_cp.clear();
3393  n_shifts_cp.clear();
3394  convert_blk.clear();
3395  tmp2.clear();
3396  }
3397 
3403  void swap(self & gr)
3404  {
3405  gridGeometry.swap(gr.gridGeometry);
3406 
3407  BMG::swap(gr);
3408  }
3409 
3417  void removePoints(gpu::ofp_context_t& gpuContext)
3418  {
3419  auto & indexBuffer = private_get_index_array();
3420  auto & dataBuffer = private_get_data_array();
3421 
3422  // first we remove
3423  if (rem_sects.size() != 0)
3424  {
3425  rem_sects.template hostToDevice<0,1>();
3426 
3427  tmp.resize(indexBuffer.size() + 1);
3428 
3429  tmp.template get<1>(tmp.size()-1) = 0;
3430  tmp.template hostToDevice<1>(tmp.size()-1,tmp.size()-1);
3431 
3432  auto ite = indexBuffer.getGPUIterator();
3433 
3434  if (has_work_gpu(ite) == true)
3435  {
3436  // mark all the chunks that must remove points
3437  CUDA_LAUNCH((SparseGridGpuKernels::calc_remove_points_chunks_boxes<dim,
3439  blockEdgeSize>),ite,indexBuffer.toKernel(),rem_sects.toKernel(),
3440  gridGeometry,dataBuffer.toKernel(),
3441  tmp.toKernel());
3442 
3443  // scan
3444  openfpm::scan((unsigned int *)tmp.template getDeviceBuffer<1>(),tmp.size(),(unsigned int *)tmp.template getDeviceBuffer<1>(),gpuContext);
3445 
3446  tmp.template deviceToHost<1>(tmp.size()-1,tmp.size()-1);
3447 
3448  // get the number of chunks involved
3449  size_t nr_cnk = tmp.template get<1>(tmp.size()-1);
3450 
3451  tmp3.resize(nr_cnk);
3452 
3453  // collect the chunks involved in the remove
3454  ite = indexBuffer.getGPUIterator();
3455 
3456  if (has_work_gpu(ite) == false) {return;}
3457 
3458  CUDA_LAUNCH((SparseGridGpuKernels::collect_rem_chunks),ite,tmp.toKernel(),tmp3.toKernel());
3459 
3460  // Launch to remove points
3461 
3462  ite = tmp3.getGPUIterator();
3463 
3464  ite.wthr.x = tmp3.size();
3465  ite.wthr.y = 1;
3466  ite.wthr.z = 1;
3467  ite.thr.x = getBlockSize();
3468  ite.thr.y = 1;
3469  ite.thr.z = 1;
3470 
3471  if (has_work_gpu(ite) == false) {return;}
3472 
3473  CUDA_LAUNCH((SparseGridGpuKernels::remove_points<dim,
3475  ite,indexBuffer.toKernel(),
3476  gridGeometry,
3477  dataBuffer.toKernel(),
3478  tmp3.toKernel(),
3479  rem_sects.toKernel());
3480 
3481  tmp3.clear();
3482  }
3483  }
3484  }
3485 
3491  template<unsigned int ... prp>
3492  void removeAddUnpackFinalize(gpu::ofp_context_t& gpuContext, int opt)
3493  {
3494  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3495  {removePoints(gpuContext);}
3496 
3497  removeCopyToFinalize_phase3<prp ...>(gpuContext,opt,true);
3498  }
3499 
3505  {
3506  rem_sects.clear();
3507  copySect.clear();
3508  offset_ptrs_cp.clear();
3509  scan_ptrs_cp.clear();
3510  data_base_ptr_cp.clear();
3511  n_cnk_cp.clear();
3512  n_pnt_cp.clear();
3513  n_shifts_cp.clear();
3514  convert_blk.clear();
3515  box_cp.clear();
3516  data_base_ptr_cp.clear();
3517 
3518  tmp2.clear();
3519  }
3520 
3528  void remove(const Box<dim,int> & section_to_delete)
3529  {
3530  rem_sects.add(section_to_delete);
3531  }
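// remove() only queues the region; the points are actually deleted when the queued operations are
// flushed, e.g. (sketch):
//
//   Box<3,int> section;
//   for (int i = 0; i < 3; i++) {section.setLow(i,10); section.setHigh(i,20);}
//   grid.remove(section);                                        // queue the removal
//   grid.template removeAddUnpackFinalize<0>(gpuContext, 0);     // flush: removePoints() runs here
//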
3532 
3538  static constexpr bool isCompressed()
3539  {
3540  return true;
3541  }
3542 
3550  void copy_to(self & grid_src,
3551  const Box<dim,size_t> & box_src,
3552  const Box<dim,size_t> & box_dst)
3553  {
3554  // the copy is only queued here; the actual data movement happens when the queued operations are finalized (removeCopyToFinalize)
3555 
3556  sparse_grid_section<self> sgs(*this,box_src,box_dst);
3557 
3558  grid_src.copySect.add(sgs);
3559  }
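// copy_to() only records the request: the section is queued on grid_src and the data movement happens
// when the queued operations are finalized. A sketch of the assumed driving sequence (the phase values
// come from rem_copy_opt):
//
//   grid_dst.copy_to(grid_src, box_src, box_dst);                          // queue the copy
//   grid_src.template removeCopyToFinalize<0>(gpuContext, rem_copy_opt::PHASE1);
//   grid_src.template removeCopyToFinalize<0>(gpuContext, rem_copy_opt::PHASE2);
//   grid_src.template removeCopyToFinalize<0>(gpuContext, rem_copy_opt::PHASE3);
//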
3560 
3564  template<typename pointers_type,
3565  typename headers_type,
3566  typename result_type,
3567  unsigned int ... prp >
3568  static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot)
3569  {
3570  // we have to increment ps by the right amount
3571  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3572  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3573 
3574  result.allocate(sizeof(int));
3575 
3576  if (pointers.size())
3577  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
3578  pointers.toKernel(),
3579  headers.toKernel(),
3580  (int *)result.getDevicePointer(),
3581  (unsigned int)spq.point_size,
3582  n_slot);
3583  }
3584 
3594  template<unsigned int ... prp, typename S2, typename header_type>
3595  void unpack_with_headers(ExtPreAlloc<S2> & mem,
3596  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3597  header_type & headers,
3598  int ih,
3599  Unpack_stat & ps,
3600  gpu::ofp_context_t& gpuContext,
3601  rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3602  {
3604 
3605  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3606  {
3607  this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,gpuContext);
3608 
3609  // readjust mem
3610  }
3611  else
3612  {
3613  // we have to increment ps by the right amount
3614  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3615  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3616 
3617  // First get the number of chunks
3618 
3619  size_t n_cnk = headers.template get<1>(ih);
3620  ps.addOffset(sizeof(size_t));
3621  ps.addOffset(2*dim*sizeof(unsigned int));
3622 
3623  size_t actual_offset = n_cnk*sizeof(indexT);
3624  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3625 
3626  // Unpack number of points
3627  // calculate the number of total points
3628  size_t n_pnt = headers.template get<2>(ih);
3629  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3630 
3631  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3632 
3633  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3634  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3635 
3636  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3637  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3638 
3639  scan_ptrs_cp.add(scan);
3640  offset_ptrs_cp.add(offsets);
3641  data_base_ptr_cp.add(data_base_ptr);
3642 
3643  ps.addOffset(actual_offset);
3644  }
3645  }
3646 
3652  bool isSkipLabellingPossible()
3653  {return true;}
3654 
3664  template<unsigned int ... prp, typename S2>
3665  void unpack(ExtPreAlloc<S2> & mem,
3666  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3667  Unpack_stat & ps,
3668  gpu::ofp_context_t& gpuContext,
3669  rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3670  {
3672 
3673  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3674  {
3675  this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,gpuContext);
3676 
3677  // readjust mem
3678  }
3679  else
3680  {
3681  // we have to increment ps by the right amount
3682  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3683  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3684 
3685  // First get the number of chunks
3686 
3687  size_t n_cnk;
3688 
3689  // Unpack the number of chunks
3690  mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
3691  Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
3692 
3693  // Unpack origin of the chunk indexing
3694 /* for (int i = 0 ; i < dim ; i++)
3695  {
3696  int tmp;
3697  Unpacker<int,S2>::unpack(mem,tmp,ps);
3698  }
3699 
3700  for (int i = 0 ; i < dim ; i++)
3701  {
3702  int tmp;
3703  Unpacker<int,S2>::unpack(mem,tmp,ps);
3704  }*/
3705 
3706  ps.addOffset(2*dim*sizeof(unsigned int));
3707 
3708  size_t actual_offset = n_cnk*sizeof(indexT);
3709  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3710 
3711  mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
3712  ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
3713 
3714  // Unpack number of points
3715  // calculate the number of total points
3716  size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
3717  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3718 
3719  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3720 
3721  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3722  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3723 
3724  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3725  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3726 
3727  scan_ptrs_cp.add(scan);
3728  offset_ptrs_cp.add(offsets);
3729  data_base_ptr_cp.add(data_base_ptr);
3730 
3731  ps.addOffset(actual_offset);
3732  }
3733  }
3734 
3740  {
3742  }
3743 
3745 
3751  decltype(self::type_of_iterator()) getIterator() const
3752  {
3753  return decltype(self::type_of_iterator())(*this);
3754  }
3755 
3761  decltype(self::type_of_subiterator()) getIterator(const grid_key_dx<dim> & start, const grid_key_dx<dim> & stop, int is_to_init = 1) const
3762  {
3763  return decltype(self::type_of_subiterator())(*this,start,stop,is_to_init);
3764  }
3765 
3772  {
3774  }
3775 
3781  auto private_get_add_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index()) &
3782  {
3784  }
3785 
3791  auto private_get_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer()) &
3792  {
3794  }
3795 
3796  auto getSegmentToOutMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3797  {
3799  }
3800 
3801  auto getSegmentToOutMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3802  {
3804  }
3805 
3806  auto getSegmentToMergeIndexMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3807  {
3809  }
3810 
3811  auto getSegmentToMergeIndexMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3812  {
3814  }
3815 
3822  {
3824  }
3825 
3831  auto private_get_neighborhood_array() -> decltype(nn_blocks) &
3832  {
3833  return nn_blocks;
3834  }
3835 
3836 
3837 #if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST) || defined(VTKWRITER_HPP_)
3838 
3844  template<typename Tw = float> bool write(const std::string & output)
3845  {
3846  Point<dim,double> spacing;
3847  Point<dim,double> offset;
3848 
3849  spacing.one();
3850  offset.zero();
3851 
3852  return write_with_spacing_offset(output,spacing,offset);
3853  }
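// VTK output sketch (assumption): the data is written from the host buffers, so a deviceToHost copy is
// expected before calling write(); the file name is passed verbatim.
//
//   grid.template deviceToHost<0>();
//   grid.write("sparse_grid_output.vtk");
//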
3854 
3860  template<typename Tw = float>
3861  bool write_with_spacing_offset(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3862  {
3863  file_type ft = file_type::BINARY;
3864 
3865  auto & bm = this->private_get_blockMap();
3866 
3867  auto & index = bm.getIndexBuffer();
3868  auto & data = bm.getDataBuffer();
3869 
3872 
3873  // copy position and properties
3874 
3875  auto it = index.getIterator();
3876 
3877  while(it.isNext())
3878  {
3879  auto key = it.get();
3880 
3881  Point<dim,Tw> p;
3882 
3883  for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
3884  {
3886  {
3887  // Get the block index
3888  grid_key_dx<dim,int> keyg = gridGeometry.InvLinId(index.template get<0>(key),i);
3889 
3890  for (size_t k = 0 ; k < dim ; k++)
3891  {p.get(k) = keyg.get(k)*spacing[k] + offset[k]*spacing[k];}
3892 
3893  tmp_pos.add(p);
3894 
3895  tmp_prp.add();
3896  copy_prop_to_vector_block<decltype(data.get_o(key)),decltype(tmp_prp.last())>
3897  cp(data.get_o(key),tmp_prp.last(),key,i);
3898 
3899  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);
3900 
3901  tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];
3902  }
3903  }
3904 
3905  ++it;
3906  }
3907 
3908  // VTKWriter for a set of points
3910  vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());
3911 
3912  openfpm::vector<std::string> prp_names;
3913 
3914  // Write the VTK file
3915  return vtk_writer.write(output,prp_names,"sparse_grid","",ft);
3916  }
3917 
3923  template<typename Tw = float> bool write_debug(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3924  {
3926  VTKWriter<openfpm::vector<Box<dim, double>>, VECTOR_BOX> vtk_box1;
3927 
3928  openfpm::vector<Box<dim,double>> chunks_box;
3929 
3930  auto & ids = private_get_index_array();
3931 
3932  fill_chunks_boxes(chunks_box,ids,spacing,offset);
3933 
3934  vtk_box1.add(chunks_box);
3935  vtk_box1.write(std::string("chunks_") + output + std::string(".vtk"));
3936 
3937  //write data
3938 
3939  write_with_spacing_offset(std::string("data_") + output + std::string(".vtk"),spacing,offset);
3940 
3941  return true;
3942  }
3943 
3944 #endif
3945 };
3946 
3947 template<unsigned int dim,
3948  typename AggregateT,
3949  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3950  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3951  typename indexT=long int,
3952  template<typename> class layout_base=memory_traits_inte,
3953  typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3955 
3956 template<unsigned int dim,
3957  typename AggregateT,
3958  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3959  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3960  typename indexT=int,
3961  template<typename> class layout_base=memory_traits_inte,
3962  typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3964 
3965 template<unsigned int dim,
3966  typename AggregateT,
3967  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3968  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3969  typename indexT=int,
3970  template<typename> class layout_base=memory_traits_inte,
3971  typename linearizer = grid_smb<dim, blockEdgeSize,indexT>>
3973 
3974 #endif //OPENFPM_PDATA_SPARSEGRIDGPU_HPP
int sgn(T val)
Gets the sign of a variable.
void removeUnusedBuffers()
Eliminate many internal temporary buffer you can use this between flushes if you get some out of memo...
auto insert_o(unsigned int linId) -> decltype(blockMap.insert(0))
insert data, host version
auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId, is_new).template get< p >())
insert a block + flush, host version
void preFlush()
In case we manually set the added index buffer and the add data buffer we have to call this function ...
void setGPUInsertBuffer(int nBlock, int nSlot)
__device__ __host__ T getLow(int i) const
get the i-coordinate of the low bound interval of the box
Definition: Box.hpp:555
__device__ __host__ T getHigh(int i) const
get the high interval of the box
Definition: Box.hpp:566
__device__ __host__ void setHigh(int i, T val)
set the high interval of the box
Definition: Box.hpp:543
__device__ __host__ void setLow(int i, T val)
set the low interval of the box
Definition: Box.hpp:532
virtual void * getDevicePointer()
get a readable pointer with the data
Definition: CudaMemory.cu:503
virtual bool resize(size_t sz)
resize the momory allocated
Definition: CudaMemory.cu:261
virtual void hostToDevice()
Move memory from host to device.
Definition: CudaMemory.cu:514
virtual size_t size() const
the the size of the allocated memory
Definition: CudaMemory.cu:245
virtual void * getPointer()
get a readable pointer with the data
Definition: CudaMemory.cu:354
virtual bool allocate(size_t sz)
allocate memory
Definition: CudaMemory.cu:38
virtual void decRef()
Decrement the reference counter.
size_t getOffsetEnd()
Get offset.
size_t getOffset()
Get offset.
virtual void * getDevicePointer()
Return the pointer of the last allocation.
virtual void incRef()
Increment the reference counter.
virtual void deviceToHost()
Do nothing.
virtual void * getPointer()
Return the pointer of the last allocation.
void reset()
Reset the internal counters.
virtual void hostToDevice()
Return the pointer of the last allocation.
virtual bool allocate(size_t sz)
Allocate a chunk of memory.
virtual size_t size() const
Get the size of the LAST allocated memory.
This class allocate, and destroy CPU memory.
Definition: HeapMemory.hpp:40
Packing status object.
Definition: Pack_stat.hpp:61
Packing class.
Definition: Packer.hpp:50
static void pack(ExtPreAlloc< Mem >, const T &obj)
Error, no implementation.
Definition: Packer.hpp:56
This class implement the point shape in an N-dimensional space.
Definition: Point.hpp:28
void one()
Set to one the point coordinate.
Definition: Point.hpp:311
__device__ __host__ void zero()
Set to zero the point coordinate.
Definition: Point.hpp:299
__device__ __host__ const T & get(unsigned int i) const
Get coordinate.
Definition: Point.hpp:172
grid_key_dx< dim > getStop() const
Return the stop point.
grid_key_dx< dim > getStart() const
Return the starting point.
openfpm::vector_gpu< aggregate< int, short int > > link_dw
links of the padding points with real points of a finer sparsegrid
openfpm::vector_gpu< aggregate< size_t > > links_up
links of the padding points with real points of a coarse sparsegrid
void unpack_with_headers(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, header_type &headers, int ih, Unpack_stat &ps, gpu::ofp_context_t &gpuContext, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
unpack the sub-grid object
void addAndConvertPackedChunkToTmp(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, gpu::ofp_context_t &gpuContext)
unpack the sub-grid object
openfpm::vector_gpu< aggregate< int, short int > > & getUpLinks()
Get the links up for each point.
SparseGridGpu(linearizer &gridGeometry, unsigned int stencilSupportRadius=1)
Constructor from glock geometry.
size_t getBlockLinId(const CoordT &blockCoord) const
Linearization of block coordinates.
openfpm::vector_gpu< aggregate< indexT > > tmp3
temporal 3
auto private_get_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer()) &
Return the index array of the blocks.
void conv_cross(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a convolution using a cross like stencil.
openfpm::vector_gpu< aggregate< int[dim]> > shifts
shifts for chunk conversion
auto get(const grid_key_dx< dim, CoordT > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using the point coordinates.
void pack(ExtPreAlloc< CudaMemory > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Pack_stat &sts)
Pack the object into the memory given an iterator.
auto get_o(const grid_key_dx< dim, CoordT > &coord) const -> encap_data_block< typename std::remove_const< decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::get(0))>::type >
Get an element using the point coordinates.
auto get(const sparse_grid_gpu_index< self > &coord) const -> const ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index it guarantee that the point exist)
SparseGridGpu(const size_t(&res)[dim], unsigned int stencilSupportRadius=1)
Constructor from glock geometry.
auto private_get_add_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the index array of the blocks.
ExtPreAlloc< CudaMemory > * prAlloc_prp
Memory to remove copy finalize.
void remove(const Box< dim, int > &section_to_delete)
Remove all the points in this region.
openfpm::vector_gpu< aggregate< indexT > > e_points
void unpack(ExtPreAlloc< CudaMemory > &mem, Unpack_stat &ps)
Unpack the object into the memory.
auto private_get_neighborhood_array() -> decltype(nn_blocks) &
Return the index array of the blocks.
void setNNType()
Set the neighborhood type.
openfpm::vector_gpu< aggregate< unsigned int > > pack_output
Helper array to pack points.
auto get(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Get an element using sparse_grid_gpu_index (using this index it guarantee that the point exist)
openfpm::vector_gpu< aggregate< indexT > > tmp2
temporal 2
auto insertFlush(const grid_key_dx< dim, CoordT > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
void packRequest(size_t &req) const
Asking to pack a SparseGrid GPU without GPU context pack the grid on CPU and host memory.
void setBackgroundValue(typename boost::mpl::at< typename AggregateT::type, boost::mpl::int_< p >>::type backgroundValue)
set the background for property p
auto private_get_data_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer())
Return the data array of the blocks.
auto insertFlush(const sparse_grid_gpu_index< self > &coord) -> ScalarTypeOf< AggregateBlockT, p > &
Insert the point on host side and flush directly.
void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
void preFlush()
In case we manually set the added index buffer and the add data buffer we have to call this function ...
auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
Return the mapping vector used to know where the data has been added.
void packRequest(size_t &req, gpu::ofp_context_t &gpuContext) const
memory requested to pack this object
static bool is_unpack_header_supported()
Indicate that unpacking the header is supported.
Box< dim, int > getBox()
Return a Box with the range if the SparseGrid.
openfpm::vector_gpu< aggregate< short int, short int > > ghostLayerToThreadsMapping
void copyRemoveReset()
Reset the queue to remove and copy section of grids.
unsigned char getFlag(const sparse_grid_gpu_index< self > &coord) const
Return the flag of the point.
void removeAddUnpackReset()
In this case it does nothing.
static SparseGridGpu_iterator_sub< dim, self > type_of_subiterator()
This is a meta-function return which type of sub iterator a grid produce.
void packCalculate(size_t &req, gpu::ofp_context_t &gpuContext)
Calculate the size of the information to pack.
size_t size() const
return the size of the grid
void packRequest(SparseGridGpu_iterator_sub< dim, self > &sub_it, size_t &req) const
Calculate the size to pack part of this structure.
void pack(ExtPreAlloc< HeapMemory > &mem, Pack_stat &sts) const
Pack the object into the memory.
linearizer & getGrid()
Return the grid information object.
static constexpr bool isCompressed()
This is a multiresolution sparse grid so is a compressed format.
void construct_link_up(self &grid_up, const Box< dim, int > &db_, Point< dim, int > p_up, gpu::ofp_context_t &gpuContext)
construct link on the up levels
auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
Return the mapping vector used to know where the data has been added.
decltype(self::type_of_iterator()) getIterator() const
Return a SparseGrid iterator.
void copy_to(self &grid_src, const Box< dim, size_t > &box_src, const Box< dim, size_t > &box_dst)
It queue a copy.
void removeAddUnpackFinalize(gpu::ofp_context_t &gpuContext, int opt)
This function remove the points we queue to remove and it flush all the added/unpacked data.
openfpm::vector_gpu< aggregate< int, short int > > & getDownLinks()
Get the links down for each point.
void resize(size_t(&res)[dim])
resize the SparseGrid
openfpm::vector_gpu< aggregate< int, short int > > link_up
links of the padding points with real points of a finer sparsegrid
void construct_link(self &grid_up, self &grid_dw, gpu::ofp_context_t &gpuContext)
construct link between levels
void conv2_b(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
void conv_cross_b(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
void unpack(ExtPreAlloc< S2 > &mem, SparseGridGpu_iterator_sub< dim, self > &sub_it, Unpack_stat &ps, gpu::ofp_context_t &gpuContext, rem_copy_opt opt=rem_copy_opt::NONE_OPT)
unpack the sub-grid object
openfpm::vector_gpu< aggregate< indexT, unsigned int > > tmp
temporal
void packReset()
Reset the pack calculation.
void removeCopyToFinalize(gpu::ofp_context_t &gpuContext, int opt)
It finalize the queued operations of remove() and copy_to()
void swap(self &gr)
bool isSkipLabellingPossible()
This function check if keep geometry is possible for this grid.
openfpm::vector_gpu< Box< dim, int > > pack_subs
the set of all sub-set to pack
auto private_get_index_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getIndexBuffer())
Return the index array of the blocks.
auto insertBlockFlush(const grid_key_dx< dim, CoordT > &coord, indexT &local_id) -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::insertBlockFlush(0))
Insert the point on host side and flush directly.
void construct_link_dw(self &grid_dw, const Box< dim, int > &db_, Point< dim, int > p_dw, gpu::ofp_context_t &gpuContext)
construct link on the down level
auto private_get_data_array() -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.getDataBuffer()) &
Return the index array of the blocks.
auto get_o(const sparse_grid_gpu_index< self > &coord) const -> encap_data_block< typename std::remove_const< decltype(private_get_data_array().get(0))>::type >
Get an element using a sparse_grid_gpu_index (using this index guarantees that the point exists).
static SparseGridGpu_iterator< dim, self > type_of_iterator()
This is a meta-function returning the type of iterator the grid produces.
openfpm::vector_gpu< aggregate< unsigned int > > & getDownLinksOffsets()
Get the offsets for each point of the links down.
static void unpack_headers(pointers_type &pointers, headers_type &headers, result_type &result, int n_slot)
Stub; it does not do anything.
void conv3_b(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
openfpm::vector_gpu< aggregate< int > > new_map
Map between the (last) added chunks and their position in the chunks data.
int yes_i_am_grid
It defines that this data-structure is a grid.
void convertChunkIds(short int *offset, origPackType &origPack, IteratorType &sub_it)
Convert the offset index from the packed buffer to the add buffer.
void unpack(ExtPreAlloc< HeapMemory > &mem, Unpack_stat &ps)
Unpack the object from the memory.
void removePoints(gpu::ofp_context_t &gpuContext)
Remove the points queued for removal.
void conv2(grid_key_dx< dim > start, grid_key_dx< dim > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
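A rough sketch of a two-property block convolution in the style of the OpenFPM sparse-grid examples; the property indices (0,1 read, 2,3 written), the stencil radius of 1, the GetCpBlockType helper and the lambda signature are assumptions to check against the full documentation.

// Assumed layout: properties 0..3 = (U, V, U_next, V_next), all float.
typedef typename GetCpBlockType<decltype(grid),0,1>::type CpBlockType;

grid_key_dx<3> start({1,1,1});
grid_key_dx<3> stop({254,254,254});

// conv2<src1,src2,dst1,dst2,radius>: read U,V and write U_next,V_next in one pass
grid.template conv2<0,1,2,3,1>(start, stop,
    [] __device__ (float & u_out, float & v_out,
                   CpBlockType & u, CpBlockType & v,
                   int i, int j, int k)
    {
        // 7-point Laplacian of U and V evaluated inside the shared-memory block
        u_out = u(i+1,j,k) + u(i-1,j,k) + u(i,j+1,k) + u(i,j-1,k)
              + u(i,j,k+1) + u(i,j,k-1) - 6.0f*u(i,j,k);
        v_out = v(i+1,j,k) + v(i-1,j,k) + v(i,j+1,k) + v(i,j-1,k)
              + v(i,j,k+1) + v(i,j,k-1) - 6.0f*v(i,j,k);
    });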
openfpm::vector_gpu< aggregate< unsigned int > > & getUpLinksOffsets()
Get the offsets for each point of the links up.
openfpm::vector_gpu< aggregate< unsigned int > > link_dw_scan
Scan offsets of the links down.
base_key get_sparse(const grid_key_dx< dim, CoordT > &coord) const
Get the sparse index of an element from its point coordinates.
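A sketch of the intended use for repeated access to the same point; the property indices are assumptions.

grid_key_dx<3> p({100,100,100});
auto sp = grid.get_sparse(p);            // locate the chunk/offset of the point once
float v = grid.template get<0>(sp);      // cheap repeated reads through the sparse index
auto e  = grid.get_o(sp);                // access all properties of the point at once
float w = e.template get<1>();           // read property 1 from the encapsulated block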
void removeUnusedBuffers()
Eliminate many internal temporary buffers. You can use this between flushes if you run out of memory.
openfpm::vector_gpu< aggregate< unsigned int > > link_up_scan
Scan offsets of the links up.
auto private_get_add_index_array() const -> decltype(BlockMapGpu< AggregateInternalT, threadBlockSize, indexT, layout_base >::blockMap.private_get_vct_add_index()) &
Return the index array of the added blocks.
openfpm::vector_gpu< aggregate< indexT > > scan_it
Contains the scan of the points for each iterator.
void packFinalize(ExtPreAlloc< CudaMemory > &mem, Pack_stat &sts, int opt=0, bool is_pack_remote=false)
Finalize the packing procedure.
void conv(grid_key_dx< 3 > start, grid_key_dx< 3 > stop, lambda_f func, ArgsT ... args)
Apply a free type convolution using blocks.
Unpacking status object.
Definition: Pack_stat.hpp:16
size_t getOffset()
Return the current counter.
Definition: Pack_stat.hpp:41
void addOffset(size_t off)
Increment the offset pointer by off.
Definition: Pack_stat.hpp:31
Unpacker class.
Definition: Unpacker.hpp:34
static void unpack(ExtPreAlloc< Mem >, T &obj)
Error, no implementation.
Definition: Unpacker.hpp:40
void operator()(T &t) const
It calls the copy function for each property.
void operator()(T &t) const
It calls the copy function for each property.
void * base_ptr
Data pointers.
grid_key_dx is the key to access any element in the grid
Definition: grid_key.hpp:19
__device__ __host__ void set_d(index_type i, index_type id)
Set the i index.
Definition: grid_key.hpp:516
__device__ __host__ index_type get(index_type i) const
Get the i index.
Definition: grid_key.hpp:503
void setDimensions(const size_t(&dims)[N])
Reset the dimension of the grid.
Definition: grid_sm.hpp:326
mem_id LinId(const grid_key_dx< N, ids_type > &gk, const signed char sum_id[N]) const
Linearization of the grid_key_dx with a specified shift.
Definition: grid_sm.hpp:454
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:204
size_t size()
Stub size.
Definition: map_vector.hpp:212
Element index containing a data chunk index and a point index.
int get_cnk_pos_id() const
Get chunk position id.
int get_data_id() const
Get chunk local index (the returned index < getblockSize())
Get the type of the insertBlock.
Get the type of the block.
static __device__ bool getNNindex_offset(grid_key_dx< dim, indexT2 > &coord, unsigned int &NN_index, unsigned int &offset_nn)
Given a coordinate written in local coordinates for a given chunk, it returns the neighborhood chunk position and the offset inside that chunk.
Check if it is padding.
Aggregate of properties: from a list of objects it creates a struct that follows the OpenFPM native structure.
Definition: aggregate.hpp:221
This class is a functor for the "for_each" algorithm.
Definition: Encap.hpp:222
Definition: ids.hpp:169
Transform the boost::fusion::vector into memory specification (memory_traits)
Definition: memory_conf.hpp:84
This class copy general objects.
Definition: meta_copy.hpp:53
This class is a functor for the "for_each" algorithm.