SparseGridGpu.hpp
1 //
2 // Created by tommaso on 6/06/19.
3 //
4 
5 #ifndef OPENFPM_PDATA_SPARSEGRIDGPU_HPP
6 #define OPENFPM_PDATA_SPARSEGRIDGPU_HPP
7 
8 constexpr int BLOCK_SIZE_STENCIL = 128;
9 
10 #include "config.h"
11 #include "util/cuda_launch.hpp"
12 #include <cstdlib>
13 #include <SparseGridGpu/BlockMapGpu.hpp>
14 #include <Grid/iterators/grid_skin_iterator.hpp>
15 #include <Grid/Geometry/grid_smb.hpp>
16 #include "SparseGridGpu_ker.cuh"
17 #include "SparseGridGpu_kernels.cuh"
18 #include "Iterators/SparseGridGpu_iterator_sub.hpp"
19 #include "Grid/Geometry/grid_zmb.hpp"
20 #include "util/stat/common_statistics.hpp"
21 #include "Iterators/SparseGridGpu_iterator.hpp"
22 #include "Space/SpaceBox.hpp"
23 
24 #if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
25 #include "VTKWriter/VTKWriter.hpp"
26 #endif
27 
28 constexpr int NO_ITERATOR_INIT = 0;
29 
30 // todo: Move all the following utils into some proper file inside TemplateUtils
31 
32 enum tag_boundaries
33 {
34  NO_CALCULATE_EXISTING_POINTS,
35  CALCULATE_EXISTING_POINTS
36 };
37 
38 template<unsigned int dim>
39 struct default_edge
40 {
41  typedef boost::mpl::int_<2> type;
42 };
43 
44 
45 template<>
46 struct default_edge<1>
47 {
48  typedef boost::mpl::int_<256> type;
49  typedef boost::mpl::int_<256> tb;
50 };
51 
52 template<>
53 struct default_edge<2>
54 {
55  typedef boost::mpl::int_<16> type;
56  typedef boost::mpl::int_<256> tb;
57 };
58 
59 template<>
60 struct default_edge<3>
61 {
62  typedef boost::mpl::int_<8> type;
63  typedef boost::mpl::int_<512> tb;
64 };
65 
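// Illustrative note (not part of the original source): default_edge selects the default chunk
// edge and thread-block size from the spatial dimension, as the specializations above show.
// With those definitions the following compile-time checks hold:
//
//   static_assert(default_edge<2>::type::value == 16, "2D chunks default to 16x16");
//   static_assert(default_edge<3>::type::value == 8,  "3D chunks default to 8x8x8");
//   static_assert(default_edge<3>::tb::value == 512,  "3D default thread-block size");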
66 template<typename T>
68 {
69  typedef T type;
70 };
71 
72 template<typename T, unsigned int dim, unsigned int blockEdgeSize>
73 struct process_data_block
74 {
76 };
77 
78 template<typename T, unsigned int dim, unsigned int blockEdgeSize, unsigned int N1>
79 struct process_data_block<T[N1],dim,blockEdgeSize>
80 {
82 };
83 
84 template<unsigned int dim, unsigned int blockEdgeSize, typename ... aggr_list>
85 struct aggregate_transform_datablock_impl
86 {
88 };
89 
90 template<unsigned int dim, unsigned int blockEdgeSize, typename aggr>
91 struct aggregate_convert
92 {
93 };
94 
95 template<unsigned int dim, unsigned int blockEdgeSize, typename ... types>
96 struct aggregate_convert<dim,blockEdgeSize,aggregate<types ...>>
97 {
98  typedef typename aggregate_transform_datablock_impl<dim,blockEdgeSize,types ...>::type type;
99 };
100 
101 template<typename aggr>
102 struct aggregate_add
103 {
104 };
105 
106 template<typename ... types>
107 struct aggregate_add<aggregate<types ...>>
108 {
109  typedef aggregate<types ..., unsigned char> type;
110 };
111 
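// Illustrative note (not part of the original source): aggregate_convert feeds the property list
// of an aggregate through aggregate_transform_datablock_impl, producing the chunk-wise aggregate
// used internally, while aggregate_add appends one extra unsigned char property (the per-point
// flag). For example, the specialization above gives:
//
//   aggregate_add<aggregate<float,double[3]>>::type == aggregate<float,double[3],unsigned char>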
113 
114 template<typename enc_type>
115 class encap_data_block
116 {
117  int offset;
118  enc_type enc;
119 
120  public:
121 
122  encap_data_block(int offset,const enc_type & enc)
123  :offset(offset),enc(enc)
124  {}
125 
126  encap_data_block operator=(const encap_data_block<enc_type> & enc)
127  {
129 
130  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,enc_type::T_type::max_prop> >(cp);
131 
132  return *this;
133  }
134 
135  template<unsigned int p>
136  auto get() -> decltype(enc.template get<p>()[offset])
137  {
138  return enc.template get<p>()[offset];
139  }
140 
141  template<unsigned int p>
142  auto get() const -> decltype(enc.template get<p>()[offset])
143  {
144  return enc.template get<p>()[offset];
145  }
146 };
147 
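// Illustrative note (not part of the original source): encap_data_block is a lightweight proxy
// pairing an encapsulated data chunk with the offset of one point inside it, so a single point
// can be accessed with the usual get<prop>() syntax. A hedged sketch of how such a proxy is
// obtained through the grid interface defined later in this file (get_o), assuming a grid whose
// property 0 is a float:
//
//   // auto p = grid.get_o(key);        // proxy to one point of a chunk
//   // float v = p.template get<0>();   // read property 0 of that point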
149 
150 enum StencilMode
151 {
152  STENCIL_MODE_INPLACE = 1,
153  STENCIL_MODE_INPLACE_NO_SHARED = 3
154 };
155 
160 template<typename SGridGpu, unsigned int prp, unsigned int stencil_size>
162 {
164  stencil_size,
166  SGridGpu::device_grid_type::dims> type;
167 };
168 
169 #include "encap_num.hpp"
170 
175 template<typename SGridGpu>
177 {
179 };
180 
185 template<unsigned int dim>
186 struct NNfull_is_padding_impl
187 {
188  __device__ inline static bool is_padding()
189  {
190  printf("NNfull_is_padding_impl with dim: %d not implemented yet \n",dim);
191 
192  return false;
193  }
194 };
195 
200 template<>
201 struct NNfull_is_padding_impl<3>
202 {
203  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
204  __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
205  {
206  bool isPadding_ = false;
207  for (int i = 0 ; i < 3 ; i++)
208  {
209  for (int j = 0 ; j < 3 ; j++)
210  {
211  for (int k = 0 ; k < 3 ; k++)
212  {
213  grid_key_dx<3,int> key;
214 
215  key.set_d(0,i-1);
216  key.set_d(1,j-1);
217  key.set_d(2,k-1);
218 
219  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
220  typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
221  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
222  if (isPadding_) break;
223  }
224  }
225  }
226  return isPadding_;
227  }
228 };
229 
230 
235 template<>
236 struct NNfull_is_padding_impl<2>
237 {
238  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
239  __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
240  {
241  bool isPadding_ = false;
242  for (int i = 0 ; i < 3 ; i++)
243  {
244  for (int j = 0 ; j < 3 ; j++)
245  {
246  grid_key_dx<2,int> key;
247 
248  key.set_d(0,i-1);
249  key.set_d(1,j-1);
250 
251  auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
252  typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
253  isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
254  if (isPadding_) break;
255  }
256  }
257  return isPadding_;
258  }
259 };
260 
261 template<unsigned int dim>
262 struct NNFull
263 {
264  static const int nNN = IntPow<3, dim>::value;
265 
266  template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
267  __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
268  blockMap_type & blockMap,
269  SparseGrid_type & sparseGrid,
270  const unsigned int offset)
271  {
272  // point to the background element
273  int neighbourPos = blockMap.size();
274  if (offset < nNN && offset != nNN / 2)
275  {
276  int cnt = offset;
277  for (int i = 0 ; i < dim ; i++)
278  {
279  int dPos = cnt % 3;
280  cnt /= 3;
281  blockCoord.set_d(i, blockCoord.get(i) + dPos - 1);
282  }
283 
284  neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id;
285  }
286  return neighbourPos;
287  }
288 
289  template<typename indexT, unsigned int blockEdgeSize, typename coordType>
290  __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
291  {
292  // linearize the coord
293 
294  indexT neighbourNum = 0;
295 
296  indexT accu = 1;
297  for(int i = 0 ; i < dim ; i++)
298  {
299  int c = static_cast<int>(coord.get(i)) - static_cast<int>(stencilSupportRadius);
300  if (c < 0)
301  {
302  neighbourNum += 0;
303  }
304  else if (c >= blockEdgeSize)
305  {
306  neighbourNum += 2*accu;
307  }
308  else
309  {
310  neighbourNum += accu;
311  }
312  accu *= 3;
313  }
314 
315  return neighbourNum;
316  }
317 
318 
319  template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
320  __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
321  {
322  return NNfull_is_padding_impl<3>::template is_padding(sparseGrid,coord,enlargedBlock);
323  }
324 
335  template<unsigned int blockEdgeSize, typename indexT2>
336  __device__ static inline bool getNNindex_offset(grid_key_dx<dim,indexT2> & coord, unsigned int & NN_index, unsigned int & offset_nn)
337  {
338  bool out = false;
339  NN_index = 0;
340  offset_nn = 0;
341 
342  int cnt = 1;
343  int cnt_off = 1;
344  for (unsigned int i = 0 ; i < dim ; i++)
345  {
346  int p = 1 - ((int)(coord.get(i) < 0)) + ((int)(coord.get(i) >= (int)blockEdgeSize));
347 
348  NN_index += p*cnt;
349 
350  offset_nn += (coord.get(i) + (1 - p)*(int)blockEdgeSize)*cnt_off;
351 
352  cnt *= 3;
353  cnt_off *= blockEdgeSize;
354 
355  out |= (p != 1);
356  }
357 
358  return out;
359  }
360 };
361 
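// Worked example (not part of the original source) of the ternary neighbour indexing used by
// NNFull above: each axis contributes a digit in {0,1,2} (below / inside / above the chunk), so a
// chunk has 3^dim neighbours and the chunk itself sits at index nNN/2. With dim = 2 and
// blockEdgeSize = 4, getNNindex_offset applied to the out-of-chunk coordinate (-1,2) gives
//
//   x: p = 0  ->  NN_index += 0*1,  offset_nn += (-1 + 4)*1 = 3
//   y: p = 1  ->  NN_index += 1*3,  offset_nn += 2*4        = 8
//
// i.e. the point lives in neighbour chunk 3 (the -x neighbour) at local offset 11, and the
// function returns true because the coordinate falls outside the current chunk.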
362 
363 
364 template<unsigned int nNN_, unsigned int nLoop_>
365 struct ct_par
366 {
367  static const unsigned int nNN = nNN_;
368  static const unsigned int nLoop = nLoop_;
369 };
370 
371 template<typename copy_type>
372 struct copy_prop_to_vector_block_impl
373 {
374  template<typename T, typename dst_type, typename src_type>
375  static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
376  {
377  dst.template get<T::value>() = src.template get<T::value>()[bPos];
378  }
379 };
380 
381 template<typename copy_type,unsigned int N1>
382 struct copy_prop_to_vector_block_impl<copy_type[N1]>
383 {
384  template<typename T, typename dst_type, typename src_type>
385  static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
386  {
387  for (int i = 0 ; i < N1 ; i++)
388  {
389  dst.template get<T::value>()[i] = src.template get<T::value>()[i][bPos];
390  }
391  }
392 };
393 
394 template<typename Tsrc,typename Tdst>
395 class copy_prop_to_vector_block
396 {
398  Tsrc src;
399 
401  Tdst dst;
402 
403  size_t pos;
404 
405  unsigned int bPos;
406 
407 public:
408 
409  copy_prop_to_vector_block(Tsrc src, Tdst dst,size_t pos, size_t bPos)
410  :src(src),dst(dst),pos(pos),bPos(bPos)
411  {}
412 
414  template<typename T>
415  inline void operator()(T& t) const
416  {
417  typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;
418 
419  copy_prop_to_vector_block_impl<copy_rtype>::template copy<T>(src,dst,bPos);
420 
421  //meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>()[bPos],dst.template get<T::value>());
422  }
423 
424 };
425 
426 
427 
428 template<typename AggregateT, unsigned int n_it, unsigned int ... prp>
429 class data_ptr_fill
430 {
431  typedef typename to_boost_vmpl<prp...>::type vprp;
432 
434  void * base_ptr;
435 
436  mutable size_t tot = 0;
437 
438  size_t i;
439 
440  size_t sz = 0;
441 
442  arr_arr_ptr<n_it,sizeof...(prp)> & arrs;
443 
444 public:
445 
446  data_ptr_fill(void * base_ptr,size_t i, arr_arr_ptr<n_it,sizeof...(prp)> & arrs, size_t sz)
447  :base_ptr(base_ptr),i(i),sz(sz),arrs(arrs)
448  {}
449 
451  template<typename T>
452  inline void operator()(T& t) const
453  {
454  typedef typename boost::mpl::at<vprp,T>::type prp_cp;
455 
456  // Remove the reference from the type to copy
457  typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
458 
459  arrs.ptr[i][T::value] = (void *)((((unsigned char *)base_ptr) + tot));
460 
461  tot += sz * sizeof(pack_type);
462  }
463 
464 };
465 
466 template<typename SparseGridType>
467 struct sparse_grid_section
468 {
469  SparseGridType * grd;
470 
471  // Source box
472  Box<SparseGridType::dims,size_t> src;
473 
474  // Destination box
475  Box<SparseGridType::dims,size_t> dst;
476 
477  sparse_grid_section(SparseGridType & grd,const Box<SparseGridType::dims,size_t> & src, const Box<SparseGridType::dims,size_t> & dst)
478  :grd(&grd),src(src),dst(dst)
479  {}
480 };
481 
482 
483 template<unsigned int dim,
484  typename AggregateT,
485  unsigned int blockEdgeSize = default_edge<dim>::type::value,
486  unsigned int threadBlockSize = default_edge<dim>::tb::value,
487  typename indexT=long int,
488  template<typename> class layout_base=memory_traits_inte,
489  typename linearizer = grid_smb<dim, blockEdgeSize, indexT>>
490 class SparseGridGpu : public BlockMapGpu<
491  typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
492  threadBlockSize, indexT, layout_base>
493 {
494 public:
495 
496  static constexpr unsigned int dims = dim;
497 
498 private:
499 
500  typedef BlockMapGpu<
501  typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
502  threadBlockSize, indexT, layout_base> BMG;
503 
504  const static unsigned char PADDING_BIT = 1;
505  typedef typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type AggregateBlockT;
506  linearizer gridGeometry;
507  grid_sm<dim, int> extendedBlockGeometry;
508  grid_sm<dim, int> gridSize;
509  unsigned int stencilSupportRadius;
510  unsigned int ghostLayerSize;
511  int req_index;
512  int req_index_swp;
513  int req_index_swp_r;
514 
515  AggregateT bck;
516 
518 
519  // Queue of remove sections
521 
522  // Queue of copy sections
524 
525  CudaMemory mem;
526 
527  // pointers for unpack
528  openfpm::vector<void *> index_ptrs;
529  openfpm::vector<void *> index_ptrs_swp;
530  openfpm::vector<void *> index_ptrs_swp_r;
531  openfpm::vector<void *> scan_ptrs;
532  openfpm::vector<void *> scan_ptrs_swp;
533  openfpm::vector<void *> scan_ptrs_swp_r;
534  openfpm::vector<void *> data_ptrs;
535  openfpm::vector<void *> data_ptrs_swp;
536  openfpm::vector<void *> data_ptrs_swp_r;
537  openfpm::vector<void *> offset_ptrs;
538  openfpm::vector<void *> offset_ptrs_swp;
539  openfpm::vector<void *> offset_ptrs_swp_r;
540  openfpm::vector<void *> mask_ptrs;
541  openfpm::vector<void *> mask_ptrs_swp;
542  openfpm::vector<void *> mask_ptrs_swp_r;
543 
544  // pointers for copyRemove
545  openfpm::vector<void *> offset_ptrs_cp;
546  openfpm::vector<void *> offset_ptrs_cp_swp;
547  openfpm::vector<void *> offset_ptrs_cp_swp_r;
548  openfpm::vector<void *> scan_ptrs_cp;
549  openfpm::vector<void *> scan_ptrs_cp_swp;
550  openfpm::vector<void *> scan_ptrs_cp_swp_r;
551  openfpm::vector<void *> data_base_ptr_cp;
552  openfpm::vector<void *> data_base_ptr_cp_swp;
553  openfpm::vector<void *> data_base_ptr_cp_swp_r;
554  openfpm::vector<int> n_cnk_cp;
555  openfpm::vector<int> n_cnk_cp_swp;
556  openfpm::vector<int> n_cnk_cp_swp_r;
557  openfpm::vector<int> n_pnt_cp;
558  openfpm::vector<int> n_pnt_cp_swp;
559  openfpm::vector<int> n_pnt_cp_swp_r;
560  openfpm::vector<int> n_shifts_cp;
561  openfpm::vector<int> n_shift_cp_swp;
562  openfpm::vector<int> n_shift_cp_swp_r;
563  typedef typename aggregate_convert<dim,blockEdgeSize,aggregate<int>>::type convertAggr;
564 
565  // Map to convert blocks from misaligned chunks
566  openfpm::vector_gpu<convertAggr> convert_blk;
567  openfpm::vector_gpu<convertAggr> convert_blk_swp;
568  openfpm::vector_gpu<convertAggr> convert_blk_swp_r;
569  openfpm::vector<Box<dim,size_t>> box_cp;
570  openfpm::vector<Box<dim,size_t>> box_cp_swp;
571  openfpm::vector<Box<dim,size_t>> box_cp_swp_r;
572 
578 
583 
590 
592 
597 
600 
603 
606 
608  mutable openfpm::vector_gpu<aggregate<int>> new_map;
609  mutable openfpm::vector_gpu<aggregate<int>> new_map_swp;
610  mutable openfpm::vector_gpu<aggregate<int>> new_map_swp_r;
611 
613  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs;
614  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp;
615  mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp_r;
616 
620  mutable int index_size_swp = -1;
621  mutable int index_size_swp_r = -1;
622 
625 
628 
631 
634 
637 
640 
643 
644  bool findNN = false;
645 
646  inline void swap_internal_remote()
647  {
648  n_cnk_cp_swp_r.swap(n_cnk_cp);
649  n_pnt_cp_swp_r.swap(n_pnt_cp);
650  n_shift_cp_swp_r.swap(n_shifts_cp);
651  convert_blk_swp_r.swap(convert_blk);
652  box_cp_swp_r.swap(box_cp);
653  new_map_swp_r.swap(new_map);
654  }
655 
656  inline void swap_internal_local()
657  {
658  offset_ptrs_cp_swp.swap(offset_ptrs_cp);
659  scan_ptrs_cp_swp.swap(scan_ptrs_cp);
660  data_base_ptr_cp_swp.swap(data_base_ptr_cp);
661  n_cnk_cp_swp.swap(n_cnk_cp);
662  n_pnt_cp_swp.swap(n_pnt_cp);
663  n_shift_cp_swp.swap(n_shifts_cp);
664  convert_blk_swp.swap(convert_blk);
665  box_cp_swp.swap(box_cp);
666  new_map_swp.swap(new_map);
667  }
668 
669  inline void swap_local_pack()
670  {
671  index_ptrs_swp.swap(index_ptrs);
672  scan_ptrs_swp.swap(scan_ptrs);
673  data_ptrs_swp.swap(data_ptrs);
674  offset_ptrs_swp.swap(offset_ptrs);
675  mask_ptrs_swp.swap(mask_ptrs);
676 
677  e_points_swp.swap(e_points);
678  pack_output_swp.swap(pack_output);
679  tmp_swp.swap(tmp);
680 
681  pack_subs_swp.swap(pack_subs);
682  index_size_swp = private_get_index_array().size();
683  }
684 
685  inline void swap_remote_pack()
686  {
687  index_ptrs_swp_r.swap(index_ptrs);
688  scan_ptrs_swp_r.swap(scan_ptrs);
689  data_ptrs_swp_r.swap(data_ptrs);
690  offset_ptrs_swp_r.swap(offset_ptrs);
691  mask_ptrs_swp_r.swap(mask_ptrs);
692 
693  e_points_swp_r.swap(e_points);
694  pack_output_swp_r.swap(pack_output);
695  tmp_swp_r.swap(tmp);
696 
697  pack_subs_swp_r.swap(pack_subs);
698  //req_index_swp_r = req_index;
699  index_size_swp_r = private_get_index_array().size();
700  }
701 
702 protected:
703  static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
704  typedef AggregateBlockT AggregateInternalT;
705 
706 public:
707 
709  typedef int yes_i_am_grid;
710 
711  static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
712 
713  typedef linearizer grid_info;
714 
715  typedef linearizer linearizer_type;
716 
717  template<typename Tfunc> using layout_mfunc = memory_traits_inte<Tfunc>;
718 
720 
721  typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
722 
728  inline size_t size() const
729  {
730  return this->countExistingElements();
731  }
732 
738  template <typename stencil = no_stencil>
740  {
742  }
743 
750  {
751  return SparseGridGpu_iterator<dim,self>(std::declval<self>());
752  }
753 
754  template<typename dim3T>
755  inline static int dim3SizeToInt(dim3T d)
756  {
757  return d.x * d.y * d.z;
758  }
759 
760  inline static int dim3SizeToInt(size_t d)
761  {
762  return d;
763  }
764 
765  inline static int dim3SizeToInt(unsigned int d)
766  {
767  return d;
768  }
769 
770  template<typename ... v_reduce>
771  void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
772  {
773  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
774  ::template flush<v_reduce ...>(context, opt);
775 
776  findNN = false;
777  }
778 
779 
780 
781  void saveUnpackVariableIfNotKeepGeometry(int opt, bool is_unpack_remote)
782  {
783  if (is_unpack_remote == true)
784  {swap_internal_remote();}
785 
786  if (is_unpack_remote == false)
787  {swap_internal_local();}
788  }
789 
790  void RestoreUnpackVariableIfKeepGeometry(int opt, bool is_unpack_remote)
791  {
792  if (opt & KEEP_GEOMETRY && is_unpack_remote == true)
793  {swap_internal_remote();}
794 
795  if (opt & KEEP_GEOMETRY && is_unpack_remote == false)
796  {swap_internal_local();}
797  }
798 
799 
800  void savePackVariableIfNotKeepGeometry(int opt, bool is_pack_remote)
801  {
802  if (is_pack_remote == false)
803  {
804  swap_local_pack();
805  req_index_swp = req_index;
806  }
807 
808  if (is_pack_remote == true)
809  {
810  swap_remote_pack();
811  req_index_swp_r = req_index;
812  }
813  }
814 
815  void RestorePackVariableIfKeepGeometry(int opt, bool is_pack_remote)
816  {
817  if (opt & KEEP_GEOMETRY && is_pack_remote == false)
818  {
819  swap_local_pack();
820  req_index = req_index_swp;
821  }
822 
823  if (opt & KEEP_GEOMETRY && is_pack_remote == true)
824  {
825  swap_remote_pack();
826  req_index = req_index_swp_r;
827  }
828  }
829 
830  template<unsigned int n_it>
831  void calculatePackingPointsFromBoxes(int opt,size_t tot_pnt)
832  {
833  if (!(opt & KEEP_GEOMETRY))
834  {
835  auto & indexBuffer = private_get_index_array();
836  auto & dataBuffer = private_get_data_array();
837 
838  e_points.resize(tot_pnt);
839  pack_output.resize(tot_pnt);
840 
841  ite_gpu<1> ite;
842 
843  ite.wthr.x = indexBuffer.size();
844  ite.wthr.y = 1;
845  ite.wthr.z = 1;
846  ite.thr.x = getBlockSize();
847  ite.thr.y = 1;
848  ite.thr.z = 1;
849 
850  // Launch a kernel that counts the number of elements in each chunk
851  CUDA_LAUNCH((SparseGridGpuKernels::get_exist_points_with_boxes<dim,
852  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
853  n_it,
854  indexT>),
855  ite,
856  indexBuffer.toKernel(),
857  pack_subs.toKernel(),
858  gridGeometry,
859  dataBuffer.toKernel(),
860  pack_output.toKernel(),
861  tmp.toKernel(),
862  scan_it.toKernel(),
863  e_points.toKernel());
864  }
865  }
866 
867 private:
868 
869  void computeSizeOfGhostLayer()
870  {
871  unsigned int term1 = 1;
872  for (int i = 0; i < dim; ++i)
873  {
874  term1 *= blockEdgeSize + 2 * stencilSupportRadius;
875  }
876  unsigned int term2 = 1;
877  for (int i = 0; i < dim; ++i)
878  {
879  term2 *= blockEdgeSize;
880  }
881  ghostLayerSize = term1 - term2;
882  }
883 
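// Worked example (not part of the original source): computeSizeOfGhostLayer above returns
// (blockEdgeSize + 2*stencilSupportRadius)^dim - blockEdgeSize^dim, i.e. the number of ghost
// points wrapped around one chunk. For the 3D defaults (blockEdgeSize = 8, radius = 1):
//
//   ghostLayerSize = 10*10*10 - 8*8*8 = 1000 - 512 = 488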
884  void allocateGhostLayerMapping()
885  {
886  ghostLayerToThreadsMapping.resize(ghostLayerSize);
887  }
888 
889  template<typename stencil_type>
890  void computeGhostLayerMapping()
891  {
892  size_t dimensions[dim],
893  origin[dim],
894  innerDomainBegin[dim], innerDomainEnd[dim],
895  outerBoxBegin[dim], outerBoxEnd[dim],
896  bc[dim];
897  for (int i = 0; i < dim; ++i)
898  {
899  dimensions[i] = blockEdgeSize + 2 * stencilSupportRadius;
900  origin[i] = 0;
901  innerDomainBegin[i] = stencilSupportRadius - 1;
902  innerDomainEnd[i] = dimensions[i] - stencilSupportRadius;
903  outerBoxBegin[i] = origin[i];
904  outerBoxEnd[i] = dimensions[i];
905  bc[i] = NON_PERIODIC;
906  }
907  grid_sm<dim, void> enlargedGrid;
908  enlargedGrid.setDimensions(dimensions);
909  Box<dim, size_t> outerBox(outerBoxBegin, outerBoxEnd);
910  Box<dim, size_t> innerBox(innerDomainBegin, innerDomainEnd);
911 
912  grid_skin_iterator_bc<dim> gsi(enlargedGrid, innerBox, outerBox, bc);
913 
914  unsigned int i = 0;
915  while (gsi.isNext())
916  {
917  auto coord = gsi.get();
918  assert(i < ghostLayerSize);
919  mem_id linId = enlargedGrid.LinId(coord);
920  // Mapping
921  ghostLayerToThreadsMapping.template get<gt>(i) = linId;
922  // Now compute the neighbour position to use
923  ghostLayerToThreadsMapping.template get<nt>(i) = stencil_type::template getNNskin<indexT,blockEdgeSize>(coord,stencilSupportRadius);
924  //
925  ++i;
926  ++gsi;
927  }
928  assert(i == ghostLayerSize);
929 
930  ghostLayerToThreadsMapping.template hostToDevice<gt,nt>();
931  }
932 
933  void initialize(const size_t (& res)[dim])
934  {
935  gridGeometry = linearizer(res);
936 
937  computeSizeOfGhostLayer();
938  allocateGhostLayerMapping();
939  computeGhostLayerMapping<NNStar<dim>>();
940 
941  size_t extBlockDims[dim];
942  for (int d=0; d<dim; ++d)
943  {
944  extBlockDims[d] = blockEdgeSize + 2*stencilSupportRadius;
945  }
946  extendedBlockGeometry.setDimensions(extBlockDims);
947 
948  gridSize.setDimensions(res);
949  }
950 
951 
952  template <typename stencil, typename... Args>
953  void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode,Args... args)
954  {
955  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
956  auto & indexBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
957  auto & dataBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
958 
959  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
960  unsigned int numScalars = indexBuffer_.size() * dataChunkSize;
961 
962  if (numScalars == 0) return;
963 
964  // NOTE: Here we want to work only on one data chunk per block!
965  constexpr unsigned int chunksPerBlock = 1;
966  const unsigned int localThreadBlockSize = dataChunkSize * chunksPerBlock;
967  const unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
968  ? numScalars / localThreadBlockSize
969  : 1 + numScalars / localThreadBlockSize;
970 
971  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * chunksPerBlock)>::value; // todo: This works only for stencilSupportSize==1
972 
973 #ifdef CUDIFY_USE_CUDA
974 
975 
976  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
977  <dim,
978  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
979  stencil>),
980  threadGridSize, localThreadBlockSize,
981  box,
982  indexBuffer_.toKernel(),
983  dataBuffer_.toKernel(),
984  this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
985  args...);
986 
987 #else
988 
989  auto bx = box;
990  auto indexBuffer = indexBuffer_.toKernel();
991  auto dataBuffer = dataBuffer_.toKernel();
992  auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();
993 
995 
996  auto lamb = [=] __device__ () mutable
997  {
998  constexpr unsigned int pIndex = 0;
999 
1000  typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
1001  typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1002 
1003  typedef typename decltype(dataBuffer)::value_type AggregateT_;
1004  typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
1005  typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
1006  constexpr unsigned int blockSize = MaskBlockT::size;
1007 
1008  // NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
1009  // since we will also be loading neighbouring elements!) (beware the curse of dimensionality...)
1010  const unsigned int dataBlockPos = blockIdx.x;
1011  const unsigned int offset = threadIdx.x;
1012 
1013  if (dataBlockPos >= indexBuffer.size())
1014  {
1015  return;
1016  }
1017 
1018  auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
1019 
1020  // todo: Add management of RED-BLACK stencil application! :)
1021  const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1022  grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1023 
1024  unsigned char curMask;
1025 
1026  if (offset < blockSize)
1027  {
1028  // Read local mask to register
1029  curMask = dataBlockLoad.template get<pMask>()[offset];
1030  for (int i = 0 ; i < dim ; i++)
1031  {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
1032  }
1033 
1034  openfpm::sparse_index<unsigned int> sdataBlockPos;
1035  sdataBlockPos.id = dataBlockPos;
1036 
1037  stencil::stencil(
1038  sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1039  curMask, args...);
1040  };
1041 
1042  CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize,lamb);
1043 
1044 #endif
1045 
1046  }
1047 
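// Worked example (not part of the original source): applyStencilInPlace above launches one CUDA
// block per data chunk, with one thread per scalar of the chunk. With the 3D defaults
// (blockEdgeSize = 8, so dataChunkSize = 512) and, say, 10 stored chunks:
//
//   numScalars           = 10 * 512 = 5120
//   localThreadBlockSize = 512               // chunksPerBlock == 1
//   threadGridSize       = 5120 / 512 = 10   // rounded up when not evenly divisible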
1048  template <typename stencil, typename... Args>
1049  void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
1050  {
1051  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
1052  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
1053  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1054 
1055  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
1056  unsigned int numScalars = indexBuffer.size() * dataChunkSize;
1057 
1058  if (numScalars == 0) return;
1059 
1060  auto ite = e_points.getGPUIterator(BLOCK_SIZE_STENCIL);
1061 
1062  CUDA_LAUNCH((SparseGridGpuKernels::applyStencilInPlaceNoShared
1063  <dim,
1064  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1065  stencil>),
1066  ite,
1067  box,
1068  indexBuffer.toKernel(),
1069  dataBuffer.toKernel(),
1070  this->template toKernelNN<stencil::stencil_type::nNN, 0>(),
1071  args...);
1072  }
1073 
1074  template<typename ids_type>
1075  void fill_chunks_boxes(openfpm::vector<SpaceBox<dim,double>> & chunks_box, ids_type & chunk_ids, Point<dim,double> & spacing, Point<dim,double> & offset)
1076  {
1077  for (int i = 0 ; i < chunk_ids.size() ; i++)
1078  {
1079  SpaceBox<dim,double> box;
1080 
1081  auto c_pos = gridGeometry.InvLinId(chunk_ids.template get<0>(i)*blockSize);
1082 
1083  for (int j = 0 ; j < dim ; j++)
1084  {
1085  box.setLow(j,c_pos.get(j) * spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1086  box.setHigh(j,(c_pos.get(j) + blockEdgeSize)*spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1087  }
1088 
1089  chunks_box.add(box);
1090  }
1091  }
1092 
1093  template<typename MemType, unsigned int ... prp>
1094  void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, mgpu::ofp_context_t & ctx, int opt)
1095  {
1096  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1097  {
1098  // Convert the packed chunk ids
1099 
1100  prAlloc_prp->reset();
1101  Unpack_stat ups;
1102 
1103  for (size_t i = 0 ; i < copySect.size() ; i++)
1104  {
1105  auto sub_it = this->getIterator(copySect.get(i).dst.getKP1(),copySect.get(i).dst.getKP2(),NO_ITERATOR_INIT);
1106 
1107  copySect.get(i).grd->template addAndConvertPackedChunkToTmp<prp ...>(*prAlloc_prp,sub_it,ups,ctx);
1108  }
1109  }
1110  }
1111 
1112 
1113  template<unsigned int ... prp>
1114  void removeCopyToFinalize_phase1(mgpu::ofp_context_t & ctx, int opt)
1115  {
1116  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1117  {removePoints(ctx);}
1118  }
1119 
1120  template<unsigned int ... prp>
1121  void removeCopyToFinalize_phase2(mgpu::ofp_context_t & ctx, int opt)
1122  {
1123  // Pack information
1124  Pack_stat sts;
1125 
1126  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1127  {
1128  this->packReset();
1129 
1130  size_t req = 0;
1131  // First we count the points to copy (as source)
1132 
1133  for (size_t i = 0 ; i < copySect.size() ; i++)
1134  {
1135  auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1136 
1137  this->packRequest(sub_it,req);
1138  }
1139 
1140  this->template packCalculate<prp...>(req,ctx);
1141 
1142  mem.resize(req);
1143 
1144  // Create an object of preallocated memory for properties
1145  prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1146  prAlloc_prp->incRef();
1147 
1148  for (size_t i = 0 ; i < copySect.size() ; i++)
1149  {
1150  auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1151 
1152  this->pack<prp ...>(*prAlloc_prp,sub_it,sts);
1153  }
1154  }
1155  else
1156  {
1157  size_t req = mem.size();
1158 
1159  // Create an object of preallocated memory for properties
1160  prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1161  prAlloc_prp->incRef();
1162  }
1163 
1164  this->template packFinalize<prp ...>(*prAlloc_prp,sts,opt,false);
1165 
1166  preUnpack<CudaMemory,prp ...>(prAlloc_prp,ctx,opt);
1167 
1168  prAlloc_prp->decRef();
1169  delete prAlloc_prp;
1170  }
1171 
1172  template<unsigned int ... prp>
1173  void removeCopyToFinalize_phase3(mgpu::ofp_context_t & ctx, int opt, bool is_unpack_remote)
1174  {
1175  ite_gpu<1> ite;
1176 
1177  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1178  {
1179  if (tmp2.size() == 0)
1180  {return;}
1181 
1182  // Fill the add buffer given tmp and then flush
1183 
1184  setGPUInsertBuffer(tmp2.size(),1ul);
1185 
1186  auto & add_buff = this->blockMap.private_get_vct_add_index();
1187  add_buff.swap(tmp2);
1188 
1189  auto & nadd_buff = this->blockMap.private_get_vct_nadd_index();
1190  ite = nadd_buff.getGPUIterator();
1191  CUDA_LAUNCH(SparseGridGpuKernels::set_one,ite,nadd_buff.toKernel());
1192 
1193  int sz_b = this->private_get_index_array().size();
1194 
1195  this->template flush<sLeft_<prp>...>(ctx,flush_type::FLUSH_ON_DEVICE);
1196 
1197  // get the map of the new inserted elements
1198 
1199  auto & m_map = this->getMergeIndexMapVector();
1200  auto & a_map = this->getMappingVector();
1201  auto & o_map = this->getSegmentToOutMap();
1202  auto & segments_data = this->getSegmentToMergeIndexMap();
1203 
1204  new_map.resize(a_map.size(),0);
1205 
1206  // construct new to old map
1207 
1208  ite = segments_data.getGPUIterator();
1209 
1210  if (ite.nblocks() != 0)
1211  CUDA_LAUNCH(SparseGridGpuKernels::construct_new_chunk_map<1>,ite,new_map.toKernel(),a_map.toKernel(),m_map.toKernel(),o_map.toKernel(),segments_data.toKernel(),sz_b);
1212 
1213  convert_blk.template hostToDevice<0>();
1214  }
1215  else
1216  {
1217  ite.wthr.x = 1;
1218  ite.wthr.y = 1;
1219  ite.wthr.z = 1;
1220 
1221  ite.thr.x = 1;
1222  ite.thr.y = 1;
1223  ite.thr.z = 1;
1224  }
1225 
1226  // Restore
1227  RestoreUnpackVariableIfKeepGeometry(opt,is_unpack_remote);
1228 
1229  // for each packed chunk
1230 
1231  size_t n_accu_cnk = 0;
1232  for (size_t i = 0 ; i < n_cnk_cp.size() ; i++)
1233  {
1234  arr_arr_ptr<1,sizeof...(prp)> data;
1235  size_t n_pnt = n_pnt_cp.get(i);
1236 
1237  void * data_base_ptr = data_base_ptr_cp.get(i);
1238  data_ptr_fill<AggregateT,1,prp...> dpf(data_base_ptr,0,data,n_pnt);
1239  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1240 
1241  ite.wthr.x = n_cnk_cp.get(i);
1242 
1243  // calculate best number of threads
1244  Box<dim,size_t> ub = box_cp.get(i);
1245 
1246  ite.thr.x = 1;
1247  for (int j = 0 ; j < dim ; j++)
1248  {
1249  size_t l = ub.getHigh(j) - ub.getLow(j) + 1;
1250 
1251  if (l >= blockEdgeSize)
1252  {ite.thr.x *= blockEdgeSize;}
1253  else
1254  {ite.thr.x *= l;}
1255  }
1256 
1257  // copy to new (1 block for each packed chunk)
1258  if (ite.nblocks() != 0 && ite.thr.x != 0)
1259  {
1260  auto & chunks = private_get_data_array();
1261 
1262  CUDA_LAUNCH((SparseGridGpuKernels::copy_packed_data_to_chunks<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1263  AggregateT,decltype(convert_blk.toKernel()),decltype(new_map.toKernel()),
1264  decltype(data),decltype(chunks.toKernel()),prp... >),ite,
1265  (unsigned int *)scan_ptrs_cp.get(i),
1266  (unsigned short int *)offset_ptrs_cp.get(i),
1267  convert_blk.toKernel(),
1268  new_map.toKernel(),
1269  data,
1270  chunks.toKernel(),
1271  n_cnk_cp.get(i),
1272  n_shifts_cp.get(i),
1273  n_pnt_cp.get(i),
1274  i,
1275  n_accu_cnk);
1276  }
1277 
1278  n_accu_cnk += n_cnk_cp.get(i)*n_shifts_cp.get(i);
1279  }
1280 
1281  // Save
1282  saveUnpackVariableIfNotKeepGeometry(opt,is_unpack_remote);
1283  }
1284 
1285  template<unsigned int n_it, unsigned int ... prp>
1286  void pack_sg_implement(ExtPreAlloc<CudaMemory> & mem,
1287  Pack_stat & sts,
1288  int opt,
1289  bool is_pack_remote)
1290  {
1291  arr_ptr<n_it> index_ptr;
1292  arr_arr_ptr<n_it,sizeof...(prp)> data_ptr;
1293  arr_ptr<n_it> scan_ptr;
1294  arr_ptr<n_it> offset_ptr;
1295  arr_ptr<n_it> mask_ptr;
1297 
1298  auto & indexBuffer = private_get_index_array();
1299  auto & dataBuffer = private_get_data_array();
1300 
1301  if (req_index != pack_subs.size())
1302  {std::cerr << __FILE__ << ":" << __LINE__ << " error: the packing request number differs from the number of packed objects " << req_index << " " << pack_subs.size() << std::endl;}
1303 
1304  size_t tot_pnt = 0;
1305  size_t tot_cnk = 0;
1306 
1307  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1308  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1309 
1310  // Calculate total points
1311 
1312  for (size_t i = 0 ; i < pack_subs.size() ; i++)
1313  {
1314  size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
1315  sar.sa[i] = n_pnt;
1316  tot_pnt += n_pnt;
1317  }
1318 
1319  // CUDA requires aligned access; here we assume 8-byte alignment and ensure 8-byte alignment after
1320  // the cycle
1321  for (size_t i = 0 ; i < pack_subs.size() ; i++)
1322  {
1323  size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
1324 
1325  // fill index_ptr data_ptr scan_ptr
1326  index_ptr.ptr[i] = index_ptrs.get(i);
1327  scan_ptr.ptr[i] = scan_ptrs.get(i);
1328 
1329  // for all properties fill the data pointer
1330 
1331  data_ptr_fill<AggregateT,n_it,prp...> dpf(data_ptrs.get(i),i,data_ptr,tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1));
1332  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1333 
1334  offset_ptr.ptr[i] = offset_ptrs.get(i);
1335  mask_ptr.ptr[i] = mask_ptrs.get(i);
1336 
1337  tot_cnk += n_cnk;
1338  }
1339 
1340  ite_gpu<1> ite;
1341 
1342  if (tot_pnt != 0)
1343  {
1344  calculatePackingPointsFromBoxes<n_it>(opt,tot_pnt);
1345 
1346  ite = e_points.getGPUIterator();
1347 
1348  // Here we copy the array of pointers to the properties into a CudaMemory array
1349 
1350  CudaMemory mem;
1351  mem.allocate(sizeof(data_ptr));
1352 
1353  // copy
1354  arr_arr_ptr<n_it,sizeof...(prp)> * arr_data = (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getPointer();
1355 
1356  for(int i = 0 ; i < n_it ; i++)
1357  {
1358  for (int j = 0 ; j < sizeof...(prp) ; j++)
1359  {
1360  arr_data->ptr[i][j] = data_ptr.ptr[i][j];
1361  }
1362  }
1363 
1364  mem.hostToDevice();
1365 
1366  CUDA_LAUNCH((SparseGridGpuKernels::pack_data<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1367  AggregateT,
1368  n_it,
1369  sizeof...(prp),
1370  indexT,
1371  decltype(e_points.toKernel()),
1372  decltype(pack_output.toKernel()),
1373  decltype(indexBuffer.toKernel()),
1374  decltype(dataBuffer.toKernel()),
1375  decltype(tmp.toKernel()),
1376  self::blockSize,
1377  prp ...>),
1378  ite,
1379  e_points.toKernel(),
1380  dataBuffer.toKernel(),
1381  indexBuffer.toKernel(),
1382  tmp.toKernel(),
1383  pack_output.toKernel(),
1384  index_ptr,
1385  scan_ptr,
1386  (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getDevicePointer(),
1387  offset_ptr,
1388  mask_ptr,
1389  sar);
1390  }
1391 
1392  ite.wthr.x = 1;
1393  ite.wthr.y = 1;
1394  ite.wthr.z = 1;
1395  ite.thr.x = pack_subs.size();
1396  ite.thr.y = 1;
1397  ite.thr.z = 1;
1398 
1399  if (pack_subs.size() != 0)
1400  {CUDA_LAUNCH(SparseGridGpuKernels::last_scan_point,ite,scan_ptr,tmp.toKernel(),indexBuffer.size()+1,pack_subs.size());}
1401  }
1402 
1403 
1413  template<unsigned int ... prp, typename S2>
1414  void addAndConvertPackedChunkToTmp(ExtPreAlloc<S2> & mem,
1415  SparseGridGpu_iterator_sub<dim,self> & sub_it,
1416  Unpack_stat & ps,
1417  mgpu::ofp_context_t &context)
1418  {
1419  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1420  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1421 
1422  // First get the number of chunks
1423 
1424  size_t n_cnk;
1425 
1426  // Unpack the number of chunks
1427  mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
1428  Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
1429 
1430  grid_key_dx<dim,int> origPack_pnt;
1431  grid_key_dx<dim,int> origPack_cnk;
1432  size_t sz[dim];
1433 
1434  // Unpack origin of the chunk indexing
1435  for (int i = 0 ; i < dim ; i++)
1436  {
1437  int tmp;
1438  Unpacker<int,S2>::unpack(mem,tmp,ps);
1439  origPack_cnk.set_d(i,((int)(tmp / blockEdgeSize))*blockEdgeSize);
1440  origPack_pnt.set_d(i,tmp);
1441  }
1442 
1443  for (int i = 0 ; i < dim ; i++)
1444  {
1445  int tmp;
1446  Unpacker<int,S2>::unpack(mem,tmp,ps);
1447  sz[i] = tmp;
1448  }
1449 
1450  size_t actual_offset = n_cnk*sizeof(indexT);
1451  // get the id pointers
1452  indexT * ids = (indexT *)((unsigned char *)mem.getDevicePointer() + ps.getOffset());
1453  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
1454 
1455  mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
1456  ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
1457 
1458 
1459 
1460  // Unpack number of points
1461  // calculate the number of total points
1462  size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
1463  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
1464 
1465  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
1466 
1467  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
1468 
1469  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
1470 
1471  offset_ptrs_cp.add(offsets);
1472  scan_ptrs_cp.add(scan);
1473  n_cnk_cp.add(n_cnk);
1474  n_pnt_cp.add(n_pnt);
1475  data_base_ptr_cp.add(data_base_ptr);
1476 
1477  Box<dim,size_t> bx;
1478 
1479  for (int i = 0 ; i < dim ; i++)
1480  {
1481  bx.setLow(i,sub_it.getStart().get(i));
1482  bx.setHigh(i,sub_it.getStop().get(i));
1483  }
1484 
1485  box_cp.add(bx);
1486 
1487  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
1488 
1489  if (n_cnk != 0)
1490  {
1491  shifts.clear();
1492 
1493  int n_shift = 1;
1494  shifts.add();
1495 
1496  for (int i = 0 ; i < dim ; i++)
1497  {shifts.last().template get<0>()[i] = 0;}
1498 
1499  for (int i = 0 ; i < dim ; i++)
1500  {
1501  int op_q = origPack_pnt.get(i) % blockEdgeSize;
1502  int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1503  int quot = abs(ou_q - op_q) % blockEdgeSize;
1504  int squot = openfpm::math::sgn(ou_q - op_q);
1505  if (quot != 0)
1506  {
1507  n_shift *= 2;
1508 
1509  int sz = shifts.size();
1510  for (int j = 0 ; j < sz ; j++)
1511  {
1512  shifts.add();
1513  for (int k = 0 ; k < dim ; k++)
1514  {
1515  shifts.last().template get<0>()[k] = shifts.template get<0>(j)[k] + ((i == k)?squot:0);
1516  }
1517  }
1518  }
1519  }
1520 
1521  shifts.template hostToDevice<0>();
1522 
1523  linearizer gridGeoPack(sz);
1524 
1525  int bs = 0;
1526  size_t sz[1] = {n_cnk};
1527  grid_sm<1,void> g(sz);
1528  auto ite = g.getGPUIterator();
1529 
1530  grid_key_dx<dim,int> sz_g;
1531  grid_key_dx<dim,int> origUnpack_cnk;
1532 
1533  for (int i = 0 ; i < dim ; i++)
1534  {
1535  sz_g.set_d(i,gridGeometry.getSize()[i]);
1536  origUnpack_cnk.set_d(i,(int)(sub_it.getStart().get(i) / blockEdgeSize)*blockEdgeSize);
1537  }
1538 
1539  bs = tmp2.size();
1540  tmp2.resize(tmp2.size() + n_cnk * shifts.size());
1541 
1542  n_shifts_cp.add(shifts.size());
1543 
1544  switch (shifts.size())
1545  {
1546  case 1:
1547  // Calculate for each chunk the indexes where they should go + active points
1548  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,1,indexT>),ite,ids,
1549  n_cnk,
1550  gridGeoPack,origPack_cnk,
1551  gridGeometry,origUnpack_cnk,
1552  tmp2.toKernel(),
1553  shifts.toKernel(),
1554  sz_g,
1555  bs);
1556  break;
1557  case 2:
1558  // Calculate for each chunk the indexes where they should go + active points
1559  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,2,indexT>),ite,ids,
1560  n_cnk,
1561  gridGeoPack,origPack_cnk,
1562  gridGeometry,origUnpack_cnk,
1563  tmp2.toKernel(),
1564  shifts.toKernel(),
1565  sz_g,
1566  bs);
1567  break;
1568  case 4:
1569  // Calculate for each chunk the indexes where they should go + active points
1570  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,4,indexT>),ite,ids,
1571  n_cnk,
1572  gridGeoPack,origPack_cnk,
1573  gridGeometry,origUnpack_cnk,
1574  tmp2.toKernel(),
1575  shifts.toKernel(),
1576  sz_g,
1577  bs);
1578  break;
1579  case 8:
1580  // Calculate for each chunk the indexes where they should go + active points
1581  CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,8,indexT>),ite,ids,
1582  n_cnk,
1583  gridGeoPack,origPack_cnk,
1584  gridGeometry,origUnpack_cnk,
1585  tmp2.toKernel(),
1586  shifts.toKernel(),
1587  sz_g,
1588  bs);
1589  break;
1590  }
1591 
1592  convertChunkIds(offsets,origPack_pnt,sub_it);
1593  }
1594  else
1595  {
1596  convert_blk.add();
1597  n_shifts_cp.add(0);
1598  }
1599 
1600  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
1601 
1602  ps.addOffset(actual_offset);
1603  }
1604 
1609  template<typename origPackType, typename IteratorType>
1610  void convertChunkIds(short int * offset, origPackType & origPack, IteratorType & sub_it)
1611  {
1612  int quot_diff[dim];
1613  for (int i = 0 ; i < dim ; i++)
1614  {
1615  int op_q = origPack.get(i) % blockEdgeSize;
1616  int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1617  int quot = abs(ou_q - op_q) % blockEdgeSize;
1618  quot_diff[i] = openfpm::math::sgn(ou_q - op_q)*quot;
1619  }
1620 
1621  convert_blk.add();
1622 
1623  // Create conversion block
1624 
1625  for (int j = 0 ; j < this->blockSize ; j++)
1626  {
1627  int offset = 0;
1628  int bpos = 0;
1629  int bp_c = 1;
1630  int pos = 0;
1631  int pos_c = 1;
1632 
1633  int x = j;
1634  for (int i = 0 ; i < dim ; i++)
1635  {
1636  int c = x % blockEdgeSize;
1637 
1638  if (quot_diff[i] + c < 0)
1639  {
1640  offset += pos_c*(quot_diff[i] + c + blockEdgeSize);
1641  bpos += bp_c*1;
1642  }
1643  else if (quot_diff[i] + c >= blockEdgeSize)
1644  {
1645  offset += pos_c*(quot_diff[i] + c - blockEdgeSize);
1646  bpos += bp_c*1;
1647  }
1648  else
1649  {
1650  offset += pos_c*(quot_diff[i] + c);
1651  }
1652 
1653  pos += pos_c*c;
1654  pos_c *= blockEdgeSize;
1655  bp_c *= (quot_diff[i] != 0)?2:1;
1656  x /= blockEdgeSize;
1657  }
1658 
1659  convert_blk.template get<0>(convert_blk.size()-1)[pos] = (bpos << 16) + offset;
1660  }
1661  }
1662 
1663 public:
1664 
1665  typedef AggregateT value_type;
1666 
1667  typedef self device_grid_type;
1668 
1669  SparseGridGpu()
1670  :stencilSupportRadius(1)
1671  {};
1672 
1678  void resize(size_t (& res)[dim])
1679  {
1680  initialize(res);
1681  }
1682 
1687  SparseGridGpu(const size_t (& res)[dim], unsigned int stencilSupportRadius = 1)
1688  :stencilSupportRadius(stencilSupportRadius)
1689  {
1690  initialize(res);
1691  };
1692 
1697  SparseGridGpu(linearizer & gridGeometry, unsigned int stencilSupportRadius = 1)
1698  : gridGeometry(gridGeometry),
1699  stencilSupportRadius(stencilSupportRadius)
1700  {
1701  // convert to size_t
1702  size_t sz_st[dim];
1703 
1704  for (int i = 0 ; i < dim ; i++) {sz_st[i] = gridGeometry.getSize()[i];}
1705 
1706  initialize(sz_st);
1707  };
1708 
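// Minimal usage sketch (illustrative only, not part of the original source). It assumes the
// OpenFPM headers above, a GPU context `ctx` of type mgpu::ofp_context_t, a user-written
// insertion kernel `my_insert_kernel` working against the toKernel() interface, and the
// reduction functor smax_ from the flush machinery:
//
//   size_t sz[3] = {512,512,512};
//   SparseGridGpu<3, aggregate<float>> grid(sz);      // defaults: 8x8x8 chunks, radius-1 ghost
//   grid.setGPUInsertBuffer(nBlocks, nSlotsPerBlock); // reserve insert slots
//   // CUDA_LAUNCH(my_insert_kernel, ite, grid.toKernel(), ...);
//   // grid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);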
1709  SparseGridGpu_ker
1710 <
1711  dim,
1712  blockEdgeSize,
1713  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1714  ct_par<0,1>,
1715  indexT,
1716  layout_base,
1717  decltype(extendedBlockGeometry),
1718  linearizer,
1719  AggregateT
1720  > toKernel()
1721  {
1722  SparseGridGpu_ker
1723 <
1724  dim,
1725  blockEdgeSize,
1726  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1727  ct_par<0,1>,
1728  indexT,
1729  layout_base,
1730  decltype(extendedBlockGeometry),
1731  linearizer,
1732  AggregateT
1733  > toKer(
1735  gridGeometry,
1736  extendedBlockGeometry,
1737  stencilSupportRadius,
1738  ghostLayerToThreadsMapping.toKernel(),
1739  nn_blocks.toKernel(),
1740  e_points.toKernel(),
1741  ghostLayerSize,
1742  bck);
1743  return toKer;
1744  }
1745 
1746  template<unsigned int nNN, unsigned int nLoop>
1747  SparseGridGpu_ker
1748 <
1749  dim,
1750  blockEdgeSize,
1751  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1752  ct_par<nNN,nLoop>,
1753  indexT,
1754  layout_base,
1755  decltype(extendedBlockGeometry),
1756  linearizer,
1757  AggregateT
1758  > toKernelNN()
1759  {
1760  SparseGridGpu_ker
1761 <
1762  dim,
1763  blockEdgeSize,
1764  typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1765  ct_par<nNN,nLoop>,
1766  indexT,
1767  layout_base,
1768  decltype(extendedBlockGeometry),
1769  linearizer,
1770  AggregateT
1771  > toKer(
1773  gridGeometry,
1774  extendedBlockGeometry,
1775  stencilSupportRadius,
1776  ghostLayerToThreadsMapping.toKernel(),
1777  nn_blocks.toKernel(),
1778  e_points.toKernel(),
1779  ghostLayerSize,
1780  bck);
1781  return toKer;
1782  }
1783 
1787  void clear()
1788  {
1789  BMG::clear();
1790  }
1791 
1792  /* \brief Does nothing
1793  *
1794  */
1795  void setMemory()
1796  {}
1797 
1803  linearizer & getGrid()
1804  {
1805  return gridGeometry;
1806  }
1807 
1813  template<typename stencil_type>
1814  void setNNType()
1815  {
1816  computeGhostLayerMapping<stencil_type>();
1817  }
1818 
1819 
1820  constexpr static unsigned int getBlockEdgeSize()
1821  {
1822  return blockEdgeSize;
1823  }
1824 
1825  constexpr unsigned int getBlockSize() const
1826  {
1827  return blockSize;
1828  }
1829 
1830  // Geometry
1831  template<typename CoordT>
1832  inline size_t getLinId(CoordT &coord)
1833  {
1834  return gridGeometry.LinId(coord);
1835  }
1836 
1837  inline grid_key_dx<dim, int> getCoord(size_t linId) const
1838  {
1839  return gridGeometry.InvLinId(linId);
1840  }
1841 
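// Illustrative note (not part of the original source): getLinId and getCoord above are the two
// directions of the same chunk-wise linearization provided by the linearizer (grid_smb by
// default), so for any in-range key the round trip is the identity:
//
//   // grid_key_dx<dim,int> k2 = grid.getCoord(grid.getLinId(k));   // k2 == k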
1842  inline ite_gpu<dim> getGridGPUIterator(const grid_key_dx<dim, int> & start, const grid_key_dx<dim, int> & stop, size_t n_thr = threadBlockSize)
1843  {
1844  return gridSize.getGPUIterator(start,stop,n_thr);
1845  }
1846 
1856  template<typename CoordT>
1858  {
1859  base_key k(*this,0,0);
1860 
1861  const auto & blockMap = this->private_get_blockMap();
1862 
1863  auto glid = gridGeometry.LinId(coord);
1864 
1865  auto bid = glid / blockSize;
1866  auto lid = glid % blockSize;
1867 
1868  auto key = blockMap.get_sparse(bid);
1869 
1870  k.set_cnk_pos_id(key.id);
1871  k.set_data_id(lid);
1872 
1873  return k;
1874  }
1875 
1885  template<unsigned int p, typename CoordT>
1886  auto get(const grid_key_dx<dim,CoordT> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1887  {
1888  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template get<p>(gridGeometry.LinId(coord));
1889  }
1890 
1900  template<unsigned int p>
1901  auto get(const sparse_grid_gpu_index<self> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1902  {
1903  return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1904  }
1905 
1911  auto private_get_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer())
1912  {
1913  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
1914  }
1915 
1921  auto private_get_data_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer())
1922  {
1923  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1924  }
1925 
1933  template<typename CoordT>
1934  auto get_o(const grid_key_dx<dim,CoordT> & coord) const -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::get(0))>::type >
1935  {
1936  int offset;
1937  indexT lin;
1938  gridGeometry.LinId(coord,lin,offset);
1939 
1941  }
1942 
1950  auto get_o(const sparse_grid_gpu_index<self> & coord) const -> encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >
1951  {
1952  return encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >(coord.get_data_id(),private_get_data_array().get(coord.get_cnk_pos_id()));
1953  }
1954 
1960  bool isSkipLabellingPossible()
1961  {
1962  return (index_size_swp_r == private_get_index_array().size()) && (index_size_swp == private_get_index_array().size());
1963  }
1964 
1974  template<unsigned int p>
1975  auto get(const sparse_grid_gpu_index<self> & coord) -> ScalarTypeOf<AggregateBlockT, p> &
1976  {
1977  return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1978  }
1979 
1987  unsigned char getFlag(const sparse_grid_gpu_index<self> & coord) const
1988  {
1989  return private_get_data_array().template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(coord.get_cnk_pos_id())[coord.get_data_id()];
1990  }
1991 
1992  template<unsigned int p, typename CoordT>
1993  auto insert(const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &
1994  {
1995  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template insert<p>(gridGeometry.LinId(coord));
1996  }
1997 
1998  template<typename CoordT>
1999  auto insert_o(const CoordT &coord) -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insert_o(0))>::type >
2000  {
2001  indexT ind;
2002  int offset;
2003  gridGeometry.LinId(coord,ind,offset);
2004 
2006  }
2007 
2014  void construct_link(self & grid_up, self & grid_dw, mgpu::ofp_context_t &context)
2015  {
2016 /* // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2017  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2018  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2019 
2020  ite_gpu<1> ite;
2021 
2022  ite.wthr.x = indexBuffer.size();
2023  ite.wthr.y = 1;
2024  ite.wthr.z = 1;
2025 
2026  ite.thr.x = getBlockSize();
2027  ite.thr.y = 1;
2028  ite.thr.z = 1;
2029 
2030  openfpm::vector_gpu<aggregate<unsigned int>> output;
2031  output.resize(indexBuffer.size() + 1);
2032 
2033  CUDA_LAUNCH((SparseGridGpuKernels::link_construct<dim,
2034  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2035  blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel());
2036 
2037 
2038  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2039 
2040  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2041 
2042  unsigned int np_lup = output.template get<0>(output.size()-1);
2043 
2044  links_up.resize(np_lup);
2045 
2046  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert<dim,
2047  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2048  blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel(),links_up.toKernel());
2049 
2050 */
2051  }
2052 
2059  {
2060  return link_dw_scan;
2061  }
2062 
2069  {
2070  return link_dw;
2071  }
2072 
2079  {
2080  return link_up_scan;
2081  }
2082 
2089  {
2090  return link_up;
2091  }
2092 
2101  void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, mgpu::ofp_context_t &context)
2102  {
2103  Box<dim,int> db = db_;
2104 
2105  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2106  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2107  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2108 
2109  // Count padding points
2110 
2111  // First we count the padding points
2112  ite_gpu<1> ite;
2113 
2114  ite.wthr.x = indexBuffer.size();
2115  ite.wthr.y = 1;
2116  ite.wthr.z = 1;
2117 
2118  ite.thr.x = getBlockSize();
2119  ite.thr.y = 1;
2120  ite.thr.z = 1;
2121 
2122  openfpm::vector_gpu<aggregate<unsigned int>> output;
2123  output.resize(indexBuffer.size()+1);
2124 
2125  output.fill<0>(0);
2126 
2127  CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2128  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2129  blockSize>),ite,this->toKernel(),output.toKernel(),db);
2130 
2131 
2132 
2133  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2134 
2135  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2136  unsigned int padding_points = output.template get<0>(output.size()-1);
2137 
2138  // get the padding points
2139 
2141  pd_points.resize(padding_points);
2142 
2143  CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2144 
2145  // Count number of link down for padding points
2146 
2147  // Calculate ghost
2148 
2149  link_dw_scan.resize(padding_points+1);
2150  link_dw_scan.fill<0>(0);
2151 
2152  ite = link_dw_scan.getGPUIterator();
2153 
2154  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_dw_count<dim,
2155  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2156  blockSize>),
2157  ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),p_dw);
2158 
2159  openfpm::scan((unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),link_dw_scan.size(),(unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),context);
2160 
2161  link_dw_scan.template deviceToHost<0>(link_dw_scan.size()-1,link_dw_scan.size()-1);
2162 
2163  size_t np_ldw = link_dw_scan.template get<0>(link_dw_scan.size()-1);
2164 
2165  link_dw.resize(np_ldw);
2166 
2167  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_dw<dim,
2168  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2169  blockSize>),ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),link_dw.toKernel(),p_dw);
2170 
2171  link_dw_scan.resize(link_dw_scan.size()-1);
2172  }
2173 
2179  void construct_link_up(self & grid_up, const Box<dim,int> & db_, Point<dim,int> p_up, mgpu::ofp_context_t &context)
2180  {
2181  Box<dim,int> db = db_;
2182 
2183  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2184  auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2185  auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2186 
2187  // Count padding points
2188 
2189  // First we count the padding points
2190  ite_gpu<1> ite;
2191 
2192  ite.wthr.x = indexBuffer.size();
2193  ite.wthr.y = 1;
2194  ite.wthr.z = 1;
2195 
2196  ite.thr.x = getBlockSize();
2197  ite.thr.y = 1;
2198  ite.thr.z = 1;
2199 
2200  openfpm::vector_gpu<aggregate<unsigned int>> output;
2201  output.resize(indexBuffer.size()+1);
2202 
2203  output.fill<0>(0);
2204 
2205  CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2206  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2207  blockSize>),ite,this->toKernel(),output.toKernel(),db);
2208 
2209 
2210 
2211  openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2212 
2213  output.template deviceToHost<0>(output.size()-1,output.size()-1);
2214  unsigned int padding_points = output.template get<0>(output.size()-1);
2215 
2216  // get the padding points
2217 
2219  pd_points.resize(padding_points);
2220 
2221  CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2222 
2223  // Count number of link down for padding points
2224 
2225  // Calculate ghost
2226 
2227  link_up_scan.resize(padding_points+1);
2228  link_up_scan.fill<0>(0);
2229 
2230  ite = link_up_scan.getGPUIterator();
2231 
2232  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_up_count<dim,
2233  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2234  blockSize>),
2235  ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),p_up);
2236 
2237  openfpm::scan((unsigned int *)link_up_scan.template getDeviceBuffer<0>(),link_up_scan.size(),(unsigned int *)link_up_scan.template getDeviceBuffer<0>(),context);
2238 
2239  link_up_scan.template deviceToHost<0>(link_up_scan.size()-1,link_up_scan.size()-1);
2240 
2241  size_t np_lup = link_up_scan.template get<0>(link_up_scan.size()-1);
2242 
2243  link_up.resize(np_lup);
2244 
2245  CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_up<dim,
2246  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2247  blockSize>),ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),link_up.toKernel(),p_up);
2248 
2249  link_up_scan.resize(link_up_scan.size()-1);
2250  }
2251 
2258  template<typename dim3T>
2259  void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
2260  {
2262  BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::setGPUInsertBuffer(
2263  dim3SizeToInt(nBlock),
2264  dim3SizeToInt(nSlot)
2265  );
2266  }
2267 
2273  void preFlush()
2274  {
2276  }
2277 
2278  template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
2279  void tagBoundaries(mgpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
2280  {
2281  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2284 
2285  const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
2286  unsigned int numScalars = indexBuffer.size() * dataChunkSize;
2287 
2288  if (numScalars == 0) return;
2289  if (findNN == false)
2290  {
2291  findNeighbours<stencil_type>();
2292  findNN = true;
2293  }
2294 
2295  // NOTE: Here we want to work only on one data chunk per block!
2296 
2297  unsigned int localThreadBlockSize = dataChunkSize;
2298  unsigned int threadGridSize = numScalars % dataChunkSize == 0
2299  ? numScalars / dataChunkSize
2300  : 1 + numScalars / dataChunkSize;
2301 
2302  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * 1)>::value; // todo: This works only for stencilSupportSize==1
2303 // constexpr unsigned int nLoop = IntPow<blockEdgeSize + 2, dim>::value; // todo: This works only for stencilSupportSize==1
2304 
2305  if (stencilSupportRadius == 1)
2306  {
2307  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2308  dim,
2309  1,
2311  stencil_type,
2312  checker_type>),
2313  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2314  }
2315  else if (stencilSupportRadius == 2)
2316  {
2317  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2318  dim,
2319  2,
2321  stencil_type,
2322  checker_type>),
2323  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2324  }
2325  else if (stencilSupportRadius == 0)
2326  {
2327  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2328  dim,
2329  0,
2331  stencil_type,
2332  checker_type>),
2333  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2334  }
2335  else
2336  {
2337  // todo: report this through a proper error/exception mechanism instead of only printing
2338  std::cout << __FILE__ << ":" << __LINE__ << " error: stencilSupportRadius supported only up to 2, passed: " << stencilSupportRadius << std::endl;
2339 
2340  }
2341 
2342  if (opt == tag_boundaries::CALCULATE_EXISTING_POINTS)
2343  {
2344  // first we calculate the existing points
2346 
2347  block_points.resize(indexBuffer.size() + 1);
2348  block_points.template get<0>(block_points.size()-1) = 0;
2349  block_points.template hostToDevice<0>(block_points.size()-1,block_points.size()-1);
2350 
2351  ite_gpu<1> ite;
2352 
2353  ite.wthr.x = indexBuffer.size();
2354  ite.wthr.y = 1;
2355  ite.wthr.z = 1;
2356  ite.thr.x = getBlockSize();
2357  ite.thr.y = 1;
2358  ite.thr.z = 1;
2359 
2360  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
2361  ite,
2362  dataBuffer.toKernel(),
2363  block_points.toKernel());
2364 
2365  // then we scan
2366  openfpm::scan((indexT *)block_points.template getDeviceBuffer<0>(),block_points.size(),(indexT *)block_points.template getDeviceBuffer<0>(),context);
2367 
2368  // Get the total number of points
2369  block_points.template deviceToHost<0>(block_points.size()-1,block_points.size()-1);
2370  size_t tot = block_points.template get<0>(block_points.size()-1);
2371  e_points.resize(tot);
2372 
2373  // we fill e_points
2374  CUDA_LAUNCH((SparseGridGpuKernels::fill_e_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,
2375  dataBuffer.toKernel(),
2376  block_points.toKernel(),
2377  e_points.toKernel())
2378 
2379  }
2380 
2381  cudaDeviceSynchronize();
2382  }
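 // Illustrative usage of tagBoundaries() (editorial sketch, not part of the original source);
 // "sparse_grid" and "ctx" are placeholder names for an already filled grid and an mgpu context:
 //
 //   sparse_grid.template tagBoundaries<NNStar<dim>>(ctx);
 //   // or, to also build the compact list of existing points (e_points):
 //   sparse_grid.template tagBoundaries<NNStar<dim>>(ctx, No_check(),
 //                        tag_boundaries::CALCULATE_EXISTING_POINTS);
 //
 // findNeighbours() is invoked lazily by tagBoundaries() when the neighbour blocks have not been
 // computed yet (findNN == false).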
2383 
2384  template<typename NNtype = NNStar<dim>>
2385  void findNeighbours()
2386  {
2387  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2389 
2390  const unsigned int numBlocks = indexBuffer.size();
2391  const unsigned int numScalars = numBlocks * NNtype::nNN;
2392  nn_blocks.resize(numScalars);
2393 
2394  if (numScalars == 0) return;
2395 
2396  // NOTE: Here we want to work only on one data chunk per block!
2397 
2398  unsigned int localThreadBlockSize = NNtype::nNN;
2399 
2400  unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
2401  ? numScalars / localThreadBlockSize
2402  : 1 + numScalars / localThreadBlockSize;
2403 
2404  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::findNeighbours<dim,NNtype>),
2405  threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
2406 
2407  findNN = true;
2408  }
2409 
2410  size_t countExistingElements() const
2411  {
2412  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2415 
2417  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2418  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2419  constexpr unsigned int blockSize = MaskBlockT::size;
2420  const auto bufferSize = indexBuffer.size();
2421 
2422  size_t numExistingElements = 0;
2423 
2424  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2425  {
2426  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2427  for (size_t elementId=0; elementId<blockSize; ++elementId)
2428  {
2429  const auto curMask = dataBlock.template get<pMask>()[elementId];
2430 
2431  if (this->exist(curMask))
2432  {
2433  ++numExistingElements;
2434  }
2435  }
2436  }
2437 
2438  return numExistingElements;
2439  }
2440 
2441  size_t countBoundaryElements()
2442  {
2443  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2446 
2448  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2449  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2450  constexpr unsigned int blockSize = MaskBlockT::size;
2451  const auto bufferSize = indexBuffer.size();
2452 
2453  size_t numBoundaryElements = 0;
2454 
2455  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2456  {
2457  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2458  for (size_t elementId=0; elementId<blockSize; ++elementId)
2459  {
2460  const auto curMask = dataBlock.template get<pMask>()[elementId];
2461 
2462  if (this->exist(curMask) && this->isPadding(curMask))
2463  {
2464  ++numBoundaryElements;
2465  }
2466  }
2467  }
2468 
2469  return numBoundaryElements;
2470  }
2471 
2472  // Compute mean and standard deviation of the occupancy of the existing blocks (padding points included)
2473  void measureBlockOccupancyMemory(double &mean, double &deviation)
2474  {
2475  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2478 
2480  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2481  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2482  constexpr unsigned int blockSize = MaskBlockT::size;
2483  const auto bufferSize = indexBuffer.size();
2484 
2485  openfpm::vector<double> measures;
2486 
2487  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2488  {
2489  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2490  size_t numElementsInBlock = 0;
2491  for (size_t elementId=0; elementId<blockSize; ++elementId)
2492  {
2493  const auto curMask = dataBlock.template get<pMask>()[elementId];
2494 
2495  if (this->exist(curMask))
2496  {
2497  ++numElementsInBlock;
2498  }
2499  }
2500  double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2501  measures.add(blockOccupancy);
2502  }
2503 
2504  standard_deviation(measures, mean, deviation);
2505  }
2506 
2507  // Compute mean and standard deviation of the occupancy of the existing blocks (padding points excluded)
2508  void measureBlockOccupancy(double &mean, double &deviation)
2509  {
2510  // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2513 
2515  typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2516  typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2517  constexpr unsigned int blockSize = MaskBlockT::size;
2518  const auto bufferSize = indexBuffer.size();
2519 
2520  openfpm::vector<double> measures;
2521 
2522  for (size_t blockId=0; blockId<bufferSize; ++blockId)
2523  {
2524  auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2525  size_t numElementsInBlock = 0;
2526  for (size_t elementId=0; elementId<blockSize; ++elementId)
2527  {
2528  const auto curMask = dataBlock.template get<pMask>()[elementId];
2529 
2530  if (this->exist(curMask) && !this->isPadding(curMask))
2531  {
2532  ++numElementsInBlock;
2533  }
2534  }
2535  double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2536  measures.add(blockOccupancy);
2537  }
2538 
2539  standard_deviation(measures, mean, deviation);
2540  }
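 // Illustrative usage (editorial sketch, not part of the original source): both measures run on the
 // host and fill the mean and the standard deviation of the fraction of occupied slots per chunk;
 // measureBlockOccupancyMemory() counts padding points as well, measureBlockOccupancy() does not:
 //
 //   double mean = 0.0, deviation = 0.0;
 //   sparse_grid.measureBlockOccupancy(mean, deviation);
 //   sparse_grid.measureBlockOccupancyMemory(mean, deviation);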
2541 
2556  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2557  void conv_cross(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2558  {
2559  Box<dim,int> box;
2560 
2561  for (int i = 0 ; i < dim ; i++)
2562  {
2563  box.setLow(i,start.get(i));
2564  box.setHigh(i,stop.get(i));
2565  }
2566 
2567  applyStencils< SparseGridGpuKernels::stencil_cross_func<dim,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2568  }
2569 
2570 
2575  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2576  void conv(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2577  {
2578  Box<dim,int> box;
2579 
2580  for (int i = 0 ; i < dim ; i++)
2581  {
2582  box.setLow(i,start.get(i));
2583  box.setHigh(i,stop.get(i));
2584  }
2585 
2586  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2587 
2588  applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2589  }
2590 
2595  template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2596  void conv_cross_b(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2597  {
2598  Box<dim,int> box;
2599 
2600  for (int i = 0 ; i < dim ; i++)
2601  {
2602  box.setLow(i,start.get(i));
2603  box.setHigh(i,stop.get(i));
2604  }
2605 
2606  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2607 
2608  applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2609  }
2610 
2615  template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2616  void conv2_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2617  {
2618  Box<dim,int> box;
2619 
2620  for (int i = 0 ; i < dim ; i++)
2621  {
2622  box.setLow(i,start.get(i));
2623  box.setHigh(i,stop.get(i));
2624  }
2625 
2626  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2627 
2628  applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2629  }
2630 
2635  template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2636  void conv2(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2637  {
2638  Box<dim,int> box;
2639 
2640  for (int i = 0 ; i < dim ; i++)
2641  {
2642  box.setLow(i,start.get(i));
2643  box.setHigh(i,stop.get(i));
2644  }
2645 
2646  constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2647 
2648  applyStencils< SparseGridGpuKernels::stencil_func_conv2<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2649  }
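 // Editorial sketch (not part of the original source): the conv* family above only builds the box
 // [start,stop] and forwards to applyStencils() with STENCIL_MODE_INPLACE. A call has the shape
 // below; the exact signature of the __device__ lambda depends on the kernel functor used
 // (stencil_cross_func, stencil_cross_func_conv, stencil_func_conv2, ...), so the lambda here is
 // only a placeholder and "sz" is an assumed edge size:
 //
 //   grid_key_dx<3> start(1,1,1);
 //   grid_key_dx<3> stop(sz-2,sz-2,sz-2);
 //   sparse_grid.template conv<0,1,1>(start, stop,
 //        [] __device__ (auto & ... args){ /* user update */ });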
2650 
2656  Box<dim,int> getBox()
2657  {
2658  Box<dim,int> b;
2659 
2660  for (int i = 0 ; i < dim ; i++)
2661  {
2662  b.setLow(i,0);
2663  b.setHigh(i,gridGeometry.getSize()[i]);
2664  }
2665 
2666  return b;
2667  }
2668 
2669  // todo: Move the implementations into a functor for a compile-time choice of the stencil mode
2670  template<typename stencil, typename... Args>
2671  void applyStencils(const Box<dim,int> & box, StencilMode mode, Args... args)
2672  {
2673  if (findNN == false)
2674  {
2675  findNeighbours<typename stencil::stencil_type>();
2676  findNN = true;
2677  }
2678 
2679  // Apply the given stencil to all the elements which are not boundary-tagged.
2680  // This function launches a __global__ kernel (provided by us) on all the existing blocks;
2681  // that kernel checks whether each element exists and is not padding, and on those elements it
2682  // calls the user-provided __device__ functor. The stencil mode selects how the block passed to
2683  // the user functor as storeBlock is loaded: with Insert the block is obtained through an insert
2684  // (and the necessary auxiliary functions are called); with In-place the block is taken directly from the data buffer.
2685  switch (mode)
2686  {
2687  case STENCIL_MODE_INPLACE:
2688  applyStencilInPlace<stencil>(box,mode,args...);
2689  break;
2690  case STENCIL_MODE_INPLACE_NO_SHARED:
2691  applyStencilInPlaceNoShared<stencil>(box,mode,args...);
2692  break;
2693  }
2694  }
2695  template<typename stencil1, typename stencil2, typename ... otherStencils, typename... Args>
2696  void applyStencils(Box<dim,int> box, StencilMode mode, Args... args)
2697  {
2698  applyStencils<stencil1>(box,mode, args...);
2699  applyStencils<stencil2, otherStencils ...>(box,mode, args...);
2700  }
2701 
2702  template<typename BitMaskT>
2703  inline static bool isPadding(BitMaskT &bitMask)
2704  {
2706  ::getBit(bitMask, PADDING_BIT);
2707  }
2708 
2709  template<typename BitMaskT>
2710  inline static void setPadding(BitMaskT &bitMask)
2711  {
2713  ::setBit(bitMask, PADDING_BIT);
2714  }
2715 
2716  template<typename BitMaskT>
2717  inline static void unsetPadding(BitMaskT &bitMask)
2718  {
2720  ::unsetBit(bitMask, PADDING_BIT);
2721  }
2722 
2730  template<typename CoordT>
2731  inline size_t getBlockLinId(const CoordT & blockCoord) const
2732  {
2733  return gridGeometry.BlockLinId(blockCoord);
2734  }
2735 
2746  template<unsigned int p>
2747  auto insertFlush(const sparse_grid_gpu_index<self> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2748  {
2750 
2751  indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
2752  indexT local_id = coord.get_data_id();
2753 
2755 
2756  auto block_data = this->insertBlockFlush(block_id);
2757  block_data.template get<BMG::pMask>()[local_id] = 1;
2758 
2759  return block_data.template get<p>()[local_id];
2760  }
2761 
2772  template<unsigned int p, typename CoordT>
2773  auto insertFlush(const grid_key_dx<dim,CoordT> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2774  {
2775  // Linearized block_id
2776  auto lin = gridGeometry.LinId(coord);
2777  indexT block_id = lin / blockSize;
2778  indexT local_id = lin % blockSize;
2779 
2781 
2782  auto block_data = this->insertBlockFlush(block_id);
2783  block_data.template get<BMG::pMask>()[local_id] = 1;
2784 
2785  return block_data.template get<p>()[local_id];
2786  }
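 // Illustrative host-side usage (editorial sketch, not part of the original source): insertFlush()
 // decomposes the linearized key into a chunk id (lin / blockSize) and a local offset
 // (lin % blockSize), allocates the chunk if needed and marks the point as existing:
 //
 //   grid_key_dx<dim> key;                             // fill with the coordinates of the point to insert
 //   sparse_grid.template insertFlush<0>(key) = 1.0;   // value type given by property 0 of AggregateT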
2787 
2788  template<unsigned int p>
2789  void print_vct_add_data()
2790  {
2791  typedef BlockMapGpu<
2793  threadBlockSize, indexT, layout_base> BMG;
2794 
2795  auto & bM = BMG::blockMap.private_get_vct_add_data();
2796  auto & vI = BMG::blockMap.private_get_vct_add_index();
2797  bM.template deviceToHost<p>();
2798  vI.template deviceToHost<0>();
2799 
2800  std::cout << "vct_add_data: " << std::endl;
2801 
2802  for (size_t i = 0 ; i < bM.size() ; i++)
2803  {
2804  std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
2805  for (size_t j = 0 ; j < blockSize ; j++)
2806  {
2807  std::cout << (int)bM.template get<p>(i)[j] << " ";
2808  }
2809 
2810  std::cout << std::endl;
2811  }
2812  }
2813 
2819  template<unsigned int p>
2820  void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
2821  {
2822  meta_copy<typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>::meta_copy_(backgroundValue,bck.template get<p>());
2823 
2824  BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
2825  }
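 // Illustrative usage (editorial sketch, not part of the original source), assuming property 0 of
 // AggregateT is a float:
 //
 //   sparse_grid.template setBackgroundValue<0>(0.0f);  // value returned for non-existing points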
2826 
2828 
2829  //Functions to check if the packing object is complex
2830  static bool pack()
2831  {
2832  return true;
2833  }
2834 
2835  //Functions to check if the packing object is complex
2836  static bool packRequest()
2837  {
2838  return true;
2839  }
2840 
2845  template<int ... prp> inline
2846  void packRequest(size_t & req) const
2847  {
2848  // To fill
2851 
2852  indexBuffer.template packRequest<prp ...>(req);
2853  dataBuffer.template packRequest<prp ...>(req);
2854 
2855  Packer<decltype(gridGeometry),HeapMemory>::packRequest(req);
2856  }
2857 
2867  template<int ... prp> void pack(ExtPreAlloc<HeapMemory> & mem,
2868  Pack_stat & sts) const
2869  {
2872 
2873  // To fill
2874  indexBuffer.template pack<prp ...>(mem,sts);
2875  dataBuffer.template pack<prp ...>(mem,sts);
2876 
2877  Packer<decltype(gridGeometry),HeapMemory>::pack(mem,gridGeometry,sts);
2878  }
2879 
2889  template<int ... prp> void unpack(ExtPreAlloc<HeapMemory> & mem,
2890  Unpack_stat & ps)
2891  {
2894 
2895  // To fill
2896  indexBuffer.template unpack<prp ...>(mem,ps);
2897  dataBuffer.template unpack<prp ...>(mem,ps);
2898 
2899  Unpacker<decltype(gridGeometry),HeapMemory>::unpack(mem,gridGeometry,ps);
2900  }
2901 
2902 
2912  template<int ... prp> void unpack(ExtPreAlloc<CudaMemory> & mem,
2913  Unpack_stat & ps)
2914  {
2915  if (mem.size() != 0)
2916  {std::cout << __FILE__ << ":" << __LINE__ << " error: unpack from ExtPreAlloc<CudaMemory> is not implemented" << std::endl;}
2917  }
2918 
2924  template<int ... prp> inline
2925  void packRequest(size_t & req, mgpu::ofp_context_t &context) const
2926  {
2927  ite_gpu<1> ite;
2928 
2929  auto & indexBuffer = private_get_index_array();
2930  auto & dataBuffer = private_get_data_array();
2931 
2932  ite.wthr.x = indexBuffer.size();
2933  ite.wthr.y = 1;
2934  ite.wthr.z = 1;
2935  ite.thr.x = getBlockSize();
2936  ite.thr.y = 1;
2937  ite.thr.z = 1;
2938 
2939  tmp.resize(indexBuffer.size() + 1);
2940 
2941  // Launch a kernel that counts the number of elements in each chunk
2942  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
2943  ite,
2944  dataBuffer.toKernel(),
2945  tmp.toKernel());
2946 
2947  openfpm::scan((indexT *)tmp. template getDeviceBuffer<0>(),
2948  tmp.size(), (indexT *)tmp. template getDeviceBuffer<0>(), context);
2949 
2950  tmp.template deviceToHost<0>(tmp.size()-1,tmp.size()-1);
2951 
2952  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
2953 
2954  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof... (prp)>>(spq);
2955 
2956  size_t n_pnt = tmp.template get<0>(tmp.size()-1);
2957 
2958 
2959  // For each point we pack its data, a short int with the offset of the point
2960  // inside its chunk, and one mask byte
2961  req = sizeof(indexT) + // bytes required to pack the number of chunks
2962  sizeof(indexT)*indexBuffer.size() + // bytes required to pack the chunk indexes
2963  sizeof(indexT)*tmp.size() + // bytes required to pack the scan of the points per chunk
2964  n_pnt*(spq.point_size + sizeof(short int) + sizeof(unsigned char)); // bytes required to pack data + offset + mask
2965  }
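 // Worked example of the request size computed above (illustrative numbers, not from the source):
 // with indexT = long int (8 bytes), 100 chunks (so tmp.size() == 101), n_pnt == 1000 existing
 // points and spq.point_size == 12 bytes:
 //
 //   req = 8                    // number of chunks
 //       + 8*100                // chunk indexes
 //       + 8*101                // scan of the points per chunk
 //       + 1000*(12 + 2 + 1)    // data + short offset + mask byte for every point
 //       = 16616 bytes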
2966 
2981  template<int ... prp> inline
2982  void packRequest(SparseGridGpu_iterator_sub<dim,self> & sub_it,
2983  size_t & req) const
2984  {
2985  pack_subs.add();
2986 
2987  for (int i = 0 ; i < dim ; i++)
2988  {
2989  pack_subs.template get<0>(pack_subs.size()-1)[i] = sub_it.getStart().get(i);
2990  pack_subs.template get<1>(pack_subs.size()-1)[i] = sub_it.getStop().get(i);
2991  }
2992  }
2993 
2998  void packReset()
2999  {
3000  pack_subs.clear();
3001 
3002  index_ptrs.clear();
3003  scan_ptrs.clear();
3004  data_ptrs.clear();
3005  offset_ptrs.clear();
3006  mask_ptrs.clear();
3007 
3008  req_index = 0;
3009  }
3010 
3017  template<int ... prp> inline
3018  void packCalculate(size_t & req, mgpu::ofp_context_t &context)
3019  {
3020  ite_gpu<1> ite;
3021  pack_subs.template hostToDevice<0,1>();
3022 
3023  auto & indexBuffer = private_get_index_array();
3024  auto & dataBuffer = private_get_data_array();
3025 
3026  ite.wthr.x = indexBuffer.size();
3027  ite.wthr.y = 1;
3028  ite.wthr.z = 1;
3029  ite.thr.x = getBlockSize();
3030  ite.thr.y = 1;
3031  ite.thr.z = 1;
3032 
3033  tmp.resize((indexBuffer.size() + 1)*pack_subs.size());
3034 
3035  if (indexBuffer.size() != 0)
3036  {
3037  if (pack_subs.size() <= 32)
3038  {
3039  // Launch a kernel that counts the number of elements in each chunk
3040  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3042  32,
3043  indexT>),
3044  ite,
3045  indexBuffer.toKernel(),
3046  pack_subs.toKernel(),
3047  gridGeometry,
3048  dataBuffer.toKernel(),
3049  tmp.toKernel(),
3050  indexBuffer.size() + 1);
3051  }
3052  else if (pack_subs.size() <= 64)
3053  {
3054  // Launch a kernel that counts the number of elements in each chunk
3055  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3057  64,
3058  indexT>),
3059  ite,
3060  indexBuffer.toKernel(),
3061  pack_subs.toKernel(),
3062  gridGeometry,
3063  dataBuffer.toKernel(),
3064  tmp.toKernel(),
3065  indexBuffer.size() + 1);
3066  }
3067  else if (pack_subs.size() <= 96)
3068  {
3069  // Launch a kernel that counts the number of elements in each chunk
3070  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3072  96,
3073  indexT>),
3074  ite,
3075  indexBuffer.toKernel(),
3076  pack_subs.toKernel(),
3077  gridGeometry,
3078  dataBuffer.toKernel(),
3079  tmp.toKernel(),
3080  indexBuffer.size() + 1);
3081  }
3082  else if (pack_subs.size() <= 128)
3083  {
3084  // Launch a kernel that counts the number of elements in each chunk
3085  CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3087  128,
3088  indexT>),
3089  ite,
3090  indexBuffer.toKernel(),
3091  pack_subs.toKernel(),
3092  gridGeometry,
3093  dataBuffer.toKernel(),
3094  tmp.toKernel(),
3095  indexBuffer.size() + 1);
3096  }
3097  else
3098  {
3099  std::cout << __FILE__ << ":" << __LINE__ << " error: no implementation available of packCalculate, create a new case for " << pack_subs.size() << std::endl;
3100  }
3101  }
3102 
3103  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3104 
3105  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3106 
3107  scan_it.resize(pack_subs.size());
3108 
3109  // scan all
3110  for (size_t i = 0 ; i < pack_subs.size() ; i++)
3111  {
3112  size_t n_pnt = 0;
3113  size_t n_cnk = 0;
3114 
3115  tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3116  tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3117 
3118  // put a zero at the end
3119  tmp.template hostToDevice<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3120  tmp.template hostToDevice<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3121 
3122  openfpm::scan(((indexT *)tmp. template getDeviceBuffer<0>()) + i*(indexBuffer.size() + 1),
3123  indexBuffer.size() + 1, (indexT *)tmp. template getDeviceBuffer<0>() + i*(indexBuffer.size() + 1), context);
3124 
3125  openfpm::scan(((unsigned int *)tmp. template getDeviceBuffer<1>()) + i*(indexBuffer.size() + 1),
3126  indexBuffer.size() + 1, (unsigned int *)tmp. template getDeviceBuffer<1>() + i*(indexBuffer.size() + 1), context);
3127 
3128  tmp.template deviceToHost<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3129  tmp.template deviceToHost<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3130 
3131  scan_it.template get<0>(i) = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3132 
3133  n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3134  n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3135 
3136  req += sizeof(size_t) + // bytes required to pack the number of chunks packed
3137  2*dim*sizeof(int) + // starting point + grid size used for the indexing
3138  sizeof(indexT)*n_cnk + // bytes required to pack the chunk indexes
3139  align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) + // bytes required to pack the scan of the chunk points
3140  align_number(sizeof(indexT),n_pnt*(spq.point_size)) + // bytes required to pack the data
3141  align_number(sizeof(indexT),n_pnt*sizeof(short int)) + // bytes required to pack the offsets
3142  align_number(sizeof(indexT),n_pnt*sizeof(unsigned char)); // bytes required to pack the masks
3143  }
3144 
3145  scan_it.template hostToDevice<0>();
3146 
3147  openfpm::scan((indexT *)scan_it. template getDeviceBuffer<0>(),
3148  scan_it.size(), (indexT *)scan_it. template getDeviceBuffer<0>(), context);
3149  }
3150 
3156  auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
3157  {
3158  return this->blockMap.getMappingVector();
3159  }
3160 
3166  auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
3167  {
3168  return this->blockMap.getMergeIndexMapVector();
3169  }
3170 
3185  template<int ... prp> void pack(ExtPreAlloc<CudaMemory> & mem,
3186  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3187  Pack_stat & sts)
3188  {
3189  unsigned int i = req_index;
3190 
3191  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3192  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3193 
3194  auto & indexBuffer = private_get_index_array();
3195  auto & dataBuffer = private_get_data_array();
3196 
3197  size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3198  size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3199 
3200  Packer<size_t,CudaMemory>::pack(mem,n_cnk,sts);
3201  mem.hostToDevice(mem.getOffset(),mem.getOffset()+sizeof(size_t));
3202 
3203  size_t offset1 = mem.getOffsetEnd();
3204 
3205  grid_key_dx<dim> key = sub_it.getStart();
3206 
3207  for (int i = 0 ; i < dim ; i++)
3208  {Packer<int,CudaMemory>::pack(mem,key.get(i),sts);}
3209 
3210  for (int i = 0 ; i < dim ; i++)
3211  {Packer<int,CudaMemory>::pack(mem,(int)gridGeometry.getSize()[i],sts);}
3212 
3213  mem.hostToDevice(offset1,offset1+2*dim*sizeof(int));
3214 
3215  // chunk indexes
3216  mem.allocate(n_cnk*sizeof(indexT));
3217  index_ptrs.add(mem.getDevicePointer());
3218 
3219  // chunk point scan
3220  mem.allocate( align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) );
3221  scan_ptrs.add(mem.getDevicePointer());
3222 
3223  // chunk data
3224  mem.allocate( align_number(sizeof(indexT),n_pnt*(spq.point_size)) );
3225  data_ptrs.add(mem.getDevicePointer());
3226 
3227  // space for offsets
3228  mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(short int) ) );
3229  offset_ptrs.add(mem.getDevicePointer());
3230 
3231  // space for the masks
3232  mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(unsigned char) ) );
3233  mask_ptrs.add(mem.getDevicePointer());
3234 
3235  req_index++;
3236  }
3237 
3238 
3255  template<unsigned int ... prp>
3256  void removeCopyToFinalize(mgpu::ofp_context_t & ctx, int opt)
3257  {
3258  if ((opt & 0x3) == rem_copy_opt::PHASE1)
3259  {
3260  this->template removeCopyToFinalize_phase1<prp ...>(ctx,opt);
3261  }
3262  else if ((opt & 0x3) == rem_copy_opt::PHASE2)
3263  {
3264  this->template removeCopyToFinalize_phase2<prp ...>(ctx,opt);
3265  }
3266  else
3267  {
3268  this->template removeCopyToFinalize_phase3<prp ...>(ctx,opt,false);
3269  }
3270  }
3271 
3284  template<int ... prp> void packFinalize(ExtPreAlloc<CudaMemory> & mem,
3285  Pack_stat & sts,
3286  int opt = 0,
3287  bool is_pack_remote = false)
3288  {
3289 
3290  RestorePackVariableIfKeepGeometry(opt,is_pack_remote);
3291 
3292  if (pack_subs.size() <= 32)
3293  {
3294  pack_sg_implement<32,prp...>(mem,sts,opt,is_pack_remote);
3295  }
3296  else if (pack_subs.size() <= 64)
3297  {
3298  pack_sg_implement<64, prp...>(mem,sts,opt,is_pack_remote);
3299  }
3300  else if (pack_subs.size() <= 80)
3301  {
3302  pack_sg_implement<80, prp...>(mem,sts,opt,is_pack_remote);
3303  }
3304  else
3305  {
3306  std::cout << __FILE__ << ":" << __LINE__ << " error: no implementation available of packFinalize, create a new case for " << pack_subs.size() << std::endl;
3307  }
3308 
3309  savePackVariableIfNotKeepGeometry(opt,is_pack_remote);
3310  }
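 // Plausible packing sequence (editorial sketch based on the member functions above, not a verbatim
 // example from the source); "sparse_grid", "sub_it", "mem", "sts" and "ctx" are placeholder names:
 //
 //   size_t req = 0;
 //   sparse_grid.packReset();                               // clear pack_subs and the pointer lists
 //   sparse_grid.template packRequest<0,1>(sub_it, req);    // queue the box of the sub-iterator
 //   sparse_grid.template packCalculate<0,1>(req, ctx);     // count points per chunk and build the scans
 //   // ... allocate an ExtPreAlloc<CudaMemory> "mem" of at least req bytes ...
 //   sparse_grid.template pack<0,1>(mem, sub_it, sts);      // reserve the sections and record device pointers
 //   sparse_grid.template packFinalize<0,1>(mem, sts);      // fill the reserved sections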
3311 
3317  void removeAddUnpackReset()
3318  {
3319  rem_sects.clear();
3320 
3321  auto & vad = BMG::blockMap.private_get_vct_add_data();
3322  auto & vai = BMG::blockMap.private_get_vct_add_index();
3323 
3324  vad.clear();
3325  vai.clear();
3326 
3327  // Clear variables
3328  offset_ptrs_cp.clear();
3329  scan_ptrs_cp.clear();
3330  n_cnk_cp.clear();
3331  n_pnt_cp.clear();
3332  data_base_ptr_cp.clear();
3333  box_cp.clear();
3334  n_shifts_cp.clear();
3335  convert_blk.clear();
3336  tmp2.clear();
3337  }
3338 
3344  void swap(self & gr)
3345  {
3346  gridGeometry.swap(gr.gridGeometry);
3347 
3348  BMG::swap(gr);
3349  }
3350 
3358  void removePoints(mgpu::ofp_context_t& context)
3359  {
3360  auto & indexBuffer = private_get_index_array();
3361  auto & dataBuffer = private_get_data_array();
3362 
3363  // first we remove
3364  if (rem_sects.size() != 0)
3365  {
3366  rem_sects.template hostToDevice<0,1>();
3367 
3368  tmp.resize(indexBuffer.size() + 1);
3369 
3370  tmp.template get<1>(tmp.size()-1) = 0;
3371  tmp.template hostToDevice<1>(tmp.size()-1,tmp.size()-1);
3372 
3373  auto ite = indexBuffer.getGPUIterator();
3374 
3375  if (has_work_gpu(ite) == true)
3376  {
3377  // mark all the chunks from which points must be removed
3378  CUDA_LAUNCH((SparseGridGpuKernels::calc_remove_points_chunks_boxes<dim,
3380  blockEdgeSize>),ite,indexBuffer.toKernel(),rem_sects.toKernel(),
3381  gridGeometry,dataBuffer.toKernel(),
3382  tmp.toKernel());
3383 
3384  // scan
3385  openfpm::scan((unsigned int *)tmp.template getDeviceBuffer<1>(),tmp.size(),(unsigned int *)tmp.template getDeviceBuffer<1>(),context);
3386 
3387  tmp.template deviceToHost<1>(tmp.size()-1,tmp.size()-1);
3388 
3389  // get the number of chunks involved
3390  size_t nr_cnk = tmp.template get<1>(tmp.size()-1);
3391 
3392  tmp3.resize(nr_cnk);
3393 
3394  // collect the chunks involved in the remove
3395  ite = indexBuffer.getGPUIterator();
3396 
3397  if (has_work_gpu(ite) == false) {return;}
3398 
3399  CUDA_LAUNCH((SparseGridGpuKernels::collect_rem_chunks),ite,tmp.toKernel(),tmp3.toKernel());
3400 
3401  // Launch to remove points
3402 
3403  ite = tmp3.getGPUIterator();
3404 
3405  ite.wthr.x = tmp3.size();
3406  ite.wthr.y = 1;
3407  ite.wthr.z = 1;
3408  ite.thr.x = getBlockSize();
3409  ite.thr.y = 1;
3410  ite.thr.z = 1;
3411 
3412  if (has_work_gpu(ite) == false) {return;}
3413 
3414  CUDA_LAUNCH((SparseGridGpuKernels::remove_points<dim,
3416  ite,indexBuffer.toKernel(),
3417  gridGeometry,
3418  dataBuffer.toKernel(),
3419  tmp3.toKernel(),
3420  rem_sects.toKernel());
3421 
3422  tmp3.clear();
3423  }
3424  }
3425  }
3426 
3432  template<unsigned int ... prp>
3433  void removeAddUnpackFinalize(mgpu::ofp_context_t& context, int opt)
3434  {
3435  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3436  {removePoints(context);}
3437 
3438  removeCopyToFinalize_phase3<prp ...>(context,opt,true);
3439  }
3440 
3445  void copyRemoveReset()
3446  {
3447  rem_sects.clear();
3448  copySect.clear();
3449  offset_ptrs_cp.clear();
3450  scan_ptrs_cp.clear();
3451  data_base_ptr_cp.clear();
3452  n_cnk_cp.clear();
3453  n_pnt_cp.clear();
3454  n_shifts_cp.clear();
3455  convert_blk.clear();
3456  box_cp.clear();
3457  data_base_ptr_cp.clear();
3458 
3459  tmp2.clear();
3460  }
3461 
3469  void remove(const Box<dim,int> & section_to_delete)
3470  {
3471  rem_sects.add(section_to_delete);
3472  }
3473 
3479  static constexpr bool isCompressed()
3480  {
3481  return true;
3482  }
3483 
3491  void copy_to(self & grid_src,
3492  const Box<dim,size_t> & box_src,
3493  const Box<dim,size_t> & box_dst)
3494  {
3495  // first we launch a kernel to count the number of points we have
3496 
3497  sparse_grid_section<self> sgs(*this,box_src,box_dst);
3498 
3499  grid_src.copySect.add(sgs);
3500  }
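 // Editorial sketch (not part of the original source): remove() and copy_to() only queue work in
 // rem_sects / copySect; the queued operations are executed later by removeCopyToFinalize() (or by
 // removeAddUnpackFinalize() during unpacking). Placeholder usage:
 //
 //   dst_grid.remove(box_to_clear);                          // queue a region for removal
 //   dst_grid.copy_to(src_grid, box_src, box_dst);           // queue a section copy from src_grid
 //   dst_grid.template removeCopyToFinalize<0,1>(ctx, opt);  // opt selects PHASE1 / PHASE2 / the final phase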
3501 
3505  template<typename pointers_type,
3506  typename headers_type,
3507  typename result_type,
3508  unsigned int ... prp >
3509  static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot)
3510  {
3511  // we have to increment ps by the right amount
3512  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3513  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3514 
3515  result.allocate(sizeof(int));
3516 
3517  CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
3518  pointers.toKernel(),
3519  headers.toKernel(),
3520  (int *)result.getDevicePointer(),
3521  spq.point_size,
3522  n_slot)
3523  }
3524 
3534  template<unsigned int ... prp, typename S2, typename header_type>
3535  void unpack_with_headers(ExtPreAlloc<S2> & mem,
3536  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3537  header_type & headers,
3538  int ih,
3539  Unpack_stat & ps,
3540  mgpu::ofp_context_t &context,
3541  rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3542  {
3544 
3545  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3546  {
3547  this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,context);
3548 
3549  // readjust mem
3550  }
3551  else
3552  {
3553  // we have to increment ps by the right amount
3554  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3555  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3556 
3557  // First get the number of chunks
3558 
3559  size_t n_cnk = headers.template get<1>(ih);
3560  ps.addOffset(sizeof(size_t));
3561  ps.addOffset(2*dim*sizeof(unsigned int));
3562 
3563  size_t actual_offset = n_cnk*sizeof(indexT);
3564  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3565 
3566  // Unpack number of points
3567  // calculate the number of total points
3568  size_t n_pnt = headers.template get<2>(ih);
3569  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3570 
3571  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3572 
3573  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3574  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3575 
3576  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3577  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3578 
3579  scan_ptrs_cp.add(scan);
3580  offset_ptrs_cp.add(offsets);
3581  data_base_ptr_cp.add(data_base_ptr);
3582 
3583  ps.addOffset(actual_offset);
3584  }
3585  }
3586 
3592  static bool is_unpack_header_supported()
3593  {return true;}
3594 
3604  template<unsigned int ... prp, typename S2>
3605  void unpack(ExtPreAlloc<S2> & mem,
3606  SparseGridGpu_iterator_sub<dim,self> & sub_it,
3607  Unpack_stat & ps,
3608  mgpu::ofp_context_t &context,
3609  rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3610  {
3612 
3613  if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3614  {
3615  this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,context);
3616 
3617  // readjust mem
3618  }
3619  else
3620  {
3621  // we have to increment ps by the right amount
3622  sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3623  boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3624 
3625  // First get the number of chunks
3626 
3627  size_t n_cnk;
3628 
3629  // Unpack the number of chunks
3630  mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
3631  Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
3632 
3633  // Unpack origin of the chunk indexing
3634 /* for (int i = 0 ; i < dim ; i++)
3635  {
3636  int tmp;
3637  Unpacker<int,S2>::unpack(mem,tmp,ps);
3638  }
3639 
3640  for (int i = 0 ; i < dim ; i++)
3641  {
3642  int tmp;
3643  Unpacker<int,S2>::unpack(mem,tmp,ps);
3644  }*/
3645 
3646  ps.addOffset(2*dim*sizeof(unsigned int));
3647 
3648  size_t actual_offset = n_cnk*sizeof(indexT);
3649  unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3650 
3651  mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
3652  ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
3653 
3654  // Unpack number of points
3655  // calculate the number of total points
3656  size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
3657  actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3658 
3659  void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3660 
3661  actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3662  short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3663 
3664  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3665  actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3666 
3667  scan_ptrs_cp.add(scan);
3668  offset_ptrs_cp.add(offsets);
3669  data_base_ptr_cp.add(data_base_ptr);
3670 
3671  ps.addOffset(actual_offset);
3672  }
3673  }
3674 
3680  {
3682  }
3683 
3685 
3691  decltype(self::type_of_iterator()) getIterator() const
3692  {
3693  return decltype(self::type_of_iterator())(*this);
3694  }
3695 
3701  decltype(self::type_of_subiterator()) getIterator(const grid_key_dx<dim> & start, const grid_key_dx<dim> & stop, int is_to_init = 1) const
3702  {
3703  return decltype(self::type_of_subiterator())(*this,start,stop,is_to_init);
3704  }
3705 
3711  auto private_get_add_index_array() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index()) &
3712  {
3713  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index();
3714  }
3715 
3721  auto private_get_add_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index()) &
3722  {
3723  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index();
3724  }
3725 
3731  auto private_get_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer()) &
3732  {
3733  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
3734  }
3735 
3736  auto getSegmentToOutMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3737  {
3738  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap();
3739  }
3740 
3741  auto getSegmentToOutMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3742  {
3743  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap();
3744  }
3745 
3746  auto getSegmentToMergeIndexMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3747  {
3748  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap();
3749  }
3750 
3751  auto getSegmentToMergeIndexMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3752  {
3753  return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap();
3754  }
3755 
3762  {
3764  }
3765 
3771  auto private_get_neighborhood_array() -> decltype(nn_blocks) &
3772  {
3773  return nn_blocks;
3774  }
3775 
3776 
3777 #if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST) || defined(VTKWRITER_HPP_)
3778 
3784  template<typename Tw = float> bool write(const std::string & output)
3785  {
3786  Point<dim,double> spacing;
3787  Point<dim,double> offset;
3788 
3789  spacing.one();
3790  offset.zero();
3791 
3792  return write_with_spacing_offset(output,spacing,offset);
3793  }
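 // Illustrative usage (editorial sketch, not part of the original source): write the sparse grid as
 // a point cloud in VTK format with unit spacing and zero offset. The data is read through the
 // host-side buffers, so it is assumed to have been moved to the host beforehand:
 //
 //   sparse_grid.write("sparse_grid_output");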
3794 
3800  template<typename Tw = float>
3801  bool write_with_spacing_offset(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3802  {
3803  file_type ft = file_type::BINARY;
3804 
3805  auto & bm = this->private_get_blockMap();
3806 
3807  auto & index = bm.getIndexBuffer();
3808  auto & data = bm.getDataBuffer();
3809 
3812 
3813  // copy position and properties
3814 
3815  auto it = index.getIterator();
3816 
3817  while(it.isNext())
3818  {
3819  auto key = it.get();
3820 
3821  Point<dim,Tw> p;
3822 
3823  for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
3824  {
3826  {
3827  // Get the block index
3828  grid_key_dx<dim,int> keyg = gridGeometry.InvLinId(index.template get<0>(key),i);
3829 
3830  for (size_t k = 0 ; k < dim ; k++)
3831  {p.get(k) = keyg.get(k)*spacing[k] + offset[k]*spacing[k];}
3832 
3833  tmp_pos.add(p);
3834 
3835  tmp_prp.add();
3836  copy_prop_to_vector_block<decltype(data.get_o(key)),decltype(tmp_prp.last())>
3837  cp(data.get_o(key),tmp_prp.last(),key,i);
3838 
3839  boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);
3840 
3841  tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];
3842  }
3843  }
3844 
3845  ++it;
3846  }
3847 
3848  // VTKWriter for a set of points
3850  vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());
3851 
3852  openfpm::vector<std::string> prp_names;
3853 
3854  // Write the VTK file
3855  return vtk_writer.write(output,prp_names,"sparse_grid","",ft);
3856  }
3857 
3863  template<typename Tw = float> bool write_debug(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3864  {
3866  VTKWriter<openfpm::vector<SpaceBox<dim, double>>, VECTOR_BOX> vtk_box1;
3867 
3869 
3870  auto & ids = private_get_index_array();
3871 
3872  fill_chunks_boxes(chunks_box,ids,spacing,offset);
3873 
3874  vtk_box1.add(chunks_box);
3875  vtk_box1.write(std::string("chunks_") + output + std::string(".vtk"));
3876 
3877  //write data
3878 
3879  write_with_spacing_offset(std::string("data_") + output + std::string(".vtk"),spacing,offset);
3880 
3881  return true;
3882  }
3883 
3884 #endif
3885 };
3886 
3887 template<unsigned int dim,
3888  typename AggregateT,
3889  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3890  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3891  typename indexT=long int,
3892  template<typename> class layout_base=memory_traits_inte,
3893  typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3895 
3896 template<unsigned int dim,
3897  typename AggregateT,
3898  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3899  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3900  typename indexT=int,
3901  template<typename> class layout_base=memory_traits_inte,
3902  typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3904 
3905 template<unsigned int dim,
3906  typename AggregateT,
3907  unsigned int blockEdgeSize = default_edge<dim>::type::value,
3908  unsigned int threadBlockSize = default_edge<dim>::tb::value,
3909  typename indexT=int,
3910  template<typename> class layout_base=memory_traits_inte,
3911  typename linearizer = grid_smb<dim, blockEdgeSize,indexT>>
3913 
3914 #endif //OPENFPM_PDATA_SPARSEGRIDGPU_HPP