OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
SparseGridGpu.hpp
1//
2// Created by tommaso on 6/06/19.
3//
4
5#ifndef OPENFPM_PDATA_SPARSEGRIDGPU_HPP
6#define OPENFPM_PDATA_SPARSEGRIDGPU_HPP
7
8constexpr int BLOCK_SIZE_STENCIL = 128;
9
10#include "config.h"
11#include "util/cuda_launch.hpp"
12#include <cstdlib>
13#include <SparseGridGpu/BlockMapGpu.hpp>
14#include <Grid/iterators/grid_skin_iterator.hpp>
15#include <Grid/Geometry/grid_smb.hpp>
16#include "SparseGridGpu_ker.cuh"
17#include "SparseGridGpu_kernels.cuh"
18#include "Iterators/SparseGridGpu_iterator_sub.hpp"
19#include "Grid/Geometry/grid_zmb.hpp"
20#include "util/stat/common_statistics.hpp"
21#include "Iterators/SparseGridGpu_iterator.hpp"
22#include "Space/SpaceBox.hpp"
23
24#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST)
25#include "VTKWriter/VTKWriter.hpp"
26#endif
27
28constexpr int NO_ITERATOR_INIT = 0;
29
30// todo: Move all the following utils into some proper file inside TemplateUtils
31
32enum tag_boundaries
33{
34 NO_CALCULATE_EXISTING_POINTS,
35 CALCULATE_EXISTING_POINTS
36};
37
38template<unsigned int dim>
39struct default_edge
40{
41 typedef boost::mpl::int_<2> type;
42};
43
44
45template<>
46struct default_edge<1>
47{
48 typedef boost::mpl::int_<256> type;
49 typedef boost::mpl::int_<256> tb;
50};
51
52template<>
53struct default_edge<2>
54{
55 typedef boost::mpl::int_<16> type;
56 typedef boost::mpl::int_<256> tb;
57};
58
59template<>
60struct default_edge<3>
61{
62 typedef boost::mpl::int_<8> type;
63 typedef boost::mpl::int_<512> tb;
64};
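// Illustrative note (not part of the original header): the defaults above pair the chunk
// edge with a matching thread-block size, i.e. blockEdgeSize^dim == tb, so one CUDA block
// can own one data chunk. A minimal compile-time check of that relation, using only the
// specializations defined above:
//
//   static_assert(16 * 16 == default_edge<2>::tb::value, "2D: 16x16 chunk -> 256 threads");
//   static_assert(8 * 8 * 8 == default_edge<3>::tb::value, "3D: 8x8x8 chunk -> 512 threads");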
65
66template<typename T>
68{
69 typedef T type;
70};
71
72template<typename T, unsigned int dim, unsigned int blockEdgeSize>
73struct process_data_block
74{
76};
77
78template<typename T, unsigned int dim, unsigned int blockEdgeSize, unsigned int N1>
79struct process_data_block<T[N1],dim,blockEdgeSize>
80{
82};
83
84template<unsigned int dim, unsigned int blockEdgeSize, typename ... aggr_list>
85struct aggregate_transform_datablock_impl
86{
88};
89
90template<unsigned int dim, unsigned int blockEdgeSize, typename aggr>
91struct aggregate_convert
92{
93};
94
95template<unsigned int dim, unsigned int blockEdgeSize, typename ... types>
96struct aggregate_convert<dim,blockEdgeSize,aggregate<types ...>>
97{
98 typedef typename aggregate_transform_datablock_impl<dim,blockEdgeSize,types ...>::type type;
99};
100
101template<typename aggr>
102struct aggregate_add
103{
104};
105
106template<typename ... types>
107struct aggregate_add<aggregate<types ...>>
108{
109 typedef aggregate<types ..., unsigned char> type;
110};
111
113
114template<typename enc_type>
115class encap_data_block
116{
117 int offset;
118 enc_type enc;
119
120 public:
121
122 encap_data_block(int offset,const enc_type & enc)
123 :offset(offset),enc(enc)
124 {}
125
126 encap_data_block operator=(const encap_data_block<enc_type> & enc)
127 {
129
130 boost::mpl::for_each_ref< boost::mpl::range_c<int,0,enc_type::T_type::max_prop> >(cp);
131
132 return *this;
133 }
134
135 template<unsigned int p>
136 auto get() -> decltype(enc.template get<p>()[offset])
137 {
138 return enc.template get<p>()[offset];
139 }
140
141 template<unsigned int p>
142 auto get() const -> decltype(enc.template get<p>()[offset])
143 {
144 return enc.template get<p>()[offset];
145 }
146};
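// Usage sketch (illustrative, not from the original header): encap_data_block binds a local
// offset to an encapsulated data chunk so that one grid point can be accessed per property.
// `grid` and `key` are hypothetical; get_o() of SparseGridGpu below returns such an object.
//
//   auto point = grid.get_o(key);        // encap_data_block for a single point
//   point.template get<0>() = 1.0f;      // write property 0 of that point
//   float v = point.template get<0>();   // read it back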
147
149
150enum StencilMode
151{
152 STENCIL_MODE_INPLACE = 1,
153 STENCIL_MODE_INPLACE_NO_SHARED = 3
154};
155
160template<typename SGridGpu, unsigned int prp, unsigned int stencil_size>
162{
164 stencil_size,
166 SGridGpu::device_grid_type::dims> type;
167};
168
169#include "encap_num.hpp"
170
175template<typename SGridGpu>
177{
179};
180
185template<unsigned int dim>
186struct NNfull_is_padding_impl
187{
188 __device__ inline static bool is_padding()
189 {
190 printf("NNfull_is_padding_impl with dim: %d not implemented yet \n",dim);
191
192 return false;
193 }
194};
195
200template<>
201struct NNfull_is_padding_impl<3>
202{
203 template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
204 __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
205 {
206 bool isPadding_ = false;
207 for (int i = 0 ; i < 3 ; i++)
208 {
209 for (int j = 0 ; j < 3 ; j++)
210 {
211 for (int k = 0 ; k < 3 ; k++)
212 {
213 grid_key_dx<3,int> key;
214
215 key.set_d(0,i-1);
216 key.set_d(1,j-1);
217 key.set_d(2,k-1);
218
219 auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
220 typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
221 isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
222 if (isPadding_) break;
223 }
224 }
225 }
226 return isPadding_;
227 }
228};
229
230
235template<>
236struct NNfull_is_padding_impl<2>
237{
238 template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
239 __device__ inline static bool is_padding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
240 {
241 bool isPadding_ = false;
242 for (int i = 0 ; i < 3 ; i++)
243 {
244 for (int j = 0 ; j < 3 ; j++)
245 {
246 grid_key_dx<2,int> key;
247
248 key.set_d(0,i-1);
249 key.set_d(1,j-1);
250
251 auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, key);
252 typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
253 isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
254 if (isPadding_) break;
255 }
256 }
257 return isPadding_;
258 }
259};
260
261template<unsigned int dim>
262struct NNFull
263{
264 static const int nNN = IntPow<3, dim>::value;
265
266 template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
267 __device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
268 blockMap_type & blockMap,
269 SparseGrid_type & sparseGrid,
270 const unsigned int offset)
271 {
272 // point to the background element
273 int neighbourPos = blockMap.size();
274 if (offset < nNN && offset != nNN / 2)
275 {
276 int cnt = offset;
277 for (int i = 0 ; i < dim ; i++)
278 {
279 int dPos = cnt % 3;
280 cnt /= 3;
281 blockCoord.set_d(i, blockCoord.get(i) + dPos - 1);
282 }
283
284 neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id;
285 }
286 return neighbourPos;
287 }
288
289 template<typename indexT, unsigned int blockEdgeSize, typename coordType>
290 __host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
291 {
292 // linearize the coord
293
294 indexT neighbourNum = 0;
295
296 indexT accu = 1;
297 for(int i = 0 ; i < dim ; i++)
298 {
299 int c = static_cast<int>(coord.get(i)) - static_cast<int>(stencilSupportRadius);
300 if (c < 0)
301 {
302 neighbourNum += 0;
303 }
304 else if (c >= blockEdgeSize)
305 {
306 neighbourNum += 2*accu;
307 }
308 else
309 {
310 neighbourNum += accu;
311 }
312 accu *= 3;
313 }
314
315 return neighbourNum;
316 }
317
318
319 template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
320 __device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
321 {
322 return NNfull_is_padding_impl<3>::template is_padding(sparseGrid,coord,enlargedBlock);
323 }
324
335 template<unsigned int blockEdgeSize, typename indexT2>
336 __device__ static inline bool getNNindex_offset(grid_key_dx<dim,indexT2> & coord, unsigned int & NN_index, unsigned int & offset_nn)
337 {
338 bool out = false;
339 NN_index = 0;
340 offset_nn = 0;
341
342 int cnt = 1;
343 int cnt_off = 1;
344 for (unsigned int i = 0 ; i < dim ; i++)
345 {
346 int p = 1 - ((int)(coord.get(i) < 0)) + ((int)(coord.get(i) >= (int)blockEdgeSize));
347
348 NN_index += p*cnt;
349
350 offset_nn += (coord.get(i) + (1 - p)*(int)blockEdgeSize)*cnt_off;
351
352 cnt *= 3;
353 cnt_off *= blockEdgeSize;
354
355 out |= (p != 1);
356 }
357
358 return out;
359 }
360};
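// Worked example (illustrative): how getNNindex_offset() decomposes a local coordinate that
// may fall outside the current chunk. For dim = 2 and blockEdgeSize = 8, the coordinate
// (-1, 3) relative to the chunk origin gives
//   dimension 0: coord = -1 -> p = 0 -> neighbour digit 0, offset digit -1 + 8 = 7
//   dimension 1: coord =  3 -> p = 1 -> neighbour digit 1, offset digit 3
// so NN_index = 0*1 + 1*3 = 3 (the chunk at (-1,0) in the 3x3 neighbourhood),
// offset_nn = 7*1 + 3*8 = 31, and the function returns true because the point lies
// outside the current chunk.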
361
362
363
364template<unsigned int nNN_, unsigned int nLoop_>
365struct ct_par
366{
367 static const unsigned int nNN = nNN_;
368 static const unsigned int nLoop = nLoop_;
369};
370
371template<typename copy_type>
373{
374 template<typename T, typename dst_type, typename src_type>
375 static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
376 {
377 dst.template get<T::value>() = src.template get<T::value>()[bPos];
378 }
379};
380
381template<typename copy_type,unsigned int N1>
383{
384 template<typename T, typename dst_type, typename src_type>
385 static inline void copy(src_type & src, dst_type & dst, unsigned int bPos)
386 {
387 for (int i = 0 ; i < N1 ; i++)
388 {
389 dst.template get<T::value>()[i] = src.template get<T::value>()[i][bPos];
390 }
391 }
392};
393
394template<typename Tsrc,typename Tdst>
395class copy_prop_to_vector_block
396{
398 Tsrc src;
399
401 Tdst dst;
402
403 size_t pos;
404
405 unsigned int bPos;
406
407public:
408
409 copy_prop_to_vector_block(Tsrc src, Tdst dst,size_t pos, size_t bPos)
410 :src(src),dst(dst),pos(pos),bPos(bPos)
411 {}
412
414 template<typename T>
415 inline void operator()(T& t) const
416 {
417 typedef typename std::remove_reference<decltype(dst.template get<T::value>())>::type copy_rtype;
418
420
421 //meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>()[bPos],dst.template get<T::value>());
422 }
423
424};
425
426
427
428template<typename AggregateT, unsigned int n_it, unsigned int ... prp>
429class data_ptr_fill
430{
431 typedef typename to_boost_vmpl<prp...>::type vprp;
432
434 void * base_ptr;
435
436 mutable size_t tot = 0;
437
438 size_t i;
439
440 size_t sz = 0;
441
442 arr_arr_ptr<n_it,sizeof...(prp)> & arrs;
443
444public:
445
446 data_ptr_fill(void * base_ptr,size_t i, arr_arr_ptr<n_it,sizeof...(prp)> & arrs, size_t sz)
447 :base_ptr(base_ptr),i(i),sz(sz),arrs(arrs)
448 {}
449
451 template<typename T>
452 inline void operator()(T& t) const
453 {
454 typedef typename boost::mpl::at<vprp,T>::type prp_cp;
455
456 // Remove the reference from the type to copy
457 typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;
458
459 arrs.ptr[i][T::value] = (void *)((((unsigned char *)base_ptr) + tot));
460
461 tot += sz * sizeof(pack_type);
462 }
463
464};
465
466template<typename SparseGridType>
467struct sparse_grid_section
468{
469 SparseGridType * grd;
470
471 // Source box
472 Box<SparseGridType::dims,size_t> src;
473
474 // destination Box
475 Box<SparseGridType::dims,size_t> dst;
476
477 sparse_grid_section(SparseGridType & grd, const Box<SparseGridType::dims,size_t> & src, const Box<SparseGridType::dims,size_t> & dst)
478 :grd(&grd),src(src),dst(dst)
479 {}
480};
481
482
483template<unsigned int dim,
484 typename AggregateT,
485 unsigned int blockEdgeSize = default_edge<dim>::type::value,
486 unsigned int threadBlockSize = default_edge<dim>::tb::value,
487 typename indexT=long int,
488 template<typename> class layout_base=memory_traits_inte,
489 typename linearizer = grid_smb<dim, blockEdgeSize, indexT>>
490class SparseGridGpu: public BlockMapGpu<
491 typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
492 threadBlockSize, indexT, layout_base>
493{
494public:
495
496 static constexpr unsigned int dims = dim;
497
498private:
499
500 typedef BlockMapGpu<
501 typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type,
502 threadBlockSize, indexT, layout_base> BMG;
503
504 const static unsigned char PADDING_BIT = 1;
505 typedef typename aggregate_convert<dim,blockEdgeSize,AggregateT>::type AggregateBlockT;
506 linearizer gridGeometry;
507 grid_sm<dim, int> extendedBlockGeometry;
508 grid_sm<dim, int> gridSize;
509 unsigned int stencilSupportRadius;
510 unsigned int ghostLayerSize;
511 int req_index;
512 int req_index_swp;
513 int req_index_swp_r;
514
515 AggregateT bck;
516
518
519 // Queue of remove sections
521
522 // Queue of copy sections
524
525 CudaMemory mem;
526
527 // pointers for unpack
528 openfpm::vector<void *> index_ptrs;
529 openfpm::vector<void *> index_ptrs_swp;
530 openfpm::vector<void *> index_ptrs_swp_r;
531 openfpm::vector<void *> scan_ptrs;
532 openfpm::vector<void *> scan_ptrs_swp;
533 openfpm::vector<void *> scan_ptrs_swp_r;
534 openfpm::vector<void *> data_ptrs;
535 openfpm::vector<void *> data_ptrs_swp;
536 openfpm::vector<void *> data_ptrs_swp_r;
537 openfpm::vector<void *> offset_ptrs;
538 openfpm::vector<void *> offset_ptrs_swp;
539 openfpm::vector<void *> offset_ptrs_swp_r;
540 openfpm::vector<void *> mask_ptrs;
541 openfpm::vector<void *> mask_ptrs_swp;
542 openfpm::vector<void *> mask_ptrs_swp_r;
543
544 // pointers for copyRemove
545 openfpm::vector<void *> offset_ptrs_cp;
546 openfpm::vector<void *> offset_ptrs_cp_swp;
547 openfpm::vector<void *> offset_ptrs_cp_swp_r;
548 openfpm::vector<void *> scan_ptrs_cp;
549 openfpm::vector<void *> scan_ptrs_cp_swp;
550 openfpm::vector<void *> scan_ptrs_cp_swp_r;
551 openfpm::vector<void *> data_base_ptr_cp;
552 openfpm::vector<void *> data_base_ptr_cp_swp;
553 openfpm::vector<void *> data_base_ptr_cp_swp_r;
554 openfpm::vector<int> n_cnk_cp;
555 openfpm::vector<int> n_cnk_cp_swp;
556 openfpm::vector<int> n_cnk_cp_swp_r;
557 openfpm::vector<int> n_pnt_cp;
558 openfpm::vector<int> n_pnt_cp_swp;
559 openfpm::vector<int> n_pnt_cp_swp_r;
560 openfpm::vector<int> n_shifts_cp;
561 openfpm::vector<int> n_shift_cp_swp;
562 openfpm::vector<int> n_shift_cp_swp_r;
563 typedef typename aggregate_convert<dim,blockEdgeSize,aggregate<int>>::type convertAggr;
564
565 // Map to convert blocks from misaligned chunks
566 openfpm::vector_gpu<convertAggr> convert_blk;
567 openfpm::vector_gpu<convertAggr> convert_blk_swp;
568 openfpm::vector_gpu<convertAggr> convert_blk_swp_r;
572
578
583
590
592
597
600
603
606
608 mutable openfpm::vector_gpu<aggregate<int>> new_map;
609 mutable openfpm::vector_gpu<aggregate<int>> new_map_swp;
610 mutable openfpm::vector_gpu<aggregate<int>> new_map_swp_r;
611
613 mutable openfpm::vector_gpu<Box<dim,int>> pack_subs;
614 mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp;
615 mutable openfpm::vector_gpu<Box<dim,int>> pack_subs_swp_r;
616
620 mutable int index_size_swp = -1;
621 mutable int index_size_swp_r = -1;
622
625
628
631
634
637
640
643
644 bool findNN = false;
645
646 inline void swap_internal_remote()
647 {
648 n_cnk_cp_swp_r.swap(n_cnk_cp);
649 n_pnt_cp_swp_r.swap(n_pnt_cp);
650 n_shift_cp_swp_r.swap(n_shifts_cp);
651 convert_blk_swp_r.swap(convert_blk);
652 box_cp_swp_r.swap(box_cp);
653 new_map_swp_r.swap(new_map);
654 }
655
656 inline void swap_internal_local()
657 {
658 offset_ptrs_cp_swp.swap(offset_ptrs_cp);
659 scan_ptrs_cp_swp.swap(scan_ptrs_cp);
660 data_base_ptr_cp_swp.swap(data_base_ptr_cp);
661 n_cnk_cp_swp.swap(n_cnk_cp);
662 n_pnt_cp_swp.swap(n_pnt_cp);
663 n_shift_cp_swp.swap(n_shifts_cp);
664 convert_blk_swp.swap(convert_blk);
665 box_cp_swp.swap(box_cp);
666 new_map_swp.swap(new_map);
667 }
668
669 inline void swap_local_pack()
670 {
671 index_ptrs_swp.swap(index_ptrs);
672 scan_ptrs_swp.swap(scan_ptrs);
673 data_ptrs_swp.swap(data_ptrs);
674 offset_ptrs_swp.swap(offset_ptrs);
675 mask_ptrs_swp.swap(mask_ptrs);
676
677 e_points_swp.swap(e_points);
678 pack_output_swp.swap(pack_output);
679 tmp_swp.swap(tmp);
680
681 pack_subs_swp.swap(pack_subs);
682 index_size_swp = private_get_index_array().size();
683 }
684
685 inline void swap_remote_pack()
686 {
687 index_ptrs_swp_r.swap(index_ptrs);
688 scan_ptrs_swp_r.swap(scan_ptrs);
689 data_ptrs_swp_r.swap(data_ptrs);
690 offset_ptrs_swp_r.swap(offset_ptrs);
691 mask_ptrs_swp_r.swap(mask_ptrs);
692
693 e_points_swp_r.swap(e_points);
694 pack_output_swp_r.swap(pack_output);
695 tmp_swp_r.swap(tmp);
696
697 pack_subs_swp_r.swap(pack_subs);
698 //req_index_swp_r = req_index;
699 index_size_swp_r = private_get_index_array().size();
700 }
701
702protected:
703 static constexpr unsigned int blockSize = BlockTypeOf<AggregateBlockT, 0>::size;
704 typedef AggregateBlockT AggregateInternalT;
705
706public:
707
709 typedef int yes_i_am_grid;
710
711 static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
712
713 typedef linearizer grid_info;
714
715 typedef linearizer linearizer_type;
716
717 template<typename Tfunc> using layout_mfunc = memory_traits_inte<Tfunc>;
718
720
721 typedef indexT indexT_;
722
723 typedef decltype(std::declval<BMG>().toKernel().insertBlock(0)) insert_encap;
724
730 inline size_t size() const
731 {
732 return this->countExistingElements();
733 }
734
740 template <typename stencil = no_stencil>
741 SparseGridGpu_iterator_sub<dim,self> getIterator(const grid_key_dx<dim> & start, const grid_key_dx<dim> & stop, int is_to_init = 1)
742 {
743 return SparseGridGpu_iterator_sub<dim,self>(*this,start,stop,is_to_init);
744 }
745
751 SparseGridGpu_iterator<dim,self> getIterator() const
752 {
753 return SparseGridGpu_iterator<dim,self>(std::declval<self>());
754 }
755
756 template<typename dim3T>
757 inline static int dim3SizeToInt(dim3T d)
758 {
759 return d.x * d.y * d.z;
760 }
761
762 inline static int dim3SizeToInt(size_t d)
763 {
764 return d;
765 }
766
767 inline static int dim3SizeToInt(unsigned int d)
768 {
769 return d;
770 }
771
772 template<typename ... v_reduce>
773 void flush(gpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
774 {
775 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
776 ::template flush<v_reduce ...>(context, opt);
777
778 findNN = false;
779 }
780
781
782
783 void saveUnpackVariableIfNotKeepGeometry(int opt, bool is_unpack_remote)
784 {
785 if (is_unpack_remote == true)
786 {swap_internal_remote();}
787
788 if (is_unpack_remote == false)
789 {swap_internal_local();}
790 }
791
792 void RestoreUnpackVariableIfKeepGeometry(int opt, bool is_unpack_remote)
793 {
794 if (opt & KEEP_GEOMETRY && is_unpack_remote == true)
795 {swap_internal_remote();}
796
797 if (opt & KEEP_GEOMETRY && is_unpack_remote == false)
798 {swap_internal_local();}
799 }
800
801
802 void savePackVariableIfNotKeepGeometry(int opt, bool is_pack_remote)
803 {
804 if (is_pack_remote == false)
805 {
806 swap_local_pack();
807 req_index_swp = req_index;
808 }
809
810 if (is_pack_remote == true)
811 {
812 swap_remote_pack();
813 req_index_swp_r = req_index;
814 }
815 }
816
817 void RestorePackVariableIfKeepGeometry(int opt, bool is_pack_remote)
818 {
819 if (opt & KEEP_GEOMETRY && is_pack_remote == false)
820 {
821 swap_local_pack();
822 req_index = req_index_swp;
823 }
824
825 if (opt & KEEP_GEOMETRY && is_pack_remote == true)
826 {
827 swap_remote_pack();
828 req_index = req_index_swp_r;
829 }
830 }
831
832 template<unsigned int n_it>
833 void calculatePackingPointsFromBoxes(int opt,size_t tot_pnt)
834 {
835 if (!(opt & KEEP_GEOMETRY))
836 {
837 auto & indexBuffer = private_get_index_array();
838 auto & dataBuffer = private_get_data_array();
839
840 e_points.resize(tot_pnt);
841 pack_output.resize(tot_pnt);
842
843 ite_gpu<1> ite;
844
845 ite.wthr.x = indexBuffer.size();
846 ite.wthr.y = 1;
847 ite.wthr.z = 1;
848 ite.thr.x = getBlockSize();
849 ite.thr.y = 1;
850 ite.thr.z = 1;
851
852 // Launch a kernel that counts the number of elements in each chunk
853 CUDA_LAUNCH((SparseGridGpuKernels::get_exist_points_with_boxes<dim,
854 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
855 n_it,
856 indexT>),
857 ite,
858 indexBuffer.toKernel(),
859 pack_subs.toKernel(),
860 gridGeometry,
861 dataBuffer.toKernel(),
862 pack_output.toKernel(),
863 tmp.toKernel(),
864 scan_it.toKernel(),
865 e_points.toKernel());
866 }
867 }
868
869private:
870
871 void computeSizeOfGhostLayer()
872 {
873 unsigned int term1 = 1;
874 for (int i = 0; i < dim; ++i)
875 {
876 term1 *= blockEdgeSize + 2 * stencilSupportRadius;
877 }
878 unsigned int term2 = 1;
879 for (int i = 0; i < dim; ++i)
880 {
881 term2 *= blockEdgeSize;
882 }
883 ghostLayerSize = term1 - term2;
884 }
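// Worked example (illustrative): for dim = 3, blockEdgeSize = 8 and stencilSupportRadius = 1
// the enlarged block has edge 8 + 2*1 = 10, so
//   ghostLayerSize = 10*10*10 - 8*8*8 = 1000 - 512 = 488
// i.e. each 512-point chunk is surrounded by 488 ghost cells in the enlarged block.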
885
886 void allocateGhostLayerMapping()
887 {
888 ghostLayerToThreadsMapping.resize(ghostLayerSize);
889 }
890
891 template<typename stencil_type>
892 void computeGhostLayerMapping()
893 {
894 size_t dimensions[dim],
895 origin[dim],
896 innerDomainBegin[dim], innerDomainEnd[dim],
897 outerBoxBegin[dim], outerBoxEnd[dim],
898 bc[dim];
899 for (int i = 0; i < dim; ++i)
900 {
901 dimensions[i] = blockEdgeSize + 2 * stencilSupportRadius;
902 origin[i] = 0;
903 innerDomainBegin[i] = stencilSupportRadius - 1;
904 innerDomainEnd[i] = dimensions[i] - stencilSupportRadius;
905 outerBoxBegin[i] = origin[i];
906 outerBoxEnd[i] = dimensions[i];
907 bc[i] = NON_PERIODIC;
908 }
909 grid_sm<dim, void> enlargedGrid;
910 enlargedGrid.setDimensions(dimensions);
911 Box<dim, size_t> outerBox(outerBoxBegin, outerBoxEnd);
912 Box<dim, size_t> innerBox(innerDomainBegin, innerDomainEnd);
913
914 grid_skin_iterator_bc<dim> gsi(enlargedGrid, innerBox, outerBox, bc);
915
916 unsigned int i = 0;
917 while (gsi.isNext())
918 {
919 auto coord = gsi.get();
920 assert(i < ghostLayerSize);
921 mem_id linId = enlargedGrid.LinId(coord);
922 // Mapping
923 ghostLayerToThreadsMapping.template get<gt>(i) = linId;
924 // Now compute the neighbour position to use
925 ghostLayerToThreadsMapping.template get<nt>(i) = stencil_type::template getNNskin<indexT,blockEdgeSize>(coord,stencilSupportRadius);
926 //
927 ++i;
928 ++gsi;
929 }
930 assert(i == ghostLayerSize);
931
932 ghostLayerToThreadsMapping.template hostToDevice<gt,nt>();
933 }
934
935 void initialize(const size_t (& res)[dim])
936 {
937 gridGeometry = linearizer(res);
938
939 computeSizeOfGhostLayer();
940 allocateGhostLayerMapping();
941 computeGhostLayerMapping<NNStar<dim>>();
942
943 size_t extBlockDims[dim];
944 for (int d=0; d<dim; ++d)
945 {
946 extBlockDims[d] = blockEdgeSize + 2*stencilSupportRadius;
947 }
948 extendedBlockGeometry.setDimensions(extBlockDims);
949
950 gridSize.setDimensions(res);
951 }
952
953
954 template <typename stencil, typename... Args>
955 void applyStencilInPlace(const Box<dim,int> & box, StencilMode & mode,Args... args)
956 {
957 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
958 auto & indexBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
959 auto & dataBuffer_ = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
960
961 const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
962 unsigned int numScalars = indexBuffer_.size() * dataChunkSize;
963
964 if (numScalars == 0) return;
965
966 // NOTE: Here we want to work only on one data chunk per block!
967 constexpr unsigned int chunksPerBlock = 1;
968 const unsigned int localThreadBlockSize = dataChunkSize * chunksPerBlock;
969 const unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
970 ? numScalars / localThreadBlockSize
971 : 1 + numScalars / localThreadBlockSize;
972
973 constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * chunksPerBlock)>::value; // todo: This works only for stencilSupportSize==1
974
975#ifdef CUDIFY_USE_CUDA
976
977
978 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::applyStencilInPlace
979 <dim,
980 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
981 stencil>),
982 threadGridSize, localThreadBlockSize,
983 box,
984 indexBuffer_.toKernel(),
985 dataBuffer_.toKernel(),
986 this->template toKernelNN<stencil::stencil_type::nNN, nLoop>(),
987 args...);
988
989#else
990
991 auto bx = box;
992 auto indexBuffer = indexBuffer_.toKernel();
993 auto dataBuffer = dataBuffer_.toKernel();
994 auto sparseGrid = this->template toKernelNN<stencil::stencil_type::nNN, nLoop>();
995
997
998 auto lamb = [=] __device__ () mutable
999 {
1000 constexpr unsigned int pIndex = 0;
1001
1002 typedef typename decltype(indexBuffer)::value_type IndexAggregateT;
1003 typedef BlockTypeOf<IndexAggregateT , pIndex> IndexT;
1004
1005 typedef typename decltype(dataBuffer)::value_type AggregateT_;
1006 typedef BlockTypeOf<AggregateT_, pMask> MaskBlockT;
1007 typedef ScalarTypeOf<AggregateT_, pMask> MaskT;
1008 constexpr unsigned int blockSize = MaskBlockT::size;
1009
1010 // NOTE: here we do 1 chunk per block! (we want to be sure to fit local memory constraints
1011 // since we will be loading also neighbouring elements!) (beware curse of dimensionality...)
1012 const unsigned int dataBlockPos = blockIdx.x;
1013 const unsigned int offset = threadIdx.x;
1014
1015 if (dataBlockPos >= indexBuffer.size())
1016 {
1017 return;
1018 }
1019
1020 auto dataBlockLoad = dataBuffer.get(dataBlockPos); // Avoid binary searches as much as possible
1021
1022 // todo: Add management of RED-BLACK stencil application! :)
1023 const unsigned int dataBlockId = indexBuffer.template get<pIndex>(dataBlockPos);
1024 grid_key_dx<dim, int> pointCoord = sparseGrid.getCoord(dataBlockId * blockSize + offset);
1025
1026 unsigned char curMask;
1027
1028 if (offset < blockSize)
1029 {
1030 // Read local mask to register
1031 curMask = dataBlockLoad.template get<pMask>()[offset];
1032 for (int i = 0 ; i < dim ; i++)
1033 {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;}
1034 }
1035
1036 openfpm::sparse_index<unsigned int> sdataBlockPos;
1037 sdataBlockPos.id = dataBlockPos;
1038
1039 stencil::stencil(
1040 sparseGrid, dataBlockId, sdataBlockPos , offset, pointCoord, dataBlockLoad, dataBlockLoad,
1041 curMask, args...);
1042 };
1043
1044 CUDA_LAUNCH_LAMBDA_DIM3_TLS(threadGridSize, localThreadBlockSize,lamb);
1045
1046#endif
1047
1048 }
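// Launch-size sketch (illustrative): one CUDA block processes one data chunk and one thread
// one scalar, so with 1000 chunks of 8x8x8 points
//   numScalars           = 1000 * 512 = 512000
//   localThreadBlockSize = 512 (dataChunkSize * chunksPerBlock, chunksPerBlock == 1)
//   threadGridSize       = 512000 / 512 = 1000 blocks
// The ceiling division above only changes the result when numScalars is not a multiple of
// the chunk size.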
1049
1050
1051 template <typename stencil, typename... Args>
1052 void applyStencilInPlaceNoShared(const Box<dim,int> & box, StencilMode & mode,Args... args)
1053 {
1054 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
1055 auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
1056 auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1057
1058 const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
1059 unsigned int numScalars = indexBuffer.size() * dataChunkSize;
1060
1061 if (numScalars == 0) return;
1062
1063 auto ite = e_points.getGPUIterator(BLOCK_SIZE_STENCIL);
1064
1065 CUDA_LAUNCH((SparseGridGpuKernels::applyStencilInPlaceNoShared
1066 <dim,
1067 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1068 stencil>),
1069 ite,
1070 box,
1071 indexBuffer.toKernel(),
1072 dataBuffer.toKernel(),
1073 this->template toKernelNN<stencil::stencil_type::nNN, 0>(),
1074 args...);
1075 }
1076
1077 template<typename ids_type>
1078 void fill_chunks_boxes(openfpm::vector<SpaceBox<dim,double>> & chunks_box, ids_type & chunk_ids, Point<dim,double> & spacing, Point<dim,double> & offset)
1079 {
1080 for (int i = 0 ; i < chunk_ids.size() ; i++)
1081 {
1082 SpaceBox<dim,double> box;
1083
1084 auto c_pos = gridGeometry.InvLinId(chunk_ids.template get<0>(i)*blockSize);
1085
1086 for (int j = 0 ; j < dim ; j++)
1087 {
1088 box.setLow(j,c_pos.get(j) * spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1089 box.setHigh(j,(c_pos.get(j) + blockEdgeSize)*spacing[j] - 0.5*spacing[j] + offset.get(j)*spacing[j]);
1090 }
1091
1092 chunks_box.add(box);
1093 }
1094 }
1095
1096 template<typename MemType, unsigned int ... prp>
1097 void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, gpu::ofp_context_t & ctx, int opt)
1098 {
1099 if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1100 {
1101 // Convert the packed chunk ids
1102
1103 prAlloc_prp->reset();
1104 Unpack_stat ups;
1105
1106 for (size_t i = 0 ; i < copySect.size() ; i++)
1107 {
1108 auto sub_it = this->getIterator(copySect.get(i).dst.getKP1(),copySect.get(i).dst.getKP2(),NO_ITERATOR_INIT);
1109
1110 copySect.get(i).grd->template addAndConvertPackedChunkToTmp<prp ...>(*prAlloc_prp,sub_it,ups,ctx);
1111 }
1112 }
1113 }
1114
1115
1116 template<unsigned int ... prp>
1117 void removeCopyToFinalize_phase1(gpu::ofp_context_t & ctx, int opt)
1118 {
1119 if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1120 {removePoints(ctx);}
1121 }
1122
1123 template<unsigned int ... prp>
1124 void removeCopyToFinalize_phase2(gpu::ofp_context_t & ctx, int opt)
1125 {
1126 // Pack information
1127 Pack_stat sts;
1128
1129 if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1130 {
1131 this->packReset();
1132
1133 size_t req = 0;
1134 // First we count the points to copy (as source)
1135
1136 for (size_t i = 0 ; i < copySect.size() ; i++)
1137 {
1138 auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1139
1140 this->packRequest(sub_it,req);
1141 }
1142
1143 this->template packCalculate<prp...>(req,ctx);
1144
1145 mem.resize(req);
1146
1147 // Create an object of preallocated memory for properties
1148 prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1149 prAlloc_prp->incRef();
1150
1151 for (size_t i = 0 ; i < copySect.size() ; i++)
1152 {
1153 auto sub_it = this->getIterator(copySect.get(i).src.getKP1(),copySect.get(i).src.getKP2(),NO_ITERATOR_INIT);
1154
1155 this->pack<prp ...>(*prAlloc_prp,sub_it,sts);
1156 }
1157 }
1158 else
1159 {
1160 size_t req = mem.size();
1161
1162 // Create an object of preallocated memory for properties
1163 prAlloc_prp = new ExtPreAlloc<CudaMemory>(req,mem);
1164 prAlloc_prp->incRef();
1165 }
1166
1167 this->template packFinalize<prp ...>(*prAlloc_prp,sts,opt,false);
1168
1169 preUnpack<CudaMemory,prp ...>(prAlloc_prp,ctx,opt);
1170
1171 prAlloc_prp->decRef();
1172 delete prAlloc_prp;
1173 }
1174
1175 template<unsigned int ... prp>
1176 void removeCopyToFinalize_phase3(gpu::ofp_context_t & ctx, int opt, bool is_unpack_remote)
1177 {
1178 ite_gpu<1> ite;
1179
1180 if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
1181 {
1182 if (tmp2.size() == 0)
1183 {return;}
1184
1185 // Fill the add buffer given tmp and then flush
1186
1187 setGPUInsertBuffer(tmp2.size(),1ul);
1188
1189 auto & add_buff = this->blockMap.private_get_vct_add_index();
1190 add_buff.swap(tmp2);
1191
1192 auto & nadd_buff = this->blockMap.private_get_vct_nadd_index();
1193 ite = nadd_buff.getGPUIterator();
1194 CUDA_LAUNCH(SparseGridGpuKernels::set_one,ite,nadd_buff.toKernel());
1195
1196 int sz_b = this->private_get_index_array().size();
1197
1198 this->template flush<sLeft_<prp>...>(ctx,flush_type::FLUSH_ON_DEVICE);
1199
1200 // get the map of the new inserted elements
1201
1202 auto & m_map = this->getMergeIndexMapVector();
1203 auto & a_map = this->getMappingVector();
1204 auto & o_map = this->getSegmentToOutMap();
1205 auto & segments_data = this->getSegmentToMergeIndexMap();
1206
1207 new_map.resize(a_map.size(),0);
1208
1209 // construct new to old map
1210
1211 ite = segments_data.getGPUIterator();
1212
1213 if (ite.nblocks() != 0)
1214 CUDA_LAUNCH(SparseGridGpuKernels::construct_new_chunk_map<1>,ite,new_map.toKernel(),a_map.toKernel(),m_map.toKernel(),o_map.toKernel(),segments_data.toKernel(),sz_b);
1215
1216 convert_blk.template hostToDevice<0>();
1217 }
1218 else
1219 {
1220 ite.wthr.x = 1;
1221 ite.wthr.y = 1;
1222 ite.wthr.z = 1;
1223
1224 ite.thr.x = 1;
1225 ite.thr.y = 1;
1226 ite.thr.z = 1;
1227 }
1228
1229 // Restore
1230 RestoreUnpackVariableIfKeepGeometry(opt,is_unpack_remote);
1231
1232 // for each packed chunk
1233
1234 size_t n_accu_cnk = 0;
1235 for (size_t i = 0 ; i < n_cnk_cp.size() ; i++)
1236 {
1237 arr_arr_ptr<1,sizeof...(prp)> data;
1238 size_t n_pnt = n_pnt_cp.get(i);
1239
1240 void * data_base_ptr = data_base_ptr_cp.get(i);
1241 data_ptr_fill<AggregateT,1,prp...> dpf(data_base_ptr,0,data,n_pnt);
1242 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1243
1244 ite.wthr.x = n_cnk_cp.get(i);
1245
1246 // calculate best number of threads
1247 Box<dim,size_t> ub = box_cp.get(i);
1248
1249 ite.thr.x = 1;
1250 for (int j = 0 ; j < dim ; j++)
1251 {
1252 size_t l = ub.getHigh(j) - ub.getLow(j) + 1;
1253
1254 if (l >= blockEdgeSize)
1255 {ite.thr.x *= blockEdgeSize;}
1256 else
1257 {ite.thr.x *= l;}
1258 }
1259
1260 // copy to new (1 block for each packed chunk)
1261 if (ite.nblocks() != 0 && ite.thr.x != 0)
1262 {
1263 auto & chunks = private_get_data_array();
1264
1265 CUDA_LAUNCH((SparseGridGpuKernels::copy_packed_data_to_chunks<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1266 AggregateT,decltype(convert_blk.toKernel()),decltype(new_map.toKernel()),
1267 decltype(data),decltype(chunks.toKernel()),prp... >),ite,
1268 (unsigned int *)scan_ptrs_cp.get(i),
1269 (unsigned short int *)offset_ptrs_cp.get(i),
1270 convert_blk.toKernel(),
1271 new_map.toKernel(),
1272 data,
1273 chunks.toKernel(),
1274 n_cnk_cp.get(i),
1275 n_shifts_cp.get(i),
1276 n_pnt_cp.get(i),
1277 i,
1278 n_accu_cnk);
1279 }
1280
1281 n_accu_cnk += n_cnk_cp.get(i)*n_shifts_cp.get(i);
1282 }
1283
1284 // Save
1285 saveUnpackVariableIfNotKeepGeometry(opt,is_unpack_remote);
1286 }
1287
1288 template<unsigned int n_it, unsigned int ... prp>
1289 void pack_sg_implement(ExtPreAlloc<CudaMemory> & mem,
1290 Pack_stat & sts,
1291 int opt,
1292 bool is_pack_remote)
1293 {
1294 arr_ptr<n_it> index_ptr;
1295 arr_arr_ptr<n_it,sizeof...(prp)> data_ptr;
1296 arr_ptr<n_it> scan_ptr;
1297 arr_ptr<n_it> offset_ptr;
1298 arr_ptr<n_it> mask_ptr;
1300
1301 auto & indexBuffer = private_get_index_array();
1302 auto & dataBuffer = private_get_data_array();
1303
1304 if (req_index != pack_subs.size())
1305 {std::cerr << __FILE__ << ":" << __LINE__ << " error: the packing request number differs from the number of packed objects " << req_index << " " << pack_subs.size() << std::endl;}
1306
1307 size_t tot_pnt = 0;
1308 size_t tot_cnk = 0;
1309
1310 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1311 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1312
1313 // Calculate total points
1314
1315 for (size_t i = 0 ; i < pack_subs.size() ; i++)
1316 {
1317 size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
1318 sar.sa[i] = n_pnt;
1319 tot_pnt += n_pnt;
1320 }
1321
1322 // CUDA requires aligned access; here we assume 8-byte alignment and ensure it again after
1323 // the cycle
1324 for (size_t i = 0 ; i < pack_subs.size() ; i++)
1325 {
1326 size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
1327
1328 // fill index_ptr data_ptr scan_ptr
1329 index_ptr.ptr[i] = index_ptrs.get(i);
1330 scan_ptr.ptr[i] = scan_ptrs.get(i);
1331
1332 // for all properties fill the data pointer
1333
1334 data_ptr_fill<AggregateT,n_it,prp...> dpf(data_ptrs.get(i),i,data_ptr,tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1));
1335 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(dpf);
1336
1337 offset_ptr.ptr[i] = offset_ptrs.get(i);
1338 mask_ptr.ptr[i] = mask_ptrs.get(i);
1339
1340 tot_cnk += n_cnk;
1341 }
1342
1343 ite_gpu<1> ite;
1344
1345 if (tot_pnt != 0)
1346 {
1347 calculatePackingPointsFromBoxes<n_it>(opt,tot_pnt);
1348
1349 ite = e_points.getGPUIterator();
1350
1351 // Here we copy the array of pointer of properties into a CudaMemory array
1352
1353 CudaMemory mem;
1354 mem.allocate(sizeof(data_ptr));
1355
1356 // copy
1357 arr_arr_ptr<n_it,sizeof...(prp)> * arr_data = (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getPointer();
1358
1359 for(int i = 0 ; i < n_it ; i++)
1360 {
1361 for (int j = 0 ; j < sizeof...(prp) ; j++)
1362 {
1363 arr_data->ptr[i][j] = data_ptr.ptr[i][j];
1364 }
1365 }
1366
1367 mem.hostToDevice();
1368
1369 CUDA_LAUNCH((SparseGridGpuKernels::pack_data<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
1370 AggregateT,
1371 n_it,
1372 sizeof...(prp),
1373 indexT,
1374 decltype(e_points.toKernel()),
1375 decltype(pack_output.toKernel()),
1376 decltype(indexBuffer.toKernel()),
1377 decltype(dataBuffer.toKernel()),
1378 decltype(tmp.toKernel()),
1379 self::blockSize,
1380 prp ...>),
1381 ite,
1382 e_points.toKernel(),
1383 dataBuffer.toKernel(),
1384 indexBuffer.toKernel(),
1385 tmp.toKernel(),
1386 pack_output.toKernel(),
1387 index_ptr,
1388 scan_ptr,
1389 (arr_arr_ptr<n_it,sizeof...(prp)> *)mem.getDevicePointer(),
1390 offset_ptr,
1391 mask_ptr,
1392 sar);
1393 }
1394
1395 ite.wthr.x = 1;
1396 ite.wthr.y = 1;
1397 ite.wthr.z = 1;
1398 ite.thr.x = pack_subs.size();
1399 ite.thr.y = 1;
1400 ite.thr.z = 1;
1401
1402 if (pack_subs.size() != 0)
1403 {CUDA_LAUNCH(SparseGridGpuKernels::last_scan_point,ite,scan_ptr,tmp.toKernel(),indexBuffer.size()+1,pack_subs.size());}
1404 }
1405
1406
1416 template<unsigned int ... prp, typename S2>
1417 void addAndConvertPackedChunkToTmp(ExtPreAlloc<S2> & mem,
1418 SparseGridGpu_iterator_sub<dim,self> & sub_it,
1419 Unpack_stat & ps,
1420 gpu::ofp_context_t &context)
1421 {
1422 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
1423 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
1424
1425 // First get the number of chunks
1426
1427 size_t n_cnk;
1428
1429 // Unpack the number of chunks
1430 mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
1431 Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
1432
1433 grid_key_dx<dim,int> origPack_pnt;
1434 grid_key_dx<dim,int> origPack_cnk;
1435 size_t sz[dim];
1436
1437 // Unpack origin of the chunk indexing
1438 for (int i = 0 ; i < dim ; i++)
1439 {
1440 int tmp;
1441 Unpacker<int,S2>::unpack(mem,tmp,ps);
1442 origPack_cnk.set_d(i,((int)(tmp / blockEdgeSize))*blockEdgeSize);
1443 origPack_pnt.set_d(i,tmp);
1444 }
1445
1446 for (int i = 0 ; i < dim ; i++)
1447 {
1448 int tmp;
1449 Unpacker<int,S2>::unpack(mem,tmp,ps);
1450 sz[i] = tmp;
1451 }
1452
1453 size_t actual_offset = n_cnk*sizeof(indexT);
1454 // get the id pointers
1455 indexT * ids = (indexT *)((unsigned char *)mem.getDevicePointer() + ps.getOffset());
1456 unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
1457
1458 mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
1459 ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
1460
1461
1462
1463 // Unpack number of points
1464 // calculate the number of total points
1465 size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
1466 actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
1467
1468 void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
1469
1470 actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
1471
1472 short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
1473
1474 offset_ptrs_cp.add(offsets);
1475 scan_ptrs_cp.add(scan);
1476 n_cnk_cp.add(n_cnk);
1477 n_pnt_cp.add(n_pnt);
1478 data_base_ptr_cp.add(data_base_ptr);
1479
1480 Box<dim,size_t> bx;
1481
1482 for (int i = 0 ; i < dim ; i++)
1483 {
1484 bx.setLow(i,sub_it.getStart().get(i));
1485 bx.setHigh(i,sub_it.getStop().get(i));
1486 }
1487
1488 box_cp.add(bx);
1489
1490 actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
1491
1492 if (n_cnk != 0)
1493 {
1494 shifts.clear();
1495
1496 int n_shift = 1;
1497 shifts.add();
1498
1499 for (int i = 0 ; i < dim ; i++)
1500 {shifts.last().template get<0>()[i] = 0;}
1501
1502 for (int i = 0 ; i < dim ; i++)
1503 {
1504 int op_q = origPack_pnt.get(i) % blockEdgeSize;
1505 int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1506 int quot = abs(ou_q - op_q) % blockEdgeSize;
1507 int squot = openfpm::math::sgn(ou_q - op_q);
1508 if (quot != 0)
1509 {
1510 n_shift *= 2;
1511
1512 int sz = shifts.size();
1513 for (int j = 0 ; j < sz ; j++)
1514 {
1515 shifts.add();
1516 for (int k = 0 ; k < dim ; k++)
1517 {
1518 shifts.last().template get<0>()[k] = shifts.template get<0>(j)[k] + ((i == k)?squot:0);
1519 }
1520 }
1521 }
1522 }
1523
1524 shifts.template hostToDevice<0>();
1525
1526 linearizer gridGeoPack(sz);
1527
1528 int bs = 0;
1529 size_t sz[1] = {n_cnk};
1530 grid_sm<1,void> g(sz);
1531 auto ite = g.getGPUIterator();
1532
1533 grid_key_dx<dim,int> sz_g;
1534 grid_key_dx<dim,int> origUnpack_cnk;
1535
1536 for (int i = 0 ; i < dim ; i++)
1537 {
1538 sz_g.set_d(i,gridGeometry.getSize()[i]);
1539 origUnpack_cnk.set_d(i,(int)(sub_it.getStart().get(i) / blockEdgeSize)*blockEdgeSize);
1540 }
1541
1542 bs = tmp2.size();
1543 tmp2.resize(tmp2.size() + n_cnk * shifts.size());
1544
1545 n_shifts_cp.add(shifts.size());
1546
1547 switch (shifts.size())
1548 {
1549 case 1:
1550 // Calculate for each chunk the indexes where they should go + active points
1551 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,1,indexT>),ite,ids,
1552 n_cnk,
1553 gridGeoPack,origPack_cnk,
1554 gridGeometry,origUnpack_cnk,
1555 tmp2.toKernel(),
1556 shifts.toKernel(),
1557 sz_g,
1558 bs);
1559 break;
1560 case 2:
1561 // Calculate for each chunk the indexes where they should go + active points
1562 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,2,indexT>),ite,ids,
1563 n_cnk,
1564 gridGeoPack,origPack_cnk,
1565 gridGeometry,origUnpack_cnk,
1566 tmp2.toKernel(),
1567 shifts.toKernel(),
1568 sz_g,
1569 bs);
1570 break;
1571 case 4:
1572 // Calculate for each chunk the indexes where they should go + active points
1573 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,4,indexT>),ite,ids,
1574 n_cnk,
1575 gridGeoPack,origPack_cnk,
1576 gridGeometry,origUnpack_cnk,
1577 tmp2.toKernel(),
1578 shifts.toKernel(),
1579 sz_g,
1580 bs);
1581 break;
1582 case 8:
1583 // Calculate for each chunk the indexes where they should go + active points
1584 CUDA_LAUNCH((SparseGridGpuKernels::convert_chunk_ids<dim,blockSize,blockEdgeSize,8,indexT>),ite,ids,
1585 n_cnk,
1586 gridGeoPack,origPack_cnk,
1587 gridGeometry,origUnpack_cnk,
1588 tmp2.toKernel(),
1589 shifts.toKernel(),
1590 sz_g,
1591 bs);
1592 break;
1593 }
1594
1595 convertChunkIds(offsets,origPack_pnt,sub_it);
1596 }
1597 else
1598 {
1599 convert_blk.add();
1600 n_shifts_cp.add(0);
1601 }
1602
1603 actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
1604
1605 ps.addOffset(actual_offset);
1606 }
1607
1612 template<typename origPackType, typename IteratorType>
1613 void convertChunkIds(short int * offset, origPackType & origPack, IteratorType & sub_it)
1614 {
1615 int quot_diff[dim];
1616 for (int i = 0 ; i < dim ; i++)
1617 {
1618 int op_q = origPack.get(i) % blockEdgeSize;
1619 int ou_q = sub_it.getStart().get(i) % blockEdgeSize;
1620 int quot = abs(ou_q - op_q) % blockEdgeSize;
1621 quot_diff[i] = openfpm::math::sgn(ou_q - op_q)*quot;
1622 }
1623
1624 convert_blk.add();
1625
1626 // Create conversion block
1627
1628 for (int j = 0 ; j < this->blockSize ; j++)
1629 {
1630 int offset = 0;
1631 int bpos = 0;
1632 int bp_c = 1;
1633 int pos = 0;
1634 int pos_c = 1;
1635
1636 int x = j;
1637 for (int i = 0 ; i < dim ; i++)
1638 {
1639 int c = x % blockEdgeSize;
1640
1641 if (quot_diff[i] + c < 0)
1642 {
1643 offset += pos_c*(quot_diff[i] + c + blockEdgeSize);
1644 bpos += bp_c*1;
1645 }
1646 else if (quot_diff[i] + c >= blockEdgeSize)
1647 {
1648 offset += pos_c*(quot_diff[i] + c - blockEdgeSize);
1649 bpos += bp_c*1;
1650 }
1651 else
1652 {
1653 offset += pos_c*(quot_diff[i] + c);
1654 }
1655
1656 pos += pos_c*c;
1657 pos_c *= blockEdgeSize;
1658 bp_c *= (quot_diff[i] != 0)?2:1;
1659 x /= blockEdgeSize;
1660 }
1661
1662 convert_blk.template get<0>(convert_blk.size()-1)[pos] = (bpos << 16) + offset;
1663 }
1664 }
1665
1666public:
1667
1668 typedef AggregateT value_type;
1669
1670 typedef self device_grid_type;
1671
1672 SparseGridGpu()
1673 :stencilSupportRadius(1)
1674 {};
1675
1681 void resize(size_t (& res)[dim])
1682 {
1683 initialize(res);
1684 }
1685
1690 SparseGridGpu(const size_t (& res)[dim], unsigned int stencilSupportRadius = 1)
1691 :stencilSupportRadius(stencilSupportRadius)
1692 {
1693 initialize(res);
1694 };
1695
1700 SparseGridGpu(linearizer & gridGeometry, unsigned int stencilSupportRadius = 1)
1701 : gridGeometry(gridGeometry),
1702 stencilSupportRadius(stencilSupportRadius)
1703 {
1704 // convert to size_t
1705 size_t sz_st[dim];
1706
1707 for (int i = 0 ; i < dim ; i++) {sz_st[i] = gridGeometry.getSize()[i];}
1708
1709 initialize(sz_st);
1710 };
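// Usage sketch (illustrative, not part of the original header): constructing a 3D sparse grid
// of floats over a 256^3 index space with the default 8^3 chunks. The property list, sizes and
// the background-value call (assumed to be inherited from BlockMapGpu) are assumptions made
// only for this example.
//
//   size_t sz[3] = {256, 256, 256};
//   SparseGridGpu<3, aggregate<float>> grid(sz);
//   grid.template setBackgroundValue<0>(0.0f);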
1711
1712 SparseGridGpu_ker
1713 <
1714 dim,
1715 blockEdgeSize,
1716 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1718 indexT,
1719 layout_base,
1720 decltype(extendedBlockGeometry),
1721 linearizer,
1722 AggregateT
1723 > toKernel()
1724 {
1725 SparseGridGpu_ker
1726 <
1727 dim,
1728 blockEdgeSize,
1729 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1731 indexT,
1732 layout_base,
1733 decltype(extendedBlockGeometry),
1734 linearizer,
1735 AggregateT
1736 > toKer(
1738 gridGeometry,
1739 extendedBlockGeometry,
1740 stencilSupportRadius,
1741 ghostLayerToThreadsMapping.toKernel(),
1742 nn_blocks.toKernel(),
1743 e_points.toKernel(),
1744 ghostLayerSize,
1745 bck);
1746 return toKer;
1747 }
1748
1749 template<unsigned int nNN, unsigned int nLoop>
1750 SparseGridGpu_ker
1751 <
1752 dim,
1753 blockEdgeSize,
1754 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1756 indexT,
1757 layout_base,
1758 decltype(extendedBlockGeometry),
1759 linearizer,
1760 AggregateT
1761 > toKernelNN()
1762 {
1763 SparseGridGpu_ker
1764 <
1765 dim,
1766 blockEdgeSize,
1767 typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT,
1769 indexT,
1770 layout_base,
1771 decltype(extendedBlockGeometry),
1772 linearizer,
1773 AggregateT
1774 > toKer(
1776 gridGeometry,
1777 extendedBlockGeometry,
1778 stencilSupportRadius,
1779 ghostLayerToThreadsMapping.toKernel(),
1780 nn_blocks.toKernel(),
1781 e_points.toKernel(),
1782 ghostLayerSize,
1783 bck);
1784 return toKer;
1785 }
1786
1790 void clear()
1791 {
1792 BMG::clear();
1793 }
1794
1795 /* \brief Does nothing
1796 *
1797 */
1798 void setMemory()
1799 {}
1800
1801 auto insertBlockFlush(size_t block) -> decltype(BMG::insertBlockFlush(block))
1802 {
1803 return BMG::insertBlockFlush(block);
1804 }
1805
1811 linearizer & getGrid()
1812 {
1813 return gridGeometry;
1814 }
1815
1821 template<typename stencil_type>
1822 void setNNType()
1823 {
1824 computeGhostLayerMapping<stencil_type>();
1825 }
1826
1827
1828 constexpr static unsigned int getBlockEdgeSize()
1829 {
1830 return blockEdgeSize;
1831 }
1832
1833 constexpr unsigned int getBlockSize() const
1834 {
1835 return blockSize;
1836 }
1837
1838 // Geometry
1839 template<typename CoordT>
1840 inline size_t getLinId(CoordT &coord)
1841 {
1842 return gridGeometry.LinId(coord);
1843 }
1844
1845 inline grid_key_dx<dim, int> getCoord(size_t linId) const
1846 {
1847 return gridGeometry.InvLinId(linId);
1848 }
1849
1850 inline ite_gpu<dim> getGridGPUIterator(const grid_key_dx<dim, int> & start, const grid_key_dx<dim, int> & stop, size_t n_thr = threadBlockSize)
1851 {
1852 return gridSize.getGPUIterator(start,stop,n_thr);
1853 }
1854
1864 template<typename CoordT>
1865 inline base_key get_sparse(const grid_key_dx<dim,CoordT> & coord) const
1866 {
1867 base_key k(*this,0,0);
1868
1869 const auto & blockMap = this->private_get_blockMap();
1870
1871 auto glid = gridGeometry.LinId(coord);
1872
1873 auto bid = glid / blockSize;
1874 auto lid = glid % blockSize;
1875
1876 auto key = blockMap.get_sparse(bid);
1877
1878 k.set_cnk_pos_id(key.id);
1879 k.set_data_id(lid);
1880
1881 return k;
1882 }
1883
1893 template<unsigned int p, typename CoordT>
1894 auto get(const grid_key_dx<dim,CoordT> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1895 {
1896 return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template get<p>(gridGeometry.LinId(coord));
1897 }
1898
1908 template<unsigned int p>
1909 auto get(const sparse_grid_gpu_index<self> & coord) const -> const ScalarTypeOf<AggregateBlockT, p> &
1910 {
1911 return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1912 }
1913
1920 {
1922 }
1923
1929 auto private_get_data_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer())
1930 {
1931 return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
1932 }
1933
1941 template<typename CoordT>
1942 auto get_o(const grid_key_dx<dim,CoordT> & coord) const -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::get(0))>::type >
1943 {
1944 int offset;
1945 indexT lin;
1946 gridGeometry.LinId(coord,lin,offset);
1947
1949 }
1950
1958 auto get_o(const sparse_grid_gpu_index<self> & coord) const -> encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >
1959 {
1960 return encap_data_block<typename std::remove_const<decltype(private_get_data_array().get(0))>::type >(coord.get_data_id(),private_get_data_array().get(coord.get_cnk_pos_id()));
1961 }
1962
1968 bool isSkipLabellingPossible()
1969 {
1970 return (index_size_swp_r == private_get_index_array().size()) && (index_size_swp == private_get_index_array().size());
1971 }
1972
1982 template<unsigned int p>
1983 auto get(const sparse_grid_gpu_index<self> & coord) -> ScalarTypeOf<AggregateBlockT, p> &
1984 {
1985 return private_get_data_array().template get<p>(coord.get_cnk_pos_id())[coord.get_data_id()];
1986 }
1987
1995 unsigned char getFlag(const sparse_grid_gpu_index<self> & coord) const
1996 {
1997 return private_get_data_array().template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(coord.get_cnk_pos_id())[coord.get_data_id()];
1998 }
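// Usage sketch (illustrative): host-side read of the existing points through the iterator and
// get<p>() accessors declared above. `grid` is a hypothetical SparseGridGpu<3,aggregate<float>>
// whose data has already been transferred to the host (assuming the usual deviceToHost<...>()
// member declared elsewhere in this class).
//
//   auto it = grid.getIterator();
//   while (it.isNext())
//   {
//       auto key = it.get();                     // sparse_grid_gpu_index<...> key
//       float v  = grid.template get<0>(key);    // read property 0
//       unsigned char fl = grid.getFlag(key);    // mask/padding flags of the point
//       ++it;
//   }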
1999
2000 template<unsigned int p, typename CoordT>
2001 auto insert(const CoordT &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2002 {
2003 return BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::template insert<p>(gridGeometry.LinId(coord));
2004 }
2005
2006 template<typename CoordT>
2007 auto insert_o(const CoordT &coord) -> encap_data_block<typename std::remove_const<decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::insert_o(0))>::type >
2008 {
2009 indexT ind;
2010 int offset;
2011 gridGeometry.LinId(coord,ind,offset);
2012
2014 }
2015
2022 void construct_link(self & grid_up, self & grid_dw, gpu::ofp_context_t &context)
2023 {
2024/* // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2025 auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2026 auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2027
2028 ite_gpu<1> ite;
2029
2030 ite.wthr.x = indexBuffer.size();
2031 ite.wthr.y = 1;
2032 ite.wthr.z = 1;
2033
2034 ite.thr.x = getBlockSize();
2035 ite.thr.y = 1;
2036 ite.thr.z = 1;
2037
2038 openfpm::vector_gpu<aggregate<unsigned int>> output;
2039 output.resize(indexBuffer.size() + 1);
2040
2041 CUDA_LAUNCH((SparseGridGpuKernels::link_construct<dim,
2042 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2043 blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel());
2044
2045
2046 openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2047
2048 output.template deviceToHost<0>(output.size()-1,output.size()-1);
2049
2050 unsigned int np_lup = output.template get<0>(output.size()-1);
2051
2052 links_up.resize(np_lup);
2053
2054 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert<dim,
2055 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2056 blockSize>),ite,grid_up.toKernel(),this->toKernel(),output.toKernel(),links_up.toKernel());
2057
2058*/
2059 }
2060
2067 {
2068 return link_dw_scan;
2069 }
2070
2077 {
2078 return link_dw;
2079 }
2080
2087 {
2088 return link_up_scan;
2089 }
2090
2097 {
2098 return link_up;
2099 }
2100
2109 void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, gpu::ofp_context_t &context)
2110 {
2111 Box<dim,int> db = db_;
2112
2113 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2114 auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2115 auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2116
2117 // Count padding points
2118
2119 // First we count the padding points
2120 ite_gpu<1> ite;
2121
2122 ite.wthr.x = indexBuffer.size();
2123 ite.wthr.y = 1;
2124 ite.wthr.z = 1;
2125
2126 ite.thr.x = getBlockSize();
2127 ite.thr.y = 1;
2128 ite.thr.z = 1;
2129
2130 openfpm::vector_gpu<aggregate<unsigned int>> output;
2131 output.resize(indexBuffer.size()+1);
2132
2133 output.fill<0>(0);
2134
2135 CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2136 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2137 blockSize>),ite,this->toKernel(),output.toKernel(),db);
2138
2139
2140
2141 openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2142
2143 output.template deviceToHost<0>(output.size()-1,output.size()-1);
2144 unsigned int padding_points = output.template get<0>(output.size()-1);
2145
2146 // get the padding points
2147
2149 pd_points.resize(padding_points);
2150
2151 CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2152
2153 // Count number of link down for padding points
2154
2155 // Calculate ghost
2156
2157 link_dw_scan.resize(padding_points+1);
2158 link_dw_scan.fill<0>(0);
2159
2160 ite = link_dw_scan.getGPUIterator();
2161
2162 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_dw_count<dim,
2163 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2164 blockSize>),
2165 ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),p_dw);
2166
2167 openfpm::scan((unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),link_dw_scan.size(),(unsigned int *)link_dw_scan.template getDeviceBuffer<0>(),context);
2168
2169 link_dw_scan.template deviceToHost<0>(link_dw_scan.size()-1,link_dw_scan.size()-1);
2170
2171 size_t np_ldw = link_dw_scan.template get<0>(link_dw_scan.size()-1);
2172
2173 link_dw.resize(np_ldw);
2174
2175 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_dw<dim,
2176 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2177 blockSize>),ite,pd_points.toKernel(),grid_dw.toKernel(),this->toKernel(),link_dw_scan.toKernel(),link_dw.toKernel(),p_dw);
2178
2179 link_dw_scan.resize(link_dw_scan.size()-1);
2180 }
2181
2187 void construct_link_up(self & grid_up, const Box<dim,int> & db_, Point<dim,int> p_up, gpu::ofp_context_t &context)
2188 {
2189 Box<dim,int> db = db_;
2190
2191 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2192 auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2193 auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2194
2195 // Count padding points
2196
2197 // First we count the padding points
2198 ite_gpu<1> ite;
2199
2200 ite.wthr.x = indexBuffer.size();
2201 ite.wthr.y = 1;
2202 ite.wthr.z = 1;
2203
2204 ite.thr.x = getBlockSize();
2205 ite.thr.y = 1;
2206 ite.thr.z = 1;
2207
2208 openfpm::vector_gpu<aggregate<unsigned int>> output;
2209 output.resize(indexBuffer.size()+1);
2210
2211 output.fill<0>(0);
2212
2213 CUDA_LAUNCH((SparseGridGpuKernels::count_paddings<dim,
2214 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2215 blockSize>),ite,this->toKernel(),output.toKernel(),db);
2216
2217
2218
2219 openfpm::scan((unsigned int *)output.template getDeviceBuffer<0>(),output.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
2220
2221 output.template deviceToHost<0>(output.size()-1,output.size()-1);
2222 unsigned int padding_points = output.template get<0>(output.size()-1);
2223
2224 // get the padding points
2225
2227 pd_points.resize(padding_points);
2228
2229 CUDA_LAUNCH((SparseGridGpuKernels::collect_paddings<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,this->toKernel(),output.toKernel(),pd_points.toKernel(),db);
2230
2231 // Count number of link down for padding points
2232
2233 // Calculate ghost
2234
2235 link_up_scan.resize(padding_points+1);
2236 link_up_scan.fill<0>(0);
2237
2238 ite = link_up_scan.getGPUIterator();
2239
2240 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_up_count<dim,
2241 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2242 blockSize>),
2243 ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),p_up);
2244
2245 openfpm::scan((unsigned int *)link_up_scan.template getDeviceBuffer<0>(),link_up_scan.size(),(unsigned int *)link_up_scan.template getDeviceBuffer<0>(),context);
2246
2247 link_up_scan.template deviceToHost<0>(link_up_scan.size()-1,link_up_scan.size()-1);
2248
2249 size_t np_lup = link_up_scan.template get<0>(link_up_scan.size()-1);
2250
2251 link_up.resize(np_lup);
2252
2253 CUDA_LAUNCH((SparseGridGpuKernels::link_construct_insert_up<dim,
2254 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2255 blockSize>),ite,pd_points.toKernel(),grid_up.toKernel(),this->toKernel(),link_up_scan.toKernel(),link_up.toKernel(),p_up);
2256
2257 link_up_scan.resize(link_up_scan.size()-1);
2258 }
2259
2266 template<typename dim3T>
2267 void setGPUInsertBuffer(dim3T nBlock, dim3T nSlot)
2268 {
2270 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::setGPUInsertBuffer(
2271 dim3SizeToInt(nBlock),
2272 dim3SizeToInt(nSlot)
2273 );
2274 }
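// Typical insertion flow (illustrative sketch): points are inserted from a user kernel through
// the toKernel() view and made visible with flush(). The kernel `user_insert_kernel`, the grid
// sizes and the reduction functor smax_<0> are placeholders/assumptions for this example; `ctx`
// is a gpu::ofp_context_t.
//
//   grid.setGPUInsertBuffer(nBlocks, nSlotsPerBlock);                 // reserve insert slots
//   CUDA_LAUNCH_DIM3(user_insert_kernel, nBlocks, nSlotsPerBlock, grid.toKernel());
//   grid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);  // merge into the grid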
2275
2282 {
2284 }
2285
2286 template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
2287 void tagBoundaries(gpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
2288 {
2289 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2290 auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
2291 auto & dataBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getDataBuffer();
2292
2293 const unsigned int dataChunkSize = BlockTypeOf<AggregateBlockT, 0>::size;
2294 unsigned int numScalars = indexBuffer.size() * dataChunkSize;
2295
2296 if (numScalars == 0) return;
2297 if (findNN == false)
2298 {
2299 findNeighbours<stencil_type>();
2300 findNN = true;
2301 }
2302
2303 // NOTE: Here we want to work only on one data chunk per block!
2304
2305 unsigned int localThreadBlockSize = dataChunkSize;
2306 unsigned int threadGridSize = numScalars % dataChunkSize == 0
2307 ? numScalars / dataChunkSize
2308 : 1 + numScalars / dataChunkSize;
2309
2310 constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value - IntPow<blockEdgeSize, dim>::value), (blockSize * 1)>::value; // todo: This works only for stencilSupportSize==1
2311// constexpr unsigned int nLoop = IntPow<blockEdgeSize + 2, dim>::value; // todo: This works only for stencilSupportSize==1
2312
2313 if (stencilSupportRadius == 1)
2314 {
2315 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2316 dim,
2317 1,
2318 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2319 stencil_type,
2320 checker_type>),
2321 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2322 }
2323 else if (stencilSupportRadius == 2)
2324 {
2325 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2326 dim,
2327 2,
2328 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2329 stencil_type,
2330 checker_type>),
2331 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2332 }
2333 else if (stencilSupportRadius == 0)
2334 {
2335 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::tagBoundaries<
2336 dim,
2337 0,
2338 BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask,
2339 stencil_type,
2340 checker_type>),
2341 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), dataBuffer.toKernel(), this->template toKernelNN<stencil_type::nNN, nLoop>(), nn_blocks.toKernel(),chk);
2342 }
2343 else
2344 {
2345 //todo: KABOOOOOOM!!!
2346 std::cout << __FILE__ << ":" << __LINE__ << " error: stencilSupportRadius supported only up to 2, passed: " << stencilSupportRadius << std::endl;
2347
2348 }
2349
2350 if (opt == tag_boundaries::CALCULATE_EXISTING_POINTS)
2351 {
2352 // first we calculate the existing points
2354
2355 block_points.resize(indexBuffer.size() + 1);
2356 block_points.template get<0>(block_points.size()-1) = 0;
2357 block_points.template hostToDevice<0>(block_points.size()-1,block_points.size()-1);
2358
2359 ite_gpu<1> ite;
2360
2361 ite.wthr.x = indexBuffer.size();
2362 ite.wthr.y = 1;
2363 ite.wthr.z = 1;
2364 ite.thr.x = getBlockSize();
2365 ite.thr.y = 1;
2366 ite.thr.z = 1;
2367
2368 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
2369 ite,
2370 dataBuffer.toKernel(),
2371 block_points.toKernel());
2372
2373 // then we scan
2374 openfpm::scan((indexT *)block_points.template getDeviceBuffer<0>(),block_points.size(),(indexT *)block_points.template getDeviceBuffer<0>(),context);
2375
2376 // Get the total number of points
2377 block_points.template deviceToHost<0>(block_points.size()-1,block_points.size()-1);
2378 size_t tot = block_points.template get<0>(block_points.size()-1);
2379 e_points.resize(tot);
2380
2381 // we fill e_points
2382 CUDA_LAUNCH((SparseGridGpuKernels::fill_e_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),ite,
2383 dataBuffer.toKernel(),
2384 block_points.toKernel(),
2385 e_points.toKernel())
2386
2387 }
2388
2389 cudaDeviceSynchronize();
2390 }
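The CALCULATE_EXISTING_POINTS branch above uses a common GPU counting pattern: count the existing points per chunk, scan the counts to obtain both the per-chunk write offsets and the grand total, then fill a compacted array. A minimal host-side sketch of the same pattern in plain C++ (exclusive_offsets, counts and total are illustrative names, not OpenFPM API):

    #include <cstddef>
    #include <vector>

    // counts[b] = number of existing points in chunk b (what calc_exist_points produces per block)
    std::vector<std::size_t> exclusive_offsets(const std::vector<std::size_t> & counts, std::size_t & total)
    {
        std::vector<std::size_t> offsets(counts.size() + 1, 0);
        for (std::size_t b = 0; b < counts.size(); ++b)
        {offsets[b + 1] = offsets[b] + counts[b];}   // offsets[b] = where chunk b starts writing
        total = offsets.back();                      // last entry is the total number of points
        return offsets;
    }

On the device the scan step is performed by openfpm::scan over block_points, and fill_e_points then writes every existing point at its scanned offset.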
2391
2392 template<typename NNtype = NNStar<dim>>
2393 void findNeighbours()
2394 {
2395 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2397
2398 const unsigned int numBlocks = indexBuffer.size();
2399 const unsigned int numScalars = numBlocks * NNtype::nNN;
2400 nn_blocks.resize(numScalars);
2401
2402 if (numScalars == 0) return;
2403
2404 // NOTE: Here we want to work only on one data chunk per block!
2405
2406 unsigned int localThreadBlockSize = NNtype::nNN;
2407
2408 unsigned int threadGridSize = numScalars % localThreadBlockSize == 0
2409 ? numScalars / localThreadBlockSize
2410 : 1 + numScalars / localThreadBlockSize;
2411
2412 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::findNeighbours<dim,NNtype>),
2413 threadGridSize, localThreadBlockSize,indexBuffer.toKernel(), this->toKernel(),nn_blocks.toKernel());
2414
2415 findNN = true;
2416 }
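Both tagBoundaries and findNeighbours size their launch with the same rounded-up division of numScalars by the per-block work. A small self-contained illustration (ceil_div is a local helper, not an OpenFPM function):

    // Round-up integer division, as used above to derive threadGridSize from numScalars
    constexpr unsigned int ceil_div(unsigned int num, unsigned int den)
    {
        return (num % den == 0) ? num / den : 1 + num / den;   // same as (num + den - 1) / den for den > 0
    }

    static_assert(ceil_div(1024, 256) == 4, "exact multiple: no extra block");
    static_assert(ceil_div(1025, 256) == 5, "remainder: one extra block");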
2417
2418 size_t countExistingElements() const
2419 {
2420 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2423
2425 typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2426 typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2427 constexpr unsigned int blockSize = MaskBlockT::size;
2428 const auto bufferSize = indexBuffer.size();
2429
2430 size_t numExistingElements = 0;
2431
2432 for (size_t blockId=0; blockId<bufferSize; ++blockId)
2433 {
2434 auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2435 for (size_t elementId=0; elementId<blockSize; ++elementId)
2436 {
2437 const auto curMask = dataBlock.template get<pMask>()[elementId];
2438
2439 if (this->exist(curMask))
2440 {
2441 ++numExistingElements;
2442 }
2443 }
2444 }
2445
2446 return numExistingElements;
2447 }
2448
2449 size_t countBoundaryElements()
2450 {
2451 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2454
2456 typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2457 typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2458 constexpr unsigned int blockSize = MaskBlockT::size;
2459 const auto bufferSize = indexBuffer.size();
2460
2461 size_t numBoundaryElements = 0;
2462
2463 for (size_t blockId=0; blockId<bufferSize; ++blockId)
2464 {
2465 auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2466 for (size_t elementId=0; elementId<blockSize; ++elementId)
2467 {
2468 const auto curMask = dataBlock.template get<pMask>()[elementId];
2469
2470 if (this->exist(curMask) && this->isPadding(curMask))
2471 {
2472 ++numBoundaryElements;
2473 }
2474 }
2475 }
2476
2477 return numBoundaryElements;
2478 }
2479
2480 // Compute the mean and standard deviation of the occupancy of existing blocks (padding points included)
2481 void measureBlockOccupancyMemory(double &mean, double &deviation)
2482 {
2483 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2486
2488 typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2489 typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2490 constexpr unsigned int blockSize = MaskBlockT::size;
2491 const auto bufferSize = indexBuffer.size();
2492
2493 openfpm::vector<double> measures;
2494
2495 for (size_t blockId=0; blockId<bufferSize; ++blockId)
2496 {
2497 auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2498 size_t numElementsInBlock = 0;
2499 for (size_t elementId=0; elementId<blockSize; ++elementId)
2500 {
2501 const auto curMask = dataBlock.template get<pMask>()[elementId];
2502
2503 if (this->exist(curMask))
2504 {
2505 ++numElementsInBlock;
2506 }
2507 }
2508 double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2509 measures.add(blockOccupancy);
2510 }
2511
2512 standard_deviation(measures, mean, deviation);
2513 }
2514
2515 // Compute the mean and standard deviation of the occupancy of existing blocks (padding points excluded)
2516 void measureBlockOccupancy(double &mean, double &deviation)
2517 {
2518 // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
2521
2523 typedef typename BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::AggregateInternalT BAggregateT;
2524 typedef BlockTypeOf<BAggregateT, pMask> MaskBlockT;
2525 constexpr unsigned int blockSize = MaskBlockT::size;
2526 const auto bufferSize = indexBuffer.size();
2527
2528 openfpm::vector<double> measures;
2529
2530 for (size_t blockId=0; blockId<bufferSize; ++blockId)
2531 {
2532 auto dataBlock = dataBuffer.get(blockId); // Avoid binary searches as much as possible
2533 size_t numElementsInBlock = 0;
2534 for (size_t elementId=0; elementId<blockSize; ++elementId)
2535 {
2536 const auto curMask = dataBlock.template get<pMask>()[elementId];
2537
2538 if (this->exist(curMask) && !this->isPadding(curMask))
2539 {
2540 ++numElementsInBlock;
2541 }
2542 }
2543 double blockOccupancy = static_cast<double>(numElementsInBlock)/blockSize;
2544 measures.add(blockOccupancy);
2545 }
2546
2547 standard_deviation(measures, mean, deviation);
2548 }
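Both occupancy routines reduce a vector of per-block occupancies with standard_deviation() from common_statistics.hpp. A self-contained equivalent of that reduction, assuming the population form of the deviation (whether OpenFPM uses the population or sample form is not asserted here):

    #include <cmath>
    #include <vector>

    // Mean and (population) standard deviation of per-block occupancies in [0,1]
    void occupancy_stats(const std::vector<double> & occ, double & mean, double & dev)
    {
        mean = 0.0; dev = 0.0;
        if (occ.empty()) {return;}
        for (double o : occ) {mean += o;}
        mean /= occ.size();
        for (double o : occ) {dev += (o - mean) * (o - mean);}
        dev = std::sqrt(dev / occ.size());
    }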
2549
2564 template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2565 void conv_cross(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2566 {
2567 Box<dim,int> box;
2568
2569 for (int i = 0 ; i < dim ; i++)
2570 {
2571 box.setLow(i,start.get(i));
2572 box.setHigh(i,stop.get(i));
2573 }
2574
2575 applyStencils< SparseGridGpuKernels::stencil_cross_func<dim,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2576 }
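The exact functor signature expected by conv_cross is fixed by stencil_cross_func and is not restated here; the following self-contained sketch only illustrates, on a dense 2D array, what a radius-1 cross (von Neumann) stencil computes, using a discrete Laplacian as the example operation:

    #include <vector>

    // Radius-1 cross stencil on a dense nx-by-ny field: centre plus its 4 face neighbours
    void cross_stencil_dense(const std::vector<float> & in, std::vector<float> & out, int nx, int ny)
    {
        for (int j = 1; j < ny - 1; ++j)
        {
            for (int i = 1; i < nx - 1; ++i)
            {
                float c  = in[j * nx + i];
                float xm = in[j * nx + i - 1], xp = in[j * nx + i + 1];
                float ym = in[(j - 1) * nx + i], yp = in[(j + 1) * nx + i];
                out[j * nx + i] = xm + xp + ym + yp - 4.0f * c;
            }
        }
    }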
2577
2578
2583 template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2584 void conv(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2585 {
2586 Box<dim,int> box;
2587
2588 for (int i = 0 ; i < dim ; i++)
2589 {
2590 box.setLow(i,start.get(i));
2591 box.setHigh(i,stop.get(i));
2592 }
2593
2594 constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2595
2596 applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2597 }
2598
2603 template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2604 void conv_cross_b(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args)
2605 {
2606 Box<dim,int> box;
2607
2608 for (int i = 0 ; i < dim ; i++)
2609 {
2610 box.setLow(i,start.get(i));
2611 box.setHigh(i,stop.get(i));
2612 }
2613
2614 constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2615
2616 applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2617 }
2618
2623 template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2624 void conv2_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2625 {
2626 Box<dim,int> box;
2627
2628 for (int i = 0 ; i < dim ; i++)
2629 {
2630 box.setLow(i,start.get(i));
2631 box.setHigh(i,stop.get(i));
2632 }
2633
2634 constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2635
2636 applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2637 }
2638
2643 template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_src3,
2644 unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int prop_dst3,
2645 unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2646 void conv3_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2647 {
2648 Box<dim,int> box;
2649
2650 for (int i = 0 ; i < dim ; i++)
2651 {
2652 box.setLow(i,start.get(i));
2653 box.setHigh(i,stop.get(i));
2654 }
2655
2656 constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2657
2658 applyStencils< SparseGridGpuKernels::stencil_func_conv3_b<dim,nLoop,prop_src1,prop_src2,prop_src3,prop_dst1,prop_dst2,prop_dst3,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2659 }
2660
2665 template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT >
2666 void conv2(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args)
2667 {
2668 Box<dim,int> box;
2669
2670 for (int i = 0 ; i < dim ; i++)
2671 {
2672 box.setLow(i,start.get(i));
2673 box.setHigh(i,stop.get(i));
2674 }
2675
2676 constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value;
2677
2678 applyStencils< SparseGridGpuKernels::stencil_func_conv2<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...);
2679 }
2680
2687 {
2688 Box<dim,int> b;
2689
2690 for (int i = 0 ; i < dim ; i++)
2691 {
2692 b.setLow(i,0);
2693 b.setHigh(i,gridGeometry.getSize()[i]);
2694 }
2695
2696 return b;
2697 }
2698
2699 //todo: Move implems into a functor for compile time choice of stencil mode
2700 template<typename stencil, typename... Args>
2701 void applyStencils(const Box<dim,int> & box, StencilMode mode, Args... args)
2702 {
2703 if (findNN == false)
2704 {
2705 findNeighbours<typename stencil::stencil_type>();
2706 findNN = true;
2707 }
2708
2709 // Apply the given stencil to all elements that are not tagged as boundary.
2710 // The idea is to have this function launch a __global__ kernel (provided by us) on all existing blocks;
2711 // that kernel checks, for each element, that it exists && is not padding, and on those elements it calls
2712 // the user-provided __device__ functor. The stencil mode essentially chooses how the block passed to the
2713 // user functor as storeBlock is loaded: in case of Insert we get the block through an insert (and also call
2714 // the necessary aux functions); in case of In-place we just get the block from the data buffer.
2715 switch (mode)
2716 {
2717 case STENCIL_MODE_INPLACE:
2718 applyStencilInPlace<stencil>(box,mode,args...);
2719 break;
2720 case STENCIL_MODE_INPLACE_NO_SHARED:
2721 applyStencilInPlaceNoShared<stencil>(box,mode,args...);
2722 break;
2723 }
2724 }
2725 template<typename stencil1, typename stencil2, typename ... otherStencils, typename... Args>
2726 void applyStencils(Box<dim,int> box, StencilMode mode, Args... args)
2727 {
2728 applyStencils<stencil1>(box,mode, args...);
2729 applyStencils<stencil2, otherStencils ...>(box,mode, args...);
2730 }
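The variadic overload above peels off one stencil at a time and recurses on the remaining pack. The same compile-time pattern, reduced to a standalone sketch with illustrative functors:

    #include <iostream>

    template<typename F>
    void apply_all(int x) {F{}(x);}                                // base case: a single functor

    template<typename F1, typename F2, typename ... Fs>
    void apply_all(int x) {F1{}(x); apply_all<F2, Fs ...>(x);}     // peel one functor, recurse on the rest

    struct Twice  {void operator()(int x) const {std::cout << 2 * x << std::endl;}};
    struct Square {void operator()(int x) const {std::cout << x * x << std::endl;}};

    // apply_all<Twice, Square>(3) prints 6 then 9, mirroring applyStencils<stencil1, stencil2, ...>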
2731
2732 template<typename BitMaskT>
2733 inline static bool isPadding(BitMaskT &bitMask)
2734 {
2736 ::getBit(bitMask, PADDING_BIT);
2737 }
2738
2739 template<typename BitMaskT>
2740 inline static void setPadding(BitMaskT &bitMask)
2741 {
2743 ::setBit(bitMask, PADDING_BIT);
2744 }
2745
2746 template<typename BitMaskT>
2747 inline static void unsetPadding(BitMaskT &bitMask)
2748 {
2750 ::unsetBit(bitMask, PADDING_BIT);
2751 }
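isPadding, setPadding and unsetPadding only manipulate a single bit of the per-point mask. A self-contained sketch of those bit operations (the actual value of PADDING_BIT is defined elsewhere in the library; 1 is used here purely for illustration):

    #include <cstdint>

    constexpr unsigned int PADDING_BIT_EXAMPLE = 1;   // illustrative position only

    inline bool get_bit(std::uint8_t mask, unsigned int bit)     {return (mask >> bit) & 1u;}
    inline void set_bit(std::uint8_t & mask, unsigned int bit)   {mask |= std::uint8_t(1u << bit);}
    inline void unset_bit(std::uint8_t & mask, unsigned int bit) {mask &= std::uint8_t(~(1u << bit));}

    // set_bit(m, PADDING_BIT_EXAMPLE) marks a point as padding, get_bit(m, PADDING_BIT_EXAMPLE) tests it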
2752
2760 template<typename CoordT>
2761 inline size_t getBlockLinId(const CoordT & blockCoord) const
2762 {
2763 return gridGeometry.BlockLinId(blockCoord);
2764 }
2765
2776 template<unsigned int p>
2777 auto insertFlush(const sparse_grid_gpu_index<self> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2778 {
2780
2781 indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id());
2782 indexT local_id = coord.get_data_id();
2783
2785
2787 block_data.template get<BMG::pMask>()[local_id] = 1;
2788
2789 return block_data.template get<p>()[local_id];
2790 }
2791
2802 template<typename CoordT>
2804 {
2805 auto lin = gridGeometry.LinId(coord);
2806 indexT block_id = lin / blockSize;
2807 local_id = lin % blockSize;
2808
2810
2812 block_data.template get<BMG::pMask>()[local_id] = 1;
2813
2814 return block_data;
2815 }
2816
2827 template<unsigned int p, typename CoordT>
2828 auto insertFlush(const grid_key_dx<dim,CoordT> &coord) -> ScalarTypeOf<AggregateBlockT, p> &
2829 {
2830 // Linearized block_id
2831 auto lin = gridGeometry.LinId(coord);
2832 indexT block_id = lin / blockSize;
2833 indexT local_id = lin % blockSize;
2834
2836
2838 block_data.template get<BMG::pMask>()[local_id] = 1;
2839
2840 return block_data.template get<p>()[local_id];
2841 }
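insertFlush maps a point to a (chunk, in-chunk offset) pair by dividing the linearised index by blockSize. A minimal sketch of that split (split_linear_id and chunk_local are illustrative names):

    #include <cstddef>

    struct chunk_local {std::size_t block_id; std::size_t local_id;};

    // lin is the linearised point index, block_size the number of points per chunk (blockSize above)
    inline chunk_local split_linear_id(std::size_t lin, std::size_t block_size)
    {
        return {lin / block_size, lin % block_size};
    }

    // e.g. with block_size = 64, lin = 130 falls in chunk 2 at local offset 2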
2842
2843 template<unsigned int p>
2844 void print_vct_add_data()
2845 {
2846 typedef BlockMapGpu<
2848 threadBlockSize, indexT, layout_base> BMG;
2849
2850 auto & bM = BMG::blockMap.private_get_vct_add_data();
2851 auto & vI = BMG::blockMap.private_get_vct_add_index();
2852 bM.template deviceToHost<p>();
2853 vI.template deviceToHost<0>();
2854
2855 std::cout << "vct_add_data: " << std::endl;
2856
2857 for (size_t i = 0 ; i < bM.size() ; i++)
2858 {
2859 std::cout << i << " index: " << vI.template get<0>(i) << " BlockData: " << std::endl;
2860 for (size_t j = 0 ; j < blockSize ; j++)
2861 {
2862 std::cout << (int)bM.template get<p>(i)[j] << " ";
2863 }
2864
2865 std::cout << std::endl;
2866 }
2867 }
2868
2874 template<unsigned int p>
2875 void setBackgroundValue(typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type backgroundValue)
2876 {
2877 meta_copy<typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>::meta_copy_(backgroundValue,bck.template get<p>());
2878
2879 BMG::template setBackgroundValue<p,typename boost::mpl::at<typename AggregateT::type,boost::mpl::int_<p>>::type>(backgroundValue);
2880 }
2881
2883
2884 //Functions to check if the packing object is complex
2885 static bool pack()
2886 {
2887 return true;
2888 }
2889
2890 //Functions to check if the packing object is complex
2891 static bool packRequest()
2892 {
2893 return true;
2894 }
2895
2900 template<int ... prp> inline
2901 void packRequest(size_t & req) const
2902 {
2903 // To fill
2906
2907 indexBuffer.template packRequest<prp ...>(req);
2908 dataBuffer.template packRequest<prp ...>(req);
2909
2910 Packer<decltype(gridGeometry),HeapMemory>::packRequest(req);
2911 }
2912
2922 template<int ... prp> void pack(ExtPreAlloc<HeapMemory> & mem,
2923 Pack_stat & sts) const
2924 {
2927
2928 // To fill
2929 indexBuffer.template pack<prp ...>(mem,sts);
2930 dataBuffer.template pack<prp ...>(mem,sts);
2931
2932 Packer<decltype(gridGeometry),HeapMemory>::pack(mem,gridGeometry,sts);
2933 }
2934
2944 template<int ... prp> void unpack(ExtPreAlloc<HeapMemory> & mem,
2945 Unpack_stat & ps)
2946 {
2949
2950 // To fill
2951 indexBuffer.template unpack<prp ...>(mem,ps);
2952 dataBuffer.template unpack<prp ...>(mem,ps);
2953
2954 Unpacker<decltype(gridGeometry),HeapMemory>::unpack(mem,gridGeometry,ps);
2955 }
2956
2957
2967 template<int ... prp> void unpack(ExtPreAlloc<CudaMemory> & mem,
2968 Unpack_stat & ps)
2969 {
2970 if (mem.size() != 0)
2971 {std::cout << __FILE__ << ":" << __LINE__ << " not implemented: " << std::endl;}
2972 }
2973
2979 template<int ... prp> inline
2980 void packRequest(size_t & req, gpu::ofp_context_t &context) const
2981 {
2982 ite_gpu<1> ite;
2983
2984 auto & indexBuffer = private_get_index_array();
2985 auto & dataBuffer = private_get_data_array();
2986
2987 ite.wthr.x = indexBuffer.size();
2988 ite.wthr.y = 1;
2989 ite.wthr.z = 1;
2990 ite.thr.x = getBlockSize();
2991 ite.thr.y = 1;
2992 ite.thr.z = 1;
2993
2994 tmp.resize(indexBuffer.size() + 1);
2995
2996 // Launch a kernel that counts the number of elements in each chunk
2997 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>),
2998 ite,
2999 dataBuffer.toKernel(),
3000 tmp.toKernel());
3001
3002 openfpm::scan((indexT *)tmp. template getDeviceBuffer<0>(),
3003 tmp.size(), (indexT *)tmp. template getDeviceBuffer<0>(), context);
3004
3005 tmp.template deviceToHost<0>(tmp.size()-1,tmp.size()-1);
3006
3007 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3008
3009 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof... (prp)>>(spq);
3010
3011 size_t n_pnt = tmp.template get<0>(tmp.size()-1);
3012
3013
3014 // 4 bytes for each chunk's data; we use a short to pack the offset
3015 // for each counter
3016 req = sizeof(indexT) + // bytes required to pack the number
3017 sizeof(indexT)*indexBuffer.size() + // bytes required to pack the chunk indexes
3018 sizeof(indexT)*tmp.size() + // bytes required to pack the scan of the chunk points
3019 n_pnt*(spq.point_size + sizeof(short int) + sizeof(unsigned char)); // bytes required to pack data + offset position + mask
3020 }
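The request assembled above is the sum of the chunk indexes, their scan and the per-point payload (data, short offset, mask byte). A host-side sketch of the same arithmetic, with illustrative names and sizeof(long int) standing in for sizeof(indexT):

    #include <cstddef>

    // Rough mirror of the request formula: n_cnk chunk indexes, a scan of length n_cnk + 1,
    // and n_pnt points, each carrying its data, a short offset and a one-byte mask.
    std::size_t pack_request_bytes(std::size_t n_cnk, std::size_t n_pnt, std::size_t point_size)
    {
        const std::size_t index_size = sizeof(long int);   // stands in for sizeof(indexT)
        return index_size                                   // the packed count itself
             + index_size * n_cnk                           // chunk indexes
             + index_size * (n_cnk + 1)                     // scan of the per-chunk point counts
             + n_pnt * (point_size + sizeof(short int) + sizeof(unsigned char)); // data + offset + mask
    }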
3021
3036 template<int ... prp> inline
3038 size_t & req) const
3039 {
3040 pack_subs.add();
3041
3042 for (int i = 0 ; i < dim ; i++)
3043 {
3044 pack_subs.template get<0>(pack_subs.size()-1)[i] = sub_it.getStart().get(i);
3045 pack_subs.template get<1>(pack_subs.size()-1)[i] = sub_it.getStop().get(i);
3046 }
3047 }
3048
3054 {
3055 pack_subs.clear();
3056
3057 index_ptrs.clear();
3058 scan_ptrs.clear();
3059 data_ptrs.clear();
3060 offset_ptrs.clear();
3061 mask_ptrs.clear();
3062
3063 req_index = 0;
3064 }
3065
3072 template<int ... prp> inline
3073 void packCalculate(size_t & req, gpu::ofp_context_t &context)
3074 {
3075 ite_gpu<1> ite;
3076 pack_subs.template hostToDevice<0,1>();
3077
3078 auto & indexBuffer = private_get_index_array();
3079 auto & dataBuffer = private_get_data_array();
3080
3081 ite.wthr.x = indexBuffer.size();
3082 ite.wthr.y = 1;
3083 ite.wthr.z = 1;
3084 ite.thr.x = getBlockSize();
3085 ite.thr.y = 1;
3086 ite.thr.z = 1;
3087
3088 tmp.resize((indexBuffer.size() + 1)*pack_subs.size());
3089
3090 if (indexBuffer.size() != 0)
3091 {
3092 if (pack_subs.size() <= 32)
3093 {
3094 // Launch a kernel that counts the number of elements in each chunk
3095 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3097 32,
3098 indexT>),
3099 ite,
3100 indexBuffer.toKernel(),
3101 pack_subs.toKernel(),
3102 gridGeometry,
3103 dataBuffer.toKernel(),
3104 tmp.toKernel(),
3105 indexBuffer.size() + 1);
3106 }
3107 else if (pack_subs.size() <= 64)
3108 {
3109 // Launch a kernel that counts the number of elements in each chunk
3110 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3112 64,
3113 indexT>),
3114 ite,
3115 indexBuffer.toKernel(),
3116 pack_subs.toKernel(),
3117 gridGeometry,
3118 dataBuffer.toKernel(),
3119 tmp.toKernel(),
3120 indexBuffer.size() + 1);
3121 }
3122 else if (pack_subs.size() <= 96)
3123 {
3124 // Launch a kernel that counts the number of elements in each chunk
3125 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3127 96,
3128 indexT>),
3129 ite,
3130 indexBuffer.toKernel(),
3131 pack_subs.toKernel(),
3132 gridGeometry,
3133 dataBuffer.toKernel(),
3134 tmp.toKernel(),
3135 indexBuffer.size() + 1);
3136 }
3137 else if (pack_subs.size() <= 128)
3138 {
3139 // Launch a kernel that counts the number of elements in each chunk
3140 CUDA_LAUNCH((SparseGridGpuKernels::calc_exist_points_with_boxes<dim,
3142 128,
3143 indexT>),
3144 ite,
3145 indexBuffer.toKernel(),
3146 pack_subs.toKernel(),
3147 gridGeometry,
3148 dataBuffer.toKernel(),
3149 tmp.toKernel(),
3150 indexBuffer.size() + 1);
3151 }
3152 else
3153 {
3154 std::cout << __FILE__ << ":" << __LINE__ << " error: no implementation of packCalculate available, create a new case for " << pack_subs.size() << std::endl;
3155 }
3156 }
3157
3158 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3159
3160 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3161
3162 scan_it.resize(pack_subs.size());
3163
3164 // scan all
3165 for (size_t i = 0 ; i < pack_subs.size() ; i++)
3166 {
3167 size_t n_pnt = 0;
3168 size_t n_cnk = 0;
3169
3170 tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3171 tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1) = 0;
3172
3173 // put a zero at the end
3174 tmp.template hostToDevice<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3175 tmp.template hostToDevice<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3176
3177 openfpm::scan(((indexT *)tmp. template getDeviceBuffer<0>()) + i*(indexBuffer.size() + 1),
3178 indexBuffer.size() + 1, (indexT *)tmp. template getDeviceBuffer<0>() + i*(indexBuffer.size() + 1), context);
3179
3180 openfpm::scan(((unsigned int *)tmp. template getDeviceBuffer<1>()) + i*(indexBuffer.size() + 1),
3181 indexBuffer.size() + 1, (unsigned int *)tmp. template getDeviceBuffer<1>() + i*(indexBuffer.size() + 1), context);
3182
3183 tmp.template deviceToHost<0>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3184 tmp.template deviceToHost<1>((i+1)*(indexBuffer.size() + 1)-1,(i+1)*(indexBuffer.size() + 1)-1);
3185
3186 scan_it.template get<0>(i) = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3187
3188 n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3189 n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3190
3191 req += sizeof(size_t) + // bytes required to pack the number of chunks packed
3192 2*dim*sizeof(int) + // starting point + size of the indexing packing
3193 sizeof(indexT)*n_cnk + // bytes required to pack the chunk indexes
3194 align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) + // bytes required to pack the scan of the chunk points
3195 align_number(sizeof(indexT),n_pnt*(spq.point_size)) + // bytes required to pack data
3196 align_number(sizeof(indexT),n_pnt*sizeof(short int)) + // bytes required to pack offsets
3197 align_number(sizeof(indexT),n_pnt*sizeof(unsigned char)); // bytes required to pack masks
3198 }
3199
3200 scan_it.template hostToDevice<0>();
3201
3202 openfpm::scan((indexT *)scan_it. template getDeviceBuffer<0>(),
3203 scan_it.size(), (indexT *)scan_it. template getDeviceBuffer<0>(), context);
3204 }
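packCalculate pads every sub-buffer to a multiple of sizeof(indexT) through align_number(alignment, size). A hedged, self-contained version of that rounding (the real helper lives elsewhere in the library; this mirrors its apparent semantics):

    #include <cstddef>

    // Round sz up to the next multiple of al (al > 0), matching the apparent align_number(al, sz) semantics
    constexpr std::size_t align_up(std::size_t al, std::size_t sz)
    {
        return ((sz + al - 1) / al) * al;
    }

    static_assert(align_up(8, 13) == 16, "13 bytes padded to the next 8-byte boundary");
    static_assert(align_up(8, 16) == 16, "already aligned sizes are unchanged");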
3205
3211 auto getMappingVector() -> decltype(this->blockMap.getMappingVector())
3212 {
3213 return this->blockMap.getMappingVector();
3214 }
3215
3221 auto getMergeIndexMapVector() -> decltype(this->blockMap.getMergeIndexMapVector())
3222 {
3223 return this->blockMap.getMergeIndexMapVector();
3224 }
3225
3240 template<int ... prp> void pack(ExtPreAlloc<CudaMemory> & mem,
3242 Pack_stat & sts)
3243 {
3244 unsigned int i = req_index;
3245
3246 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3247 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3248
3249 auto & indexBuffer = private_get_index_array();
3250 auto & dataBuffer = private_get_data_array();
3251
3252 size_t n_pnt = tmp.template get<0>((i+1)*(indexBuffer.size() + 1)-1);
3253 size_t n_cnk = tmp.template get<1>((i+1)*(indexBuffer.size() + 1)-1);
3254
3255 Packer<size_t,CudaMemory>::pack(mem,n_cnk,sts);
3256 mem.hostToDevice(mem.getOffset(),mem.getOffset()+sizeof(size_t));
3257
3258 size_t offset1 = mem.getOffsetEnd();
3259
3260 grid_key_dx<dim> key = sub_it.getStart();
3261
3262 for (int i = 0 ; i < dim ; i++)
3263 {Packer<int,CudaMemory>::pack(mem,key.get(i),sts);}
3264
3265 for (int i = 0 ; i < dim ; i++)
3266 {Packer<int,CudaMemory>::pack(mem,(int)gridGeometry.getSize()[i],sts);}
3267
3268 mem.hostToDevice(offset1,offset1+2*dim*sizeof(int));
3269
3270 // chunk indexes
3271 mem.allocate(n_cnk*sizeof(indexT));
3272 index_ptrs.add(mem.getDevicePointer());
3273
3274 // chunk point scan
3275 mem.allocate( align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) );
3276 scan_ptrs.add(mem.getDevicePointer());
3277
3278 // chunk data
3279 mem.allocate( align_number(sizeof(indexT),n_pnt*(spq.point_size)) );
3280 data_ptrs.add(mem.getDevicePointer());
3281
3282 // space for offsets
3283 mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(short int) ) );
3284 offset_ptrs.add(mem.getDevicePointer());
3285
3286 // space for masks
3287 mem.allocate( align_number(sizeof(indexT),n_pnt*sizeof(unsigned char) ) );
3288 mask_ptrs.add(mem.getDevicePointer());
3289
3290 req_index++;
3291 }
3292
3293
3310 template<unsigned int ... prp>
3312 {
3313 if ((opt & 0x3) == rem_copy_opt::PHASE1)
3314 {
3315 this->template removeCopyToFinalize_phase1<prp ...>(ctx,opt);
3316 }
3317 else if ((opt & 0x3) == rem_copy_opt::PHASE2)
3318 {
3319 this->template removeCopyToFinalize_phase2<prp ...>(ctx,opt);
3320 }
3321 else
3322 {
3323 this->template removeCopyToFinalize_phase3<prp ...>(ctx,opt,false);
3324 }
3325 }
3326
3339 template<int ... prp> void packFinalize(ExtPreAlloc<CudaMemory> & mem,
3340 Pack_stat & sts,
3341 int opt = 0,
3342 bool is_pack_remote = false)
3343 {
3344
3345 RestorePackVariableIfKeepGeometry(opt,is_pack_remote);
3346
3347 if (pack_subs.size() <= 32)
3348 {
3349 pack_sg_implement<32,prp...>(mem,sts,opt,is_pack_remote);
3350 }
3351 else if (pack_subs.size() <= 64)
3352 {
3353 pack_sg_implement<64, prp...>(mem,sts,opt,is_pack_remote);
3354 }
3355 else if (pack_subs.size() <= 80)
3356 {
3357 pack_sg_implement<80, prp...>(mem,sts,opt,is_pack_remote);
3358 }
3359 else
3360 {
3361 std::cout << __FILE__ << ":" << __LINE__ << " error: no implementation of packFinalize available, create a new case for " << pack_subs.size() << std::endl;
3362 }
3363
3364 savePackVariableIfNotKeepGeometry(opt,is_pack_remote);
3365 }
3366
3373 {
3374 rem_sects.clear();
3375
3376 auto & vad = BMG::blockMap.private_get_vct_add_data();
3377 auto & vai = BMG::blockMap.private_get_vct_add_index();
3378
3379 vad.clear();
3380 vai.clear();
3381
3382 // Clear variables
3383 offset_ptrs_cp.clear();
3384 scan_ptrs_cp.clear();
3385 n_cnk_cp.clear();
3386 n_pnt_cp.clear();
3387 data_base_ptr_cp.clear();
3388 box_cp.clear();
3389 n_shifts_cp.clear();
3390 convert_blk.clear();
3391 tmp2.clear();
3392 }
3393
3399 void swap(self & gr)
3400 {
3401 gridGeometry.swap(gr.gridGeometry);
3402
3403 BMG::swap(gr);
3404 }
3405
3414 {
3415 auto & indexBuffer = private_get_index_array();
3416 auto & dataBuffer = private_get_data_array();
3417
3418 // first we remove
3419 if (rem_sects.size() != 0)
3420 {
3421 rem_sects.template hostToDevice<0,1>();
3422
3423 tmp.resize(indexBuffer.size() + 1);
3424
3425 tmp.template get<1>(tmp.size()-1) = 0;
3426 tmp.template hostToDevice<1>(tmp.size()-1,tmp.size()-1);
3427
3428 auto ite = indexBuffer.getGPUIterator();
3429
3430 if (has_work_gpu(ite) == true)
3431 {
3432 // mark all the chunks from which points must be removed
3433 CUDA_LAUNCH((SparseGridGpuKernels::calc_remove_points_chunks_boxes<dim,
3435 blockEdgeSize>),ite,indexBuffer.toKernel(),rem_sects.toKernel(),
3436 gridGeometry,dataBuffer.toKernel(),
3437 tmp.toKernel());
3438
3439 // scan
3440 openfpm::scan((unsigned int *)tmp.template getDeviceBuffer<1>(),tmp.size(),(unsigned int *)tmp.template getDeviceBuffer<1>(),context);
3441
3442 tmp.template deviceToHost<1>(tmp.size()-1,tmp.size()-1);
3443
3444 // get the number of chunks involved
3445 size_t nr_cnk = tmp.template get<1>(tmp.size()-1);
3446
3447 tmp3.resize(nr_cnk);
3448
3449 // collect the chunks involved in the remove
3450 ite = indexBuffer.getGPUIterator();
3451
3452 if (has_work_gpu(ite) == false) {return;}
3453
3454 CUDA_LAUNCH((SparseGridGpuKernels::collect_rem_chunks),ite,tmp.toKernel(),tmp3.toKernel());
3455
3456 // Launch to remove points
3457
3458 ite = tmp3.getGPUIterator();
3459
3460 ite.wthr.x = tmp3.size();
3461 ite.wthr.y = 1;
3462 ite.wthr.z = 1;
3463 ite.thr.x = getBlockSize();
3464 ite.thr.y = 1;
3465 ite.thr.z = 1;
3466
3467 if (has_work_gpu(ite) == false) {return;}
3468
3469 CUDA_LAUNCH((SparseGridGpuKernels::remove_points<dim,
3471 ite,indexBuffer.toKernel(),
3472 gridGeometry,
3473 dataBuffer.toKernel(),
3474 tmp3.toKernel(),
3475 rem_sects.toKernel());
3476
3477 tmp3.clear();
3478 }
3479 }
3480 }
3481
3487 template<unsigned int ... prp>
3489 {
3490 if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3491 {removePoints(context);}
3492
3493 removeCopyToFinalize_phase3<prp ...>(context,opt,true);
3494 }
3495
3501 {
3502 rem_sects.clear();
3503 copySect.clear();
3504 offset_ptrs_cp.clear();
3505 scan_ptrs_cp.clear();
3506 data_base_ptr_cp.clear();
3507 n_cnk_cp.clear();
3508 n_pnt_cp.clear();
3509 n_shifts_cp.clear();
3510 convert_blk.clear();
3511 box_cp.clear();
3512 data_base_ptr_cp.clear();
3513
3514 tmp2.clear();
3515 }
3516
3524 void remove(const Box<dim,int> & section_to_delete)
3525 {
3526 rem_sects.add(section_to_delete);
3527 }
3528
3534 static constexpr bool isCompressed()
3535 {
3536 return true;
3537 }
3538
3546 void copy_to(self & grid_src,
3547 const Box<dim,size_t> & box_src,
3548 const Box<dim,size_t> & box_dst)
3549 {
3550 // here we only queue the copy section; the actual copy is carried out when the queued operations are finalized
3551
3552 sparse_grid_section<self> sgs(*this,box_src,box_dst);
3553
3554 grid_src.copySect.add(sgs);
3555 }
3556
3560 template<typename pointers_type,
3561 typename headers_type,
3562 typename result_type,
3563 unsigned int ... prp >
3564 static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot)
3565 {
3566 // we have to increment ps by the right amount
3567 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3568 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3569
3570 result.allocate(sizeof(int));
3571
3572 if (pointers.size())
3573 CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers<decltype(std::declval<self>().toKernel())>),1,pointers.size(),
3574 pointers.toKernel(),
3575 headers.toKernel(),
3576 (int *)result.getDevicePointer(),
3577 spq.point_size,
3578 n_slot)
3579 }
3580
3590 template<unsigned int ... prp, typename S2, typename header_type>
3593 header_type & headers,
3594 int ih,
3595 Unpack_stat & ps,
3596 gpu::ofp_context_t &context,
3597 rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3598 {
3600
3601 if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3602 {
3603 this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,context);
3604
3605 // readjust mem
3606 }
3607 else
3608 {
3609 // we have to increment ps by the right amount
3610 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3611 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3612
3613 // First get the number of chunks
3614
3615 size_t n_cnk = headers.template get<1>(ih);
3616 ps.addOffset(sizeof(size_t));
3617 ps.addOffset(2*dim*sizeof(unsigned int));
3618
3619 size_t actual_offset = n_cnk*sizeof(indexT);
3620 unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3621
3622 // Unpack number of points
3623 // calculate the number of total points
3624 size_t n_pnt = headers.template get<2>(ih);
3625 actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3626
3627 void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3628
3629 actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3630 short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3631
3632 actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3633 actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3634
3635 scan_ptrs_cp.add(scan);
3636 offset_ptrs_cp.add(offsets);
3637 data_base_ptr_cp.add(data_base_ptr);
3638
3639 ps.addOffset(actual_offset);
3640 }
3641 }
3642
3649 {return true;}
3650
3660 template<unsigned int ... prp, typename S2>
3663 Unpack_stat & ps,
3664 gpu::ofp_context_t &context,
3665 rem_copy_opt opt = rem_copy_opt::NONE_OPT)
3666 {
3668
3669 if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
3670 {
3671 this->template addAndConvertPackedChunkToTmp<prp ...>(mem,sub_it,ps,context);
3672
3673 // readjust mem
3674 }
3675 else
3676 {
3677 // we have to increment ps by the right amount
3678 sparsegridgpu_pack_request<AggregateT,prp ...> spq;
3679 boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
3680
3681 // First get the number of chunks
3682
3683 size_t n_cnk;
3684
3685 // Unpack the number of chunks
3686 mem.deviceToHost(ps.getOffset(),ps.getOffset() + sizeof(size_t) + 2*dim*sizeof(int));
3687 Unpacker<size_t,S2>::unpack(mem,n_cnk,ps);
3688
3689 // Unpack origin of the chunk indexing
3690/* for (int i = 0 ; i < dim ; i++)
3691 {
3692 int tmp;
3693 Unpacker<int,S2>::unpack(mem,tmp,ps);
3694 }
3695
3696 for (int i = 0 ; i < dim ; i++)
3697 {
3698 int tmp;
3699 Unpacker<int,S2>::unpack(mem,tmp,ps);
3700 }*/
3701
3702 ps.addOffset(2*dim*sizeof(unsigned int));
3703
3704 size_t actual_offset = n_cnk*sizeof(indexT);
3705 unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT));
3706
3707 mem.deviceToHost(ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int),
3708 ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int) + sizeof(unsigned int));
3709
3710 // Unpack number of points
3711 // calculate the number of total points
3712 size_t n_pnt = *(unsigned int *)((unsigned char *)mem.getPointer() + ps.getOffset() + actual_offset + n_cnk*sizeof(unsigned int));
3713 actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int));
3714
3715 void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset );
3716
3717 actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size));
3718 short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset);
3719
3720 actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short));
3721 actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char));
3722
3723 scan_ptrs_cp.add(scan);
3724 offset_ptrs_cp.add(offsets);
3725 data_base_ptr_cp.add(data_base_ptr);
3726
3727 ps.addOffset(actual_offset);
3728 }
3729 }
3730
3736 {
3738 }
3739
3741
3748 {
3749 return decltype(self::type_of_iterator())(*this);
3750 }
3751
3757 decltype(self::type_of_subiterator()) getIterator(const grid_key_dx<dim> & start, const grid_key_dx<dim> & stop, int is_to_init = 1) const
3758 {
3759 return decltype(self::type_of_subiterator())(*this,start,stop,is_to_init);
3760 }
3761
3768 {
3770 }
3771
3777 auto private_get_add_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.private_get_vct_add_index()) &
3778 {
3780 }
3781
3787 auto private_get_index_array() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer()) &
3788 {
3790 }
3791
3792 auto getSegmentToOutMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3793 {
3795 }
3796
3797 auto getSegmentToOutMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToOutMap())
3798 {
3800 }
3801
3802 auto getSegmentToMergeIndexMap() -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3803 {
3805 }
3806
3807 auto getSegmentToMergeIndexMap() const -> decltype(BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getSegmentToMergeIndexMap())
3808 {
3810 }
3811
3818 {
3820 }
3821
3827 auto private_get_neighborhood_array() -> decltype(nn_blocks) &
3828 {
3829 return nn_blocks;
3830 }
3831
3832
3833#if defined(OPENFPM_DATA_ENABLE_IO_MODULE) || defined(PERFORMANCE_TEST) || defined(VTKWRITER_HPP_)
3834
3840 template<typename Tw = float> bool write(const std::string & output)
3841 {
3842 Point<dim,double> spacing;
3843 Point<dim,double> offset;
3844
3845 spacing.one();
3846 offset.zero();
3847
3848 return write_with_spacing_offset(output,spacing,offset);
3849 }
3850
3856 template<typename Tw = float>
3857 bool write_with_spacing_offset(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3858 {
3859 file_type ft = file_type::BINARY;
3860
3861 auto & bm = this->private_get_blockMap();
3862
3863 auto & index = bm.getIndexBuffer();
3864 auto & data = bm.getDataBuffer();
3865
3868
3869 // copy position and properties
3870
3871 auto it = index.getIterator();
3872
3873 while(it.isNext())
3874 {
3875 auto key = it.get();
3876
3877 Point<dim,Tw> p;
3878
3879 for (size_t i = 0 ; i < gridGeometry.getBlockSize() ; i++)
3880 {
3882 {
3883 // Get the block index
3884 grid_key_dx<dim,int> keyg = gridGeometry.InvLinId(index.template get<0>(key),i);
3885
3886 for (size_t k = 0 ; k < dim ; k++)
3887 {p.get(k) = keyg.get(k)*spacing[k] + offset[k]*spacing[k];}
3888
3889 tmp_pos.add(p);
3890
3891 tmp_prp.add();
3892 copy_prop_to_vector_block<decltype(data.get_o(key)),decltype(tmp_prp.last())>
3893 cp(data.get_o(key),tmp_prp.last(),key,i);
3894
3895 boost::mpl::for_each_ref< boost::mpl::range_c<int,0,AggregateT::max_prop> >(cp);
3896
3897 tmp_prp.last().template get<AggregateT::max_prop>() = data.template get<BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::pMask>(key)[i];
3898 }
3899 }
3900
3901 ++it;
3902 }
3903
3904 // VTKWriter for a set of points
3906 vtk_writer.add(tmp_pos,tmp_prp,tmp_pos.size());
3907
3909
3910 // Write the VTK file
3911 return vtk_writer.write(output,prp_names,"sparse_grid","",ft);
3912 }
3913
3919 template<typename Tw = float> bool write_debug(const std::string & output, Point<dim,double> spacing, Point<dim,double> offset)
3920 {
3923
3925
3926 auto & ids = private_get_index_array();
3927
3928 fill_chunks_boxes(chunks_box,ids,spacing,offset);
3929
3930 vtk_box1.add(chunks_box);
3931 vtk_box1.write(std::string("chunks_") + output + std::string(".vtk"));
3932
3933 //write data
3934
3935 write_with_spacing_offset(std::string("data_") + output + std::string(".vtk"),spacing,offset);
3936
3937 return true;
3938 }
3939
3940#endif
3941};
3942
3943template<unsigned int dim,
3944 typename AggregateT,
3945 unsigned int blockEdgeSize = default_edge<dim>::type::value,
3946 unsigned int threadBlockSize = default_edge<dim>::tb::value,
3947 typename indexT=long int,
3948 template<typename> class layout_base=memory_traits_inte,
3949 typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3951
3952template<unsigned int dim,
3953 typename AggregateT,
3954 unsigned int blockEdgeSize = default_edge<dim>::type::value,
3955 unsigned int threadBlockSize = default_edge<dim>::tb::value,
3956 typename indexT=int,
3957 template<typename> class layout_base=memory_traits_inte,
3958 typename linearizer = grid_zmb<dim, blockEdgeSize,indexT>>
3960
3961template<unsigned int dim,
3962 typename AggregateT,
3963 unsigned int blockEdgeSize = default_edge<dim>::type::value,
3964 unsigned int threadBlockSize = default_edge<dim>::tb::value,
3965 typename indexT=int,
3966 template<typename> class layout_base=memory_traits_inte,
3967 typename linearizer = grid_smb<dim, blockEdgeSize,indexT>>
3969
3970#endif //OPENFPM_PDATA_SPARSEGRIDGPU_HPP