#ifndef OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
#define OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_

#include "Vector/map_vector_sparse.hpp"
#include "NN/CellList/CellDecomposer.hpp"
#include "Vector/map_vector.hpp"
#include "Cuda_cell_list_util_func.hpp"
#include "NN/CellList/cuda/CellList_gpu_ker.cuh"
#include "util/cuda_util.hpp"
#include "NN/CellList/CellList_util.hpp"
#include "NN/CellList/CellList.hpp"
#include "util/cuda/scan_ofp.cuh"

constexpr int count = 0;
constexpr int start = 1;
template<unsigned int dim,
         typename T,
         typename cnt_type,
         typename ids_type,
         typename Memory,
         typename transform,
         typename vector_cnt_type,
         typename vector_cnt_type2,
         typename cl_sparse_type,
         bool is_sparse>
struct CellList_gpu_ker_selector
{
	static CellList_gpu_ker<dim,T,cnt_type,ids_type,transform> get(vector_cnt_type & starts,
	                                                               vector_cnt_type & cell_nn,
	                                                               vector_cnt_type2 & cell_nn_list,
	                                                               cl_sparse_type & cl_sparse,
	                                                               vector_cnt_type & sorted_to_not_sorted,
	                                                               vector_cnt_type & sorted_domain_particles_ids,
	                                                               /* ... further parameters elided in the extraction ... */)
	{
		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform>(starts.toKernel(),
		                                                           sorted_to_not_sorted.toKernel(),
		                                                           sorted_domain_particles_ids.toKernel(),
		                                                           /* ... remaining arguments elided ... */);
	}
};
template<unsigned int dim,
         typename T,
         typename cnt_type,
         typename ids_type,
         typename Memory,
         typename transform,
         typename vector_cnt_type,
         typename vector_cnt_type2,
         typename cl_sparse_type>
struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector_cnt_type,vector_cnt_type2,cl_sparse_type,true>
{
	static CellList_gpu_ker<dim,T,cnt_type,ids_type,transform> get(vector_cnt_type & starts,
	                                                               vector_cnt_type & cell_nn,
	                                                               vector_cnt_type2 & cell_nn_list,
	                                                               cl_sparse_type & cl_sparse,
	                                                               vector_cnt_type & srt,
	                                                               vector_cnt_type & dprt,
	                                                               /* ... further parameters elided in the extraction ... */)
	{
		// the sparse variant additionally hands the per-cell neighborhood list to the kernel
		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform>(/* ... */
		                                                           cell_nn_list.toKernel(),
		                                                           /* ... remaining arguments elided ... */);
	}
};
template<unsigned int dim,
         typename T,
         typename Memory,
         typename transform = no_transform_only<dim,T>,
         typename cnt_type = unsigned int,
         typename ids_type = int,
         bool is_sparse = false>
class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
{
	//! number of particles in each cell
	vector_cnt_type cl_n;

	//! cell id of each particle (after reordering)
	vector_cnt_type cells;

	//! start index of each cell in the reordered particle array
	vector_cnt_type starts;

	//! extension (in cells per direction) of the neighborhood stencil under test
	int cells_nn_test_size;

	//! map: sorted particle id -> original (non-sorted) particle id
	vector_cnt_type sorted_to_not_sorted;

	//! markers separating domain from ghost particles in the sorted set
	vector_cnt_type sorted_domain_particles_dg;

	//! ids of the domain particles in the sorted set
	vector_cnt_type sorted_domain_particles_ids;

	//! map: original (non-sorted) particle id -> sorted particle id
	vector_cnt_type non_sorted_to_sorted;

	/* ... further members (part_ids, cl_sparse, cells_nn, cells_nn_list, cells_nn_test,
	   nnc_rad, spacing_c, div_c, g_m, n_dec) elided in the extraction ... */
	/*! \brief Initialize the internal structures
	 *
	 * \param div number of cells in each dimension
	 * \param tot_n_cell total number of cells
	 * \param pad padding (in cells) around the domain
	 */
	void InitializeStructures(const size_t (& div)[dim], size_t tot_n_cell, size_t pad)
	{
		for (size_t i = 0 ; i < dim ; i++)
		{
			div_c[i] = div[i];
			spacing_c[i] = this->getCellBox().getP2().get(i);
		}

		cl_n.resize(tot_n_cell);

		cells_nn_test_size = 1;
		construct_cell_nn_test(cells_nn_test_size);
	}
	/*! \brief Build the table of linear-id offsets forming the neighborhood stencil
	 *
	 * \param box_nn neighborhood extension in cells per direction (1 means the 3^dim stencil)
	 */
	void construct_cell_nn_test(unsigned int box_nn = 1)
	{
		auto & gs = this->getGrid();

		grid_key_dx<dim> start;
		grid_key_dx<dim> stop;
		grid_key_dx<dim> middle;
		for (size_t i = 0 ; i < dim ; i++)
		{
			start.set_d(i,0);
			stop.set_d(i,2*box_nn);
			middle.set_d(i,box_nn);
		}

		cells_nn_test.resize(openfpm::math::pow(2*box_nn+1,dim));

		// linear id of the central cell of the stencil
		int mid = gs.LinId(middle);

		// store, for every stencil cell, its linear-id offset from the central cell
		size_t i = 0;
		grid_key_dx_iterator_sub<dim> it(gs,start,stop);
		while (it.isNext())
		{
			auto p = it.get();

			cells_nn_test.template get<0>(i) = (int)gs.LinId(p) - mid;

			++i;
			++it;
		}

		cells_nn_test.template hostToDevice<0>();

#if defined(__NVCC__) && defined(USE_LOW_REGISTER_ITERATOR)
		// make the stencil available in constant memory for the low-register iterator
		cudaMemcpyToSymbol(cells_striding,cells_nn_test.template getPointer<0>(),cells_nn_test.size()*sizeof(int));
#endif
	}
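	/* Worked example (illustrative, not from the original source): with dim = 2,
	 * box_nn = 1 and a padded cell grid of 10 cells per row, the stencil is the
	 * 3x3 block around the central cell and the stored linear-id offsets are
	 *
	 *   { -11, -10, -9, -1, 0, 1, 9, 10, 11 }
	 *
	 * A kernel can then visit every neighbor of cell c as c + offset, with no
	 * multi-dimensional index arithmetic on the device.
	 */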
	/*! \brief Construct the sparse cell list starting from the particle positions
	 *
	 * Pipeline: compute the cell id of every particle (subindex), flush the ids
	 * into the sparse structure, build the per-cell neighborhood lists, then
	 * reorder particles and properties in cell order.
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct_sparse(vector & pl,
	                      vector & pl_out,
	                      vector_prp & pl_prp,
	                      vector_prp & pl_prp_out,
	                      mgpu::ofp_context_t & mgpuContext,
	                      size_t g_m,
	                      size_t start,
	                      size_t stop,
	                      cl_construct_opt opt = cl_construct_opt::Full)
	{
#ifdef __NVCC__
		part_ids.resize(stop - start);
		starts.resize(stop - start);

		auto ite_gpu = pl.getGPUIteratorTo(stop-start);

		// compute the cell id of every particle
		CUDA_LAUNCH((subindex<true,dim,T,cnt_type,ids_type>),ite_gpu,div_c,
		            /* ... arguments elided in the extraction ... */
		            this->getTransform(),
		            /* ... */
		            static_cast<T *>(pl.template getDeviceBuffer<0>()),
		            static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
		            static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));

		cells.resize(stop-start);

		// mark empty cells with the invalid id (cnt_type)-1
		cl_sparse.template setBackground<0>((cnt_type)-1);
		CUDA_LAUNCH((fill_cells_sparse),ite_gpu,cl_sparse.toKernel(),starts.toKernel());
		cl_sparse.template flush_vd<sstart_<0>>(cells,mgpuContext,FLUSH_ON_DEVICE);

		cells_nn.resize(cl_sparse.size()+1);
		cells_nn.template fill<0>(0);

		// count how many non-empty neighbor cells each cell has
		auto itgg = cl_sparse.getGPUIterator();
		CUDA_LAUNCH((count_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());

		// exclusive scan: turn the counts into start offsets into cells_nn_list
		openfpm::scan((cnt_type *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (cnt_type *)cells_nn.template getDeviceBuffer<0>(), mgpuContext);

		// the last element of the scan is the total number of neighbor entries
		cells_nn.template deviceToHost<0>(cells_nn.size() - 1, cells_nn.size() - 1);
		size_t n_nn_cells = cells_nn.template get<0>(cells_nn.size() - 1);

		cells_nn_list.resize(n_nn_cells);

		CUDA_LAUNCH((fill_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cells_nn_list.toKernel(),cells.size());

		sorted_to_not_sorted.resize(stop-start);
		non_sorted_to_sorted.resize(pl.size());

		auto ite = pl.getGPUIteratorTo(stop-start,64);

		// reorder positions and properties in cell order
		CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
		                           decltype(pl.toKernel()),
		                           decltype(sorted_to_not_sorted.toKernel()),
		                           /* ... template arguments elided ... */>),
		            ite,
		            /* ... */
		            pl_prp_out.toKernel(),
		            /* ... */
		            sorted_to_not_sorted.toKernel(),
		            non_sorted_to_sorted.toKernel(),
		            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));

		if (opt == cl_construct_opt::Full)
		{
			construct_domain_ids(mgpuContext,start,stop,g_m);
		}
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__
		          << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it appears to have been compiled by the standard system compiler" << std::endl;
#endif
	}
	/*! \brief Collect the ids of the domain particles (as opposed to ghosts) in the sorted set
	 *
	 * mark + scan + compact: each sorted particle is marked if it belongs to the
	 * domain, the marks are scanned into output positions, and the ids are
	 * compacted into sorted_domain_particles_ids.
	 */
	void construct_domain_ids(mgpu::ofp_context_t & mgpuContext, size_t start, size_t stop, size_t g_m)
	{
#ifdef __NVCC__
		sorted_domain_particles_dg.resize(stop-start+1);

		auto ite = sorted_domain_particles_dg.getGPUIterator();

		CUDA_LAUNCH((mark_domain_particles),ite,sorted_to_not_sorted.toKernel(),sorted_domain_particles_dg.toKernel(),g_m);

		// scan the marks into output positions
		openfpm::scan((unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),sorted_domain_particles_dg.size(),(unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),mgpuContext);

		// the last element of the scan is the total number of domain particles
		sorted_domain_particles_dg.template deviceToHost<0>(sorted_domain_particles_dg.size()-1,sorted_domain_particles_dg.size()-1);
		auto sz = sorted_domain_particles_dg.template get<0>(sorted_domain_particles_dg.size()-1);

		sorted_domain_particles_ids.resize(sz);

		CUDA_LAUNCH((collect_domain_ghost_ids),ite,sorted_domain_particles_dg.toKernel(),sorted_domain_particles_ids.toKernel());
#endif
	}
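	/* Worked example (illustrative): take 6 sorted particles where particles
	 * 0,1,3,4 are domain particles and 2,5 are ghosts; the marker array gets one
	 * extra trailing slot so the scan total is addressable.
	 *
	 *   marks          : 1 1 0 1 1 0 (0)
	 *   exclusive scan : 0 1 2 2 3 4 (4)   -> sz = 4 domain particles
	 *
	 * collect_domain_ghost_ids writes particle i to slot scan[i] whenever its
	 * mark is 1, producing sorted_domain_particles_ids = {0,1,3,4}.
	 */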
	/*! \brief Construct the dense cell list starting from the particle positions
	 *
	 * Counting-sort pipeline: count the particles per cell (subindex), scan the
	 * counts into per-cell start offsets, scatter the particles into the cells
	 * (fill_cells), then reorder positions and properties in cell order.
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct_dense(vector & pl,
	                     vector & pl_out,
	                     vector_prp & pl_prp,
	                     vector_prp & pl_prp_out,
	                     mgpu::ofp_context_t & mgpuContext,
	                     size_t g_m,
	                     size_t start,
	                     size_t stop,
	                     cl_construct_opt opt = cl_construct_opt::Full)
	{
#ifdef __NVCC__
		auto ite_gpu = pl.getGPUIteratorTo(stop-start-1);

		cl_n.resize(this->gr_cell.size()+1);
		cl_n.template fill<0>(0);

		part_ids.resize(stop - start);

		// nothing to do on an empty set: produce empty (zeroed) start offsets
		if (ite_gpu.wthr.x == 0 || pl.size() == 0 || stop == 0)
		{
			starts.resize(cl_n.size());
			starts.template fill<0>(0);
			return;
		}

		// count the number of particles in each cell
		CUDA_LAUNCH((subindex<false,dim,T,cnt_type,ids_type>),ite_gpu,div_c,
		            /* ... arguments elided in the extraction ... */
		            this->getTransform(),
		            /* ... */
		            static_cast<T *>(pl.template getDeviceBuffer<0>()),
		            static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
		            static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()));

		// exclusive scan: counts -> per-cell start offsets
		starts.resize(cl_n.size());
		openfpm::scan((cnt_type *)cl_n.template getDeviceBuffer<0>(), cl_n.size(), (cnt_type *)starts.template getDeviceBuffer<0>(), mgpuContext);

		// scatter the particle ids into the cells
		cells.resize(stop-start);
		auto itgg = part_ids.getGPUIterator();

#ifdef MAKE_CELLLIST_DETERMINISTIC

		CUDA_LAUNCH((fill_cells),itgg,
		            /* ... arguments elided in the extraction ... */
		            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );

		// sort to make the intra-cell particle order deterministic
		mgpu::mergesort(static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()),pl.size(),mgpu::less_t<cnt_type>(),mgpuContext);

#else

		CUDA_LAUNCH((fill_cells),itgg,
		            /* ... arguments elided in the extraction ... */
		            static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
		            static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
		            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );

#endif

		sorted_to_not_sorted.resize(stop-start);
		non_sorted_to_sorted.resize(pl.size());

		auto ite = pl.getGPUIteratorTo(stop-start,64);

		if (sizeof...(prp) == 0)
		{
			// reorder every property
			CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
			                           decltype(pl.toKernel()),
			                           decltype(sorted_to_not_sorted.toKernel()),
			                           /* ... template arguments elided ... */>),
			            ite,
			            /* ... */
			            pl_prp_out.toKernel(),
			            /* ... */
			            sorted_to_not_sorted.toKernel(),
			            non_sorted_to_sorted.toKernel(),
			            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
		}
		else
		{
			// reorder only the requested properties prp...
			CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()),
			                                decltype(pl.toKernel()),
			                                decltype(sorted_to_not_sorted.toKernel()),
			                                /* ... template arguments elided ... */>),
			            ite,
			            /* ... */
			            pl_prp_out.toKernel(),
			            /* ... */
			            sorted_to_not_sorted.toKernel(),
			            non_sorted_to_sorted.toKernel(),
			            static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
		}

		if (opt == cl_construct_opt::Full)
		{
			construct_domain_ids(mgpuContext,start,stop,g_m);
		}
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__
		          << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it appears to have been compiled by the standard system compiler" << std::endl;
#endif
	}
public:

	//! marks this class as a gpu-enabled cell list
	typedef int yes_is_gpu_celllist;

	//! dimensionality of the space
	static const unsigned int dims = dim;

	//! type used for counters and indices
	typedef cnt_type cnt_type_;

	//! type used for cell ids
	typedef ids_type ids_type_;

	//! type of the coordinate transformation
	typedef transform transform_;

	//! whether this cell list is sparse
	typedef boost::mpl::bool_<is_sparse> is_sparse_;
	/*! \brief Copy constructor
	 */
	CellList_gpu(const CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> & clg)
	{
		this->operator=(clg);
	}

	/*! \brief Move constructor
	 */
	CellList_gpu(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> && clg)
	{
		this->operator=(clg);
	}

	/*! \brief Construct a cell list over the box, divided into div cells per dimension, with pad padding cells
	 */
	CellList_gpu(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
	{
		Initialize(box,div,pad);
	}
	/*! \brief Initialize the cell list from a Box
	 */
	void Initialize(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
	{
		SpaceBox<dim,T> sbox(box);

		Initialize(sbox,div,pad);
	}
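	/* Usage sketch (illustrative; assumes the transform template parameter is
	 * left to its default):
	 *
	 * \code
	 * Box<3,float> domain({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	 * size_t div[3] = {16,16,16};                         // cells per dimension
	 *
	 * CellList_gpu<3,float,CudaMemory> cl(domain,div,1);  // 1 padding cell
	 * \endcode
	 *
	 * The padding cells host the ghost layer, so neighborhood iteration does not
	 * need to bounds-check against the domain border.
	 */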
	/*! \brief Set the neighborhood extension (in cells per direction) and rebuild the stencil
	 */
	void setBoxNN(unsigned int n_NN)
	{
		cells_nn_test_size = n_NN;
		construct_cell_nn_test(n_NN);
	}

	/*! \brief Rebuild the stencil with the current neighborhood extension
	 */
	void re_setBoxNN()
	{
		construct_cell_nn_test(cells_nn_test_size);
	}
	/*! \brief Initialize the cell list from a SpaceBox
	 */
	void Initialize(const SpaceBox<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
	{
		Matrix<dim,T> mat;

		CellDecomposer_sm<dim,T,transform>::setDimensions(box,div,mat,pad);

		// gr_cell already accounts for the padding cells
		InitializeStructures(this->gr_cell.getSize(),this->gr_cell.size(),pad);
	}

	//! \brief Return the map: sorted particle id -> original (non-sorted) particle id
	vector_cnt_type & getSortToNonSort()
	{
		return sorted_to_not_sorted;
	}

	//! \brief Return the map: original (non-sorted) particle id -> sorted particle id
	vector_cnt_type & getNonSortToSort()
	{
		return non_sorted_to_sorted;
	}

	//! \brief Return the ids of the domain particles in the sorted set
	vector_cnt_type & getDomainSortIds()
	{
		return sorted_domain_particles_ids;
	}
	/*! \brief Set a cutoff radius and precompute the cell offsets required to cover it
	 */
	void setRadius(T radius)
	{
		openfpm::vector<long int> nnc_rad_;

		NNcalc_rad(radius,nnc_rad_,this->getCellBox(),this->getGrid());

		nnc_rad.resize(nnc_rad_.size(),0);

		// copy into nnc_rad, which has device storage
		for (unsigned int i = 0 ; i < nnc_rad_.size() ; i++)
		{nnc_rad.template get<0>(i) = nnc_rad_.template get<0>(i);}

		nnc_rad.template hostToDevice<0>();
	}
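	/* Note (sketch): this precomputes the set of cell offsets needed to cover a
	 * search sphere of the given radius, which can be smaller than the full
	 * (2k+1)^dim stencil when the radius falls short of k cell spacings along
	 * the diagonal. A typical call, with r_cut the interaction cutoff:
	 *
	 * \code
	 * cl.setRadius(r_cut);
	 * \endcode
	 */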
	/*! \brief Construct the cell list from the particle set
	 *
	 * \param pl particle positions
	 * \param pl_out reordered particle positions (output)
	 * \param pl_prp particle properties
	 * \param pl_prp_out reordered particle properties (output)
	 * \param mgpuContext gpu context used for scan/sort
	 * \param g_m separator between domain and ghost particles
	 * \param start,stop range of particles to process ((size_t)-1 means all)
	 * \param opt construction options
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct(vector & pl,
	               vector & pl_out,
	               vector_prp & pl_prp,
	               vector_prp & pl_prp_out,
	               mgpu::ofp_context_t & mgpuContext,
	               size_t g_m = 0,
	               size_t start = 0,
	               size_t stop = (size_t)-1,
	               cl_construct_opt opt = cl_construct_opt::Full)
	{
		// process the full particle set by default
		if (stop == (size_t)-1)
		{stop = pl.size();}

		if (is_sparse == false) {construct_dense<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);}
		else {construct_sparse<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);}
	}
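	/* Usage sketch (illustrative; `parts`, `parts_prp` and friends are
	 * hypothetical caller-side vectors, not names from this header):
	 *
	 * \code
	 * openfpm::vector_gpu<Point<3,float>> parts, parts_out;
	 * openfpm::vector_gpu<aggregate<float,float[3]>> parts_prp, parts_prp_out;
	 * mgpu::ofp_context_t ctx;
	 *
	 * // ... fill parts / parts_prp on the host and copy them to the device ...
	 *
	 * cl.construct(parts,parts_out,parts_prp,parts_prp_out,ctx,g_m);
	 * // parts_out / parts_prp_out now hold the particles in cell order
	 * \endcode
	 */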
	/*! \brief Transform this structure into its device-usable (kernel) counterpart
	 */
	CellList_gpu_ker<dim,T,cnt_type,ids_type,transform> toKernel()
	{
		return CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,
		                                 /* ... vector types elided in the extraction ... */
		                                 decltype(cl_sparse),is_sparse>
		       ::get(starts,
		             cells_nn,
		             cells_nn_list,
		             cl_sparse,
		             sorted_to_not_sorted,
		             sorted_domain_particles_ids,
		             /* ... */
		             this->getTransform(),
		             /* ... remaining arguments elided ... */);
	}

	/*! \brief Clear the internal structures
	 */
	void clear()
	{
		cl_n.clear();
		cells.clear();
		starts.clear();
		part_ids.clear();
		sorted_to_not_sorted.clear();
	}
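	/* Device-side usage sketch (illustrative kernel; assumes the kernel-side
	 * interface exposes getCell/getNNIterator as in CellList_gpu_ker):
	 *
	 * \code
	 * template<typename cl_type, typename pos_type>
	 * __global__ void count_in_stencil(cl_type cl, pos_type pos, unsigned int * cnt, int n)
	 * {
	 *     int p = blockIdx.x * blockDim.x + threadIdx.x;
	 *     if (p >= n) {return;}
	 *
	 *     Point<3,float> xp = pos.template get<0>(p);
	 *
	 *     unsigned int c = 0;
	 *     auto it = cl.getNNIterator(cl.getCell(xp));   // sweep the stencil cells
	 *     while (it.isNext())
	 *     {
	 *         ++c;          // count candidates; a real kernel would filter by distance
	 *         ++it;
	 *     }
	 *     cnt[p] = c;
	 * }
	 * \endcode
	 *
	 * Launched from the host as count_in_stencil<<<grid,block>>>(cl.toKernel(),
	 * parts_out.toKernel(),cnt,n). Note that the cell list indexes the *sorted*
	 * particle set, so positions must come from the reordered vector.
	 */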
	/*! \brief Return the ghost marker (separator between domain and ghost particles)
	 */
	inline size_t get_gm()
	{
		return g_m;
	}

	/*! \brief Set the ghost marker
	 */
	inline void set_gm(size_t g_m)
	{
		this->g_m = g_m;
	}

	/*! \brief Set the decomposition counter
	 */
	void set_ndec(size_t n_dec)
	{
		this->n_dec = n_dec;
	}

	/*! \brief Return the decomposition counter
	 */
	size_t get_ndec() const
	{
		return n_dec;
	}

	/*! \brief Copy the internal structures to the host for debugging
	 */
	void debug_deviceToHost()
	{
		cl_n.template deviceToHost<0>();
		cells.template deviceToHost<0>();
		starts.template deviceToHost<0>();
	}

	/*! \brief Return the number of elements in cell i
	 */
	size_t getNelements(size_t i)
	{
		return cl_n.template get<0>(i);
	}
	/*! \brief Return the element ele in the cell cell
	 */
	inline auto get(size_t cell, size_t ele) -> decltype(cells.template get<0>(starts.template get<0>(cell)+ele))
	{
		return cells.template get<0>(starts.template get<0>(cell)+ele);
	}

	/*! \brief Return the element ele in the cell cell (const version)
	 */
	inline auto get(size_t cell, size_t ele) const -> decltype(cells.template get<0>(starts.template get<0>(cell)+ele))
	{
		return cells.template get<0>(starts.template get<0>(cell)+ele);
	}
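	/* Host-side inspection sketch (illustrative): after construct(), the dense
	 * layout can be audited cell by cell; n_cells is assumed known to the caller.
	 *
	 * \code
	 * cl.debug_deviceToHost();                    // bring cl_n / cells / starts back
	 * for (size_t c = 0 ; c < n_cells ; c++)
	 * {
	 *     for (size_t j = 0 ; j < cl.getNelements(c) ; j++)
	 *     {
	 *         size_t p_sorted = cl.get(c,j);      // particle id, sorted order
	 *         // ... map back with getSortToNonSort() if the original id is needed
	 *     }
	 * }
	 * \endcode
	 */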
	/*! \brief Swap the content of this cell list with clg
	 */
	void swap(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> & clg)
	{
		((CellDecomposer_sm<dim,T,transform> *)this)->swap(clg);

		cells.swap(clg.cells);
		starts.swap(clg.starts);
		part_ids.swap(clg.part_ids);
		cl_sparse.swap(clg.cl_sparse);
		cells_nn.swap(clg.cells_nn);
		cells_nn_list.swap(clg.cells_nn_list);
		cells_nn_test.swap(clg.cells_nn_test);
		sorted_to_not_sorted.swap(clg.sorted_to_not_sorted);
		sorted_domain_particles_dg.swap(clg.sorted_domain_particles_dg);
		sorted_domain_particles_ids.swap(clg.sorted_domain_particles_ids);
		non_sorted_to_sorted.swap(clg.non_sorted_to_sorted);

		spacing_c.swap(clg.spacing_c);
		div_c.swap(clg.div_c);

		size_t g_m_tmp = g_m;
		g_m = clg.g_m;
		clg.g_m = g_m_tmp;

		size_t n_dec_tmp = n_dec;
		n_dec = clg.n_dec;
		clg.n_dec = n_dec_tmp;

		int cells_nn_test_size_tmp = cells_nn_test_size;
		cells_nn_test_size = clg.cells_nn_test_size;
		clg.cells_nn_test_size = cells_nn_test_size_tmp;
	}
	/*! \brief Copy assignment
	 */
	CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> &
	operator=(const CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> & clg)
	{
		*static_cast<CellDecomposer_sm<dim,T,transform> *>(this) = *static_cast<const CellDecomposer_sm<dim,T,transform> *>(&clg);

		cells = clg.cells;
		starts = clg.starts;
		part_ids = clg.part_ids;
		cl_sparse = clg.cl_sparse;
		cells_nn = clg.cells_nn;
		cells_nn_list = clg.cells_nn_list;
		cells_nn_test = clg.cells_nn_test;
		sorted_to_not_sorted = clg.sorted_to_not_sorted;
		sorted_domain_particles_dg = clg.sorted_domain_particles_dg;
		sorted_domain_particles_ids = clg.sorted_domain_particles_ids;
		non_sorted_to_sorted = clg.non_sorted_to_sorted;

		spacing_c = clg.spacing_c;
		div_c = clg.div_c;

		g_m = clg.g_m;
		n_dec = clg.n_dec;

		cells_nn_test_size = clg.cells_nn_test_size;

		return *this;
	}
	/*! \brief Move assignment
	 */
	CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> &
	operator=(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> && clg)
	{
		static_cast<CellDecomposer_sm<dim,T,transform> *>(this)->swap(*static_cast<CellDecomposer_sm<dim,T,transform> *>(&clg));

		cells.swap(clg.cells);
		starts.swap(clg.starts);
		part_ids.swap(clg.part_ids);
		cl_sparse.swap(clg.cl_sparse);
		cells_nn.swap(clg.cells_nn);
		cells_nn_list.swap(clg.cells_nn_list);
		cells_nn_test.swap(clg.cells_nn_test);
		sorted_to_not_sorted.swap(clg.sorted_to_not_sorted);
		sorted_domain_particles_dg.swap(clg.sorted_domain_particles_dg);
		sorted_domain_particles_ids.swap(clg.sorted_domain_particles_ids);
		non_sorted_to_sorted.swap(clg.non_sorted_to_sorted);

		spacing_c = clg.spacing_c;
		div_c = clg.div_c;

		g_m = clg.g_m;
		n_dec = clg.n_dec;

		cells_nn_test_size = clg.cells_nn_test_size;

		return *this;
	}
};

template<template <typename> class layout_base, typename T>
struct toKernel_transform<layout_base,T,2>
{
	typedef CellList_gpu_ker<T::dims,
	                         typename T::stype,
	                         typename T::cnt_type_,
	                         typename T::ids_type_,
	                         typename T::transform_,
	                         T::is_sparse_::value> type;
};

#endif /* OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ */