#ifndef OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
#define OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
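
/*! \file CellList_gpu.hpp
 *
 * GPU cell list for OpenFPM: bins particles into the cells of a
 * CellDecomposer_sm grid on the device, producing cell-ordered copies of the
 * particle positions and properties together with the permutations between
 * the original and the sorted orderings. Both a dense (per-cell start
 * offsets) and a sparse (map of non-empty cells) variant are provided.
 */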
#include "Vector/map_vector_sparse.hpp"
#include "NN/CellList/CellDecomposer.hpp"
#include "Vector/map_vector.hpp"
#include "Cuda_cell_list_util_func.hpp"
#include "NN/CellList/cuda/CellList_gpu_ker.cuh"
#include "util/cuda_util.hpp"
#include "NN/CellList/CellList_util.hpp"
#include "NN/CellList/CellList.hpp"
#include "util/cuda/scan_ofp.cuh"
//! property indices used during construction: per-cell particle count and
//! per-cell start offset
constexpr int count = 0;
constexpr int start = 1;
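
/*! \brief Select the kernel-side representation of the cell list
 *
 * CellList_gpu_ker_selector builds the object that CellList_gpu hands to GPU
 * kernels. The primary template covers the dense case, where cells are
 * addressed through the `starts` offset vector; the `true` specialization
 * below covers the sparse case, where only the non-empty cells are stored
 * (cl_sparse) together with precomputed neighborhood lists (cell_nn,
 * cell_nn_list).
 */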
template<unsigned int dim,
	typename T,
	typename cnt_type,
	typename ids_type,
	typename Memory,
	typename transform,
	typename vector_cnt_type,
	typename vector_cnt_type2,
	typename cl_sparse_type,
	bool is_sparse>
struct CellList_gpu_ker_selector
{
	// dense case (signature and body abridged in this excerpt)
	static CellList_gpu_ker<dim,T,cnt_type,ids_type,transform> get(vector_cnt_type & starts,
		vector_cnt_type & cell_nn,
		vector_cnt_type2 & cell_nn_list,
		cl_sparse_type & cl_sparse,
		vector_cnt_type & sorted_to_not_sorted,
		vector_cnt_type & sorted_domain_particles_ids,
		/* ... */)
	{
		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform>(starts.toKernel(),
			sorted_to_not_sorted.toKernel(),
			sorted_domain_particles_ids.toKernel(),
			/* ... */);
	}
};
template<unsigned int dim,
	typename T,
	typename cnt_type,
	typename ids_type,
	typename Memory,
	typename transform,
	typename vector_cnt_type,
	typename vector_cnt_type2,
	typename cl_sparse_type>
struct CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,vector_cnt_type,vector_cnt_type2,cl_sparse_type,true>
{
	// sparse case: the kernel object wraps the sparse cell map and the
	// precomputed neighborhood lists (signature and body abridged)
	static CellList_gpu_ker<dim,T,cnt_type,ids_type,transform> get(vector_cnt_type & starts,
		vector_cnt_type & cell_nn,
		vector_cnt_type2 & cell_nn_list,
		cl_sparse_type & cl_sparse,
		vector_cnt_type & srt,
		vector_cnt_type & dprt,
		/* ... */)
	{
		return CellList_gpu_ker<dim,T,cnt_type,ids_type,transform>(/* ... */
			cell_nn_list.toKernel(),
			/* ... */);
	}
};
// (the T, Memory and transform parameters are restored here from the copy
// constructor below; their defaults are not shown in this excerpt)
template<unsigned int dim,
	typename T,
	typename Memory,
	typename transform,
	typename cnt_type = unsigned int,
	typename ids_type = int,
	bool is_sparse = false>
class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
{
	// ... (internal typedefs, e.g. vector_cnt_type, elided in this excerpt)
	//! number of particles per cell
	vector_cnt_type cl_n;

	//! particle ids grouped by cell
	vector_cnt_type cells;

	//! first index of each cell in `cells` (scan of cl_n)
	vector_cnt_type starts;

	//! extension of the neighborhood stencil, in cells per direction
	int cells_nn_test_size;

	//! map: sorted particle index -> original particle index
	vector_cnt_type sorted_to_not_sorted;

	//! scan buffer used to extract the ids of the domain particles
	vector_cnt_type sorted_domain_particles_dg;

	//! ids (in sorted order) of the domain particles
	vector_cnt_type sorted_domain_particles_ids;

	//! map: original particle index -> sorted particle index
	vector_cnt_type non_sorted_to_sorted;

	// ... (further members elided: part_ids, cl_sparse, cells_nn,
	// cells_nn_list, cells_nn_test, spacing_c, div_c, nnc_rad, g_m, n_dec)
	void InitializeStructures(const size_t (& div)[dim], size_t tot_n_cell, size_t pad)
	{
		for (size_t i = 0 ; i < dim ; i++)
		{
			// ...
			spacing_c[i] = this->getCellBox().getP2().get(i);
			// ...
		}

		cl_n.resize(tot_n_cell);

		// default neighborhood: one cell in each direction
		cells_nn_test_size = 1;
		construct_cell_nn_test(cells_nn_test_size);
	}
	/*! \brief Build the neighborhood stencil
	 *
	 * Fills cells_nn_test with the relative offsets (in linearized cell ids)
	 * of the (2*box_nn+1)^dim cells surrounding a cell.
	 */
	void construct_cell_nn_test(unsigned int box_nn = 1)
	{
		auto & gs = this->getGrid();

		// ... (grid_key_dx<dim> stencil keys, declarations elided)
		for (size_t i = 0 ; i < dim ; i++)
		{
			// ...
			stop.set_d(i,2*box_nn);
			middle.set_d(i,box_nn);
		}

		cells_nn_test.resize(openfpm::math::pow(2*box_nn+1,dim));

		int mid = gs.LinId(middle);
		// ... (loop over the stencil keys p, elided)
		cells_nn_test.template get<0>(i) = (int)gs.LinId(p) - mid;
		// ...

		cells_nn_test.template hostToDevice<0>();

#if defined(__NVCC__) && defined(USE_LOW_REGISTER_ITERATOR)
		// upload the stencil to constant memory for the low-register iterator
		cudaMemcpyToSymbol(cells_striding,cells_nn_test.template getPointer<0>(),cells_nn_test.size()*sizeof(int));
#endif
	}
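
	/* Example (2D, box_nn = 1): with grid_sm::LinId linearizing keys as
	 * id = k0 + k1*N0 (first index fastest, N0 cells along dimension 0),
	 * the stencil built above is
	 *
	 *   { -N0-1, -N0, -N0+1, -1, 0, 1, N0-1, N0, N0+1 }
	 *
	 * i.e. the 3x3 block of cells centered on the current cell, stored as
	 * signed offsets that a kernel can add directly to a linearized cell id.
	 */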
	/*! \brief Construct the sparse cell list
	 *
	 * Only the non-empty cells are stored, in the sparse map cl_sparse;
	 * the neighborhood lists of the occupied cells are precomputed into
	 * cells_nn / cells_nn_list. (The parameter list is reconstructed from
	 * the call site in construct() below.)
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct_sparse(vector & pl,
		vector & pl_out,
		vector_prp & pl_prp,
		vector_prp & pl_prp_out,
		gpu::ofp_context_t & gpuContext,
		size_t g_m,
		size_t start,
		size_t stop,
		cl_construct_opt opt = cl_construct_opt::Full)
	{
#ifdef __NVCC__
		part_ids.resize(stop - start);
		starts.resize(stop - start);

		auto ite_gpu = pl.getGPUIteratorTo(stop-start);

		// compute for each particle the cell it falls into
		CUDA_LAUNCH((subindex<true,dim,T,cnt_type,ids_type>),ite_gpu,div_c,
			/* ... */
			this->getTransform(),
			/* ... */
			part_ids.toKernel());

		cells.resize(stop-start);

		// insert the occupied cells into the sparse map
		cl_sparse.template setBackground<0>((cnt_type)-1);
		// ...
		CUDA_LAUNCH((fill_cells_sparse),ite_gpu,cl_sparse.toKernel(),starts.toKernel());
		cl_sparse.template flush_vd<sstart_<0>>(cells,gpuContext,FLUSH_ON_DEVICE);

		cells_nn.resize(cl_sparse.size()+1);
		cells_nn.template fill<0>(0);

		// count the non-empty neighbor cells of each occupied cell ...
		auto itgg = cl_sparse.getGPUIterator();
		CUDA_LAUNCH((count_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());

		// ... and scan the counts into offsets into cells_nn_list
		openfpm::scan((cnt_type *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (cnt_type *)cells_nn.template getDeviceBuffer<0>(), gpuContext);

		// the last element of the scan is the total number of neighbor entries
		cells_nn.template deviceToHost<0>(cells_nn.size() - 1, cells_nn.size() - 1);
		size_t n_nn_cells = cells_nn.template get<0>(cells_nn.size() - 1);

		cells_nn_list.resize(n_nn_cells);

		CUDA_LAUNCH((fill_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cells_nn_list.toKernel(),cells.size());

		sorted_to_not_sorted.resize(stop-start);
		non_sorted_to_sorted.resize(pl.size());

		auto ite = pl.getGPUIteratorTo(stop-start,64);

		// reorder particle positions and properties into cell order
		CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
			decltype(pl.toKernel()),
			decltype(sorted_to_not_sorted.toKernel()),
			decltype(cells.toKernel()),
			/* ... */>),ite,
			/* ... */
			pl_prp_out.toKernel(),
			/* ... */
			sorted_to_not_sorted.toKernel(),
			non_sorted_to_sorted.toKernel(),
			/* ... */);

		if (opt == cl_construct_opt::Full)
		{
			construct_domain_ids(gpuContext,start,stop,g_m);
		}
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__
			<< " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it appears to have been compiled by the standard system compiler" << std::endl;
#endif
	}
	void construct_domain_ids(gpu::ofp_context_t & gpuContext, size_t start, size_t stop, size_t g_m)
	{
		sorted_domain_particles_dg.resize(stop-start+1);

		auto ite = sorted_domain_particles_dg.getGPUIterator();

		// mark the sorted particles that map to a domain particle (id < g_m) ...
		CUDA_LAUNCH((mark_domain_particles),ite,sorted_to_not_sorted.toKernel(),sorted_domain_particles_dg.toKernel(),g_m);

		// ... scan the marks to get the output position of each id ...
		openfpm::scan((unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),sorted_domain_particles_dg.size(),(unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),gpuContext);

		sorted_domain_particles_dg.template deviceToHost<0>(sorted_domain_particles_dg.size()-1,sorted_domain_particles_dg.size()-1);
		auto sz = sorted_domain_particles_dg.template get<0>(sorted_domain_particles_dg.size()-1);

		sorted_domain_particles_ids.resize(sz);

		// ... and scatter the ids into sorted_domain_particles_ids
		CUDA_LAUNCH((collect_domain_ghost_ids),ite,sorted_domain_particles_dg.toKernel(),sorted_domain_particles_ids.toKernel());
	}
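
	/*! \brief Construct the dense cell list
	 *
	 * Counting-sort style construction: subindex computes the cell id of each
	 * particle and counts the particles per cell (cl_n); a scan of cl_n gives
	 * the start offset of each cell (starts); the particle ids are then
	 * written into `cells` grouped by cell, and positions/properties are
	 * reordered accordingly. Small illustration (1D, 3 cells, 5 particles):
	 *
	 * \code
	 * cl_n   = { 2, 0, 3, ... }    // particles per cell (one extra scan slot)
	 * starts = { 0, 2, 2, 5 }      // scan: first index of each cell, last = total
	 * cells  = { 4, 0, 1, 2, 3 }   // particle ids grouped by cell
	 * \endcode
	 *
	 * so cell c owns cells[starts[c] .. starts[c+1]-1].
	 */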
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct_dense(vector & pl,
		vector & pl_out,
		vector_prp & pl_prp,
		vector_prp & pl_prp_out,
		gpu::ofp_context_t & gpuContext,
		size_t g_m,
		size_t start,
		size_t stop,
		cl_construct_opt opt = cl_construct_opt::Full)
	{
#ifdef __NVCC__
		auto ite_gpu = pl.getGPUIteratorTo(stop-start-1);

		cl_n.resize(this->gr_cell.size()+1);
		cl_n.template fill<0>(0);

		part_ids.resize(stop - start);

		// degenerate case: no particles, produce an empty (all-zero) starts vector
		if (ite_gpu.wthr.x == 0 || pl.size() == 0 || stop == 0)
		{
			// ...
			starts.resize(cl_n.size());
			starts.template fill<0>(0);
			return;
		}

		// compute the cell id of each particle and count the particles per cell
		CUDA_LAUNCH((subindex<false,dim,T,cnt_type,ids_type>),ite_gpu,div_c,
			/* ... */
			this->getTransform(),
			/* ... */
			part_ids.toKernel());

		// scan of the per-cell counts gives the start offset of each cell
		starts.resize(cl_n.size());
		openfpm::scan((cnt_type *)cl_n.template getDeviceBuffer<0>(), cl_n.size(), (cnt_type *)starts.template getDeviceBuffer<0>(), gpuContext);

		cells.resize(stop-start);
		auto itgg = part_ids.getGPUIterator();

#ifdef MAKE_CELLLIST_DETERMINISTIC
		// deterministic construction: sort the particle ids by cell id
		gpu::mergesort(static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),
			static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()),
			pl.size(),gpu::less_t<cnt_type>(),gpuContext);
		// ...
#endif

		sorted_to_not_sorted.resize(stop-start);
		non_sorted_to_sorted.resize(pl.size());

		auto ite = pl.getGPUIteratorTo(stop-start,64);

		if (sizeof...(prp) == 0)
		{
			// no property list given: reorder all the properties
			CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
				decltype(pl.toKernel()),
				decltype(sorted_to_not_sorted.toKernel()),
				decltype(cells.toKernel()),
				/* ... */>),ite,
				/* ... */
				pl_prp_out.toKernel(),
				/* ... */
				sorted_to_not_sorted.toKernel(),
				non_sorted_to_sorted.toKernel(),
				/* ... */);
		}
		else
		{
			// reorder only the properties listed in prp...
			CUDA_LAUNCH((reorder_parts_wprp<decltype(pl_prp.toKernel()),
				decltype(pl.toKernel()),
				decltype(sorted_to_not_sorted.toKernel()),
				decltype(cells.toKernel()),
				/* ... */>),ite,
				/* ... */
				pl_prp_out.toKernel(),
				/* ... */
				sorted_to_not_sorted.toKernel(),
				non_sorted_to_sorted.toKernel(),
				/* ... */);
		}

		if (opt == cl_construct_opt::Full)
		{
			construct_domain_ids(gpuContext,start,stop,g_m);
		}
#else
		std::cout << "Error: " << __FILE__ << ":" << __LINE__
			<< " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it appears to have been compiled by the standard system compiler" << std::endl;
#endif
	}
public:

	//! marker type: identifies this class as a GPU cell list
	typedef int yes_is_gpu_celllist;

	//! dimensionality
	static const unsigned int dims = dim;

	//! counting type
	typedef cnt_type cnt_type_;

	//! ids type
	typedef ids_type ids_type_;

	//! transform type
	typedef transform transform_;

	//! true if the cell list is sparse
	typedef boost::mpl::bool_<is_sparse> is_sparse_;
	//! \brief Copy constructor
	CellList_gpu(const CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> & clg)
	{
		this->operator=(clg);
	}

	//! \brief Move constructor
	CellList_gpu(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> && clg)
	{
		this->operator=(clg);
	}

	//! \brief Construct a cell list over box, with div cells per dimension
	//! and pad padding cells
	CellList_gpu(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
	{
		Initialize(box,div,pad);
	}
	void Initialize(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
	{
		SpaceBox<dim,T> sbox(box);
		// ...
		Initialize(sbox,div,pad);
	}
	//! \brief Set the neighborhood extension to n_NN cells per direction and
	//! rebuild the stencil
	void setBoxNN(unsigned int n_NN)
	{
		cells_nn_test_size = n_NN;
		construct_cell_nn_test(n_NN);
	}

	// ... (rebuild of the stencil with the stored size; the enclosing
	// function is elided in this excerpt)
	construct_cell_nn_test(cells_nn_test_size);
	// ...
	void Initialize(const SpaceBox<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
	{
		Matrix<dim,T> mat;
		CellDecomposer_sm<dim,T,transform>::setDimensions(box,div,mat,pad);
		// ...
		InitializeStructures(this->gr_cell.getSize(),this->gr_cell.size(),pad);
	}
	//! map: sorted particle index -> original index
	vector_cnt_type & getSortToNonSort()
	{
		return sorted_to_not_sorted;
	}

	//! map: original particle index -> sorted index
	vector_cnt_type & getNonSortToSort()
	{
		return non_sorted_to_sorted;
	}

	//! ids (in sorted order) of the domain particles
	vector_cnt_type & getDomainSortIds()
	{
		return sorted_domain_particles_ids;
	}
	/*! \brief Set the radius for radius-based neighborhood iteration */
	void setRadius(T radius)
	{
		// ... (temporary host vector nnc_rad_, filled by NNcalc_rad with the
		// linearized offsets of every cell intersecting the search sphere)
		NNcalc_rad(radius,nnc_rad_,this->getCellBox(),this->getGrid());

		nnc_rad.resize(nnc_rad_.size(),0);

		// copy the offsets and upload them to the device
		for (unsigned int i = 0 ; i < nnc_rad_.size() ; i++)
		{nnc_rad.template get<0>(i) = nnc_rad_.template get<0>(i);}

		nnc_rad.template hostToDevice<0>();
	}
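
	/* Note: nnc_rad replaces the fixed box stencil with the exact set of
	 * cells that can contain points within `radius`. With a cell side of at
	 * least `radius` this reduces to the usual 3^dim block; smaller cells
	 * give a tighter cover of the search sphere, at the price of more
	 * offsets to iterate.
	 */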
	/*! \brief Construct the cell list from the particles in [start,stop)
	 *
	 * The input positions (pl) and properties (pl_prp) are left untouched;
	 * the cell-ordered copies are written to pl_out and pl_prp_out.
	 */
	template<typename vector, typename vector_prp, unsigned int ... prp>
	void construct(vector & pl,
		vector & pl_out,
		vector_prp & pl_prp,
		vector_prp & pl_prp_out,
		gpu::ofp_context_t & gpuContext,
		size_t g_m,
		size_t start = 0,
		size_t stop = (size_t)-1,
		cl_construct_opt opt = cl_construct_opt::Full)
	{
		// by default process all the particles
		if (stop == (size_t)-1)
		{stop = pl.size();}

		if (is_sparse == false) {construct_dense<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,gpuContext,g_m,start,stop,opt);}
		else {construct_sparse<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,gpuContext,g_m,start,stop,opt);}
	}
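
	/* Typical host-side use, as a sketch (vector types, transform and
	 * context creation are assumptions, not shown in this excerpt):
	 *
	 * \code
	 * Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
	 * size_t div[3] = {16,16,16};
	 *
	 * // positions/properties and their cell-ordered output copies
	 * openfpm::vector_gpu<Point<3,float>> pl, pl_out;
	 * openfpm::vector_gpu<aggregate<float>> pl_prp, pl_prp_out;
	 *
	 * CellList_gpu<3,float,CudaMemory,no_transform_only<3,float>> cl(box,div,1);
	 *
	 * // g_m: number of domain (non-ghost) particles; gpuContext: the
	 * // application's gpu::ofp_context_t
	 * cl.construct(pl,pl_out,pl_prp,pl_prp_out,gpuContext,g_m);
	 * \endcode
	 *
	 * Afterwards getSortToNonSort()/getNonSortToSort() expose the
	 * permutation between the original and the cell-ordered particles.
	 */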
	/* Get the kernel-side version of the cell list, usable inside GPU
	 * kernels (the enclosing function signature is elided in this excerpt)
	 */
		return CellList_gpu_ker_selector<dim,T,cnt_type,ids_type,Memory,transform,
			/* ... */
			decltype(cl_sparse),is_sparse>
			::get(/* ... */
				sorted_to_not_sorted,
				sorted_domain_particles_ids,
				/* ... */
				this->getTransform(),
				/* ... */);
		// ...
		sorted_to_not_sorted.clear();
		// ...
	//! \brief Return the number of domain particles (g_m)
	inline size_t get_gm()
	{
		return g_m;
	}

	//! \brief Set the number of domain particles
	inline void set_gm(size_t g_m)
	{
		this->g_m = g_m;
	}

	//! \brief Set the n_dec number (decomposition id)
	void set_ndec(size_t n_dec)
	{
		this->n_dec = n_dec;
	}

	//! \brief Return the n_dec number (decomposition id)
	size_t get_ndec() const
	{
		return n_dec;
	}
	//! \brief Transfer the internal buffers to the host (for debugging)
	void debug_deviceToHost()
	{
		cl_n.template deviceToHost<0>();
		cells.template deviceToHost<0>();
		starts.template deviceToHost<0>();
	}
	//! \brief Return the number of particles in cell i
	size_t getNelements(size_t i)
	{
		return cl_n.template get<0>(i);
	}
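
	/* Dense element access: cell `cell` holds its particle ids contiguously
	 * in `cells` starting at starts[cell], so get(cell,ele) is
	 * cells[starts[cell]+ele]; e.g. with starts = {0,2,2,5}, get(2,0) reads
	 * cells[2] and cell 1 is empty. These accessors read the host copies of
	 * the buffers (see debug_deviceToHost()).
	 */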
	inline auto get(size_t cell, size_t ele) -> decltype(cells.template get<0>(starts.template get<0>(cell)+ele))
	{
		return cells.template get<0>(starts.template get<0>(cell)+ele);
	}

	inline auto get(size_t cell, size_t ele) const -> decltype(cells.template get<0>(starts.template get<0>(cell)+ele))
	{
		return cells.template get<0>(starts.template get<0>(cell)+ele);
	}
	void swap(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> & clg)
	{
		((CellDecomposer_sm<dim,T,transform> *)this)->swap(clg);
		// ...
		cells.swap(clg.cells);
		starts.swap(clg.starts);
		part_ids.swap(clg.part_ids);
		cl_sparse.swap(clg.cl_sparse);
		cells_nn.swap(clg.cells_nn);
		cells_nn_list.swap(clg.cells_nn_list);
		cells_nn_test.swap(clg.cells_nn_test);
		sorted_to_not_sorted.swap(clg.sorted_to_not_sorted);
		sorted_domain_particles_dg.swap(clg.sorted_domain_particles_dg);
		sorted_domain_particles_ids.swap(clg.sorted_domain_particles_ids);
		non_sorted_to_sorted.swap(clg.non_sorted_to_sorted);

		spacing_c.swap(clg.spacing_c);
		div_c.swap(clg.div_c);
		// ...

		size_t g_m_tmp = g_m;
		g_m = clg.g_m;
		clg.g_m = g_m_tmp;

		size_t n_dec_tmp = n_dec;
		n_dec = clg.n_dec;
		clg.n_dec = n_dec_tmp;

		int cells_nn_test_size_tmp = cells_nn_test_size;
		cells_nn_test_size = clg.cells_nn_test_size;
		clg.cells_nn_test_size = cells_nn_test_size_tmp;
	}
	CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> &
	operator=(const CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type,is_sparse> & clg)
	{
		*static_cast<CellDecomposer_sm<dim,T,transform> *>(this) = *static_cast<const CellDecomposer_sm<dim,T,transform> *>(&clg);
		// ...
		part_ids = clg.part_ids;
		cl_sparse = clg.cl_sparse;
		cells_nn = clg.cells_nn;
		cells_nn_list = clg.cells_nn_list;
		cells_nn_test = clg.cells_nn_test;
		sorted_to_not_sorted = clg.sorted_to_not_sorted;
		sorted_domain_particles_dg = clg.sorted_domain_particles_dg;
		sorted_domain_particles_ids = clg.sorted_domain_particles_ids;
		non_sorted_to_sorted = clg.non_sorted_to_sorted;

		spacing_c = clg.spacing_c;
		// ...
		cells_nn_test_size = clg.cells_nn_test_size;

		return *this;
	}
	CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> &
	operator=(CellList_gpu<dim,T,Memory,transform,cnt_type,ids_type> && clg)
	{
		static_cast<CellDecomposer_sm<dim,T,transform> *>(this)->swap(*static_cast<CellDecomposer_sm<dim,T,transform> *>(&clg));
		// ...
		cells.swap(clg.cells);
		starts.swap(clg.starts);
		part_ids.swap(clg.part_ids);
		cl_sparse.swap(clg.cl_sparse);
		cells_nn.swap(clg.cells_nn);
		cells_nn_list.swap(clg.cells_nn_list);
		cells_nn_test.swap(clg.cells_nn_test);
		sorted_to_not_sorted.swap(clg.sorted_to_not_sorted);
		sorted_domain_particles_dg.swap(clg.sorted_domain_particles_dg);
		sorted_domain_particles_ids.swap(clg.sorted_domain_particles_ids);
		non_sorted_to_sorted.swap(clg.non_sorted_to_sorted);

		spacing_c = clg.spacing_c;
		// ...
		cells_nn_test_size = clg.cells_nn_test_size;

		return *this;
	}

	// ...
};
/*! \brief Map the host cell-list type T to its kernel-side type
 *
 * Built from the type traits (dims, cnt_type_, ids_type_, transform_,
 * is_sparse_) exported by CellList_gpu; the enclosing trait and the leading
 * template arguments are elided in this excerpt.
 */
template<template <typename> class layout_base, typename T>
// ...
		typename T::cnt_type_,
		typename T::ids_type_,
		typename T::transform_,
		T::is_sparse_::value> type;
// ...

#endif /* OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ */