#ifndef OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
#define OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_

#include "Vector/map_vector_sparse.hpp"
#include "NN/CellList/CellDecomposer.hpp"
#include "Vector/map_vector.hpp"
#include "NN/CellList/cuda/Cuda_cell_list_util_func.hpp"
#include "NN/CellList/cuda/CellList_gpu_ker.cuh"
#include "util/cuda_util.hpp"
#include "NN/CellList/CellList_util.hpp"
#include "NN/CellList/CellList.hpp"
#include "util/cuda/scan_ofp.cuh"
template<unsigned int dim,
    typename T,
    typename Memory,
    typename transform_type,
    bool is_sparse = false>
class CellList_gpu;
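/*! \brief Dense (is_sparse = false) specialization of CellList_gpu
 *
 * Cells are laid out over the full Cartesian cell grid: numPartInCell counts the
 * particles falling in each cell and numPartInCellPrefixSum (its exclusive scan)
 * gives, for every cell, the offset of its first particle in the sorted order.
 */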
template<unsigned int dim,
    typename T,
    typename Memory,
    typename transform_type>
class CellList_gpu<dim,T,Memory,transform_type,false> : public CellDecomposer_sm<dim,T,transform_type>
    //! extent of the box neighborhood: number of neighbor cell layers considered in each direction
    size_t boxNeighborNumber;
    void InitializeStructures(
        const size_t (& div)[dim],
        size_t tot_n_cell,
        size_t pad)
    {
        for (size_t i = 0 ; i < dim ; i++)
        {
            numCellDim[i] = div[i];
            unitCellP2[i] = this->getCellBox().getP2().get(i);
        }

        numPartInCell.resize(tot_n_cell+1);

        boxNeighborNumber = 1;
        constructNeighborCellOffset(boxNeighborNumber);
    }
    void constructNeighborCellOffset(size_t boxNeighborNumber)
    {
        NNcalc_box(boxNeighborNumber,boxNeighborCellOffset,this->getGrid());
        NNcalc_boxSym(boxNeighborNumber,boxNeighborCellOffsetSym,this->getGrid());

        boxNeighborCellOffset.template hostToDevice<0>();
        boxNeighborCellOffsetSym.template hostToDevice<0>();
    }
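    /*! \brief Build sortedToSortedIndexNoGhost
     *
     * Marks every particle in the sorted order as domain or ghost (based on ghostMarker),
     * scans the flags, and collects the sorted indices of the domain particles only.
     */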
    void constructSortedToSortedIndexNoGhost(

        isSortedDomainOrGhost.resize(stop-start+1);

        auto ite = isSortedDomainOrGhost.getGPUIterator();

        CUDA_LAUNCH((mark_domain_particles),ite,
            sortedToUnsortedIndex.toKernel(),
            isSortedDomainOrGhost.toKernel(),

            (unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),
            isSortedDomainOrGhost.size(),
            (unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),

        isSortedDomainOrGhost.template deviceToHost<0>(isSortedDomainOrGhost.size()-1,isSortedDomainOrGhost.size()-1);
        auto sz = isSortedDomainOrGhost.template get<0>(isSortedDomainOrGhost.size()-1);

        sortedToSortedIndexNoGhost.resize(sz);

        CUDA_LAUNCH((collect_domain_ghost_ids),ite,
            isSortedDomainOrGhost.toKernel(),
            sortedToSortedIndexNoGhost.toKernel()
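    /*! \brief Dense construction
     *
     * 1. count particles per cell (numPartInCell), 2. exclusive scan into
     * numPartInCellPrefixSum, 3. scatter particles into their cells
     * (cellIndexLocalIndexToUnsorted), 4. build the sorted <-> unsorted index maps,
     * 5. extract the domain-only sorted ids.
     */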
    template<typename vector, typename vector_prp>
    void construct_dense(

        this->gpuContext = &gpuContext;
        this->ghostMarker = ghostMarker;

        if (stop == (size_t)-1) stop = vPos.size();

        auto ite_gpu = vPos.getGPUIteratorTo(stop-start-1);

        numPartInCell.resize(this->cellListGrid.size()+1);
        numPartInCell.template fill<0>(0);

        cellIndex_LocalIndex.resize(stop - start);

        if (ite_gpu.wthr.x == 0 || vPos.size() == 0 || stop == 0)
        {
            numPartInCellPrefixSum.resize(numPartInCell.size());
            numPartInCellPrefixSum.template fill<0>(0);
            return;
        }

        // count particles per cell and record each particle's (cell index, local index)
        CUDA_LAUNCH((fill_cellIndex_LocalIndex<dim,T,ids_type>),ite_gpu,

            this->getTransform(),

            numPartInCell.toKernel(),
            cellIndex_LocalIndex.toKernel()

        // exclusive prefix sum of the per-cell counters
        numPartInCellPrefixSum.resize(numPartInCell.size());

            (unsigned int *)numPartInCell.template getDeviceBuffer<0>(),
            numPartInCell.size(),
            (unsigned int *)numPartInCellPrefixSum.template getDeviceBuffer<0>(),

        // scatter particles into their cells: sorted slot -> unsorted particle index
        cellIndexLocalIndexToUnsorted.resize(stop-start);
        auto itgg = cellIndex_LocalIndex.getGPUIterator();

        CUDA_LAUNCH((fill_cells),itgg,
            numPartInCellPrefixSum.toKernel(),
            cellIndex_LocalIndex.toKernel(),
            cellIndexLocalIndexToUnsorted.toKernel(),

        // bidirectional sorted <-> unsorted index maps
        sortedToUnsortedIndex.resize(stop-start);
        unsortedToSortedIndex.resize(vPrp.size());

        CUDA_LAUNCH((constructSortUnsortBidirectMap),
            vPrp.getGPUIteratorTo(stop-start,64),
            sortedToUnsortedIndex.toKernel(),
            unsortedToSortedIndex.toKernel(),
            cellIndexLocalIndexToUnsorted.toKernel()

        constructSortedToSortedIndexNoGhost(gpuContext,start,stop,ghostMarker);

        // emitted only when this header is compiled without NVCC (CUDA kernels unavailable)
        std::cout << "Error: " << __FILE__ << ":" << __LINE__
            << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it looks like it was compiled by the standard system compiler" << std::endl;
    typedef int yes_is_gpu_celllist;

    static const unsigned int dim_ = dim;

    typedef ids_type ids_type_;
    typedef transform_type transform_type_;
    typedef boost::mpl::bool_<false> is_sparse_;
    CellList_gpu(const CellList_gpu<dim,T,Memory,transform_type> & clg)
    {
        this->operator=(clg);
    }

    CellList_gpu(CellList_gpu<dim,T,Memory,transform_type> && clg)
    {
        this->operator=(std::move(clg));
    }

    CellList_gpu() : opt(CL_NON_SYMMETRIC) {}

        const size_t (&div)[dim],
        const size_t pad = 1)
    : opt(CL_NON_SYMMETRIC)
    {
        Initialize(box,div,pad);
    }
    void setBoxNN(size_t n_NN)
    {
        boxNeighborNumber = n_NN;
        constructNeighborCellOffset(n_NN);
    }

    inline size_t getBoxNN() const
    {
        return boxNeighborNumber;
    }

        constructNeighborCellOffset(boxNeighborNumber);
        const size_t (&div)[dim],
        const size_t pad = 1)
    {
        CellDecomposer_sm<dim,T,transform_type>::setDimensions(box, div, mat, pad);

        InitializeStructures(this->cellListGrid.getSize(),this->cellListGrid.size(),pad);
    }
    // getSortToNonSort()
        return sortedToUnsortedIndex;

    // getNonSortToSort()
        return unsortedToSortedIndex;

    // getDomainSortIds()
        return sortedToSortedIndexNoGhost;
    void setRadius(T radius)
    {
        NNcalc_rad(radius,rcutNeighborCellOffset,this->getCellBox(),this->getGrid());
        rcutNeighborCellOffset.template hostToDevice<0>();
    }
    template<typename vector, typename vector_prp>

        size_t ghostMarker = 0,

        if (opt & CL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " symmetric cell list on GPU is not implemented (and never will be: race conditions make it unsuitable for the GPU)" << std::endl;
        }
        else if (opt & CL_LOCAL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " local symmetric cell list on GPU is not implemented" << std::endl;
        }
        else if (opt & CL_NON_SYMMETRIC) {
            construct_dense(vPos,vPrp,gpuContext,ghostMarker,start,stop);
        }

        // emitted only when this header is compiled without NVCC (CUDA kernels unavailable)
        std::cout << "Error: " << __FILE__ << ":" << __LINE__
            << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it looks like it was compiled by the standard system compiler" << std::endl;
    template<typename vector, typename vector_prp, unsigned int ... prp>

        vector & vPosReorder,
        vector_prp & vPrpReorder,

        size_t ghostMarker = 0,

        if (opt & CL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " symmetric cell list on GPU is not implemented (and never will be: race conditions make it unsuitable for the GPU)" << std::endl;
        }
        else if (opt & CL_LOCAL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " local symmetric cell list on GPU is not implemented" << std::endl;
        }
        else if (opt & CL_NON_SYMMETRIC) {

            if (!(opt & CL_GPU_SKIP_CONSTRUCT_ON_STATIC_DOMAIN))
                construct_dense(vPos,vPrp,gpuContext,ghostMarker,start,stop);

            if (stop == (size_t)-1) stop = vPos.size();

            // write positions in cell-sorted order into vPosReorder
            if (opt & CL_GPU_REORDER_POSITION) {
                CUDA_LAUNCH((reorderParticlesPos),
                    vPos.getGPUIteratorTo(stop-start,64),

                    vPosReorder.toKernel(),
                    unsortedToSortedIndex.toKernel(),

            // write the selected properties prp... in cell-sorted order into vPrpReorder
            if (opt & CL_GPU_REORDER_PROPERTY && sizeof...(prp)) {
                (reorderParticlesPrp<
                    decltype(vPrp.toKernel()),
                    decltype(unsortedToSortedIndex.toKernel()),

                    vPrp.getGPUIteratorTo(stop-start,64),

                    vPrpReorder.toKernel(),
                    unsortedToSortedIndex.toKernel(),

        // emitted only when this header is compiled without NVCC (CUDA kernels unavailable)
        std::cout << "Error: " << __FILE__ << ":" << __LINE__
            << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it looks like it was compiled by the standard system compiler" << std::endl;
        // device views packaged into the kernel-side cell list object (CellList_gpu_ker)
            numPartInCellPrefixSum.toKernel(),
            sortedToUnsortedIndex.toKernel(),
            sortedToSortedIndexNoGhost.toKernel(),
            rcutNeighborCellOffset.toKernel(),
            boxNeighborCellOffset.toKernel(),
            boxNeighborCellOffsetSym.toKernel(),

            this->getTransform(),

            this->cellListSpaceBox,
        numPartInCell.clear();
        cellIndexLocalIndexToUnsorted.clear();
        numPartInCellPrefixSum.clear();
        cellIndex_LocalIndex.clear();
        sortedToUnsortedIndex.clear();
    //! ghost marker: particles at index >= ghostMarker are ghost particles
    size_t ghostMarker = 0;

    inline size_t getGhostMarker()

    inline void setGhostMarker(size_t ghostMarker)
    {
        this->ghostMarker = ghostMarker;
    }

    void set_ndec(size_t nDecRefRedec)
    {
        this->nDecRefRedec = nDecRefRedec;
    }

    size_t get_ndec() const
    void debug_deviceToHost()
    {
        numPartInCell.template deviceToHost<0>();
        cellIndexLocalIndexToUnsorted.template deviceToHost<0>();
        numPartInCellPrefixSum.template deviceToHost<0>();
    }
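    /*! \brief Host-side debug accessors
     *
     * The accessors below read the cell-list buffers on the host; the data must be
     * on the host first (e.g. after a call to debug_deviceToHost()).
     */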
        return numPartInCell.size();

    size_t getNelements(size_t i)
    {
        return numPartInCell.template get<0>(i);
    }
    inline auto get(size_t cell, size_t ele) -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele))
    {
        return cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele);
    }

    inline auto get(size_t cell, size_t ele) const -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele))
    {
        return cellIndexLocalIndexToUnsorted.template get<0>(numPartInCellPrefixSum.template get<0>(cell)+ele);
    }
    void swap(CellList_gpu<dim,T,Memory,transform_type,false> & clg)
    {
        ((CellDecomposer_sm<dim,T,transform_type> *)this)->swap(clg);
        numPartInCell.swap(clg.numPartInCell);
        cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
        numPartInCellPrefixSum.swap(clg.numPartInCellPrefixSum);
        cellIndex_LocalIndex.swap(clg.cellIndex_LocalIndex);
        sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
        sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
        unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);
        boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
        boxNeighborCellOffsetSym.swap(clg.boxNeighborCellOffsetSym);
        rcutNeighborCellOffset.swap(clg.rcutNeighborCellOffset);

        unitCellP2.swap(clg.unitCellP2);
        numCellDim.swap(clg.numCellDim);
        cellPadDim.swap(clg.cellPadDim);

        size_t g_m_tmp = ghostMarker;
        ghostMarker = clg.ghostMarker;
        clg.ghostMarker = g_m_tmp;

        size_t n_dec_tmp = nDecRefRedec;
        nDecRefRedec = clg.nDecRefRedec;
        clg.nDecRefRedec = n_dec_tmp;
    CellList_gpu<dim,T,Memory,transform_type,false> &
    operator=(const CellList_gpu<dim,T,Memory,transform_type,false> & clg)
    {
        *static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this) = *static_cast<const CellDecomposer_sm<dim,T,transform_type> *>(&clg);
        numPartInCell = clg.numPartInCell;
        cellIndexLocalIndexToUnsorted = clg.cellIndexLocalIndexToUnsorted;
        numPartInCellPrefixSum = clg.numPartInCellPrefixSum;
        cellIndex_LocalIndex = clg.cellIndex_LocalIndex;
        sortedToUnsortedIndex = clg.sortedToUnsortedIndex;
        sortedToSortedIndexNoGhost = clg.sortedToSortedIndexNoGhost;
        unsortedToSortedIndex = clg.unsortedToSortedIndex;
        boxNeighborCellOffset = clg.boxNeighborCellOffset;
        boxNeighborCellOffsetSym = clg.boxNeighborCellOffsetSym;
        rcutNeighborCellOffset = clg.rcutNeighborCellOffset;

        unitCellP2 = clg.unitCellP2;
        numCellDim = clg.numCellDim;
        cellPadDim = clg.cellPadDim;
        ghostMarker = clg.ghostMarker;
        nDecRefRedec = clg.nDecRefRedec;

        boxNeighborNumber = clg.boxNeighborNumber;
    CellList_gpu<dim,T,Memory,transform_type> &
    operator=(CellList_gpu<dim,T,Memory,transform_type> && clg)
    {
        static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this)->swap(*static_cast<CellDecomposer_sm<dim,T,transform_type> *>(&clg));
        numPartInCell.swap(clg.numPartInCell);
        cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
        numPartInCellPrefixSum.swap(clg.numPartInCellPrefixSum);
        cellIndex_LocalIndex.swap(clg.cellIndex_LocalIndex);
        sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
        sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
        unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);
        boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
        boxNeighborCellOffsetSym.swap(clg.boxNeighborCellOffsetSym);
        rcutNeighborCellOffset.swap(clg.rcutNeighborCellOffset);

        unitCellP2 = clg.unitCellP2;
        numCellDim = clg.numCellDim;
        cellPadDim = clg.cellPadDim;
        ghostMarker = clg.ghostMarker;
        nDecRefRedec = clg.nDecRefRedec;

        boxNeighborNumber = clg.boxNeighborNumber;
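    /*! \brief Scatter the reordered positions/properties back to the original (unsorted)
     *  particle order using sortedToUnsortedIndex (CL_GPU_RESTORE_POSITION /
     *  CL_GPU_RESTORE_PROPERTY).
     */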
    template<typename vector, typename vector_prp, unsigned int ... prp>

        vector & vPosReordered,
        vector_prp & vPrpReordered,

        if (stop == (size_t)-1) stop = vPosReordered.size();

        if (opt & CL_GPU_RESTORE_POSITION) {
            CUDA_LAUNCH((reorderParticlesPos),
                vPosReordered.getGPUIteratorTo(stop-start,64),
                vPosReordered.toKernel(),

                sortedToUnsortedIndex.toKernel(),

        if (opt & CL_GPU_RESTORE_PROPERTY && sizeof...(prp)) {
            (reorderParticlesPrp<
                decltype(vPrpReordered.toKernel()),
                decltype(sortedToUnsortedIndex.toKernel()),

                vPrpReordered.getGPUIteratorTo(stop-start,64),
                vPrpReordered.toKernel(),

                sortedToUnsortedIndex.toKernel(),
    inline size_t getOpt() const

    void setOpt(size_t opt)
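/*! \brief Sparse (is_sparse = true) specialization of CellList_gpu
 *
 * Only non-empty cells are stored: vecSparseCellIndex_PartIndex maps each occupied cell
 * index to the position of its first particle in the sorted order, and the neighbor
 * particle ranges are precomputed per occupied cell (nonEmptyNeighborCellCount /
 * neighborPartIndexFrom_To).
 */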
template<unsigned int dim,
    typename T,
    typename Memory,
    typename transform_type>
class CellList_gpu<dim,T,Memory,transform_type,true> : public CellDecomposer_sm<dim,T,transform_type>

    typedef int ids_type;

    size_t boxNeighborNumber;
    void InitializeStructures(
        const size_t (& div)[dim],
        size_t tot_n_cell,
        size_t pad)
    {
        for (size_t i = 0 ; i < dim ; i++)
        {
            numCellDim[i] = div[i];
            unitCellP2[i] = this->getCellBox().getP2().get(i);
        }

        boxNeighborNumber = 1;
        constructNeighborCellOffset(boxNeighborNumber);
    }

    void constructNeighborCellOffset(size_t boxNeighborNumber)
    {
        NNcalc_box(boxNeighborNumber,boxNeighborCellOffset,this->getGrid());

        boxNeighborCellOffset.template hostToDevice<0>();
    }
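    /*! \brief Sparse construction
     *
     * 1. compute the cell index of every particle, 2. flush the (cell index, particle index)
     * pairs into the sparse vector vecSparseCellIndex_PartIndex, which yields the occupied
     * cells and the cell-sorted particle order, 3. for every occupied cell count and then
     * record the particle ranges of its non-empty neighbor cells, 4. build the
     * sorted <-> unsorted index maps and the domain-only sorted ids.
     */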
    template<typename vector, typename vector_prp>
    void construct_sparse(

        this->gpuContext = &gpuContext;
        this->ghostMarker = ghostMarker;

        if (stop == (size_t)-1) stop = vPos.size();

        cellIndex.resize(stop - start);
        cellIndex.template fill<0>(0);

        auto ite_gpu = vPos.getGPUIteratorTo(stop-start,1024);

        if (ite_gpu.wthr.x == 0 || vPos.size() == 0 || stop == 0)
            return;

        // compute the cell index of every particle
        CUDA_LAUNCH((fill_cellIndex<dim,T,ids_type>),ite_gpu,

            this->getTransform(),

        cellIndexLocalIndexToUnsorted.resize(stop-start);

        vecSparseCellIndex_PartIndex.clear();
        vecSparseCellIndex_PartIndex.template setBackground<0>((unsigned int)-1);

        // insert the (cell index, particle index) pairs into the sparse vector
        CUDA_LAUNCH((fill_vsCellIndex_PartIndex),ite_gpu,
            vecSparseCellIndex_PartIndex.toKernel(),

        // flush the insertions: compacts the occupied cells and fills cellIndexLocalIndexToUnsorted
        vecSparseCellIndex_PartIndex.template flush_vd<sstart_<0>>(cellIndexLocalIndexToUnsorted,gpuContext,FLUSH_ON_DEVICE);

        nonEmptyNeighborCellCount.resize(vecSparseCellIndex_PartIndex.size()+1);
        nonEmptyNeighborCellCount.template fill<0>(0);

        // count, for every occupied cell, how many of its neighbor cells are non-empty
        auto itgg = vecSparseCellIndex_PartIndex.getGPUIterator();
        CUDA_LAUNCH((countNonEmptyNeighborCells),itgg,
            vecSparseCellIndex_PartIndex.toKernel(),
            nonEmptyNeighborCellCount.toKernel(),
            boxNeighborCellOffset.toKernel()

            (unsigned int *)nonEmptyNeighborCellCount.template getDeviceBuffer<0>(),
            nonEmptyNeighborCellCount.size(),
            (unsigned int *)nonEmptyNeighborCellCount.template getDeviceBuffer<0>(),

        nonEmptyNeighborCellCount.template deviceToHost<0>(nonEmptyNeighborCellCount.size()-1, nonEmptyNeighborCellCount.size()-1);
        size_t totalNeighborCellCount = nonEmptyNeighborCellCount.template get<0>(nonEmptyNeighborCellCount.size()-1);

        // record, per occupied cell, the particle ranges of its non-empty neighbor cells
        neighborPartIndexFrom_To.resize(totalNeighborCellCount);
        CUDA_LAUNCH((fillNeighborCellList),itgg,
            vecSparseCellIndex_PartIndex.toKernel(),
            nonEmptyNeighborCellCount.toKernel(),
            boxNeighborCellOffset.toKernel(),
            neighborPartIndexFrom_To.toKernel(),
            (typename decltype(vecSparseCellIndex_PartIndex.toKernel())::index_type)cellIndexLocalIndexToUnsorted.size()

        // bidirectional sorted <-> unsorted index maps
        sortedToUnsortedIndex.resize(stop-start);
        unsortedToSortedIndex.resize(vPrp.size());

        auto ite = vPos.getGPUIteratorTo(stop-start,64);

        CUDA_LAUNCH((constructSortUnsortBidirectMap),
            vPrp.getGPUIteratorTo(stop-start,64),
            sortedToUnsortedIndex.toKernel(),
            unsortedToSortedIndex.toKernel(),
            cellIndexLocalIndexToUnsorted.toKernel()

        constructSortedToSortedIndexNoGhost(gpuContext,start,stop,ghostMarker);

        // emitted only when this header is compiled without NVCC (CUDA kernels unavailable)
        std::cout << "Error: " << __FILE__ << ":" << __LINE__
            << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it looks like it was compiled by the standard system compiler" << std::endl;
    void constructSortedToSortedIndexNoGhost(

        isSortedDomainOrGhost.resize(stop-start+1);
        auto ite = isSortedDomainOrGhost.getGPUIterator();

        CUDA_LAUNCH((mark_domain_particles),ite,
            sortedToUnsortedIndex.toKernel(),
            isSortedDomainOrGhost.toKernel(),

            (unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),
            isSortedDomainOrGhost.size(),
            (unsigned int *)isSortedDomainOrGhost.template getDeviceBuffer<0>(),

        isSortedDomainOrGhost.template deviceToHost<0>(isSortedDomainOrGhost.size()-1,isSortedDomainOrGhost.size()-1);
        auto totalParticleNoGhostCount = isSortedDomainOrGhost.template get<0>(isSortedDomainOrGhost.size()-1);

        sortedToSortedIndexNoGhost.resize(totalParticleNoGhostCount);

        CUDA_LAUNCH((collect_domain_ghost_ids),ite,
            isSortedDomainOrGhost.toKernel(),
            sortedToSortedIndexNoGhost.toKernel()
    typedef int yes_is_gpu_celllist;

    static const unsigned int dim_ = dim;

    typedef ids_type ids_type_;
    typedef transform_type transform_type_;
    typedef boost::mpl::bool_<true> is_sparse_;
    CellList_gpu(const CellList_gpu<dim,T,Memory,transform_type> & clg)
    {
        this->operator=(clg);
    }

    CellList_gpu(CellList_gpu<dim,T,Memory,transform_type> && clg)
    {
        this->operator=(std::move(clg));
    }

    CellList_gpu() : opt(CL_NON_SYMMETRIC) {}

        const size_t (&div)[dim],
        const size_t pad = 1)
    : opt(CL_NON_SYMMETRIC)
    {
        Initialize(box,div,pad);
    }
    void setBoxNN(unsigned int n_NN)
    {
        boxNeighborNumber = n_NN;
        constructNeighborCellOffset(n_NN);
    }

    inline size_t getBoxNN() const
    {
        return boxNeighborNumber;
    }

        constructNeighborCellOffset(boxNeighborNumber);
        const size_t (&div)[dim],
        const size_t pad = 1)
    {
        CellDecomposer_sm<dim,T,transform_type>::setDimensions(box, div, mat, pad);

        InitializeStructures(this->cellListGrid.getSize(),this->cellListGrid.size(),pad);
    }
    getSortToNonSort() {
        return sortedToUnsortedIndex;
    }

    getNonSortToSort() {
        return unsortedToSortedIndex;
    }

    getDomainSortIds() {
        return sortedToSortedIndexNoGhost;
    }
    void setRadius(T radius)
    {
        std::cerr << "setRadius() is supported by the dense cell list only!\n";
    }
    template<typename vector, typename vector_prp>

        size_t ghostMarker = 0,

        if (opt & CL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " symmetric cell list on GPU is not implemented (and never will be: race conditions make it unsuitable for the GPU)" << std::endl;
        }
        else if (opt & CL_LOCAL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " local symmetric cell list on GPU is not implemented" << std::endl;
        }
        else if (opt & CL_NON_SYMMETRIC) {
            construct_sparse(vPos,vPrp,gpuContext,ghostMarker,start,stop);
        }

        // emitted only when this header is compiled without NVCC (CUDA kernels unavailable)
        std::cout << "Error: " << __FILE__ << ":" << __LINE__
            << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it looks like it was compiled by the standard system compiler" << std::endl;
    template<typename vector, typename vector_prp, unsigned int ... prp>

        vector & vPosReorder,
        vector_prp & vPrpReorder,

        size_t ghostMarker = 0,

        if (opt & CL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " symmetric cell list on GPU is not implemented (and never will be: race conditions make it unsuitable for the GPU)" << std::endl;
        }
        else if (opt & CL_LOCAL_SYMMETRIC) {
            std::cout << __FILE__ << ":" << __LINE__
                << " local symmetric cell list on GPU is not implemented" << std::endl;
        }
        else if (opt & CL_NON_SYMMETRIC) {
            if (!(opt & CL_GPU_SKIP_CONSTRUCT_ON_STATIC_DOMAIN))
                construct_sparse(vPos,vPrp,gpuContext,ghostMarker,start,stop);

            if (stop == (size_t)-1) stop = vPos.size();

            // write positions in cell-sorted order into vPosReorder
            if (opt & CL_GPU_REORDER_POSITION) {
                CUDA_LAUNCH((reorderParticlesPos),
                    vPos.getGPUIteratorTo(stop-start,64),

                    vPosReorder.toKernel(),
                    unsortedToSortedIndex.toKernel(),

            // write the selected properties prp... in cell-sorted order into vPrpReorder
            if (opt & CL_GPU_REORDER_PROPERTY && sizeof...(prp)) {
                (reorderParticlesPrp<
                    decltype(vPrp.toKernel()),
                    decltype(unsortedToSortedIndex.toKernel()),

                    vPrp.getGPUIteratorTo(stop-start,64),

                    vPrpReorder.toKernel(),
                    unsortedToSortedIndex.toKernel(),

        // emitted only when this header is compiled without NVCC (CUDA kernels unavailable)
        std::cout << "Error: " << __FILE__ << ":" << __LINE__
            << " you are calling CellList_gpu.construct(): this function must be compiled with the NVCC compiler, but it looks like it was compiled by the standard system compiler" << std::endl;
            nonEmptyNeighborCellCount.toKernel(),
            neighborPartIndexFrom_To.toKernel(),
            vecSparseCellIndex_PartIndex.toKernel(),
            sortedToUnsortedIndex.toKernel(),
            sortedToSortedIndexNoGhost.toKernel(),

            this->getTransform(),

            this->cellListSpaceBox,
        cellIndexLocalIndexToUnsorted.clear();

        sortedToUnsortedIndex.clear();

    size_t ghostMarker = 0;
    inline size_t getGhostMarker()

    inline void setGhostMarker(size_t ghostMarker)
    {
        this->ghostMarker = ghostMarker;
    }

    void set_ndec(size_t nDecRefRedec)
    {
        this->nDecRefRedec = nDecRefRedec;
    }

    size_t get_ndec() const
    {
        return nDecRefRedec;
    }
    void debug_deviceToHost()
    {
        cellIndexLocalIndexToUnsorted.template deviceToHost<0>();
        cellIndex.template deviceToHost<0>();
    }
    inline auto get(size_t cell, size_t ele) -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele))
    {
        return cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele);
    }

    inline auto get(size_t cell, size_t ele) const -> decltype(cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele))
    {
        return cellIndexLocalIndexToUnsorted.template get<0>(cellIndex.template get<0>(cell)+ele);
    }
    void swap(CellList_gpu<dim,T,Memory,transform_type,true> & clg)
    {
        ((CellDecomposer_sm<dim,T,transform_type> *)this)->swap(clg);
        cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
        cellIndex.swap(clg.cellIndex);
        vecSparseCellIndex_PartIndex.swap(clg.vecSparseCellIndex_PartIndex);
        nonEmptyNeighborCellCount.swap(clg.nonEmptyNeighborCellCount);
        neighborPartIndexFrom_To.swap(clg.neighborPartIndexFrom_To);
        boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
        sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
        sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
        unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);

        unitCellP2.swap(clg.unitCellP2);
        numCellDim.swap(clg.numCellDim);
        cellPadDim.swap(clg.cellPadDim);

        size_t g_m_tmp = ghostMarker;
        ghostMarker = clg.ghostMarker;
        clg.ghostMarker = g_m_tmp;

        size_t n_dec_tmp = nDecRefRedec;
        nDecRefRedec = clg.nDecRefRedec;
        clg.nDecRefRedec = n_dec_tmp;

        size_t optTmp = opt;

        int boxNN_tmp = boxNeighborNumber;
        boxNeighborNumber = clg.boxNeighborNumber;
        clg.boxNeighborNumber = boxNN_tmp;
    }
    CellList_gpu<dim,T,Memory,transform_type,true> &
    operator=(const CellList_gpu<dim,T,Memory,transform_type,true> & clg)
    {
        *static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this) = *static_cast<const CellDecomposer_sm<dim,T,transform_type> *>(&clg);
        cellIndexLocalIndexToUnsorted = clg.cellIndexLocalIndexToUnsorted;
        cellIndex = clg.cellIndex;
        vecSparseCellIndex_PartIndex = clg.vecSparseCellIndex_PartIndex;
        nonEmptyNeighborCellCount = clg.nonEmptyNeighborCellCount;
        neighborPartIndexFrom_To = clg.neighborPartIndexFrom_To;
        boxNeighborCellOffset = clg.boxNeighborCellOffset;
        sortedToUnsortedIndex = clg.sortedToUnsortedIndex;
        sortedToSortedIndexNoGhost = clg.sortedToSortedIndexNoGhost;
        unsortedToSortedIndex = clg.unsortedToSortedIndex;

        unitCellP2 = clg.unitCellP2;
        numCellDim = clg.numCellDim;
        cellPadDim = clg.cellPadDim;
        ghostMarker = clg.ghostMarker;
        nDecRefRedec = clg.nDecRefRedec;

        boxNeighborNumber = clg.boxNeighborNumber;
    CellList_gpu<dim,T,Memory,transform_type> &
    operator=(CellList_gpu<dim,T,Memory,transform_type> && clg)
    {
        static_cast<CellDecomposer_sm<dim,T,transform_type> *>(this)->swap(*static_cast<CellDecomposer_sm<dim,T,transform_type> *>(&clg));
        cellIndexLocalIndexToUnsorted.swap(clg.cellIndexLocalIndexToUnsorted);
        cellIndex.swap(clg.cellIndex);
        vecSparseCellIndex_PartIndex.swap(clg.vecSparseCellIndex_PartIndex);
        nonEmptyNeighborCellCount.swap(clg.nonEmptyNeighborCellCount);
        neighborPartIndexFrom_To.swap(clg.neighborPartIndexFrom_To);
        boxNeighborCellOffset.swap(clg.boxNeighborCellOffset);
        sortedToUnsortedIndex.swap(clg.sortedToUnsortedIndex);
        sortedToSortedIndexNoGhost.swap(clg.sortedToSortedIndexNoGhost);
        unsortedToSortedIndex.swap(clg.unsortedToSortedIndex);

        unitCellP2 = clg.unitCellP2;
        numCellDim = clg.numCellDim;
        cellPadDim = clg.cellPadDim;
        ghostMarker = clg.ghostMarker;
        nDecRefRedec = clg.nDecRefRedec;

        boxNeighborNumber = clg.boxNeighborNumber;
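    /*! \brief Same as in the dense specialization: scatter the reordered
     *  positions/properties back to the original (unsorted) particle order using
     *  sortedToUnsortedIndex (CL_GPU_RESTORE_POSITION / CL_GPU_RESTORE_PROPERTY).
     */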
    template<typename vector, typename vector_prp, unsigned int ... prp>

        vector & vPosReordered,
        vector_prp & vPrpReordered,

        if (stop == (size_t)-1) stop = vPosReordered.size();

        if (opt & CL_GPU_RESTORE_POSITION) {
            CUDA_LAUNCH((reorderParticlesPos),
                vPosReordered.getGPUIteratorTo(stop-start,64),
                vPosReordered.toKernel(),

                sortedToUnsortedIndex.toKernel(),

        if (opt & CL_GPU_RESTORE_PROPERTY && sizeof...(prp)) {
            (reorderParticlesPrp<
                decltype(vPrpReordered.toKernel()),
                decltype(sortedToUnsortedIndex.toKernel()),

                vPrpReordered.getGPUIteratorTo(stop-start,64),
                vPrpReordered.toKernel(),

                sortedToUnsortedIndex.toKernel(),
    inline size_t getOpt() const

    void setOpt(size_t opt)
template<template <typename> class layout_base, typename T>

        typename T::ids_type_,
        typename T::transform_type_,
        T::is_sparse_::value> type;
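/* Usage sketch (illustrative only): building and querying a dense GPU cell list.
 * The particle vectors, the gpuContext object and the transform/memory template
 * arguments (shift_only, CudaMemory) are assumptions taken from typical OpenFPM
 * usage and may differ in your setup.
 *
 *   // decompose the unit box into 16^3 cells, 1 padding cell per side
 *   Box<3,float> box({0.0f,0.0f,0.0f},{1.0f,1.0f,1.0f});
 *   size_t div[3] = {16,16,16};
 *   CellList_gpu<3,float,CudaMemory,shift_only<3,float>> cl(box,div,1);
 *
 *   // vPos/vPrp are device vectors of positions/properties; ghostMarker separates
 *   // domain particles [0,ghostMarker) from ghost particles [ghostMarker,size)
 *   cl.construct(vPos,vPrp,gpuContext,ghostMarker);
 *
 *   // inside a kernel, the object returned by toKernel() (a CellList_gpu_ker) can be
 *   // used to iterate the neighbor cells of a given particle position
 */

#endif /* OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ */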