OpenFPM  5.2.0
Project that contains the implementation of distributed structures
Dcpse.cuh
1 //
2 // Created by Serhii
3 //
4 #ifndef OPENFPM_PDATA_DCPSE_CUH
5 #define OPENFPM_PDATA_DCPSE_CUH
6 
7 #if defined(__NVCC__)
8 
9 #include "Vector/vector_dist.hpp"
10 #include "MonomialBasis.hpp"
11 #include "SupportBuilder.hpp"
12 #include "SupportBuilder.cuh"
13 #include "Vandermonde.hpp"
14 #include "DcpseDiagonalScalingMatrix.hpp"
15 #include "DcpseRhs.hpp"
16 
17 #include <chrono>
18 
19 // CUDA
20 #include <cuda.h>
21 #include <cuda_runtime.h>
22 #include <cusolverDn.h>
23 
24 
25 template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename calcKernels_type>
26 __global__ void calcKernels_gpu(particles_type, monomialBasis_type, supportKey_type, supportKey_type, T**, localEps_type, size_t, calcKernels_type);
27 
28 template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename matrix_type>
29 __global__ void assembleLocalMatrices_gpu( particles_type, Point<dim, unsigned int>, unsigned int, monomialBasis_type, supportKey_type, supportKey_type, supportKey_type,
30  T**, T**, localEps_type, localEps_type, matrix_type, size_t, size_t);
31 
32 
/*! \brief DC-PSE differential operator assembled and evaluated on the GPU.
 *
 * For every local particle a small moment-condition linear system is built and
 * solved (batched cuBLAS LU) to obtain the DC-PSE kernel coefficients for the
 * derivative described by `differentialSignature` at order `convergenceOrder`.
 *
 * T must be float or double (cuBLAS restriction, enforced below).
 *
 * NOTE(review): members `particles` and `kerOffsets` are referenced throughout
 * the implementation but their declarations are not visible in this chunk —
 * presumably dropped by extraction; verify against the full header.
 */
template<unsigned int dim, typename VerletList_type, typename vector_type, class T = typename vector_type::stype>
class Dcpse_gpu {
    static_assert(std::is_floating_point<T>::value, "CUBLAS supports only float or double");

public:
    typedef typename vector_type::value_type part_type;
    typedef vector_type vtype;

    #ifdef SE_CLASS1
    // snapshot of the particle-set map counter, used to detect a missing
    // operator update after map()
    int update_ctr=0;
    #endif
    // This works in this way:
    // 1) User constructs this by giving a domain of points (where one of the properties is the value of our f),
    // the signature of the differential operator and the error order bound.
    // 2) The machinery for assembling and solving the linear system for coefficients starts...
    // 3) The user can then call an evaluate(point) method to get the evaluation of the differential operator
    // on the given point.
    double HOverEpsilon=0.9;
private:
    const Point<dim, unsigned int> differentialSignature; // multi-index of the derivative, e.g. (2,0) = d^2/dx^2
    const unsigned int differentialOrder;                 // total order of the derivative
    MonomialBasis<dim> monomialBasis;                     // polynomial basis spanning the moment conditions

    // shared local support previously built by another operator
    bool isSharedSupport = false;
    openfpm::vector_custd<size_t> supportRefs; // Each MPI rank has just access to the local ones
    openfpm::vector_custd<size_t> supportKeys1D; // flattened neighbour lists of all particles

    openfpm::vector_custd<T> localEps; // Each MPI rank has just access to the local ones
    openfpm::vector_custd<T> localEpsInvPow; // Each MPI rank has just access to the local ones
    openfpm::vector_custd<T> calcKernels; // per-particle kernel values, same layout as supportKeys1D

    double rCut;                   // support radius
    unsigned int convergenceOrder; // requested order of convergence

    size_t maxSupportSize;    // largest neighbour count over all particles
    size_t supportKeysTotalN; // total number of entries in supportKeys1D

    support_options opt; // support construction strategy (GPU path: RADIUS only)

public:
#ifdef SE_CLASS1
    // Return the map counter recorded at the last (re)build of the operator.
    int getUpdateCtr() const
    {
        return update_ctr;
    }
#endif
83 
    // Here we require the first element of the aggregate to be:
    // 1) the value of the function f on the point
    /*! \brief Build the operator and immediately assemble the kernels.
     *
     * \param verletList neighbour list used to build the support
     * \param differentialSignature multi-index of the requested derivative
     * \param convergenceOrder desired order of convergence
     * \param rCut support radius
     * \param opt support strategy (only RADIUS is supported on GPU)
     *
     * NOTE(review): the call below uses `particles`, which is neither a
     * parameter nor a visible member in this view — a `vector_type &particles`
     * line appears to have been dropped by extraction; confirm in the full
     * source.
     */
    Dcpse_gpu(
        VerletList_type& verletList,
        Point<dim, unsigned int> differentialSignature,
        unsigned int convergenceOrder,
        T rCut,
        support_options opt = support_options::RADIUS
    ):
        differentialSignature(differentialSignature),
        differentialOrder(Monomial<dim>(differentialSignature).order()),
        monomialBasis(differentialSignature.asArray(), convergenceOrder),
        maxSupportSize(0),
        supportKeysTotalN(0),
        opt(opt)
    {
        initializeStaticSize(particles, convergenceOrder, rCut);
    }
105 
    /*! \brief Build the operator reusing the support structures of another one.
     *
     * Copies supportRefs / supportKeys1D / kerOffsets from `other` and marks
     * the support as shared so it is not rebuilt; only the kernels for the new
     * differential signature are assembled.
     *
     * NOTE(review): the initializer list references `particles` and
     * `kerOffsets`, which are not declared in this view — presumably dropped
     * by extraction; confirm against the full header.
     */
    Dcpse_gpu(
        VerletList_type& verletList,
        const Dcpse_gpu<dim, VerletList_type, vector_type, T>& other,
        Point<dim, unsigned int> differentialSignature,
        unsigned int convergenceOrder,
        T rCut,
        support_options opt = support_options::RADIUS
    ):
        particles(particles), opt(opt),
        differentialSignature(differentialSignature),
        differentialOrder(Monomial<dim>(differentialSignature).order()),
        monomialBasis(differentialSignature.asArray(), convergenceOrder),
        supportRefs(other.supportRefs),
        supportKeys1D(other.supportKeys1D),
        kerOffsets(other.kerOffsets),
        maxSupportSize(other.maxSupportSize),
        supportKeysTotalN(other.supportKeysTotalN),
        isSharedSupport(true)
    {
        initializeStaticSize(particles, convergenceOrder, rCut);
    }
129 
130  template<unsigned int prp>
131  void DrawKernel(vector_type &particles, int k)
132  {
133  size_t xpK = k;
134  size_t kerOff = kerOffsets.get(k);
135 
136  size_t supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
137  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
138 
139  for (int i = 0; i < supportKeysSize; i++)
140  {
141  size_t xqK = supportKeys[i];
142 
143  particles.template getProp<prp>(xqK) += calcKernels.get(kerOff+i);
144  }
145  }
146 
147  template<unsigned int prp>
148  void DrawKernelNN(vector_type &particles, int k)
149  {
150  size_t xpK = k;
151  size_t kerOff = kerOffsets.get(k);
152  size_t supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
153  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
154 
155  for (int i = 0; i < supportKeysSize; i++)
156  {
157  size_t xqK = supportKeys[i];
158 
159  particles.template getProp<prp>(xqK) = 1.0;
160  }
161  }
162 
163  template<unsigned int prp>
164  void DrawKernel(vector_type &particles, int k, int i)
165  {
166  size_t xpK = k;
167  size_t kerOff = kerOffsets.get(k);
168  size_t supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
169  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
170 
171  for (int i = 0; i < supportKeysSize; i++)
172  {
173  size_t xqK = supportKeys[i];
174 
175  particles.template getProp<prp>(xqK)[i] += calcKernels.get(kerOff+i);
176  }
177  }
178 
179  void checkMomenta(vector_type &particles)
180  {
183 
184  momenta.resize(monomialBasis.size());
185  momenta_accu.resize(monomialBasis.size());
186 
187  for (int i = 0; i < momenta.size(); i++)
188  {
189  momenta.template get<0>(i) = 3000000000.0;
190  momenta.template get<1>(i) = -3000000000.0;
191  }
192 
193  size_t N = particles.size_local();
194  for (size_t j = 0; j < N; ++j)
195  {
196  double eps = localEps.get(j);
197 
198  for (int i = 0; i < momenta.size(); i++)
199  {
200  momenta_accu.template get<0>(i) = 0.0;
201  }
202 
203  size_t xpK = supportRefs.get(j);
204  Point<dim, T> xp = particles.getPos(xpK);
205 
206  size_t kerOff = kerOffsets.get(xpK);
207  size_t supportKeysSize = kerOffsets.get(j+1)-kerOffsets.get(j);
208  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];
209 
210  for (int i = 0; i < supportKeysSize; i++)
211  {
212  size_t xqK = supportKeys[i];
213  Point<dim, T> xq = particles.getPos(xqK);
214  Point<dim, T> normalizedArg = (xp - xq) / eps;
215 
216  auto ker = calcKernels.get(kerOff+i);
217 
218  int counter = 0;
219  size_t N = monomialBasis.getElements().size();
220 
221  for (size_t i = 0; i < N; ++i)
222  {
223  const Monomial<dim> &m = monomialBasis.getElement(i);
224 
225  T mbValue = m.evaluate(normalizedArg);
226  momenta_accu.template get<0>(counter) += mbValue * ker;
227 
228  ++counter;
229  }
230  }
231 
232  for (int i = 0; i < momenta.size(); i++)
233  {
234  if (momenta_accu.template get<0>(i) < momenta.template get<0>(i))
235  {
236  momenta.template get<0>(i) = momenta_accu.template get<0>(i);
237  }
238 
239  if (momenta_accu.template get<1>(i) > momenta.template get<1>(i))
240  {
241  momenta.template get<1>(i) = momenta_accu.template get<0>(i);
242  }
243  }
244  }
245 
246  for (int i = 0; i < momenta.size(); i++)
247  {
248  std::cout << "MOMENTA: " << monomialBasis.getElements()[i] << "Min: " << momenta.template get<0>(i) << " " << "Max: " << momenta.template get<1>(i) << std::endl;
249  }
250  }
251 
260  template<unsigned int fValuePos, unsigned int DfValuePos>
261  void computeDifferentialOperator(vector_type &particles) {
262  char sign = 1;
263  if (differentialOrder % 2 == 0) {
264  sign = -1;
265  }
266 
267  size_t N = particles.size_local();
268  for (size_t j = 0; j < N; ++j)
269  {
270  double epsInvPow = localEpsInvPow.get(j);
271 
272  T Dfxp = 0;
273  size_t xpK = supportRefs.get(j);
275  T fxp = sign * particles.template getProp<fValuePos>(xpK);
276  size_t kerOff = kerOffsets.get(xpK);
277 
278  size_t supportKeysSize = kerOffsets.get(j+1)-kerOffsets.get(j);
279  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];
280 
281  for (int i = 0; i < supportKeysSize; i++)
282  {
283  size_t xqK = supportKeys[i];
284  T fxq = particles.template getProp<fValuePos>(xqK);
285 
286  Dfxp += (fxq + fxp) * calcKernels.get(kerOff+i);
287  }
288  Dfxp *= epsInvPow;
289 
290  particles.template getProp<DfValuePos>(xpK) = Dfxp;
291  }
292  }
293 
294 
300  inline int getNumNN(const vect_dist_key_dx &key)
301  {
302  return kerOffsets.get(key.getKey()+1)-kerOffsets.get(key.getKey());
303  }
304 
313  inline T getCoeffNN(const vect_dist_key_dx &key, int j)
314  {
315  size_t base = kerOffsets.get(key.getKey());
316  return calcKernels.get(base + j);
317  }
318 
324  inline size_t getIndexNN(const vect_dist_key_dx &key, int j)
325  {
326  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
327  return supportKeys[j];
328  }
329 
330 
331  inline T getSign()
332  {
333  T sign = 1.0;
334  if (differentialOrder % 2 == 0) {
335  sign = -1;
336  }
337 
338  return sign;
339  }
340 
341  T getEpsilonInvPrefactor(const vect_dist_key_dx &key)
342  {
343  return localEpsInvPow.get(key.getKey());
344  }
345 
354  template<typename op_type>
355  auto computeDifferentialOperator(const vect_dist_key_dx &key,
356  op_type &o1) -> decltype(is_scalar<std::is_fundamental<decltype(o1.value(
357  key))>::value>::analyze(key, o1)) {
358 
359  typedef decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1)) expr_type;
360 
361  T sign = 1.0;
362  if (differentialOrder % 2 == 0) {
363  sign = -1;
364  }
365 
366  double eps = localEps.get(key.getKey());
367  double epsInvPow = localEpsInvPow.get(key.getKey());
368 
369  auto &particles = o1.getVector();
370 
371 #ifdef SE_CLASS1
372  if(particles.getMapCtr()!=this->getUpdateCtr())
373  {
374  std::cerr<<__FILE__<<":"<<__LINE__<<" Error: You forgot a DCPSE operator update after map."<<std::endl;
375  }
376 #endif
377 
378  expr_type Dfxp = 0;
379  size_t xpK = supportRefs.get(key.getKey());
380  Point<dim, T> xp = particles.getPos(xpK);
381  expr_type fxp = sign * o1.value(key);
382  size_t kerOff = kerOffsets.get(xpK);
383 
384  size_t supportKeysSize = kerOffsets.get(key.getKey()+1)-kerOffsets.get(key.getKey());
385  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
386 
387  for (int i = 0; i < supportKeysSize; i++)
388  {
389  size_t xqK = supportKeys[i];
390  expr_type fxq = o1.value(vect_dist_key_dx(xqK));
391  Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+i);
392  }
393  Dfxp = Dfxp * epsInvPow;
394 
395  // T trueDfxp = particles.template getProp<2>(xpK);
396  // Store Dfxp in the right position
397  return Dfxp;
398  }
399 
409  template<typename op_type>
410  auto computeDifferentialOperator(const vect_dist_key_dx &key,
411  op_type &o1,
412  int i) -> typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(
413  key))>::value>::analyze(key, o1))::coord_type {
414 
415  typedef typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1))::coord_type expr_type;
416 
417  T sign = 1.0;
418  if (differentialOrder % 2 == 0) {
419  sign = -1;
420  }
421 
422  double eps = localEps.get(key.getKey());
423  double epsInvPow = localEpsInvPow.get(key.getKey());
424 
425  auto &particles = o1.getVector();
426 
427 #ifdef SE_CLASS1
428  if(particles.getMapCtr()!=this->getUpdateCtr())
429  {
430  std::cerr<<__FILE__<<":"<<__LINE__<<" Error: You forgot a DCPSE operator update after map."<<std::endl;
431  }
432 #endif
433 
434  expr_type Dfxp = 0;
435  size_t xpK = supportRefs.get(key.getKey());
436 
437  Point<dim, T> xp = particles.getPos(xpK);
438  expr_type fxp = sign * o1.value(key)[i];
439  size_t kerOff = kerOffsets.get(xpK);
440  size_t supportKeysSize = kerOffsets.get(key.getKey()+1)-kerOffsets.get(key.getKey());
441  size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
442 
443  for (int j = 0; j < supportKeysSize; j++)
444  {
445  size_t xqK = supportKeys[j];
446  expr_type fxq = o1.value(vect_dist_key_dx(xqK))[i];
447  Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+j);
448  }
449  Dfxp = Dfxp * epsInvPow;
450  //
451  //T trueDfxp = particles.template getProp<2>(xpK);
452  // Store Dfxp in the right position
453  return Dfxp;
454  }
455 
    /*! \brief Rebuild the operator after the particle set changed (e.g. after map()).
     *
     * Drops all previously computed supports and kernels and reassembles them
     * for the current particle ordering.
     *
     * \param particles the (re-mapped) particle set
     */
    void initializeUpdate(vector_type &particles)
    {
#ifdef SE_CLASS1
        update_ctr=particles.getMapCtr();
#endif

        // discard everything tied to the old particle ordering
        kerOffsets.clear();
        supportKeys1D.clear();
        supportRefs.clear();
        localEps.clear();
        localEpsInvPow.clear();
        calcKernels.clear();

        initializeStaticSize(particles, convergenceOrder, rCut);
    }
471 
472  /*
473  *
474  Save computed DCPSE coefficients to file
475  *
476  */
477  void save(const std::string &file){
478  auto & v_cl=create_vcluster();
479  size_t req = 0;
480 
481  Packer<decltype(supportRefs),CudaMemory>::packRequest(supportRefs,req);
482  Packer<decltype(kerOffsets),CudaMemory>::packRequest(kerOffsets,req);
483  Packer<decltype(supportKeys1D),CudaMemory>::packRequest(supportKeys1D,req);
484  Packer<decltype(localEps),CudaMemory>::packRequest(localEps,req);
485  Packer<decltype(localEpsInvPow),CudaMemory>::packRequest(localEpsInvPow,req);
486  Packer<decltype(calcKernels),CudaMemory>::packRequest(calcKernels,req);
487 
488  // allocate the memory
489  CudaMemory pmem;
490  ExtPreAlloc<CudaMemory> mem(req,pmem);
491 
492  //Packing
493  Pack_stat sts;
494 
495  Packer<decltype(supportRefs),CudaMemory>::pack(mem,supportRefs,sts);
496  Packer<decltype(kerOffsets),CudaMemory>::pack(mem,kerOffsets,sts);
497  Packer<decltype(supportKeys1D),CudaMemory>::pack(mem,supportKeys1D,sts);
498  Packer<decltype(localEps),CudaMemory>::pack(mem,localEps,sts);
499  Packer<decltype(localEpsInvPow),CudaMemory>::pack(mem,localEpsInvPow,sts);
500  Packer<decltype(calcKernels),CudaMemory>::pack(mem,calcKernels,sts);
501 
502  // Save into a binary file
503  std::ofstream dump (file+"_"+std::to_string(v_cl.rank()), std::ios::out | std::ios::binary);
504  if (dump.is_open() == false)
505  { std::cerr << __FILE__ << ":" << __LINE__ <<" Unable to write since dump is open at rank "<<v_cl.rank()<<std::endl;
506  return;
507  }
508  dump.write ((const char *)pmem.getPointer(), pmem.size());
509  return;
510  }
511 
    /*! \brief Load DC-PSE coefficients previously written by save().
     *
     * Reads the per-rank binary file and unpacks the structures in the same
     * order they were packed.
     *
     * \param file base name of the input file (rank suffix is appended)
     */
    void load(const std::string & file)
    {
        auto & v_cl=create_vcluster();
        // first open with ios::ate only to learn the file size
        std::ifstream fs (file+"_"+std::to_string(v_cl.rank()), std::ios::in | std::ios::binary | std::ios::ate );
        if (fs.is_open() == false)
        {
            std::cerr << __FILE__ << ":" << __LINE__ << " error, opening file: " << file << std::endl;
            return;
        }

        // take the size of the file
        size_t sz = fs.tellg();

        fs.close();

        // reopen the file without ios::ate to read
        std::ifstream input (file+"_"+std::to_string(v_cl.rank()), std::ios::in | std::ios::binary );
        if (input.is_open() == false)
        {//some message here maybe
            return;}

        // Create the CudaMemory memory
        size_t req = 0;
        req += sz;
        CudaMemory pmem;
        ExtPreAlloc<CudaMemory> mem(req,pmem);

        // NOTE(review): the allocation size is taken from pmem.size(), not from
        // req/sz — presumably ExtPreAlloc's constructor sized pmem; verify
        mem.allocate(pmem.size());

        // read the whole dump into the pre-allocated buffer
        input.read((char *)pmem.getPointer(), sz);

        //close the file
        input.close();

        //Unpacking (must mirror the pack order in save())
        Unpack_stat ps;

        Unpacker<decltype(supportRefs),CudaMemory>::unpack(mem,supportRefs,ps);
        Unpacker<decltype(kerOffsets),CudaMemory>::unpack(mem,kerOffsets,ps);
        Unpacker<decltype(supportKeys1D),CudaMemory>::unpack(mem,supportKeys1D,ps);
        Unpacker<decltype(localEps),CudaMemory>::unpack(mem,localEps,ps);
        Unpacker<decltype(localEpsInvPow),CudaMemory>::unpack(mem,localEpsInvPow,ps);
        Unpacker<decltype(calcKernels),CudaMemory>::unpack(mem,calcKernels,ps);
    }
561 
562 private:
    /*! \brief Build the supports and assemble the per-particle kernels.
     *
     * NOTE(review): callers invoke this as
     * initializeStaticSize(particles, convergenceOrder, rCut) and the body
     * uses `particles`, but no such parameter is visible here — a
     * `vector_type &particles` parameter line was presumably dropped from this
     * view; confirm against the full source.
     *
     * \param convergenceOrder desired convergence order
     * \param rCut support radius (float or double selects the cuBLAS path)
     */
    template <typename U>
    void initializeStaticSize(
        unsigned int convergenceOrder,
        U rCut
    ) {
#ifdef SE_CLASS1
        this->update_ctr=particles.getMapCtr();
#endif
        this->rCut=rCut;
        this->convergenceOrder=convergenceOrder;

        if (!isSharedSupport) {
            supportRefs.resize(particles.size_local());
        }
        localEps.resize(particles.size_local());
        localEpsInvPow.resize(particles.size_local());

        // time the support construction
        std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
        auto it = particles.getDomainIterator();

        if (opt==support_options::RADIUS) {
            if (!isSharedSupport) {
                // every particle is its own support reference point
                while (it.isNext()) {
                    auto key = it.get();
                    supportRefs.get(key.getKey()) = key.getKey();
                    ++it;
                }

                // build all neighbour lists on the GPU in one shot
                SupportBuilderGPU<vector_type> supportBuilder(particles, rCut);
                supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
            }
        }

        else
        {
            std::cerr<<__FILE__<<":"<<__LINE__<<" Error: DC-PSE on GPU supports support_options::RADIUS only!"<<std::endl;
        }

        std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> time_span2 = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
        std::cout << "Support building took " << time_span2.count() * 1000. << " milliseconds." << std::endl;

        // dispatch to the float/double cuBLAS assembly path
        assembleLocalMatrices_t(rCut);
    }
608 
    // Cublas subroutine selector: float or double
    // rCut is needed to overcome limitation of nested template class specialization
    // (overload resolution on the rCut type picks the right cuBLAS precision).
    template <typename U> void assembleLocalMatrices_t(U rCut) {
        throw std::invalid_argument("DCPSE operator error: CUBLAS supports only float or double"); }

    // float path: batched single-precision LU + triangular solves
    void assembleLocalMatrices_t(float rCut) {
        assembleLocalMatrices(cublasSgetrfBatched, cublasStrsmBatched); }

    // double path: batched double-precision LU + triangular solves
    void assembleLocalMatrices_t(double rCut) {
        assembleLocalMatrices(cublasDgetrfBatched, cublasDtrsmBatched); }
619 
620  template<typename cublasLUDec_type, typename cublasTriangSolve_type>
621  void assembleLocalMatrices(cublasLUDec_type cublasLUDecFunc, cublasTriangSolve_type cublasTriangSolveFunc) {
622  std::chrono::high_resolution_clock::time_point t3 = std::chrono::high_resolution_clock::now();
623 
624  // move monomial basis to kernel
625  auto& basis = monomialBasis.getBasis();
626  openfpm::vector_custd<Monomial_gpu<dim>> basisTemp(basis.begin(), basis.end());
627  basisTemp.template hostToDevice();
628  MonomialBasis<dim, aggregate<Monomial_gpu<dim>>, openfpm::vector_custd_ker, memory_traits_inte> monomialBasisKernel(basisTemp.toKernel());
629 
630  size_t numMatrices = supportRefs.size();
631  size_t monomialBasisSize = monomialBasis.size();
632 
633  int numSMs, numSMsMult = 1;
634  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
635  size_t numThreads = numSMs*numSMsMult*256;
636  std::cout << "numThreads " << numThreads << " numMatrices " << numMatrices << std::endl;
637 
638  // B is an intermediate matrix
639  openfpm::vector_custd<T> BMat(numThreads * maxSupportSize * monomialBasisSize);
640  // allocate device space for A, b
641  openfpm::vector_custd<T> AMat(numMatrices*monomialBasisSize*monomialBasisSize);
642  openfpm::vector_custd<T> bVec(numMatrices*monomialBasisSize);
643 
644  // create array of pointers to pass T** pointers to cublas subroutines
645  openfpm::vector_custd<T*> AMatPointers(numMatrices);
646  openfpm::vector_custd<T*> bVecPointers(numMatrices);
647 
648  auto AMatKernel = AMat.toKernel(); T* AMatKernelPointer = (T*) AMatKernel.getPointer();
649  for (size_t i = 0; i < numMatrices; i++) AMatPointers.get(i) = AMatKernelPointer + i*monomialBasisSize*monomialBasisSize;
650 
651  auto bVecKernel = bVec.toKernel(); T* bVecKernelPointer = (T*) bVecKernel.getPointer();
652  for (size_t i = 0; i < numMatrices; i++) bVecPointers.get(i) = bVecKernelPointer + i*monomialBasisSize;
653 
654  std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
655  std::chrono::duration<double> time_span0 = std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t3);
656  std::cout << "Preallocation took " << time_span0.count() * 1000. << " milliseconds." << std::endl;
657 
658  // assemble local matrices on GPU
659  std::chrono::high_resolution_clock::time_point t9 = std::chrono::high_resolution_clock::now();
661  supportRefs.template hostToDevice();
662  AMatPointers.template hostToDevice();
663  bVecPointers.template hostToDevice();
664 
665  auto AMatPointersKernel = AMatPointers.toKernel(); T** AMatPointersKernelPointer = (T**) AMatPointersKernel.getPointer();
666  auto bVecPointersKernel = bVecPointers.toKernel(); T** bVecPointersKernelPointer = (T**) bVecPointersKernel.getPointer();
667 
668  assembleLocalMatrices_gpu<<<numSMsMult*numSMs, 256>>>(particles.toKernel(), differentialSignature, differentialOrder, monomialBasisKernel, supportRefs.toKernel(), kerOffsets.toKernel(), supportKeys1D.toKernel(),
669  AMatPointersKernelPointer, bVecPointersKernelPointer, localEps.toKernel(), localEpsInvPow.toKernel(), BMat.toKernel(), numMatrices, maxSupportSize);
670 
671  localEps.template deviceToHost();
672  localEpsInvPow.template deviceToHost();
673 
674  std::chrono::high_resolution_clock::time_point t10 = std::chrono::high_resolution_clock::now();
675  std::chrono::duration<double> time_span3 = std::chrono::duration_cast<std::chrono::duration<double>>(t10 - t9);
676  std::cout << "assembleLocalMatrices_gpu took " << time_span3.count() * 1000. << " milliseconds." << std::endl;
677 
678  //cublas lu solver
679  std::chrono::high_resolution_clock::time_point t7 = std::chrono::high_resolution_clock::now();
680  cublasHandle_t cublas_handle; cublasCreate_v2(&cublas_handle);
681 
682  openfpm::vector_custd<int> infoArray(numMatrices); auto infoArrayKernel = infoArray.toKernel();
683  cublasLUDecFunc(cublas_handle, monomialBasisSize, AMatPointersKernelPointer, monomialBasisSize, NULL, (int*) infoArrayKernel.getPointer(), numMatrices);
684  cudaDeviceSynchronize();
685 
686  infoArray.template deviceToHost();
687  for (size_t i = 0; i < numMatrices; i++)
688  if (infoArray.get(i) != 0) fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
689 
690  const double alpha = 1.f;
691  cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize, bVecPointersKernelPointer, monomialBasisSize, numMatrices);
692  cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize, bVecPointersKernelPointer, monomialBasisSize, numMatrices);
693  cudaDeviceSynchronize();
694 
695  std::chrono::high_resolution_clock::time_point t8 = std::chrono::high_resolution_clock::now();
696  std::chrono::duration<double> time_span4 = std::chrono::duration_cast<std::chrono::duration<double>>(t8 - t7);
697  std::cout << "cublas took " << time_span4.count() * 1000. << " milliseconds." << std::endl;
698 
699  std::chrono::high_resolution_clock::time_point t5 = std::chrono::high_resolution_clock::now();
700  // populate the calcKernels on GPU
701  calcKernels.resize(supportKeysTotalN);
702  localEps.template hostToDevice();
703  auto it2 = particles.getDomainIteratorGPU(512);
704  calcKernels_gpu<dim><<<it2.wthr,it2.thr>>>(particles.toKernel(), monomialBasisKernel, kerOffsets.toKernel(), supportKeys1D.toKernel(), bVecPointersKernelPointer, localEps.toKernel(), numMatrices, calcKernels.toKernel());
705  calcKernels.template deviceToHost();
706 
707  std::chrono::high_resolution_clock::time_point t6 = std::chrono::high_resolution_clock::now();
708  std::chrono::duration<double> time_span5 = std::chrono::duration_cast<std::chrono::duration<double>>(t6 - t5);
709  std::cout << "calcKernels_gpu took " << time_span5.count() * 1000. << " milliseconds." << std::endl;
710 
711  // free the resources
712  cublasDestroy_v2(cublas_handle);
713 
714  std::chrono::high_resolution_clock::time_point t4 = std::chrono::high_resolution_clock::now();
715  std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t4 - t3);
716  std::cout << "Matrices inverse took " << time_span.count() * 1000. << " milliseconds." << std::endl;
717  }
718 
719  T computeKernel(Point<dim, T> x, EMatrix<T, Eigen::Dynamic, 1> & a) const {
720  unsigned int counter = 0;
721  T res = 0, expFactor = exp(-norm2(x));
722 
723  size_t N = monomialBasis.getElements().size();
724  for (size_t i = 0; i < N; ++i)
725  {
726  const Monomial<dim> &m = monomialBasis.getElement(i);
727 
728  T coeff = a(counter);
729  T mbValue = m.evaluate(x);
730  res += coeff * mbValue * expFactor;
731  ++counter;
732  }
733  return res;
734  }
735 
736 
737  // template <unsigned int a_dim>
738  // T computeKernel(Point<dim, T> x, const T (& a) [a_dim]) const {
739  T computeKernel(Point<dim, T> x, const T* a) const {
740  unsigned int counter = 0;
741  T res = 0, expFactor = exp(-norm2(x));
742 
743  size_t N = monomialBasis.getElements().size();
744  for (size_t i = 0; i < N; ++i)
745  {
746  const Monomial<dim> &m = monomialBasis.getElement(i);
747 
748  T coeff = a[counter];
749  T mbValue = m.evaluate(x);
750  res += coeff * mbValue * expFactor;
751  ++counter;
752  }
753  return res;
754  }
755 
756 
757  T conditionNumber(const EMatrix<T, -1, -1> &V, T condTOL) const {
758  std::cout << "conditionNumber" << std::endl;
759  Eigen::JacobiSVD<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> svd(V);
760  T cond = svd.singularValues()(0)
761  / svd.singularValues()(svd.singularValues().size() - 1);
762  if (cond > condTOL) {
763  std::cout
764  << "WARNING: cond(V) = " << cond
765  << " is greater than TOL = " << condTOL
766  << ", numPoints(V) = " << V.rows()
767  << std::endl; // debug
768  }
769  return cond;
770  }
771 
772 };
773 
774 
/*! \brief Kernel: assemble the per-particle DC-PSE linear systems A x = b.
 *
 * Grid-stride loop over the local particles. For each particle the scaled,
 * Gaussian-weighted Vandermonde block B is built in this thread's private
 * slice of BMat, then A = B^T * B and the RHS b (derivative of each monomial
 * evaluated at 0, with parity sign) are written to the cuBLAS batch arrays
 * h_A / h_b. Also fills localEps / localEpsInvPow with the adaptive smoothing
 * length per particle.
 *
 * Launch: 1D grid and block; BMat must provide at least
 * gridDim.x * blockDim.x * maxSupportSize * monomialBasis.size() elements.
 */
template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename matrix_type>
__global__ void assembleLocalMatrices_gpu(
        particles_type particles, Point<dim, unsigned int> differentialSignature, unsigned int differentialOrder, monomialBasis_type monomialBasis,
        supportKey_type supportRefs, supportKey_type kerOffsets, supportKey_type supportKeys1D, T** h_A, T** h_b, localEps_type localEps, localEps_type localEpsInvPow,
        matrix_type BMat, size_t numMatrices, size_t maxSupportSize)
{
    auto p_key = GET_PARTICLE(particles);
    size_t monomialBasisSize = monomialBasis.size();
    // each thread owns one fixed B slice, indexed by its initial p_key
    // (the global thread id) — it is reused across grid-stride iterations
    size_t BStartPos = maxSupportSize * monomialBasisSize * p_key; T* B = &((T*)BMat.getPointer())[BStartPos];
    const auto& basisElements = monomialBasis.getElements();
    // RHS sign: +1 for even total derivative order, -1 for odd
    int rhsSign = (Monomial_gpu<dim>(differentialSignature).order() % 2 == 0) ? 1 : -1;

    for (;
        p_key < numMatrices;
        p_key += blockDim.x * gridDim.x)
    {
        Point<dim, T> xa = particles.getPos(p_key);

        size_t supportKeysSize = kerOffsets.get(p_key+1)-kerOffsets.get(p_key);
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
        size_t xpK = supportRefs.get(p_key);

        // the system is only solvable with at least as many neighbours as basis monomials
        assert(supportKeysSize >= monomialBasis.size());

        // adaptive smoothing length: FACTOR * mean L1 distance to the neighbours
        T FACTOR = 2, avgNeighbourSpacing = 0;
        for (int i = 0 ; i < supportKeysSize; i++) {
            Point<dim,T> off = xa; off -= particles.getPos(supportKeys[i]);
            for (size_t j = 0; j < dim; ++j)
                avgNeighbourSpacing += fabs(off.value(j));
        }

        avgNeighbourSpacing /= supportKeysSize;
        T eps = FACTOR * avgNeighbourSpacing;

        assert(eps != 0);

        localEps.get(p_key) = eps;
        localEpsInvPow.get(p_key) = 1.0 / pow(eps,differentialOrder);

        // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> B = E * V;
        // B(i,j) = gaussian weight of neighbour i  *  scaled monomial j at the offset
        for (int i = 0; i < supportKeysSize; ++i)
            for (int j = 0; j < monomialBasisSize; ++j) {
                Point<dim,T> off = xa; off -= particles.getPos(supportKeys[i]);
                const Monomial_gpu<dim>& m = basisElements.get(j);

                T V_ij = m.evaluate(off) / pow(eps, m.order());
                T E_ii = exp(- norm2(off) / (2.0 * eps * eps));
                B[i*monomialBasisSize+j] = E_ii * V_ij;
            }

        T sum = 0.0;
        // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> A = B.transpose() * B;
        for (int i = 0; i < monomialBasisSize; ++i)
            for (int j = 0; j < monomialBasisSize; ++j) {
                for (int k = 0; k < supportKeysSize; ++k)
                    sum += B[k*monomialBasisSize+i] * B[k*monomialBasisSize+j];

                h_A[p_key][i*monomialBasisSize+j] = sum; sum = 0.0;
            }

        // Compute RHS vector b: signed derivative of each monomial at the origin
        for (size_t i = 0; i < monomialBasisSize; ++i) {
            const Monomial_gpu<dim>& dm = basisElements.get(i).getDerivative(differentialSignature);
            h_b[p_key][i] = rhsSign * dm.evaluate(Point<dim, T>(0));
        }
    }
}
842 
/*! \brief Kernel: evaluate the DC-PSE kernel value for every (particle, neighbour) pair.
 *
 * Uses the solved coefficient vectors h_b (one per particle) to compute
 *   w_pq = sum_i b_p[i] * m_i((xp - xq)/eps) * exp(-|(xp - xq)/eps|^2)
 * and stores them contiguously in calcKernels, laid out like supportKeys1D.
 *
 * NOTE(review): no grid-stride loop here — the launch configuration must
 * cover all local particles (GET_PARTICLE is presumed to bounds-guard;
 * confirm against the macro definition).
 */
template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename calcKernels_type>
__global__ void calcKernels_gpu(particles_type particles, monomialBasis_type monomialBasis, supportKey_type kerOffsets, supportKey_type supportKeys1D,
        T** h_b, localEps_type localEps, size_t numMatrices, calcKernels_type calcKernels)
{
    auto p_key = GET_PARTICLE(particles);
    Point<dim, T> xa = particles.getPos(p_key);

    size_t monomialBasisSize = monomialBasis.size();
    const auto& basisElements = monomialBasis.getElements();
    size_t supportKeysSize = kerOffsets.get(p_key+1)-kerOffsets.get(p_key);
    size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];

    // this particle's output slot in the flat kernel array
    T* calcKernelsLocal = &((T*)calcKernels.getPointer())[kerOffsets.get(p_key)];
    T eps = localEps.get(p_key);

    for (size_t j = 0; j < supportKeysSize; ++j)
    {
        size_t xqK = supportKeys[j];
        Point<dim, T> xq = particles.getPos(xqK);
        Point<dim, T> offNorm = (xa - xq) / eps;
        T expFactor = exp(-norm2(offNorm));

        // kernel value = weighted monomial expansion at the normalized offset
        T res = 0;
        for (size_t i = 0; i < monomialBasisSize; ++i) {
            const Monomial_gpu<dim> &m = basisElements.get(i);
            T mbValue = m.evaluate(offNorm);
            T coeff = h_b[p_key][i];

            res += coeff * mbValue * expFactor;
        }
        calcKernelsLocal[j] = res;
    }
}
876 
877 #endif
878 #endif //OPENFPM_PDATA_DCPSE_CUH
879 
virtual size_t size() const
the size of the allocated memory
Definition: CudaMemory.cu:245
virtual void * getPointer()
get a readable pointer with the data
Definition: CudaMemory.cu:354
Packing status object.
Definition: Pack_stat.hpp:61
Packing class.
Definition: Packer.hpp:50
__device__ __host__ T & value(size_t i)
Return the reference to the value at coordinate i.
Definition: Point.hpp:434
Unpacking status object.
Definition: Pack_stat.hpp:16
Unpacker class.
Definition: Unpacker.hpp:34
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:204
size_t size()
Stub size.
Definition: map_vector.hpp:212
Grid key for a distributed grid.
__device__ __host__ size_t getKey() const
Get the key.
Distributed vector.
size_t size_local() const
return the local size of the vector
auto getPos(vect_dist_key_dx vec_key) -> decltype(vPos.template get< 0 >(vec_key.getKey()))
Get the position of an element.
vector_dist_iterator getDomainIterator() const
Get an iterator that traverse the particles in the domain.
void hostToDevicePos()
Move the memory from the device to host memory.
void ghost_get_subset()
Stub does not do anything.
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Definition: aggregate.hpp:221
Transform the boost::fusion::vector into memory specification (memory_traits)
Definition: memory_conf.hpp:84
It model an expression expr1 + ... exprn.
Definition: sum.hpp:93