OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
Dcpse.cuh
1//
2// Created by Serhii
3//
4#ifndef OPENFPM_PDATA_DCPSE_CUH
5#define OPENFPM_PDATA_DCPSE_CUH
6
7#if defined(__NVCC__)
8
9#include "Vector/vector_dist.hpp"
10#include "MonomialBasis.hpp"
11#include "SupportBuilder.hpp"
12#include "SupportBuilder.cuh"
13#include "Support.hpp"
14#include "Vandermonde.hpp"
15#include "DcpseDiagonalScalingMatrix.hpp"
16#include "DcpseRhs.hpp"
17
18#include <chrono>
19
20// CUDA
21#include <cuda.h>
22#include <cuda_runtime.h>
23#include <cusolverDn.h>
24
25
26template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename calcKernels_type>
27__global__ void calcKernels_gpu(particles_type, monomialBasis_type, supportKey_type, supportKey_type, T**, localEps_type, size_t, calcKernels_type);
28
29template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename matrix_type>
30__global__ void assembleLocalMatrices_gpu( particles_type, Point<dim, unsigned int>, unsigned int, monomialBasis_type, supportKey_type, supportKey_type, supportKey_type,
31 T**, T**, localEps_type, localEps_type, matrix_type, size_t, size_t);
32
33
34template<unsigned int dim, typename vector_type, class T = typename vector_type::stype>
35class Dcpse_gpu {
36 static_assert(std::is_floating_point<T>::value, "CUBLAS supports only float or double");
37
38public:
39 typedef typename vector_type::value_type part_type;
40 typedef vector_type vtype;
41
42 #ifdef SE_CLASS1
43 int update_ctr=0;
44 #endif
45 // This works in this way:
46 // 1) User constructs this by giving a domain of points (where one of the properties is the value of our f),
47 // the signature of the differential operator and the error order bound.
48 // 2) The machinery for assembling and solving the linear system for coefficients starts...
49 // 3) The user can then call an evaluate(point) method to get the evaluation of the differential operator
50 // on the given point.
52 double HOverEpsilon=0.9;
53private:
54 const Point<dim, unsigned int> differentialSignature;
55 const unsigned int differentialOrder;
56 MonomialBasis<dim> monomialBasis;
57
58 // shared local support previosly built by another operator
59 bool isSharedSupport = false;
60 openfpm::vector_custd<size_t> supportRefs; // Each MPI rank has just access to the local ones
63
64 openfpm::vector_custd<T> localEps; // Each MPI rank has just access to the local ones
65 openfpm::vector_custd<T> localEpsInvPow; // Each MPI rank has just access to the local ones
66 openfpm::vector_custd<T> calcKernels;
67
69
71 double rCut;
72 unsigned int convergenceOrder;
73 double supportSizeFactor;
74
75 size_t maxSupportSize;
76 size_t supportKeysTotalN;
77
78 support_options opt;
79
80public:
81#ifdef SE_CLASS1
82 int getUpdateCtr() const
83 {
84 return update_ctr;
85 }
86#endif
87
88 // Here we require the first element of the aggregate to be:
89 // 1) the value of the function f on the point
90 Dcpse_gpu(vector_type &particles,
91 Point<dim, unsigned int> differentialSignature,
92 unsigned int convergenceOrder,
93 T rCut,
94 T supportSizeFactor = 1,
95 support_options opt = support_options::N_PARTICLES)
97 differentialSignature(differentialSignature),
98 differentialOrder(Monomial<dim>(differentialSignature).order()),
99 monomialBasis(differentialSignature.asArray(), convergenceOrder),
100 maxSupportSize(0),
101 supportKeysTotalN(0),
102 opt(opt)
103 {
105 initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
106 }
107
108 Dcpse_gpu(vector_type &particles,
109 const Dcpse_gpu<dim, vector_type, T>& other,
110 Point<dim, unsigned int> differentialSignature,
111 unsigned int convergenceOrder,
112 T rCut,
113 T supportSizeFactor = 1,
114 support_options opt = support_options::N_PARTICLES)
115 :particles(particles), opt(opt),
116 differentialSignature(differentialSignature),
117 differentialOrder(Monomial<dim>(differentialSignature).order()),
118 monomialBasis(differentialSignature.asArray(), convergenceOrder),
119 subsetKeyPid(other.subsetKeyPid),
120 supportRefs(other.supportRefs),
121 supportKeys1D(other.supportKeys1D),
122 kerOffsets(other.kerOffsets),
123 maxSupportSize(other.maxSupportSize),
124 supportKeysTotalN(other.supportKeysTotalN),
125 isSharedSupport(true)
126 {
128 initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
129 }
130
131 template<unsigned int prp>
132 void DrawKernel(vector_type &particles, int k)
133 {
134 size_t xpK = k;
135 size_t kerOff = kerOffsets.get(k);
136
137 size_t supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
138 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
139
140 for (int i = 0; i < supportKeysSize; i++)
141 {
142 size_t xqK = supportKeys[i];
143
144 particles.template getProp<prp>(xqK) += calcKernels.get(kerOff+i);
145 }
146 }
147
148 template<unsigned int prp>
149 void DrawKernelNN(vector_type &particles, int k)
150 {
151 size_t xpK = k;
152 size_t kerOff = kerOffsets.get(k);
153 size_t supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
154 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
155
156 for (int i = 0; i < supportKeysSize; i++)
157 {
158 size_t xqK = supportKeys[i];
159
160 particles.template getProp<prp>(xqK) = 1.0;
161 }
162 }
163
164 template<unsigned int prp>
165 void DrawKernel(vector_type &particles, int k, int i)
166 {
167 size_t xpK = k;
168 size_t kerOff = kerOffsets.get(k);
169 size_t supportKeysSize = kerOffsets.get(k+1)-kerOffsets.get(k);
170 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];
171
172 for (int i = 0; i < supportKeysSize; i++)
173 {
174 size_t xqK = supportKeys[i];
175
176 particles.template getProp<prp>(xqK)[i] += calcKernels.get(kerOff+i);
177 }
178 }
179
180 void checkMomenta(vector_type &particles)
181 {
184
185 momenta.resize(monomialBasis.size());
186 momenta_accu.resize(monomialBasis.size());
187
188 for (int i = 0; i < momenta.size(); i++)
189 {
190 momenta.template get<0>(i) = 3000000000.0;
191 momenta.template get<1>(i) = -3000000000.0;
192 }
193
194 size_t N = particles.size_local();
195 for (size_t j = 0; j < N; ++j)
196 {
197 double eps = localEps.get(j);
198
199 for (int i = 0; i < momenta.size(); i++)
200 {
201 momenta_accu.template get<0>(i) = 0.0;
202 }
203
204 size_t xpK = supportRefs.get(j);
206
207 size_t kerOff = kerOffsets.get(xpK);
208 size_t supportKeysSize = kerOffsets.get(j+1)-kerOffsets.get(j);
209 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];
210
211 for (int i = 0; i < supportKeysSize; i++)
212 {
213 size_t xqK = supportKeys[i];
215 Point<dim, T> normalizedArg = (xp - xq) / eps;
216
217 auto ker = calcKernels.get(kerOff+i);
218
219 int counter = 0;
220 size_t N = monomialBasis.getElements().size();
221
222 for (size_t i = 0; i < N; ++i)
223 {
224 const Monomial<dim> &m = monomialBasis.getElement(i);
225
226 T mbValue = m.evaluate(normalizedArg);
227 momenta_accu.template get<0>(counter) += mbValue * ker;
228
229 ++counter;
230 }
231 }
232
233 for (int i = 0; i < momenta.size(); i++)
234 {
235 if (momenta_accu.template get<0>(i) < momenta.template get<0>(i))
236 {
237 momenta.template get<0>(i) = momenta_accu.template get<0>(i);
238 }
239
240 if (momenta_accu.template get<1>(i) > momenta.template get<1>(i))
241 {
242 momenta.template get<1>(i) = momenta_accu.template get<0>(i);
243 }
244 }
245 }
246
247 for (int i = 0; i < momenta.size(); i++)
248 {
249 std::cout << "MOMENTA: " << monomialBasis.getElements()[i] << "Min: " << momenta.template get<0>(i) << " " << "Max: " << momenta.template get<1>(i) << std::endl;
250 }
251 }
252
261 template<unsigned int fValuePos, unsigned int DfValuePos>
262 void computeDifferentialOperator(vector_type &particles) {
263 char sign = 1;
264 if (differentialOrder % 2 == 0) {
265 sign = -1;
266 }
267
268 size_t N = particles.size_local();
269 for (size_t j = 0; j < N; ++j)
270 {
271 double epsInvPow = localEpsInvPow.get(j);
272
273 T Dfxp = 0;
274 size_t xpK = supportRefs.get(j);
276 T fxp = sign * particles.template getProp<fValuePos>(xpK);
277 size_t kerOff = kerOffsets.get(xpK);
278
279 size_t supportKeysSize = kerOffsets.get(j+1)-kerOffsets.get(j);
280 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];
281
282 for (int i = 0; i < supportKeysSize; i++)
283 {
284 size_t xqK = supportKeys[i];
285 T fxq = particles.template getProp<fValuePos>(xqK);
286
287 Dfxp += (fxq + fxp) * calcKernels.get(kerOff+i);
288 }
289 Dfxp *= epsInvPow;
290
291 particles.template getProp<DfValuePos>(xpK) = Dfxp;
292 }
293 }
294
295
301 inline int getNumNN(const vect_dist_key_dx &key)
302 {
303 return kerOffsets.get(key.getKey()+1)-kerOffsets.get(key.getKey());
304 }
305
314 inline T getCoeffNN(const vect_dist_key_dx &key, int j)
315 {
316 size_t base = kerOffsets.get(key.getKey());
317 return calcKernels.get(base + j);
318 }
319
325 inline size_t getIndexNN(const vect_dist_key_dx &key, int j)
326 {
327 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
328 return supportKeys[j];
329 }
330
331
332 inline T getSign()
333 {
334 T sign = 1.0;
335 if (differentialOrder % 2 == 0) {
336 sign = -1;
337 }
338
339 return sign;
340 }
341
342 T getEpsilonInvPrefactor(const vect_dist_key_dx &key)
343 {
344 return localEpsInvPow.get(key.getKey());
345 }
346
355 template<typename op_type>
356 auto computeDifferentialOperator(const vect_dist_key_dx &key,
357 op_type &o1) -> decltype(is_scalar<std::is_fundamental<decltype(o1.value(
358 key))>::value>::analyze(key, o1)) {
359
360 typedef decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1)) expr_type;
361
362 T sign = 1.0;
363 if (differentialOrder % 2 == 0) {
364 sign = -1;
365 }
366
367 size_t localKey = subsetKeyPid.get(key.getKey());
368 double eps = localEps.get(localKey);
369 double epsInvPow = localEpsInvPow.get(localKey);
370
371 auto &particles = o1.getVector();
372
373#ifdef SE_CLASS1
374 if(particles.getMapCtr()!=this->getUpdateCtr())
375 {
376 std::cerr<<__FILE__<<":"<<__LINE__<<" Error: You forgot a DCPSE operator update after map."<<std::endl;
377 }
378#endif
379
380 expr_type Dfxp = 0;
381 size_t xpK = supportRefs.get(localKey);
383 expr_type fxp = sign * o1.value(key);
384 size_t kerOff = kerOffsets.get(xpK);
385
386 size_t supportKeysSize = kerOffsets.get(localKey+1)-kerOffsets.get(localKey);
387 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];
388
389 for (int i = 0; i < supportKeysSize; i++)
390 {
391 size_t xqK = supportKeys[i];
392 expr_type fxq = o1.value(vect_dist_key_dx(xqK));
393 Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+i);
394 }
395 Dfxp = Dfxp * epsInvPow;
396
397 // T trueDfxp = particles.template getProp<2>(xpK);
398 // Store Dfxp in the right position
399 return Dfxp;
400 }
401
411 template<typename op_type>
412 auto computeDifferentialOperator(const vect_dist_key_dx &key,
413 op_type &o1,
414 int i) -> typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(
415 key))>::value>::analyze(key, o1))::coord_type {
416
417 typedef typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1))::coord_type expr_type;
418
419 T sign = 1.0;
420 if (differentialOrder % 2 == 0) {
421 sign = -1;
422 }
423
424 size_t localKey = subsetKeyPid.get(key.getKey());
425 double eps = localEps.get(localKey);
426 double epsInvPow = localEpsInvPow.get(localKey);
427
428 auto &particles = o1.getVector();
429
430#ifdef SE_CLASS1
431 if(particles.getMapCtr()!=this->getUpdateCtr())
432 {
433 std::cerr<<__FILE__<<":"<<__LINE__<<" Error: You forgot a DCPSE operator update after map."<<std::endl;
434 }
435#endif
436
437 expr_type Dfxp = 0;
438 size_t xpK = supportRefs.get(localKey);
439
441 expr_type fxp = sign * o1.value(key)[i];
442 size_t kerOff = kerOffsets.get(xpK);
443 size_t supportKeysSize = kerOffsets.get(localKey+1)-kerOffsets.get(localKey);
444 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];
445
446 for (int j = 0; j < supportKeysSize; j++)
447 {
448 size_t xqK = supportKeys[j];
449 expr_type fxq = o1.value(vect_dist_key_dx(xqK))[i];
450 Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff+j);
451 }
452 Dfxp = Dfxp * epsInvPow;
453 //
454 //T trueDfxp = particles.template getProp<2>(xpK);
455 // Store Dfxp in the right position
456 return Dfxp;
457 }
458
459 void initializeUpdate(vector_type &particles)
460 {
461#ifdef SE_CLASS1
462 update_ctr=particles.getMapCtr();
463#endif
464
465 kerOffsets.clear();
466 supportKeys1D.clear();
467 supportRefs.clear();
468 localEps.clear();
469 localEpsInvPow.clear();
470 calcKernels.clear();
471 subsetKeyPid.clear();
472
473 initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
474 }
475
476 /*
477 *
478 Save computed DCPSE coefficients to file
479 *
480 */
481 void save(const std::string &file){
482 auto & v_cl=create_vcluster();
483 size_t req = 0;
484
485 Packer<decltype(supportRefs),CudaMemory>::packRequest(supportRefs,req);
486 Packer<decltype(kerOffsets),CudaMemory>::packRequest(kerOffsets,req);
487 Packer<decltype(supportKeys1D),CudaMemory>::packRequest(supportKeys1D,req);
488 Packer<decltype(localEps),CudaMemory>::packRequest(localEps,req);
489 Packer<decltype(localEpsInvPow),CudaMemory>::packRequest(localEpsInvPow,req);
490 Packer<decltype(calcKernels),CudaMemory>::packRequest(calcKernels,req);
491 Packer<decltype(subsetKeyPid),CudaMemory>::packRequest(subsetKeyPid,req);
492
493 // allocate the memory
494 CudaMemory pmem;
495 ExtPreAlloc<CudaMemory> mem(req,pmem);
496
497 //Packing
498 Pack_stat sts;
499
500 Packer<decltype(supportRefs),CudaMemory>::pack(mem,supportRefs,sts);
501 Packer<decltype(kerOffsets),CudaMemory>::pack(mem,kerOffsets,sts);
502 Packer<decltype(supportKeys1D),CudaMemory>::pack(mem,supportKeys1D,sts);
503 Packer<decltype(localEps),CudaMemory>::pack(mem,localEps,sts);
504 Packer<decltype(localEpsInvPow),CudaMemory>::pack(mem,localEpsInvPow,sts);
505 Packer<decltype(calcKernels),CudaMemory>::pack(mem,calcKernels,sts);
506 Packer<decltype(subsetKeyPid),CudaMemory>::pack(mem,subsetKeyPid,sts);
507
508 // Save into a binary file
509 std::ofstream dump (file+"_"+std::to_string(v_cl.rank()), std::ios::out | std::ios::binary);
510 if (dump.is_open() == false)
511 { std::cerr << __FILE__ << ":" << __LINE__ <<" Unable to write since dump is open at rank "<<v_cl.rank()<<std::endl;
512 return;
513 }
514 dump.write ((const char *)pmem.getPointer(), pmem.size());
515 return;
516 }
517
522 void load(const std::string & file)
523 {
524 auto & v_cl=create_vcluster();
525 std::ifstream fs (file+"_"+std::to_string(v_cl.rank()), std::ios::in | std::ios::binary | std::ios::ate );
526 if (fs.is_open() == false)
527 {
528 std::cerr << __FILE__ << ":" << __LINE__ << " error, opening file: " << file << std::endl;
529 return;
530 }
531
532 // take the size of the file
533 size_t sz = fs.tellg();
534
535 fs.close();
536
537 // reopen the file without ios::ate to read
538 std::ifstream input (file+"_"+std::to_string(v_cl.rank()), std::ios::in | std::ios::binary );
539 if (input.is_open() == false)
540 {//some message here maybe
541 return;}
542
543 // Create the CudaMemory memory
544 size_t req = 0;
545 req += sz;
546 CudaMemory pmem;
547 ExtPreAlloc<CudaMemory> mem(req,pmem);
548
549 mem.allocate(pmem.size());
550
551 // read
552 input.read((char *)pmem.getPointer(), sz);
553
554 //close the file
555 input.close();
556
557 //Unpacking
558 Unpack_stat ps;
559
560 Unpacker<decltype(supportRefs),CudaMemory>::unpack(mem,supportRefs,ps);
561 Unpacker<decltype(kerOffsets),CudaMemory>::unpack(mem,kerOffsets,ps);
562 Unpacker<decltype(supportKeys1D),CudaMemory>::unpack(mem,supportKeys1D,ps);
563 Unpacker<decltype(localEps),CudaMemory>::unpack(mem,localEps,ps);
564 Unpacker<decltype(localEpsInvPow),CudaMemory>::unpack(mem,localEpsInvPow,ps);
565 Unpacker<decltype(calcKernels),CudaMemory>::unpack(mem,calcKernels,ps);
566 Unpacker<decltype(subsetKeyPid),CudaMemory>::unpack(mem,subsetKeyPid,ps);
567 }
568
569private:
570 template <typename U>
571 void initializeStaticSize(vector_type &particles,
572 unsigned int convergenceOrder,
573 U rCut,
574 U supportSizeFactor) {
575#ifdef SE_CLASS1
576 this->update_ctr=particles.getMapCtr();
577#endif
578 this->rCut=rCut;
579 this->supportSizeFactor=supportSizeFactor;
580 this->convergenceOrder=convergenceOrder;
581
582 if (!isSharedSupport) {
583 subsetKeyPid.resize(particles.size_local_orig());
584 supportRefs.resize(particles.size_local());
585 }
586 localEps.resize(particles.size_local());
587 localEpsInvPow.resize(particles.size_local());
588
589std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
590 auto it = particles.getDomainIterator();
591
592 if (opt==support_options::RADIUS) {
593 if (!isSharedSupport) {
594 while (it.isNext()) {
595 auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
596 supportRefs.get(key_o.getKey()) = key_o.getKey();
597 ++it;
598 }
599
600 SupportBuilderGPU<vector_type> supportBuilder(particles, rCut);
601 supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
602 }
603 } else {
604 if (!isSharedSupport){
605 openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
606 size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;
607 // need to resize supportKeys1D to yet unknown supportKeysTotalN
608 // add() takes too long
609 SupportBuilder<vector_type,vector_type> supportBuilder(particles, particles, differentialSignature, rCut, differentialOrder == 0);
610 kerOffsets.resize(supportRefs.size()+1);
611
612 while (it.isNext()) {
613 auto key_o = it.get(); subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
614
615 Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
616 supportRefs.get(key_o.getKey()) = key_o.getKey();
617 tempSupportKeys.get(key_o.getKey()) = support.getKeys();
618 kerOffsets.get(key_o.getKey()) = supportKeysTotalN;
619
620 if (maxSupportSize < support.size()) maxSupportSize = support.size();
621 supportKeysTotalN += support.size();
622 ++it;
623 }
624
625 kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
626 supportKeys1D.resize(supportKeysTotalN);
627
628 size_t offset = 0;
629 for (size_t i = 0; i < tempSupportKeys.size(); ++i)
630 for (size_t j = 0; j < tempSupportKeys.get(i).size(); ++j, ++offset)
631 supportKeys1D.get(offset) = tempSupportKeys.get(i).get(j);
632 }
633
634 kerOffsets.hostToDevice(); supportKeys1D.hostToDevice();
635 }
636
637std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
638std::chrono::duration<double> time_span2 = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
639std::cout << "Support building took " << time_span2.count() * 1000. << " milliseconds." << std::endl;
640
641 assembleLocalMatrices_t(rCut);
642 }
643
644 // Cublas subroutine selector: float or double
645 // rCut is needed to overcome limitation of nested template class specialization
646 template <typename U> void assembleLocalMatrices_t(U rCut) {
647 throw std::invalid_argument("DCPSE operator error: CUBLAS supports only float or double"); }
648
649 void assembleLocalMatrices_t(float rCut) {
650 assembleLocalMatrices(cublasSgetrfBatched, cublasStrsmBatched); }
651
652 void assembleLocalMatrices_t(double rCut) {
653 assembleLocalMatrices(cublasDgetrfBatched, cublasDtrsmBatched); }
654
655 template<typename cublasLUDec_type, typename cublasTriangSolve_type>
656 void assembleLocalMatrices(cublasLUDec_type cublasLUDecFunc, cublasTriangSolve_type cublasTriangSolveFunc) {
657 std::chrono::high_resolution_clock::time_point t3 = std::chrono::high_resolution_clock::now();
658
659 // move monomial basis to kernel
660 auto& basis = monomialBasis.getBasis();
661 openfpm::vector_custd<Monomial_gpu<dim>> basisTemp(basis.begin(), basis.end());
662 basisTemp.template hostToDevice();
663 MonomialBasis<dim, aggregate<Monomial_gpu<dim>>, openfpm::vector_custd_ker, memory_traits_inte> monomialBasisKernel(basisTemp.toKernel());
664
665 size_t numMatrices = supportRefs.size();
666 size_t monomialBasisSize = monomialBasis.size();
667
668 int numSMs, numSMsMult = 1;
669 cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
670 size_t numThreads = numSMs*numSMsMult*256;
671 std::cout << "numThreads " << numThreads << " numMatrices " << numMatrices << std::endl;
672
673 // B is an intermediate matrix
674 openfpm::vector_custd<T> BMat(numThreads * maxSupportSize * monomialBasisSize);
675 // allocate device space for A, b
676 openfpm::vector_custd<T> AMat(numMatrices*monomialBasisSize*monomialBasisSize);
677 openfpm::vector_custd<T> bVec(numMatrices*monomialBasisSize);
678
679 // create array of pointers to pass T** pointers to cublas subroutines
680 openfpm::vector_custd<T*> AMatPointers(numMatrices);
681 openfpm::vector_custd<T*> bVecPointers(numMatrices);
682
683 auto AMatKernel = AMat.toKernel(); T* AMatKernelPointer = (T*) AMatKernel.getPointer();
684 for (size_t i = 0; i < numMatrices; i++) AMatPointers.get(i) = AMatKernelPointer + i*monomialBasisSize*monomialBasisSize;
685
686 auto bVecKernel = bVec.toKernel(); T* bVecKernelPointer = (T*) bVecKernel.getPointer();
687 for (size_t i = 0; i < numMatrices; i++) bVecPointers.get(i) = bVecKernelPointer + i*monomialBasisSize;
688
689 std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
690 std::chrono::duration<double> time_span0 = std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t3);
691 std::cout << "Preallocation took " << time_span0.count() * 1000. << " milliseconds." << std::endl;
692
693 // assemble local matrices on GPU
694 std::chrono::high_resolution_clock::time_point t9 = std::chrono::high_resolution_clock::now();
696 supportRefs.template hostToDevice();
697 AMatPointers.template hostToDevice();
698 bVecPointers.template hostToDevice();
699
700 auto AMatPointersKernel = AMatPointers.toKernel(); T** AMatPointersKernelPointer = (T**) AMatPointersKernel.getPointer();
701 auto bVecPointersKernel = bVecPointers.toKernel(); T** bVecPointersKernelPointer = (T**) bVecPointersKernel.getPointer();
702
703 assembleLocalMatrices_gpu<<<numSMsMult*numSMs, 256>>>(particles.toKernel(), differentialSignature, differentialOrder, monomialBasisKernel, supportRefs.toKernel(), kerOffsets.toKernel(), supportKeys1D.toKernel(),
704 AMatPointersKernelPointer, bVecPointersKernelPointer, localEps.toKernel(), localEpsInvPow.toKernel(), BMat.toKernel(), numMatrices, maxSupportSize);
705
706 localEps.template deviceToHost();
707 localEpsInvPow.template deviceToHost();
708
709 std::chrono::high_resolution_clock::time_point t10 = std::chrono::high_resolution_clock::now();
710 std::chrono::duration<double> time_span3 = std::chrono::duration_cast<std::chrono::duration<double>>(t10 - t9);
711 std::cout << "assembleLocalMatrices_gpu took " << time_span3.count() * 1000. << " milliseconds." << std::endl;
712
713 //cublas lu solver
714 std::chrono::high_resolution_clock::time_point t7 = std::chrono::high_resolution_clock::now();
715 cublasHandle_t cublas_handle; cublasCreate_v2(&cublas_handle);
716
717 openfpm::vector_custd<int> infoArray(numMatrices); auto infoArrayKernel = infoArray.toKernel();
718 cublasLUDecFunc(cublas_handle, monomialBasisSize, AMatPointersKernelPointer, monomialBasisSize, NULL, (int*) infoArrayKernel.getPointer(), numMatrices);
719 cudaDeviceSynchronize();
720
721 infoArray.template deviceToHost();
722 for (size_t i = 0; i < numMatrices; i++)
723 if (infoArray.get(i) != 0) fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
724
725 const double alpha = 1.f;
726 cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize, bVecPointersKernelPointer, monomialBasisSize, numMatrices);
727 cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize, bVecPointersKernelPointer, monomialBasisSize, numMatrices);
728 cudaDeviceSynchronize();
729
730 std::chrono::high_resolution_clock::time_point t8 = std::chrono::high_resolution_clock::now();
731 std::chrono::duration<double> time_span4 = std::chrono::duration_cast<std::chrono::duration<double>>(t8 - t7);
732 std::cout << "cublas took " << time_span4.count() * 1000. << " milliseconds." << std::endl;
733
734 std::chrono::high_resolution_clock::time_point t5 = std::chrono::high_resolution_clock::now();
735 // populate the calcKernels on GPU
736 calcKernels.resize(supportKeysTotalN);
737 localEps.template hostToDevice();
738 auto it2 = particles.getDomainIteratorGPU(512);
739 calcKernels_gpu<dim><<<it2.wthr,it2.thr>>>(particles.toKernel(), monomialBasisKernel, kerOffsets.toKernel(), supportKeys1D.toKernel(), bVecPointersKernelPointer, localEps.toKernel(), numMatrices, calcKernels.toKernel());
740 calcKernels.template deviceToHost();
741
742 std::chrono::high_resolution_clock::time_point t6 = std::chrono::high_resolution_clock::now();
743 std::chrono::duration<double> time_span5 = std::chrono::duration_cast<std::chrono::duration<double>>(t6 - t5);
744 std::cout << "calcKernels_gpu took " << time_span5.count() * 1000. << " milliseconds." << std::endl;
745
746 // free the resources
747 cublasDestroy_v2(cublas_handle);
748
749 std::chrono::high_resolution_clock::time_point t4 = std::chrono::high_resolution_clock::now();
750 std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t4 - t3);
751 std::cout << "Matrices inverse took " << time_span.count() * 1000. << " milliseconds." << std::endl;
752 }
753
754 T computeKernel(Point<dim, T> x, EMatrix<T, Eigen::Dynamic, 1> & a) const {
755 unsigned int counter = 0;
756 T res = 0, expFactor = exp(-norm2(x));
757
758 size_t N = monomialBasis.getElements().size();
759 for (size_t i = 0; i < N; ++i)
760 {
761 const Monomial<dim> &m = monomialBasis.getElement(i);
762
763 T coeff = a(counter);
764 T mbValue = m.evaluate(x);
765 res += coeff * mbValue * expFactor;
766 ++counter;
767 }
768 return res;
769 }
770
771
772 // template <unsigned int a_dim>
773 // T computeKernel(Point<dim, T> x, const T (& a) [a_dim]) const {
774 T computeKernel(Point<dim, T> x, const T* a) const {
775 unsigned int counter = 0;
776 T res = 0, expFactor = exp(-norm2(x));
777
778 size_t N = monomialBasis.getElements().size();
779 for (size_t i = 0; i < N; ++i)
780 {
781 const Monomial<dim> &m = monomialBasis.getElement(i);
782
783 T coeff = a[counter];
784 T mbValue = m.evaluate(x);
785 res += coeff * mbValue * expFactor;
786 ++counter;
787 }
788 return res;
789 }
790
791
792 T conditionNumber(const EMatrix<T, -1, -1> &V, T condTOL) const {
793 std::cout << "conditionNumber" << std::endl;
794 Eigen::JacobiSVD<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> svd(V);
795 T cond = svd.singularValues()(0)
796 / svd.singularValues()(svd.singularValues().size() - 1);
797 if (cond > condTOL) {
798 std::cout
799 << "WARNING: cond(V) = " << cond
800 << " is greater than TOL = " << condTOL
801 << ", numPoints(V) = " << V.rows()
802 << std::endl; // debug
803 }
804 return cond;
805 }
806
807};
808
809
810template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename matrix_type>
811__global__ void assembleLocalMatrices_gpu(
812 particles_type particles, Point<dim, unsigned int> differentialSignature, unsigned int differentialOrder, monomialBasis_type monomialBasis,
813 supportKey_type supportRefs, supportKey_type kerOffsets, supportKey_type supportKeys1D, T** h_A, T** h_b, localEps_type localEps, localEps_type localEpsInvPow,
814 matrix_type BMat, size_t numMatrices, size_t maxSupportSize)
815 {
816 auto p_key = GET_PARTICLE(particles);
817 size_t monomialBasisSize = monomialBasis.size();
818 size_t BStartPos = maxSupportSize * monomialBasisSize * p_key; T* B = &((T*)BMat.getPointer())[BStartPos];
819 const auto& basisElements = monomialBasis.getElements();
820 int rhsSign = (Monomial_gpu<dim>(differentialSignature).order() % 2 == 0) ? 1 : -1;
821
822 for (;
823 p_key < numMatrices;
824 p_key += blockDim.x * gridDim.x)
825 {
826 Point<dim, T> xa = particles.getPos(p_key);
827
828 size_t supportKeysSize = kerOffsets.get(p_key+1)-kerOffsets.get(p_key);
829 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
830 size_t xpK = supportRefs.get(p_key);
831
832 assert(supportKeysSize >= monomialBasis.size());
833
834 T FACTOR = 2, avgNeighbourSpacing = 0;
835 for (int i = 0 ; i < supportKeysSize; i++) {
836 Point<dim,T> off = xa; off -= particles.getPosOrig(supportKeys[i]);
837 for (size_t j = 0; j < dim; ++j)
838 avgNeighbourSpacing += fabs(off.value(j));
839 }
840
841 avgNeighbourSpacing /= supportKeysSize;
842 T eps = FACTOR * avgNeighbourSpacing;
843
844 assert(eps != 0);
845
846 localEps.get(p_key) = eps;
847 localEpsInvPow.get(p_key) = 1.0 / pow(eps,differentialOrder);
848
849 // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> B = E * V;
850 for (int i = 0; i < supportKeysSize; ++i)
851 for (int j = 0; j < monomialBasisSize; ++j) {
852 Point<dim,T> off = xa; off -= particles.getPosOrig(supportKeys[i]);
853 const Monomial_gpu<dim>& m = basisElements.get(j);
854
855 T V_ij = m.evaluate(off) / pow(eps, m.order());
856 T E_ii = exp(- norm2(off) / (2.0 * eps * eps));
857 B[i*monomialBasisSize+j] = E_ii * V_ij;
858 }
859
860 T sum = 0.0;
861 // EMatrix<T, Eigen::Dynamic, Eigen::Dynamic> A = B.transpose() * B;
862 for (int i = 0; i < monomialBasisSize; ++i)
863 for (int j = 0; j < monomialBasisSize; ++j) {
864 for (int k = 0; k < supportKeysSize; ++k)
865 sum += B[k*monomialBasisSize+i] * B[k*monomialBasisSize+j];
866
867 h_A[p_key][i*monomialBasisSize+j] = sum; sum = 0.0;
868 }
869
870 // Compute RHS vector b
871 for (size_t i = 0; i < monomialBasisSize; ++i) {
872 const Monomial_gpu<dim>& dm = basisElements.get(i).getDerivative(differentialSignature);
873 h_b[p_key][i] = rhsSign * dm.evaluate(Point<dim, T>(0));
874 }
875 }
876}
877
878template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type, typename supportKey_type, typename localEps_type, typename calcKernels_type>
879__global__ void calcKernels_gpu(particles_type particles, monomialBasis_type monomialBasis, supportKey_type kerOffsets, supportKey_type supportKeys1D,
880 T** h_b, localEps_type localEps, size_t numMatrices, calcKernels_type calcKernels)
881 {
882 auto p_key = GET_PARTICLE(particles);
883 Point<dim, T> xa = particles.getPos(p_key);
884
885 size_t monomialBasisSize = monomialBasis.size();
886 const auto& basisElements = monomialBasis.getElements();
887 size_t supportKeysSize = kerOffsets.get(p_key+1)-kerOffsets.get(p_key);
888 size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
889
890 T* calcKernelsLocal = &((T*)calcKernels.getPointer())[kerOffsets.get(p_key)];
891 T eps = localEps.get(p_key);
892
893 for (size_t j = 0; j < supportKeysSize; ++j)
894 {
895 size_t xqK = supportKeys[j];
897 Point<dim, T> offNorm = (xa - xq) / eps;
898 T expFactor = exp(-norm2(offNorm));
899
900 T res = 0;
901 for (size_t i = 0; i < monomialBasisSize; ++i) {
902 const Monomial_gpu<dim> &m = basisElements.get(i);
903 T mbValue = m.evaluate(offNorm);
904 T coeff = h_b[p_key][i];
905
906 res += coeff * mbValue * expFactor;
907 }
908 calcKernelsLocal[j] = res;
909 }
910}
911
912#endif
913#endif //OPENFPM_PDATA_DCPSE_CUH
914
virtual size_t size() const
the the size of the allocated memory
virtual void * getPointer()
get a readable pointer with the data
Packing status object.
Definition Pack_stat.hpp:61
Packing class.
Definition Packer.hpp:50
This class implement the point shape in an N-dimensional space.
Definition Point.hpp:28
__device__ __host__ T & value(size_t i)
Return the reference to the value at coordinate i.
Definition Point.hpp:419
Unpacking status object.
Definition Pack_stat.hpp:16
Unpacker class.
Definition Unpacker.hpp:34
Implementation of 1-D std::vector like structure.
size_t size()
Stub size.
Grid key for a distributed grid.
__device__ __host__ size_t getKey() const
Get the key.
vect_dist_key_dx get()
Get the actual key.
Distributed vector.
size_t size_local() const
return the local size of the vector
auto getPos(vect_dist_key_dx vec_key) -> decltype(v_pos.template get< 0 >(vec_key.getKey()))
Get the position of an element.
vector_dist_iterator getDomainIterator() const
Get an iterator that traverse the particles in the domain.
void hostToDevicePos()
Move the memory from the device to host memory.
auto getPosOrig(vect_dist_key_dx vec_key) const -> decltype(v_pos.template get< 0 >(vec_key.getKey()))
Get the position of an element.
void ghost_get_subset()
Stub does not do anything.
size_t size_local_orig() const
return the local size of the vector
aggregate of properties, from a list of object if create a struct that follow the OPENFPM native stru...
Transform the boost::fusion::vector into memory specification (memory_traits)
It model an expression expr1 * expr2.
Definition mul.hpp:120
It model an expression expr1 + ... exprn.
Definition sum.hpp:93