#ifndef OPENFPM_PDATA_DCPSE_CUH
#define OPENFPM_PDATA_DCPSE_CUH

#include "Vector/vector_dist.hpp"
#include "MonomialBasis.hpp"
#include "SupportBuilder.hpp"
#include "SupportBuilder.cuh"
#include "Vandermonde.hpp"
#include "DcpseDiagonalScalingMatrix.hpp"
#include "DcpseRhs.hpp"

#include <cuda_runtime.h>
#include <cusolverDn.h>
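// CUDA/cuBLAS back-end of the DCPSE (Discretization-Corrected Particle Strength Exchange)
// differential operator: for every particle a local moment matrix is assembled from its support
// on the device and solved in batch to obtain the kernel coefficients used by the operator.

// Forward declarations of the __global__ kernels defined at the bottom of this file.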
template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type,
         typename supportKey_type, typename localEps_type, typename calcKernels_type>
__global__ void calcKernels_gpu(particles_type, monomialBasis_type, supportKey_type, supportKey_type,
                                T**, localEps_type, size_t, calcKernels_type);
template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type,
         typename supportKey_type, typename localEps_type, typename matrix_type>
__global__ void assembleLocalMatrices_gpu(particles_type, Point<dim, unsigned int>, unsigned int, monomialBasis_type,
                                          supportKey_type, supportKey_type, supportKey_type,
                                          T**, T**, localEps_type, localEps_type, matrix_type,
                                          size_t, size_t);
template<unsigned int dim, typename vector_type, class T = typename vector_type::stype>
class Dcpse_gpu {
    static_assert(std::is_floating_point<T>::value, "CUBLAS supports only float or double");
    double HOverEpsilon = 0.9;

    const unsigned int differentialOrder;

    bool isSharedSupport = false;

    unsigned int convergenceOrder;
    double supportSizeFactor;

    size_t maxSupportSize;
    size_t supportKeysTotalN;

    int getUpdateCtr() const
    Dcpse_gpu(vector_type& particles,
              Point<dim, unsigned int> differentialSignature,
              unsigned int convergenceOrder,
              T rCut,
              T supportSizeFactor = 1,
              support_options opt = support_options::N_PARTICLES)
        : particles(particles),
          differentialSignature(differentialSignature),
          differentialOrder(Monomial<dim>(differentialSignature).order()),
          monomialBasis(differentialSignature.asArray(), convergenceOrder),
          maxSupportSize(0),
          supportKeysTotalN(0),
          opt(opt)
    {
        initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
    }
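    // Construct an operator that shares the support structures (supportRefs, supportKeys1D,
    // kerOffsets) of an already-built operator instead of recomputing them.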
    Dcpse_gpu(vector_type& particles,
              const Dcpse_gpu<dim, vector_type, T>& other,
              Point<dim, unsigned int> differentialSignature,
              unsigned int convergenceOrder,
              T rCut,
              T supportSizeFactor = 1,
              support_options opt = support_options::N_PARTICLES)
        : particles(particles),
          differentialSignature(differentialSignature),
          differentialOrder(Monomial<dim>(differentialSignature).order()),
          monomialBasis(differentialSignature.asArray(), convergenceOrder),
          subsetKeyPid(other.subsetKeyPid),
          supportRefs(other.supportRefs),
          supportKeys1D(other.supportKeys1D),
          kerOffsets(other.kerOffsets),
          maxSupportSize(other.maxSupportSize),
          supportKeysTotalN(other.supportKeysTotalN),
          isSharedSupport(true)
    {
        initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
    }
    template<unsigned int prp>
    void DrawKernel(vector_type& particles, int k)
    {
        size_t kerOff = kerOffsets.get(k);
        size_t supportKeysSize = kerOffsets.get(k+1) - kerOffsets.get(k);
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];

        for (int i = 0; i < supportKeysSize; i++)
        {
            size_t xqK = supportKeys[i];
            particles.template getProp<prp>(xqK) += calcKernels.get(kerOff + i);
        }
    }
    template<unsigned int prp>
    void DrawKernelNN(vector_type& particles, int k)
    {
        size_t kerOff = kerOffsets.get(k);
        size_t supportKeysSize = kerOffsets.get(k+1) - kerOffsets.get(k);
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];

        for (int i = 0; i < supportKeysSize; i++)
        {
            size_t xqK = supportKeys[i];
            particles.template getProp<prp>(xqK) = 1.0;
        }
    }
    template<unsigned int prp>
    void DrawVectorKernel(vector_type& particles, int k)
    {
        size_t kerOff = kerOffsets.get(k);
        size_t supportKeysSize = kerOffsets.get(k+1) - kerOffsets.get(k);
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(k)];

        for (int i = 0; i < supportKeysSize; i++)
        {
            size_t xqK = supportKeys[i];
            particles.template getProp<prp>(xqK)[i] += calcKernels.get(kerOff + i);
        }
    }
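    // Diagnostic: accumulate the discrete moments reproduced by the computed kernels and print,
    // for every monomial of the basis, the minimum and maximum value found over all particles.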
    void checkMomenta(vector_type& particles)
    {
        momenta.resize(monomialBasis.size());
        momenta_accu.resize(monomialBasis.size());

        for (int i = 0; i < momenta.size(); i++)
        {
            momenta.template get<0>(i) = 3000000000.0;
            momenta.template get<1>(i) = -3000000000.0;
        }

        size_t N = supportRefs.size();
        for (size_t j = 0; j < N; ++j)
        {
            double eps = localEps.get(j);

            for (int i = 0; i < momenta.size(); i++)
            {
                momenta_accu.template get<0>(i) = 0.0;
            }

            size_t xpK = supportRefs.get(j);
            Point<dim, T> xp = particles.getPos(xpK);

            size_t kerOff = kerOffsets.get(xpK);
            size_t supportKeysSize = kerOffsets.get(j+1) - kerOffsets.get(j);
            size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];

            for (int i = 0; i < supportKeysSize; i++)
            {
                size_t xqK = supportKeys[i];
                Point<dim, T> xq = particles.getPos(xqK);
                Point<dim, T> normalizedArg = (xp - xq) / eps;

                auto ker = calcKernels.get(kerOff + i);

                int counter = 0;
                size_t N = monomialBasis.getElements().size();
                for (size_t i = 0; i < N; ++i)
                {
                    const auto& m = monomialBasis.getElements()[i];

                    T mbValue = m.evaluate(normalizedArg);
                    momenta_accu.template get<0>(counter) += mbValue * ker;
                    ++counter;
                }
            }

            for (int i = 0; i < momenta.size(); i++)
            {
                if (momenta_accu.template get<0>(i) < momenta.template get<0>(i))
                {
                    momenta.template get<0>(i) = momenta_accu.template get<0>(i);
                }

                if (momenta_accu.template get<0>(i) > momenta.template get<1>(i))
                {
                    momenta.template get<1>(i) = momenta_accu.template get<0>(i);
                }
            }
        }

        for (int i = 0; i < momenta.size(); i++)
        {
            std::cout << "MOMENTA: " << monomialBasis.getElements()[i]
                      << " Min: " << momenta.template get<0>(i) << " "
                      << "Max: " << momenta.template get<1>(i) << std::endl;
        }
    }
    template<unsigned int fValuePos, unsigned int DfValuePos>
    void computeDifferentialOperator(vector_type& particles)
    {
        char sign = 1;
        if (differentialOrder % 2 == 0) {
            sign = -1;
        }

        size_t N = supportRefs.size();
        for (size_t j = 0; j < N; ++j)
        {
            double epsInvPow = localEpsInvPow.get(j);

            T Dfxp = 0;
            size_t xpK = supportRefs.get(j);

            T fxp = sign * particles.template getProp<fValuePos>(xpK);
            size_t kerOff = kerOffsets.get(xpK);

            size_t supportKeysSize = kerOffsets.get(j+1) - kerOffsets.get(j);
            size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(j)];

            for (int i = 0; i < supportKeysSize; i++)
            {
                size_t xqK = supportKeys[i];
                T fxq = particles.template getProp<fValuePos>(xqK);

                Dfxp += (fxq + fxp) * calcKernels.get(kerOff + i);
            }
            Dfxp *= epsInvPow;

            particles.template getProp<DfValuePos>(xpK) = Dfxp;
        }
    }
    // Per-particle accessors (used e.g. when assembling the operator into a matrix).
    int getNumNN(const vect_dist_key_dx &key)
    { return kerOffsets.get(key.getKey()+1) - kerOffsets.get(key.getKey()); }

    T getCoeffNN(const vect_dist_key_dx &key, int j)
    {
        size_t base = kerOffsets.get(key.getKey());
        return calcKernels.get(base + j);
    }

    size_t getIndexNN(const vect_dist_key_dx &key, int j)
    {
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(key.getKey())];
        return supportKeys[j];
    }

    T getSign()
    {
        T sign = 1.0;
        if (differentialOrder % 2 == 0) { sign = -1; }
        return sign;
    }

    T getEpsilonInvPrefactor(const vect_dist_key_dx &key) const
    { return localEpsInvPow.get(key.getKey()); }
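    // Evaluate the operator on a particle expression o1 at particle key (scalar case below,
    // and component i of a vector-valued expression in the overload that follows).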
    template<typename op_type>
    inline auto computeDifferentialOperator(const vect_dist_key_dx &key, op_type &o1)
        -> decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1))
    {
        typedef decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1)) expr_type;

        T sign = 1.0;
        if (differentialOrder % 2 == 0) {
            sign = -1;
        }

        size_t localKey = subsetKeyPid.get(key.getKey());

        double eps = localEps.get(localKey);
        double epsInvPow = localEpsInvPow.get(localKey);

        if (particles.getMapCtr() != this->getUpdateCtr())
        {
            std::cerr << __FILE__ << ":" << __LINE__
                      << " Error: You forgot a DCPSE operator update after map." << std::endl;
        }

        expr_type Dfxp = 0;
        size_t xpK = supportRefs.get(localKey);

        expr_type fxp = sign * o1.value(key);
        size_t kerOff = kerOffsets.get(xpK);

        size_t supportKeysSize = kerOffsets.get(localKey+1) - kerOffsets.get(localKey);
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];

        for (int i = 0; i < supportKeysSize; i++)
        {
            size_t xqK = supportKeys[i];
            expr_type fxq = o1.value(vect_dist_key_dx(xqK));

            Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff + i);
        }
        Dfxp = Dfxp * epsInvPow;

        return Dfxp;
    }
    template<typename op_type>
    inline auto computeDifferentialOperator(const vect_dist_key_dx &key, op_type &o1, int i)
        -> typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1))::coord_type
    {
        typedef typename decltype(is_scalar<std::is_fundamental<decltype(o1.value(key))>::value>::analyze(key, o1))::coord_type expr_type;

        T sign = 1.0;
        if (differentialOrder % 2 == 0) {
            sign = -1;
        }

        size_t localKey = subsetKeyPid.get(key.getKey());

        double eps = localEps.get(localKey);
        double epsInvPow = localEpsInvPow.get(localKey);

        if (particles.getMapCtr() != this->getUpdateCtr())
        {
            std::cerr << __FILE__ << ":" << __LINE__
                      << " Error: You forgot a DCPSE operator update after map." << std::endl;
        }

        expr_type Dfxp = 0;
        size_t xpK = supportRefs.get(localKey);

        expr_type fxp = sign * o1.value(key)[i];
        size_t kerOff = kerOffsets.get(xpK);
        size_t supportKeysSize = kerOffsets.get(localKey+1) - kerOffsets.get(localKey);
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(localKey)];

        for (int j = 0; j < supportKeysSize; j++)
        {
            size_t xqK = supportKeys[j];
            expr_type fxq = o1.value(vect_dist_key_dx(xqK))[i];

            Dfxp = Dfxp + (fxq + fxp) * calcKernels.get(kerOff + j);
        }
        Dfxp = Dfxp * epsInvPow;

        return Dfxp;
    }
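    // Rebuild supports, scalings and kernels, e.g. after the particle set has been redistributed
    // with map().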
    void initializeUpdate(vector_type &particles)
    {
        supportKeys1D.clear();
        localEpsInvPow.clear();
        subsetKeyPid.clear();

        initializeStaticSize(particles, convergenceOrder, rCut, supportSizeFactor);
    }
    // Serialize the operator state to one binary file per rank.
    void save(const std::string &file)
    {
        auto &v_cl = create_vcluster();

        size_t req = 0;
        Packer<decltype(localEpsInvPow), CudaMemory>::packRequest(localEpsInvPow, req);

        std::ofstream dump(file + "_" + std::to_string(v_cl.rank()), std::ios::out | std::ios::binary);
        if (dump.is_open() == false)
        {
            std::cerr << __FILE__ << ":" << __LINE__ << " Unable to write: could not open dump file on rank " << v_cl.rank() << std::endl;
            return;
        }
    }
    // Load the operator state written by save().
    void load(const std::string &file)
    {
        auto &v_cl = create_vcluster();

        std::ifstream fs(file + "_" + std::to_string(v_cl.rank()), std::ios::in | std::ios::binary | std::ios::ate);
        if (fs.is_open() == false)
        {
            std::cerr << __FILE__ << ":" << __LINE__ << " error, opening file: " << file << std::endl;
            return;
        }
        size_t sz = fs.tellg();

        std::ifstream input(file + "_" + std::to_string(v_cl.rank()), std::ios::in | std::ios::binary);
        if (input.is_open() == false)
        {
            return;
        }

        mem.allocate(pmem.size());
    }
    template <typename U>
    void initializeStaticSize(vector_type &particles, unsigned int convergenceOrder, U rCut, U supportSizeFactor)
    {
        this->supportSizeFactor = supportSizeFactor;
        this->convergenceOrder = convergenceOrder;

        if (!isSharedSupport) {
            supportRefs.resize(particles.size_local_orig());
        }

        std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();

        // supportBuilder (construction not shown here) performs the neighbour search.
        auto it = particles.getDomainIterator();
        if (opt == support_options::RADIUS) {
            if (!isSharedSupport) {
                while (it.isNext()) {
                    auto key_o = it.get();
                    subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();
                    supportRefs.get(key_o.getKey()) = key_o.getKey();
                    ++it;
                }
            }
            // Radius-based search: all supports are built on the device in a single pass.
            supportBuilder.getSupport(supportRefs.size(), kerOffsets, supportKeys1D, maxSupportSize, supportKeysTotalN);
        }
        else {
            if (!isSharedSupport) {
                size_t requiredSupportSize = monomialBasis.size() * supportSizeFactor;

                openfpm::vector<openfpm::vector<size_t>> tempSupportKeys(supportRefs.size());
                kerOffsets.resize(supportRefs.size() + 1);

                while (it.isNext()) {
                    auto key_o = it.get();
                    subsetKeyPid.get(particles.getOriginKey(key_o).getKey()) = key_o.getKey();

                    Support support = supportBuilder.getSupport(it, requiredSupportSize, opt);
                    supportRefs.get(key_o.getKey()) = key_o.getKey();
                    tempSupportKeys.get(key_o.getKey()) = support.getKeys();
                    kerOffsets.get(key_o.getKey()) = supportKeysTotalN;

                    if (maxSupportSize < support.size()) maxSupportSize = support.size();
                    supportKeysTotalN += support.size();
                    ++it;
                }

                kerOffsets.get(supportRefs.size()) = supportKeysTotalN;
                supportKeys1D.resize(supportKeysTotalN);

                // Flatten the per-particle key lists into the 1D supportKeys1D array.
                size_t offset = 0;
                for (size_t i = 0; i < tempSupportKeys.size(); ++i)
                    for (size_t j = 0; j < tempSupportKeys.get(i).size(); ++j, ++offset)
                        supportKeys1D.get(offset) = tempSupportKeys.get(i).get(j);
            }
        }

        kerOffsets.hostToDevice(); supportKeys1D.hostToDevice();

        std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> time_span2 = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
        std::cout << "Support building took " << time_span2.count() * 1000. << " milliseconds." << std::endl;

        assembleLocalMatrices_t(rCut);
    }
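    // Tag dispatch on the floating-point type: picks the matching cuBLAS batched LU factorization
    // and triangular-solve routines (S* for float, D* for double); any other type is rejected.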
    template <typename U>
    void assembleLocalMatrices_t(U rCut) {
        throw std::invalid_argument("DCPSE operator error: CUBLAS supports only float or double");
    }

    void assembleLocalMatrices_t(float rCut) {
        assembleLocalMatrices(cublasSgetrfBatched, cublasStrsmBatched);
    }

    void assembleLocalMatrices_t(double rCut) {
        assembleLocalMatrices(cublasDgetrfBatched, cublasDtrsmBatched);
    }
    template<typename cublasLUDec_type, typename cublasTriangSolve_type>
    void assembleLocalMatrices(cublasLUDec_type cublasLUDecFunc, cublasTriangSolve_type cublasTriangSolveFunc) {
        std::chrono::high_resolution_clock::time_point t3 = std::chrono::high_resolution_clock::now();

        // Device copy of the monomial basis used by the GPU kernels below.
        auto& basis = monomialBasis.getBasis();
        basisTemp.template hostToDevice();

        size_t numMatrices = supportRefs.size();
        size_t monomialBasisSize = monomialBasis.size();

        int numSMs, numSMsMult = 1;
        cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
        size_t numThreads = numSMs * numSMsMult * 256;
        std::cout << "numThreads " << numThreads << " numMatrices " << numMatrices << std::endl;
        auto AMatKernel = AMat.toKernel(); T* AMatKernelPointer = (T*) AMatKernel.getPointer();
        for (size_t i = 0; i < numMatrices; i++)
            AMatPointers.get(i) = AMatKernelPointer + i * monomialBasisSize * monomialBasisSize;

        auto bVecKernel = bVec.toKernel(); T* bVecKernelPointer = (T*) bVecKernel.getPointer();
        for (size_t i = 0; i < numMatrices; i++)
            bVecPointers.get(i) = bVecKernelPointer + i * monomialBasisSize;

        std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> time_span0 = std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t3);
        std::cout << "Preallocation took " << time_span0.count() * 1000. << " milliseconds." << std::endl;
        std::chrono::high_resolution_clock::time_point t9 = std::chrono::high_resolution_clock::now();

        supportRefs.template hostToDevice();
        AMatPointers.template hostToDevice();
        bVecPointers.template hostToDevice();

        auto AMatPointersKernel = AMatPointers.toKernel(); T** AMatPointersKernelPointer = (T**) AMatPointersKernel.getPointer();
        auto bVecPointersKernel = bVecPointers.toKernel(); T** bVecPointersKernelPointer = (T**) bVecPointersKernel.getPointer();

        assembleLocalMatrices_gpu<<<numSMsMult*numSMs, 256>>>(particles.toKernel(), differentialSignature, differentialOrder, monomialBasisKernel, supportRefs.toKernel(), kerOffsets.toKernel(), supportKeys1D.toKernel(),
            AMatPointersKernelPointer, bVecPointersKernelPointer, localEps.toKernel(), localEpsInvPow.toKernel(), BMat.toKernel(), numMatrices, maxSupportSize);

        localEps.template deviceToHost();
        localEpsInvPow.template deviceToHost();

        std::chrono::high_resolution_clock::time_point t10 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> time_span3 = std::chrono::duration_cast<std::chrono::duration<double>>(t10 - t9);
        std::cout << "assembleLocalMatrices_gpu took " << time_span3.count() * 1000. << " milliseconds." << std::endl;
        std::chrono::high_resolution_clock::time_point t7 = std::chrono::high_resolution_clock::now();
        cublasHandle_t cublas_handle; cublasCreate_v2(&cublas_handle);

        // infoArray: per-matrix factorization status codes.
        cublasLUDecFunc(cublas_handle, monomialBasisSize, AMatPointersKernelPointer, monomialBasisSize, NULL,
                        (int*) infoArrayKernel.getPointer(), numMatrices);
        cudaDeviceSynchronize();

        infoArray.template deviceToHost();
        for (size_t i = 0; i < numMatrices; i++)
            if (infoArray.get(i) != 0)
                fprintf(stderr, "Factorization of matrix %zu Failed: Matrix may be singular\n", i);

        const T alpha = 1.0;
        cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
                              monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize,
                              bVecPointersKernelPointer, monomialBasisSize, numMatrices);
        cublasTriangSolveFunc(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                              monomialBasisSize, 1, &alpha, AMatPointersKernelPointer, monomialBasisSize,
                              bVecPointersKernelPointer, monomialBasisSize, numMatrices);
        cudaDeviceSynchronize();

        std::chrono::high_resolution_clock::time_point t8 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> time_span4 = std::chrono::duration_cast<std::chrono::duration<double>>(t8 - t7);
        std::cout << "cublas took " << time_span4.count() * 1000. << " milliseconds." << std::endl;
        std::chrono::high_resolution_clock::time_point t5 = std::chrono::high_resolution_clock::now();

        calcKernels.resize(supportKeysTotalN);
        localEps.template hostToDevice();
        auto it2 = particles.getDomainIteratorGPU(512);
        calcKernels_gpu<dim><<<it2.wthr,it2.thr>>>(particles.toKernel(), monomialBasisKernel, kerOffsets.toKernel(), supportKeys1D.toKernel(), bVecPointersKernelPointer, localEps.toKernel(), numMatrices, calcKernels.toKernel());
        calcKernels.template deviceToHost();

        std::chrono::high_resolution_clock::time_point t6 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> time_span5 = std::chrono::duration_cast<std::chrono::duration<double>>(t6 - t5);
        std::cout << "calcKernels_gpu took " << time_span5.count() * 1000. << " milliseconds." << std::endl;

        cublasDestroy_v2(cublas_handle);

        std::chrono::high_resolution_clock::time_point t4 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t4 - t3);
        std::cout << "Matrices inverse took " << time_span.count() * 1000. << " milliseconds." << std::endl;
    }
    T computeKernel(Point<dim, T> x, EMatrix<T, Eigen::Dynamic, 1> &a) const
    {
        unsigned int counter = 0;
        T res = 0, expFactor = exp(-norm2(x));

        size_t N = monomialBasis.getElements().size();
        for (size_t i = 0; i < N; ++i)
        {
            const auto& m = monomialBasis.getElements()[i];

            T coeff = a(counter);
            T mbValue = m.evaluate(x);
            res += coeff * mbValue * expFactor;
            ++counter;
        }
        return res;
    }
    T computeKernel(Point<dim, T> x, const T* a) const
    {
        unsigned int counter = 0;
        T res = 0, expFactor = exp(-norm2(x));
        size_t N = monomialBasis.getElements().size();
        for (size_t i = 0; i < N; ++i)
        {
            const auto& m = monomialBasis.getElements()[i];
            T coeff = a[counter];
            T mbValue = m.evaluate(x);
            res += coeff * mbValue * expFactor;
            ++counter;
        }
        return res;
    }
    T conditionNumber(const EMatrix<T, -1, -1> &V, T condTOL) const
    {
        std::cout << "conditionNumber" << std::endl;
        Eigen::JacobiSVD<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> svd(V);
        T cond = svd.singularValues()(0)
                 / svd.singularValues()(svd.singularValues().size() - 1);
        if (cond > condTOL) {
            std::cout << "WARNING: cond(V) = " << cond
                      << " is greater than TOL = " << condTOL
                      << ", numPoints(V) = " << V.rows() << std::endl;
        }
        return cond;
    }
};
template<unsigned int dim, typename T, typename particles_type, typename monomialBasis_type,
         typename supportKey_type, typename localEps_type, typename matrix_type>
__global__ void assembleLocalMatrices_gpu(
        particles_type particles, Point<dim, unsigned int> differentialSignature, unsigned int differentialOrder, monomialBasis_type monomialBasis,
        supportKey_type supportRefs, supportKey_type kerOffsets, supportKey_type supportKeys1D, T** h_A, T** h_b, localEps_type localEps, localEps_type localEpsInvPow,
        matrix_type BMat, size_t numMatrices, size_t maxSupportSize)
{
    auto p_key = blockIdx.x * blockDim.x + threadIdx.x;

    size_t monomialBasisSize = monomialBasis.size();
    size_t BStartPos = maxSupportSize * monomialBasisSize * p_key; T* B = &((T*)BMat.getPointer())[BStartPos];
    const auto& basisElements = monomialBasis.getElements();
    int rhsSign = (Monomial_gpu<dim>(differentialSignature).order() % 2 == 0) ? 1 : -1;

    for (;
         p_key < numMatrices;
         p_key += blockDim.x * gridDim.x)
    {
        size_t supportKeysSize = kerOffsets.get(p_key+1) - kerOffsets.get(p_key);
        size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];
        size_t xpK = supportRefs.get(p_key);
        Point<dim, T> xp = particles.getPos(xpK);

        assert(supportKeysSize >= monomialBasis.size());

        // Local scaling: eps is proportional to the average neighbour spacing within the support.
        T FACTOR = 2, avgNeighbourSpacing = 0;
        for (int i = 0; i < supportKeysSize; i++) {
            Point<dim, T> off = xp - particles.getPos(supportKeys[i]);
            for (size_t j = 0; j < dim; ++j)
                avgNeighbourSpacing += fabs(off.value(j));
        }
        avgNeighbourSpacing /= supportKeysSize;
        T eps = FACTOR * avgNeighbourSpacing;

        localEps.get(p_key) = eps;
        localEpsInvPow.get(p_key) = 1.0 / pow(eps, differentialOrder);

        // B = E * V: the Vandermonde matrix of the support, row-scaled by the exponential weight.
        for (int i = 0; i < supportKeysSize; ++i)
            for (int j = 0; j < monomialBasisSize; ++j) {
                Point<dim, T> off = xp - particles.getPos(supportKeys[i]);
                const auto& m = basisElements.get(j);

                T V_ij = m.evaluate(off) / pow(eps, m.order());
                T E_ii = exp(-norm2(off) / (2.0 * eps * eps));
                B[i*monomialBasisSize + j] = E_ii * V_ij;
            }

        // A = B^T * B, the moment matrix handed to the batched LU solver.
        T sum = 0.0;
        for (int i = 0; i < monomialBasisSize; ++i)
            for (int j = 0; j < monomialBasisSize; ++j) {
                for (int k = 0; k < supportKeysSize; ++k)
                    sum += B[k*monomialBasisSize + i] * B[k*monomialBasisSize + j];

                h_A[p_key][i*monomialBasisSize + j] = sum;
                sum = 0.0;
            }

        // Right-hand side: the requested derivative of each monomial evaluated at the origin,
        // with the parity sign of the differential order.
        for (size_t i = 0; i < monomialBasisSize; ++i) {
            const Monomial_gpu<dim>& dm = basisElements.get(i).getDerivative(differentialSignature);
            h_b[p_key][i] = rhsSign * dm.evaluate(Point<dim, T>(0));
        }
    }
}
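// Device kernel: given the solved coefficient vectors h_b, compute the DCPSE kernel value for
// every neighbour of every particle and store it in the flat calcKernels array.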
template<unsigned int dim, typename particles_type, typename T, typename monomialBasis_type,
         typename supportKey_type, typename localEps_type, typename calcKernels_type>
__global__ void calcKernels_gpu(particles_type particles, monomialBasis_type monomialBasis, supportKey_type kerOffsets, supportKey_type supportKeys1D,
                                T** h_b, localEps_type localEps, size_t numMatrices, calcKernels_type calcKernels)
{
    auto p_key = blockIdx.x * blockDim.x + threadIdx.x;
    if (p_key >= numMatrices) return;

    Point<dim, T> xp = particles.getPos(p_key);

    size_t monomialBasisSize = monomialBasis.size();
    const auto& basisElements = monomialBasis.getElements();
    size_t supportKeysSize = kerOffsets.get(p_key+1) - kerOffsets.get(p_key);
    size_t* supportKeys = &((size_t*)supportKeys1D.getPointer())[kerOffsets.get(p_key)];

    T* calcKernelsLocal = &((T*)calcKernels.getPointer())[kerOffsets.get(p_key)];
    T eps = localEps.get(p_key);

    for (size_t j = 0; j < supportKeysSize; ++j)
    {
        size_t xqK = supportKeys[j];
        Point<dim, T> xq = particles.getPos(xqK);
        Point<dim, T> offNorm = (xp - xq) / eps;
        T expFactor = exp(-norm2(offNorm));

        T res = 0;
        for (size_t i = 0; i < monomialBasisSize; ++i) {
            const auto& m = basisElements.get(i);
            T mbValue = m.evaluate(offNorm);
            T coeff = h_b[p_key][i];

            res += coeff * mbValue * expFactor;
        }
        calcKernelsLocal[j] = res;
    }
}

#endif //OPENFPM_PDATA_DCPSE_CUH