7#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 32768
15#include "Vector/map_vector.hpp"
16#include "util/stat/common_statistics.hpp"
27constexpr int hbtype = 0;
28constexpr int radius = 1;
29constexpr int hphb = 2;
30constexpr int elsc = 3;
32#ifndef NUM_TD_PER_THREAD
36#define NUM_TD_PER_THREAD 4
88 gettimeofday(&tv, NULL);
89 return tv.tv_usec + tv.tv_sec*1e6;
94 double ms = ((end-start)/params.iterations)*1e-3;
97 double runtime = ms*1e-3;
98 double ops_per_wi = 27*poses_per_wi
99 + params.natlig*(3 + 18*poses_per_wi + params.natpro*(11 + 30*poses_per_wi))
101 double total_ops = ops_per_wi * (params.nposes/poses_per_wi);
102 double flops = total_ops / runtime;
103 double gflops = flops / 1e9;
105 gflops_data.add(gflops);
107 double interactions =
108 (double)params.nposes
109 * (
double)params.natlig
110 * (double)params.natpro;
111 double interactions_per_sec = interactions / runtime;
114 printf(
"- Total time: %7.2lf ms\n", (end-start)*1e-3);
115 printf(
"- Average time: %7.2lf ms\n", ms);
116 printf(
"- Interactions/s: %7.2lf billion\n", (interactions_per_sec / 1e9));
117 printf(
"- GFLOP/s: %7.2lf\n", gflops);
139#define HARDNESS 38.0f
143__device__
inline void compute_transformation_matrix(
const float transform_0,
144 const float transform_1,
145 const float transform_2,
146 const float transform_3,
147 const float transform_4,
148 const float transform_5,
149 Transform* transform)
151 const float sx = sin(transform_0);
152 const float cx = cos(transform_0);
153 const float sy = sin(transform_1);
154 const float cy = cos(transform_1);
155 const float sz = sin(transform_2);
156 const float cz = cos(transform_2);
158 transform[0].x = cy*cz;
159 transform[0].y = sx*sy*cz - cx*sz;
160 transform[0].z = cx*sy*cz + sx*sz;
161 transform[0].w = transform_3;
162 transform[1].x = cy*sz;
163 transform[1].y = sx*sy*sz + cx*cz;
164 transform[1].z = cx*sy*sz - sx*cz;
165 transform[1].w = transform_4;
166 transform[2].x = -sy;
167 transform[2].y = sx*cy;
168 transform[2].z = cx*cy;
169 transform[2].w = transform_5;
174template<
typename vector_atom,
typename vector_ff,
typename vector_tr,
typename vector_out>
175__global__
void fasten_main(
const int natlig,
177 const vector_atom protein_molecule,
178 const vector_atom ligand_molecule,
179 const vector_tr transforms,
181 const vector_ff global_forcefield,
182 const int num_atom_types,
183 const int numTransforms)
186 int ix = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
190 ix = ix < numTransforms ? ix : numTransforms - NUM_TD_PER_THREAD;
193 __shared__ FFParams forcefield[100];
194 if(ix < num_atom_types)
196 forcefield[ix].hbtype = global_forcefield.template get<hbtype>(ix);
197 forcefield[ix].radius = global_forcefield.template get<radius>(ix);
198 forcefield[ix].hphb = global_forcefield.template get<hphb>(ix);
199 forcefield[ix].elsc = global_forcefield.template get<elsc>(ix);
206 float etot[NUM_TD_PER_THREAD];
207 Transform transform[NUM_TD_PER_THREAD][3];
208 const int lsz = blockDim.x;
210 for (
int i = 0; i < NUM_TD_PER_THREAD; i++)
212 int index = ix + i*lsz;
213 compute_transformation_matrix(
214 transforms.template get<0>(index),
215 transforms.template get<1>(index),
216 transforms.template get<2>(index),
217 transforms.template get<3>(index),
218 transforms.template get<4>(index),
219 transforms.template get<5>(index),
233 const Atom l_atom = {ligand_molecule.template get<pos>(il)[x],
234 ligand_molecule.template get<pos>(il)[y],
235 ligand_molecule.template get<pos>(il)[z],
236 ligand_molecule.template get<ind>(il)};
238 const FFParams l_params = {global_forcefield.template get<hbtype>(l_atom.index),
239 global_forcefield.template get<radius>(l_atom.index),
240 global_forcefield.template get<hphb>(l_atom.index),
241 global_forcefield.template get<elsc>(l_atom.index)};
242 const bool lhphb_ltz = l_params.hphb<ZERO;
243 const bool lhphb_gtz = l_params.hphb>ZERO;
245 float lpos_x[NUM_TD_PER_THREAD];
246 float lpos_y[NUM_TD_PER_THREAD];
247 float lpos_z[NUM_TD_PER_THREAD];
248 const float4 linitpos = make_float4(l_atom.x,l_atom.y,l_atom.z,ONE);
250 for (
int i = 0; i < NUM_TD_PER_THREAD; i++)
253 lpos_x[i] = transform[i][0].w + linitpos.x*transform[i][0].x +
254 linitpos.y*transform[i][0].y + linitpos.z*transform[i][0].z;
255 lpos_y[i] = transform[i][1].w + linitpos.x*transform[i][1].x +
256 linitpos.y*transform[i][1].y + linitpos.z*transform[i][1].z;
257 lpos_z[i] = transform[i][2].w + linitpos.x*transform[i][2].x +
258 linitpos.y*transform[i][2].y + linitpos.z*transform[i][2].z;
266 const Atom p_atom = {protein_molecule.template get<pos>(ip)[x],
267 protein_molecule.template get<pos>(ip)[y],
268 protein_molecule.template get<pos>(ip)[z],
269 protein_molecule.template get<ind>(ip)};
271 const FFParams p_params = {global_forcefield.template get<hbtype>(p_atom.index),
272 global_forcefield.template get<radius>(p_atom.index),
273 global_forcefield.template get<hphb>(p_atom.index),
274 global_forcefield.template get<elsc>(p_atom.index)};
276 const float radij = p_params.radius + l_params.radius;
277 const float r_radij = 1.0f/radij;
279 const float elcdst = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? FOUR : TWO;
280 const float elcdst1 = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? QUARTER : HALF;
281 const bool type_E = ((p_params.hbtype==HBTYPE_E || l_params.hbtype==HBTYPE_E));
283 const bool phphb_ltz = p_params.hphb<ZERO;
284 const bool phphb_gtz = p_params.hphb>ZERO;
285 const bool phphb_nz = p_params.hphb!=ZERO;
286 const float p_hphb = p_params.hphb * (phphb_ltz && lhphb_gtz ? -ONE : ONE);
287 const float l_hphb = l_params.hphb * (phphb_gtz && lhphb_ltz ? -ONE : ONE);
288 const float distdslv = (phphb_ltz ? (lhphb_ltz ? NPNPDIST : NPPDIST) : (lhphb_ltz ? NPPDIST : -FLT_MAX) );
290 float r_distdslv = 1.0f/distdslv;
292 const float chrg_init = l_params.elsc * p_params.elsc;
293 const float dslv_init = p_hphb + l_hphb;
296 for (
int i = 0; i < NUM_TD_PER_THREAD; i++)
299 const float x = lpos_x[i] - p_atom.x;
300 const float y = lpos_y[i] - p_atom.y;
301 const float z = lpos_z[i] - p_atom.z;
302 const float distij = sqrtf(x*x + y*y + z*z);
305 const float distbb = distij - radij;
306 const bool zone1 = (distbb < ZERO);
309 etot[i] += (ONE - (distij*r_radij)) * (zone1 ? 2*HARDNESS : ZERO);
312 float chrg_e = chrg_init * ((zone1 ? 1 : (ONE - distbb*elcdst1))
313 * (distbb<elcdst ? 1 : ZERO));
314 const float neg_chrg_e = -fabs(chrg_e);
315 chrg_e = type_E ? neg_chrg_e : chrg_e;
316 etot[i] += chrg_e*CNSTNT;
319 const float coeff = (ONE - (distbb *r_distdslv));
320 float dslv_e = dslv_init * ((distbb<distdslv && phphb_nz) ? 1 : ZERO);
321 dslv_e *= (zone1 ? 1 : coeff);
325 while (++ip < natpro);
327 while (++il < natlig);
330 const int td_base = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
331 if (td_base < numTransforms)
334 for (
int i = 0; i < NUM_TD_PER_THREAD; i++)
336 etotals.template get<0>(td_base+i*blockDim.x) = etot[i]*HALF;
342void runCUDA(OpenFPM & _openfpm)
344 _openfpm.d_protein.hostToDevice<pos,ind>();
345 _openfpm.d_ligand.hostToDevice<pos,ind>();
346 _openfpm.d_forcefield.hostToDevice<hbtype,radius,hphb,elsc>();
347 _openfpm.d_results.resize(params.nposes);
348 _openfpm.d_poses.template hostToDevice<0,1,2,3,4,5>();
350 size_t global = ceil(params.nposes/(
double)_openfpm.posesPerWI);
351 global = ceil(global/(
double)_openfpm.wgsize);
352 size_t local = _openfpm.wgsize;
353 size_t shared = params.ntypes *
sizeof(FFParams);
355 cudaDeviceSynchronize();
357 double start = getTimestamp();
359 for(
int ii = 0; ii < params.iterations; ++ii)
362 CUDA_LAUNCH_DIM3(fasten_main,global, local,
365 _openfpm.d_protein.toKernel(),
366 _openfpm.d_ligand.toKernel(),
367 _openfpm.d_poses.toKernel(),
368 _openfpm.d_results.toKernel(),
369 _openfpm.d_forcefield.toKernel(),
375 cudaDeviceSynchronize();
377 double end = getTimestamp();
379 _openfpm.d_results.deviceToHost<0>();
381 printTimings(start, end, _openfpm.posesPerWI, _openfpm.gflops_data);
384#define MAX_PLATFORMS 8
385#define MAX_DEVICES 32
386#define MAX_INFO_STRING 256
388#define DATA_DIR "bm1"
389#define FILE_LIGAND "/ligand.in"
390#define FILE_PROTEIN "/protein.in"
391#define FILE_FORCEFIELD "/forcefield.in"
392#define FILE_POSES "/poses.in"
393#define FILE_REF_ENERGIES "/ref_energies.out"
395#define REF_NPOSES 65536
401#define HARDNESS 38.0f
405void printTimings(
double start,
double end,
double poses_per_wi);
406void checkError(
int err,
const char *op);
408FILE* openFile(
const char *parent,
const char *child,
409 const char* mode,
long *length)
411 char name[strlen(parent) + strlen(child) + 1];
412 strcpy(name, parent);
416 if (!(file = fopen(name, mode)))
418 fprintf(stderr,
"Failed to open '%s'\n", name);
422 fseek(file, 0, SEEK_END);
423 *length = ftell(file);
429int parseInt(
const char *str)
432 int value = strtoul(str, &next, 10);
433 return strlen(next) ? -1 : value;
436void loadParameters(
int argc,
char *argv[], OpenFPM & _openfpm)
439 params.deckDir = DATA_DIR;
440 params.iterations = 8;
441 _openfpm.wgsize = 256;
442 _openfpm.posesPerWI = NUM_TD_PER_THREAD;
445 for (
int i = 1; i < argc; i++)
447 if (!strcmp(argv[i],
"--device") || !strcmp(argv[i],
"-d"))
449 if (++i >= argc || (_openfpm.deviceIndex = parseInt(argv[i])) < 0)
451 printf(
"Invalid device index\n");
455 else if (!strcmp(argv[i],
"--iterations") || !strcmp(argv[i],
"-i"))
457 if (++i >= argc || (params.iterations = parseInt(argv[i])) < 0)
459 printf(
"Invalid number of iterations\n");
463 else if (!strcmp(argv[i],
"--numposes") || !strcmp(argv[i],
"-n"))
465 if (++i >= argc || (nposes = parseInt(argv[i])) < 0)
467 printf(
"Invalid number of poses\n");
471 else if (!strcmp(argv[i],
"--posesperwi") || !strcmp(argv[i],
"-p"))
473 if (++i >= argc || (_openfpm.posesPerWI = parseInt(argv[i])) < 0)
475 printf(
"Invalid poses-per-workitem value\n");
479 else if (!strcmp(argv[i],
"--wgsize") || !strcmp(argv[i],
"-w"))
481 if (++i >= argc || (_openfpm.wgsize = parseInt(argv[i])) < 0)
483 printf(
"Invalid work-group size\n");
487 else if (!strcmp(argv[i],
"--deck"))
491 printf(
"Invalid deck\n");
494 params.deckDir = argv[i];
496 else if (!strcmp(argv[i],
"--help") || !strcmp(argv[i],
"-h"))
499 printf(
"Usage: ./bude [OPTIONS]\n\n");
500 printf(
"Options:\n");
501 printf(
" -h --help Print this message\n");
502 printf(
" --list List available devices\n");
503 printf(
" --device INDEX Select device at INDEX\n");
504 printf(
" -i --iterations I Repeat kernel I times\n");
505 printf(
" -n --numposes N Compute results for N poses\n");
506 printf(
" -p --poserperwi PPWI Compute PPWI poses per work-item\n");
507 printf(
" -w --wgsize WGSIZE Run with work-group size WGSIZE\n");
508 printf(
" --deck DECK Use the DECK directory as input deck\n");
514 printf(
"Unrecognized argument '%s' (try '--help')\n", argv[i]);
522 file = openFile(params.deckDir, FILE_LIGAND,
"rb", &length);
523 params.natlig = length /
sizeof(Atom);
524 _openfpm.d_ligand.resize(params.natlig);
526 for (
int i = 0 ; i < _openfpm.d_ligand.size() ; i++)
528 fread(&_openfpm.d_ligand.template get<pos>(i)[0],
sizeof(
float),1,file);
529 fread(&_openfpm.d_ligand.template get<pos>(i)[1],
sizeof(
float),1,file);
530 fread(&_openfpm.d_ligand.template get<pos>(i)[2],
sizeof(
float),1,file);
531 fread(&_openfpm.d_ligand.template get<ind>(i),
sizeof(
int),1,file);
536 file = openFile(params.deckDir, FILE_PROTEIN,
"rb", &length);
537 params.natpro = length /
sizeof(Atom);
539 _openfpm.d_protein.resize(params.natpro);
541 for (
int i = 0 ; i < _openfpm.d_protein.size() ; i++)
543 fread(&_openfpm.d_protein.template get<pos>(i)[0],
sizeof(
float),1,file);
544 fread(&_openfpm.d_protein.template get<pos>(i)[1],
sizeof(
float),1,file);
545 fread(&_openfpm.d_protein.template get<pos>(i)[2],
sizeof(
float),1,file);
546 fread(&_openfpm.d_protein.template get<ind>(i),
sizeof(
int),1,file);
551 file = openFile(params.deckDir, FILE_FORCEFIELD,
"rb", &length);
552 params.ntypes = length /
sizeof(FFParams);
554 _openfpm.d_forcefield.resize(params.ntypes);
556 for (
int i = 0 ; i < _openfpm.d_forcefield.size() ; i++)
558 fread(&_openfpm.d_forcefield.template get<hbtype>(i),
sizeof(
int),1,file);
559 fread(&_openfpm.d_forcefield.template get<radius>(i),
sizeof(
float),1,file);
560 fread(&_openfpm.d_forcefield.template get<hphb>(i),
sizeof(
float),1,file);
561 fread(&_openfpm.d_forcefield.template get<elsc>(i),
sizeof(
float),1,file);
566 file = openFile(params.deckDir, FILE_POSES,
"rb", &length);
567 _openfpm.d_poses.resize(nposes);
569 long available = length / 6 /
sizeof(float);
571 while (params.nposes < nposes)
573 long fetch = nposes - params.nposes;
574 if (fetch > available)
577 fseek(file, 0*available*
sizeof(
float), SEEK_SET);
578 for (
int k = 0 ; k < fetch ; k++)
579 {fread(&_openfpm.d_poses.template get<0>(params.nposes+k),
sizeof(
float),1,file);}
581 fseek(file, 1*available*
sizeof(
float), SEEK_SET);
582 for (
int k = 0 ; k < fetch ; k++)
583 {fread(&_openfpm.d_poses.template get<1>(params.nposes+k),
sizeof(
float),1,file);}
585 fseek(file, 2*available*
sizeof(
float), SEEK_SET);
586 for (
int k = 0 ; k < fetch ; k++)
587 {fread(&_openfpm.d_poses.template get<2>(params.nposes+k),
sizeof(
float),1,file);}
589 fseek(file, 3*available*
sizeof(
float), SEEK_SET);
590 for (
int k = 0 ; k < fetch ; k++)
591 {fread(&_openfpm.d_poses.template get<3>(params.nposes+k),
sizeof(
float),1,file);}
593 fseek(file, 4*available*
sizeof(
float), SEEK_SET);
594 for (
int k = 0 ; k < fetch ; k++)
595 {fread(&_openfpm.d_poses.template get<4>(params.nposes+k),
sizeof(
float),1,file);}
597 fseek(file, 5*available*
sizeof(
float), SEEK_SET);
598 for (
int k = 0 ; k < fetch ; k++)
599 {fread(&_openfpm.d_poses.template get<5>(params.nposes+k),
sizeof(
float),1,file);}
604 params.nposes += fetch;
609#if !defined(__APPLE__) && !defined(__powerpc64__)
611#include <xmmintrin.h>
612#include <pmmintrin.h>
615int main(
int argc,
char *argv[])
617#if !defined(__APPLE__) && !defined(__powerpc64__)
618 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
619 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
624 loadParameters(argc, argv, _openfpm);
626 printf(
"Poses : %d\n", params.nposes);
627 printf(
"Iterations: %d\n", params.iterations);
628 printf(
"Ligands : %d\n", params.natlig);
629 printf(
"Proteins : %d\n", params.natpro);
630 printf(
"Deck : %s\n", params.deckDir);
631 float *resultsRef = (
float *)malloc(params.nposes*
sizeof(
float));
634 for (
int i = 0 ; i < 30 ; i++)
636 printf(
"Iteration %d\n",i);
644 standard_deviation(_openfpm.gflops_data,mean,dev);
646 printf(
"\n\n\nMean %f ~ %f GFlops/s \n\n\n",mean,dev);
647 FILE* perf_out = openFile(
"./",
"performance_out",
"w", NULL);
649 sprintf(out,
"%f %f",mean,dev);
650 fwrite(out,1,strlen(out),perf_out);
654 FILE* ref_energies = openFile(params.deckDir, FILE_REF_ENERGIES,
"r", NULL);
655 size_t n_ref_poses = params.nposes;
656 if (params.nposes > REF_NPOSES) {
657 printf(
"Only validating the first %d poses.\n", REF_NPOSES);
658 n_ref_poses = REF_NPOSES;
661 for (
size_t i = 0; i < n_ref_poses; i++)
662 fscanf(ref_energies,
"%f", &resultsRef[i]);
664 fclose(ref_energies);
666 float maxdiff = -100.0f;
667 printf(
"\n Reference CUDA (diff)\n");
668 for (
int i = 0; i < n_ref_poses; i++)
670 if (fabs(resultsRef[i]) < 1.f && fabs(_openfpm.d_results.template get<0>(i)) < 1.f)
continue;
672 float diff = fabs(resultsRef[i] - _openfpm.d_results.template get<0>(i)) / _openfpm.d_results.template get<0>(i);
673 if (diff > maxdiff) {
679 printf(
"%7.2f vs %7.2f (%5.2f%%)\n", resultsRef[i], _openfpm.d_results.template get<0>(i), 100*diff);
681 printf(
"\nLargest difference was %.3f%%\n\n", maxdiff*100);
688int main(
int argc,
char *argv[])
Implementation of 1-D std::vector like structure.