OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
main.cu
1 #ifdef __NVCC__
2 
7 #define CUDIFY_BOOST_CONTEXT_STACK_SIZE 32768
8 
9 //#define SE_CLASS1
10 
11 #include <float.h>
12 #include <stdio.h>
13 #include <sys/time.h>
14 
15 #include "Vector/map_vector.hpp"
16 #include "util/stat/common_statistics.hpp"
17 
18 //#define USE_SHARED
19 
20 constexpr int pos = 0;
21 constexpr int ind = 1;
22 
23 constexpr int x = 0;
24 constexpr int y = 1;
25 constexpr int z = 2;
26 
27 constexpr int hbtype = 0;
28 constexpr int radius = 1;
29 constexpr int hphb = 2;
30 constexpr int elsc = 3;
31 
32 #ifndef NUM_TD_PER_THREAD
33 // Good for CPU
34 //#define NUM_TD_PER_THREAD 256
35 // Good for GPU
36 #define NUM_TD_PER_THREAD 4
37 #endif
38 
39 typedef struct
40 {
41  float x, y, z;
42  int index;
43 } Atom;
44 
45 typedef struct
46 {
47  int hbtype;
48  float radius;
49  float hphb;
50  float elsc;
51 } FFParams;
52 
53 typedef struct
54 {
55  int natlig;
56  int natpro;
57  int ntypes;
58  int nposes;
59  char * deckDir;
60  int iterations;
61 } Params;
62 
63 Params params;
64 
65 typedef struct
66 {
67  // _lin = AOS
69  // AOS
71  // AOS
73  // SOA
75  // SOA
77  openfpm::vector<double> gflops_data;
78 
79  int deviceIndex;
80  int wgsize;
81  int posesPerWI;
82 } OpenFPM;
83 
84 
85 double getTimestamp()
86 {
87  struct timeval tv;
88  gettimeofday(&tv, NULL);
89  return tv.tv_usec + tv.tv_sec*1e6;
90 }
91 
92 void printTimings(double start, double end, double poses_per_wi, openfpm::vector<double> & gflops_data)
93 {
94  double ms = ((end-start)/params.iterations)*1e-3;
95 
96  // Compute FLOP/s
97  double runtime = ms*1e-3;
98  double ops_per_wi = 27*poses_per_wi
99  + params.natlig*(3 + 18*poses_per_wi + params.natpro*(11 + 30*poses_per_wi))
100  + poses_per_wi;
101  double total_ops = ops_per_wi * (params.nposes/poses_per_wi);
102  double flops = total_ops / runtime;
103  double gflops = flops / 1e9;
104 
105  gflops_data.add(gflops);
106 
107  double interactions =
108  (double)params.nposes
109  * (double)params.natlig
110  * (double)params.natpro;
111  double interactions_per_sec = interactions / runtime;
112 
113  // Print stats
114  printf("- Total time: %7.2lf ms\n", (end-start)*1e-3);
115  printf("- Average time: %7.2lf ms\n", ms);
116  printf("- Interactions/s: %7.2lf billion\n", (interactions_per_sec / 1e9));
117  printf("- GFLOP/s: %7.2lf\n", gflops);
118 }
119 
120 // Numeric constants
121 #define ZERO 0.0f
122 #define QUARTER 0.25f
123 #define HALF 0.5f
124 #define ONE 1.0f
125 #define TWO 2.0f
126 #define FOUR 4.0f
127 #define CNSTNT 45.0f
128 
129 #define HBTYPE_F 70
130 #define HBTYPE_E 69
131 
132 // The data structure for one atom - 16 bytes
133 
134 typedef struct
135 {
136  float x, y, z, w;
137 } Transform;
138 
139 #define HARDNESS 38.0f
140 #define NPNPDIST 5.5f
141 #define NPPDIST 1.0f
142 
143 __device__ inline void compute_transformation_matrix(const float transform_0,
144  const float transform_1,
145  const float transform_2,
146  const float transform_3,
147  const float transform_4,
148  const float transform_5,
149  Transform* transform)
150 {
151  const float sx = sin(transform_0);
152  const float cx = cos(transform_0);
153  const float sy = sin(transform_1);
154  const float cy = cos(transform_1);
155  const float sz = sin(transform_2);
156  const float cz = cos(transform_2);
157 
158  transform[0].x = cy*cz;
159  transform[0].y = sx*sy*cz - cx*sz;
160  transform[0].z = cx*sy*cz + sx*sz;
161  transform[0].w = transform_3;
162  transform[1].x = cy*sz;
163  transform[1].y = sx*sy*sz + cx*cz;
164  transform[1].z = cx*sy*sz - sx*cz;
165  transform[1].w = transform_4;
166  transform[2].x = -sy;
167  transform[2].y = sx*cy;
168  transform[2].z = cx*cy;
169  transform[2].w = transform_5;
170 }
171 
172 
173 
174 template<typename vector_atom, typename vector_ff, typename vector_tr, typename vector_out>
175 __global__ void fasten_main(const int natlig,
176  const int natpro,
177  const vector_atom protein_molecule,
178  const vector_atom ligand_molecule,
179  const vector_tr transforms,
180  vector_out etotals,
181  const vector_ff global_forcefield,
182  const int num_atom_types,
183  const int numTransforms)
184 {
185  // Get index of first TD
186  int ix = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
187 
188  // Have extra threads do the last member intead of return.
189  // A return would disable use of barriers, so not using return is better
190  ix = ix < numTransforms ? ix : numTransforms - NUM_TD_PER_THREAD;
191 
192 #ifdef USE_SHARED
193  __shared__ FFParams forcefield[100];
194  if(ix < num_atom_types)
195  {
196  forcefield[ix].hbtype = global_forcefield.template get<hbtype>(ix);
197  forcefield[ix].radius = global_forcefield.template get<radius>(ix);
198  forcefield[ix].hphb = global_forcefield.template get<hphb>(ix);
199  forcefield[ix].elsc = global_forcefield.template get<elsc>(ix);
200 
201  }
202 #else
203 #endif
204 
205  // Compute transformation matrix to private memory
206  float etot[NUM_TD_PER_THREAD];
207  Transform transform[NUM_TD_PER_THREAD][3];
208  const int lsz = blockDim.x;
209  #pragma omp simd
210  for (int i = 0; i < NUM_TD_PER_THREAD; i++)
211  {
212  int index = ix + i*lsz;
213  compute_transformation_matrix(
214  transforms.template get<0>(index),
215  transforms.template get<1>(index),
216  transforms.template get<2>(index),
217  transforms.template get<3>(index),
218  transforms.template get<4>(index),
219  transforms.template get<5>(index),
220  transform[i]);
221  etot[i] = ZERO;
222  }
223 
224 #ifdef USE_SHARED
225  __syncthreads();
226 #endif
227 
228  // Loop over ligand atoms
229  int il = 0;
230  do
231  {
232  // Load ligand atom data
233  const Atom l_atom = {ligand_molecule.template get<pos>(il)[x],
234  ligand_molecule.template get<pos>(il)[y],
235  ligand_molecule.template get<pos>(il)[z],
236  ligand_molecule.template get<ind>(il)};
237 
238  const FFParams l_params = {global_forcefield.template get<hbtype>(l_atom.index),
239  global_forcefield.template get<radius>(l_atom.index),
240  global_forcefield.template get<hphb>(l_atom.index),
241  global_forcefield.template get<elsc>(l_atom.index)};
242  const bool lhphb_ltz = l_params.hphb<ZERO;
243  const bool lhphb_gtz = l_params.hphb>ZERO;
244 
245  float lpos_x[NUM_TD_PER_THREAD];
246  float lpos_y[NUM_TD_PER_THREAD];
247  float lpos_z[NUM_TD_PER_THREAD];
248  const float4 linitpos = make_float4(l_atom.x,l_atom.y,l_atom.z,ONE);
249  #pragma omp simd
250  for (int i = 0; i < NUM_TD_PER_THREAD; i++)
251  {
252  // Transform ligand atom
253  lpos_x[i] = transform[i][0].w + linitpos.x*transform[i][0].x +
254  linitpos.y*transform[i][0].y + linitpos.z*transform[i][0].z;
255  lpos_y[i] = transform[i][1].w + linitpos.x*transform[i][1].x +
256  linitpos.y*transform[i][1].y + linitpos.z*transform[i][1].z;
257  lpos_z[i] = transform[i][2].w + linitpos.x*transform[i][2].x +
258  linitpos.y*transform[i][2].y + linitpos.z*transform[i][2].z;
259  }
260 
261  // Loop over protein atoms
262  int ip = 0;
263  do
264  {
265  // Load protein atom data
266  const Atom p_atom = {protein_molecule.template get<pos>(ip)[x],
267  protein_molecule.template get<pos>(ip)[y],
268  protein_molecule.template get<pos>(ip)[z],
269  protein_molecule.template get<ind>(ip)};
270 
271  const FFParams p_params = {global_forcefield.template get<hbtype>(p_atom.index),
272  global_forcefield.template get<radius>(p_atom.index),
273  global_forcefield.template get<hphb>(p_atom.index),
274  global_forcefield.template get<elsc>(p_atom.index)};
275 
276  const float radij = p_params.radius + l_params.radius;
277  const float r_radij = 1.0f/radij;
278 
279  const float elcdst = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? FOUR : TWO;
280  const float elcdst1 = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? QUARTER : HALF;
281  const bool type_E = ((p_params.hbtype==HBTYPE_E || l_params.hbtype==HBTYPE_E));
282 
283  const bool phphb_ltz = p_params.hphb<ZERO;
284  const bool phphb_gtz = p_params.hphb>ZERO;
285  const bool phphb_nz = p_params.hphb!=ZERO;
286  const float p_hphb = p_params.hphb * (phphb_ltz && lhphb_gtz ? -ONE : ONE);
287  const float l_hphb = l_params.hphb * (phphb_gtz && lhphb_ltz ? -ONE : ONE);
288  const float distdslv = (phphb_ltz ? (lhphb_ltz ? NPNPDIST : NPPDIST) : (lhphb_ltz ? NPPDIST : -FLT_MAX) );
289 
290  float r_distdslv = 1.0f/distdslv;
291 
292  const float chrg_init = l_params.elsc * p_params.elsc;
293  const float dslv_init = p_hphb + l_hphb;
294 
295  #pragma omp simd
296  for (int i = 0; i < NUM_TD_PER_THREAD; i++)
297  {
298  // Calculate distance between atoms
299  const float x = lpos_x[i] - p_atom.x;
300  const float y = lpos_y[i] - p_atom.y;
301  const float z = lpos_z[i] - p_atom.z;
302  const float distij = sqrtf(x*x + y*y + z*z);
303 
304  // Calculate the sum of the sphere radii
305  const float distbb = distij - radij;
306  const bool zone1 = (distbb < ZERO);
307 
308  // Calculate steric energy
309  etot[i] += (ONE - (distij*r_radij)) * (zone1 ? 2*HARDNESS : ZERO);
310 
311  // Calculate formal and dipole charge interactions
312  float chrg_e = chrg_init * ((zone1 ? 1 : (ONE - distbb*elcdst1))
313  * (distbb<elcdst ? 1 : ZERO));
314  const float neg_chrg_e = -fabs(chrg_e);
315  chrg_e = type_E ? neg_chrg_e : chrg_e;
316  etot[i] += chrg_e*CNSTNT;
317 
318  // Calculate the two cases for Nonpolar-Polar repulsive interactions
319  const float coeff = (ONE - (distbb *r_distdslv));
320  float dslv_e = dslv_init * ((distbb<distdslv && phphb_nz) ? 1 : ZERO);
321  dslv_e *= (zone1 ? 1 : coeff);
322  etot[i] += dslv_e;
323  }
324  }
325  while (++ip < natpro); // loop over protein atoms
326  }
327  while (++il < natlig); // loop over ligand atoms
328 
329  // Write results
330  const int td_base = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
331  if (td_base < numTransforms)
332  {
333  #pragma omp simd
334  for (int i = 0; i < NUM_TD_PER_THREAD; i++)
335  {
336  etotals.template get<0>(td_base+i*blockDim.x) = etot[i]*HALF;
337  }
338  }
339 } //end of fasten_main
340 
341 
342 void runCUDA(OpenFPM & _openfpm)
343 {
344  _openfpm.d_protein.hostToDevice<pos,ind>();
345  _openfpm.d_ligand.hostToDevice<pos,ind>();
346  _openfpm.d_forcefield.hostToDevice<hbtype,radius,hphb,elsc>();
347  _openfpm.d_results.resize(params.nposes);
348  _openfpm.d_poses.template hostToDevice<0,1,2,3,4,5>();
349 
350  size_t global = ceil(params.nposes/(double)_openfpm.posesPerWI);
351  global = ceil(global/(double)_openfpm.wgsize);
352  size_t local = _openfpm.wgsize;
353  size_t shared = params.ntypes * sizeof(FFParams);
354 
355  cudaDeviceSynchronize();
356 
357  double start = getTimestamp();
358 
359  for(int ii = 0; ii < params.iterations; ++ii)
360  {
361 
362  CUDA_LAUNCH_DIM3(fasten_main,global, local,
363  params.natlig,
364  params.natpro,
365  _openfpm.d_protein.toKernel(),
366  _openfpm.d_ligand.toKernel(),
367  _openfpm.d_poses.toKernel(),
368  _openfpm.d_results.toKernel(),
369  _openfpm.d_forcefield.toKernel(),
370  params.ntypes,
371  params.nposes);
372 
373  }
374 
375  cudaDeviceSynchronize();
376 
377  double end = getTimestamp();
378 
379  _openfpm.d_results.deviceToHost<0>();
380 
381  printTimings(start, end, _openfpm.posesPerWI, _openfpm.gflops_data);
382 }
383 
384 #define MAX_PLATFORMS 8
385 #define MAX_DEVICES 32
386 #define MAX_INFO_STRING 256
387 
388 #define DATA_DIR "bm1"
389 #define FILE_LIGAND "/ligand.in"
390 #define FILE_PROTEIN "/protein.in"
391 #define FILE_FORCEFIELD "/forcefield.in"
392 #define FILE_POSES "/poses.in"
393 #define FILE_REF_ENERGIES "/ref_energies.out"
394 
395 #define REF_NPOSES 65536
396 
397 // Energy evaluation parameters
398 #define CNSTNT 45.0f
399 #define HBTYPE_F 70
400 #define HBTYPE_E 69
401 #define HARDNESS 38.0f
402 #define NPNPDIST 5.5f
403 #define NPPDIST 1.0f
404 
405 void printTimings(double start, double end, double poses_per_wi);
406 void checkError(int err, const char *op);
407 
408 FILE* openFile(const char *parent, const char *child,
409  const char* mode, long *length)
410 {
411  char name[strlen(parent) + strlen(child) + 1];
412  strcpy(name, parent);
413  strcat(name, child);
414 
415  FILE *file = NULL;
416  if (!(file = fopen(name, mode)))
417  {
418  fprintf(stderr, "Failed to open '%s'\n", name);
419  exit(1);
420  }
421  if(length){
422  fseek(file, 0, SEEK_END);
423  *length = ftell(file);
424  rewind(file);
425  }
426  return file;
427 }
428 
429 int parseInt(const char *str)
430 {
431  char *next;
432  int value = strtoul(str, &next, 10);
433  return strlen(next) ? -1 : value;
434 }
435 
436 void loadParameters(int argc, char *argv[], OpenFPM & _openfpm)
437 {
438  // Defaults
439  params.deckDir = DATA_DIR;
440  params.iterations = 8;
441  _openfpm.wgsize = 256;
442  _openfpm.posesPerWI = NUM_TD_PER_THREAD;
443  int nposes = 65536;
444 
445  for (int i = 1; i < argc; i++)
446  {
447  if (!strcmp(argv[i], "--device") || !strcmp(argv[i], "-d"))
448  {
449  if (++i >= argc || (_openfpm.deviceIndex = parseInt(argv[i])) < 0)
450  {
451  printf("Invalid device index\n");
452  exit(1);
453  }
454  }
455  else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
456  {
457  if (++i >= argc || (params.iterations = parseInt(argv[i])) < 0)
458  {
459  printf("Invalid number of iterations\n");
460  exit(1);
461  }
462  }
463  else if (!strcmp(argv[i], "--numposes") || !strcmp(argv[i], "-n"))
464  {
465  if (++i >= argc || (nposes = parseInt(argv[i])) < 0)
466  {
467  printf("Invalid number of poses\n");
468  exit(1);
469  }
470  }
471  else if (!strcmp(argv[i], "--posesperwi") || !strcmp(argv[i], "-p"))
472  {
473  if (++i >= argc || (_openfpm.posesPerWI = parseInt(argv[i])) < 0)
474  {
475  printf("Invalid poses-per-workitem value\n");
476  exit(1);
477  }
478  }
479  else if (!strcmp(argv[i], "--wgsize") || !strcmp(argv[i], "-w"))
480  {
481  if (++i >= argc || (_openfpm.wgsize = parseInt(argv[i])) < 0)
482  {
483  printf("Invalid work-group size\n");
484  exit(1);
485  }
486  }
487  else if (!strcmp(argv[i], "--deck"))
488  {
489  if (++i >= argc)
490  {
491  printf("Invalid deck\n");
492  exit(1);
493  }
494  params.deckDir = argv[i];
495  }
496  else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
497  {
498  printf("\n");
499  printf("Usage: ./bude [OPTIONS]\n\n");
500  printf("Options:\n");
501  printf(" -h --help Print this message\n");
502  printf(" --list List available devices\n");
503  printf(" --device INDEX Select device at INDEX\n");
504  printf(" -i --iterations I Repeat kernel I times\n");
505  printf(" -n --numposes N Compute results for N poses\n");
506  printf(" -p --poserperwi PPWI Compute PPWI poses per work-item\n");
507  printf(" -w --wgsize WGSIZE Run with work-group size WGSIZE\n");
508  printf(" --deck DECK Use the DECK directory as input deck\n");
509  printf("\n");
510  exit(0);
511  }
512  else
513  {
514  printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
515  exit(1);
516  }
517  }
518 
519  FILE *file = NULL;
520  long length;
521 
522  file = openFile(params.deckDir, FILE_LIGAND, "rb", &length);
523  params.natlig = length / sizeof(Atom);
524  _openfpm.d_ligand.resize(params.natlig);
525 
526  for (int i = 0 ; i < _openfpm.d_ligand.size() ; i++)
527  {
528  fread(&_openfpm.d_ligand.template get<pos>(i)[0],sizeof(float),1,file);
529  fread(&_openfpm.d_ligand.template get<pos>(i)[1],sizeof(float),1,file);
530  fread(&_openfpm.d_ligand.template get<pos>(i)[2],sizeof(float),1,file);
531  fread(&_openfpm.d_ligand.template get<ind>(i),sizeof(int),1,file);
532  }
533 
534  fclose(file);
535 
536  file = openFile(params.deckDir, FILE_PROTEIN, "rb", &length);
537  params.natpro = length / sizeof(Atom);
538 
539  _openfpm.d_protein.resize(params.natpro);
540 
541  for (int i = 0 ; i < _openfpm.d_protein.size() ; i++)
542  {
543  fread(&_openfpm.d_protein.template get<pos>(i)[0],sizeof(float),1,file);
544  fread(&_openfpm.d_protein.template get<pos>(i)[1],sizeof(float),1,file);
545  fread(&_openfpm.d_protein.template get<pos>(i)[2],sizeof(float),1,file);
546  fread(&_openfpm.d_protein.template get<ind>(i),sizeof(int),1,file);
547  }
548 
549  fclose(file);
550 
551  file = openFile(params.deckDir, FILE_FORCEFIELD, "rb", &length);
552  params.ntypes = length / sizeof(FFParams);
553 
554  _openfpm.d_forcefield.resize(params.ntypes);
555 
556  for (int i = 0 ; i < _openfpm.d_forcefield.size() ; i++)
557  {
558  fread(&_openfpm.d_forcefield.template get<hbtype>(i),sizeof(int),1,file);
559  fread(&_openfpm.d_forcefield.template get<radius>(i),sizeof(float),1,file);
560  fread(&_openfpm.d_forcefield.template get<hphb>(i),sizeof(float),1,file);
561  fread(&_openfpm.d_forcefield.template get<elsc>(i),sizeof(float),1,file);
562  }
563 
564  fclose(file);
565 
566  file = openFile(params.deckDir, FILE_POSES, "rb", &length);
567  _openfpm.d_poses.resize(nposes);
568 
569  long available = length / 6 / sizeof(float);
570  params.nposes = 0;
571  while (params.nposes < nposes)
572  {
573  long fetch = nposes - params.nposes;
574  if (fetch > available)
575  fetch = available;
576 
577  fseek(file, 0*available*sizeof(float), SEEK_SET);
578  for (int k = 0 ; k < fetch ; k++)
579  {fread(&_openfpm.d_poses.template get<0>(params.nposes+k),sizeof(float),1,file);}
580 
581  fseek(file, 1*available*sizeof(float), SEEK_SET);
582  for (int k = 0 ; k < fetch ; k++)
583  {fread(&_openfpm.d_poses.template get<1>(params.nposes+k),sizeof(float),1,file);}
584 
585  fseek(file, 2*available*sizeof(float), SEEK_SET);
586  for (int k = 0 ; k < fetch ; k++)
587  {fread(&_openfpm.d_poses.template get<2>(params.nposes+k),sizeof(float),1,file);}
588 
589  fseek(file, 3*available*sizeof(float), SEEK_SET);
590  for (int k = 0 ; k < fetch ; k++)
591  {fread(&_openfpm.d_poses.template get<3>(params.nposes+k),sizeof(float),1,file);}
592 
593  fseek(file, 4*available*sizeof(float), SEEK_SET);
594  for (int k = 0 ; k < fetch ; k++)
595  {fread(&_openfpm.d_poses.template get<4>(params.nposes+k),sizeof(float),1,file);}
596 
597  fseek(file, 5*available*sizeof(float), SEEK_SET);
598  for (int k = 0 ; k < fetch ; k++)
599  {fread(&_openfpm.d_poses.template get<5>(params.nposes+k),sizeof(float),1,file);}
600 
601 
602  rewind(file);
603 
604  params.nposes += fetch;
605  }
606  fclose(file);
607 }
608 
609 #if !defined(__APPLE__) && !defined(__powerpc64__)
610 #include <fenv.h>
611 #include <xmmintrin.h>
612 #include <pmmintrin.h>
613 #endif
614 
615 int main(int argc, char *argv[])
616 {
617 #if !defined(__APPLE__) && !defined(__powerpc64__)
618  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
619  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
620 #endif
621  init_wrappers();
622 
623  OpenFPM _openfpm;
624  loadParameters(argc, argv, _openfpm);
625  printf("\n");
626  printf("Poses : %d\n", params.nposes);
627  printf("Iterations: %d\n", params.iterations);
628  printf("Ligands : %d\n", params.natlig);
629  printf("Proteins : %d\n", params.natpro);
630  printf("Deck : %s\n", params.deckDir);
631  float *resultsRef = (float *)malloc(params.nposes*sizeof(float));
632 
633  // We run the benchmark 30 times to get mean and variace
634  for (int i = 0 ; i < 30 ; i++)
635  {
636  printf("Iteration %d\n",i);
637 
638  runCUDA(_openfpm);
639  }
640 
641  // calculate mean and variance
642  double mean;
643  double dev;
644  standard_deviation(_openfpm.gflops_data,mean,dev);
645 
646  printf("\n\n\nMean %f ~ %f GFlops/s \n\n\n",mean,dev);
647  FILE* perf_out = openFile("./","performance_out", "w", NULL);
648  char out[256];
649  sprintf(out,"%f %f",mean,dev);
650  fwrite(out,1,strlen(out),perf_out);
651  fclose(perf_out);
652 
653  // Load reference results from file
654  FILE* ref_energies = openFile(params.deckDir, FILE_REF_ENERGIES, "r", NULL);
655  size_t n_ref_poses = params.nposes;
656  if (params.nposes > REF_NPOSES) {
657  printf("Only validating the first %d poses.\n", REF_NPOSES);
658  n_ref_poses = REF_NPOSES;
659  }
660 
661  for (size_t i = 0; i < n_ref_poses; i++)
662  fscanf(ref_energies, "%f", &resultsRef[i]);
663 
664  fclose(ref_energies);
665 
666  float maxdiff = -100.0f;
667  printf("\n Reference CUDA (diff)\n");
668  for (int i = 0; i < n_ref_poses; i++)
669  {
670  if (fabs(resultsRef[i]) < 1.f && fabs(_openfpm.d_results.template get<0>(i)) < 1.f) continue;
671 
672  float diff = fabs(resultsRef[i] - _openfpm.d_results.template get<0>(i)) / _openfpm.d_results.template get<0>(i);
673  if (diff > maxdiff) {
674  maxdiff = diff;
675  // printf ("Maxdiff: %.2f (%.3f vs %.3f)\n", maxdiff, resultsRef[i], resultsCUDA[i]);
676  }
677 
678  if (i < 8)
679  printf("%7.2f vs %7.2f (%5.2f%%)\n", resultsRef[i], _openfpm.d_results.template get<0>(i), 100*diff);
680  }
681  printf("\nLargest difference was %.3f%%\n\n", maxdiff*100);
682 
683  free(resultsRef);
684 }
685 
686 #else
687 
688 int main(int argc, char *argv[])
689 {
690 }
691 
692 #endif
693 
Implementation of 1-D std::vector like structure.
Definition: map_vector.hpp:202