OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
main.cu
1#ifdef __NVCC__
2
7#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 32768
8
9//#define SE_CLASS1
10
11#include <float.h>
12#include <stdio.h>
13#include <sys/time.h>
14
15#include "Vector/map_vector.hpp"
16#include "util/stat/common_statistics.hpp"
17
18//#define USE_SHARED
19
20constexpr int pos = 0;
21constexpr int ind = 1;
22
23constexpr int x = 0;
24constexpr int y = 1;
25constexpr int z = 2;
26
27constexpr int hbtype = 0;
28constexpr int radius = 1;
29constexpr int hphb = 2;
30constexpr int elsc = 3;
31
32#ifndef NUM_TD_PER_THREAD
33// Good for CPU
34//#define NUM_TD_PER_THREAD 256
35// Good for GPU
36#define NUM_TD_PER_THREAD 4
37#endif
38
39typedef struct
40{
41 float x, y, z;
42 int index;
43} Atom;
44
45typedef struct
46{
47 int hbtype;
48 float radius;
49 float hphb;
50 float elsc;
51} FFParams;
52
53typedef struct
54{
55 int natlig;
56 int natpro;
57 int ntypes;
58 int nposes;
59 char * deckDir;
60 int iterations;
61} Params;
62
63Params params;
64
65typedef struct
66{
67 // _lin = AOS
69 // AOS
71 // AOS
73 // SOA
75 // SOA
77 openfpm::vector<double> gflops_data;
78
79 int deviceIndex;
80 int wgsize;
81 int posesPerWI;
82} OpenFPM;
83
84
85double getTimestamp()
86{
87 struct timeval tv;
88 gettimeofday(&tv, NULL);
89 return tv.tv_usec + tv.tv_sec*1e6;
90}
91
92void printTimings(double start, double end, double poses_per_wi, openfpm::vector<double> & gflops_data)
93{
94 double ms = ((end-start)/params.iterations)*1e-3;
95
96 // Compute FLOP/s
97 double runtime = ms*1e-3;
98 double ops_per_wi = 27*poses_per_wi
99 + params.natlig*(3 + 18*poses_per_wi + params.natpro*(11 + 30*poses_per_wi))
100 + poses_per_wi;
101 double total_ops = ops_per_wi * (params.nposes/poses_per_wi);
102 double flops = total_ops / runtime;
103 double gflops = flops / 1e9;
104
105 gflops_data.add(gflops);
106
107 double interactions =
108 (double)params.nposes
109 * (double)params.natlig
110 * (double)params.natpro;
111 double interactions_per_sec = interactions / runtime;
112
113 // Print stats
114 printf("- Total time: %7.2lf ms\n", (end-start)*1e-3);
115 printf("- Average time: %7.2lf ms\n", ms);
116 printf("- Interactions/s: %7.2lf billion\n", (interactions_per_sec / 1e9));
117 printf("- GFLOP/s: %7.2lf\n", gflops);
118}
119
120// Numeric constants
121#define ZERO 0.0f
122#define QUARTER 0.25f
123#define HALF 0.5f
124#define ONE 1.0f
125#define TWO 2.0f
126#define FOUR 4.0f
127#define CNSTNT 45.0f
128
129#define HBTYPE_F 70
130#define HBTYPE_E 69
131
132// The data structure for one atom - 16 bytes
133
134typedef struct
135{
136 float x, y, z, w;
137} Transform;
138
139#define HARDNESS 38.0f
140#define NPNPDIST 5.5f
141#define NPPDIST 1.0f
142
143__device__ inline void compute_transformation_matrix(const float transform_0,
144 const float transform_1,
145 const float transform_2,
146 const float transform_3,
147 const float transform_4,
148 const float transform_5,
149 Transform* transform)
150{
151 const float sx = sin(transform_0);
152 const float cx = cos(transform_0);
153 const float sy = sin(transform_1);
154 const float cy = cos(transform_1);
155 const float sz = sin(transform_2);
156 const float cz = cos(transform_2);
157
158 transform[0].x = cy*cz;
159 transform[0].y = sx*sy*cz - cx*sz;
160 transform[0].z = cx*sy*cz + sx*sz;
161 transform[0].w = transform_3;
162 transform[1].x = cy*sz;
163 transform[1].y = sx*sy*sz + cx*cz;
164 transform[1].z = cx*sy*sz - sx*cz;
165 transform[1].w = transform_4;
166 transform[2].x = -sy;
167 transform[2].y = sx*cy;
168 transform[2].z = cx*cy;
169 transform[2].w = transform_5;
170}
171
172
173
174template<typename vector_atom, typename vector_ff, typename vector_tr, typename vector_out>
175__global__ void fasten_main(const int natlig,
176 const int natpro,
177 const vector_atom protein_molecule,
178 const vector_atom ligand_molecule,
179 const vector_tr transforms,
180 vector_out etotals,
181 const vector_ff global_forcefield,
182 const int num_atom_types,
183 const int numTransforms)
184{
185 // Get index of first TD
186 int ix = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
187
188 // Have extra threads do the last member intead of return.
189 // A return would disable use of barriers, so not using return is better
190 ix = ix < numTransforms ? ix : numTransforms - NUM_TD_PER_THREAD;
191
192#ifdef USE_SHARED
193 __shared__ FFParams forcefield[100];
194 if(ix < num_atom_types)
195 {
196 forcefield[ix].hbtype = global_forcefield.template get<hbtype>(ix);
197 forcefield[ix].radius = global_forcefield.template get<radius>(ix);
198 forcefield[ix].hphb = global_forcefield.template get<hphb>(ix);
199 forcefield[ix].elsc = global_forcefield.template get<elsc>(ix);
200
201 }
202#else
203#endif
204
205 // Compute transformation matrix to private memory
206 float etot[NUM_TD_PER_THREAD];
207 Transform transform[NUM_TD_PER_THREAD][3];
208 const int lsz = blockDim.x;
209 #pragma omp simd
210 for (int i = 0; i < NUM_TD_PER_THREAD; i++)
211 {
212 int index = ix + i*lsz;
213 compute_transformation_matrix(
214 transforms.template get<0>(index),
215 transforms.template get<1>(index),
216 transforms.template get<2>(index),
217 transforms.template get<3>(index),
218 transforms.template get<4>(index),
219 transforms.template get<5>(index),
220 transform[i]);
221 etot[i] = ZERO;
222 }
223
224#ifdef USE_SHARED
225 __syncthreads();
226#endif
227
228 // Loop over ligand atoms
229 int il = 0;
230 do
231 {
232 // Load ligand atom data
233 const Atom l_atom = {ligand_molecule.template get<pos>(il)[x],
234 ligand_molecule.template get<pos>(il)[y],
235 ligand_molecule.template get<pos>(il)[z],
236 ligand_molecule.template get<ind>(il)};
237
238 const FFParams l_params = {global_forcefield.template get<hbtype>(l_atom.index),
239 global_forcefield.template get<radius>(l_atom.index),
240 global_forcefield.template get<hphb>(l_atom.index),
241 global_forcefield.template get<elsc>(l_atom.index)};
242 const bool lhphb_ltz = l_params.hphb<ZERO;
243 const bool lhphb_gtz = l_params.hphb>ZERO;
244
245 float lpos_x[NUM_TD_PER_THREAD];
246 float lpos_y[NUM_TD_PER_THREAD];
247 float lpos_z[NUM_TD_PER_THREAD];
248 const float4 linitpos = make_float4(l_atom.x,l_atom.y,l_atom.z,ONE);
249 #pragma omp simd
250 for (int i = 0; i < NUM_TD_PER_THREAD; i++)
251 {
252 // Transform ligand atom
253 lpos_x[i] = transform[i][0].w + linitpos.x*transform[i][0].x +
254 linitpos.y*transform[i][0].y + linitpos.z*transform[i][0].z;
255 lpos_y[i] = transform[i][1].w + linitpos.x*transform[i][1].x +
256 linitpos.y*transform[i][1].y + linitpos.z*transform[i][1].z;
257 lpos_z[i] = transform[i][2].w + linitpos.x*transform[i][2].x +
258 linitpos.y*transform[i][2].y + linitpos.z*transform[i][2].z;
259 }
260
261 // Loop over protein atoms
262 int ip = 0;
263 do
264 {
265 // Load protein atom data
266 const Atom p_atom = {protein_molecule.template get<pos>(ip)[x],
267 protein_molecule.template get<pos>(ip)[y],
268 protein_molecule.template get<pos>(ip)[z],
269 protein_molecule.template get<ind>(ip)};
270
271 const FFParams p_params = {global_forcefield.template get<hbtype>(p_atom.index),
272 global_forcefield.template get<radius>(p_atom.index),
273 global_forcefield.template get<hphb>(p_atom.index),
274 global_forcefield.template get<elsc>(p_atom.index)};
275
276 const float radij = p_params.radius + l_params.radius;
277 const float r_radij = 1.0f/radij;
278
279 const float elcdst = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? FOUR : TWO;
280 const float elcdst1 = (p_params.hbtype==HBTYPE_F && l_params.hbtype==HBTYPE_F) ? QUARTER : HALF;
281 const bool type_E = ((p_params.hbtype==HBTYPE_E || l_params.hbtype==HBTYPE_E));
282
283 const bool phphb_ltz = p_params.hphb<ZERO;
284 const bool phphb_gtz = p_params.hphb>ZERO;
285 const bool phphb_nz = p_params.hphb!=ZERO;
286 const float p_hphb = p_params.hphb * (phphb_ltz && lhphb_gtz ? -ONE : ONE);
287 const float l_hphb = l_params.hphb * (phphb_gtz && lhphb_ltz ? -ONE : ONE);
288 const float distdslv = (phphb_ltz ? (lhphb_ltz ? NPNPDIST : NPPDIST) : (lhphb_ltz ? NPPDIST : -FLT_MAX) );
289
290 float r_distdslv = 1.0f/distdslv;
291
292 const float chrg_init = l_params.elsc * p_params.elsc;
293 const float dslv_init = p_hphb + l_hphb;
294
295 #pragma omp simd
296 for (int i = 0; i < NUM_TD_PER_THREAD; i++)
297 {
298 // Calculate distance between atoms
299 const float x = lpos_x[i] - p_atom.x;
300 const float y = lpos_y[i] - p_atom.y;
301 const float z = lpos_z[i] - p_atom.z;
302 const float distij = sqrtf(x*x + y*y + z*z);
303
304 // Calculate the sum of the sphere radii
305 const float distbb = distij - radij;
306 const bool zone1 = (distbb < ZERO);
307
308 // Calculate steric energy
309 etot[i] += (ONE - (distij*r_radij)) * (zone1 ? 2*HARDNESS : ZERO);
310
311 // Calculate formal and dipole charge interactions
312 float chrg_e = chrg_init * ((zone1 ? 1 : (ONE - distbb*elcdst1))
313 * (distbb<elcdst ? 1 : ZERO));
314 const float neg_chrg_e = -fabs(chrg_e);
315 chrg_e = type_E ? neg_chrg_e : chrg_e;
316 etot[i] += chrg_e*CNSTNT;
317
318 // Calculate the two cases for Nonpolar-Polar repulsive interactions
319 const float coeff = (ONE - (distbb *r_distdslv));
320 float dslv_e = dslv_init * ((distbb<distdslv && phphb_nz) ? 1 : ZERO);
321 dslv_e *= (zone1 ? 1 : coeff);
322 etot[i] += dslv_e;
323 }
324 }
325 while (++ip < natpro); // loop over protein atoms
326 }
327 while (++il < natlig); // loop over ligand atoms
328
329 // Write results
330 const int td_base = blockIdx.x*blockDim.x*NUM_TD_PER_THREAD + threadIdx.x;
331 if (td_base < numTransforms)
332 {
333 #pragma omp simd
334 for (int i = 0; i < NUM_TD_PER_THREAD; i++)
335 {
336 etotals.template get<0>(td_base+i*blockDim.x) = etot[i]*HALF;
337 }
338 }
339} //end of fasten_main
340
341
342void runCUDA(OpenFPM & _openfpm)
343{
344 _openfpm.d_protein.hostToDevice<pos,ind>();
345 _openfpm.d_ligand.hostToDevice<pos,ind>();
346 _openfpm.d_forcefield.hostToDevice<hbtype,radius,hphb,elsc>();
347 _openfpm.d_results.resize(params.nposes);
348 _openfpm.d_poses.template hostToDevice<0,1,2,3,4,5>();
349
350 size_t global = ceil(params.nposes/(double)_openfpm.posesPerWI);
351 global = ceil(global/(double)_openfpm.wgsize);
352 size_t local = _openfpm.wgsize;
353 size_t shared = params.ntypes * sizeof(FFParams);
354
355 cudaDeviceSynchronize();
356
357 double start = getTimestamp();
358
359 for(int ii = 0; ii < params.iterations; ++ii)
360 {
361
362 CUDA_LAUNCH_DIM3(fasten_main,global, local,
363 params.natlig,
364 params.natpro,
365 _openfpm.d_protein.toKernel(),
366 _openfpm.d_ligand.toKernel(),
367 _openfpm.d_poses.toKernel(),
368 _openfpm.d_results.toKernel(),
369 _openfpm.d_forcefield.toKernel(),
370 params.ntypes,
371 params.nposes);
372
373 }
374
375 cudaDeviceSynchronize();
376
377 double end = getTimestamp();
378
379 _openfpm.d_results.deviceToHost<0>();
380
381 printTimings(start, end, _openfpm.posesPerWI, _openfpm.gflops_data);
382}
383
384#define MAX_PLATFORMS 8
385#define MAX_DEVICES 32
386#define MAX_INFO_STRING 256
387
388#define DATA_DIR "bm1"
389#define FILE_LIGAND "/ligand.in"
390#define FILE_PROTEIN "/protein.in"
391#define FILE_FORCEFIELD "/forcefield.in"
392#define FILE_POSES "/poses.in"
393#define FILE_REF_ENERGIES "/ref_energies.out"
394
395#define REF_NPOSES 65536
396
397// Energy evaluation parameters
398#define CNSTNT 45.0f
399#define HBTYPE_F 70
400#define HBTYPE_E 69
401#define HARDNESS 38.0f
402#define NPNPDIST 5.5f
403#define NPPDIST 1.0f
404
405void printTimings(double start, double end, double poses_per_wi);
406void checkError(int err, const char *op);
407
408FILE* openFile(const char *parent, const char *child,
409 const char* mode, long *length)
410{
411 char name[strlen(parent) + strlen(child) + 1];
412 strcpy(name, parent);
413 strcat(name, child);
414
415 FILE *file = NULL;
416 if (!(file = fopen(name, mode)))
417 {
418 fprintf(stderr, "Failed to open '%s'\n", name);
419 exit(1);
420 }
421 if(length){
422 fseek(file, 0, SEEK_END);
423 *length = ftell(file);
424 rewind(file);
425 }
426 return file;
427}
428
429int parseInt(const char *str)
430{
431 char *next;
432 int value = strtoul(str, &next, 10);
433 return strlen(next) ? -1 : value;
434}
435
436void loadParameters(int argc, char *argv[], OpenFPM & _openfpm)
437{
438 // Defaults
439 params.deckDir = DATA_DIR;
440 params.iterations = 8;
441 _openfpm.wgsize = 256;
442 _openfpm.posesPerWI = NUM_TD_PER_THREAD;
443 int nposes = 65536;
444
445 for (int i = 1; i < argc; i++)
446 {
447 if (!strcmp(argv[i], "--device") || !strcmp(argv[i], "-d"))
448 {
449 if (++i >= argc || (_openfpm.deviceIndex = parseInt(argv[i])) < 0)
450 {
451 printf("Invalid device index\n");
452 exit(1);
453 }
454 }
455 else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
456 {
457 if (++i >= argc || (params.iterations = parseInt(argv[i])) < 0)
458 {
459 printf("Invalid number of iterations\n");
460 exit(1);
461 }
462 }
463 else if (!strcmp(argv[i], "--numposes") || !strcmp(argv[i], "-n"))
464 {
465 if (++i >= argc || (nposes = parseInt(argv[i])) < 0)
466 {
467 printf("Invalid number of poses\n");
468 exit(1);
469 }
470 }
471 else if (!strcmp(argv[i], "--posesperwi") || !strcmp(argv[i], "-p"))
472 {
473 if (++i >= argc || (_openfpm.posesPerWI = parseInt(argv[i])) < 0)
474 {
475 printf("Invalid poses-per-workitem value\n");
476 exit(1);
477 }
478 }
479 else if (!strcmp(argv[i], "--wgsize") || !strcmp(argv[i], "-w"))
480 {
481 if (++i >= argc || (_openfpm.wgsize = parseInt(argv[i])) < 0)
482 {
483 printf("Invalid work-group size\n");
484 exit(1);
485 }
486 }
487 else if (!strcmp(argv[i], "--deck"))
488 {
489 if (++i >= argc)
490 {
491 printf("Invalid deck\n");
492 exit(1);
493 }
494 params.deckDir = argv[i];
495 }
496 else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
497 {
498 printf("\n");
499 printf("Usage: ./bude [OPTIONS]\n\n");
500 printf("Options:\n");
501 printf(" -h --help Print this message\n");
502 printf(" --list List available devices\n");
503 printf(" --device INDEX Select device at INDEX\n");
504 printf(" -i --iterations I Repeat kernel I times\n");
505 printf(" -n --numposes N Compute results for N poses\n");
506 printf(" -p --poserperwi PPWI Compute PPWI poses per work-item\n");
507 printf(" -w --wgsize WGSIZE Run with work-group size WGSIZE\n");
508 printf(" --deck DECK Use the DECK directory as input deck\n");
509 printf("\n");
510 exit(0);
511 }
512 else
513 {
514 printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
515 exit(1);
516 }
517 }
518
519 FILE *file = NULL;
520 long length;
521
522 file = openFile(params.deckDir, FILE_LIGAND, "rb", &length);
523 params.natlig = length / sizeof(Atom);
524 _openfpm.d_ligand.resize(params.natlig);
525
526 for (int i = 0 ; i < _openfpm.d_ligand.size() ; i++)
527 {
528 fread(&_openfpm.d_ligand.template get<pos>(i)[0],sizeof(float),1,file);
529 fread(&_openfpm.d_ligand.template get<pos>(i)[1],sizeof(float),1,file);
530 fread(&_openfpm.d_ligand.template get<pos>(i)[2],sizeof(float),1,file);
531 fread(&_openfpm.d_ligand.template get<ind>(i),sizeof(int),1,file);
532 }
533
534 fclose(file);
535
536 file = openFile(params.deckDir, FILE_PROTEIN, "rb", &length);
537 params.natpro = length / sizeof(Atom);
538
539 _openfpm.d_protein.resize(params.natpro);
540
541 for (int i = 0 ; i < _openfpm.d_protein.size() ; i++)
542 {
543 fread(&_openfpm.d_protein.template get<pos>(i)[0],sizeof(float),1,file);
544 fread(&_openfpm.d_protein.template get<pos>(i)[1],sizeof(float),1,file);
545 fread(&_openfpm.d_protein.template get<pos>(i)[2],sizeof(float),1,file);
546 fread(&_openfpm.d_protein.template get<ind>(i),sizeof(int),1,file);
547 }
548
549 fclose(file);
550
551 file = openFile(params.deckDir, FILE_FORCEFIELD, "rb", &length);
552 params.ntypes = length / sizeof(FFParams);
553
554 _openfpm.d_forcefield.resize(params.ntypes);
555
556 for (int i = 0 ; i < _openfpm.d_forcefield.size() ; i++)
557 {
558 fread(&_openfpm.d_forcefield.template get<hbtype>(i),sizeof(int),1,file);
559 fread(&_openfpm.d_forcefield.template get<radius>(i),sizeof(float),1,file);
560 fread(&_openfpm.d_forcefield.template get<hphb>(i),sizeof(float),1,file);
561 fread(&_openfpm.d_forcefield.template get<elsc>(i),sizeof(float),1,file);
562 }
563
564 fclose(file);
565
566 file = openFile(params.deckDir, FILE_POSES, "rb", &length);
567 _openfpm.d_poses.resize(nposes);
568
569 long available = length / 6 / sizeof(float);
570 params.nposes = 0;
571 while (params.nposes < nposes)
572 {
573 long fetch = nposes - params.nposes;
574 if (fetch > available)
575 fetch = available;
576
577 fseek(file, 0*available*sizeof(float), SEEK_SET);
578 for (int k = 0 ; k < fetch ; k++)
579 {fread(&_openfpm.d_poses.template get<0>(params.nposes+k),sizeof(float),1,file);}
580
581 fseek(file, 1*available*sizeof(float), SEEK_SET);
582 for (int k = 0 ; k < fetch ; k++)
583 {fread(&_openfpm.d_poses.template get<1>(params.nposes+k),sizeof(float),1,file);}
584
585 fseek(file, 2*available*sizeof(float), SEEK_SET);
586 for (int k = 0 ; k < fetch ; k++)
587 {fread(&_openfpm.d_poses.template get<2>(params.nposes+k),sizeof(float),1,file);}
588
589 fseek(file, 3*available*sizeof(float), SEEK_SET);
590 for (int k = 0 ; k < fetch ; k++)
591 {fread(&_openfpm.d_poses.template get<3>(params.nposes+k),sizeof(float),1,file);}
592
593 fseek(file, 4*available*sizeof(float), SEEK_SET);
594 for (int k = 0 ; k < fetch ; k++)
595 {fread(&_openfpm.d_poses.template get<4>(params.nposes+k),sizeof(float),1,file);}
596
597 fseek(file, 5*available*sizeof(float), SEEK_SET);
598 for (int k = 0 ; k < fetch ; k++)
599 {fread(&_openfpm.d_poses.template get<5>(params.nposes+k),sizeof(float),1,file);}
600
601
602 rewind(file);
603
604 params.nposes += fetch;
605 }
606 fclose(file);
607}
608
609#if !defined(__APPLE__) && !defined(__powerpc64__)
610#include <fenv.h>
611#include <xmmintrin.h>
612#include <pmmintrin.h>
613#endif
614
615int main(int argc, char *argv[])
616{
617#if !defined(__APPLE__) && !defined(__powerpc64__)
618 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
619 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
620#endif
621 init_wrappers();
622
623 OpenFPM _openfpm;
624 loadParameters(argc, argv, _openfpm);
625 printf("\n");
626 printf("Poses : %d\n", params.nposes);
627 printf("Iterations: %d\n", params.iterations);
628 printf("Ligands : %d\n", params.natlig);
629 printf("Proteins : %d\n", params.natpro);
630 printf("Deck : %s\n", params.deckDir);
631 float *resultsRef = (float *)malloc(params.nposes*sizeof(float));
632
633 // We run the benchmark 30 times to get mean and variace
634 for (int i = 0 ; i < 30 ; i++)
635 {
636 printf("Iteration %d\n",i);
637
638 runCUDA(_openfpm);
639 }
640
641 // calculate mean and variance
642 double mean;
643 double dev;
644 standard_deviation(_openfpm.gflops_data,mean,dev);
645
646 printf("\n\n\nMean %f ~ %f GFlops/s \n\n\n",mean,dev);
647 FILE* perf_out = openFile("./","performance_out", "w", NULL);
648 char out[256];
649 sprintf(out,"%f %f",mean,dev);
650 fwrite(out,1,strlen(out),perf_out);
651 fclose(perf_out);
652
653 // Load reference results from file
654 FILE* ref_energies = openFile(params.deckDir, FILE_REF_ENERGIES, "r", NULL);
655 size_t n_ref_poses = params.nposes;
656 if (params.nposes > REF_NPOSES) {
657 printf("Only validating the first %d poses.\n", REF_NPOSES);
658 n_ref_poses = REF_NPOSES;
659 }
660
661 for (size_t i = 0; i < n_ref_poses; i++)
662 fscanf(ref_energies, "%f", &resultsRef[i]);
663
664 fclose(ref_energies);
665
666 float maxdiff = -100.0f;
667 printf("\n Reference CUDA (diff)\n");
668 for (int i = 0; i < n_ref_poses; i++)
669 {
670 if (fabs(resultsRef[i]) < 1.f && fabs(_openfpm.d_results.template get<0>(i)) < 1.f) continue;
671
672 float diff = fabs(resultsRef[i] - _openfpm.d_results.template get<0>(i)) / _openfpm.d_results.template get<0>(i);
673 if (diff > maxdiff) {
674 maxdiff = diff;
675 // printf ("Maxdiff: %.2f (%.3f vs %.3f)\n", maxdiff, resultsRef[i], resultsCUDA[i]);
676 }
677
678 if (i < 8)
679 printf("%7.2f vs %7.2f (%5.2f%%)\n", resultsRef[i], _openfpm.d_results.template get<0>(i), 100*diff);
680 }
681 printf("\nLargest difference was %.3f%%\n\n", maxdiff*100);
682
683 free(resultsRef);
684}
685
686#else
687
688int main(int argc, char *argv[])
689{
690}
691
692#endif
693
Implementation of 1-D std::vector like structure.