OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
BlockMapGpu_kernels.cuh
//
// Created by tommaso on 16/05/19.
//

#ifndef OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH
#define OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH

//#ifdef __NVCC__

#include <cstdlib>
#include "BlockMapGpu.hpp"
#include "util/cuda_util.hpp"
#include "BlockMapGpu_dimensionalityWrappers.cuh"
#include "Vector/map_vector_sparse.hpp"
#include "util/cuda/scan_ofp.cuh"

namespace BlockMapGpuKernels
{
    // One thread per element of the merged index vector: compute the predicates
    // that drive the map construction in maps_create.
    template<typename vector_index_type, typename vector_index_type2>
    __global__ void compute_predicate(vector_index_type vct_keys_merge,
                                      vector_index_type vct_index_merge,
                                      unsigned int m,
                                      vector_index_type2 pred_ids)
    {
        int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= vct_index_merge.size()) return;

        unsigned int pp1 = (p+1 == vct_index_merge.size())?p:p+1;
        unsigned int pm1 = (p == 0)?p:p-1;

        auto id0 = vct_index_merge.template get<0>(pm1);
        auto id1 = vct_index_merge.template get<0>(p);

        auto k0 = vct_keys_merge.template get<0>(pm1);
        auto k1 = vct_keys_merge.template get<0>(p);
        auto k2 = vct_keys_merge.template get<0>(pp1);

        // predicate 0 counts the old chunks; when an old chunk is merged with new data,
        // the 1 has to be shifted onto the new element
        pred_ids.template get<0>(p) = ((k0 == k1) && (p != 0) && id0 < m) || (id1 < m && (k1 != k2));

        // predicate 1 counts the new chunks (indices referring to the new data)
        pred_ids.template get<1>(p) = id1 >= m;

        // predicate 2 counts every element that does not merge with its successor
        pred_ids.template get<2>(p) = (k1 != k2) | (p == vct_index_merge.size()-1);

        // predicate 3 counts the old chunks that do not reduce with new data
        pred_ids.template get<3>(p) = id1 < m && ((k1 != k2) | (p == vct_index_merge.size()-1));

        // predicate 4 counts the old chunks
        pred_ids.template get<4>(p) = id1 < m;
    }
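
    // Sketch (not part of the original header) of how the predicates above are consumed:
    // each predicate column is exclusive-scanned and the scans are handed to maps_create,
    // mirroring what BlockMapGpuFunctors::BlockFunctor::solve_conflicts does further down.
    // The buffer names p_ids/s_ids are the ones used there.
    //
    //   // one exclusive scan per predicate column (the same is done for columns 1..4)
    //   openfpm::scan((int *)p_ids.template getDeviceBuffer<0>(),
    //                 s_ids.size(),
    //                 (int *)s_ids.template getDeviceBuffer<0>(),
    //                 context);
    //
    //   // the scans give, for every merged element, its destination slot in the
    //   // per-purpose output vectors filled by maps_create
    //   CUDA_LAUNCH(BlockMapGpuKernels::maps_create, ite,
    //               s_ids.toKernel(), p_ids.toKernel(),
    //               segments_oldData.toKernel(), outputMap.toKernel(),
    //               copy_old_dst.toKernel(), copy_old_src.toKernel());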

    // One thread per merged element: use the exclusive scans of the predicates to
    // build the output map and the old-chunk copy maps.
    template<typename vector_index_type, typename vector_index_type2, typename vector_index_map_type>
    __global__ void maps_create(vector_index_type2 scan_ids,
                                vector_index_type2 pred_ids,
                                vector_index_type vct_seg_old,
                                vector_index_map_type vct_out_map,
                                vector_index_type vct_copy_old_dst,
                                vector_index_type vct_copy_old_src)
    {
        int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= scan_ids.size()) return;

        auto id1 = scan_ids.template get<0>(p);
        bool pred_id1 = pred_ids.template get<0>(p);
        auto id2 = scan_ids.template get<1>(p);
        bool pred_id2 = pred_ids.template get<1>(p);
        auto id3 = scan_ids.template get<2>(p);
        auto id4 = scan_ids.template get<3>(p);
        bool pred_id4 = pred_ids.template get<3>(p);
        auto id5 = scan_ids.template get<4>(p);

        if (pred_id2 == true)
        {vct_seg_old.template get<0>(id2) = (pred_id1 == true)?id1:-1;}

        if (pred_id2 == true)
        {vct_out_map.template get<0>(id2) = id3;}

        if (pred_id4 == true)
        {
            vct_copy_old_dst.template get<0>(id4) = id3;
            vct_copy_old_src.template get<0>(id4) = id5;
        }
    }



    // One thread per chunk to copy: copy the old chunks that do not merge with new data.
    template<typename vector_index_type, typename vector_data_type>
    __global__ void copy_old(vector_data_type vct_data_new,
                             vector_index_type index_to_copy,
                             vector_data_type vct_data_old)
    {
        int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= index_to_copy.size()) return;

        auto id = index_to_copy.template get<0>(p);

        vct_data_new.get(id) = vct_data_old.get(p);
    }

    template<unsigned int maskProp, unsigned int chunksPerBlock, typename InsertBufferT>
    __global__ void initializeInsertBuffer(InsertBufferT insertBuffer)
    {
#ifdef __NVCC__
        typedef typename InsertBufferT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, maskProp> MaskT;

        int pos = blockIdx.x * blockDim.x + threadIdx.x;
        const unsigned int dataBlockId = pos / MaskT::size;
        const unsigned int offset = pos % MaskT::size;
        const unsigned int chunkOffset = dataBlockId % chunksPerBlock;

        __shared__ MaskT mask[chunksPerBlock];

        if (dataBlockId < insertBuffer.size())
        {
            mask[chunkOffset][offset] = 0;

            __syncthreads();

            // Here the operator= spreads nicely across threads...
            insertBuffer.template get<maskProp>(dataBlockId) = mask[chunkOffset];
        }
        else
        {
            __syncthreads();
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
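
    // Hypothetical launch sketch for initializeInsertBuffer (not part of the original
    // header): the kernel indexes one thread per mask element and keeps chunksPerBlock
    // mask chunks of shared memory per block, so a consistent launch shape would be
    //
    //   const unsigned int threads = chunksPerBlock * MaskT::size;   // MaskT::size = elements per chunk
    //   const unsigned int blocks  = (insertBuffer.size() + chunksPerBlock - 1) / chunksPerBlock;
    //   CUDA_LAUNCH_DIM3((initializeInsertBuffer<maskProp, chunksPerBlock>), blocks, threads,
    //                    insertBuffer.toKernel());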

    template<typename op, typename ScalarT>
    __device__ inline void applyOp(ScalarT &a, ScalarT b, bool aExist, bool bExist)
    {
#ifdef __NVCC__
        op op_;
        if (aExist && bExist)
        {
            a = op_(a, b);
        }
        else if (bExist)
        {
            a = b;
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
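
    // Usage sketch for applyOp (not part of the original header), assuming a simple
    // binary functor with the operator()(a, b) shape the reduction operators use:
    //
    //   struct plus_op
    //   {
    //       template<typename T>
    //       __device__ T operator()(T a, T b) const { return a + b; }
    //   };
    //
    //   float a = 1.0f;
    //   applyOp<plus_op>(a, 2.0f, true, true);    // both exist    -> a == 3.0f
    //   applyOp<plus_op>(a, 5.0f, false, true);   // only b exists -> a == 5.0f
    //   applyOp<plus_op>(a, 7.0f, true, false);   // b missing     -> a unchanged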

    // GridSize = number of segments
    // BlockSize = chunksPerBlock * chunkSize
    //
    template<unsigned int p,
            unsigned int pSegment,
            unsigned int pMask,
            unsigned int chunksPerBlock,
            typename op,
            typename IndexVector_segdataT, typename IndexVector_datamapT,
            typename IndexVector_segoldT, typename IndexVector_outmapT, typename DataVectorT>
    __global__ void
    segreduce_total(
            DataVectorT data_new,
            DataVectorT data_old,
            IndexVector_segdataT segments_data,
            IndexVector_datamapT segments_dataMap,
            IndexVector_segoldT segments_dataOld,
            IndexVector_outmapT outputMap,
            DataVectorT output
    )
    {
#ifdef __NVCC__
        typedef typename DataVectorT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, p> DataType;
        typedef BlockTypeOf<AggregateT, pMask> MaskType;
        typedef typename std::remove_all_extents<DataType>::type BaseBlockType;
        constexpr unsigned int chunkSize = BaseBlockType::size;

        unsigned int segmentId = blockIdx.x;
        int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
                          - segments_data.template get<pSegment>(segmentId);

        unsigned int start = segments_data.template get<pSegment>(segmentId);

        unsigned int chunkId = threadIdx.x / chunkSize;
        unsigned int offset = threadIdx.x % chunkSize;

        __shared__ ArrayWrapper<DataType> A[chunksPerBlock];
        __shared__ MaskType AMask[chunksPerBlock];
        typename ComposeArrayType<DataType>::type bReg;
        typename MaskType::scalarType aMask, bMask;

        // Phase 0: Load chunks as first operand of the reduction
        if (chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);

            A[chunkId][offset] = RhsBlockWrapper<DataType>(data_new.template get<p>(m_chunkId), offset).value;
            aMask = data_new.template get<pMask>(m_chunkId)[offset];
        }

        int i = chunksPerBlock;
        for (; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        if (i + chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));

            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        AMask[chunkId][offset] = aMask;

        __syncthreads();

        // Horizontal reduction finished
        // Now vertical reduction
        for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
        {
            if (chunkId % j == 0 && chunkId < segmentSize)
            {
                unsigned int otherChunkId = chunkId + (j / 2);
                if (otherChunkId < segmentSize)
                {
                    aMask = AMask[chunkId][offset];
                    bMask = AMask[otherChunkId][offset];
                    generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                            A[otherChunkId][offset],
                                                                            BlockMapGpu_ker<>::exist(aMask),
                                                                            BlockMapGpu_ker<>::exist(bMask));
                    AMask[chunkId][offset] = aMask | bMask;
                }
            }
            __syncthreads();
        }


        // Merge with the old chunk (if this segment reduces onto existing data)
        int seg_old = segments_dataOld.template get<0>(segmentId);

        if (seg_old != -1 && chunkId == 0)
        {
            aMask = AMask[0][offset];
            bMask = data_old.template get<pMask>(seg_old)[offset];
            generalDimensionFunctor<DataType>::template applyOp<op>(A[0][offset],
                                                                    data_old.template get<p>(seg_old)[offset],
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            AMask[0][offset] = aMask | bMask;
        }

        __syncthreads();


        // Write output
        if (chunkId == 0)
        {
            unsigned int out_id = outputMap.template get<0>(segmentId);
            generalDimensionFunctor<DataType>::assignWithOffset(output.template get<p>(out_id), A[chunkId].data,
                                                                offset);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
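
    // Launch-shape sketch (not part of the original header): as the comment above says,
    // the grid has one block per segment and each block covers chunksPerBlock chunks of
    // chunkSize elements. This mirrors the launch done in
    // sparse_vector_reduction_solve_conflict further down (variable names approximate):
    //
    //   const unsigned int gridSize = segment_offset.size() - 1;   // trailing extra element
    //   CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<p, pSegment, pMask, chunksPerBlock, red_op>),
    //                    gridSize, blockSize,
    //                    data_new.toKernel(), data_old.toKernel(),
    //                    segment_offset.toKernel(), data_map.toKernel(),
    //                    segments_oldData.toKernel(), out_map.toKernel(), output.toKernel());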

    // GridSize = number of segments
    // BlockSize = chunksPerBlock * chunkSize
    //
    // Same as segreduce_total, but the reduced mask is also written to the output.
    template<unsigned int p,
            unsigned int pSegment,
            unsigned int pMask,
            unsigned int chunksPerBlock,
            typename op,
            typename IndexVector_segdataT, typename IndexVector_datamapT,
            typename IndexVector_segoldT, typename IndexVector_outmapT, typename DataVectorT>
    __global__ void
    segreduce_total_with_mask(
            DataVectorT data_new,
            DataVectorT data_old,
            IndexVector_segdataT segments_data,
            IndexVector_datamapT segments_dataMap,
            IndexVector_segoldT segments_dataOld,
            IndexVector_outmapT outputMap,
            DataVectorT output
    )
    {
#ifdef __NVCC__
        typedef typename DataVectorT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, p> DataType;
        typedef BlockTypeOf<AggregateT, pMask> MaskType;
        typedef typename std::remove_all_extents<DataType>::type BaseBlockType;
        constexpr unsigned int chunkSize = BaseBlockType::size;

        unsigned int segmentId = blockIdx.x;
        int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
                          - segments_data.template get<pSegment>(segmentId);

        unsigned int start = segments_data.template get<pSegment>(segmentId);

        unsigned int chunkId = threadIdx.x / chunkSize;
        unsigned int offset = threadIdx.x % chunkSize;

        __shared__ ArrayWrapper<DataType> A[chunksPerBlock];
        __shared__ MaskType AMask[chunksPerBlock];
        typename ComposeArrayType<DataType>::type bReg;
        typename MaskType::scalarType aMask, bMask;

        // Phase 0: Load chunks as first operand of the reduction
        if (chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);

            A[chunkId][offset] = RhsBlockWrapper<DataType>(data_new.template get<p>(m_chunkId), offset).value;
            aMask = data_new.template get<pMask>(m_chunkId)[offset];
        }

        int i = chunksPerBlock;
        for (; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        if (i + chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));

            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        AMask[chunkId][offset] = aMask;

        __syncthreads();

        // Horizontal reduction finished
        // Now vertical reduction
        for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
        {
            if (chunkId % j == 0 && chunkId < segmentSize)
            {
                unsigned int otherChunkId = chunkId + (j / 2);
                if (otherChunkId < segmentSize)
                {
                    aMask = AMask[chunkId][offset];
                    bMask = AMask[otherChunkId][offset];
                    generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                            A[otherChunkId][offset],
                                                                            BlockMapGpu_ker<>::exist(aMask),
                                                                            BlockMapGpu_ker<>::exist(bMask));
                    AMask[chunkId][offset] = aMask | bMask;
                }
            }
            __syncthreads();
        }


        // Merge with the old chunk (if this segment reduces onto existing data)
        int seg_old = segments_dataOld.template get<0>(segmentId);

        if (seg_old != -1 && chunkId == 0)
        {
            aMask = AMask[0][offset];
            bMask = data_old.template get<pMask>(seg_old)[offset];
            generalDimensionFunctor<DataType>::template applyOp<op>(A[0][offset],
                                                                    data_old.template get<p>(seg_old)[offset],
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            AMask[0][offset] = aMask | bMask;
        }

        __syncthreads();


        // Write output
        if (chunkId == 0)
        {
            unsigned int out_id = outputMap.template get<0>(segmentId);
            generalDimensionFunctor<DataType>::assignWithOffset(output.template get<p>(out_id), A[chunkId].data,
                                                                offset);

            generalDimensionFunctor<MaskType>::assignWithOffset(output.template get<pMask>(out_id), AMask[chunkId],
                                                                offset);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }

    template<typename DataVectorT, typename IndexVectorT>
    __global__ void copy_old_ker(IndexVectorT srcIndices, DataVectorT src, IndexVectorT dstIndices, DataVectorT dst)
    {
#ifdef __NVCC__
        typedef typename DataVectorT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, 0> BlockT0; // The type of the 0-th property
        unsigned int chunkSize = BlockT0::size;

        unsigned int chunksPerBlock = blockDim.x / chunkSize;
        unsigned int chunkOffset = threadIdx.x / chunkSize; // The thread block can work on several chunks in parallel

        unsigned int chunkBasePos = blockIdx.x * chunksPerBlock;

        unsigned int p = chunkBasePos + chunkOffset;
        if (p < srcIndices.size())
        {
            auto dstId = dstIndices.template get<0>(p);
            auto srcId = srcIndices.template get<0>(p);

            dst.get(dstId) = src.get(srcId);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }


    template<typename IndexVectorT, typename IndexVectorT2>
    __global__ void copyKeyToDstIndexIfPredicate(IndexVectorT keys, IndexVectorT2 dstIndices, IndexVectorT out)
    {
#ifdef __NVCC__
        // dstIndices is the exclusive scan of the predicates
        unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

        if (pos >= dstIndices.size()) {return;}

        unsigned int pm1 = (pos == 0)?0:pos-1;

        bool predicate = dstIndices.template get<2>(pos) != dstIndices.template get<2>(pm1) || pos == 0;

        if (predicate)
        {
            auto dstPos = dstIndices.template get<2>(pos);
            out.template get<0>(dstPos) = keys.template get<0>(pos);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
}

template<unsigned int blockSize,
        typename vector_data_type,
        typename vector_datamap_type,
        typename vector_segoffset_type,
        typename vector_outmap_type,
        typename vector_segolddata_type,
        typename vector_reduction,
        typename block_functor,
        unsigned int impl2, unsigned int pSegment=1>
struct sparse_vector_reduction_solve_conflict
{
    //! output of the reduction
    vector_data_type & vector_data_red;

    //! new data to reduce
    vector_data_type & vector_data;

    //! old data
    vector_data_type & vector_data_old;

    //! unsorted data
    vector_data_type & vector_data_unsorted;

    //! segment offsets
    vector_segoffset_type & segment_offset;

    //! map from segment entries to the data
    vector_datamap_type & vector_data_map;

    //! output map
    vector_outmap_type & out_map;

    //! for each segment, the index of the old chunk it merges with (-1 if none)
    vector_segolddata_type & segments_oldData;

    //! gpu context
    mgpu::ofp_context_t & context;

    inline sparse_vector_reduction_solve_conflict(vector_data_type & vector_data_red,
                                                  vector_data_type & vector_data,
                                                  vector_data_type & vector_data_unsorted,
                                                  vector_data_type & vector_data_old,
                                                  vector_datamap_type & vector_data_map,
                                                  vector_segoffset_type & segment_offset,
                                                  vector_outmap_type & out_map,
                                                  vector_segolddata_type & segments_oldData,
                                                  mgpu::ofp_context_t & context)
    :vector_data_red(vector_data_red),
     vector_data(vector_data),
     vector_data_unsorted(vector_data_unsorted),
     vector_data_old(vector_data_old),
     segment_offset(segment_offset),
     vector_data_map(vector_data_map),
     out_map(out_map),
     segments_oldData(segments_oldData),
     context(context)
    {};

    template<typename T>
    inline void operator()(T& t) const
    {
#ifdef __NVCC__

        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
        typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
        if (reduction_type::is_special() == false)
        {
            typedef typename std::remove_reference<vector_data_type>::type::value_type AggregateT;

            typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
            typedef typename std::remove_all_extents<typename boost::mpl::at<
                    typename std::remove_reference<vector_data_type>::type::value_type::type,
                    typename reduction_type::prop
                    >::type>::type::scalarType red_type;
            typedef typename reduction_type::template op_red<red_type> red_op;

            constexpr unsigned int p = reduction_type::prop::value;
            constexpr unsigned int pMask = AggregateT::max_prop_real - 1;

            typedef typename std::remove_all_extents<BlockTypeOf<AggregateT, p>>::type BlockT; // The type of property p

            constexpr unsigned int chunksPerBlock = blockSize / BlockT::size;
            const unsigned int gridSize =
                    segment_offset.size() - 1; // The "-1" is because the segment vector has a trailing extra element

            if (T::value == 0)
            {
                CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total_with_mask<p, pSegment, pMask, chunksPerBlock, red_op>),gridSize, blockSize,
                        vector_data.toKernel(),
                        vector_data_old.toKernel(),
                        segment_offset.toKernel(),
                        vector_data_map.toKernel(),
                        segments_oldData.toKernel(),
                        out_map.toKernel(),
                        vector_data_red.toKernel());
            }
            else
            {
                CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<p, pSegment, pMask, chunksPerBlock, red_op>),gridSize, blockSize,
                        vector_data.toKernel(),
                        vector_data_old.toKernel(),
                        segment_offset.toKernel(),
                        vector_data_map.toKernel(),
                        segments_oldData.toKernel(),
                        out_map.toKernel(),
                        vector_data_red.toKernel());
            }
        }
#else
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif
    }
};

namespace BlockMapGpuFunctors
{
    template<unsigned int blockSize>
    struct BlockFunctor
    {
        // Scratch buffers used by solve_conflicts: per-element predicates, their
        // exclusive scans, and the maps produced by maps_create. The declarations
        // other than segments_oldData are inferred from their use below.
        openfpm::vector_gpu<aggregate<int,int,int,int,int>> p_ids;
        openfpm::vector_gpu<aggregate<int,int,int,int,int>> s_ids;

        openfpm::vector_gpu<aggregate<unsigned int>> outputMap;
        openfpm::vector_gpu<aggregate<unsigned int>> copy_old_dst;
        openfpm::vector_gpu<aggregate<unsigned int>> copy_old_src;

        openfpm::vector_gpu<aggregate<int>> segments_oldData;
        template<unsigned int pSegment, typename vector_index_type, typename vector_index_type2, typename vector_data_type, typename ... v_reduce>
        bool solve_conflicts(vector_index_type &keys, vector_index_type &mergeIndices, vector_index_type2 &segments_new, vector_index_type &data_map,
                             vector_data_type &dataOld, vector_data_type &dataNew,
                             vector_index_type &keysOut, vector_data_type &dataOut,
                             mgpu::ofp_context_t & context)
        {
#ifdef __NVCC__
            typedef ValueTypeOf<vector_data_type> AggregateT;
            typedef ValueTypeOf<vector_index_type> AggregateIndexT;

            typedef BlockTypeOf<AggregateIndexT, 0> IndexT;

            typedef BlockTypeOf<AggregateT, 0> BlockT0; // The type of the 0-th property

            constexpr unsigned int chunksPerBlock = blockSize / BlockT0::size;
            const unsigned int gridSize = mergeIndices.size() / chunksPerBlock + ((mergeIndices.size() % chunksPerBlock) != 0);


            // Calculate maps

            auto ite = mergeIndices.getGPUIterator();

            p_ids.resize(mergeIndices.size());
            s_ids.resize(mergeIndices.size());

            // silence valgrind complaints about uninitialized memory

            p_ids.template get<1>(p_ids.size()-1) = 0;
            CUDA_LAUNCH(BlockMapGpuKernels::compute_predicate,ite,keys.toKernel(),mergeIndices.toKernel(),dataOld.size(),p_ids.toKernel());

            openfpm::scan((int *)p_ids.template getDeviceBuffer<0>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<0>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<1>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<1>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<2>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<2>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<3>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<3>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<4>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<4>(),
                          context);

            s_ids.template deviceToHost<0,1,2,3>(s_ids.size()-1,s_ids.size()-1);
            p_ids.template deviceToHost<0,1,2,3>(p_ids.size()-1,p_ids.size()-1);

            size_t copy_old_size = s_ids.template get<3>(s_ids.size()-1) + p_ids.template get<3>(p_ids.size()-1);
            size_t seg_old_size = s_ids.template get<1>(s_ids.size()-1) + p_ids.template get<1>(p_ids.size()-1);
            size_t out_map_size = s_ids.template get<1>(s_ids.size()-1) + p_ids.template get<1>(p_ids.size()-1);
            size_t data_out_size = s_ids.template get<2>(s_ids.size()-1) + p_ids.template get<2>(p_ids.size()-1);

            segments_oldData.resize(seg_old_size);
            outputMap.resize(out_map_size);
            copy_old_dst.resize(copy_old_size);
            copy_old_src.resize(copy_old_size);

            CUDA_LAUNCH(BlockMapGpuKernels::maps_create,ite,s_ids.toKernel(),p_ids.toKernel(),segments_oldData.toKernel(),outputMap.toKernel(),copy_old_dst.toKernel(),copy_old_src.toKernel());

            // Create the output for the keys
            keysOut.resize(data_out_size); // The final number of keys is one less than the number of segment values

            ite = keys.getGPUIterator();
            CUDA_LAUNCH(BlockMapGpuKernels::copyKeyToDstIndexIfPredicate,ite,keys.toKernel(), s_ids.toKernel(), keysOut.toKernel());



            // the new keys are now in keysOut

            // Phase 2 - segreduce on all properties
            dataOut.reserve(data_out_size+1);
            dataOut.resize(data_out_size); // Right size for the output, i.e. the number of segments
            typedef boost::mpl::vector<v_reduce...> vv_reduce;

            sparse_vector_reduction_solve_conflict<blockSize,decltype(dataOut),
                                                   decltype(data_map),
                                                   decltype(segments_new),
                                                   decltype(outputMap),
                                                   decltype(segments_oldData),
                                                   vv_reduce,BlockFunctor,2, pSegment>
                    svr(dataOut,dataNew,dataNew,dataOld,data_map,segments_new,outputMap,segments_oldData,context);

            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

            // copy the old chunks that did not merge with new data
            if (copy_old_dst.size() != 0)
            {
                CUDA_LAUNCH_DIM3(BlockMapGpuKernels::copy_old_ker,copy_old_dst.size(),blockSize,copy_old_src.toKernel(),dataOld.toKernel(),copy_old_dst.toKernel(),dataOut.toKernel());
            }

            return true; //todo: check if error in kernel
#else // __NVCC__
            std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
            return true;
#endif // __NVCC__
        }

        openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap()
        {
            return outputMap;
        }

        const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const
        {
            return outputMap;
        }
    };
}


//#endif //__NVCC__

#endif //OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH