OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
BlockMapGpu_kernels.cuh
//
// Created by tommaso on 16/05/19.
//

#ifndef OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH
#define OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH

//#ifdef __NVCC__

#include <cstdlib>
#include "BlockMapGpu.hpp"
#include "util/cuda_util.hpp"
#include "BlockMapGpu_dimensionalityWrappers.cuh"
#include "Vector/map_vector_sparse.hpp"
#include "util/cuda/scan_ofp.cuh"

namespace BlockMapGpuKernels
{
    // One thread per element of the merged (old + new) key/index vectors.
    // Computes five 0/1 predicates per element; their exclusive scans are later used by
    // maps_create to build the compaction maps.
    template<typename vector_index_type, typename vector_index_type2>
    __global__ void compute_predicate(vector_index_type vct_keys_merge,
                                      vector_index_type vct_index_merge,
                                      unsigned int m,
                                      vector_index_type2 pred_ids)
    {
        int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= vct_index_merge.size()) return;

        unsigned int pp1 = (p+1 == vct_index_merge.size())?p:p+1;
        unsigned int pm1 = (p == 0)?p:p-1;

        auto id0 = vct_index_merge.template get<0>(pm1);
        auto id1 = vct_index_merge.template get<0>(p);

        auto k0 = vct_keys_merge.template get<0>(pm1);
        auto k1 = vct_keys_merge.template get<0>(p);
        auto k2 = vct_keys_merge.template get<0>(pp1);

        // predicate 0 counts the old chunks; when an old chunk is merged with new data,
        // the 1 is shifted onto the new element
        pred_ids.template get<0>(p) = ((k0 == k1) && (p != 0) && id0 < m) || (id1 < m && (k1 != k2));

        // predicate 1 counts the new index segments (elements coming from the new data)
        pred_ids.template get<1>(p) = id1 >= m;

        // predicate 2 counts every element that does not merge with its successor
        pred_ids.template get<2>(p) = (k1 != k2) | (p == vct_index_merge.size()-1);

        // predicate 3 counts the old keys that do not reduce with new data
        pred_ids.template get<3>(p) = id1 < m && ((k1 != k2) | (p == vct_index_merge.size()-1));

        // predicate 4 counts the old index elements (elements coming from the old data)
        pred_ids.template get<4>(p) = id1 < m;
    }
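
    // The five predicates above are consumed by BlockFunctor::solve_conflicts (below): each column of
    // pred_ids is exclusive-scanned with openfpm::scan, and maps_create turns the scanned values into
    // the compaction maps (the old chunk each segment reduces with, the output position of each
    // segment, and the source/destination indices of the old chunks copied unchanged).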

    // One thread per element of the scanned predicate vectors. From the scanned predicates it builds:
    //  - vct_seg_old: for each segment containing new data, the position in the old data of the chunk
    //    it has to be reduced with, or -1 if there is none
    //  - vct_out_map: for each such segment, its destination position in the output
    //  - vct_copy_old_dst / vct_copy_old_src: destination/source positions of the old chunks that are
    //    copied to the output unchanged
    template<typename vector_index_type, typename vector_index_type2, typename vector_index_map_type>
    __global__ void maps_create(vector_index_type2 scan_ids,
                                vector_index_type2 pred_ids,
                                vector_index_type vct_seg_old,
                                vector_index_map_type vct_out_map,
                                vector_index_type vct_copy_old_dst,
                                vector_index_type vct_copy_old_src)
    {
        int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= scan_ids.size()) return;

        auto id1 = scan_ids.template get<0>(p);
        bool pred_id1 = pred_ids.template get<0>(p);
        auto id2 = scan_ids.template get<1>(p);
        bool pred_id2 = pred_ids.template get<1>(p);
        auto id3 = scan_ids.template get<2>(p);
        auto id4 = scan_ids.template get<3>(p);
        bool pred_id4 = pred_ids.template get<3>(p);
        auto id5 = scan_ids.template get<4>(p);

        if (pred_id2 == true)
        {vct_seg_old.template get<0>(id2) = (pred_id1 == true)?id1:-1;}

        if (pred_id2 == true)
        {vct_out_map.template get<0>(id2) = id3;}

        if (pred_id4 == true)
        {
            vct_copy_old_dst.template get<0>(id4) = id3;
            vct_copy_old_src.template get<0>(id4) = id5;
        }
    }



    // One thread per element of index_to_copy: copies the p-th old chunk into the new data vector
    // at the destination index given by index_to_copy.
    template<typename vector_index_type, typename vector_data_type>
    __global__ void copy_old(vector_data_type vct_data_new,
                             vector_index_type index_to_copy,
                             vector_data_type vct_data_old)
    {
        int p = blockIdx.x * blockDim.x + threadIdx.x;

        if (p >= index_to_copy.size()) return;

        auto id = index_to_copy.template get<0>(p);

        vct_data_new.get(id) = vct_data_old.get(p);
    }

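    // initializeInsertBuffer zeroes the mask property of every chunk in the insert buffer.
    // One thread per mask element: the block size is expected to be chunksPerBlock * MaskT::size,
    // so that each thread block clears chunksPerBlock chunks through the shared staging array below.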
    template<unsigned int maskProp, unsigned int chunksPerBlock, typename InsertBufferT>
    __global__ void initializeInsertBuffer(InsertBufferT insertBuffer)
    {
#ifdef __NVCC__
        typedef typename InsertBufferT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, maskProp> MaskT;

        int pos = blockIdx.x * blockDim.x + threadIdx.x;
        const unsigned int dataBlockId = pos / MaskT::size;
        const unsigned int offset = pos % MaskT::size;
        const unsigned int chunkOffset = dataBlockId % chunksPerBlock;

        __shared__ MaskT mask[chunksPerBlock];

        if (dataBlockId < insertBuffer.size())
        {
            mask[chunkOffset][offset] = 0;

            __syncthreads();

            // Here the operator= spreads nicely across threads...
            insertBuffer.template get<maskProp>(dataBlockId) = mask[chunkOffset];
        }
        else
        {
            __syncthreads();
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }

    // Applies the reduction operator op to a and b, taking the existence flags into account:
    // if both operands exist the reduction is applied, if only b exists a is overwritten with b,
    // otherwise a is left untouched.
    template<typename op, typename ScalarT>
    __device__ inline void applyOp(ScalarT &a, ScalarT b, bool aExist, bool bExist)
    {
#ifdef __NVCC__
        op op_;
        if (aExist && bExist)
        {
            a = op_(a, b);
        }
        else if (bExist)
        {
            a = b;
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }


    // GridSize = number of segments
    // BlockSize = chunksPerBlock * chunkSize
    //
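    // segreduce_total reduces, for each output segment, all the new-data chunks belonging to it and,
    // when present, the old chunk it has to be merged with, writing the result into the output vector:
    //   phase 0: the first chunksPerBlock chunks of the segment are loaded into shared memory;
    //   phase 1 (horizontal): the remaining chunks of the segment are folded in, chunksPerBlock at a time;
    //   phase 2 (vertical): the chunksPerBlock partial results are combined with a tree reduction;
    //   phase 3: the old chunk referenced by segments_dataOld (if any) is folded in and the result is
    //            written to the position given by outputMap.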
    template<unsigned int p,
            unsigned int pSegment,
            unsigned int pMask,
            unsigned int chunksPerBlock,
            typename op,
            typename IndexVector_segdataT, typename IndexVector_datamapT,
            typename IndexVector_segoldT, typename IndexVector_outmapT, typename DataVectorT>
    __global__ void
    segreduce_total(
            DataVectorT data_new,
            DataVectorT data_old,
            IndexVector_segdataT segments_data,
            IndexVector_datamapT segments_dataMap,
            IndexVector_segoldT segments_dataOld,
            IndexVector_outmapT outputMap,
            DataVectorT output
    )
    {
#ifdef __NVCC__
        typedef typename DataVectorT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, p> DataType;
        typedef BlockTypeOf<AggregateT, pMask> MaskType;
        typedef typename std::remove_all_extents<DataType>::type BaseBlockType;
        constexpr unsigned int chunkSize = BaseBlockType::size;

        unsigned int segmentId = blockIdx.x;
        int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
                          - segments_data.template get<pSegment>(segmentId);

        unsigned int start = segments_data.template get<pSegment>(segmentId);

        unsigned int chunkId = threadIdx.x / chunkSize;
        unsigned int offset = threadIdx.x % chunkSize;

        __shared__ ArrayWrapper<DataType> A[chunksPerBlock];
        __shared__ MaskType AMask[chunksPerBlock];
        typename ComposeArrayType<DataType>::type bReg;
        typename MaskType::scalarType aMask, bMask;

        // Phase 0: Load chunks as first operand of the reduction
        if (chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);

            A[chunkId][offset] = RhsBlockWrapper<DataType>(data_new.template get<p>(m_chunkId), offset).value;
            aMask = data_new.template get<pMask>(m_chunkId)[offset];
        }


        // Phase 1 (horizontal): fold the remaining chunks of the segment into A, chunksPerBlock at a time
        int i = chunksPerBlock;
        for (; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            // (existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist)
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        // Handle the remainder chunks of the segment
        if (i + chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            // (existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist)
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));

            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        AMask[chunkId][offset] = aMask;

        __syncthreads();

        // Horizontal reduction finished
        // Now vertical reduction
        for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
        {
            if (chunkId % j == 0 && chunkId < segmentSize)
            {
                unsigned int otherChunkId = chunkId + (j / 2);
                if (otherChunkId < segmentSize)
                {
                    aMask = AMask[chunkId][offset];
                    bMask = AMask[otherChunkId][offset];
                    // existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist
                    generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                            A[otherChunkId][offset],
                                                                            BlockMapGpu_ker<>::exist(aMask),
                                                                            BlockMapGpu_ker<>::exist(bMask));
                    AMask[chunkId][offset] = aMask | bMask;
                }
            }
            __syncthreads();
        }

        // Fold in the old chunk (if any) this segment has to be reduced with
        int seg_old = segments_dataOld.template get<0>(segmentId);

        if (seg_old != -1 && chunkId == 0)
        {
            aMask = AMask[0][offset];
            bMask = data_old.template get<pMask>(seg_old)[offset];
            // existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist
            generalDimensionFunctor<DataType>::template applyOp<op>(A[0][offset],
                                                                    data_old.template get<p>(seg_old)[offset],
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            AMask[0][offset] = aMask | bMask;
        }

        __syncthreads();

        // Write output
        if (chunkId == 0)
        {
            unsigned int out_id = outputMap.template get<0>(segmentId);
            generalDimensionFunctor<DataType>::assignWithOffset(output.template get<p>(out_id), A[chunkId].data,
                                                                offset);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }

    // GridSize = number of segments
    // BlockSize = chunksPerBlock * chunkSize
    //
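    // segreduce_total_with_mask follows the same phases as segreduce_total above; the only difference
    // is that, besides the reduced data, it also writes the reduced mask property (pMask) to the output.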
    template<unsigned int p,
            unsigned int pSegment,
            unsigned int pMask,
            unsigned int chunksPerBlock,
            typename op,
            typename IndexVector_segdataT, typename IndexVector_datamapT,
            typename IndexVector_segoldT, typename IndexVector_outmapT, typename DataVectorT>
    __global__ void
    segreduce_total_with_mask(
            DataVectorT data_new,
            DataVectorT data_old,
            IndexVector_segdataT segments_data,
            IndexVector_datamapT segments_dataMap,
            IndexVector_segoldT segments_dataOld,
            IndexVector_outmapT outputMap,
            DataVectorT output
    )
    {
#ifdef __NVCC__
        typedef typename DataVectorT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, p> DataType;
        typedef BlockTypeOf<AggregateT, pMask> MaskType;
        typedef typename std::remove_all_extents<DataType>::type BaseBlockType;
        constexpr unsigned int chunkSize = BaseBlockType::size;

        unsigned int segmentId = blockIdx.x;
        int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
                          - segments_data.template get<pSegment>(segmentId);

        unsigned int start = segments_data.template get<pSegment>(segmentId);

        unsigned int chunkId = threadIdx.x / chunkSize;
        unsigned int offset = threadIdx.x % chunkSize;

        __shared__ ArrayWrapper<DataType> A[chunksPerBlock];
        __shared__ MaskType AMask[chunksPerBlock];
        typename ComposeArrayType<DataType>::type bReg;
        typename MaskType::scalarType aMask, bMask;

        // Phase 0: Load chunks as first operand of the reduction
        if (chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);

            A[chunkId][offset] = RhsBlockWrapper<DataType>(data_new.template get<p>(m_chunkId), offset).value;
            aMask = data_new.template get<pMask>(m_chunkId)[offset];
        }


        // Phase 1 (horizontal): fold the remaining chunks of the segment into A, chunksPerBlock at a time
        int i = chunksPerBlock;
        for (; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            // (existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist)
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        // Handle the remainder chunks of the segment
        if (i + chunkId < segmentSize)
        {
            unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

            // bReg = data_new.template get<p>(m_chunkId)
            generalDimensionFunctor<decltype(bReg)>::assignWithOffsetRHS(bReg,
                                                                         data_new.template get<p>(m_chunkId),
                                                                         offset);
            bMask = data_new.template get<pMask>(m_chunkId)[offset];

            // reduce A[chunkId][offset] with bReg
            // (existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist)
            generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                    bReg,
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));

            // reduce aMask with bMask
            aMask = aMask | bMask;
        }


        AMask[chunkId][offset] = aMask;

        __syncthreads();

        // Horizontal reduction finished
        // Now vertical reduction
        for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
        {
            if (chunkId % j == 0 && chunkId < segmentSize)
            {
                unsigned int otherChunkId = chunkId + (j / 2);
                if (otherChunkId < segmentSize)
                {
                    aMask = AMask[chunkId][offset];
                    bMask = AMask[otherChunkId][offset];
                    // existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist
                    generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
                                                                            A[otherChunkId][offset],
                                                                            BlockMapGpu_ker<>::exist(aMask),
                                                                            BlockMapGpu_ker<>::exist(bMask));
                    AMask[chunkId][offset] = aMask | bMask;
                }
            }
            __syncthreads();
        }

        // Fold in the old chunk (if any) this segment has to be reduced with
        int seg_old = segments_dataOld.template get<0>(segmentId);

        if (seg_old != -1 && chunkId == 0)
        {
            aMask = AMask[0][offset];
            bMask = data_old.template get<pMask>(seg_old)[offset];
            // existence flags assumed to come from the masks via BlockMapGpu_ker<>::exist
            generalDimensionFunctor<DataType>::template applyOp<op>(A[0][offset],
                                                                    data_old.template get<p>(seg_old)[offset],
                                                                    BlockMapGpu_ker<>::exist(aMask),
                                                                    BlockMapGpu_ker<>::exist(bMask));
            AMask[0][offset] = aMask | bMask;
        }

        __syncthreads();

        // Write output (data and mask)
        if (chunkId == 0)
        {
            unsigned int out_id = outputMap.template get<0>(segmentId);
            generalDimensionFunctor<DataType>::assignWithOffset(output.template get<p>(out_id), A[chunkId].data,
                                                                offset);

            generalDimensionFunctor<MaskType>::assignWithOffset(output.template get<pMask>(out_id), AMask[chunkId],
                                                                offset);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }

    // Copies the chunks listed in srcIndices from src into dst at the positions listed in dstIndices.
    template<typename DataVectorT, typename IndexVectorT>
    __global__ void copy_old_ker(IndexVectorT srcIndices, DataVectorT src, IndexVectorT dstIndices, DataVectorT dst)
    {
#ifdef __NVCC__
        typedef typename DataVectorT::value_type AggregateT;
        typedef BlockTypeOf<AggregateT, 0> BlockT0; // The type of the 0-th property
        unsigned int chunkSize = BlockT0::size;

        unsigned int chunksPerBlock = blockDim.x / chunkSize;
        unsigned int chunkOffset = threadIdx.x / chunkSize; // The thread block can work on several chunks in parallel

        unsigned int chunkBasePos = blockIdx.x * chunksPerBlock;

        unsigned int p = chunkBasePos + chunkOffset;
        if (p < srcIndices.size())
        {
            auto dstId = dstIndices.template get<0>(p);
            auto srcId = srcIndices.template get<0>(p);

            dst.get(dstId) = src.get(srcId);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }

    // One thread per merged element: the first element of each merged segment writes the segment's key
    // into its slot of the output key vector (dstIndices carries the exclusive scans of the predicates).
    template<typename IndexVectorT, typename IndexVectorT2>
    __global__ void copyKeyToDstIndexIfPredicate(IndexVectorT keys, IndexVectorT2 dstIndices, IndexVectorT out)
    {
#ifdef __NVCC__
        // dstIndices is the exclusive scan of the predicates
        unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

        if (pos >= dstIndices.size()) {return;}

        unsigned int pm1 = (pos == 0)?0:pos-1;

        bool predicate = dstIndices.template get<2>(pos) != dstIndices.template get<2>(pm1) || pos == 0;

        if (predicate)
        {
            auto dstPos = dstIndices.template get<2>(pos);
            out.template get<0>(dstPos) = keys.template get<0>(pos);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
}

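// sparse_vector_reduction_solve_conflict is a functor applied (via boost::mpl::for_each_ref) to every
// reduction operation in the vector_reduction list: for each non-special reduction it launches
// segreduce_total_with_mask for the first reduction in the list (which also writes the output mask)
// and segreduce_total for the remaining ones, on the corresponding property.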
template<unsigned int blockSize,
        typename vector_data_type,
        typename vector_datamap_type,
        typename vector_segoffset_type,
        typename vector_outmap_type,
        typename vector_segolddata_type,
        typename vector_reduction,
        typename block_functor,
        unsigned int impl2, unsigned int pSegment=1>
struct sparse_vector_reduction_solve_conflict
{
    // Vector in which the reduced data are written
    vector_data_type & vector_data_red;

    // Vector containing the new data
    vector_data_type & vector_data;

    // Vector containing the old data
    vector_data_type & vector_data_old;

    // Vector containing the new data in insertion (unsorted) order
    vector_data_type & vector_data_unsorted;

    // Segment offsets (one entry per segment, plus a trailing end offset)
    vector_segoffset_type & segment_offset;

    // Map from segment entries to positions in the new data
    vector_datamap_type & vector_data_map;

    // Map from segments to their position in the output
    vector_outmap_type & out_map;

    // For each segment, the old chunk it reduces with (-1 if none)
    vector_segolddata_type & segments_oldData;

    // gpu context
    gpu::ofp_context_t & context;

    inline sparse_vector_reduction_solve_conflict(vector_data_type & vector_data_red,
                                                  vector_data_type & vector_data,
                                                  vector_data_type & vector_data_unsorted,
                                                  vector_data_type & vector_data_old,
                                                  vector_datamap_type & vector_data_map,
                                                  vector_segoffset_type & segment_offset,
                                                  vector_outmap_type & out_map,
                                                  vector_segolddata_type & segments_oldData,
                                                  gpu::ofp_context_t & context)
    :vector_data_red(vector_data_red),
     vector_data(vector_data),
     vector_data_unsorted(vector_data_unsorted),
     vector_data_old(vector_data_old),
     segment_offset(segment_offset),
     vector_data_map(vector_data_map),
     out_map(out_map),
     segments_oldData(segments_oldData),
     context(context)
    {};

    template<typename T>
    inline void operator()(T& t) const
    {
#ifdef __NVCC__

        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
        typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,typename reduction_type::prop>::type red_type;
        if (reduction_type::is_special() == false)
        {
            typedef typename std::remove_reference<vector_data_type>::type::value_type AggregateT;

            typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
            typedef typename std::remove_all_extents<typename boost::mpl::at<
                    typename std::remove_reference<vector_data_type>::type::value_type::type,
                    typename reduction_type::prop
                    >::type>::type::scalarType red_type;
            typedef typename reduction_type::template op_red<red_type> red_op;

            constexpr unsigned int p = reduction_type::prop::value;
            constexpr unsigned int pMask = AggregateT::max_prop_real - 1;

            typedef typename std::remove_all_extents<BlockTypeOf<AggregateT, p>>::type BlockT; // The type of the property p

            constexpr unsigned int chunksPerBlock = blockSize / BlockT::size;
            const unsigned int gridSize =
                    segment_offset.size() - 1; // This "-1" is because segment_offset has a trailing extra element

            if (T::value == 0)
            {
                CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total_with_mask<p, pSegment, pMask, chunksPerBlock, red_op>),gridSize, blockSize,
                        vector_data.toKernel(),
                        vector_data_old.toKernel(),
                        segment_offset.toKernel(),
                        vector_data_map.toKernel(),
                        segments_oldData.toKernel(),
                        out_map.toKernel(),
                        vector_data_red.toKernel());
            }
            else
            {
                CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<p, pSegment, pMask, chunksPerBlock, red_op>),gridSize, blockSize,
                        vector_data.toKernel(),
                        vector_data_old.toKernel(),
                        segment_offset.toKernel(),
                        vector_data_map.toKernel(),
                        segments_oldData.toKernel(),
                        out_map.toKernel(),
                        vector_data_red.toKernel());
            }
        }
#else
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif
    }
};

namespace BlockMapGpuFunctors
{
    template<unsigned int blockSize>
    struct BlockFunctor
    {
        // Internal buffers used by solve_conflicts (these declarations, except segments_oldData,
        // are inferred from how the members are used below)
        openfpm::vector_gpu<aggregate<unsigned int>> outputMap;
        openfpm::vector_gpu<aggregate<unsigned int>> copy_old_dst;
        openfpm::vector_gpu<aggregate<unsigned int>> copy_old_src;
        openfpm::vector_gpu<aggregate<int,int,int,int,int>> p_ids;
        openfpm::vector_gpu<aggregate<int,int,int,int,int>> s_ids;

        openfpm::vector_gpu<aggregate<int>> segments_oldData;

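        // solve_conflicts merges the old (already flushed) chunks with the newly inserted ones:
        //  1. compute_predicate marks, for every element of the merged key/index vectors, how it
        //     contributes to the output (new segment, unmatched old chunk, segment boundary, ...);
        //  2. the five predicate columns are exclusive-scanned (openfpm::scan) into s_ids;
        //  3. maps_create turns the scans into segments_oldData, outputMap and the copy_old_* maps;
        //  4. copyKeyToDstIndexIfPredicate compacts the merged keys into keysOut;
        //  5. sparse_vector_reduction_solve_conflict segreduces every requested property into dataOut;
        //  6. copy_old_ker copies the old chunks that had no matching new data into dataOut.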
        template<unsigned int pSegment, typename vector_index_type, typename vector_index_type2, typename vector_data_type, typename ... v_reduce>
        bool solve_conflicts(vector_index_type &keys, vector_index_type &mergeIndices, vector_index_type2 &segments_new, vector_index_type &data_map,
                             vector_data_type &dataOld, vector_data_type &dataNew,
                             vector_index_type &keysOut, vector_data_type &dataOut,
                             gpu::ofp_context_t & context)
        {
#ifdef __NVCC__
            typedef ValueTypeOf<vector_data_type> AggregateT;
            typedef ValueTypeOf<vector_index_type> AggregateIndexT;

            typedef BlockTypeOf<AggregateIndexT, 0> IndexT;

            typedef BlockTypeOf<AggregateT, 0> BlockT0; // The type of the 0-th property

            constexpr unsigned int chunksPerBlock = blockSize / BlockT0::size;
            const unsigned int gridSize = mergeIndices.size() / chunksPerBlock + ((mergeIndices.size() % chunksPerBlock) != 0);

            // Calculate maps

            auto ite = mergeIndices.getGPUIterator();

            p_ids.resize(mergeIndices.size());
            s_ids.resize(mergeIndices.size());

            // silence valgrind's uninitialized-value warning on the last element
            p_ids.template get<1>(p_ids.size()-1) = 0;
            CUDA_LAUNCH(BlockMapGpuKernels::compute_predicate,ite,keys.toKernel(),mergeIndices.toKernel(),dataOld.size(),p_ids.toKernel());

            openfpm::scan((int *)p_ids.template getDeviceBuffer<0>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<0>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<1>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<1>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<2>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<2>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<3>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<3>(),
                          context);

            openfpm::scan((int *)p_ids.template getDeviceBuffer<4>(),
                          s_ids.size(),
                          (int *)s_ids.template getDeviceBuffer<4>(),
                          context);

            // The scans are exclusive: the total of each predicate is the last scanned value
            // plus the last predicate value
            s_ids.template deviceToHost<0,1,2,3>(s_ids.size()-1,s_ids.size()-1);
            p_ids.template deviceToHost<0,1,2,3>(p_ids.size()-1,p_ids.size()-1);

            size_t copy_old_size = s_ids.template get<3>(s_ids.size()-1) + p_ids.template get<3>(p_ids.size()-1);
            size_t seg_old_size = s_ids.template get<1>(s_ids.size()-1) + p_ids.template get<1>(p_ids.size()-1);
            size_t out_map_size = s_ids.template get<1>(s_ids.size()-1) + p_ids.template get<1>(p_ids.size()-1);
            size_t data_out_size = s_ids.template get<2>(s_ids.size()-1) + p_ids.template get<2>(p_ids.size()-1);

            segments_oldData.resize(seg_old_size);
            outputMap.resize(out_map_size);
            copy_old_dst.resize(copy_old_size);
            copy_old_src.resize(copy_old_size);

            CUDA_LAUNCH(BlockMapGpuKernels::maps_create,ite,s_ids.toKernel(),p_ids.toKernel(),segments_oldData.toKernel(),outputMap.toKernel(),copy_old_dst.toKernel(),copy_old_src.toKernel());

            // Create the output for the keys: one key per output chunk
            keysOut.resize(data_out_size);

            ite = keys.getGPUIterator();
            CUDA_LAUNCH(BlockMapGpuKernels::copyKeyToDstIndexIfPredicate,ite,keys.toKernel(), s_ids.toKernel(), keysOut.toKernel());

            // the new keys are now in keysOut

            // Phase 2 - segreduce on all properties
            dataOut.reserve(data_out_size+1);
            dataOut.resize(data_out_size); // Right size for output, i.e. the number of segments
            typedef boost::mpl::vector<v_reduce...> vv_reduce;

            sparse_vector_reduction_solve_conflict<blockSize,decltype(dataOut),
                                                   decltype(data_map),
                                                   decltype(segments_new),
                                                   decltype(outputMap),
                                                   decltype(segments_oldData),
                                                   vv_reduce,BlockFunctor,2, pSegment>
                    svr(dataOut,dataNew,dataNew,dataOld,data_map,segments_new,outputMap,segments_oldData,context);

            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

            // copy the old chunks that did not merge with new data
            if (copy_old_dst.size() != 0)
            {
                CUDA_LAUNCH_DIM3(BlockMapGpuKernels::copy_old_ker,copy_old_dst.size(),blockSize,copy_old_src.toKernel(),dataOld.toKernel(),copy_old_dst.toKernel(),dataOut.toKernel());
            }

            return true; //todo: check if error in kernel
#else // __NVCC__
            std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
            return true;
#endif // __NVCC__
        }

        // Accessors for the output map (the signature of the non-const overload is inferred
        // from the const one below)
        openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap()
        {
            return outputMap;
        }

        const openfpm::vector_gpu<aggregate<unsigned int>> & get_outputMap() const
        {
            return outputMap;
        }
    };
}


//#endif //__NVCC__

#endif //OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH