#ifndef OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH
#define OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH

#include "BlockMapGpu.hpp"
#include "util/cuda_util.hpp"
#include "BlockMapGpu_dimensionalityWrappers.cuh"
#include "Vector/map_vector_sparse.hpp"
#include "util/cuda/scan_ofp.cuh"
namespace BlockMapGpuKernels
{
	/*! \brief compute the predicates used to build the merge maps
	 *
	 * For every entry of the merged (old + new) key sequence it fills five
	 * predicate lanes; their exclusive scans give each entry its output slot.
	 * Merge indices smaller than m reference the old buffer, the others the
	 * new one.
	 */
	template<typename vector_index_type, typename vector_index_type2>
	__global__ void compute_predicate(vector_index_type vct_keys_merge,
									  vector_index_type vct_index_merge,
									  int m,
									  vector_index_type2 pred_ids)
	{
		int p = blockIdx.x * blockDim.x + threadIdx.x;

		if (p >= vct_index_merge.size())	return;
		unsigned int pp1 = (p+1 == vct_index_merge.size())?p:p+1;
		unsigned int pm1 = (p == 0)?p:p-1;

		// merge indices of the previous and current entry
		auto id0 = vct_index_merge.template get<0>(pm1);
		auto id1 = vct_index_merge.template get<0>(p);

		// keys of the previous, current and next entry
		auto k0 = vct_keys_merge.template get<0>(pm1);
		auto k1 = vct_keys_merge.template get<0>(p);
		auto k2 = vct_keys_merge.template get<0>(pp1);
		// predicate 0: this key run contains an old chunk (fires once per such run)
		pred_ids.template get<0>(p) = ((k0 == k1) && (p != 0) && id0 < m) || (id1 < m && (k1 != k2));

		// predicate 1: this entry comes from the new-data buffer (one entry per insert segment)
		pred_ids.template get<1>(p) = id1 >= m;

		// predicate 2: this entry closes an output chunk (last occurrence of its key)
		pred_ids.template get<2>(p) = (k1 != k2) | (p == vct_index_merge.size()-1);

		// predicate 3: this entry is an old chunk with no new partner (plain copy)
		pred_ids.template get<3>(p) = id1 < m && ((k1 != k2) | (p == vct_index_merge.size()-1));

		// predicate 4: this entry comes from the old-data buffer
		pred_ids.template get<4>(p) = id1 < m;
	}
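	/* Worked example (hypothetical input, m = 2 old chunks):
	 *
	 *   position p  : 0    1    2    3
	 *   key         : 3    5    5    9
	 *   merge index : 0    1    2    3      (index < 2 -> old, >= 2 -> new)
	 *
	 *   pred<0>     : 1    0    1    0      (runs that contain an old chunk)
	 *   pred<1>     : 0    0    1    1      (new entries -> 2 segments)
	 *   pred<2>     : 1    0    1    1      (run closers -> 3 output chunks)
	 *   pred<3>     : 1    0    0    0      (key 3 is old-only: plain copy)
	 *   pred<4>     : 1    1    0    0      (old entries)
	 *
	 * Key 5 exists in both buffers, so its run {p=1, p=2} is neither a plain
	 * copy nor a pure insertion: it becomes a segment that the segmented
	 * reduction below merges with the old chunk.
	 */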
	/*! \brief turn the predicates and their exclusive scans into the merge maps
	 *
	 * Standard stream compaction: where a predicate is true, its scan value is
	 * a unique slot in the corresponding output map.
	 */
	template<typename vector_index_type, typename vector_index_type2, typename vector_index_map_type>
	__global__ void maps_create(vector_index_type2 scan_ids,
								vector_index_type2 pred_ids,
								vector_index_type vct_seg_old,
								vector_index_map_type vct_out_map,
								vector_index_type vct_copy_old_dst,
								vector_index_type vct_copy_old_src)
	{
		int p = blockIdx.x * blockDim.x + threadIdx.x;

		if (p >= scan_ids.size())	return;
		auto id1 = scan_ids.template get<0>(p);
		bool pred_id1 = pred_ids.template get<0>(p);
		auto id2 = scan_ids.template get<1>(p);
		bool pred_id2 = pred_ids.template get<1>(p);
		auto id3 = scan_ids.template get<2>(p);
		auto id4 = scan_ids.template get<3>(p);
		bool pred_id4 = pred_ids.template get<3>(p);
		auto id5 = scan_ids.template get<4>(p);
		// for each segment, the old chunk that has to be merged in (-1 if none)
		if (pred_id2 == true)
		{vct_seg_old.template get<0>(id2) = (pred_id1 == true)?id1:-1;}

		// for each segment, the output chunk it is reduced into
		if (pred_id2 == true)
		{vct_out_map.template get<0>(id2) = id3;}

		// for each unmerged old chunk, its source and destination position
		if (pred_id4 == true)
		{
			vct_copy_old_dst.template get<0>(id4) = id3;
			vct_copy_old_src.template get<0>(id4) = id5;
		}
	}
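	/* Continuing the example above: the exclusive scans of the predicate lanes
	 * are s<0> = [0,1,1,2], s<1> = [0,0,0,1], s<2> = [0,1,1,2], s<3> = [0,1,1,1],
	 * s<4> = [0,1,2,2]. The two writes guarded by pred<1> fire at p=2 and p=3:
	 * vct_seg_old = [1,-1] (segment 0 merges old chunk 1, segment 1 has no old
	 * partner) and vct_out_map = [1,2] (the segments land in output chunks 1
	 * and 2). The write guarded by pred<3> fires at p=0: copy_old_src = [0],
	 * copy_old_dst = [0], i.e. old chunk 0 is copied verbatim to output chunk 0.
	 */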
	//! \brief scatter the chunks of the old buffer into the given positions of the new one
	template<typename vector_index_type, typename vector_data_type>
	__global__ void copy_old(vector_data_type vct_data_new,
							 vector_index_type index_to_copy,
							 vector_data_type vct_data_old)
	{
		int p = blockIdx.x * blockDim.x + threadIdx.x;

		if (p >= index_to_copy.size())	return;

		auto id = index_to_copy.template get<0>(p);

		vct_data_new.get(id) = vct_data_old.get(p);
	}
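	/* copy_old is a plain scatter: chunk p of the old buffer is written to
	 * position index_to_copy<0>(p) of the new one, e.g. index_to_copy = [0,3]
	 * sends old chunk 0 to slot 0 and old chunk 1 to slot 3.
	 */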
	//! \brief reset the mask property of the insert buffer to zero
	template<unsigned int maskProp, unsigned int chunksPerBlock, typename InsertBufferT>
	__global__ void initializeInsertBuffer(InsertBufferT insertBuffer)
	{
#ifdef __NVCC__
		typedef typename InsertBufferT::value_type AggregateT;
		typedef BlockTypeOf<AggregateT, maskProp> MaskT;

		int pos = blockIdx.x * blockDim.x + threadIdx.x;
		const unsigned int dataBlockId = pos / MaskT::size;
		const unsigned int offset = pos % MaskT::size;
		const unsigned int chunkOffset = dataBlockId % chunksPerBlock;

		__shared__ MaskT mask[chunksPerBlock];

		if (dataBlockId < insertBuffer.size())
		{
			// each thread clears one element of the shared mask chunk; the
			// block-wise operator= then spreads the copy across the chunk's threads
			mask[chunkOffset][offset] = 0;

			insertBuffer.template get<maskProp>(dataBlockId) = mask[chunkOffset];
		}
#else // __NVCC__
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
	}
	/*! \brief apply the reduction op to a and b, honouring their existence flags
	 *
	 * If both operands exist they are reduced, if only b exists it overwrites a,
	 * otherwise a is left untouched.
	 */
	template<typename op, typename ScalarT>
	__device__ inline void applyOp(ScalarT &a, ScalarT b, bool aExist, bool bExist)
	{
#ifdef __NVCC__
		if (aExist && bExist)
		{
			a = op::red(a, b);
		}
		else if (bExist)
		{
			a = b;
		}
#else // __NVCC__
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
	}
	/*! \brief segmented reduction of the new chunks, merged with the linked old chunk
	 *
	 * One block per segment; each chunk slot folds a stride of the segment,
	 * the slots are combined with a tree reduction, and the old chunk (if any)
	 * is folded in last.
	 */
	template<unsigned int p,
			unsigned int pSegment,
			unsigned int pMask,
			unsigned int chunksPerBlock,
			typename op,
			typename IndexVector_segdataT,
			typename IndexVector_datamapT,
			typename IndexVector_segoldT,
			typename IndexVector_outmapT,
			typename DataVectorT>
	__global__ void segreduce_total(
			DataVectorT data_new,
			DataVectorT data_old,
			IndexVector_segdataT segments_data,
			IndexVector_datamapT segments_dataMap,
			IndexVector_segoldT segments_dataOld,
			IndexVector_outmapT outputMap,
			DataVectorT output)
	{
#ifdef __NVCC__
		typedef typename DataVectorT::value_type AggregateT;
		typedef BlockTypeOf<AggregateT, p> DataType;
		typedef BlockTypeOf<AggregateT, pMask> MaskType;
		typedef typename std::remove_all_extents<DataType>::type BaseBlockType;
		constexpr unsigned int chunkSize = BaseBlockType::size;

		// one block per segment
		unsigned int segmentId = blockIdx.x;
		int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
						  - segments_data.template get<pSegment>(segmentId);

		unsigned int start = segments_data.template get<pSegment>(segmentId);

		unsigned int chunkId = threadIdx.x / chunkSize;
		unsigned int offset = threadIdx.x % chunkSize;

		__shared__ ArrayWrapper<DataType> A[chunksPerBlock];
		__shared__ MaskType AMask[chunksPerBlock];
		typename ComposeArrayType<DataType>::type bReg;
		typename MaskType::scalarType aMask, bMask;

		// phase 0: each chunk slot loads the first chunk of its stride
		if (chunkId < segmentSize)
		{
			unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);

			generalDimensionFunctor<DataType>::assignWithOffsetRHS(A[chunkId][offset],
					data_new.template get<p>(m_chunkId),
					offset);
			aMask = data_new.template get<pMask>(m_chunkId)[offset];
		}
		else
		{
			aMask = 0;
		}

		// phase 1: strided fold of the remaining chunks of the segment
		int i = chunksPerBlock;
		for (; i < segmentSize - (int)(chunksPerBlock); i += chunksPerBlock)
		{
			unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

			generalDimensionFunctor<DataType>::loadBlock(
					data_new.template get<p>(m_chunkId),
					bReg);
			bMask = data_new.template get<pMask>(m_chunkId)[offset];

			// the mask bits gate the reduction: inactive elements are skipped
			generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
					bReg,
					BlockMapGpu_ker<>::exist(aMask),
					BlockMapGpu_ker<>::exist(bMask));

			aMask = aMask | bMask;
		}

		// tail of the strided fold
		if (i + chunkId < segmentSize)
		{
			unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

			generalDimensionFunctor<DataType>::loadBlock(
					data_new.template get<p>(m_chunkId),
					bReg);
			bMask = data_new.template get<pMask>(m_chunkId)[offset];

			generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
					bReg,
					BlockMapGpu_ker<>::exist(aMask),
					BlockMapGpu_ker<>::exist(bMask));

			aMask = aMask | bMask;
		}

		AMask[chunkId][offset] = aMask;

		__syncthreads();

		// phase 2: tree reduction across the chunk slots in shared memory
		for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
		{
			if (chunkId % j == 0 && chunkId < segmentSize)
			{
				unsigned int otherChunkId = chunkId + (j / 2);
				if (otherChunkId < segmentSize)
				{
					aMask = AMask[chunkId][offset];
					bMask = AMask[otherChunkId][offset];
					generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
							A[otherChunkId][offset],
							BlockMapGpu_ker<>::exist(aMask),
							BlockMapGpu_ker<>::exist(bMask));
					AMask[chunkId][offset] = aMask | bMask;
				}
			}
			__syncthreads();
		}

		// phase 3: fold in the old chunk linked to this segment, if any
		int seg_old = segments_dataOld.template get<0>(segmentId);

		if (seg_old != -1 && chunkId == 0)
		{
			aMask = AMask[0][offset];
			bMask = data_old.template get<pMask>(seg_old)[offset];
			generalDimensionFunctor<DataType>::template applyOp<op>(A[0][offset],
					data_old.template get<p>(seg_old)[offset],
					BlockMapGpu_ker<>::exist(aMask),
					BlockMapGpu_ker<>::exist(bMask));

			AMask[0][offset] = aMask | bMask;
		}

		__syncthreads();

		// write the reduced chunk to its output slot
		unsigned int out_id = outputMap.template get<0>(segmentId);

		if (chunkId == 0)
		{
			generalDimensionFunctor<DataType>::assignWithOffset(output.template get<p>(out_id),
					A[0][offset],
					offset);
		}
#else // __NVCC__
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
	}
	/*! \brief same as segreduce_total, but the final store also writes the reduced mask
	 */
	template<unsigned int p,
			unsigned int pSegment,
			unsigned int pMask,
			unsigned int chunksPerBlock,
			typename op,
			typename IndexVector_segdataT,
			typename IndexVector_datamapT,
			typename IndexVector_segoldT,
			typename IndexVector_outmapT,
			typename DataVectorT>
	__global__ void segreduce_total_with_mask(
			DataVectorT data_new,
			DataVectorT data_old,
			IndexVector_segdataT segments_data,
			IndexVector_datamapT segments_dataMap,
			IndexVector_segoldT segments_dataOld,
			IndexVector_outmapT outputMap,
			DataVectorT output)
	{
#ifdef __NVCC__
		typedef typename DataVectorT::value_type AggregateT;
		typedef BlockTypeOf<AggregateT, p> DataType;
		typedef BlockTypeOf<AggregateT, pMask> MaskType;
		typedef typename std::remove_all_extents<DataType>::type BaseBlockType;
		constexpr unsigned int chunkSize = BaseBlockType::size;

		// one block per segment
		unsigned int segmentId = blockIdx.x;
		int segmentSize = segments_data.template get<pSegment>(segmentId + 1)
						  - segments_data.template get<pSegment>(segmentId);

		unsigned int start = segments_data.template get<pSegment>(segmentId);

		unsigned int chunkId = threadIdx.x / chunkSize;
		unsigned int offset = threadIdx.x % chunkSize;

		__shared__ ArrayWrapper<DataType> A[chunksPerBlock];
		__shared__ MaskType AMask[chunksPerBlock];
		typename ComposeArrayType<DataType>::type bReg;
		typename MaskType::scalarType aMask, bMask;

		// phase 0: each chunk slot loads the first chunk of its stride
		if (chunkId < segmentSize)
		{
			unsigned int m_chunkId = segments_dataMap.template get<0>(start + chunkId);

			generalDimensionFunctor<DataType>::assignWithOffsetRHS(A[chunkId][offset],
					data_new.template get<p>(m_chunkId),
					offset);
			aMask = data_new.template get<pMask>(m_chunkId)[offset];
		}
		else
		{
			aMask = 0;
		}

		// phase 1: strided fold of the remaining chunks of the segment
		int i = chunksPerBlock;
		for (; i < segmentSize - (int)(chunksPerBlock); i += chunksPerBlock)
		{
			unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

			generalDimensionFunctor<DataType>::loadBlock(
					data_new.template get<p>(m_chunkId),
					bReg);
			bMask = data_new.template get<pMask>(m_chunkId)[offset];

			generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
					bReg,
					BlockMapGpu_ker<>::exist(aMask),
					BlockMapGpu_ker<>::exist(bMask));

			aMask = aMask | bMask;
		}

		// tail of the strided fold
		if (i + chunkId < segmentSize)
		{
			unsigned int m_chunkId = segments_dataMap.template get<0>(start + i + chunkId);

			generalDimensionFunctor<DataType>::loadBlock(
					data_new.template get<p>(m_chunkId),
					bReg);
			bMask = data_new.template get<pMask>(m_chunkId)[offset];

			generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
					bReg,
					BlockMapGpu_ker<>::exist(aMask),
					BlockMapGpu_ker<>::exist(bMask));

			aMask = aMask | bMask;
		}

		AMask[chunkId][offset] = aMask;

		__syncthreads();

		// phase 2: tree reduction across the chunk slots in shared memory
		for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
		{
			if (chunkId % j == 0 && chunkId < segmentSize)
			{
				unsigned int otherChunkId = chunkId + (j / 2);
				if (otherChunkId < segmentSize)
				{
					aMask = AMask[chunkId][offset];
					bMask = AMask[otherChunkId][offset];
					generalDimensionFunctor<DataType>::template applyOp<op>(A[chunkId][offset],
							A[otherChunkId][offset],
							BlockMapGpu_ker<>::exist(aMask),
							BlockMapGpu_ker<>::exist(bMask));
					AMask[chunkId][offset] = aMask | bMask;
				}
			}
			__syncthreads();
		}

		// phase 3: fold in the old chunk linked to this segment, if any
		int seg_old = segments_dataOld.template get<0>(segmentId);

		if (seg_old != -1 && chunkId == 0)
		{
			aMask = AMask[0][offset];
			bMask = data_old.template get<pMask>(seg_old)[offset];
			generalDimensionFunctor<DataType>::template applyOp<op>(A[0][offset],
					data_old.template get<p>(seg_old)[offset],
					BlockMapGpu_ker<>::exist(aMask),
					BlockMapGpu_ker<>::exist(bMask));

			AMask[0][offset] = aMask | bMask;
		}

		__syncthreads();

		// write the reduced chunk and its mask to the output slot
		unsigned int out_id = outputMap.template get<0>(segmentId);

		if (chunkId == 0)
		{
			generalDimensionFunctor<DataType>::assignWithOffset(output.template get<p>(out_id),
					A[0][offset],
					offset);
			output.template get<pMask>(out_id)[offset] = AMask[0][offset];
		}
#else // __NVCC__
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
	}
	//! \brief copy the chunks listed in srcIndices from src to the dstIndices positions of dst
	template<typename DataVectorT, typename IndexVectorT>
	__global__ void copy_old_ker(IndexVectorT srcIndices, DataVectorT src, IndexVectorT dstIndices, DataVectorT dst)
	{
#ifdef __NVCC__
		typedef typename DataVectorT::value_type AggregateT;
		typedef BlockTypeOf<AggregateT, 0> BlockT0;
		unsigned int chunkSize = BlockT0::size;

		unsigned int chunksPerBlock = blockDim.x / chunkSize;
		unsigned int chunkOffset = threadIdx.x / chunkSize;

		unsigned int chunkBasePos = blockIdx.x * chunksPerBlock;

		unsigned int p = chunkBasePos + chunkOffset;
		if (p < srcIndices.size())
		{
			auto dstId = dstIndices.template get<0>(p);
			auto srcId = srcIndices.template get<0>(p);

			// the aggregate assignment copies the whole chunk, spread across
			// the chunk's threads
			dst.get(dstId) = src.get(srcId);
		}
#else // __NVCC__
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
	}
	//! \brief write the key of each output chunk to its slot in the output key vector
	template<typename IndexVectorT, typename IndexVectorT2>
	__global__ void copyKeyToDstIndexIfPredicate(IndexVectorT keys, IndexVectorT2 dstIndices, IndexVectorT out)
	{
#ifdef __NVCC__
		unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

		if (pos >= dstIndices.size())	{return;}

		unsigned int pm1 = (pos == 0)?0:pos-1;

		// true on the first entry of every output chunk (the scan value changes there)
		bool predicate = dstIndices.template get<2>(pos) != dstIndices.template get<2>(pm1) || pos == 0;

		if (predicate)
		{
			auto dstPos = dstIndices.template get<2>(pos);
			out.template get<0>(dstPos) = keys.template get<0>(pos);
		}
#else // __NVCC__
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
	}
}

/*! \brief functor applied to each reduction in the list: it launches the
 *         segmented reduction that merges old and new chunks for one property
 */
template<unsigned int blockSize,
		 typename vector_data_type,
		 typename vector_datamap_type,
		 typename vector_segoffset_type,
		 typename vector_outmap_type,
		 typename vector_segolddata_type,
		 typename vector_reduction,
		 typename block_functor,
		 unsigned int impl2,
		 unsigned int pSegment=1>
struct sparse_vector_reduction_solve_conflict
{
	//! reduced (output) data
	vector_data_type & vector_data_red;

	//! new data to reduce
	vector_data_type & vector_data;

	//! old data
	vector_data_type & vector_data_old;

	//! new data in insertion order
	vector_data_type & vector_data_unsorted;

	//! segment offsets
	vector_segoffset_type & segment_offset;

	//! map from segment element to the unsorted buffer
	vector_datamap_type & vector_data_map;

	//! for each segment, the output chunk it is reduced into
	vector_outmap_type & out_map;

	//! for each segment, the old chunk to merge in (-1 if none)
	vector_segolddata_type & segments_oldData;

	//! gpu context
	gpu::ofp_context_t & context;
	inline sparse_vector_reduction_solve_conflict(vector_data_type & vector_data_red,
												  vector_data_type & vector_data,
												  vector_data_type & vector_data_unsorted,
												  vector_data_type & vector_data_old,
												  vector_datamap_type & vector_data_map,
												  vector_segoffset_type & segment_offset,
												  vector_outmap_type & out_map,
												  vector_segolddata_type & segments_oldData,
												  gpu::ofp_context_t & context)
	:vector_data_red(vector_data_red),
	 vector_data(vector_data),
	 vector_data_unsorted(vector_data_unsorted),
	 vector_data_old(vector_data_old),
	 segment_offset(segment_offset),
	 vector_data_map(vector_data_map),
	 out_map(out_map),
	 segments_oldData(segments_oldData),
	 context(context)
	{};
	//! apply the reduction for the property selected by T
	template<typename T>
	inline void operator()(T& t) const
	{
#ifdef __NVCC__
		typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
		typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,
										typename reduction_type::prop>::type red_type;

		if (reduction_type::is_special() == false)
		{
			typedef typename std::remove_reference<vector_data_type>::type::value_type AggregateT;

			typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
			typedef typename std::remove_all_extents<typename boost::mpl::at<
					typename std::remove_reference<vector_data_type>::type::value_type::type,
					typename reduction_type::prop
					>::type>::type::scalarType red_type;
			typedef typename reduction_type::template op_red<red_type> red_op;

			constexpr unsigned int p = reduction_type::prop::value;
			constexpr unsigned int pMask = AggregateT::max_prop_real - 1;

			typedef typename std::remove_all_extents<BlockTypeOf<AggregateT, p>>::type BlockT;

			constexpr unsigned int chunksPerBlock = blockSize / BlockT::size;
			const unsigned int gridSize = segment_offset.size() - 1; // segment_offset holds the segment begins plus the final end
			// the first reduced property also writes the output mask
			// (assumption: T::value == 0 selects the masked variant)
			if (T::value == 0)
			{
				CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total_with_mask<p, pSegment, pMask, chunksPerBlock, red_op>),gridSize, blockSize,
						vector_data.toKernel(),
						vector_data_old.toKernel(),
						segment_offset.toKernel(),
						vector_data_map.toKernel(),
						segments_oldData.toKernel(),
						out_map.toKernel(),
						vector_data_red.toKernel());
			}
			else
			{
				CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<p, pSegment, pMask, chunksPerBlock, red_op>),gridSize, blockSize,
						vector_data.toKernel(),
						vector_data_old.toKernel(),
						segment_offset.toKernel(),
						vector_data_map.toKernel(),
						segments_oldData.toKernel(),
						out_map.toKernel(),
						vector_data_red.toKernel());
			}
		}
#else // __NVCC__
		std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
	}
};
namespace BlockMapGpuFunctors
{
	/*! \brief functor that contains the machinery to merge old and new chunks at flush time
	 */
	template<unsigned int blockSize>
	struct BlockFunctor
	{
		/*! \brief merge the old and the new (inserted) chunks, reducing the entries that share the same key
		 */
		template<unsigned int pSegment, typename vector_index_type, typename vector_index_type2, typename vector_data_type, typename ... v_reduce>
		static bool solve_conflicts(vector_index_type &keys, vector_index_type &mergeIndices, vector_index_type2 &segments_new, vector_index_type &data_map,
							 vector_data_type &dataOld, vector_data_type &dataNew,
							 vector_index_type &keysOut, vector_data_type &dataOut,
							 gpu::ofp_context_t & context)
		{
#ifdef __NVCC__
			typedef ValueTypeOf<vector_data_type> AggregateT;
			typedef ValueTypeOf<vector_index_type> AggregateIndexT;

			typedef BlockTypeOf<AggregateIndexT, 0> IndexT;

			typedef BlockTypeOf<AggregateT, 0> BlockT0;

			constexpr unsigned int chunksPerBlock = blockSize / BlockT0::size;
			const unsigned int gridSize = mergeIndices.size() / chunksPerBlock + ((mergeIndices.size() % chunksPerBlock) != 0);
			auto ite = mergeIndices.getGPUIterator();

			// predicate and scan buffers, one lane per predicate of
			// compute_predicate (scratch vectors; the int-lane layout is assumed)
			openfpm::vector_gpu<aggregate<int,int,int,int,int>> p_ids;
			openfpm::vector_gpu<aggregate<int,int,int,int,int>> s_ids;

			p_ids.resize(mergeIndices.size());
			s_ids.resize(mergeIndices.size());

			p_ids.template get<1>(p_ids.size()-1) = 0;
			CUDA_LAUNCH(BlockMapGpuKernels::compute_predicate,ite,keys.toKernel(),mergeIndices.toKernel(),dataOld.size(),p_ids.toKernel());
			// exclusive scan of each predicate lane: the scans turn the
			// predicates into output slots and counters
			openfpm::scan((int *)p_ids.template getDeviceBuffer<0>(),
						  p_ids.size(),
						  (int *)s_ids.template getDeviceBuffer<0>(),
						  context);

			openfpm::scan((int *)p_ids.template getDeviceBuffer<1>(),
						  p_ids.size(),
						  (int *)s_ids.template getDeviceBuffer<1>(),
						  context);

			openfpm::scan((int *)p_ids.template getDeviceBuffer<2>(),
						  p_ids.size(),
						  (int *)s_ids.template getDeviceBuffer<2>(),
						  context);

			openfpm::scan((int *)p_ids.template getDeviceBuffer<3>(),
						  p_ids.size(),
						  (int *)s_ids.template getDeviceBuffer<3>(),
						  context);

			openfpm::scan((int *)p_ids.template getDeviceBuffer<4>(),
						  p_ids.size(),
						  (int *)s_ids.template getDeviceBuffer<4>(),
						  context);

			s_ids.template deviceToHost<0,1,2,3>(s_ids.size()-1,s_ids.size()-1);
			p_ids.template deviceToHost<0,1,2,3>(p_ids.size()-1,p_ids.size()-1);

			// total count of each predicate = last exclusive-scan value + last predicate value
			size_t copy_old_size = s_ids.template get<3>(s_ids.size()-1) + p_ids.template get<3>(p_ids.size()-1);
			size_t seg_old_size = s_ids.template get<1>(s_ids.size()-1) + p_ids.template get<1>(p_ids.size()-1);
			size_t out_map_size = s_ids.template get<1>(s_ids.size()-1) + p_ids.template get<1>(p_ids.size()-1);
			size_t data_out_size = s_ids.template get<2>(s_ids.size()-1) + p_ids.template get<2>(p_ids.size()-1);
			// support structures for the merge (scratch vectors; the int-lane layout is assumed)
			openfpm::vector_gpu<aggregate<int>> segments_oldData;
			openfpm::vector_gpu<aggregate<int>> outputMap;
			openfpm::vector_gpu<aggregate<int>> copy_old_dst;
			openfpm::vector_gpu<aggregate<int>> copy_old_src;

			segments_oldData.resize(seg_old_size);
			outputMap.resize(out_map_size);
			copy_old_dst.resize(copy_old_size);
			copy_old_src.resize(copy_old_size);

			CUDA_LAUNCH(BlockMapGpuKernels::maps_create,ite,s_ids.toKernel(),p_ids.toKernel(),segments_oldData.toKernel(),outputMap.toKernel(),copy_old_dst.toKernel(),copy_old_src.toKernel());
			// create the output keys, one per output chunk
			keysOut.resize(data_out_size);

			ite = keys.getGPUIterator();
			CUDA_LAUNCH(BlockMapGpuKernels::copyKeyToDstIndexIfPredicate,ite,keys.toKernel(), s_ids.toKernel(), keysOut.toKernel());
			// allocate the output data, reserving one extra chunk
			dataOut.reserve(data_out_size+1);
			dataOut.resize(data_out_size);

			typedef boost::mpl::vector<v_reduce...> vv_reduce;

			// run one segmented reduction per reduction operator in v_reduce
			sparse_vector_reduction_solve_conflict<blockSize,
					decltype(dataOut),
					decltype(data_map),
					decltype(segments_new),
					decltype(outputMap),
					decltype(segments_oldData),
					vv_reduce,BlockFunctor,2, pSegment>
					svr(dataOut,dataNew,dataNew,dataOld,data_map,segments_new,outputMap,segments_oldData,context);

			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
			// copy the old chunks that survived without conflicts
			if (copy_old_dst.size() != 0)
			{
				CUDA_LAUNCH_DIM3(BlockMapGpuKernels::copy_old_ker,copy_old_dst.size(),blockSize,copy_old_src.toKernel(),dataOld.toKernel(),copy_old_dst.toKernel(),dataOut.toKernel());
			}

			return true;
#else // __NVCC__
			std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
			return true;
#endif // __NVCC__
		}
	};
}
#endif // OPENFPM_PDATA_BLOCKMAPGPU_KERNELS_CUH