OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
segreduce_block_cuda_tests.cu
1 //
2 // Created by tommaso on 20/05/19.
3 //
4 
5 #include "config.h"
6 
7 #define BOOST_TEST_DYN_LINK
8 
9 #include <boost/test/unit_test.hpp>
10 #include "Vector/map_vector.hpp"
11 #include "SparseGridGpu/BlockMapGpu.hpp"
12 //todo: here include SparseGridGpu_kernels and remove local kernel definitions
13 
// Combine two reduction operands, honoring their existence flags:
// both exist -> op(a, b); only b exists -> b; otherwise a is returned
// unchanged (including the case where neither exists).
template<typename op, typename ScalarT>
__device__ inline ScalarT applyOp(ScalarT a, ScalarT b, bool aExist, bool bExist)
{
    op reduce_;
    if (bExist)
    {
        return aExist ? reduce_(a, b) : b;
    }
    return a;
}
28 
// Segmented reduction over chunks of data.
//
// Launch configuration:
//   GridSize  = number of segments (one thread block per segment)
//   BlockSize = chunksPerBlock * chunkSize (chunkSize == DataType::size)
//
// Segment s covers the chunk range [segments[s], segments[s+1]) in `data`.
// Phase 1 ("horizontal"): each of the chunksPerBlock chunk slots accumulates
// every chunksPerBlock-th chunk of the segment element-wise into shared
// memory. Phase 2 ("vertical"): the partial chunks are tree-combined.
// Elements flagged non-existent in `masks` are skipped via applyOp; the
// reduced chunk is written to output[segmentId].
template<unsigned int chunksPerBlock, typename op, typename SegType, typename DataType, typename MaskType>
__global__ void
segreduce_block(
        DataType *data,
        SegType *segments,
        MaskType *masks,
        DataType *output
)
{
    unsigned int segmentId = blockIdx.x;
    int segmentSize = segments[segmentId + 1] - segments[segmentId];

    unsigned int start = segments[segmentId];

    unsigned int chunkId = threadIdx.x / DataType::size;   // which chunk slot this thread serves
    unsigned int offset = threadIdx.x % DataType::size;    // element index within the chunk

    __shared__ DataType A[chunksPerBlock];      // per-slot partial reduction
    __shared__ MaskType AMask[chunksPerBlock];  // per-slot accumulated existence mask
    typename DataType::scalarType bReg;
    typename MaskType::scalarType aMask, bMask;

    // Phase 0: Load chunks as first operand of the reduction
    if (chunkId < segmentSize)
    {
        A[chunkId][offset] = data[start + chunkId][offset];
        aMask = masks[start + chunkId][offset];
    }

    // Phase 1: horizontal reduction with stride chunksPerBlock.
    int i = chunksPerBlock;
    for ( ; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
    {
        bReg = data[start + i + chunkId][offset];
        bMask = masks[start + i + chunkId][offset];

        A[chunkId][offset] = applyOp<op>(A[chunkId][offset],
                                         bReg,
                                         BlockMapGpu_ker<>::exist(aMask),
                                         BlockMapGpu_ker<>::exist(bMask));
        aMask = aMask | bMask;
    }

    // Tail: last (possibly partial) stride of chunks.
    if (i + chunkId < segmentSize)
    {
        bReg = data[start + i + chunkId][offset];
        bMask = masks[start + i + chunkId][offset];

        A[chunkId][offset] = applyOp<op>(A[chunkId][offset],
                                         bReg,
                                         BlockMapGpu_ker<>::exist(aMask),
                                         BlockMapGpu_ker<>::exist(bMask));
        aMask = aMask | bMask;
    }

    // NOTE(review): when chunkId >= segmentSize aMask is written uninitialized
    // here; those slots are never read below (guards on segmentSize) — confirm.
    AMask[chunkId][offset] = aMask;

    __syncthreads();

    // Horizontal reduction finished
    // Now vertical reduction: tree-combine the chunksPerBlock partial chunks.
    for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
    {
        if (chunkId % j == 0 && chunkId < segmentSize)
        {
            unsigned int otherChunkId = chunkId + (j / 2);
            if (otherChunkId < segmentSize)
            {
                aMask = AMask[chunkId][offset];
                bMask = AMask[otherChunkId][offset];
                A[chunkId][offset] = applyOp<op>(A[chunkId][offset],
                                                 A[otherChunkId][offset],
                                                 BlockMapGpu_ker<>::exist(aMask),
                                                 BlockMapGpu_ker<>::exist(bMask));
                AMask[chunkId][offset] = aMask | bMask;
            }
        }
        // Barrier outside the divergent branch: reached by all threads.
        __syncthreads();
    }

    // Write output: slot 0 now holds the fully reduced chunk.
    if (chunkId == 0)
    {
        output[segmentId][offset] = A[chunkId][offset];
    }
}
117 
BOOST_AUTO_TEST_SUITE(segreduce_block_cuda_tests)

    // Builds 7 segments over 18 identical chunks (each chunk: elements 1..32
    // existing, elements 32..63 masked out), runs the segmented reduction on
    // data (plus) and masks (max), and checks each output chunk equals
    // segmentSize * (i+1) on existing elements.
    BOOST_AUTO_TEST_CASE (segreduce_block_test)
    {
        typedef float ScalarT;
        typedef DataBlock<unsigned char, 64> MaskBlockT;
        typedef DataBlock<ScalarT, 64> BlockT;
        openfpm::vector_gpu<aggregate<int>> segments;
        segments.resize(8);
        segments.template get<0>(0) = 0;
        segments.template get<0>(1) = 4;
        segments.template get<0>(2) = 5;
        segments.template get<0>(3) = 7;
        segments.template get<0>(4) = 8;
        segments.template get<0>(5) = 11;
        segments.template get<0>(6) = 17;
        segments.template get<0>(7) = 18; // Id of first non-existent data

        segments.template hostToDevice<0>();

        const unsigned int BITMASK = 0, BLOCK = 1;
        BlockT block;
        MaskBlockT mask;
        // First half of each chunk exists (values 1..32), second half does not.
        for (int i = 0; i < 32; ++i)
        {
            block[i] = i + 1;
            mask[i] = 1;
        }
        for (int i = 32; i < 64; ++i)
        {
            block[i] = 666; // sentinel: must never contribute to the result
            mask[i] = 0;
        }

        openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> data;
        data.resize(18);
        for (int i = 0; i < 18; ++i)
        {
            data.template get<BITMASK>(i) = mask;
            data.template get<BLOCK>(i) = block;
        }

        data.template hostToDevice<BITMASK, BLOCK>();

        // Allocate output buffer (one reduced chunk per segment)
        openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> outputData;
        outputData.resize(segments.size()-1);

        // template<unsigned int chunksPerBlock, typename op, typename SegType, typename DataType, typename MaskType>
        // segreduce(DataType *data, SegType *segments, MaskType *masks, DataType *output, MaskType *outputMasks)
        // Segreduce on data: element-wise sum over each segment's chunks.
        CUDA_LAUNCH_DIM3((segreduce_block<2, mgpu::plus_t<ScalarT>>), outputData.size(), 2*BlockT::size,
                (BlockT *) data.template getDeviceBuffer<BLOCK>(),
                (int *) segments.template getDeviceBuffer<0>(),
                (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
                (BlockT *) outputData.template getDeviceBuffer<BLOCK>()
        );

        // Segreduce on mask: max keeps an element "existing" if it exists anywhere
        CUDA_LAUNCH_DIM3((segreduce_block<2, mgpu::maximum_t<unsigned char>>), outputData.size(), 2*BlockT::size,
                (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
                (int *) segments.template getDeviceBuffer<0>(),
                (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
                (MaskBlockT *) outputData.template getDeviceBuffer<BITMASK>()
        );

        outputData.template deviceToHost<BITMASK, BLOCK>();

        for (int j = 0; j < outputData.size(); ++j)
        {
            BlockT outBlock = outputData.template get<BLOCK>(j);
            MaskBlockT outMask = outputData.template get<BITMASK>(j);

            int seg_sz = segments.template get<0>(j+1) - segments.template get<0>(j);

            for (int i = 0; i < BlockT::size; ++i)
            {
                if (i < 32)
                {
                    BOOST_REQUIRE_EQUAL(outBlock[i],seg_sz*(i+1));
                    BOOST_REQUIRE_EQUAL(outMask[i],1);
                }
                else
                {
                    BOOST_REQUIRE_EQUAL(outMask[i],0);
                }
            }
        }
    }

BOOST_AUTO_TEST_SUITE_END()
size_t size()
Stub size.
Definition: map_vector.hpp:211
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT int
[in] The number of segments that comprise the sorting data
Implementation of a 1-D std::vector-like structure.
Definition: map_vector.hpp:202