OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
segreduce_block_cuda_tests.cu
//
// Created by tommaso on 20/05/19.
//

#include "config.h"

#define BOOST_TEST_DYN_LINK

#include <boost/test/unit_test.hpp>
#include "Vector/map_vector.hpp"
#include "SparseGridGpu/BlockMapGpu.hpp"
//todo: here include SparseGridGpu_kernels and remove local kernel definitions

template<typename op, typename ScalarT>
__device__ inline ScalarT applyOp(ScalarT a, ScalarT b, bool aExist, bool bExist)
{
    op op_;
    if (aExist && bExist)
    {
        a = op_(a, b);
    }
    else if (bExist)
    {
        a = b;
    }
    return a;
}
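// applyOp combines two elements under the reduction functor `op` while honouring the
// existence flags: if both operands exist they are reduced, if only b exists it
// replaces a, otherwise a is returned unchanged, so non-existing elements never
// pollute the result.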

// GridSize = number of segments
// BlockSize = chunksPerBlock * chunkSize
//
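// One CUDA block reduces one segment: each thread owns a single (chunkId, offset)
// element, so chunksPerBlock chunks are combined element-wise in parallel and then
// merged across chunks in shared memory.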
template<unsigned int chunksPerBlock, typename op, typename SegType, typename DataType, typename MaskType>
__global__ void
segreduce_block(
        DataType *data,
        SegType *segments,
        MaskType *masks,
        DataType *output
        )
{
    unsigned int segmentId = blockIdx.x;
    int segmentSize = segments[segmentId + 1] - segments[segmentId];

    unsigned int start = segments[segmentId];

    unsigned int chunkId = threadIdx.x / DataType::size;
    unsigned int offset = threadIdx.x % DataType::size;

    __shared__ DataType A[chunksPerBlock];
    __shared__ MaskType AMask[chunksPerBlock];
    typename DataType::scalarType bReg;
    typename MaskType::scalarType aMask, bMask;

    // Phase 0: Load chunks as first operand of the reduction
    if (chunkId < segmentSize)
    {
        A[chunkId][offset] = data[start + chunkId][offset];
        aMask = masks[start + chunkId][offset];
    }

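    // Phase 1: stride through the rest of the segment, chunksPerBlock chunks at a
    // time, accumulating element-wise into the chunks resident in shared memory;
    // the trailing if below handles the last, possibly partial, stride.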
    int i = chunksPerBlock;
    for ( ; i < segmentSize - (int) (chunksPerBlock); i += chunksPerBlock)
    {
        bReg = data[start + i + chunkId][offset];
        bMask = masks[start + i + chunkId][offset];

        A[chunkId][offset] = applyOp<op>(A[chunkId][offset],
                                         bReg,
                                         aMask,
                                         bMask);
        aMask = aMask | bMask;
    }

    if (i + chunkId < segmentSize)
    {
        bReg = data[start + i + chunkId][offset];
        bMask = masks[start + i + chunkId][offset];

        A[chunkId][offset] = applyOp<op>(A[chunkId][offset],
                                         bReg,
                                         aMask,
                                         bMask);
        aMask = aMask | bMask;
    }

    AMask[chunkId][offset] = aMask;

    __syncthreads();

    // Horizontal reduction finished
    // Now vertical reduction
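    // The per-chunk partial results are merged pairwise in a tree: at step j the
    // chunk (chunkId + j/2) is folded into chunk chunkId, so after log2(chunksPerBlock)
    // steps chunk 0 holds the reduction of the whole segment.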
    for (int j = 2; j <= chunksPerBlock && j <= segmentSize; j *= 2)
    {
        if (chunkId % j == 0 && chunkId < segmentSize)
        {
            unsigned int otherChunkId = chunkId + (j / 2);
            if (otherChunkId < segmentSize)
            {
                aMask = AMask[chunkId][offset];
                bMask = AMask[otherChunkId][offset];
                A[chunkId][offset] = applyOp<op>(A[chunkId][offset],
                                                 A[otherChunkId][offset],
                                                 aMask,
                                                 bMask);
                AMask[chunkId][offset] = aMask | bMask;
            }
        }
        __syncthreads();
    }

    // Write output
    if (chunkId == 0)
    {
        output[segmentId][offset] = A[chunkId][offset];
    }
}

BOOST_AUTO_TEST_SUITE(segreduce_block_cuda_tests)

    BOOST_AUTO_TEST_CASE (segreduce_block_test)
    {
        typedef float ScalarT;
        typedef DataBlock<unsigned char, 64> MaskBlockT;
        typedef DataBlock<ScalarT, 64> BlockT;
        openfpm::vector_gpu<aggregate<int>> segments;
        segments.resize(8);
        segments.template get<0>(0) = 0;
        segments.template get<0>(1) = 4;
        segments.template get<0>(2) = 5;
        segments.template get<0>(3) = 7;
        segments.template get<0>(4) = 8;
        segments.template get<0>(5) = 11;
        segments.template get<0>(6) = 17;
        segments.template get<0>(7) = 18; // Id of first non-existent data
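        // The 8 offsets above delimit 7 segments over the 18 input blocks
        // (sizes 4, 1, 2, 1, 3, 6, 1); segment j spans [segments(j), segments(j+1)).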

        segments.template hostToDevice<0>();

        const unsigned int BITMASK = 0, BLOCK = 1;
        BlockT block;
        MaskBlockT mask;
        for (int i = 0; i < 32; ++i)
        {
            block[i] = i + 1;
            mask[i] = 1;
        }
        for (int i = 32; i < 64; ++i)
        {
            block[i] = 666;
            mask[i] = 0;
        }
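        // Every input block carries the values 1..32 in its lower half (flagged as
        // existing by the mask) and a 666 sentinel with mask 0 in the upper half, so
        // only the first 32 elements of each chunk should contribute to the result.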

        openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> data;
        data.resize(18);
        for (int i = 0; i < 18; ++i)
        {
            data.template get<BITMASK>(i) = mask;
            data.template get<BLOCK>(i) = block;
        }

        data.template hostToDevice<BITMASK, BLOCK>();

        // Allocate output buffer
        openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> outputData;
        outputData.resize(segments.size()-1);

        // template<unsigned int chunksPerBlock, typename op, typename SegType, typename DataType, typename MaskType>
        // segreduce(DataType *data, SegType *segments, MaskType *masks, DataType *output, MaskType *outputMasks)
//        segreduce<2, gpu::maximum_t<ScalarT>> <<< outputData.size(), 2*BlockT::size >>> (
        CUDA_LAUNCH_DIM3((segreduce_block<2, gpu::plus_t<ScalarT>>), outputData.size(), 2*BlockT::size,
                (BlockT *) data.template getDeviceBuffer<BLOCK>(),
                (int *) segments.template getDeviceBuffer<0>(),
                (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
                (BlockT *) outputData.template getDeviceBuffer<BLOCK>()
                );
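        // Launch configuration: one CUDA block per output segment and
        // 2*BlockT::size = 128 threads, i.e. chunksPerBlock = 2 chunks of 64
        // elements processed side by side; the data blocks are reduced with plus.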

        // Segreduce on mask
        CUDA_LAUNCH_DIM3((segreduce_block<2, gpu::maximum_t<unsigned char>>), outputData.size(), 2*BlockT::size,
                (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
                (int *) segments.template getDeviceBuffer<0>(),
                (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
                (MaskBlockT *) outputData.template getDeviceBuffer<BITMASK>()
                );
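        // The masks are reduced with maximum, so an output mask element is 1 as soon
        // as the corresponding element exists in at least one block of the segment.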

        outputData.template deviceToHost<BITMASK, BLOCK>();

        for (int j = 0; j < outputData.size(); ++j)
        {
            BlockT outBlock = outputData.template get<BLOCK>(j);
            MaskBlockT outMask = outputData.template get<BITMASK>(j);

            int seg_sz = segments.template get<0>(j+1) - segments.template get<0>(j);

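            // Each segment sums seg_sz identical blocks, so a valid element i must
            // hold seg_sz*(i+1) with its mask still set, while the masked-out upper
            // half must remain non-existing.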
            for (int i = 0; i < BlockT::size; ++i)
            {
                if (i < 32)
                {
                    BOOST_REQUIRE_EQUAL(outBlock[i],seg_sz*(i+1));
                    BOOST_REQUIRE_EQUAL(outMask[i],1);
                }
                else
                {
                    BOOST_REQUIRE_EQUAL(outMask[i],0);
                }
            }
        }
    }

BOOST_AUTO_TEST_SUITE_END()