OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
block_scan_warp_scans3.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
36#include "../../util_arch.cuh"
37#include "../../util_ptx.cuh"
38#include "../../warp/warp_scan.cuh"
39#include "../../util_namespace.cuh"
40
42CUB_NS_PREFIX
43
45namespace cub {
46
50template <
51 typename T,
52 int BLOCK_DIM_X,
53 int BLOCK_DIM_Y,
54 int BLOCK_DIM_Z,
55 int PTX_ARCH>
56struct BlockScanWarpScans
57{
58 //---------------------------------------------------------------------
59 // Types and constants
60 //---------------------------------------------------------------------
61
63 enum
64 {
66 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
67
69 INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
70 OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
71
74 };
75
78
81
82 typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
83
84
86 struct _TempStorage
87 {
89 {
92
93 } aliasable;
94
95 T warp_aggregates[OUTER_WARPS];
96
98 };
99
100
102 struct TempStorage : Uninitialized<_TempStorage> {};
103
104
105 //---------------------------------------------------------------------
106 // Per-thread fields
107 //---------------------------------------------------------------------
108
109 // Thread fields
110 _TempStorage &temp_storage;
111 unsigned int linear_tid;
112 unsigned int warp_id;
113 unsigned int lane_id;
114
115
116 //---------------------------------------------------------------------
117 // Constructors
118 //---------------------------------------------------------------------
119
121 __device__ __forceinline__ BlockScanWarpScans(
122 TempStorage &temp_storage)
123 :
124 temp_storage(temp_storage.Alias()),
125 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
126 warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
127 lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
128 {}
129
130
131 //---------------------------------------------------------------------
132 // Exclusive scans
133 //---------------------------------------------------------------------
134
136 template <typename ScanOp>
137 __device__ __forceinline__ void ExclusiveScan(
138 T input,
139 T &exclusive_output,
140 ScanOp scan_op)
141 {
142 // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
143 T block_aggregate;
144 ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
145 }
146
147
149 template <typename ScanOp>
150 __device__ __forceinline__ void ExclusiveScan(
151 T input,
152 T &exclusive_output,
153 const T &initial_value,
154 ScanOp scan_op)
155 {
156 T block_aggregate;
157 ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
158 }
159
160
162 template <typename ScanOp>
163 __device__ __forceinline__ void ExclusiveScan(
164 T input,
165 T &exclusive_output,
166 ScanOp scan_op,
167 T &block_aggregate)
168 {
169 // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
170 T inclusive_output;
171 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
172 input, inclusive_output, exclusive_output, scan_op);
173
174 // Share outer warp total
175 if (lane_id == OUTER_WARP_THREADS - 1)
176 temp_storage.warp_aggregates[warp_id] = inclusive_output;
177
178 CTA_SYNC();
179
180 if (linear_tid < INNER_WARP_THREADS)
181 {
182 T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
183 T outer_warp_exclusive;
184
185 InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
186 outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
187
188 temp_storage.block_aggregate = block_aggregate;
189 temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
190 }
191
192 CTA_SYNC();
193
194 if (warp_id != 0)
195 {
196 // Retrieve block aggregate
197 block_aggregate = temp_storage.block_aggregate;
198
199 // Apply warp prefix to our lane's partial
200 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
201 exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
202 if (lane_id == 0)
203 exclusive_output = outer_warp_exclusive;
204 }
205 }
206
207
209 template <typename ScanOp>
210 __device__ __forceinline__ void ExclusiveScan(
211 T input,
212 T &exclusive_output,
213 const T &initial_value,
214 ScanOp scan_op,
215 T &block_aggregate)
216 {
217 // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
218 T inclusive_output;
219 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
220 input, inclusive_output, exclusive_output, scan_op);
221
222 // Share outer warp total
223 if (lane_id == OUTER_WARP_THREADS - 1)
224 {
225 temp_storage.warp_aggregates[warp_id] = inclusive_output;
226 }
227
228 CTA_SYNC();
229
230 if (linear_tid < INNER_WARP_THREADS)
231 {
232 T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
233 T outer_warp_exclusive;
234
235 InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
236 outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate);
237
238 temp_storage.block_aggregate = block_aggregate;
239 temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
240 }
241
242 CTA_SYNC();
243
244 // Retrieve block aggregate
245 block_aggregate = temp_storage.block_aggregate;
246
247 // Apply warp prefix to our lane's partial
248 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
249 exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
250 if (lane_id == 0)
251 exclusive_output = outer_warp_exclusive;
252 }
253
254
256 template <
257 typename ScanOp,
258 typename BlockPrefixCallbackOp>
259 __device__ __forceinline__ void ExclusiveScan(
260 T input,
261 T &exclusive_output,
262 ScanOp scan_op,
263 BlockPrefixCallbackOp &block_prefix_callback_op)
264 {
265 // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
266 T inclusive_output;
267 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
268 input, inclusive_output, exclusive_output, scan_op);
269
270 // Share outer warp total
271 if (lane_id == OUTER_WARP_THREADS - 1)
272 temp_storage.warp_aggregates[warp_id] = inclusive_output;
273
274 CTA_SYNC();
275
276 if (linear_tid < INNER_WARP_THREADS)
277 {
278 InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
279
280 T upsweep = temp_storage.warp_aggregates[linear_tid];
281 T downsweep_prefix, block_aggregate;
282
283 inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
284
285 // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
286 T block_prefix = block_prefix_callback_op(block_aggregate);
287 block_prefix = inner_scan.Broadcast(block_prefix, 0);
288
289 downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
290 if (linear_tid == 0)
291 downsweep_prefix = block_prefix;
292
293 temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
294 }
295
296 CTA_SYNC();
297
298 // Apply warp prefix to our lane's partial (or assign it if partial is invalid)
299 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
300 exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
301 if (lane_id == 0)
302 exclusive_output = outer_warp_exclusive;
303 }
304
305
306 //---------------------------------------------------------------------
307 // Inclusive scans
308 //---------------------------------------------------------------------
309
311 template <typename ScanOp>
312 __device__ __forceinline__ void InclusiveScan(
313 T input,
314 T &inclusive_output,
315 ScanOp scan_op)
316 {
317 T block_aggregate;
318 InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
319 }
320
321
323 template <typename ScanOp>
324 __device__ __forceinline__ void InclusiveScan(
325 T input,
326 T &inclusive_output,
327 ScanOp scan_op,
328 T &block_aggregate)
329 {
330 // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
331 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
332 input, inclusive_output, scan_op);
333
334 // Share outer warp total
335 if (lane_id == OUTER_WARP_THREADS - 1)
336 temp_storage.warp_aggregates[warp_id] = inclusive_output;
337
338 CTA_SYNC();
339
340 if (linear_tid < INNER_WARP_THREADS)
341 {
342 T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
343 T outer_warp_exclusive;
344
345 InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
346 outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
347
348 temp_storage.block_aggregate = block_aggregate;
349 temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
350 }
351
352 CTA_SYNC();
353
354 if (warp_id != 0)
355 {
356 // Retrieve block aggregate
357 block_aggregate = temp_storage.block_aggregate;
358
359 // Apply warp prefix to our lane's partial
360 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
361 inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
362 }
363 }
364
365
367 template <
368 typename ScanOp,
369 typename BlockPrefixCallbackOp>
370 __device__ __forceinline__ void InclusiveScan(
371 T input,
372 T &inclusive_output,
373 ScanOp scan_op,
374 BlockPrefixCallbackOp &block_prefix_callback_op)
375 {
376 // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
377 OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
378 input, inclusive_output, scan_op);
379
380 // Share outer warp total
381 if (lane_id == OUTER_WARP_THREADS - 1)
382 temp_storage.warp_aggregates[warp_id] = inclusive_output;
383
384 CTA_SYNC();
385
386 if (linear_tid < INNER_WARP_THREADS)
387 {
388 InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
389
390 T upsweep = temp_storage.warp_aggregates[linear_tid];
391 T downsweep_prefix, block_aggregate;
392 inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
393
394 // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
395 T block_prefix = block_prefix_callback_op(block_aggregate);
396 block_prefix = inner_scan.Broadcast(block_prefix, 0);
397
398 downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
399 if (linear_tid == 0)
400 downsweep_prefix = block_prefix;
401
402 temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
403 }
404
405 CTA_SYNC();
406
407 // Apply warp prefix to our lane's partial
408 T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
409 inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
410 }
411
412
413};
414
415
416} // CUB namespace
417CUB_NS_POSTFIX // Optional outer namespace(s)
418
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lanesrc_lane to all lanes in the warp.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409
CTA_SYNC()
Definition util_ptx.cuh:255
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
Shared memory storage layout type.
T block_aggregate
Shared prefix for the entire thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
WarpScan< T, INNER_WARP_THREADS, PTX_ARCH > InnerWarpScanT
Inner WarpScan utility type.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
WarpScan< T, OUTER_WARP_THREADS, PTX_ARCH > OuterWarpScanT
Outer WarpScan utility type.
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
@ INNER_WARP_THREADS
Number of warp threads.
@ BLOCK_THREADS
The thread block size in threads.
@ OUTER_WARPS
Number of outer scan warps.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
\smemstorage{WarpScan}
InnerWarpScanT::TempStorage inner_warp_scan
Buffer for warp-synchronous inner scan.
Uninitialized< OuterScanArray > outer_warp_scan
Buffer for warp-synchronous outer scans.