OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
block_scan_warp_scans.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
36#include "../../util_arch.cuh"
37#include "../../util_ptx.cuh"
38#include "../../warp/warp_scan.cuh"
39#include "../../util_namespace.cuh"
40
42CUB_NS_PREFIX
43
45namespace cub {
46
// BlockScanWarpScans: warp-scan-based variant of a block-wide parallel prefix
// scan, parameterized by item type T, the 3D thread block dimensions, and the
// targeted PTX architecture.
// NOTE(review): the struct declaration line itself was elided by the
// documentation generator that produced this view; the identifier is
// BlockScanWarpScans per this page's symbol index — confirm against the
// original CUB header.
50template <
51 typename T,
52 int BLOCK_DIM_X,
53 int BLOCK_DIM_Y,
54 int BLOCK_DIM_Z,
55 int PTX_ARCH>
57{
58 //---------------------------------------------------------------------
59 // Types and constants
60 //---------------------------------------------------------------------
61
// Compile-time constants used throughout the scan implementation.
63 enum
64 {
// Number of threads per warp for the targeted PTX architecture.
66 WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
67
// Total number of threads in the (possibly 3D) thread block.
69 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
70
// NOTE(review): the WARPS enumerator ("number of active warps" per this
// page's symbol index) was elided by the documentation generator here;
// presumably ceil(BLOCK_THREADS / WARP_THREADS) — confirm against the
// original CUB header.
73 };
74
77
80
82
// Shared-memory storage layout for the block scan, 32-byte aligned.
83 struct __align__(32) _TempStorage
84 {
// One aggregate slot per warp; written by each warp's last lane in
// ComputeWarpPrefix and read by all threads after a barrier.
85 T warp_aggregates[WARPS];
// Per-warp storage for the underlying WarpScanT primitives.
86 typename WarpScanT::TempStorage warp_scan[WARPS];
// Broadcast slot for the block-wide prefix obtained from the
// prefix-callback overloads of ExclusiveScan/InclusiveScan.
87 T block_prefix;
88 };
89
90
// Alias wrapper: lets the caller union this storage with other shared
// allocations without running _TempStorage's constructors.
92 struct TempStorage : Uninitialized<_TempStorage> {};
93
94
95 //---------------------------------------------------------------------
96 // Per-thread fields
97 //---------------------------------------------------------------------
98
99 // Thread fields
100 _TempStorage &temp_storage;
101 unsigned int linear_tid;
102 unsigned int warp_id;
103 unsigned int lane_id;
104
105
106 //---------------------------------------------------------------------
107 // Constructors
108 //---------------------------------------------------------------------
109
// Constructor: binds the caller's shared temp storage and derives this
// thread's row-major linear id, owning warp id, and lane id. When the block
// fits in a single warp, warp_id is fixed to 0 (avoids the division).
111 __device__ __forceinline__ BlockScanWarpScans(
112 TempStorage &temp_storage)
113 :
114 temp_storage(temp_storage.Alias()),
115 linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
116 warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
117 lane_id(LaneId())
118 {}
119
120
121 //---------------------------------------------------------------------
122 // Utility methods
123 //---------------------------------------------------------------------
124
// Recursive step of a compile-time-unrolled loop over warp indices
// [WARP, WARPS): folds warp WARP's shared aggregate into the running
// block_aggregate via scan_op. Before the fold, the running total is
// captured as warp_prefix for the calling warp when warp_id == WARP, so a
// warp's prefix excludes its own aggregate. Recursion terminates at the
// Int2Type<WARPS> overload.
125 template <typename ScanOp, int WARP>
126 __device__ __forceinline__ void ApplyWarpAggregates(
127 T &warp_prefix,
128 ScanOp scan_op,
129 T &block_aggregate,
130 Int2Type<WARP> /*addend_warp*/)
131 {
132 if (warp_id == WARP)
133 warp_prefix = block_aggregate;
134
135 T addend = temp_storage.warp_aggregates[WARP];
136 block_aggregate = scan_op(block_aggregate, addend);
137
// Tail of the compile-time recursion: advance to the next warp index.
138 ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
139 }
140
// Base case of the compile-time recursion above: reached once the addend
// warp index equals WARPS; nothing left to accumulate.
141 template <typename ScanOp>
142 __device__ __forceinline__ void ApplyWarpAggregates(
143 T &/*warp_prefix*/,
144 ScanOp /*scan_op*/,
145 T &/*block_aggregate*/,
146 Int2Type<WARPS> /*addend_warp*/)
147 {}
148
149
// Uses the per-warp aggregates to compute the calling warp's prefix (the
// scan of all preceding warps' aggregates) and the block-wide aggregate
// (returned through block_aggregate). The value returned for warp0 is left
// uninitialized/invalid — callers guard its use with (warp_id != 0).
// Requires all threads of the block to participate (contains CTA_SYNC).
151 template <typename ScanOp>
152 __device__ __forceinline__ T ComputeWarpPrefix(
153 ScanOp scan_op,
154 T warp_aggregate,
155 T &block_aggregate)
156 {
157 // Last lane in each warp shares its warp-aggregate
158 if (lane_id == WARP_THREADS - 1)
159 temp_storage.warp_aggregates[warp_id] = warp_aggregate;
160
// Barrier: make all warp aggregates visible before any thread reads them.
161 CTA_SYNC();
162
163 // Accumulate block aggregates and save the one that is our warp's prefix
164 T warp_prefix;
165 block_aggregate = temp_storage.warp_aggregates[0];
166
167 // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
168 ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
169/*
170 #pragma unroll
171 for (int WARP = 1; WARP < WARPS; ++WARP)
172 {
173 if (warp_id == WARP)
174 warp_prefix = block_aggregate;
175
176 T addend = temp_storage.warp_aggregates[WARP];
177 block_aggregate = scan_op(block_aggregate, addend);
178 }
179*/
180
181 return warp_prefix;
182 }
183
184
// As ComputeWarpPrefix above, but seeds every warp's prefix with
// initial_value. Note: block_aggregate is NOT seeded — it remains the pure
// scan of the inputs; initial_value is folded only into the prefix.
186 template <typename ScanOp>
187 __device__ __forceinline__ T ComputeWarpPrefix(
188 ScanOp scan_op,
189 T warp_aggregate,
190 T &block_aggregate,
191 const T &initial_value)
192 {
193 T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
194
// For warp0 this applies scan_op to an invalid prefix, but the result is
// immediately overwritten below, so the value never escapes.
195 warp_prefix = scan_op(initial_value, warp_prefix);
196
// Warp0 has no preceding warps: its prefix is exactly the initial value.
197 if (warp_id == 0)
198 warp_prefix = initial_value;
199
200 return warp_prefix;
201 }
202
203 //---------------------------------------------------------------------
204 // Exclusive scans
205 //---------------------------------------------------------------------
206
// Block-wide exclusive scan with no initial value: delegates to the
// aggregate-returning overload and discards the aggregate. With no seed,
// tid0's exclusive output is undefined (invalid).
208 template <typename ScanOp>
209 __device__ __forceinline__ void ExclusiveScan(
210 T input,
211 T &exclusive_output,
212 ScanOp scan_op)
213 {
214 // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
215 T block_aggregate;
216 ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
217 }
218
219
// Block-wide exclusive scan seeded with initial_value (tid0's output is
// initial_value): delegates to the aggregate-returning overload and
// discards the aggregate.
221 template <typename ScanOp>
222 __device__ __forceinline__ void ExclusiveScan(
223 T input,
224 T &exclusive_output,
225 const T &initial_value,
226 ScanOp scan_op)
227 {
228 T block_aggregate;
229 ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
230 }
231
232
// Block-wide exclusive scan, also returning the block-wide aggregate.
// Strategy: warp-local scan, then stitch warps together by prepending each
// warp's prefix. tid0's exclusive output remains invalid (no seed value).
234 template <typename ScanOp>
235 __device__ __forceinline__ void ExclusiveScan(
236 T input,
237 T &exclusive_output,
238 ScanOp scan_op,
239 T &block_aggregate)
240 {
241 // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
242 T inclusive_output;
243 WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
244
245 // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
246 T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
247
248 // Apply warp prefix to our lane's partial
249 if (warp_id != 0)
250 {
251 exclusive_output = scan_op(warp_prefix, exclusive_output);
// Lane0's warp-local exclusive value was invalid; its exclusive output
// is exactly the warp's prefix.
252 if (lane_id == 0)
253 exclusive_output = warp_prefix;
254 }
255 }
256
257
// Block-wide exclusive scan with initial_value, also returning the
// block-wide aggregate. Note block_aggregate does NOT include
// initial_value (the seeded ComputeWarpPrefix folds it only into the
// per-warp prefix).
259 template <typename ScanOp>
260 __device__ __forceinline__ void ExclusiveScan(
261 T input,
262 T &exclusive_output,
263 const T &initial_value,
264 ScanOp scan_op,
265 T &block_aggregate)
266 {
267 // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
268 T inclusive_output;
269 WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
270
271 // Compute the warp-wide prefix and block-wide aggregate for each warp
272 T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
273
274 // Apply warp prefix to our lane's partial
// Every warp's prefix is valid here (warp0's is initial_value), so no
// warp_id guard is needed, unlike the unseeded overload.
275 exclusive_output = scan_op(warp_prefix, exclusive_output);
276 if (lane_id == 0)
277 exclusive_output = warp_prefix;
278 }
279
280
// Block-wide exclusive scan where a caller-supplied stateful callback
// (invoked by every lane of warp0 with the block aggregate) supplies the
// running block prefix — used for tiled/multi-block scans. Lane0 of warp0
// publishes the callback's result through shared memory; after the
// barrier, every thread except tid0 folds it into its exclusive output.
282 template <
283 typename ScanOp,
284 typename BlockPrefixCallbackOp>
285 __device__ __forceinline__ void ExclusiveScan(
286 T input,
287 T &exclusive_output,
288 ScanOp scan_op,
289 BlockPrefixCallbackOp &block_prefix_callback_op)
290 {
291 // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
292 T block_aggregate;
293 ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
294
295 // Use the first warp to determine the thread block prefix, returning the result in lane0
296 if (warp_id == 0)
297 {
298 T block_prefix = block_prefix_callback_op(block_aggregate);
299 if (lane_id == 0)
300 {
301 // Share the prefix with all threads
302 temp_storage.block_prefix = block_prefix;
303 exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0
304 }
305 }
306
// Barrier: block_prefix must be visible to all threads before the read below.
307 CTA_SYNC();
308
309 // Incorporate thread block prefix into outputs
310 T block_prefix = temp_storage.block_prefix;
// tid0 is excluded: its output was already set to the block prefix above.
311 if (linear_tid > 0)
312 {
313 exclusive_output = scan_op(block_prefix, exclusive_output);
314 }
315 }
316
317
318 //---------------------------------------------------------------------
319 // Inclusive scans
320 //---------------------------------------------------------------------
321
// Block-wide inclusive scan: delegates to the aggregate-returning overload
// and discards the aggregate.
323 template <typename ScanOp>
324 __device__ __forceinline__ void InclusiveScan(
325 T input,
326 T &inclusive_output,
327 ScanOp scan_op)
328 {
329 T block_aggregate;
330 InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
331 }
332
333
// Block-wide inclusive scan, also returning the block-wide aggregate.
// Warp-local inclusive scan, then each non-first warp prepends its prefix.
335 template <typename ScanOp>
336 __device__ __forceinline__ void InclusiveScan(
337 T input,
338 T &inclusive_output,
339 ScanOp scan_op,
340 T &block_aggregate)
341 {
342 WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
343
344 // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
345 T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
346
347 // Apply warp prefix to our lane's partial
// warp0 has no preceding warps (and its prefix is invalid), so it keeps
// its warp-local inclusive result unchanged.
348 if (warp_id != 0)
349 {
350 inclusive_output = scan_op(warp_prefix, inclusive_output);
351 }
352 }
353
354
// Block-wide inclusive scan with a caller-supplied prefix callback (see the
// exclusive callback overload above for the protocol). Unlike the exclusive
// variant, the published block prefix is folded into EVERY thread's output,
// including tid0, since inclusive results always contain the prefix.
// NOTE(review): the output parameter is named exclusive_output although it
// carries the inclusive result — the naming appears inherited from the
// exclusive overload.
356 template <
357 typename ScanOp,
358 typename BlockPrefixCallbackOp>
359 __device__ __forceinline__ void InclusiveScan(
360 T input,
361 T &exclusive_output,
362 ScanOp scan_op,
363 BlockPrefixCallbackOp &block_prefix_callback_op)
364 {
365 T block_aggregate;
366 InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
367
368 // Use the first warp to determine the thread block prefix, returning the result in lane0
369 if (warp_id == 0)
370 {
371 T block_prefix = block_prefix_callback_op(block_aggregate);
372 if (lane_id == 0)
373 {
374 // Share the prefix with all threads
375 temp_storage.block_prefix = block_prefix;
376 }
377 }
378
// Barrier: block_prefix must be visible to all threads before the read below.
379 CTA_SYNC();
380
381 // Incorporate thread block prefix into outputs
382 T block_prefix = temp_storage.block_prefix;
383 exclusive_output = scan_op(block_prefix, exclusive_output);
384 }
385
386
387};
388
389
390} // CUB namespace
391CUB_NS_POSTFIX // Optional outer namespace(s)
392
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitioned across a CUDA warp.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition util_ptx.cuh:409
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420
CTA_SYNC()
Definition util_ptx.cuh:255
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Alias wrapper allowing storage to be unioned.
Shared memory storage layout type.
BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
@ WARP_THREADS
Number of warp threads.
@ WARPS
Number of active warps.
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
struct __align__(32) _TempStorage
Shared memory storage layout type.
__device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate)
Use the warp-wide aggregates to compute the calling warp's prefix. Also returns the block-wide aggregate through block_aggregate.
WarpScan< T, WARPS, PTX_ARCH > WarpAggregateScan
WarpScan utility type.
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
WarpScan< T, WARP_THREADS, PTX_ARCH > WarpScanT
WarpScan utility type.
@ BLOCK_THREADS
The thread block size in threads.
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP >)
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
\smemstorage{WarpScan}