OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
block_scan_warp_scans2.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include "../../util_arch.cuh"
#include "../../util_ptx.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"

CUB_NS_PREFIX

namespace cub {

template <
    typename    T,
    int         BLOCK_DIM_X,
    int         BLOCK_DIM_Y,
    int         BLOCK_DIM_Z,
    int         PTX_ARCH>
struct BlockScanWarpScans
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    enum
    {
        /// Number of warp threads
        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),

        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        /// Number of active warps
        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
    };

    /// WarpScan utility type
    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;

    /// WarpScan utility type
    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;

    /// Shared memory storage layout type
    struct _TempStorage
    {
        typename WarpScanT::TempStorage          warp_scan[WARPS];   ///< Buffer for warp-synchronous scans
        typename WarpAggregateScanT::TempStorage inner_scan[WARPS];  ///< Buffer for warp-synchronous scans
        T                                        warp_aggregates[WARPS];
        T                                        block_prefix;       ///< Shared prefix for the entire thread block
    };


    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    // Thread fields
    _TempStorage &temp_storage;
    unsigned int linear_tid;
    unsigned int warp_id;
    unsigned int lane_id;


    //---------------------------------------------------------------------
    // Constructors
    //---------------------------------------------------------------------

    __device__ __forceinline__ BlockScanWarpScans(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
        lane_id(LaneId())
    {}


    //---------------------------------------------------------------------
    // Utility methods
    //---------------------------------------------------------------------

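    // Note on the pair of ApplyWarpAggregates() overloads below: they implement a
    // compile-time recursion over warp indices. The Int2Type<WARP> overload folds warp
    // WARP's shared aggregate into block_aggregate (recording the running value as
    // warp_prefix for the calling warp) and recurses with WARP + 1; the Int2Type<WARPS>
    // overload terminates the recursion once every warp's aggregate has been consumed.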
    template <typename ScanOp, int WARP>
    __device__ __forceinline__ void ApplyWarpAggregates(
        T               &warp_prefix,
        ScanOp          scan_op,
        T               &block_aggregate,
        Int2Type<WARP>  addend_warp)
    {
        if (warp_id == WARP)
            warp_prefix = block_aggregate;

        T addend = temp_storage.warp_aggregates[WARP];
        block_aggregate = scan_op(block_aggregate, addend);

        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
    }

    template <typename ScanOp>
    __device__ __forceinline__ void ApplyWarpAggregates(
        T               &warp_prefix,
        ScanOp          scan_op,
        T               &block_aggregate,
        Int2Type<WARPS> addend_warp)
    {}

    template <typename ScanOp>
    __device__ __forceinline__ T ComputeWarpPrefix(
        ScanOp  scan_op,
        T       warp_aggregate,
        T       &block_aggregate)
    {
        // Last lane in each warp shares its warp-aggregate
        if (lane_id == WARP_THREADS - 1)
            temp_storage.warp_aggregates[warp_id] = warp_aggregate;

        CTA_SYNC();

        // Accumulate block aggregates and save the one that is our warp's prefix
        T warp_prefix;
        block_aggregate = temp_storage.warp_aggregates[0];

        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
/*
        #pragma unroll
        for (int WARP = 1; WARP < WARPS; ++WARP)
        {
            if (warp_id == WARP)
                warp_prefix = block_aggregate;

            T addend = temp_storage.warp_aggregates[WARP];
            block_aggregate = scan_op(block_aggregate, addend);
        }
*/

        return warp_prefix;
    }

    template <typename ScanOp>
    __device__ __forceinline__ T ComputeWarpPrefix(
        ScanOp      scan_op,
        T           warp_aggregate,
        T           &block_aggregate,
        const T     &initial_value)
    {
        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);

        warp_prefix = scan_op(initial_value, warp_prefix);

        if (warp_id == 0)
            warp_prefix = initial_value;

        return warp_prefix;
    }

    //---------------------------------------------------------------------
    // Exclusive scans
    //---------------------------------------------------------------------

    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T       input,
        T       &exclusive_output,
        ScanOp  scan_op)
    {
        // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
        T block_aggregate;
        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
    }


    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T           input,
        T           &exclusive_output,
        const T     &initial_value,
        ScanOp      scan_op)
    {
        T block_aggregate;
        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
    }

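    // The ExclusiveScan() variant below bypasses ComputeWarpPrefix(): after the per-warp
    // scans, the first WARPS lanes of every warp cooperatively scan the shared warp
    // aggregates with a second, WARPS-wide WarpScan (inner_scan), and the resulting warp
    // prefix and block-wide aggregate are then broadcast to the remaining lanes.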
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T       input,
        T       &exclusive_output,
        ScanOp  scan_op,
        T       &block_aggregate)
    {
        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);

        // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
        T inclusive_output;
        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);

        // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
//      T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);

//--------------------------------------------------
        // Last lane in each warp shares its warp-aggregate
        if (lane_id == WARP_THREADS - 1)
            temp_storage.warp_aggregates[warp_id] = inclusive_output;

        CTA_SYNC();

        // Get the warp scan partial
        T warp_inclusive, warp_prefix;
        if (lane_id < WARPS)
        {
            // Scan the warpscan partials
            T warp_val = temp_storage.warp_aggregates[lane_id];
            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op);
        }

        warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id);
        block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
//--------------------------------------------------

        // Apply warp prefix to our lane's partial
        if (warp_id != 0)
        {
            exclusive_output = scan_op(warp_prefix, exclusive_output);
            if (lane_id == 0)
                exclusive_output = warp_prefix;
        }
    }

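    // Same strategy as above, but the warp aggregates are scanned with initial_value as
    // the seed, so every warp (including warp0) receives a valid prefix and the prefix is
    // applied unconditionally to each lane's partial.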
    template <typename ScanOp>
    __device__ __forceinline__ void ExclusiveScan(
        T           input,
        T           &exclusive_output,
        const T     &initial_value,
        ScanOp      scan_op,
        T           &block_aggregate)
    {
        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);

        // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
        T inclusive_output;
        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);

        // Compute the warp-wide prefix and block-wide aggregate for each warp
//      T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);

//--------------------------------------------------
        // Last lane in each warp shares its warp-aggregate
        if (lane_id == WARP_THREADS - 1)
            temp_storage.warp_aggregates[warp_id] = inclusive_output;

        CTA_SYNC();

        // Get the warp scan partial
        T warp_inclusive, warp_prefix;
        if (lane_id < WARPS)
        {
            // Scan the warpscan partials
            T warp_val = temp_storage.warp_aggregates[lane_id];
            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op);
        }

        warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id);
        block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
//--------------------------------------------------

        // Apply warp prefix to our lane's partial
        exclusive_output = scan_op(warp_prefix, exclusive_output);
        if (lane_id == 0)
            exclusive_output = warp_prefix;
    }

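    // The callback variant below first computes the block-wide aggregate, then has warp0
    // invoke block_prefix_callback_op(block_aggregate); only lane0's return value is kept,
    // shared through temp_storage.block_prefix, and folded into the outputs (tid0's
    // exclusive output becomes the block prefix itself).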
    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void ExclusiveScan(
        T                       input,
        T                       &exclusive_output,
        ScanOp                  scan_op,
        BlockPrefixCallbackOp   &block_prefix_callback_op)
    {
        // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
        T block_aggregate;
        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);

        // Use the first warp to determine the thread block prefix, returning the result in lane0
        if (warp_id == 0)
        {
            T block_prefix = block_prefix_callback_op(block_aggregate);
            if (lane_id == 0)
            {
                // Share the prefix with all threads
                temp_storage.block_prefix = block_prefix;
                exclusive_output = block_prefix;    // The block prefix is the exclusive output for tid0
            }
        }

        CTA_SYNC();

        // Incorporate thread block prefix into outputs
        T block_prefix = temp_storage.block_prefix;
        if (linear_tid > 0)
        {
            exclusive_output = scan_op(block_prefix, exclusive_output);
        }
    }

    //---------------------------------------------------------------------
    // Inclusive scans
    //---------------------------------------------------------------------

    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T       input,
        T       &inclusive_output,
        ScanOp  scan_op)
    {
        T block_aggregate;
        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
    }

    template <typename ScanOp>
    __device__ __forceinline__ void InclusiveScan(
        T       input,
        T       &inclusive_output,
        ScanOp  scan_op,
        T       &block_aggregate)
    {
        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);

        // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);

        // Apply warp prefix to our lane's partial
        if (warp_id != 0)
        {
            inclusive_output = scan_op(warp_prefix, inclusive_output);
        }
    }

    template <
        typename ScanOp,
        typename BlockPrefixCallbackOp>
    __device__ __forceinline__ void InclusiveScan(
        T                       input,
        T                       &exclusive_output,
        ScanOp                  scan_op,
        BlockPrefixCallbackOp   &block_prefix_callback_op)
    {
        T block_aggregate;
        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);

        // Use the first warp to determine the thread block prefix, returning the result in lane0
        if (warp_id == 0)
        {
            T block_prefix = block_prefix_callback_op(block_aggregate);
            if (lane_id == 0)
            {
                // Share the prefix with all threads
                temp_storage.block_prefix = block_prefix;
            }
        }

        CTA_SYNC();

        // Incorporate thread block prefix into outputs
        T block_prefix = temp_storage.block_prefix;
        exclusive_output = scan_op(block_prefix, exclusive_output);
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

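A minimal usage sketch (not part of the file above): how this specialization might be driven directly from a kernel. The include paths, kernel name, and buffer names are illustrative assumptions; in practice the specialization is normally reached indirectly through cub::BlockScan. The sketch uses cub::Sum as the binary scan functor and one input item per thread in a 1D thread block.

// Minimal sketch, assuming this header and CUB's thread_operators.cuh are reachable
// via the include paths shown; names below are illustrative only.
#include "block/specializations/block_scan_warp_scans2.cuh"
#include "thread/thread_operators.cuh"

template <int BLOCK_THREADS>
__global__ void example_exclusive_sum(const int *d_in, int *d_out)
{
    // One item per thread, 1D thread block; the PTX arch is taken from CUB's CUB_PTX_ARCH macro
    typedef cub::BlockScanWarpScans<int, BLOCK_THREADS, 1, 1, CUB_PTX_ARCH> BlockScanT;

    // Shared memory for the warp scans, warp aggregates, and block prefix
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int idx = blockIdx.x * BLOCK_THREADS + threadIdx.x;
    int thread_data = d_in[idx];

    // Exclusive prefix sum seeded with 0, using cub::Sum as the scan functor
    int exclusive_result;
    BlockScanT(temp_storage).ExclusiveScan(thread_data, exclusive_result, 0, cub::Sum());

    d_out[idx] = exclusive_result;
}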