OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
block_scan_warp_scans3.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../../util_arch.cuh"
37 #include "../../util_ptx.cuh"
38 #include "../../warp/warp_scan.cuh"
39 #include "../../util_namespace.cuh"
40 
42 CUB_NS_PREFIX
43 
45 namespace cub {
46 
50 template <
51  typename T,
52  int BLOCK_DIM_X,
53  int BLOCK_DIM_Y,
54  int BLOCK_DIM_Z,
55  int PTX_ARCH>
56 struct BlockScanWarpScans
57 {
58  //---------------------------------------------------------------------
59  // Types and constants
60  //---------------------------------------------------------------------
61 
63  enum
64  {
66  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
67 
69  INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
70  OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
71 
74  };
75 
78 
81 
82  typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
83 
84 
86  struct _TempStorage
87  {
88  union Aliasable
89  {
92 
93  } aliasable;
94 
95  T warp_aggregates[OUTER_WARPS];
96 
98  };
99 
100 
102  struct TempStorage : Uninitialized<_TempStorage> {};
103 
104 
105  //---------------------------------------------------------------------
106  // Per-thread fields
107  //---------------------------------------------------------------------
108 
109  // Thread fields
110  _TempStorage &temp_storage;
111  unsigned int linear_tid;
112  unsigned int warp_id;
113  unsigned int lane_id;
114 
115 
116  //---------------------------------------------------------------------
117  // Constructors
118  //---------------------------------------------------------------------
119 
121  __device__ __forceinline__ BlockScanWarpScans(
122  TempStorage &temp_storage)
123  :
124  temp_storage(temp_storage.Alias()),
125  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
126  warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
127  lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
128  {}
129 
130 
131  //---------------------------------------------------------------------
132  // Exclusive scans
133  //---------------------------------------------------------------------
134 
136  template <typename ScanOp>
137  __device__ __forceinline__ void ExclusiveScan(
138  T input,
139  T &exclusive_output,
140  ScanOp scan_op)
141  {
142  // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
143  T block_aggregate;
144  ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
145  }
146 
147 
149  template <typename ScanOp>
150  __device__ __forceinline__ void ExclusiveScan(
151  T input,
152  T &exclusive_output,
153  const T &initial_value,
154  ScanOp scan_op)
155  {
156  T block_aggregate;
157  ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
158  }
159 
160 
162  template <typename ScanOp>
163  __device__ __forceinline__ void ExclusiveScan(
164  T input,
165  T &exclusive_output,
166  ScanOp scan_op,
167  T &block_aggregate)
168  {
169  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
170  T inclusive_output;
171  OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
172  input, inclusive_output, exclusive_output, scan_op);
173 
174  // Share outer warp total
175  if (lane_id == OUTER_WARP_THREADS - 1)
176  temp_storage.warp_aggregates[warp_id] = inclusive_output;
177 
178  CTA_SYNC();
179 
180  if (linear_tid < INNER_WARP_THREADS)
181  {
182  T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
183  T outer_warp_exclusive;
184 
185  InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
186  outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
187 
188  temp_storage.block_aggregate = block_aggregate;
189  temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
190  }
191 
192  CTA_SYNC();
193 
194  if (warp_id != 0)
195  {
196  // Retrieve block aggregate
197  block_aggregate = temp_storage.block_aggregate;
198 
199  // Apply warp prefix to our lane's partial
200  T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
201  exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
202  if (lane_id == 0)
203  exclusive_output = outer_warp_exclusive;
204  }
205  }
206 
207 
209  template <typename ScanOp>
210  __device__ __forceinline__ void ExclusiveScan(
211  T input,
212  T &exclusive_output,
213  const T &initial_value,
214  ScanOp scan_op,
215  T &block_aggregate)
216  {
217  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
218  T inclusive_output;
219  OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
220  input, inclusive_output, exclusive_output, scan_op);
221 
222  // Share outer warp total
223  if (lane_id == OUTER_WARP_THREADS - 1)
224  {
225  temp_storage.warp_aggregates[warp_id] = inclusive_output;
226  }
227 
228  CTA_SYNC();
229 
230  if (linear_tid < INNER_WARP_THREADS)
231  {
232  T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
233  T outer_warp_exclusive;
234 
235  InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
236  outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate);
237 
238  temp_storage.block_aggregate = block_aggregate;
239  temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
240  }
241 
242  CTA_SYNC();
243 
244  // Retrieve block aggregate
245  block_aggregate = temp_storage.block_aggregate;
246 
247  // Apply warp prefix to our lane's partial
248  T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
249  exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
250  if (lane_id == 0)
251  exclusive_output = outer_warp_exclusive;
252  }
253 
254 
256  template <
257  typename ScanOp,
258  typename BlockPrefixCallbackOp>
259  __device__ __forceinline__ void ExclusiveScan(
260  T input,
261  T &exclusive_output,
262  ScanOp scan_op,
263  BlockPrefixCallbackOp &block_prefix_callback_op)
264  {
265  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
266  T inclusive_output;
267  OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
268  input, inclusive_output, exclusive_output, scan_op);
269 
270  // Share outer warp total
271  if (lane_id == OUTER_WARP_THREADS - 1)
272  temp_storage.warp_aggregates[warp_id] = inclusive_output;
273 
274  CTA_SYNC();
275 
276  if (linear_tid < INNER_WARP_THREADS)
277  {
278  InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
279 
280  T upsweep = temp_storage.warp_aggregates[linear_tid];
281  T downsweep_prefix, block_aggregate;
282 
283  inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
284 
285  // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
286  T block_prefix = block_prefix_callback_op(block_aggregate);
287  block_prefix = inner_scan.Broadcast(block_prefix, 0);
288 
289  downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
290  if (linear_tid == 0)
291  downsweep_prefix = block_prefix;
292 
293  temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
294  }
295 
296  CTA_SYNC();
297 
298  // Apply warp prefix to our lane's partial (or assign it if partial is invalid)
299  T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
300  exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
301  if (lane_id == 0)
302  exclusive_output = outer_warp_exclusive;
303  }
304 
305 
306  //---------------------------------------------------------------------
307  // Inclusive scans
308  //---------------------------------------------------------------------
309 
311  template <typename ScanOp>
312  __device__ __forceinline__ void InclusiveScan(
313  T input,
314  T &inclusive_output,
315  ScanOp scan_op)
316  {
317  T block_aggregate;
318  InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
319  }
320 
321 
323  template <typename ScanOp>
324  __device__ __forceinline__ void InclusiveScan(
325  T input,
326  T &inclusive_output,
327  ScanOp scan_op,
328  T &block_aggregate)
329  {
330  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
331  OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
332  input, inclusive_output, scan_op);
333 
334  // Share outer warp total
335  if (lane_id == OUTER_WARP_THREADS - 1)
336  temp_storage.warp_aggregates[warp_id] = inclusive_output;
337 
338  CTA_SYNC();
339 
340  if (linear_tid < INNER_WARP_THREADS)
341  {
342  T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
343  T outer_warp_exclusive;
344 
345  InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
346  outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
347 
348  temp_storage.block_aggregate = block_aggregate;
349  temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
350  }
351 
352  CTA_SYNC();
353 
354  if (warp_id != 0)
355  {
356  // Retrieve block aggregate
357  block_aggregate = temp_storage.block_aggregate;
358 
359  // Apply warp prefix to our lane's partial
360  T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
361  inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
362  }
363  }
364 
365 
367  template <
368  typename ScanOp,
369  typename BlockPrefixCallbackOp>
370  __device__ __forceinline__ void InclusiveScan(
371  T input,
372  T &inclusive_output,
373  ScanOp scan_op,
374  BlockPrefixCallbackOp &block_prefix_callback_op)
375  {
376  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
377  OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
378  input, inclusive_output, scan_op);
379 
380  // Share outer warp total
381  if (lane_id == OUTER_WARP_THREADS - 1)
382  temp_storage.warp_aggregates[warp_id] = inclusive_output;
383 
384  CTA_SYNC();
385 
386  if (linear_tid < INNER_WARP_THREADS)
387  {
388  InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
389 
390  T upsweep = temp_storage.warp_aggregates[linear_tid];
391  T downsweep_prefix, block_aggregate;
392  inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
393 
394  // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
395  T block_prefix = block_prefix_callback_op(block_aggregate);
396  block_prefix = inner_scan.Broadcast(block_prefix, 0);
397 
398  downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
399  if (linear_tid == 0)
400  downsweep_prefix = block_prefix;
401 
402  temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
403  }
404 
405  CTA_SYNC();
406 
407  // Apply warp prefix to our lane's partial
408  T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
409  inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
410  }
411 
412 
413 };
414 
415 
416 } // CUB namespace
417 CUB_NS_POSTFIX // Optional outer namespace(s)
418 
WarpScan< T, OUTER_WARP_THREADS, PTX_ARCH > OuterWarpScanT
Outer WarpScan utility type.
Optional outer namespace(s)
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
WarpScan< T, INNER_WARP_THREADS, PTX_ARCH > InnerWarpScanT
Inner WarpScan utility type.
The thread block size in threads.
CTA_SYNC()
Definition: util_ptx.cuh:255
Alias wrapper allowing storage to be unioned.
Uninitialized< OuterScanArray > outer_warp_scan
Buffer for warp-synchronous outer scans.
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive prefix scan using the specified binary scan functor across the calling warp....
Definition: warp_scan.cuh:551
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
T block_aggregate
Shared prefix for the entire thread block.
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
InnerWarpScanT::TempStorage inner_warp_scan
Buffer for warp-synchronous inner scan.
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
Shared memory storage layout type.
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lanesrc_lane to all lanes in the warp.
Definition: warp_scan.cuh:922
\smemstorage{WarpScan}
Definition: warp_scan.cuh:192
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
Definition: warp_scan.cuh:146
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....