OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
block_scan_warp_scans2.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../../util_arch.cuh"
37 #include "../../util_ptx.cuh"
38 #include "../../warp/warp_scan.cuh"
39 #include "../../util_namespace.cuh"
40 
42 CUB_NS_PREFIX
43 
45 namespace cub {
46 
50 template <
51  typename T,
52  int BLOCK_DIM_X,
53  int BLOCK_DIM_Y,
54  int BLOCK_DIM_Z,
55  int PTX_ARCH>
56 struct BlockScanWarpScans
57 {
58  //---------------------------------------------------------------------
59  // Types and constants
60  //---------------------------------------------------------------------
61 
63  enum
64  {
66  WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
67 
69  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
70 
73  };
74 
77 
80 
82  struct _TempStorage
83  {
86  T warp_aggregates[WARPS];
88  };
89 
90 
92  struct TempStorage : Uninitialized<_TempStorage> {};
93 
94 
95  //---------------------------------------------------------------------
96  // Per-thread fields
97  //---------------------------------------------------------------------
98 
99  // Thread fields
100  _TempStorage &temp_storage;
101  unsigned int linear_tid;
102  unsigned int warp_id;
103  unsigned int lane_id;
104 
105 
106  //---------------------------------------------------------------------
107  // Constructors
108  //---------------------------------------------------------------------
109 
111  __device__ __forceinline__ BlockScanWarpScans(
112  TempStorage &temp_storage)
113  :
114  temp_storage(temp_storage.Alias()),
115  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
116  warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
117  lane_id(LaneId())
118  {}
119 
120 
121  //---------------------------------------------------------------------
122  // Utility methods
123  //---------------------------------------------------------------------
124 
125  template <typename ScanOp, int WARP>
126  __device__ __forceinline__ void ApplyWarpAggregates(
127  T &warp_prefix,
128  ScanOp scan_op,
129  T &block_aggregate,
130  Int2Type<WARP> addend_warp)
131  {
132  if (warp_id == WARP)
133  warp_prefix = block_aggregate;
134 
135  T addend = temp_storage.warp_aggregates[WARP];
136  block_aggregate = scan_op(block_aggregate, addend);
137 
138  ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
139  }
140 
141  template <typename ScanOp>
142  __device__ __forceinline__ void ApplyWarpAggregates(
143  T &warp_prefix,
144  ScanOp scan_op,
145  T &block_aggregate,
146  Int2Type<WARPS> addend_warp)
147  {}
148 
149 
151  template <typename ScanOp>
152  __device__ __forceinline__ T ComputeWarpPrefix(
153  ScanOp scan_op,
154  T warp_aggregate,
155  T &block_aggregate)
156  {
157  // Last lane in each warp shares its warp-aggregate
158  if (lane_id == WARP_THREADS - 1)
159  temp_storage.warp_aggregates[warp_id] = warp_aggregate;
160 
161  CTA_SYNC();
162 
163  // Accumulate block aggregates and save the one that is our warp's prefix
164  T warp_prefix;
165  block_aggregate = temp_storage.warp_aggregates[0];
166 
167  // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
168  ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
169 /*
170  #pragma unroll
171  for (int WARP = 1; WARP < WARPS; ++WARP)
172  {
173  if (warp_id == WARP)
174  warp_prefix = block_aggregate;
175 
176  T addend = temp_storage.warp_aggregates[WARP];
177  block_aggregate = scan_op(block_aggregate, addend);
178  }
179 */
180 
181  return warp_prefix;
182  }
183 
184 
186  template <typename ScanOp>
187  __device__ __forceinline__ T ComputeWarpPrefix(
188  ScanOp scan_op,
189  T warp_aggregate,
190  T &block_aggregate,
191  const T &initial_value)
192  {
193  T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
194 
195  warp_prefix = scan_op(initial_value, warp_prefix);
196 
197  if (warp_id == 0)
198  warp_prefix = initial_value;
199 
200  return warp_prefix;
201  }
202 
203  //---------------------------------------------------------------------
204  // Exclusive scans
205  //---------------------------------------------------------------------
206 
208  template <typename ScanOp>
209  __device__ __forceinline__ void ExclusiveScan(
210  T input,
211  T &exclusive_output,
212  ScanOp scan_op)
213  {
214  // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
215  T block_aggregate;
216  ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
217  }
218 
219 
221  template <typename ScanOp>
222  __device__ __forceinline__ void ExclusiveScan(
223  T input,
224  T &exclusive_output,
225  const T &initial_value,
226  ScanOp scan_op)
227  {
228  T block_aggregate;
229  ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
230  }
231 
232 
234  template <typename ScanOp>
235  __device__ __forceinline__ void ExclusiveScan(
236  T input,
237  T &exclusive_output,
238  ScanOp scan_op,
239  T &block_aggregate)
240  {
241  WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
242 
243  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
244  T inclusive_output;
245  my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
246 
247  // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
248 // T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
249 
250 //--------------------------------------------------
251  // Last lane in each warp shares its warp-aggregate
252  if (lane_id == WARP_THREADS - 1)
253  temp_storage.warp_aggregates[warp_id] = inclusive_output;
254 
255  CTA_SYNC();
256 
257  // Get the warp scan partial
258  T warp_inclusive, warp_prefix;
259  if (lane_id < WARPS)
260  {
261  // Scan the warpscan partials
262  T warp_val = temp_storage.warp_aggregates[lane_id];
263  WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op);
264  }
265 
266  warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id);
267  block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
268 //--------------------------------------------------
269 
270  // Apply warp prefix to our lane's partial
271  if (warp_id != 0)
272  {
273  exclusive_output = scan_op(warp_prefix, exclusive_output);
274  if (lane_id == 0)
275  exclusive_output = warp_prefix;
276  }
277  }
278 
279 
281  template <typename ScanOp>
282  __device__ __forceinline__ void ExclusiveScan(
283  T input,
284  T &exclusive_output,
285  const T &initial_value,
286  ScanOp scan_op,
287  T &block_aggregate)
288  {
289  WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
290 
291  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
292  T inclusive_output;
293  my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
294 
295  // Compute the warp-wide prefix and block-wide aggregate for each warp
296 // T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
297 
298 //--------------------------------------------------
299  // Last lane in each warp shares its warp-aggregate
300  if (lane_id == WARP_THREADS - 1)
301  temp_storage.warp_aggregates[warp_id] = inclusive_output;
302 
303  CTA_SYNC();
304 
305  // Get the warp scan partial
306  T warp_inclusive, warp_prefix;
307  if (lane_id < WARPS)
308  {
309  // Scan the warpscan partials
310  T warp_val = temp_storage.warp_aggregates[lane_id];
311  WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op);
312  }
313 
314  warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id);
315  block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
316 //--------------------------------------------------
317 
318  // Apply warp prefix to our lane's partial
319  exclusive_output = scan_op(warp_prefix, exclusive_output);
320  if (lane_id == 0)
321  exclusive_output = warp_prefix;
322  }
323 
324 
326  template <
327  typename ScanOp,
328  typename BlockPrefixCallbackOp>
329  __device__ __forceinline__ void ExclusiveScan(
330  T input,
331  T &exclusive_output,
332  ScanOp scan_op,
333  BlockPrefixCallbackOp &block_prefix_callback_op)
334  {
335  // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
336  T block_aggregate;
337  ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
338 
339  // Use the first warp to determine the thread block prefix, returning the result in lane0
340  if (warp_id == 0)
341  {
342  T block_prefix = block_prefix_callback_op(block_aggregate);
343  if (lane_id == 0)
344  {
345  // Share the prefix with all threads
346  temp_storage.block_prefix = block_prefix;
347  exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0
348  }
349  }
350 
351  CTA_SYNC();
352 
353  // Incorporate thread block prefix into outputs
354  T block_prefix = temp_storage.block_prefix;
355  if (linear_tid > 0)
356  {
357  exclusive_output = scan_op(block_prefix, exclusive_output);
358  }
359  }
360 
361 
362  //---------------------------------------------------------------------
363  // Inclusive scans
364  //---------------------------------------------------------------------
365 
367  template <typename ScanOp>
368  __device__ __forceinline__ void InclusiveScan(
369  T input,
370  T &inclusive_output,
371  ScanOp scan_op)
372  {
373  T block_aggregate;
374  InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
375  }
376 
377 
379  template <typename ScanOp>
380  __device__ __forceinline__ void InclusiveScan(
381  T input,
382  T &inclusive_output,
383  ScanOp scan_op,
384  T &block_aggregate)
385  {
386  WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
387 
388  // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
389  T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
390 
391  // Apply warp prefix to our lane's partial
392  if (warp_id != 0)
393  {
394  inclusive_output = scan_op(warp_prefix, inclusive_output);
395  }
396  }
397 
398 
400  template <
401  typename ScanOp,
402  typename BlockPrefixCallbackOp>
403  __device__ __forceinline__ void InclusiveScan(
404  T input,
405  T &exclusive_output,
406  ScanOp scan_op,
407  BlockPrefixCallbackOp &block_prefix_callback_op)
408  {
409  T block_aggregate;
410  InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
411 
412  // Use the first warp to determine the thread block prefix, returning the result in lane0
413  if (warp_id == 0)
414  {
415  T block_prefix = block_prefix_callback_op(block_aggregate);
416  if (lane_id == 0)
417  {
418  // Share the prefix with all threads
419  temp_storage.block_prefix = block_prefix;
420  }
421  }
422 
423  CTA_SYNC();
424 
425  // Incorporate thread block prefix into outputs
426  T block_prefix = temp_storage.block_prefix;
427  exclusive_output = scan_op(block_prefix, exclusive_output);
428  }
429 
430 
431 };
432 
433 
434 } // CUB namespace
435 CUB_NS_POSTFIX // Optional outer namespace(s)
436 
__device__ __forceinline__ void InclusiveScan(T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate)
Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate ...
Optional outer namespace(s)
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
The thread block size in threads.
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARPS > addend_warp)
CTA_SYNC()
Definition: util_ptx.cuh:255
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition: util_ptx.cuh:420
Alias wrapper allowing storage to be unioned.
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void Scan(T input, T &inclusive_output, T &exclusive_output, ScanOp scan_op)
Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the...
Definition: warp_scan.cuh:799
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP > addend_warp)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
Shared memory storage layout type.
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP >)
WarpScan< T, WARP_THREADS, PTX_ARCH > WarpScanT
WarpScan utility type.
WarpScanT::TempStorage warp_scan[WARPS]
Buffer for warp-synchronous scans.
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate, const T &initial_value)
Use the warp-wide aggregates and initial-value to compute the calling warp's prefix....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
T block_prefix
Shared prefix for the entire thread block.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast the value input from warp-lanesrc_lane to all lanes in the warp.
Definition: warp_scan.cuh:922
WarpScan< T, WARPS, PTX_ARCH > WarpAggregateScanT
WarpScan utility type.
\smemstorage{WarpScan}
Definition: warp_scan.cuh:192
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
Definition: warp_scan.cuh:146
WarpAggregateScanT::TempStorage inner_scan[WARPS]
Buffer for warp-synchronous scans.
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....