OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
block_scan_warp_scans.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "../../util_arch.cuh"
37 #include "../../util_ptx.cuh"
38 #include "../../warp/warp_scan.cuh"
39 #include "../../util_namespace.cuh"
40 
42 CUB_NS_PREFIX
43 
45 namespace cub {
46 
50 template <
51  typename T,
52  int BLOCK_DIM_X,
53  int BLOCK_DIM_Y,
54  int BLOCK_DIM_Z,
55  int PTX_ARCH>
57 {
58  //---------------------------------------------------------------------
59  // Types and constants
60  //---------------------------------------------------------------------
61 
63  enum
64  {
66  WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
67 
69  BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
70 
73  };
74 
77 
80 
82 
84  {
85  T warp_aggregates[WARPS];
86  typename WarpScanT::TempStorage warp_scan[WARPS];
87  T block_prefix;
88  };
89 
90 
// Public shared-storage alias. Uninitialized<> lets callers alias this type in
// unions / externally allocated shared memory without running a constructor.
 92  struct TempStorage : Uninitialized<_TempStorage> {};
93 
94 
95  //---------------------------------------------------------------------
96  // Per-thread fields
97  //---------------------------------------------------------------------
98 
 99  // Thread fields
 // Aliased reference to the block's shared-memory storage.
 100  _TempStorage &temp_storage;
 // Row-major linear thread id within the thread block.
 101  unsigned int linear_tid;
 // Index of this thread's warp (0 when the block holds a single warp).
 102  unsigned int warp_id;
 // Lane index of this thread within its warp.
 103  unsigned int lane_id;
104 
105 
106  //---------------------------------------------------------------------
107  // Constructors
108  //---------------------------------------------------------------------
109 
 // Constructor: binds the caller-provided shared storage (via Alias()) and
 // derives this thread's linear id, warp index, and lane index from the
 // compile-time block shape.
 111  __device__ __forceinline__ BlockScanWarpScans(
 112  TempStorage &temp_storage)
 113  :
 114  temp_storage(temp_storage.Alias()),
 115  linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
 116  warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
 117  lane_id(LaneId())
 118  {}
119 
120 
121  //---------------------------------------------------------------------
122  // Utility methods
123  //---------------------------------------------------------------------
124 
 // Recursive (compile-time unrolled) step: folds warp WARP's aggregate into the
 // running block_aggregate.  When this thread's warp_id equals WARP, the running
 // total *before* folding is captured as this warp's prefix.  Recursion advances
 // via Int2Type<WARP + 1> and terminates at the Int2Type<WARPS> overload below.
 125  template <typename ScanOp, int WARP>
 126  __device__ __forceinline__ void ApplyWarpAggregates(
 127  T &warp_prefix,
 128  ScanOp scan_op,
 129  T &block_aggregate,
 130  Int2Type<WARP> /*addend_warp*/)
 131  {
 // Running total so far is exactly the sum over warps [0, WARP): our prefix.
 132  if (warp_id == WARP)
 133  warp_prefix = block_aggregate;
 134 
 // Fold this warp's aggregate (published in ComputeWarpPrefix) into the total.
 135  T addend = temp_storage.warp_aggregates[WARP];
 136  block_aggregate = scan_op(block_aggregate, addend);
 137 
 138  ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
 139  }
140 
 // Base case of the compile-time recursion: all WARPS aggregates have been
 // folded, nothing left to do.
 141  template <typename ScanOp>
 142  __device__ __forceinline__ void ApplyWarpAggregates(
 143  T &/*warp_prefix*/,
 144  ScanOp /*scan_op*/,
 145  T &/*block_aggregate*/,
 146  Int2Type<WARPS> /*addend_warp*/)
 147  {}
148 
149 
 // Uses the per-warp aggregates to compute this warp's prefix (the scan of all
 // preceding warps' aggregates) and, as a side effect, the block-wide aggregate.
 // NOTE: the returned warp_prefix is left unwritten for warp 0 (callers treat
 // warp0's prefix as invalid — see the "Warp prefix for warp0 is invalid"
 // comments at the call sites).
 151  template <typename ScanOp>
 152  __device__ __forceinline__ T ComputeWarpPrefix(
 153  ScanOp scan_op,
 154  T warp_aggregate,
 155  T &block_aggregate)
 156  {
 157  // Last lane in each warp shares its warp-aggregate
 158  if (lane_id == WARP_THREADS - 1)
 159  temp_storage.warp_aggregates[warp_id] = warp_aggregate;
 160 
 // Block-wide barrier: all warp aggregates must be published before any
 // thread reads them below.
 161  CTA_SYNC();
 162 
 163  // Accumulate block aggregates and save the one that is our warp's prefix
 164  T warp_prefix;
 165  block_aggregate = temp_storage.warp_aggregates[0];
 166 
 167  // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
 168  ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
 169 /*
 170  #pragma unroll
 171  for (int WARP = 1; WARP < WARPS; ++WARP)
 172  {
 173  if (warp_id == WARP)
 174  warp_prefix = block_aggregate;
 175 
 176  T addend = temp_storage.warp_aggregates[WARP];
 177  block_aggregate = scan_op(block_aggregate, addend);
 178  }
 179 */
 180 
 181  return warp_prefix;
 182  }
183 
184 
 // Seeded variant: same as above but folds initial_value into every warp's
 // prefix.  Warp 0's prefix (invalid from the unseeded overload) is replaced
 // outright with initial_value, making the result valid for all warps.
 186  template <typename ScanOp>
 187  __device__ __forceinline__ T ComputeWarpPrefix(
 188  ScanOp scan_op,
 189  T warp_aggregate,
 190  T &block_aggregate,
 191  const T &initial_value)
 192  {
 193  T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
 194 
 195  warp_prefix = scan_op(initial_value, warp_prefix);
 196 
 // Warp 0 has no preceding warps; its prefix is exactly the seed.
 197  if (warp_id == 0)
 198  warp_prefix = initial_value;
 199 
 200  return warp_prefix;
 201  }
202 
203  //---------------------------------------------------------------------
204  // Exclusive scans
205  //---------------------------------------------------------------------
206 
 // Block-wide exclusive scan with no initial value: delegates to the
 // aggregate-returning overload and discards the aggregate.  The exclusive
 // output computed for thread 0 is invalid (no seed exists).
 208  template <typename ScanOp>
 209  __device__ __forceinline__ void ExclusiveScan(
 210  T input,
 211  T &exclusive_output,
 212  ScanOp scan_op)
 213  {
 214  // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
 215  T block_aggregate;
 216  ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
 217  }
218 
219 
 // Block-wide exclusive scan seeded with initial_value: delegates to the
 // aggregate-returning overload and discards the aggregate.
 221  template <typename ScanOp>
 222  __device__ __forceinline__ void ExclusiveScan(
 223  T input,
 224  T &exclusive_output,
 225  const T &initial_value,
 226  ScanOp scan_op)
 227  {
 228  T block_aggregate;
 229  ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
 230  }
231 
232 
 // Block-wide exclusive scan (no seed) that also returns the block-wide
 // aggregate.  Strategy: warp-local scan, then stitch warps together by
 // prepending each warp's prefix.  tid0's exclusive output remains invalid.
 234  template <typename ScanOp>
 235  __device__ __forceinline__ void ExclusiveScan(
 236  T input,
 237  T &exclusive_output,
 238  ScanOp scan_op,
 239  T &block_aggregate)
 240  {
 241  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
 242  T inclusive_output;
 243  WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
 244 
 245  // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
 246  T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
 247 
 248  // Apply warp prefix to our lane's partial
 // Warp 0 needs no prefix (and its warp_prefix is invalid anyway).
 249  if (warp_id != 0)
 250  {
 251  exclusive_output = scan_op(warp_prefix, exclusive_output);
 // lane0's warp-local exclusive output was invalid; its exclusive result
 // is exactly the warp prefix.
 252  if (lane_id == 0)
 253  exclusive_output = warp_prefix;
 254  }
 255  }
256 
257 
 // Block-wide exclusive scan seeded with initial_value that also returns the
 // block-wide aggregate.  Because the seeded warp prefix is valid for every
 // warp (including warp 0), the prefix is applied unconditionally.
 259  template <typename ScanOp>
 260  __device__ __forceinline__ void ExclusiveScan(
 261  T input,
 262  T &exclusive_output,
 263  const T &initial_value,
 264  ScanOp scan_op,
 265  T &block_aggregate)
 266  {
 267  // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
 268  T inclusive_output;
 269  WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
 270 
 271  // Compute the warp-wide prefix and block-wide aggregate for each warp
 272  T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
 273 
 274  // Apply warp prefix to our lane's partial
 275  exclusive_output = scan_op(warp_prefix, exclusive_output);
 // lane0's warp-local exclusive output was invalid; replace it with the prefix.
 276  if (lane_id == 0)
 277  exclusive_output = warp_prefix;
 278  }
279 
280 
 // Block-wide exclusive scan where the seed comes from a caller-supplied
 // prefix functor (e.g., carrying a running total across tiles).  The functor
 // is invoked by warp 0 with the block aggregate; lane 0 publishes the
 // returned block prefix through shared memory for all threads.
 282  template <
 283  typename ScanOp,
 284  typename BlockPrefixCallbackOp>
 285  __device__ __forceinline__ void ExclusiveScan(
 286  T input,
 287  T &exclusive_output,
 288  ScanOp scan_op,
 289  BlockPrefixCallbackOp &block_prefix_callback_op)
 290  {
 291  // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
 292  T block_aggregate;
 293  ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
 294 
 295  // Use the first warp to determine the thread block prefix, returning the result in lane0
 296  if (warp_id == 0)
 297  {
 298  T block_prefix = block_prefix_callback_op(block_aggregate);
 299  if (lane_id == 0)
 300  {
 301  // Share the prefix with all threads
 302  temp_storage.block_prefix = block_prefix;
 303  exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0
 304  }
 305  }
 306 
 // Barrier: block_prefix must be visible block-wide before it is read below.
 307  CTA_SYNC();
 308 
 309  // Incorporate thread block prefix into outputs
 310  T block_prefix = temp_storage.block_prefix;
 // tid0 already holds the prefix itself (set above); everyone else prepends it.
 311  if (linear_tid > 0)
 312  {
 313  exclusive_output = scan_op(block_prefix, exclusive_output);
 314  }
 315  }
316 
317 
318  //---------------------------------------------------------------------
319  // Inclusive scans
320  //---------------------------------------------------------------------
321 
 // Block-wide inclusive scan: delegates to the aggregate-returning overload
 // and discards the aggregate.
 323  template <typename ScanOp>
 324  __device__ __forceinline__ void InclusiveScan(
 325  T input,
 326  T &inclusive_output,
 327  ScanOp scan_op)
 328  {
 329  T block_aggregate;
 330  InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
 331  }
332 
333 
 // Block-wide inclusive scan that also returns the block-wide aggregate.
 // Warp-local inclusive scan, then each warp (except warp 0, whose prefix is
 // invalid and unneeded) prepends its warp prefix.
 335  template <typename ScanOp>
 336  __device__ __forceinline__ void InclusiveScan(
 337  T input,
 338  T &inclusive_output,
 339  ScanOp scan_op,
 340  T &block_aggregate)
 341  {
 342  WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
 343 
 344  // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid.
 345  T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
 346 
 347  // Apply warp prefix to our lane's partial
 348  if (warp_id != 0)
 349  {
 350  inclusive_output = scan_op(warp_prefix, inclusive_output);
 351  }
 352  }
353 
354 
 // Block-wide inclusive scan seeded by a caller-supplied prefix functor.
 // Warp 0 invokes the functor with the block aggregate; lane 0 publishes the
 // returned block prefix, which every thread then prepends to its result.
 // (The output parameter is named exclusive_output here but carries the
 // inclusive result, matching the upstream signature.)
 356  template <
 357  typename ScanOp,
 358  typename BlockPrefixCallbackOp>
 359  __device__ __forceinline__ void InclusiveScan(
 360  T input,
 361  T &exclusive_output,
 362  ScanOp scan_op,
 363  BlockPrefixCallbackOp &block_prefix_callback_op)
 364  {
 365  T block_aggregate;
 366  InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
 367 
 368  // Use the first warp to determine the thread block prefix, returning the result in lane0
 369  if (warp_id == 0)
 370  {
 371  T block_prefix = block_prefix_callback_op(block_aggregate);
 372  if (lane_id == 0)
 373  {
 374  // Share the prefix with all threads
 375  temp_storage.block_prefix = block_prefix;
 376  }
 377  }
 378 
 // Barrier: block_prefix must be visible block-wide before it is read below.
 379  CTA_SYNC();
 380 
 381  // Incorporate thread block prefix into outputs
 382  T block_prefix = temp_storage.block_prefix;
 // Inclusive result: every thread (including tid0) prepends the block prefix.
 383  exclusive_output = scan_op(block_prefix, exclusive_output);
 384  }
385 
386 
387 };
388 
389 
390 } // CUB namespace
391 CUB_NS_POSTFIX // Optional outer namespace(s)
392 
__device__ __forceinline__ void InclusiveScan(T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate)
Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate ...
Optional outer namespace(s)
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
struct __align__(32) _TempStorage
Shared memory storage layout type.
The thread block size in threads.
CTA_SYNC()
Definition: util_ptx.cuh:255
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition: util_ptx.cuh:420
Alias wrapper allowing storage to be unioned.
BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread bloc...
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
WarpScan< T, WARPS, PTX_ARCH > WarpAggregateScan
WarpScan utility type.
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Definition: util_type.cuh:275
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
Returns the row-major linear thread identifier for a multidimensional thread block.
Definition: util_ptx.cuh:409
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
Definition: util_type.cuh:634
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
Shared memory storage layout type.
__device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, ScanOp scan_op, T &block_aggregate, Int2Type< WARP >)
WarpScan< T, WARP_THREADS, PTX_ARCH > WarpScanT
WarpScan utility type.
__device__ __forceinline__ BlockScanWarpScans(TempStorage &temp_storage)
Constructor.
__device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate, const T &initial_value)
Use the warp-wide aggregates and initial-value to compute the calling warp's prefix....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....
\smemstorage{WarpScan}
Definition: warp_scan.cuh:192
The WarpScan class provides collective methods for computing a parallel prefix scan of items partitio...
Definition: warp_scan.cuh:146
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Computes an inclusive thread block-wide prefix scan using the specified binary scan_op functor....
__device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp &block_prefix_callback_op)
Computes an exclusive thread block-wide prefix scan using the specified binary scan_op functor....