OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
warp_scan_smem.cuh
Go to the documentation of this file.
1/******************************************************************************
2 * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the NVIDIA CORPORATION nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *
27 ******************************************************************************/
28
34#pragma once
35
36#include "../../thread/thread_operators.cuh"
37#include "../../thread/thread_load.cuh"
38#include "../../thread/thread_store.cuh"
39#include "../../util_type.cuh"
40#include "../../util_namespace.cuh"
41
43CUB_NS_PREFIX
44
46namespace cub {
47
51template <
52 typename T,
53 int LOGICAL_WARP_THREADS,
54 int PTX_ARCH>
56{
57 /******************************************************************************
58 * Constants and type definitions
59 ******************************************************************************/
60
61 enum
62 {
64 IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
65
68
71
74
76 WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
77 };
78
80 typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
81
84
85 // Alias wrapper allowing storage to be unioned
86 struct TempStorage : Uninitialized<_TempStorage> {};
87
88
89 /******************************************************************************
90 * Thread fields
91 ******************************************************************************/
92
93 _TempStorage &temp_storage;
94 unsigned int lane_id;
95 unsigned int member_mask;
96
97
98 /******************************************************************************
99 * Construction
100 ******************************************************************************/
101
103 __device__ __forceinline__ WarpScanSmem(
104 TempStorage &temp_storage)
105 :
106 temp_storage(temp_storage.Alias()),
107
108 lane_id(IS_ARCH_WARP ?
109 LaneId() :
110 LaneId() % LOGICAL_WARP_THREADS),
111
112 member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
113 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
114 ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
115 {}
116
117
118 /******************************************************************************
119 * Utility methods
120 ******************************************************************************/
121
123 template <
124 bool HAS_IDENTITY,
125 int STEP,
126 typename ScanOp>
127 __device__ __forceinline__ void ScanStep(
128 T &partial,
129 ScanOp scan_op,
130 Int2Type<STEP> /*step*/)
131 {
132 const int OFFSET = 1 << STEP;
133
134 // Share partial into buffer
135 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
136
137 WARP_SYNC(member_mask);
138
139 // Update partial if addend is in range
140 if (HAS_IDENTITY || (lane_id >= OFFSET))
141 {
142 T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
143 partial = scan_op(addend, partial);
144 }
145 WARP_SYNC(member_mask);
146
147 ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
148 }
149
150
152 template <
153 bool HAS_IDENTITY,
154 typename ScanOp>
155 __device__ __forceinline__ void ScanStep(
156 T &/*partial*/,
157 ScanOp /*scan_op*/,
158 Int2Type<STEPS> /*step*/)
159 {}
160
161
163 __device__ __forceinline__ void InclusiveScan(
164 T input,
165 T &output,
166 Sum scan_op,
167 Int2Type<true> /*is_primitive*/)
168 {
169 T identity = 0;
170 ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
171
172 WARP_SYNC(member_mask);
173
174 // Iterate scan steps
175 output = input;
176 ScanStep<true>(output, scan_op, Int2Type<0>());
177 }
178
179
181 template <typename ScanOp, int IS_PRIMITIVE>
182 __device__ __forceinline__ void InclusiveScan(
183 T input,
184 T &output,
185 ScanOp scan_op,
186 Int2Type<IS_PRIMITIVE> /*is_primitive*/)
187 {
188 // Iterate scan steps
189 output = input;
190 ScanStep<false>(output, scan_op, Int2Type<0>());
191 }
192
193
194 /******************************************************************************
195 * Interface
196 ******************************************************************************/
197
198 //---------------------------------------------------------------------
199 // Broadcast
200 //---------------------------------------------------------------------
201
203 __device__ __forceinline__ T Broadcast(
204 T input,
205 unsigned int src_lane)
206 {
207 if (lane_id == src_lane)
208 {
209 ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
210 }
211
212 WARP_SYNC(member_mask);
213
214 return (T)ThreadLoad<LOAD_VOLATILE>(temp_storage);
215 }
216
217
218 //---------------------------------------------------------------------
219 // Inclusive operations
220 //---------------------------------------------------------------------
221
223 template <typename ScanOp>
224 __device__ __forceinline__ void InclusiveScan(
225 T input,
226 T &inclusive_output,
227 ScanOp scan_op)
228 {
229 InclusiveScan(input, inclusive_output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
230 }
231
232
234 template <typename ScanOp>
235 __device__ __forceinline__ void InclusiveScan(
236 T input,
237 T &inclusive_output,
238 ScanOp scan_op,
239 T &warp_aggregate)
240 {
241 InclusiveScan(input, inclusive_output, scan_op);
242
243 // Retrieve aggregate
244 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
245
246 WARP_SYNC(member_mask);
247
248 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
249
250 WARP_SYNC(member_mask);
251 }
252
253
254 //---------------------------------------------------------------------
255 // Get exclusive from inclusive
256 //---------------------------------------------------------------------
257
259 template <typename ScanOpT, typename IsIntegerT>
260 __device__ __forceinline__ void Update(
261 T /*input*/,
262 T &inclusive,
263 T &exclusive,
264 ScanOpT /*scan_op*/,
265 IsIntegerT /*is_integer*/)
266 {
267 // initial value unknown
268 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
269
270 WARP_SYNC(member_mask);
271
272 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
273 }
274
276 __device__ __forceinline__ void Update(
277 T input,
278 T &inclusive,
279 T &exclusive,
280 cub::Sum /*scan_op*/,
281 Int2Type<true> /*is_integer*/)
282 {
283 // initial value presumed 0
284 exclusive = inclusive - input;
285 }
286
288 template <typename ScanOpT, typename IsIntegerT>
289 __device__ __forceinline__ void Update (
290 T /*input*/,
291 T &inclusive,
292 T &exclusive,
293 ScanOpT scan_op,
294 T initial_value,
295 IsIntegerT /*is_integer*/)
296 {
297 inclusive = scan_op(initial_value, inclusive);
298 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
299
300 WARP_SYNC(member_mask);
301
302 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
303 if (lane_id == 0)
304 exclusive = initial_value;
305 }
306
308 __device__ __forceinline__ void Update (
309 T input,
310 T &inclusive,
311 T &exclusive,
313 T initial_value,
314 Int2Type<true> /*is_integer*/)
315 {
316 inclusive = scan_op(initial_value, inclusive);
317 exclusive = inclusive - input;
318 }
319
320
322 template <typename ScanOpT, typename IsIntegerT>
323 __device__ __forceinline__ void Update (
324 T /*input*/,
325 T &inclusive,
326 T &exclusive,
327 T &warp_aggregate,
328 ScanOpT /*scan_op*/,
329 IsIntegerT /*is_integer*/)
330 {
331 // Initial value presumed to be unknown or identity (either way our padding is correct)
332 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
333
334 WARP_SYNC(member_mask);
335
336 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
337 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
338 }
339
341 __device__ __forceinline__ void Update (
342 T input,
343 T &inclusive,
344 T &exclusive,
345 T &warp_aggregate,
346 cub::Sum /*scan_o*/,
347 Int2Type<true> /*is_integer*/)
348 {
349 // Initial value presumed to be unknown or identity (either way our padding is correct)
350 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
351
352 WARP_SYNC(member_mask);
353
354 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
355 exclusive = inclusive - input;
356 }
357
359 template <typename ScanOpT, typename IsIntegerT>
360 __device__ __forceinline__ void Update (
361 T /*input*/,
362 T &inclusive,
363 T &exclusive,
364 T &warp_aggregate,
365 ScanOpT scan_op,
366 T initial_value,
367 IsIntegerT /*is_integer*/)
368 {
369 // Broadcast warp aggregate
370 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
371
372 WARP_SYNC(member_mask);
373
374 warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
375
376 WARP_SYNC(member_mask);
377
378 // Update inclusive with initial value
379 inclusive = scan_op(initial_value, inclusive);
380
381 // Get exclusive from exclusive
382 ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive);
383
384 WARP_SYNC(member_mask);
385
386 exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 2]);
387
388 if (lane_id == 0)
389 exclusive = initial_value;
390 }
391
392
393};
394
395
396} // CUB namespace
397CUB_NS_POSTFIX // Optional outer namespace(s)
BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
Definition util_ptx.cuh:273
__device__ __forceinline__ unsigned int LaneId()
Returns the warp lane ID of the calling thread.
Definition util_ptx.cuh:420
Optional outer namespace(s)
OutputIteratorT ScanTileStateT int ScanOpT scan_op
Binary scan functor.
Type equality test.
Definition util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static c...
Statically determine log2(N), rounded up.
Statically determine if N is a power-of-two.
Default sum functor.
Type traits.
A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions.
WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA ...
__device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op, Int2Type< IS_PRIMITIVE >)
Inclusive prefix scan.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum, Int2Type< true >)
Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op)
Inclusive scan.
__device__ __forceinline__ WarpScanSmem(TempStorage &temp_storage)
Constructor.
@ IS_POW_OF_TWO
Whether the logical warp size is a power-of-two.
@ HALF_WARP_THREADS
The number of threads in half a warp.
@ STEPS
The number of warp scan steps.
@ IS_ARCH_WARP
Whether the logical warp size and the PTX warp size coincide.
@ WARP_SMEM_ELEMENTS
The number of shared memory elements per warp.
CellT _TempStorage[WARP_SMEM_ELEMENTS]
Shared memory storage layout type (1.5 warps-worth of elements for each warp)
__device__ __forceinline__ void ScanStep(T &partial, ScanOp scan_op, Int2Type< STEP >)
Basic inclusive scan iteration (template unrolled, inductive-case specialization)
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT, IsIntegerT)
Update inclusive, exclusive, and warp aggregate using input and inclusive.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, cub::Sum scan_op, T initial_value, Int2Type< true >)
Update inclusive and exclusive using initial value using input and inclusive (specialized for summati...
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT scan_op, T initial_value, IsIntegerT)
Update inclusive and exclusive using initial value using input, inclusive, and initial value.
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, ScanOpT, IsIntegerT)
Update inclusive and exclusive using input and inclusive.
__device__ __forceinline__ void Update(T input, T &inclusive, T &exclusive, T &warp_aggregate, cub::Sum, Int2Type< true >)
Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation ...
__device__ __forceinline__ void InclusiveScan(T input, T &output, Sum scan_op, Int2Type< true >)
Inclusive prefix scan (specialized for summation across primitive types)
__device__ __forceinline__ T Broadcast(T input, unsigned int src_lane)
Broadcast.
__device__ __forceinline__ void Update(T, T &inclusive, T &exclusive, T &warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT)
Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value.
__device__ __forceinline__ void ScanStep(T &, ScanOp, Int2Type< STEPS >)
Basic inclusive scan iteration(template unrolled, base-case specialization)
__device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &warp_aggregate)
Inclusive scan with aggregate.