OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
warp_reduce_smem.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items
 * partitioned across a CUDA thread warp.
 */

#pragma once

#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
 */
template <
    typename    T,                      ///< Data type being reduced
    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
struct WarpReduceSmem
{
    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of threads in half a warp
        HALF_WARP_THREADS = 1 << (STEPS - 1),

        /// The number of shared memory elements per warp
        WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,

        /// FlagT status (when not using ballot)
        UNSET = 0x0,    // Is initially unset
        SET   = 0x1,    // Is initially set
        SEEN  = 0x2,    // Has seen another head flag from a successor peer
    };

    /// Shared memory flag type
    typedef unsigned char SmemFlag;

    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
    struct _TempStorage
    {
        T           reduce[WARP_SMEM_ELEMENTS];
        SmemFlag    flags[WARP_SMEM_ELEMENTS];
    };

    // Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};

    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    _TempStorage    &temp_storage;
    unsigned int    lane_id;
    unsigned int    member_mask;


    /******************************************************************************
     * Construction
     ******************************************************************************/

    /// Constructor
    __device__ __forceinline__ WarpReduceSmem(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),

        lane_id(IS_ARCH_WARP ?
            LaneId() :
            LaneId() % LOGICAL_WARP_THREADS),

        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO) ?
            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
    {}
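    // Illustrative note (not in the original source): for a sub-warp of
    // LOGICAL_WARP_THREADS == 16 tiled onto a 32-thread physical warp,
    // 0xffffffff >> (32 - 16) yields 0x0000ffff, and the shift places it at
    // the logical warp's position: lanes 0-15 get member_mask 0x0000ffff and
    // lanes 16-31 get 0xffff0000, so each logical warp syncs only with itself.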

    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    //---------------------------------------------------------------------
    // Regular reduction
    //---------------------------------------------------------------------

    /**
     * Reduction step
     */
    template <
        bool        ALL_LANES_VALID,
        typename    ReductionOp,
        int         STEP>
    __device__ __forceinline__ T ReduceStep(
        T               input,
        int             valid_items,
        ReductionOp     reduction_op,
        Int2Type<STEP>  /*step*/)
    {
        const int OFFSET = 1 << STEP;

        // Share input through buffer
        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

        WARP_SYNC(member_mask);

        // Update input if peer_addend is in range
        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
        {
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
            input = reduction_op(input, peer_addend);
        }

        WARP_SYNC(member_mask);

        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());
    }
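    // Illustrative note (not in the original source): for a full 32-lane warp
    // this recursion unrolls into STEPS == 5 passes with OFFSET = 1, 2, 4, 8, 16,
    // folding the shared-memory buffer as a binary tree; after the final pass
    // lane 0 holds the reduction of all valid inputs.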


    /**
     * Reduction step (terminate)
     */
    template <
        bool        ALL_LANES_VALID,
        typename    ReductionOp>
    __device__ __forceinline__ T ReduceStep(
        T               input,
        int             valid_items,
        ReductionOp     /*reduction_op*/,
        Int2Type<STEPS> /*step*/)
    {
        return input;
    }


    //---------------------------------------------------------------------
    // Segmented reduction
    //---------------------------------------------------------------------

    /**
     * Ballot-based segmented reduce
     */
    template <
        bool            HEAD_SEGMENTED,
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,
        FlagT           flag,
        ReductionOp     reduction_op,
        Int2Type<true>  /*has_ballot*/)
    {
        // Get the start flags for each thread in the warp.
        int warp_flags = WARP_BALLOT(flag, member_mask);

        if (!HEAD_SEGMENTED)
            warp_flags <<= 1;

        // Keep bits above the current thread.
        warp_flags &= LaneMaskGt();

        // Accommodate packing of multiple logical warps in a single physical warp
        if (!IS_ARCH_WARP)
        {
            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
        }

        // Find next flag
        int next_flag = __clz(__brev(warp_flags));

        // Clip the next segment at the warp boundary if necessary
        if (LOGICAL_WARP_THREADS != 32)
            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);

        #pragma unroll
        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input into buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Update input if peer_addend is in range
            if (OFFSET + lane_id < next_flag)
            {
                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
                input = reduction_op(input, peer_addend);
            }

            WARP_SYNC(member_mask);
        }

        return input;
    }
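    // Illustrative note (not in the original source): with HEAD_SEGMENTED and
    // head flags on lanes 0 and 8, lane 3 keeps only bit 8 after masking with
    // LaneMaskGt(), so __clz(__brev(warp_flags)) gives next_flag == 8 and the
    // tree steps above fold in peers from lanes 4..7 only, confining the
    // reduction to lane 3's own segment.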


    /**
     * Smem-based segmented reduce
     */
    template <
        bool            HEAD_SEGMENTED,
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,
        FlagT           flag,
        ReductionOp     reduction_op,
        Int2Type<false> /*has_ballot*/)
    {
        enum
        {
            UNSET = 0x0,    // Is initially unset
            SET   = 0x1,    // Is initially set
            SEEN  = 0x2,    // Has seen another head flag from a successor peer
        };

        // Alias flags onto shared data storage
        volatile SmemFlag *flag_storage = temp_storage.flags;

        SmemFlag flag_status = (flag) ? SET : UNSET;

        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input through buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Get peer from buffer
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);

            WARP_SYNC(member_mask);

            // Share flag through buffer
            flag_storage[lane_id] = flag_status;

            // Get peer flag from buffer
            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];

            // Update input if peer was in range
            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
            {
                if (HEAD_SEGMENTED)
                {
                    // Head-segmented
                    if ((flag_status & SEEN) == 0)
                    {
                        // Has not seen a more distant head flag
                        if (peer_flag_status & SET)
                        {
                            // Has now seen a head flag
                            flag_status |= SEEN;
                        }
                        else
                        {
                            // Peer is not a head flag: grab its count
                            input = reduction_op(input, peer_addend);
                        }

                        // Update seen status to include that of peer
                        flag_status |= (peer_flag_status & SEEN);
                    }
                }
                else
                {
                    // Tail-segmented. Simply propagate flag status
                    if (!flag_status)
                    {
                        input = reduction_op(input, peer_addend);
                        flag_status |= peer_flag_status;
                    }
                }
            }
        }

        return input;
    }
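    // Illustrative note (not in the original source): without ballot support
    // the head flags travel through shared memory alongside the data; SEEN
    // latches once a successor's head flag is observed, stopping a lane from
    // accumulating peers beyond the first segment boundary to its right.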


    /******************************************************************************
     * Interface
     ******************************************************************************/

    /**
     * Reduction
     */
    template <
        bool            ALL_LANES_VALID,
        typename        ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,
        int             valid_items,
        ReductionOp     reduction_op)
    {
        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());
    }


    /**
     * Segmented reduction
     */
    template <
        bool            HEAD_SEGMENTED,
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,
        FlagT           flag,
        ReductionOp     reduction_op)
    {
        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
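
A minimal usage sketch (not part of this header), assuming one block of 32 threads, i.e. a single full warp; the kernel name warp_sum_kernel and the buffers d_in/d_out are illustrative only. In application code the public cub::WarpReduce front-end is the supported entry point, but the internal WarpReduceSmem specialization can be exercised directly like this:

#include <cub/cub.cuh>

// Illustrative kernel: one block of 32 threads, i.e. a single full warp.
__global__ void warp_sum_kernel(const int *d_in, int *d_out)
{
    // Specialize the smem-based reduction for int across one 32-lane logical warp
    typedef cub::WarpReduceSmem<int, 32, CUB_PTX_ARCH> WarpReduceT;

    // Shared-memory backing for the reduction buffer (1.5 warps-worth of elements)
    __shared__ typename WarpReduceT::TempStorage temp_storage;

    int input = d_in[threadIdx.x];

    // ALL_LANES_VALID == true: all 32 lanes contribute; lane 0 receives the warp-wide sum
    int warp_sum = WarpReduceT(temp_storage).Reduce<true>(input, 32, cub::Sum());

    if (threadIdx.x == 0)
        *d_out = warp_sum;
}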