OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
warp_reduce_smem.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
 */

#pragma once

#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
 */
template <
    typename    T,
    int         LOGICAL_WARP_THREADS,
    int         PTX_ARCH>
struct WarpReduceSmem
{
    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    enum
    {
        /// Whether the logical warp size and the PTX warp size coincide
        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

        /// Whether the logical warp size is a power-of-two
        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of warp scan steps
        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,

        /// The number of threads in half a warp
        HALF_WARP_THREADS = 1 << (STEPS - 1),

        /// The number of shared memory elements per warp
        WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,

        /// FlagT status (when not using ballot)
        UNSET   = 0x0,  // Is initially unset
        SET     = 0x1,  // Is initially set
        SEEN    = 0x2,  // Has seen another head flag from a successor peer
    };
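
    // Illustrative note (not part of the original source): for a sub-warp of
    // LOGICAL_WARP_THREADS = 16 on a 32-thread architectural warp, these
    // evaluate to IS_ARCH_WARP = false, IS_POW_OF_TWO = true, STEPS = 4,
    // HALF_WARP_THREADS = 8, and WARP_SMEM_ELEMENTS = 24; the extra half-warp
    // of padding lets lane_id + OFFSET index past the logical warp boundary
    // without a bounds check.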

    /// Shared memory flag type
    typedef unsigned char SmemFlag;

    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
    struct _TempStorage
    {
        T           reduce[WARP_SMEM_ELEMENTS];
        SmemFlag    flags[WARP_SMEM_ELEMENTS];
    };

    // Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    _TempStorage    &temp_storage;
    unsigned int    lane_id;
    unsigned int    member_mask;


    /******************************************************************************
     * Construction
     ******************************************************************************/

    /// Constructor
    __device__ __forceinline__ WarpReduceSmem(
        TempStorage     &temp_storage)
    :
        temp_storage(temp_storage.Alias()),

        lane_id(IS_ARCH_WARP ?
            LaneId() :
            LaneId() % LOGICAL_WARP_THREADS),

        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO) ?
            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
    {}
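
    // Illustrative note (not part of the original source): with
    // LOGICAL_WARP_THREADS = 16, a thread with LaneId() == 20 belongs to the
    // second logical warp, so lane_id = 20 % 16 = 4 and member_mask selects
    // physical lanes 16..31: (0xffffffff >> 16) << 16 == 0xffff0000.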

    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    //---------------------------------------------------------------------
    // Regular reduction
    //---------------------------------------------------------------------

    /**
     * Reduction step
     */
    template <
        bool            ALL_LANES_VALID,
        typename        ReductionOp,
        int             STEP>
    __device__ __forceinline__ T ReduceStep(
        T               input,
        int             valid_items,
        ReductionOp     reduction_op,
        Int2Type<STEP>  /*step*/)
    {
        const int OFFSET = 1 << STEP;

        // Share input through buffer
        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

        WARP_SYNC(member_mask);

        // Update input if peer_addend is in range
        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items))
        {
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
            input = reduction_op(input, peer_addend);
        }

        WARP_SYNC(member_mask);

        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<STEP + 1>());
    }
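
    // Note (added for clarity): the recursive call above is resolved at
    // compile time. Each instantiation increments STEP until it reaches
    // STEPS, where the terminating overload below is selected, so the whole
    // reduction unrolls into a fixed sequence of STEPS store/sync/load/combine
    // rounds.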

    /**
     * Reduction step (terminate)
     */
    template <
        bool            ALL_LANES_VALID,
        typename        ReductionOp>
    __device__ __forceinline__ T ReduceStep(
        T               input,
        int             valid_items,
        ReductionOp     /*reduction_op*/,
        Int2Type<STEPS> /*step*/)
    {
        return input;
    }


    //---------------------------------------------------------------------
    // Segmented reduction
    //---------------------------------------------------------------------

    /**
     * Ballot-based segmented reduce
     */
    template <
        bool            HEAD_SEGMENTED,
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,
        FlagT           flag,
        ReductionOp     reduction_op,
        Int2Type<true>  /*has_ballot*/)
    {
        // Get the start flags for each thread in the warp.
        int warp_flags = WARP_BALLOT(flag, member_mask);

        if (!HEAD_SEGMENTED)
            warp_flags <<= 1;

        // Keep bits above the current thread.
        warp_flags &= LaneMaskGt();

        // Accommodate packing of multiple logical warps in a single physical warp
        if (!IS_ARCH_WARP)
        {
            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
        }

        // Find next flag
        int next_flag = __clz(__brev(warp_flags));

        // Clip the next segment at the warp boundary if necessary
        if (LOGICAL_WARP_THREADS != 32)
            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);

        #pragma unroll
        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input into buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Update input if peer_addend is in range
            if (OFFSET + lane_id < next_flag)
            {
                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
                input = reduction_op(input, peer_addend);
            }

            WARP_SYNC(member_mask);
        }

        return input;
    }
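
    // Note (added for clarity): __clz(__brev(warp_flags)) yields the index of
    // the least-significant set bit in warp_flags, i.e. the nearest head flag
    // above the calling lane; when no flag is set it yields 32, so the loop
    // simply reduces to the (clipped) logical warp boundary.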


    /**
     * Smem-based segmented reduce
     */
    template <
        bool            HEAD_SEGMENTED,
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,
        FlagT           flag,
        ReductionOp     reduction_op,
        Int2Type<false> /*has_ballot*/)
    {
        enum
        {
            UNSET   = 0x0,  // Is initially unset
            SET     = 0x1,  // Is initially set
            SEEN    = 0x2,  // Has seen another head flag from a successor peer
        };

        // Alias flags onto shared data storage
        volatile SmemFlag *flag_storage = temp_storage.flags;

        SmemFlag flag_status = (flag) ? SET : UNSET;

        for (int STEP = 0; STEP < STEPS; STEP++)
        {
            const int OFFSET = 1 << STEP;

            // Share input through buffer
            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);

            WARP_SYNC(member_mask);

            // Get peer from buffer
            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);

            WARP_SYNC(member_mask);

            // Share flag through buffer
            flag_storage[lane_id] = flag_status;

            // Get peer flag from buffer
            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];

            // Update input if peer was in range
            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
            {
                if (HEAD_SEGMENTED)
                {
                    // Head-segmented
                    if ((flag_status & SEEN) == 0)
                    {
                        // Has not seen a more distant head flag
                        if (peer_flag_status & SET)
                        {
                            // Has now seen a head flag
                            flag_status |= SEEN;
                        }
                        else
                        {
                            // Peer is not a head flag: grab its count
                            input = reduction_op(input, peer_addend);
                        }

                        // Update seen status to include that of peer
                        flag_status |= (peer_flag_status & SEEN);
                    }
                }
                else
                {
                    // Tail-segmented.  Simply propagate flag status
                    if (!flag_status)
                    {
                        input = reduction_op(input, peer_addend);
                        flag_status |= peer_flag_status;
                    }
                }
            }
        }

        return input;
    }


    /******************************************************************************
     * Interface
     ******************************************************************************/

    /**
     * Reduction
     */
    template <
        bool            ALL_LANES_VALID,
        typename        ReductionOp>
    __device__ __forceinline__ T Reduce(
        T               input,
        int             valid_items,
        ReductionOp     reduction_op)
    {
        return ReduceStep<ALL_LANES_VALID>(input, valid_items, reduction_op, Int2Type<0>());
    }


    /**
     * Segmented reduction
     */
    template <
        bool            HEAD_SEGMENTED,
        typename        FlagT,
        typename        ReductionOp>
    __device__ __forceinline__ T SegmentedReduce(
        T               input,
        FlagT           flag,
        ReductionOp     reduction_op)
    {
        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
    }


};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)
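
The listing above is an internal CUB specialization that is normally reached through the cub::WarpReduce facade rather than used directly. For orientation, the sketch below shows how it could be driven by hand; it is illustrative only, and the kernel name, the 64-thread launch shape, and the choice of cub::Sum are assumptions made for the example, not part of this file.

// Illustrative sketch (assumptions noted above): summing within 16-thread
// logical warps using WarpReduceSmem directly.
#include <cub/cub.cuh>

__global__ void warp_sum_kernel(const int *d_in, int *d_out)   // hypothetical kernel
{
    // Specialize for int data, 16-thread logical warps, and the current arch
    typedef cub::WarpReduceSmem<int, 16, CUB_PTX_ARCH> WarpReduceT;

    // One TempStorage per logical warp (64-thread block -> 4 logical warps)
    __shared__ typename WarpReduceT::TempStorage temp_storage[4];

    int warp_id = threadIdx.x / 16;
    int input   = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // ALL_LANES_VALID = true: all 16 lanes contribute, so valid_items (16)
    // is effectively ignored for this power-of-two logical warp
    int aggregate = WarpReduceT(temp_storage[warp_id]).Reduce<true>(input, 16, cub::Sum());

    // Lane 0 of each logical warp holds the warp-wide sum
    if ((threadIdx.x % 16) == 0)
        d_out[blockIdx.x * 4 + warp_id] = aggregate;
}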