OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
agent_reduce.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include <iterator>
37 
38 #include "../block/block_load.cuh"
39 #include "../block/block_reduce.cuh"
40 #include "../grid/grid_mapping.cuh"
41 #include "../grid/grid_even_share.cuh"
42 #include "../util_type.cuh"
43 #include "../iterator/cache_modified_input_iterator.cuh"
44 #include "../util_namespace.cuh"
45 
46 
48 CUB_NS_PREFIX
49 
51 namespace cub {
52 
53 
54 /******************************************************************************
55  * Tuning policy types
56  ******************************************************************************/
57 
61 template <
62  int _BLOCK_THREADS,
63  int _ITEMS_PER_THREAD,
64  int _VECTOR_LOAD_LENGTH,
65  BlockReduceAlgorithm _BLOCK_ALGORITHM,
66  CacheLoadModifier _LOAD_MODIFIER>
67 struct AgentReducePolicy
68 {
69  enum
70  {
71  BLOCK_THREADS = _BLOCK_THREADS,
72  ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
73  VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
74  };
75 
76  static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
77  static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
78 };
79 
80 
81 
82 /******************************************************************************
83  * Thread block abstractions
84  ******************************************************************************/
85 
93 template <
94  typename AgentReducePolicy,
95  typename InputIteratorT,
96  typename OutputIteratorT,
97  typename OffsetT,
98  typename ReductionOp>
99 struct AgentReduce
100 {
101 
102  //---------------------------------------------------------------------
103  // Types and constants
104  //---------------------------------------------------------------------
105 
107  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
108 
110  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
111  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
112  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
113 
115  typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
116 
118  typedef typename If<IsPointer<InputIteratorT>::VALUE,
119  CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
120  InputIteratorT>::Type // Directly use the supplied input iterator type
121  WrappedInputIteratorT;
122 
124  enum
125  {
126  BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS,
127  ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD,
128  VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
129  TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
130 
131  // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
132  ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
133  (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
134  (Traits<InputT>::PRIMITIVE),
135 
136  };
137 
138  static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER;
139  static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
140 
142  typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
143 
145  struct _TempStorage
146  {
147  typename BlockReduceT::TempStorage reduce;
148  };
149 
151  struct TempStorage : Uninitialized<_TempStorage> {};
152 
153 
154  //---------------------------------------------------------------------
155  // Per-thread fields
156  //---------------------------------------------------------------------
157 
158  _TempStorage &temp_storage;
159  InputIteratorT d_in;
160  WrappedInputIteratorT d_wrapped_in;
161  ReductionOp reduction_op;
162 
163 
164  //---------------------------------------------------------------------
165  // Utility
166  //---------------------------------------------------------------------
167 
168 
169  // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
170  template <typename Iterator>
171  static __device__ __forceinline__ bool IsAligned(
172  Iterator d_in,
173  Int2Type<true> /*can_vectorize*/)
174  {
175  return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
176  }
177 
178  // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
179  template <typename Iterator>
180  static __device__ __forceinline__ bool IsAligned(
181  Iterator /*d_in*/,
182  Int2Type<false> /*can_vectorize*/)
183  {
184  return false;
185  }
186 
187 
188  //---------------------------------------------------------------------
189  // Constructor
190  //---------------------------------------------------------------------
191 
195  __device__ __forceinline__ AgentReduce(
196  TempStorage &temp_storage,
197  InputIteratorT d_in,
198  ReductionOp reduction_op)
199  :
200  temp_storage(temp_storage.Alias()),
201  d_in(d_in),
202  d_wrapped_in(d_in),
203  reduction_op(reduction_op)
204  {}
205 
206 
207  //---------------------------------------------------------------------
208  // Tile consumption
209  //---------------------------------------------------------------------
210 
214  template <int IS_FIRST_TILE>
215  __device__ __forceinline__ void ConsumeTile(
216  OutputT &thread_aggregate,
217  OffsetT block_offset,
218  int /*valid_items*/,
219  Int2Type<true> /*is_full_tile*/,
220  Int2Type<false> /*can_vectorize*/)
221  {
222  OutputT items[ITEMS_PER_THREAD];
223 
224  // Load items in striped fashion
225  LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
226 
227  // Reduce items within each thread stripe
228  thread_aggregate = (IS_FIRST_TILE) ?
229  internal::ThreadReduce(items, reduction_op) :
230  internal::ThreadReduce(items, reduction_op, thread_aggregate);
231  }
232 
233 
237  template <int IS_FIRST_TILE>
238  __device__ __forceinline__ void ConsumeTile(
239  OutputT &thread_aggregate,
240  OffsetT block_offset,
241  int /*valid_items*/,
242  Int2Type<true> /*is_full_tile*/,
243  Int2Type<true> /*can_vectorize*/)
244  {
245  // Alias items as an array of VectorT and load it in striped fashion
246  enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
247 
248  // Fabricate a vectorized input iterator
249  InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
250  CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
251  reinterpret_cast<VectorT*>(d_in_unqualified));
252 
253  // Load items as vector items
254  InputT input_items[ITEMS_PER_THREAD];
255  VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
256  #pragma unroll
257  for (int i = 0; i < WORDS; ++i)
258  vec_items[i] = d_vec_in[BLOCK_THREADS * i];
259 
260  // Convert from input type to output type
261  OutputT items[ITEMS_PER_THREAD];
262  #pragma unroll
263  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
264  items[i] = input_items[i];
265 
266  // Reduce items within each thread stripe
267  thread_aggregate = (IS_FIRST_TILE) ?
268  internal::ThreadReduce(items, reduction_op) :
269  internal::ThreadReduce(items, reduction_op, thread_aggregate);
270  }
271 
272 
276  template <int IS_FIRST_TILE, int CAN_VECTORIZE>
277  __device__ __forceinline__ void ConsumeTile(
278  OutputT &thread_aggregate,
279  OffsetT block_offset,
280  int valid_items,
281  Int2Type<false> /*is_full_tile*/,
282  Int2Type<CAN_VECTORIZE> /*can_vectorize*/)
283  {
284  // Partial tile
285  int thread_offset = threadIdx.x;
286 
287  // Read first item
288  if ((IS_FIRST_TILE) && (thread_offset < valid_items))
289  {
290  thread_aggregate = d_wrapped_in[block_offset + thread_offset];
291  thread_offset += BLOCK_THREADS;
292  }
293 
294  // Continue reading items (block-striped)
295  while (thread_offset < valid_items)
296  {
297  OutputT item = d_wrapped_in[block_offset + thread_offset];
298  thread_aggregate = reduction_op(thread_aggregate, item);
299  thread_offset += BLOCK_THREADS;
300  }
301  }
302 
303 
304  //---------------------------------------------------------------------
305  // Consume a contiguous segment of tiles
306  //---------------------------------------------------------------------
307 
311  template <int CAN_VECTORIZE>
312  __device__ __forceinline__ OutputT ConsumeRange(
313  GridEvenShare<OffsetT> &even_share,
314  Int2Type<CAN_VECTORIZE> can_vectorize)
315  {
316  OutputT thread_aggregate;
317 
318  if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
319  {
320  // First tile isn't full (not all threads have valid items)
321  int valid_items = even_share.block_end - even_share.block_offset;
322  ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
323  return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
324  }
325 
326  // At least one full block
327  ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
328  even_share.block_offset += even_share.block_stride;
329 
330  // Consume subsequent full tiles of input
331  while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
332  {
333  ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
334  even_share.block_offset += even_share.block_stride;
335  }
336 
337  // Consume a partially-full tile
338  if (even_share.block_offset < even_share.block_end)
339  {
340  int valid_items = even_share.block_end - even_share.block_offset;
341  ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
342  }
343 
344  // Compute block-wide reduction (all threads have valid items)
345  return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
346  }
347 
348 
352  __device__ __forceinline__ OutputT ConsumeRange(
353  OffsetT block_offset,
354  OffsetT block_end)
355  {
356  GridEvenShare<OffsetT> even_share;
357  even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
358 
359  return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
360  ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
361  ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
362  }
363 
364 
368  __device__ __forceinline__ OutputT ConsumeTiles(
369  GridEvenShare<OffsetT> &even_share)
370  {
371  // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
372  even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
373 
374  return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
375  ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
376  ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
377 
378  }
379 
380 };
381 
382 
383 } // CUB namespace
384 CUB_NS_POSTFIX // Optional outer namespace(s)
385 
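For orientation, AgentReducePolicy above is a compile-time bundle of the launch constants consumed by AgentReduce: threads per block, items per thread, vector load width, block-wide reduction algorithm, and cache load modifier. A hypothetical instantiation might look as follows; the numeric values and the chosen algorithm/modifier are illustrative assumptions, not the defaults shipped with CUB:

    // Hypothetical tuning policy (example values only):
    // 256 threads per block, 16 items per thread, 4-wide vectorized loads,
    // warp-reduction-based block reduction, and LDG cache-modified loads.
    typedef cub::AgentReducePolicy<
            256,                                // _BLOCK_THREADS
            16,                                 // _ITEMS_PER_THREAD
            4,                                  // _VECTOR_LOAD_LENGTH
            cub::BLOCK_REDUCE_WARP_REDUCTIONS,  // _BLOCK_ALGORITHM
            cub::LOAD_LDG>                      // _LOAD_MODIFIER
        ExampleReducePolicy;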
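A device-wide reduction kernel typically instantiates AgentReduce once per thread block, lets it consume the tiles assigned to that block by a GridEvenShare descriptor, and has thread 0 write out the block-wide aggregate. The sketch below illustrates that pattern under assumed names (ExampleReducePolicy from the previous snippet, a hypothetical d_block_aggregates output array); it mirrors the structure of the library's dispatch kernels but is not the actual implementation:

    // Sketch of a kernel driving AgentReduce (illustrative only).
    template <
        typename PolicyT,
        typename InputIteratorT,
        typename OutputIteratorT,
        typename OffsetT,
        typename ReductionOpT>
    __global__ void ExampleReduceKernel(
        InputIteratorT              d_in,                // [in] Input data to reduce
        OutputIteratorT             d_block_aggregates,  // [out] One partial aggregate per thread block
        cub::GridEvenShare<OffsetT> even_share,          // [in] Descriptor of tile-aligned work ranges
        ReductionOpT                reduction_op)        // [in] Binary reduction operator
    {
        typedef cub::AgentReduce<PolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT> AgentReduceT;

        // Shared memory backing the block-wide reduction
        __shared__ typename AgentReduceT::TempStorage temp_storage;

        // Consume the tiles assigned to this thread block (strip-mined even-share mapping)
        typename AgentReduceT::OutputT block_aggregate =
            AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

        // Only thread 0 holds the valid block-wide result
        if (threadIdx.x == 0)
            d_block_aggregates[blockIdx.x] = block_aggregate;
    }

Such a kernel would be instantiated with a concrete tuning policy (for example, the hypothetical ExampleReducePolicy above) and launched on a grid sized from the even-share descriptor; a second, single-block pass would then reduce the per-block partial aggregates.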