OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
agent_reduce.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 */

#pragma once

#include <iterator>

#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_namespace.cuh"


/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * Tuning policy types
 ******************************************************************************/

/**
 * Parameterizable tuning policy type for AgentReduce
 */
template <
    int                     _BLOCK_THREADS,         ///< Threads per thread block
    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache load modifier for reading input elements
struct AgentReducePolicy
{
    enum
    {
        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
    };

    static const BlockReduceAlgorithm   BLOCK_ALGORITHM = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
    static const CacheLoadModifier      LOAD_MODIFIER   = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
};
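// For orientation, a minimal sketch of how such a policy might be instantiated.
// The concrete values below are illustrative assumptions, not taken from this file:
//
//   typedef AgentReducePolicy<
//       256,                            // _BLOCK_THREADS
//       16,                             // _ITEMS_PER_THREAD
//       4,                              // _VECTOR_LOAD_LENGTH
//       BLOCK_REDUCE_WARP_REDUCTIONS,   // _BLOCK_ALGORITHM
//       LOAD_LDG>                       // _LOAD_MODIFIER
//       ExampleReducePolicyT;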


/******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

/**
 * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
 *
 * Each thread reduces only the values it loads. If \p FIRST_TILE, this
 * partial reduction is stored into \p thread_aggregate. Otherwise it is
 * accumulated into \p thread_aggregate.
 */
template <
    typename AgentReducePolicy,     ///< Parameterized AgentReducePolicy tuning policy type
    typename InputIteratorT,        ///< Random-access iterator type for input
    typename OutputIteratorT,       ///< Random-access iterator type for output
    typename OffsetT,               ///< Signed integer type for global offsets
    typename ReductionOp>           ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct AgentReduce
{

    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    /// The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    /// The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    /// Vector type of InputT for data movement
    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;

    /// Input iterator wrapper type (for applying cache modifier)
    typedef typename If<IsPointer<InputIteratorT>::VALUE,
            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,  // Wrap the native input pointer with CacheModifiedInputIterator
            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
        WrappedInputIteratorT;

    /// Constants
    enum
    {
        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,

        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
                                  (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
                                  (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
    };

    static const CacheLoadModifier      LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;     ///< Cache load modifier for reading input elements
    static const BlockReduceAlgorithm   BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;   ///< Cooperative block-wide reduction algorithm to use

    /// Parameterized BlockReduce primitive
    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;

    /// Shared memory type required by this thread block
    struct _TempStorage
    {
        typename BlockReduceT::TempStorage reduce;
    };

    /// Alias wrapper allowing storage to be unioned
    struct TempStorage : Uninitialized<_TempStorage> {};

    //---------------------------------------------------------------------
    // Per-thread fields
    //---------------------------------------------------------------------

    _TempStorage&           temp_storage;       ///< Reference to temp_storage
    InputIteratorT          d_in;               ///< Input data to reduce
    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
    ReductionOp             reduction_op;       ///< Binary reduction operator


    //---------------------------------------------------------------------
    // Utility
    //---------------------------------------------------------------------


    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        d_in,
        Int2Type<true>  /*can_vectorize*/)
    {
        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
    }

    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
    template <typename Iterator>
    static __device__ __forceinline__ bool IsAligned(
        Iterator        /*d_in*/,
        Int2Type<false> /*can_vectorize*/)
    {
        return false;
    }


    //---------------------------------------------------------------------
    // Constructor
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__ AgentReduce(
        TempStorage&    temp_storage,       ///< Reference to temp_storage
        InputIteratorT  d_in,               ///< Input data to reduce
        ReductionOp     reduction_op)       ///< Binary reduction operator
    :
        temp_storage(temp_storage.Alias()),
        d_in(d_in),
        d_wrapped_in(d_in),
        reduction_op(reduction_op)
    {}


    //---------------------------------------------------------------------
    // Tile consumption
    //---------------------------------------------------------------------

    /**
     * Consume a full tile of input (non-vectorized)
     */
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset the tile to consume
        int                     /*valid_items*/,    ///< The number of valid items in the tile
        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        OutputT items[ITEMS_PER_THREAD];

        // Load items in striped fashion
        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);

        // Reduce items within each thread stripe
        thread_aggregate = (IS_FIRST_TILE) ?
            internal::ThreadReduce(items, reduction_op) :
            internal::ThreadReduce(items, reduction_op, thread_aggregate);
    }


    /**
     * Consume a full tile of input (vectorized)
     */
    template <int IS_FIRST_TILE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset the tile to consume
        int                     /*valid_items*/,    ///< The number of valid items in the tile
        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        // Alias items as an array of VectorT and load it in striped fashion
        enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };

        // Fabricate a vectorized input iterator
        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
            reinterpret_cast<VectorT*>(d_in_unqualified));

        // Load items as vector items
        InputT input_items[ITEMS_PER_THREAD];
        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
        #pragma unroll
        for (int i = 0; i < WORDS; ++i)
            vec_items[i] = d_vec_in[BLOCK_THREADS * i];

        // Convert from input type to output type
        OutputT items[ITEMS_PER_THREAD];
        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            items[i] = input_items[i];

        // Reduce items within each thread stripe
        thread_aggregate = (IS_FIRST_TILE) ?
            internal::ThreadReduce(items, reduction_op) :
            internal::ThreadReduce(items, reduction_op, thread_aggregate);
    }


    /**
     * Consume a partial tile of input
     */
    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
    __device__ __forceinline__ void ConsumeTile(
        OutputT                 &thread_aggregate,
        OffsetT                 block_offset,       ///< The offset the tile to consume
        int                     valid_items,        ///< The number of valid items in the tile
        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
    {
        // Partial tile
        int thread_offset = threadIdx.x;

        // Read first item
        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
        {
            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
            thread_offset += BLOCK_THREADS;
        }

        // Continue reading items (block-striped)
        while (thread_offset < valid_items)
        {
            OutputT item = d_wrapped_in[block_offset + thread_offset];
            thread_aggregate = reduction_op(thread_aggregate, item);
            thread_offset += BLOCK_THREADS;
        }
    }

    //---------------------------------------------------------------------
    // Consume a contiguous segment of tiles
    //---------------------------------------------------------------------

    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    template <int CAN_VECTORIZE>
    __device__ __forceinline__ OutputT ConsumeRange(
        GridEvenShare<OffsetT>  &even_share,        ///< GridEvenShare descriptor
        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
    {
        OutputT thread_aggregate;

        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
        {
            // First tile isn't full (not all threads have valid items)
            int valid_items = even_share.block_end - even_share.block_offset;
            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
        }

        // At least one full block
        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
        even_share.block_offset += even_share.block_stride;

        // Consume subsequent full tiles of input
        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
        {
            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
            even_share.block_offset += even_share.block_stride;
        }

        // Consume a partially-full tile
        if (even_share.block_offset < even_share.block_end)
        {
            int valid_items = even_share.block_end - even_share.block_offset;
            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
        }

        // Compute block-wide reduction (all threads have valid items)
        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
    }


    /**
     * \brief Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeRange(
        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
    {
        GridEvenShare<OffsetT> even_share;
        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);

        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }


    /**
     * Reduce a contiguous segment of input tiles
     */
    __device__ __forceinline__ OutputT ConsumeTiles(
        GridEvenShare<OffsetT> &even_share)         ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
    {
        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
    }

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

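For orientation, the sketch below shows one way an AgentReduce specialization is typically driven from a device-wide reduction kernel: each thread block consumes its even-share of tiles and emits one partial result. The kernel name ExampleReduceKernel, the per-block output iterator d_out_partials, and the tuning policy are illustrative assumptions, not part of this header; only the AgentReduce, TempStorage, and ConsumeTiles usage mirrors the interface defined above.

// Hypothetical kernel sketch: one block-wide partial reduction per thread block.
template <
    typename ReducePolicyT,
    typename InputIteratorT,
    typename OutputIteratorT,
    typename OffsetT,
    typename ReductionOp>
__global__ void ExampleReduceKernel(
    InputIteratorT          d_in,               // Input data to reduce
    OutputIteratorT         d_out_partials,     // Hypothetical output: one partial per thread block
    GridEvenShare<OffsetT>  even_share,         // Even-share work descriptor (assumed initialized by the caller)
    ReductionOp             reduction_op)       // Binary reduction operator
{
    // Thread-block reduction abstraction specialized by the tuning policy
    typedef AgentReduce<ReducePolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp> AgentReduceT;

    // Shared memory required by the block-wide reduction
    __shared__ typename AgentReduceT::TempStorage temp_storage;

    // Consume this block's even-share of tiles and compute a block-wide aggregate
    typename AgentReduceT::OutputT block_aggregate =
        AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

    // Thread 0 records the block-wide partial reduction
    if (threadIdx.x == 0)
        d_out_partials[blockIdx.x] = block_aggregate;
}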