OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
device_segmented_reduce.cuh
Go to the documentation of this file.
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39
40#include "../iterator/arg_index_input_iterator.cuh"
43#include "../util_type.cuh"
44#include "../util_namespace.cuh"
45
47CUB_NS_PREFIX
48
50namespace cub {
51
52
66{
126 template <
127 typename InputIteratorT,
128 typename OutputIteratorT,
129 typename OffsetIteratorT,
130 typename ReductionOp,
131 typename T>
132 CUB_RUNTIME_FUNCTION
133 static cudaError_t Reduce(
134 void *d_temp_storage,
135 size_t &temp_storage_bytes,
136 InputIteratorT d_in,
137 OutputIteratorT d_out,
138 int num_segments,
139 OffsetIteratorT d_begin_offsets,
140 OffsetIteratorT d_end_offsets,
141 ReductionOp reduction_op,
142 T initial_value,
143 cudaStream_t stream = 0,
144 bool debug_synchronous = false)
145 {
146 // Signed integer type for global offsets
147 typedef int OffsetT;
148
150 d_temp_storage,
151 temp_storage_bytes,
152 d_in,
153 d_out,
154 num_segments,
158 initial_value,
159 stream,
160 debug_synchronous);
161 }
162
163
210 template <
211 typename InputIteratorT,
212 typename OutputIteratorT,
213 typename OffsetIteratorT>
214 CUB_RUNTIME_FUNCTION
215 static cudaError_t Sum(
216 void *d_temp_storage,
217 size_t &temp_storage_bytes,
218 InputIteratorT d_in,
219 OutputIteratorT d_out,
220 int num_segments,
221 OffsetIteratorT d_begin_offsets,
222 OffsetIteratorT d_end_offsets,
223 cudaStream_t stream = 0,
224 bool debug_synchronous = false)
225 {
226 // Signed integer type for global offsets
227 typedef int OffsetT;
228
229 // The output value type
230 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
231 typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
232 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
233
235 d_temp_storage,
236 temp_storage_bytes,
237 d_in,
238 d_out,
239 num_segments,
242 cub::Sum(),
243 OutputT(), // zero-initialize
244 stream,
245 debug_synchronous);
246 }
247
248
295 template <
296 typename InputIteratorT,
297 typename OutputIteratorT,
298 typename OffsetIteratorT>
299 CUB_RUNTIME_FUNCTION
300 static cudaError_t Min(
301 void *d_temp_storage,
302 size_t &temp_storage_bytes,
303 InputIteratorT d_in,
304 OutputIteratorT d_out,
305 int num_segments,
306 OffsetIteratorT d_begin_offsets,
307 OffsetIteratorT d_end_offsets,
308 cudaStream_t stream = 0,
309 bool debug_synchronous = false)
310 {
311 // Signed integer type for global offsets
312 typedef int OffsetT;
313
314 // The input value type
315 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
316
318 d_temp_storage,
319 temp_storage_bytes,
320 d_in,
321 d_out,
322 num_segments,
325 cub::Min(),
326 Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
327 stream,
328 debug_synchronous);
329 }
330
331
380 template <
381 typename InputIteratorT,
382 typename OutputIteratorT,
383 typename OffsetIteratorT>
384 CUB_RUNTIME_FUNCTION
385 static cudaError_t ArgMin(
386 void *d_temp_storage,
387 size_t &temp_storage_bytes,
388 InputIteratorT d_in,
389 OutputIteratorT d_out,
390 int num_segments,
391 OffsetIteratorT d_begin_offsets,
392 OffsetIteratorT d_end_offsets,
393 cudaStream_t stream = 0,
394 bool debug_synchronous = false)
395 {
396 // Signed integer type for global offsets
397 typedef int OffsetT;
398
399 // The input type
400 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
401
402 // The output tuple type
403 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
404 KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
405 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
406
407 // The output value type
408 typedef typename OutputTupleT::Value OutputValueT;
409
410 // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
412 ArgIndexInputIteratorT d_indexed_in(d_in);
413
414 // Initial value
415 OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
416
418 d_temp_storage,
419 temp_storage_bytes,
420 d_indexed_in,
421 d_out,
422 num_segments,
425 cub::ArgMin(),
426 initial_value,
427 stream,
428 debug_synchronous);
429 }
430
431
478 template <
479 typename InputIteratorT,
480 typename OutputIteratorT,
481 typename OffsetIteratorT>
482 CUB_RUNTIME_FUNCTION
483 static cudaError_t Max(
484 void *d_temp_storage,
485 size_t &temp_storage_bytes,
486 InputIteratorT d_in,
487 OutputIteratorT d_out,
488 int num_segments,
489 OffsetIteratorT d_begin_offsets,
490 OffsetIteratorT d_end_offsets,
491 cudaStream_t stream = 0,
492 bool debug_synchronous = false)
493 {
494 // Signed integer type for global offsets
495 typedef int OffsetT;
496
497 // The input value type
498 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
499
501 d_temp_storage,
502 temp_storage_bytes,
503 d_in,
504 d_out,
505 num_segments,
508 cub::Max(),
509 Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
510 stream,
511 debug_synchronous);
512 }
513
514
563 template <
564 typename InputIteratorT,
565 typename OutputIteratorT,
566 typename OffsetIteratorT>
567 CUB_RUNTIME_FUNCTION
568 static cudaError_t ArgMax(
569 void *d_temp_storage,
570 size_t &temp_storage_bytes,
571 InputIteratorT d_in,
572 OutputIteratorT d_out,
573 int num_segments,
574 OffsetIteratorT d_begin_offsets,
575 OffsetIteratorT d_end_offsets,
576 cudaStream_t stream = 0,
577 bool debug_synchronous = false)
578 {
579 // Signed integer type for global offsets
580 typedef int OffsetT;
581
582 // The input type
583 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
584
585 // The output tuple type
586 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
587 KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
588 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
589
590 // The output value type
591 typedef typename OutputTupleT::Value OutputValueT;
592
593 // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
595 ArgIndexInputIteratorT d_indexed_in(d_in);
596
597 // Initial value
598 OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
599
601 d_temp_storage,
602 temp_storage_bytes,
603 d_indexed_in,
604 d_out,
605 num_segments,
608 cub::ArgMax(),
609 initial_value,
610 stream,
611 debug_synchronous);
612 }
613
614};
615
616} // CUB namespace
617CUB_NS_POSTFIX // Optional outer namespace(s)
618
619
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
Optional outer namespace(s)
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
OffsetT OffsetT
[in] Total number of input data items
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across mult...
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum in each segment using the greater-than ('>') operator,...
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented sum using the addition ('+') operator.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum in each segment using the less-than ('<') operator,...
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented minimum using the less-than ('<') operator.
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented maximum using the greater-than ('>') operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Type equality test.
Definition util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
A key identifier paired with a corresponding value.
Default max functor.
Default min functor.
Default sum functor.
Type traits.