OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
 
Loading...
Searching...
No Matches
device_reduce.cuh
Go to the documentation of this file.
1
2/******************************************************************************
3 * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of the NVIDIA CORPORATION nor the
14 * names of its contributors may be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 ******************************************************************************/
29
35#pragma once
36
37#include <stdio.h>
38#include <iterator>
39#include <limits>
40
41#include "../iterator/arg_index_input_iterator.cuh"
44#include "../util_namespace.cuh"
45
47CUB_NS_PREFIX
48
50namespace cub {
51
52
85{
142 template <
143 typename InputIteratorT,
144 typename OutputIteratorT,
145 typename ReductionOpT,
146 typename T>
147 CUB_RUNTIME_FUNCTION
148 static cudaError_t Reduce(
149 void *d_temp_storage,
150 size_t &temp_storage_bytes,
151 InputIteratorT d_in,
152 OutputIteratorT d_out,
153 int num_items,
154 ReductionOpT reduction_op,
155 T init,
156 cudaStream_t stream = 0,
157 bool debug_synchronous = false)
158 {
159 // Signed integer type for global offsets
160 typedef int OffsetT;
161
163 d_temp_storage,
164 temp_storage_bytes,
165 d_in,
166 d_out,
167 num_items,
169 init,
170 stream,
171 debug_synchronous);
172 }
173
174
225 template <
226 typename InputIteratorT,
227 typename OutputIteratorT>
228 CUB_RUNTIME_FUNCTION
229 static cudaError_t Sum(
230 void *d_temp_storage,
231 size_t &temp_storage_bytes,
232 InputIteratorT d_in,
233 OutputIteratorT d_out,
234 int num_items,
235 cudaStream_t stream = 0,
236 bool debug_synchronous = false)
237 {
238 // Signed integer type for global offsets
239 typedef int OffsetT;
240
241 // The output value type
242 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
243 typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
244 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
245
247 d_temp_storage,
248 temp_storage_bytes,
249 d_in,
250 d_out,
251 num_items,
252 cub::Sum(),
253 OutputT(), // zero-initialize
254 stream,
255 debug_synchronous);
256 }
257
258
302 template <
303 typename InputIteratorT,
304 typename OutputIteratorT>
305 CUB_RUNTIME_FUNCTION
306 static cudaError_t Min(
307 void *d_temp_storage,
308 size_t &temp_storage_bytes,
309 InputIteratorT d_in,
310 OutputIteratorT d_out,
311 int num_items,
312 cudaStream_t stream = 0,
313 bool debug_synchronous = false)
314 {
315 // Signed integer type for global offsets
316 typedef int OffsetT;
317
318 // The input value type
319 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
320
322 d_temp_storage,
323 temp_storage_bytes,
324 d_in,
325 d_out,
326 num_items,
327 cub::Min(),
328 Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
329 stream,
330 debug_synchronous);
331 }
332
333
379 template <
380 typename InputIteratorT,
381 typename OutputIteratorT>
382 CUB_RUNTIME_FUNCTION
383 static cudaError_t ArgMin(
384 void *d_temp_storage,
385 size_t &temp_storage_bytes,
386 InputIteratorT d_in,
387 OutputIteratorT d_out,
388 int num_items,
389 cudaStream_t stream = 0,
390 bool debug_synchronous = false)
391 {
392 // Signed integer type for global offsets
393 typedef int OffsetT;
394
395 // The input type
396 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
397
398 // The output tuple type
399 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
400 KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
401 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
402
403 // The output value type
404 typedef typename OutputTupleT::Value OutputValueT;
405
406 // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
408 ArgIndexInputIteratorT d_indexed_in(d_in);
409
410 // Initial value
411 OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
412
414 d_temp_storage,
415 temp_storage_bytes,
416 d_indexed_in,
417 d_out,
418 num_items,
419 cub::ArgMin(),
420 initial_value,
421 stream,
422 debug_synchronous);
423 }
424
425
469 template <
470 typename InputIteratorT,
471 typename OutputIteratorT>
472 CUB_RUNTIME_FUNCTION
473 static cudaError_t Max(
474 void *d_temp_storage,
475 size_t &temp_storage_bytes,
476 InputIteratorT d_in,
477 OutputIteratorT d_out,
478 int num_items,
479 cudaStream_t stream = 0,
480 bool debug_synchronous = false)
481 {
482 // Signed integer type for global offsets
483 typedef int OffsetT;
484
485 // The input value type
486 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
487
489 d_temp_storage,
490 temp_storage_bytes,
491 d_in,
492 d_out,
493 num_items,
494 cub::Max(),
495 Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
496 stream,
497 debug_synchronous);
498 }
499
500
546 template <
547 typename InputIteratorT,
548 typename OutputIteratorT>
549 CUB_RUNTIME_FUNCTION
550 static cudaError_t ArgMax(
551 void *d_temp_storage,
552 size_t &temp_storage_bytes,
553 InputIteratorT d_in,
554 OutputIteratorT d_out,
555 int num_items,
556 cudaStream_t stream = 0,
557 bool debug_synchronous = false)
558 {
559 // Signed integer type for global offsets
560 typedef int OffsetT;
561
562 // The input type
563 typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
564
565 // The output tuple type
566 typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
567 KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
568 typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
569
570 // The output value type
571 typedef typename OutputTupleT::Value OutputValueT;
572
573 // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
575 ArgIndexInputIteratorT d_indexed_in(d_in);
576
577 // Initial value
578 OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
579
581 d_temp_storage,
582 temp_storage_bytes,
583 d_indexed_in,
584 d_out,
585 num_items,
586 cub::ArgMax(),
587 initial_value,
588 stream,
589 debug_synchronous);
590 }
591
592
679 template <
680 typename KeysInputIteratorT,
681 typename UniqueOutputIteratorT,
682 typename ValuesInputIteratorT,
683 typename AggregatesOutputIteratorT,
684 typename NumRunsOutputIteratorT,
685 typename ReductionOpT>
686 CUB_RUNTIME_FUNCTION __forceinline__
687 static cudaError_t ReduceByKey(
688 void *d_temp_storage,
689 size_t &temp_storage_bytes,
690 KeysInputIteratorT d_keys_in,
691 UniqueOutputIteratorT d_unique_out,
692 ValuesInputIteratorT d_values_in,
693 AggregatesOutputIteratorT d_aggregates_out,
694 NumRunsOutputIteratorT d_num_runs_out,
695 ReductionOpT reduction_op,
696 int num_items,
697 cudaStream_t stream = 0,
698 bool debug_synchronous = false)
699 {
700 // Signed integer type for global offsets
701 typedef int OffsetT;
702
703 // FlagT iterator type (not used)
704
705 // Selection op (not used)
706
707 // Default == operator
708 typedef Equality EqualityOp;
709
711 d_temp_storage,
712 temp_storage_bytes,
713 d_keys_in,
718 EqualityOp(),
720 num_items,
721 stream,
722 debug_synchronous);
723 }
724
725};
726
731} // CUB namespace
732CUB_NS_POSTFIX // Optional outer namespace(s)
733
734
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
Optional outer namespace(s)
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
UniqueOutputIteratorT d_unique_out
< Pointer to the input sequence of keys
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
KeyT const ValueT * d_values_in
[in] Input values buffer
OffsetT OffsetT
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of...
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index o...
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide minimum using the less-than ('<') operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of t...
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide maximum using the greater-than ('>') operator.
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide sum using the addition (+) operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
< Function type of cub::DeviceReduceByKeyKernelT
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Default equality functor.
Type equality test.
Definition util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73
A key identifier paired with a corresponding value.
Default max functor.
Default min functor.
Default sum functor.
Type traits.