OpenFPM  5.2.0
Project that contain the implementation of distributed structures
device_reduce.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 #include <limits>
40 
41 #include "../iterator/arg_index_input_iterator.cuh"
44 #include "../util_namespace.cuh"
45 
47 CUB_NS_PREFIX
48 
50 namespace cub {
51 
52 
85 {
142  template <
143  typename InputIteratorT,
144  typename OutputIteratorT,
145  typename ReductionOpT,
146  typename T>
147  CUB_RUNTIME_FUNCTION
148  static cudaError_t Reduce(
149  void *d_temp_storage,
150  size_t &temp_storage_bytes,
151  InputIteratorT d_in,
152  OutputIteratorT d_out,
153  int num_items,
154  ReductionOpT reduction_op,
155  T init,
156  cudaStream_t stream = 0,
157  bool debug_synchronous = false)
158  {
159  // Signed integer type for global offsets
160  typedef int OffsetT;
161 
163  d_temp_storage,
164  temp_storage_bytes,
165  d_in,
166  d_out,
167  num_items,
168  reduction_op,
169  init,
170  stream,
171  debug_synchronous);
172  }
173 
174 
225  template <
226  typename InputIteratorT,
227  typename OutputIteratorT>
228  CUB_RUNTIME_FUNCTION
229  static cudaError_t Sum(
230  void *d_temp_storage,
231  size_t &temp_storage_bytes,
232  InputIteratorT d_in,
233  OutputIteratorT d_out,
234  int num_items,
235  cudaStream_t stream = 0,
236  bool debug_synchronous = false)
237  {
238  // Signed integer type for global offsets
239  typedef int OffsetT;
240 
241  // The output value type
242  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
243  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
244  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
245 
247  d_temp_storage,
248  temp_storage_bytes,
249  d_in,
250  d_out,
251  num_items,
252  cub::Sum(),
253  OutputT(), // zero-initialize
254  stream,
255  debug_synchronous);
256  }
257 
258 
302  template <
303  typename InputIteratorT,
304  typename OutputIteratorT>
305  CUB_RUNTIME_FUNCTION
306  static cudaError_t Min(
307  void *d_temp_storage,
308  size_t &temp_storage_bytes,
309  InputIteratorT d_in,
310  OutputIteratorT d_out,
311  int num_items,
312  cudaStream_t stream = 0,
313  bool debug_synchronous = false)
314  {
315  // Signed integer type for global offsets
316  typedef int OffsetT;
317 
318  // The input value type
319  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
320 
322  d_temp_storage,
323  temp_storage_bytes,
324  d_in,
325  d_out,
326  num_items,
327  cub::Min(),
328  Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
329  stream,
330  debug_synchronous);
331  }
332 
333 
379  template <
380  typename InputIteratorT,
381  typename OutputIteratorT>
382  CUB_RUNTIME_FUNCTION
383  static cudaError_t ArgMin(
384  void *d_temp_storage,
385  size_t &temp_storage_bytes,
386  InputIteratorT d_in,
387  OutputIteratorT d_out,
388  int num_items,
389  cudaStream_t stream = 0,
390  bool debug_synchronous = false)
391  {
392  // Signed integer type for global offsets
393  typedef int OffsetT;
394 
395  // The input type
396  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
397 
398  // The output tuple type
399  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
400  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
401  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
402 
403  // The output value type
404  typedef typename OutputTupleT::Value OutputValueT;
405 
406  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
408  ArgIndexInputIteratorT d_indexed_in(d_in);
409 
410  // Initial value
411  OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
412 
414  d_temp_storage,
415  temp_storage_bytes,
416  d_indexed_in,
417  d_out,
418  num_items,
419  cub::ArgMin(),
420  initial_value,
421  stream,
422  debug_synchronous);
423  }
424 
425 
469  template <
470  typename InputIteratorT,
471  typename OutputIteratorT>
472  CUB_RUNTIME_FUNCTION
473  static cudaError_t Max(
474  void *d_temp_storage,
475  size_t &temp_storage_bytes,
476  InputIteratorT d_in,
477  OutputIteratorT d_out,
478  int num_items,
479  cudaStream_t stream = 0,
480  bool debug_synchronous = false)
481  {
482  // Signed integer type for global offsets
483  typedef int OffsetT;
484 
485  // The input value type
486  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
487 
489  d_temp_storage,
490  temp_storage_bytes,
491  d_in,
492  d_out,
493  num_items,
494  cub::Max(),
495  Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
496  stream,
497  debug_synchronous);
498  }
499 
500 
546  template <
547  typename InputIteratorT,
548  typename OutputIteratorT>
549  CUB_RUNTIME_FUNCTION
550  static cudaError_t ArgMax(
551  void *d_temp_storage,
552  size_t &temp_storage_bytes,
553  InputIteratorT d_in,
554  OutputIteratorT d_out,
555  int num_items,
556  cudaStream_t stream = 0,
557  bool debug_synchronous = false)
558  {
559  // Signed integer type for global offsets
560  typedef int OffsetT;
561 
562  // The input type
563  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
564 
565  // The output tuple type
566  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
567  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
568  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
569 
570  // The output value type
571  typedef typename OutputTupleT::Value OutputValueT;
572 
573  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
575  ArgIndexInputIteratorT d_indexed_in(d_in);
576 
577  // Initial value
578  OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
579 
581  d_temp_storage,
582  temp_storage_bytes,
583  d_indexed_in,
584  d_out,
585  num_items,
586  cub::ArgMax(),
587  initial_value,
588  stream,
589  debug_synchronous);
590  }
591 
592 
679  template <
680  typename KeysInputIteratorT,
681  typename UniqueOutputIteratorT,
682  typename ValuesInputIteratorT,
683  typename AggregatesOutputIteratorT,
684  typename NumRunsOutputIteratorT,
685  typename ReductionOpT>
686  CUB_RUNTIME_FUNCTION __forceinline__
687  static cudaError_t ReduceByKey(
688  void *d_temp_storage,
689  size_t &temp_storage_bytes,
690  KeysInputIteratorT d_keys_in,
691  UniqueOutputIteratorT d_unique_out,
692  ValuesInputIteratorT d_values_in,
693  AggregatesOutputIteratorT d_aggregates_out,
694  NumRunsOutputIteratorT d_num_runs_out,
695  ReductionOpT reduction_op,
696  int num_items,
697  cudaStream_t stream = 0,
698  bool debug_synchronous = false)
699  {
700  // Signed integer type for global offsets
701  typedef int OffsetT;
702 
703  // FlagT iterator type (not used)
704 
705  // Selection op (not used)
706 
707  // Default == operator
708  typedef Equality EqualityOp;
709 
711  d_temp_storage,
712  temp_storage_bytes,
713  d_keys_in,
714  d_unique_out,
715  d_values_in,
718  EqualityOp(),
719  reduction_op,
720  num_items,
721  stream,
722  debug_synchronous);
723  }
724 
725 };
726 
727 
728 } // CUB namespace
729 CUB_NS_POSTFIX // Optional outer namespace(s)
730 
731 
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
Optional outer namespace(s)
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
UniqueOutputIteratorT d_unique_out
< Pointer to the input sequence of keys
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
KeyT const ValueT * d_values_in
[in] Input values buffer
OffsetT OffsetT
[in] Total number of input data items
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of...
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index o...
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide minimum using the less-than ('<') operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of t...
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide maximum using the greater-than ('>') operator.
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide sum using the addition (+) operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
< Function type of cub::DeviceReduceByKeyKernelT
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Default equality functor.
Type equality test.
Definition: util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:73
A key identifier paired with a corresponding value.
Definition: util_type.cuh:667
Default max functor.
Default min functor.
Default sum functor.
Type traits.
Definition: util_type.cuh:1158