OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
device_reduce.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 #include <limits>
40 
41 #include "../iterator/arg_index_input_iterator.cuh"
44 #include "../util_namespace.cuh"
45 
47 CUB_NS_PREFIX
48 
50 namespace cub {
51 
52 
85 {
142  template <
143  typename InputIteratorT,
144  typename OutputIteratorT,
145  typename ReductionOpT,
146  typename T>
147  CUB_RUNTIME_FUNCTION
148  static cudaError_t Reduce(
149  void *d_temp_storage,
150  size_t &temp_storage_bytes,
151  InputIteratorT d_in,
152  OutputIteratorT d_out,
153  int num_items,
154  ReductionOpT reduction_op,
155  T init,
156  cudaStream_t stream = 0,
157  bool debug_synchronous = false)
158  {
159  // Signed integer type for global offsets
160  typedef int OffsetT;
161 
163  d_temp_storage,
164  temp_storage_bytes,
165  d_in,
166  d_out,
167  num_items,
168  reduction_op,
169  init,
170  stream,
171  debug_synchronous);
172  }
173 
174 
225  template <
226  typename InputIteratorT,
227  typename OutputIteratorT>
228  CUB_RUNTIME_FUNCTION
229  static cudaError_t Sum(
230  void *d_temp_storage,
231  size_t &temp_storage_bytes,
232  InputIteratorT d_in,
233  OutputIteratorT d_out,
234  int num_items,
235  cudaStream_t stream = 0,
236  bool debug_synchronous = false)
237  {
238  // Signed integer type for global offsets
239  typedef int OffsetT;
240 
241  // The output value type
242  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
243  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
244  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
245 
247  d_temp_storage,
248  temp_storage_bytes,
249  d_in,
250  d_out,
251  num_items,
252  cub::Sum(),
253  OutputT(), // zero-initialize
254  stream,
255  debug_synchronous);
256  }
257 
258 
302  template <
303  typename InputIteratorT,
304  typename OutputIteratorT>
305  CUB_RUNTIME_FUNCTION
306  static cudaError_t Min(
307  void *d_temp_storage,
308  size_t &temp_storage_bytes,
309  InputIteratorT d_in,
310  OutputIteratorT d_out,
311  int num_items,
312  cudaStream_t stream = 0,
313  bool debug_synchronous = false)
314  {
315  // Signed integer type for global offsets
316  typedef int OffsetT;
317 
318  // The input value type
319  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
320 
322  d_temp_storage,
323  temp_storage_bytes,
324  d_in,
325  d_out,
326  num_items,
327  cub::Min(),
328  Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
329  stream,
330  debug_synchronous);
331  }
332 
333 
379  template <
380  typename InputIteratorT,
381  typename OutputIteratorT>
382  CUB_RUNTIME_FUNCTION
383  static cudaError_t ArgMin(
384  void *d_temp_storage,
385  size_t &temp_storage_bytes,
386  InputIteratorT d_in,
387  OutputIteratorT d_out,
388  int num_items,
389  cudaStream_t stream = 0,
390  bool debug_synchronous = false)
391  {
392  // Signed integer type for global offsets
393  typedef int OffsetT;
394 
395  // The input type
396  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
397 
398  // The output tuple type
399  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
400  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
401  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
402 
403  // The output value type
404  typedef typename OutputTupleT::Value OutputValueT;
405 
406  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
408  ArgIndexInputIteratorT d_indexed_in(d_in);
409 
410  // Initial value
411  OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
412 
414  d_temp_storage,
415  temp_storage_bytes,
416  d_indexed_in,
417  d_out,
418  num_items,
419  cub::ArgMin(),
420  initial_value,
421  stream,
422  debug_synchronous);
423  }
424 
425 
469  template <
470  typename InputIteratorT,
471  typename OutputIteratorT>
472  CUB_RUNTIME_FUNCTION
473  static cudaError_t Max(
474  void *d_temp_storage,
475  size_t &temp_storage_bytes,
476  InputIteratorT d_in,
477  OutputIteratorT d_out,
478  int num_items,
479  cudaStream_t stream = 0,
480  bool debug_synchronous = false)
481  {
482  // Signed integer type for global offsets
483  typedef int OffsetT;
484 
485  // The input value type
486  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
487 
489  d_temp_storage,
490  temp_storage_bytes,
491  d_in,
492  d_out,
493  num_items,
494  cub::Max(),
495  Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
496  stream,
497  debug_synchronous);
498  }
499 
500 
546  template <
547  typename InputIteratorT,
548  typename OutputIteratorT>
549  CUB_RUNTIME_FUNCTION
550  static cudaError_t ArgMax(
551  void *d_temp_storage,
552  size_t &temp_storage_bytes,
553  InputIteratorT d_in,
554  OutputIteratorT d_out,
555  int num_items,
556  cudaStream_t stream = 0,
557  bool debug_synchronous = false)
558  {
559  // Signed integer type for global offsets
560  typedef int OffsetT;
561 
562  // The input type
563  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
564 
565  // The output tuple type
566  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
567  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
568  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
569 
570  // The output value type
571  typedef typename OutputTupleT::Value OutputValueT;
572 
573  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
575  ArgIndexInputIteratorT d_indexed_in(d_in);
576 
577  // Initial value
578  OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
579 
581  d_temp_storage,
582  temp_storage_bytes,
583  d_indexed_in,
584  d_out,
585  num_items,
586  cub::ArgMax(),
587  initial_value,
588  stream,
589  debug_synchronous);
590  }
591 
592 
679  template <
680  typename KeysInputIteratorT,
681  typename UniqueOutputIteratorT,
682  typename ValuesInputIteratorT,
683  typename AggregatesOutputIteratorT,
684  typename NumRunsOutputIteratorT,
685  typename ReductionOpT>
686  CUB_RUNTIME_FUNCTION __forceinline__
687  static cudaError_t ReduceByKey(
688  void *d_temp_storage,
689  size_t &temp_storage_bytes,
690  KeysInputIteratorT d_keys_in,
691  UniqueOutputIteratorT d_unique_out,
692  ValuesInputIteratorT d_values_in,
693  AggregatesOutputIteratorT d_aggregates_out,
694  NumRunsOutputIteratorT d_num_runs_out,
695  ReductionOpT reduction_op,
696  int num_items,
697  cudaStream_t stream = 0,
698  bool debug_synchronous = false)
699  {
700  // Signed integer type for global offsets
701  typedef int OffsetT;
702 
703  // FlagT iterator type (not used)
704 
705  // Selection op (not used)
706 
707  // Default == operator
708  typedef Equality EqualityOp;
709 
711  d_temp_storage,
712  temp_storage_bytes,
713  d_keys_in,
714  d_unique_out,
715  d_values_in,
718  EqualityOp(),
719  reduction_op,
720  num_items,
721  stream,
722  debug_synchronous);
723  }
724 
725 };
726 
731 } // CUB namespace
732 CUB_NS_POSTFIX // Optional outer namespace(s)
733 
734 
Type equality test.
Definition: util_type.cuh:98
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide minimum using the less-than ('<') operator.
Type traits.
Definition: util_type.cuh:1158
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Optional outer namespace(s)
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
A key identifier paired with a corresponding value.
Definition: util_type.cuh:666
Default equality functor.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index o...
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
< Function type of cub::DeviceReduceByKeyKernelT
OffsetT OffsetT
[in] Total number of input data items
Default max functor.
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
KeyT const ValueT * d_values_in
[in] Input values buffer
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...
UniqueOutputIteratorT d_unique_out
< Pointer to the input sequence of keys
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of t...
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide maximum using the greater-than ('>') operator.
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:72
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide sum using the addition (+) operator.
DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of...
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
Default sum functor.
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
Default min functor.
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items