OpenFPM  5.2.0
Project that contain the implementation of distributed structures
device_segmented_reduce.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "../iterator/arg_index_input_iterator.cuh"
43 #include "../util_type.cuh"
44 #include "../util_namespace.cuh"
45 
47 CUB_NS_PREFIX
48 
50 namespace cub {
51 
52 
66 {
126  template <
127  typename InputIteratorT,
128  typename OutputIteratorT,
129  typename OffsetIteratorT,
130  typename ReductionOp,
131  typename T>
132  CUB_RUNTIME_FUNCTION
133  static cudaError_t Reduce(
134  void *d_temp_storage,
135  size_t &temp_storage_bytes,
136  InputIteratorT d_in,
137  OutputIteratorT d_out,
138  int num_segments,
139  OffsetIteratorT d_begin_offsets,
140  OffsetIteratorT d_end_offsets,
141  ReductionOp reduction_op,
142  T initial_value,
143  cudaStream_t stream = 0,
144  bool debug_synchronous = false)
145  {
146  // Signed integer type for global offsets
147  typedef int OffsetT;
148 
150  d_temp_storage,
151  temp_storage_bytes,
152  d_in,
153  d_out,
154  num_segments,
157  reduction_op,
158  initial_value,
159  stream,
160  debug_synchronous);
161  }
162 
163 
210  template <
211  typename InputIteratorT,
212  typename OutputIteratorT,
213  typename OffsetIteratorT>
214  CUB_RUNTIME_FUNCTION
215  static cudaError_t Sum(
216  void *d_temp_storage,
217  size_t &temp_storage_bytes,
218  InputIteratorT d_in,
219  OutputIteratorT d_out,
220  int num_segments,
221  OffsetIteratorT d_begin_offsets,
222  OffsetIteratorT d_end_offsets,
223  cudaStream_t stream = 0,
224  bool debug_synchronous = false)
225  {
226  // Signed integer type for global offsets
227  typedef int OffsetT;
228 
229  // The output value type
230  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
231  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
232  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
233 
235  d_temp_storage,
236  temp_storage_bytes,
237  d_in,
238  d_out,
239  num_segments,
242  cub::Sum(),
243  OutputT(), // zero-initialize
244  stream,
245  debug_synchronous);
246  }
247 
248 
295  template <
296  typename InputIteratorT,
297  typename OutputIteratorT,
298  typename OffsetIteratorT>
299  CUB_RUNTIME_FUNCTION
300  static cudaError_t Min(
301  void *d_temp_storage,
302  size_t &temp_storage_bytes,
303  InputIteratorT d_in,
304  OutputIteratorT d_out,
305  int num_segments,
306  OffsetIteratorT d_begin_offsets,
307  OffsetIteratorT d_end_offsets,
308  cudaStream_t stream = 0,
309  bool debug_synchronous = false)
310  {
311  // Signed integer type for global offsets
312  typedef int OffsetT;
313 
314  // The input value type
315  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
316 
318  d_temp_storage,
319  temp_storage_bytes,
320  d_in,
321  d_out,
322  num_segments,
325  cub::Min(),
326  Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
327  stream,
328  debug_synchronous);
329  }
330 
331 
380  template <
381  typename InputIteratorT,
382  typename OutputIteratorT,
383  typename OffsetIteratorT>
384  CUB_RUNTIME_FUNCTION
385  static cudaError_t ArgMin(
386  void *d_temp_storage,
387  size_t &temp_storage_bytes,
388  InputIteratorT d_in,
389  OutputIteratorT d_out,
390  int num_segments,
391  OffsetIteratorT d_begin_offsets,
392  OffsetIteratorT d_end_offsets,
393  cudaStream_t stream = 0,
394  bool debug_synchronous = false)
395  {
396  // Signed integer type for global offsets
397  typedef int OffsetT;
398 
399  // The input type
400  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
401 
402  // The output tuple type
403  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
404  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
405  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
406 
407  // The output value type
408  typedef typename OutputTupleT::Value OutputValueT;
409 
410  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
412  ArgIndexInputIteratorT d_indexed_in(d_in);
413 
414  // Initial value
415  OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
416 
418  d_temp_storage,
419  temp_storage_bytes,
420  d_indexed_in,
421  d_out,
422  num_segments,
425  cub::ArgMin(),
426  initial_value,
427  stream,
428  debug_synchronous);
429  }
430 
431 
478  template <
479  typename InputIteratorT,
480  typename OutputIteratorT,
481  typename OffsetIteratorT>
482  CUB_RUNTIME_FUNCTION
483  static cudaError_t Max(
484  void *d_temp_storage,
485  size_t &temp_storage_bytes,
486  InputIteratorT d_in,
487  OutputIteratorT d_out,
488  int num_segments,
489  OffsetIteratorT d_begin_offsets,
490  OffsetIteratorT d_end_offsets,
491  cudaStream_t stream = 0,
492  bool debug_synchronous = false)
493  {
494  // Signed integer type for global offsets
495  typedef int OffsetT;
496 
497  // The input value type
498  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
499 
501  d_temp_storage,
502  temp_storage_bytes,
503  d_in,
504  d_out,
505  num_segments,
508  cub::Max(),
509  Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
510  stream,
511  debug_synchronous);
512  }
513 
514 
563  template <
564  typename InputIteratorT,
565  typename OutputIteratorT,
566  typename OffsetIteratorT>
567  CUB_RUNTIME_FUNCTION
568  static cudaError_t ArgMax(
569  void *d_temp_storage,
570  size_t &temp_storage_bytes,
571  InputIteratorT d_in,
572  OutputIteratorT d_out,
573  int num_segments,
574  OffsetIteratorT d_begin_offsets,
575  OffsetIteratorT d_end_offsets,
576  cudaStream_t stream = 0,
577  bool debug_synchronous = false)
578  {
579  // Signed integer type for global offsets
580  typedef int OffsetT;
581 
582  // The input type
583  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
584 
585  // The output tuple type
586  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
587  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
588  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
589 
590  // The output value type
591  typedef typename OutputTupleT::Value OutputValueT;
592 
593  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
595  ArgIndexInputIteratorT d_indexed_in(d_in);
596 
597  // Initial value
598  OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
599 
601  d_temp_storage,
602  temp_storage_bytes,
603  d_indexed_in,
604  d_out,
605  num_segments,
608  cub::ArgMax(),
609  initial_value,
610  stream,
611  debug_synchronous);
612  }
613 
614 };
615 
616 } // CUB namespace
617 CUB_NS_POSTFIX // Optional outer namespace(s)
618 
619 
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
Optional outer namespace(s)
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
OffsetT OffsetT
[in] Total number of input data items
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across mult...
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum in each segment using the greater-than ('>') operator,...
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented sum using the addition ('+') operator.
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum in each segment using the less-than ('<') operator,...
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented minimum using the less-than ('<') operator.
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented maximum using the greater-than ('>') operator.
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Type equality test.
Definition: util_type.cuh:99
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:73
A key identifier paired with a corresponding value.
Definition: util_type.cuh:667
Default max functor.
Default min functor.
Default sum functor.
Type traits.
Definition: util_type.cuh:1158