OpenFPM_pdata  4.1.0
Project that contain the implementation of distributed structures
device_segmented_reduce.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "../iterator/arg_index_input_iterator.cuh"
43 #include "../util_type.cuh"
44 #include "../util_namespace.cuh"
45 
47 CUB_NS_PREFIX
48 
50 namespace cub {
51 
52 
66 {
126  template <
127  typename InputIteratorT,
128  typename OutputIteratorT,
129  typename OffsetIteratorT,
130  typename ReductionOp,
131  typename T>
132  CUB_RUNTIME_FUNCTION
133  static cudaError_t Reduce(
134  void *d_temp_storage,
135  size_t &temp_storage_bytes,
136  InputIteratorT d_in,
137  OutputIteratorT d_out,
138  int num_segments,
139  OffsetIteratorT d_begin_offsets,
140  OffsetIteratorT d_end_offsets,
141  ReductionOp reduction_op,
142  T initial_value,
143  cudaStream_t stream = 0,
144  bool debug_synchronous = false)
145  {
146  // Signed integer type for global offsets
147  typedef int OffsetT;
148 
150  d_temp_storage,
151  temp_storage_bytes,
152  d_in,
153  d_out,
154  num_segments,
157  reduction_op,
158  initial_value,
159  stream,
160  debug_synchronous);
161  }
162 
163 
210  template <
211  typename InputIteratorT,
212  typename OutputIteratorT,
213  typename OffsetIteratorT>
214  CUB_RUNTIME_FUNCTION
215  static cudaError_t Sum(
216  void *d_temp_storage,
217  size_t &temp_storage_bytes,
218  InputIteratorT d_in,
219  OutputIteratorT d_out,
220  int num_segments,
221  OffsetIteratorT d_begin_offsets,
222  OffsetIteratorT d_end_offsets,
223  cudaStream_t stream = 0,
224  bool debug_synchronous = false)
225  {
226  // Signed integer type for global offsets
227  typedef int OffsetT;
228 
229  // The output value type
230  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
231  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
232  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
233 
235  d_temp_storage,
236  temp_storage_bytes,
237  d_in,
238  d_out,
239  num_segments,
242  cub::Sum(),
243  OutputT(), // zero-initialize
244  stream,
245  debug_synchronous);
246  }
247 
248 
295  template <
296  typename InputIteratorT,
297  typename OutputIteratorT,
298  typename OffsetIteratorT>
299  CUB_RUNTIME_FUNCTION
300  static cudaError_t Min(
301  void *d_temp_storage,
302  size_t &temp_storage_bytes,
303  InputIteratorT d_in,
304  OutputIteratorT d_out,
305  int num_segments,
306  OffsetIteratorT d_begin_offsets,
307  OffsetIteratorT d_end_offsets,
308  cudaStream_t stream = 0,
309  bool debug_synchronous = false)
310  {
311  // Signed integer type for global offsets
312  typedef int OffsetT;
313 
314  // The input value type
315  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
316 
318  d_temp_storage,
319  temp_storage_bytes,
320  d_in,
321  d_out,
322  num_segments,
325  cub::Min(),
326  Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
327  stream,
328  debug_synchronous);
329  }
330 
331 
380  template <
381  typename InputIteratorT,
382  typename OutputIteratorT,
383  typename OffsetIteratorT>
384  CUB_RUNTIME_FUNCTION
385  static cudaError_t ArgMin(
386  void *d_temp_storage,
387  size_t &temp_storage_bytes,
388  InputIteratorT d_in,
389  OutputIteratorT d_out,
390  int num_segments,
391  OffsetIteratorT d_begin_offsets,
392  OffsetIteratorT d_end_offsets,
393  cudaStream_t stream = 0,
394  bool debug_synchronous = false)
395  {
396  // Signed integer type for global offsets
397  typedef int OffsetT;
398 
399  // The input type
400  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
401 
402  // The output tuple type
403  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
404  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
405  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
406 
407  // The output value type
408  typedef typename OutputTupleT::Value OutputValueT;
409 
410  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
412  ArgIndexInputIteratorT d_indexed_in(d_in);
413 
414  // Initial value
415  OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
416 
418  d_temp_storage,
419  temp_storage_bytes,
420  d_indexed_in,
421  d_out,
422  num_segments,
425  cub::ArgMin(),
426  initial_value,
427  stream,
428  debug_synchronous);
429  }
430 
431 
478  template <
479  typename InputIteratorT,
480  typename OutputIteratorT,
481  typename OffsetIteratorT>
482  CUB_RUNTIME_FUNCTION
483  static cudaError_t Max(
484  void *d_temp_storage,
485  size_t &temp_storage_bytes,
486  InputIteratorT d_in,
487  OutputIteratorT d_out,
488  int num_segments,
489  OffsetIteratorT d_begin_offsets,
490  OffsetIteratorT d_end_offsets,
491  cudaStream_t stream = 0,
492  bool debug_synchronous = false)
493  {
494  // Signed integer type for global offsets
495  typedef int OffsetT;
496 
497  // The input value type
498  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
499 
501  d_temp_storage,
502  temp_storage_bytes,
503  d_in,
504  d_out,
505  num_segments,
508  cub::Max(),
509  Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
510  stream,
511  debug_synchronous);
512  }
513 
514 
563  template <
564  typename InputIteratorT,
565  typename OutputIteratorT,
566  typename OffsetIteratorT>
567  CUB_RUNTIME_FUNCTION
568  static cudaError_t ArgMax(
569  void *d_temp_storage,
570  size_t &temp_storage_bytes,
571  InputIteratorT d_in,
572  OutputIteratorT d_out,
573  int num_segments,
574  OffsetIteratorT d_begin_offsets,
575  OffsetIteratorT d_end_offsets,
576  cudaStream_t stream = 0,
577  bool debug_synchronous = false)
578  {
579  // Signed integer type for global offsets
580  typedef int OffsetT;
581 
582  // The input type
583  typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
584 
585  // The output tuple type
586  typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
587  KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
588  typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
589 
590  // The output value type
591  typedef typename OutputTupleT::Value OutputValueT;
592 
593  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
595  ArgIndexInputIteratorT d_indexed_in(d_in);
596 
597  // Initial value
598  OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
599 
601  d_temp_storage,
602  temp_storage_bytes,
603  d_indexed_in,
604  d_out,
605  num_segments,
608  cub::ArgMax(),
609  initial_value,
610  stream,
611  debug_synchronous);
612  }
613 
614 };
615 
616 } // CUB namespace
617 CUB_NS_POSTFIX // Optional outer namespace(s)
618 
619 
Type equality test.
Definition: util_type.cuh:98
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented sum using the addition ('+') operator.
Type traits.
Definition: util_type.cuh:1158
DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across mult...
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Optional outer namespace(s)
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum in each segment using the greater-than ('>') operator,...
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
A key identifier paired with a corresponding value.
Definition: util_type.cuh:666
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum in each segment using the less-than ('<') operator,...
OffsetT OffsetT
[in] Total number of input data items
Default max functor.
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
< [in] Binary reduction functor
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented maximum using the greater-than ('>') operator.
Type selection (IF ? ThenType : ElseType)
Definition: util_type.cuh:72
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented minimum using the less-than ('<') operator.
Default sum functor.
Default min functor.
OutputIteratorT d_out
< [in] Pointer to the input sequence of data items