doxygen/openfpm/device__reduce_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include <stdio.h>

#include <iterator>

#include <limits>


#include "../iterator/arg_index_input_iterator.cuh"

#include "dispatch/dispatch_reduce.cuh"

#include "dispatch/dispatch_reduce_by_key.cuh"

#include "../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/**
 * \brief Device-wide reduction entry points.
 *
 * Every static member is a thin front end: it fixes the offset type, selects
 * the reduction functor and the initial value, and hands everything to
 * DispatchReduce::Dispatch (DispatchReduceByKey::Dispatch for ReduceByKey).
 * The cudaError_t produced by the dispatch layer is returned unchanged.
 *
 * Parameters shared by all members:
 *   - d_temp_storage / temp_storage_bytes: temporary-storage pointer and its
 *     size in bytes, forwarded untouched to the dispatch layer.
 *     NOTE(review): by CUB convention a NULL d_temp_storage presumably turns
 *     the call into a size query that only writes temp_storage_bytes --
 *     confirm against dispatch_reduce.cuh.
 *   - num_items: total number of input data items.
 *   - stream: CUDA stream handed to the dispatch layer (defaults to 0).
 *   - debug_synchronous: forwarded untouched to the dispatch layer
 *     (defaults to false).
 */
struct DeviceReduce
{
    /**
     * \brief Computes a device-wide reduction using the caller-supplied binary
     *        functor \p reduction_op and the initial value \p init.
     *
     * \param d_in          Start of the input sequence.
     * \param d_out         Destination handed to the dispatch layer for the result.
     * \param reduction_op  Binary reduction functor.
     * \param init          Initial value of the reduction.
     * \return Whatever DispatchReduce::Dispatch returns.
     */
    template <typename InputIteratorT,
              typename OutputIteratorT,
              typename ReductionOpT,
              typename T>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Reduce(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_items,
        ReductionOpT    reduction_op,
        T               init,
        cudaStream_t    stream = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are plain signed ints
        typedef int OffsetT;
        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items,
            reduction_op, init,
            stream, debug_synchronous);
    }


    /**
     * \brief Computes a device-wide sum using the addition (+) operator.
     *
     * The reduction runs with cub::Sum and starts from a value-initialized
     * result. The result type is the value type of OutputIteratorT, unless
     * that is void, in which case the value type of InputIteratorT is used.
     *
     * \param d_in   Start of the input sequence.
     * \param d_out  Destination handed to the dispatch layer for the result.
     * \return Whatever DispatchReduce::Dispatch returns.
     */
    template <typename InputIteratorT,
              typename OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Sum(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_items,
        cudaStream_t    stream = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are plain signed ints
        typedef int OffsetT;

        // Value types advertised by the two iterators
        typedef typename std::iterator_traits<InputIteratorT>::value_type  ItemT;
        typedef typename std::iterator_traits<OutputIteratorT>::value_type DeclaredOutputT;

        // Fall back to the input's value type when the output iterator declares void
        typedef typename If<(Equals<DeclaredOutputT, void>::VALUE),
                            ItemT,
                            DeclaredOutputT>::Type ResultT;

        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items,
            cub::Sum(),
            ResultT(),      // value-initialized starting point of the sum
            stream, debug_synchronous);
    }


    /**
     * \brief Computes a device-wide minimum using the less-than ('<') operator.
     *
     * The reduction runs with cub::Min and is seeded with Traits<>::Max() of
     * the input iterator's value type.
     *
     * \param d_in   Start of the input sequence.
     * \param d_out  Destination handed to the dispatch layer for the result.
     * \return Whatever DispatchReduce::Dispatch returns.
     */
    template <typename InputIteratorT,
              typename OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Min(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_items,
        cudaStream_t    stream = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are plain signed ints
        typedef int OffsetT;

        // Value type read from the input
        typedef typename std::iterator_traits<InputIteratorT>::value_type ItemT;

        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items,
            cub::Min(),
            Traits<ItemT>::Max(),   // stand-in for std::numeric_limits<T>::max() until C++11 can be assumed
            stream, debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide minimum using the less-than ('<')
     *        operator, also reporting the index at which it occurs.
     *
     * \p d_in is wrapped in an ArgIndexInputIterator so the reduction consumes
     * index-value pairs, which cub::ArgMin then reduces. The pair type is the
     * value type of OutputIteratorT, or KeyValuePair<int, input value type>
     * when that value type is void.
     *
     * The reduction is seeded with the pair (1, Traits<input value type>::Max()).
     * NOTE(review): presumably this seed is what reaches d_out for an empty
     * input -- confirm against DispatchReduce.
     *
     * \param d_in   Start of the input sequence.
     * \param d_out  Destination handed to the dispatch layer for the result pair.
     * \return Whatever DispatchReduce::Dispatch returns.
     */
    template <typename InputIteratorT,
              typename OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMin(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_items,
        cudaStream_t    stream = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are plain signed ints
        typedef int OffsetT;

        // Value types advertised by the two iterators
        typedef typename std::iterator_traits<InputIteratorT>::value_type  SampleT;
        typedef typename std::iterator_traits<OutputIteratorT>::value_type DeclaredOutputT;

        // Result pair: default to KeyValuePair<OffsetT, SampleT> when the output iterator declares void
        typedef typename If<(Equals<DeclaredOutputT, void>::VALUE),
                            KeyValuePair<OffsetT, SampleT>,
                            DeclaredOutputT>::Type IndexedResultT;

        // Value half of the result pair
        typedef typename IndexedResultT::Value ResultValueT;

        // Input adaptor that pairs every dereferenced item with its index
        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, ResultValueT> IndexedInputIteratorT;

        typedef DispatchReduce<IndexedInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin> DispatchT;

        IndexedInputIteratorT d_pairs_in(d_in);

        // Seed pair; Traits<>::Max() stands in for std::numeric_limits<T>::max() until C++11 can be assumed
        IndexedResultT seed(1, Traits<SampleT>::Max());

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_pairs_in, d_out, num_items,
            cub::ArgMin(), seed,
            stream, debug_synchronous);
    }


    /**
     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
     *
     * The reduction runs with cub::Max and is seeded with Traits<>::Lowest()
     * of the input iterator's value type.
     *
     * \param d_in   Start of the input sequence.
     * \param d_out  Destination handed to the dispatch layer for the result.
     * \return Whatever DispatchReduce::Dispatch returns.
     */
    template <typename InputIteratorT,
              typename OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Max(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_items,
        cudaStream_t    stream = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are plain signed ints
        typedef int OffsetT;

        // Value type read from the input
        typedef typename std::iterator_traits<InputIteratorT>::value_type ItemT;

        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items,
            cub::Max(),
            Traits<ItemT>::Lowest(),    // stand-in for std::numeric_limits<T>::lowest() until C++11 can be assumed
            stream, debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide maximum using the greater-than ('>')
     *        operator, also reporting the index at which it occurs.
     *
     * \p d_in is wrapped in an ArgIndexInputIterator so the reduction consumes
     * index-value pairs, which cub::ArgMax then reduces. The pair type is the
     * value type of OutputIteratorT, or KeyValuePair<int, input value type>
     * when that value type is void.
     *
     * The reduction is seeded with the pair (1, Traits<input value type>::Lowest()).
     * NOTE(review): presumably this seed is what reaches d_out for an empty
     * input -- confirm against DispatchReduce.
     *
     * \param d_in   Start of the input sequence.
     * \param d_out  Destination handed to the dispatch layer for the result pair.
     * \return Whatever DispatchReduce::Dispatch returns.
     */
    template <typename InputIteratorT,
              typename OutputIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMax(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_items,
        cudaStream_t    stream = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are plain signed ints
        typedef int OffsetT;

        // Value types advertised by the two iterators
        typedef typename std::iterator_traits<InputIteratorT>::value_type  SampleT;
        typedef typename std::iterator_traits<OutputIteratorT>::value_type DeclaredOutputT;

        // Result pair: default to KeyValuePair<OffsetT, SampleT> when the output iterator declares void
        typedef typename If<(Equals<DeclaredOutputT, void>::VALUE),
                            KeyValuePair<OffsetT, SampleT>,
                            DeclaredOutputT>::Type IndexedResultT;

        // Value half of the result pair
        typedef typename IndexedResultT::Value ResultValueT;

        // Input adaptor that pairs every dereferenced item with its index
        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, ResultValueT> IndexedInputIteratorT;

        typedef DispatchReduce<IndexedInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax> DispatchT;

        IndexedInputIteratorT d_pairs_in(d_in);

        // Seed pair; Traits<>::Lowest() stands in for std::numeric_limits<T>::lowest() until C++11 can be assumed
        IndexedResultT seed(1, Traits<SampleT>::Lowest());

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_pairs_in, d_out, num_items,
            cub::ArgMax(), seed,
            stream, debug_synchronous);
    }


    /**
     * \brief Reduces segments of values, where segments are demarcated by
     *        corresponding runs of identical keys.
     *
     * Keys are compared with the default cub::Equality functor; values inside
     * a run are combined with \p reduction_op.
     *
     * \param d_keys_in         Start of the input sequence of keys.
     * \param d_unique_out      Key output handed to the dispatch layer; its
     *                          length is the run count reported through
     *                          \p d_num_runs_out.
     * \param d_values_in       Start of the input sequence of values.
     * \param d_aggregates_out  Output sequence of value aggregates (one aggregate per run).
     * \param d_num_runs_out    Receives the total number of runs encountered.
     * \param reduction_op      Binary reduction functor.
     * \return Whatever DispatchReduceByKey::Dispatch returns.
     */
    template <typename KeysInputIteratorT,
              typename UniqueOutputIteratorT,
              typename ValuesInputIteratorT,
              typename AggregatesOutputIteratorT,
              typename NumRunsOutputIteratorT,
              typename ReductionOpT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t ReduceByKey(
        void                      *d_temp_storage,
        size_t                    &temp_storage_bytes,
        KeysInputIteratorT        d_keys_in,
        UniqueOutputIteratorT     d_unique_out,
        ValuesInputIteratorT      d_values_in,
        AggregatesOutputIteratorT d_aggregates_out,
        NumRunsOutputIteratorT    d_num_runs_out,
        ReductionOpT              reduction_op,
        int                       num_items,
        cudaStream_t              stream = 0,
        bool                      debug_synchronous = false)
    {
        // Global offsets are plain signed ints
        typedef int OffsetT;

        // Keys are matched with the default == functor
        typedef Equality KeyEqualT;

        typedef DispatchReduceByKey<
            KeysInputIteratorT,
            UniqueOutputIteratorT,
            ValuesInputIteratorT,
            AggregatesOutputIteratorT,
            NumRunsOutputIteratorT,
            KeyEqualT,
            ReductionOpT,
            OffsetT> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_unique_out,
            d_values_in, d_aggregates_out,
            d_num_runs_out,
            KeyEqualT(), reduction_op,
            num_items,
            stream, debug_synchronous);
    }

};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


cub::ArgIndexInputIterator
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
Definition arg_index_input_iterator.cuh:114

dispatch_reduce.cuh

dispatch_reduce_by_key.cuh

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::init
OutputIteratorT OffsetT ReductionOpT OuputT init
[in] The initial value of the reduction
Definition dispatch_reduce.cuh:119

cub::d_unique_out
UniqueOutputIteratorT d_unique_out
Pointer to the output sequence of unique keys (one key per run)
Definition dispatch_reduce_by_key.cuh:74

cub::d_num_runs_out
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT NumRunsOutputIteratorT d_num_runs_out
Pointer to total number of runs encountered (i.e., the length of d_unique_out)
Definition dispatch_reduce_by_key.cuh:77

cub::num_items
KeyT const ValueT ValueT OffsetT OffsetT num_items
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:168

cub::reduction_op
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
Definition dispatch_reduce.cuh:75

cub::d_values_in
KeyT const ValueT * d_values_in
[in] Input values buffer
Definition dispatch_radix_sort.cuh:165

cub::OffsetT
OffsetT OffsetT
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:75

cub::d_aggregates_out
UniqueOutputIteratorT ValuesInputIteratorT AggregatesOutputIteratorT d_aggregates_out
Pointer to the output sequence of value aggregates (one aggregate per run)
Definition dispatch_reduce_by_key.cuh:76

cub::d_out
OutputIteratorT d_out
[out] Pointer to the output aggregate
Definition dispatch_reduce.cuh:71

cub::ArgMax
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Definition thread_operators.cuh:138

cub::ArgMin
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
Definition thread_operators.cuh:173

cub::DeviceReduce
DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of...
Definition device_reduce.cuh:85

cub::DeviceReduce::ArgMax
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index o...
Definition device_reduce.cuh:550

cub::DeviceReduce::Min
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide minimum using the less-than ('<') operator.
Definition device_reduce.cuh:306

cub::DeviceReduce::ReduceByKey
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t ReduceByKey(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
Definition device_reduce.cuh:687

cub::DeviceReduce::ArgMin
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of t...
Definition device_reduce.cuh:383

cub::DeviceReduce::Max
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide maximum using the greater-than ('>') operator.
Definition device_reduce.cuh:473

cub::DeviceReduce::Reduce
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...
Definition device_reduce.cuh:148

cub::DeviceReduce::Sum
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide sum using the addition (+) operator.
Definition device_reduce.cuh:229

cub::DispatchReduceByKey::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config)
Function type of cub::DeviceReduceByKeyKernelT
Definition dispatch_reduce_by_key.cuh:353

cub::DispatchReduce::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Definition dispatch_reduce.cuh:631

cub::Equality
Default equality functor.
Definition thread_operators.cuh:60

cub::Equals
Type equality test.
Definition util_type.cuh:99

cub::If
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73

cub::KeyValuePair
A key identifier paired with a corresponding value.
Definition util_type.cuh:667

cub::Max
Default max functor.
Definition thread_operators.cuh:124

cub::Min
Default min functor.
Definition thread_operators.cuh:159

cub::Sum
Default sum functor.
Definition thread_operators.cuh:110

cub::Traits
Type traits.
Definition util_type.cuh:1158