doxygen/openfpm/device__segmented__reduce_8cuh_source.html

/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


#pragma once


#include <stdio.h>

#include <iterator>


#include "../iterator/arg_index_input_iterator.cuh"

#include "dispatch/dispatch_reduce.cuh"

#include "dispatch/dispatch_reduce_by_key.cuh"

#include "../util_type.cuh"

#include "../util_namespace.cuh"


CUB_NS_PREFIX


namespace cub {


/**
 * \brief Static entry points for device-wide segmented reductions.
 *
 * Every method in this struct is a thin front end: it selects a reduction
 * functor and an initial value, then forwards all of its arguments to
 * DispatchSegmentedReduce<...>::Dispatch and returns that call's cudaError_t.
 *
 * Parameters common to all methods (each is passed to Dispatch untouched):
 *   - \p d_temp_storage, \p temp_storage_bytes : temporary-storage pointer and
 *     its size in bytes (the size is taken by reference).
 *     NOTE(review): presumably the usual "query size, then run" protocol
 *     applies -- confirm in dispatch_reduce.cuh.
 *   - \p d_in             : iterator over the input data items.
 *   - \p d_out            : iterator receiving the results.
 *   - \p num_segments     : number of segments.
 *   - \p d_begin_offsets  : sequence of beginning offsets, one per segment.
 *   - \p d_end_offsets    : sequence of ending offsets, one per segment.
 *   - \p stream           : CUDA stream handed to Dispatch (defaults to 0).
 *   - \p debug_synchronous: debug flag handed to Dispatch (defaults to false).
 *
 * All methods use \c int as the global offset type.
 * NOTE(review): this presumably limits addressable input to what an int can
 * index -- confirm before using with very large inputs.
 */
struct DeviceSegmentedReduce
{
    /**
     * \brief Computes a device-wide segmented reduction using the specified
     *        binary \p reduction_op functor.
     *
     * \param reduction_op   Binary reduction functor, forwarded to Dispatch.
     * \param initial_value  Initial value for the reduction, forwarded to Dispatch.
     * \return The cudaError_t produced by DispatchSegmentedReduce::Dispatch.
     */
    template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOp, typename T>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Reduce(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_segments,
        OffsetIteratorT d_begin_offsets,
        OffsetIteratorT d_end_offsets,
        ReductionOp     reduction_op,
        T               initial_value,
        cudaStream_t    stream            = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are expressed as (signed) int
        typedef int OffsetT;

        // Dispatcher specialised for the caller-supplied functor
        typedef DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOp> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out,
            num_segments, d_begin_offsets, d_end_offsets,
            reduction_op, initial_value,
            stream, debug_synchronous);
    }


    /**
     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
     *
     * The reduction functor is cub::Sum and the initial value is a
     * value-initialized \c OutputT, where \c OutputT is the output iterator's
     * value type, or the input iterator's value type when the former is void.
     *
     * \return The cudaError_t produced by DispatchSegmentedReduce::Dispatch.
     */
    template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Sum(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_segments,
        OffsetIteratorT d_begin_offsets,
        OffsetIteratorT d_end_offsets,
        cudaStream_t    stream            = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are expressed as (signed) int
        typedef int OffsetT;

        // Value types advertised by the two iterators
        typedef typename std::iterator_traits<InputIteratorT>::value_type  InValueT;
        typedef typename std::iterator_traits<OutputIteratorT>::value_type OutValueT;

        // Accumulator type: fall back to the input value type when the output iterator reports void
        typedef typename If<(Equals<OutValueT, void>::VALUE), InValueT, OutValueT>::Type OutputT;

        typedef DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::Sum> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out,
            num_segments, d_begin_offsets, d_end_offsets,
            cub::Sum(),
            OutputT(),      // value-initialized (zero) starting sum
            stream, debug_synchronous);
    }


    /**
     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
     *
     * The reduction functor is cub::Min and the initial value is
     * Traits<InputT>::Max(), with \c InputT the input iterator's value type.
     *
     * \return The cudaError_t produced by DispatchSegmentedReduce::Dispatch.
     */
    template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Min(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_segments,
        OffsetIteratorT d_begin_offsets,
        OffsetIteratorT d_end_offsets,
        cudaStream_t    stream            = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are expressed as (signed) int
        typedef int OffsetT;

        // Element type read from the input iterator
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        typedef DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::Min> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out,
            num_segments, d_begin_offsets, d_end_offsets,
            cub::Min(),
            Traits<InputT>::Max(),      // TODO: switch to std::numeric_limits<T>::max() once C++11 can be assumed
            stream, debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide minimum in each segment using the
     *        less-than ('<') operator, reporting it as an index/value pair.
     *
     * \p d_in is wrapped in an ArgIndexInputIterator so that each item is
     * paired with its index, and the pairs are reduced with cub::ArgMin.
     * The pair type is the output iterator's value type, or
     * KeyValuePair<int, input value type> when the former is void.
     * The initial pair is (1, Traits<InputValueT>::Max()).
     *
     * \return The cudaError_t produced by DispatchSegmentedReduce::Dispatch.
     */
    template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMin(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_segments,
        OffsetIteratorT d_begin_offsets,
        OffsetIteratorT d_end_offsets,
        cudaStream_t    stream            = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are expressed as (signed) int
        typedef int OffsetT;

        // Value types advertised by the two iterators
        typedef typename std::iterator_traits<InputIteratorT>::value_type  ValueT;
        typedef typename std::iterator_traits<OutputIteratorT>::value_type OutValueT;

        // Index/value pair written to d_out: default to KeyValuePair when the output iterator reports void
        typedef typename If<(Equals<OutValueT, void>::VALUE), KeyValuePair<OffsetT, ValueT>, OutValueT>::Type PairT;

        // Value half of the pair
        typedef typename PairT::Value PairValueT;

        // Adapter that pairs every dereferenced input item with its index
        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, PairValueT> IndexedIteratorT;
        IndexedIteratorT indexed_input(d_in);

        // Starting pair for the reduction.
        // NOTE(review): the key is 1 rather than 0 -- presumably intentional; confirm.
        // TODO: switch to std::numeric_limits<T>::max() once C++11 can be assumed
        PairT init_pair(1, Traits<ValueT>::Max());

        typedef DispatchSegmentedReduce<IndexedIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMin> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            indexed_input, d_out,
            num_segments, d_begin_offsets, d_end_offsets,
            cub::ArgMin(),
            init_pair,
            stream, debug_synchronous);
    }


    /**
     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
     *
     * The reduction functor is cub::Max and the initial value is
     * Traits<InputT>::Lowest(), with \c InputT the input iterator's value type.
     *
     * \return The cudaError_t produced by DispatchSegmentedReduce::Dispatch.
     */
    template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t Max(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_segments,
        OffsetIteratorT d_begin_offsets,
        OffsetIteratorT d_end_offsets,
        cudaStream_t    stream            = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are expressed as (signed) int
        typedef int OffsetT;

        // Element type read from the input iterator
        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

        typedef DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::Max> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out,
            num_segments, d_begin_offsets, d_end_offsets,
            cub::Max(),
            Traits<InputT>::Lowest(),   // TODO: switch to std::numeric_limits<T>::lowest() once C++11 can be assumed
            stream, debug_synchronous);
    }


    /**
     * \brief Finds the first device-wide maximum in each segment using the
     *        greater-than ('>') operator, reporting it as an index/value pair.
     *
     * \p d_in is wrapped in an ArgIndexInputIterator so that each item is
     * paired with its index, and the pairs are reduced with cub::ArgMax.
     * The pair type is the output iterator's value type, or
     * KeyValuePair<int, input value type> when the former is void.
     * The initial pair is (1, Traits<InputValueT>::Lowest()).
     *
     * \return The cudaError_t produced by DispatchSegmentedReduce::Dispatch.
     */
    template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
    CUB_RUNTIME_FUNCTION
    static cudaError_t ArgMax(
        void            *d_temp_storage,
        size_t          &temp_storage_bytes,
        InputIteratorT  d_in,
        OutputIteratorT d_out,
        int             num_segments,
        OffsetIteratorT d_begin_offsets,
        OffsetIteratorT d_end_offsets,
        cudaStream_t    stream            = 0,
        bool            debug_synchronous = false)
    {
        // Global offsets are expressed as (signed) int
        typedef int OffsetT;

        // Value types advertised by the two iterators
        typedef typename std::iterator_traits<InputIteratorT>::value_type  ValueT;
        typedef typename std::iterator_traits<OutputIteratorT>::value_type OutValueT;

        // Index/value pair written to d_out: default to KeyValuePair when the output iterator reports void
        typedef typename If<(Equals<OutValueT, void>::VALUE), KeyValuePair<OffsetT, ValueT>, OutValueT>::Type PairT;

        // Value half of the pair
        typedef typename PairT::Value PairValueT;

        // Adapter that pairs every dereferenced input item with its index
        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, PairValueT> IndexedIteratorT;
        IndexedIteratorT indexed_input(d_in);

        // Starting pair for the reduction.
        // NOTE(review): the key is 1 rather than 0 -- presumably intentional; confirm.
        // TODO: switch to std::numeric_limits<T>::lowest() once C++11 can be assumed
        PairT init_pair(1, Traits<ValueT>::Lowest());

        typedef DispatchSegmentedReduce<IndexedIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMax> DispatchT;

        return DispatchT::Dispatch(
            d_temp_storage, temp_storage_bytes,
            indexed_input, d_out,
            num_segments, d_begin_offsets, d_end_offsets,
            cub::ArgMax(),
            init_pair,
            stream, debug_synchronous);
    }

};


}               // CUB namespace

CUB_NS_POSTFIX  // Optional outer namespace(s)


cub::ArgIndexInputIterator
A random-access input wrapper for pairing dereferenced values with their corresponding indices (formi...
Definition arg_index_input_iterator.cuh:114

dispatch_reduce.cuh

dispatch_reduce_by_key.cuh

cub
Optional outer namespace(s)
Definition agent_histogram.cuh:48

cub::reduction_op
OutputIteratorT OffsetT GridEvenShare< OffsetT > ReductionOpT reduction_op
[in] Binary reduction functor
Definition dispatch_reduce.cuh:75

cub::d_begin_offsets
KeyT const ValueT ValueT OffsetIteratorT d_begin_offsets
[in] Pointer to the sequence of beginning offsets of length num_segments, such that d_begin_offsets[i...
Definition dispatch_radix_sort.cuh:334

cub::OffsetT
OffsetT OffsetT
[in] Total number of input data items
Definition dispatch_radix_sort.cuh:75

cub::d_end_offsets
KeyT const ValueT ValueT OffsetIteratorT OffsetIteratorT d_end_offsets
[in] Pointer to the sequence of ending offsets of length num_segments, such that d_end_offsets[i]-1 i...
Definition dispatch_radix_sort.cuh:335

cub::d_out
OutputIteratorT d_out
[out] Pointer to the output aggregate
Definition dispatch_reduce.cuh:71

cub::ArgMax
Arg max functor (keeps the value and offset of the first occurrence of the larger item)
Definition thread_operators.cuh:138

cub::ArgMin
Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
Definition thread_operators.cuh:173

cub::DeviceSegmentedReduce
DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across mult...
Definition device_segmented_reduce.cuh:66

cub::DeviceSegmentedReduce::ArgMax
static CUB_RUNTIME_FUNCTION cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide maximum in each segment using the greater-than ('>') operator,...
Definition device_segmented_reduce.cuh:568

cub::DeviceSegmentedReduce::Reduce
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
Definition device_segmented_reduce.cuh:133

cub::DeviceSegmentedReduce::Sum
static CUB_RUNTIME_FUNCTION cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented sum using the addition ('+') operator.
Definition device_segmented_reduce.cuh:215

cub::DeviceSegmentedReduce::ArgMin
static CUB_RUNTIME_FUNCTION cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Finds the first device-wide minimum in each segment using the less-than ('<') operator,...
Definition device_segmented_reduce.cuh:385

cub::DeviceSegmentedReduce::Min
static CUB_RUNTIME_FUNCTION cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented minimum using the less-than ('<') operator.
Definition device_segmented_reduce.cuh:300

cub::DeviceSegmentedReduce::Max
static CUB_RUNTIME_FUNCTION cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented maximum using the greater-than ('>') operator.
Definition device_segmented_reduce.cuh:483

cub::DispatchSegmentedReduce::Dispatch
CUB_RUNTIME_FUNCTION static __forceinline__ cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOpT reduction_op, OutputT init, cudaStream_t stream, bool debug_synchronous)
Definition dispatch_reduce.cuh:835

cub::Equals
Type equality test.
Definition util_type.cuh:99

cub::If
Type selection (IF ? ThenType : ElseType)
Definition util_type.cuh:73

cub::KeyValuePair
A key identifier paired with a corresponding value.
Definition util_type.cuh:667

cub::Max
Default max functor.
Definition thread_operators.cuh:124

cub::Min
Default min functor.
Definition thread_operators.cuh:159

cub::Sum
Default sum functor.
Definition thread_operators.cuh:110

cub::Traits
Type traits.
Definition util_type.cuh:1158