doxygen/openfpm/segreduce__ofp_8cuh_source.html

/*

 * segreduce_ofp.cuh

 *

 *  Created on: May 15, 2019

 *      Author: i-bird

 */


 #ifndef SEGREDUCE_OFP_HPP_

 #define SEGREDUCE_OFP_HPP_


 #ifdef __NVCC__


 #include "util/cuda_launch.hpp"

 #include "util/ofp_context.hpp"


 #if CUDART_VERSION >= 11000

    // Here we have for sure CUDA >= 11

    #ifndef CUDA_ON_CPU

        #ifdef __HIP__

            #include "hipcub/hipcub.hpp"

        #else

            #include "cub/cub.cuh"

        #endif

    #endif

#else

    #include "cub_old/cub.cuh"

#endif


 namespace openfpm
 {
    /*! \brief Segmented reduction: combine the elements of every segment of `input` with `op`
     *         and store one result per segment in `output`
     *
     * Segment `i` starts at `segments[i]` and ends right before the start of segment `i+1`.
     *
     * Host path (CUDA_ON_CPU): the last segment ends at `count`. A non-empty segment is
     * reduced starting from its first element (`init` is not folded in); an empty segment
     * produces `init`.
     *
     * Device path: the work is delegated to (hip)cub `DeviceSegmentedReduce::Reduce`, with
     * `segments` as begin offsets and `segments + 1` as end offsets, so the end of the last
     * segment is read from `segments[num_segments]` and `count` is not used.
     * NOTE(review): this requires `segments` to hold `num_segments + 1` offsets on the device
     * path - presumably the callers guarantee it, confirm.
     * NOTE(review): the status returned by the (hip)cub calls is not checked.
     *
     * \param input iterator to the values to reduce
     * \param count number of elements in `input` (end of the last segment on the host path)
     * \param segments iterator to the start offset of each segment
     * \param num_segments number of segments, i.e. number of results written to `output`
     * \param output iterator where the per-segment results are stored
     * \param op binary reduction operator, invoked as `op(accumulated, element)` on the host path
     * \param init value produced for empty segments on the host path; forwarded to (hip)cub as
     *        the initial value on the device path
     * \param context gpu context; on the device path it provides the temporary buffer
     *        used as (hip)cub workspace
     *
     */
    template<typename input_it,
             typename segments_it, typename output_it, typename op_t, typename type_t>
    void segreduce(input_it input, int count, segments_it segments,
                    int num_segments, output_it output, op_t op, type_t init,
                    gpu::ofp_context_t & context)
    {
 #ifdef CUDA_ON_CPU

        // No segment, nothing to write: also avoids touching segments[0] / output[0]
        // when the offset list is empty
        if (num_segments <= 0)
        {return;}

        for (int seg = 0 ; seg < num_segments ; ++seg)
        {
            int pos = segments[seg];

            // The last segment has no following start offset: it runs up to the end of the input
            int stop = count;
            if (seg + 1 < num_segments)
            {stop = segments[seg + 1];}

            // Empty segment (the last one included): the result is the initial value
            output[seg] = init;
            if (pos == stop)
            {continue;}

            // Non-empty segment: seed with its first element and fold in the remaining ones
            output[seg] = input[pos];
            for (++pos ; pos < stop ; ++pos)
            {
                output[seg] = op(output[seg],input[pos]);
            }
        }

 #else

        // hipCUB exposes the same interface as CUB, so both back-ends share the code below
        #ifdef __HIP__
        namespace cub_backend = hipcub;
        #else
        namespace cub_backend = cub;
        #endif

        size_t temp_storage_bytes = 0;

        // First call, with a null workspace: only fills temp_storage_bytes with the
        // workspace size required by the reduction
        cub_backend::DeviceSegmentedReduce::Reduce(nullptr, temp_storage_bytes, input, output,
            num_segments, segments, segments + 1, op, init);

        // Workspace taken from the context temporary buffer
        auto & temporal = context.getTemporalCUB();
        temporal.resize(temp_storage_bytes);

        // Second call: the actual segmented reduction
        cub_backend::DeviceSegmentedReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes, input, output,
            num_segments, segments, segments + 1, op, init);

 #endif
    }
 }


 #endif /* __NVCC__ */


 #endif /* SEGREDUCE_OFP_HPP_ */


cub.cuh

cub::init
OutputIteratorT OffsetT ReductionOpT OuputT init
< [in] The initial value of the reduction
Definition dispatch_reduce.cuh:119

openfpm
convert a type into constant type
Definition aggregate.hpp:293

cub::DeviceSegmentedReduce::Reduce
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, OffsetIteratorT d_begin_offsets, OffsetIteratorT d_end_offsets, ReductionOp reduction_op, T initial_value, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide segmented reduction using the specified binary reduction_op functor.
Definition device_segmented_reduce.cuh:133

gpu::ofp_context_t
Definition ofp_context.hpp:303