doxygen/openfpm/reduce__ofp_8cuh_source.html

/*

 * reduce_ofp.cuh

 *

 *  Created on: May 15, 2019

 *      Author: i-bird

 */


#ifndef REDUCE_OFP_HPP_

#define REDUCE_OFP_HPP_


#ifdef __NVCC__


#include "util/cuda_launch.hpp"

#include "util/ofp_context.hpp"


#if CUDART_VERSION >= 11000

    // CUDA >= 11 is guaranteed in this branch

    #ifndef CUDA_ON_CPU

        #ifdef __HIP__

            #include "hipcub/hipcub.hpp"

        #else

            #include "cub/cub.cuh"

        #endif

    #endif

#else

    #include "cub_old/cub.cuh"

#endif


namespace openfpm
{
    /**
     * \brief Reduce the first `count` elements of `input` with the binary
     *        functor `op` and store the single result in `output[0]`.
     *
     * Three build configurations are supported:
     *  - CUDA_ON_CPU : sequential fold executed on the host;
     *  - __HIP__     : hipcub::DeviceReduce::Reduce;
     *  - otherwise   : cub::DeviceReduce::Reduce.
     *
     * In every configuration the fold starts from zero: the host path seeds
     * `output[0]` with 0, and the device paths pass `false` in the argument
     * slot that cub::DeviceReduce::Reduce names `init` (hipcub is presumably
     * signature-compatible -- verify against the hipCUB headers). The seed
     * takes part in the reduction, so `op` is expected to treat zero as a
     * neutral starting value.
     *
     * No stream argument is forwarded, so the library's default stream
     * parameter applies.
     *
     * NOTE(review): the status values returned by the Reduce calls are
     * discarded; a failing device reduction is not reported to the caller.
     *
     * \tparam input_it  iterator type used to read the elements
     * \tparam output_it iterator type used to write the result
     * \tparam reduce_op binary functor type
     *
     * \param input   first element to reduce
     * \param count   number of elements to reduce
     * \param output  destination; element 0 receives the result
     * \param op      binary reduction functor
     * \param context supplies the reusable temporary buffer for the device
     *                paths (not used when CUDA_ON_CPU is defined)
     */
    template<typename input_it, typename output_it, typename reduce_op>
    void reduce(input_it input, int count, output_it output, reduce_op op, gpu::ofp_context_t& context)
    {
#ifdef CUDA_ON_CPU

        // Host fallback: left fold accumulated directly in output[0], seeded with zero.
        output[0] = 0;

        int pos = 0;
        while (pos < count)
        {
            output[0] = op(output[0], input[pos]);
            ++pos;
        }

#else

        // Select the backend once so both device builds share a single code path.
    #ifdef __HIP__
        using device_reduce = hipcub::DeviceReduce;
    #else
        using device_reduce = cub::DeviceReduce;
    #endif

        // Size query: with a null scratch pointer the call only reports the
        // number of temporary bytes it needs through scratch_bytes.
        size_t scratch_bytes = 0;
        device_reduce::Reduce(nullptr, scratch_bytes, input, output, count, op, false);

        // Grow the context-owned temporary buffer to the requested size.
        auto & scratch = context.getTemporalCUB();
        scratch.resize(scratch_bytes);

        // Actual reduction, using the context buffer as scratch space.
        device_reduce::Reduce(scratch.template getDeviceBuffer<0>(),
                              scratch_bytes, input, output, count, op, false);

#endif
    }
}


#endif


#endif /* REDUCE_OFP_HPP_ */

cub.cuh

openfpm
convert a type into constant type
Definition aggregate.hpp:293

cub::DeviceReduce::Reduce
static CUB_RUNTIME_FUNCTION cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream=0, bool debug_synchronous=false)
Computes a device-wide reduction using the specified binary reduction_op functor and initial value in...
Definition device_reduce.cuh:148

gpu::ofp_context_t
Definition ofp_context.hpp:303

reduce_op
this class is a functor for "for_each" algorithm
Definition trash_bin.hpp:143