#ifndef MAP_VECTOR_SPARSE_HPP_
#define MAP_VECTOR_SPARSE_HPP_

#include "util/cuda_launch.hpp"
#include "Vector/map_vector.hpp"
#include "Vector/cuda/map_vector_sparse_cuda_ker.cuh"
#include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh"
#include "util/ofp_context.hpp"
#include "util/cuda/kernels.cuh"
#include "util/cuda/scan_ofp.cuh"
#include "util/cuda/sort_ofp.cuh"
#include "util/cuda/segreduce_ofp.cuh"
#include "util/cuda/merge_ofp.cuh"
template<typename OfpmVectorT>
using ValueTypeOf = typename std::remove_reference<OfpmVectorT>::type::value_type;
//! functor that moves the properties of a selected element from host to device
template<typename sg_type>
struct htoD
{
    //! encapsulated source object
    sg_type & sg;

    //! element to transfer
    unsigned int lele;

    htoD(sg_type & sg, unsigned int lele)
    :sg(sg),lele(lele)
    {};

    template<typename T>
    inline void operator()(T& t) const
    {
        sg.template hostToDevice<T::value>(lele,lele);
    }
};
constexpr int VECTOR_SPARSE_STANDARD = 1;
constexpr int VECTOR_SPARSE_BLOCK = 2;
template<typename reduction_type, unsigned int impl>
struct cpu_block_process
{
    template<typename encap_src, typename encap_dst>
    static inline void process(encap_src & src, encap_dst & dst)
    {
        dst = reduction_type::red(dst,src);
    }
};
template<typename reduction_type>
struct cpu_block_process<reduction_type,VECTOR_SPARSE_BLOCK>
{
    template<typename encap_src, typename encap_dst>
    static inline void process(encap_src & src, encap_dst & dst)
    {
        for (size_t i = 0 ; i < encap_src::size ; i++)
        {
            dst[i] = reduction_type::red(dst[i],src[i]);
        }
    }
};
template<typename reduction_type>
struct cpu_block_process<reduction_type,3>
{
    template<typename encap_src, typename encap_dst, unsigned int N1>
    static inline void process(encap_src & src, encap_dst (& dst)[N1])
    {
        for (unsigned int j = 0 ; j < N1 ; j++)
        {
            for (size_t i = 0 ; i < encap_dst::size ; i++)
            {
                dst[i][j] = reduction_type::red(dst[i][j],src[j][i]);
            }
        }
    }

    template<unsigned int N1, unsigned int blockSize, typename encap_src, typename encap_dst>
    static inline void process_e(encap_src & src, encap_dst & dst)
    {
        for (unsigned int j = 0 ; j < N1 ; j++)
        {
            for (size_t i = 0 ; i < blockSize ; i++)
            {
                dst[i][j] = reduction_type::red(dst[i][j],src[i][j]);
            }
        }
    }
};
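/* Illustrative sketch (not from the original file): what cpu_block_process does
 * in the scalar case. With a hypothetical sum-like reduction functor, process()
 * folds the source into the destination in a single red() call; the blocked
 * specializations above apply the same fold element by element.
 *
 * \code
 * struct sum_red
 * {
 *     template<typename S> static S red(S & a, S & b)  {return a + b;}
 * };
 *
 * double dst = 1.0, src = 2.0;
 * cpu_block_process<sum_red,VECTOR_SPARSE_STANDARD>::process(src,dst);  // dst == 3.0
 * \endcode
 */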
//! Functor switch to select the vector sparse for standard scalar and blocked implementation
template<unsigned int impl, typename block_functor>
struct scalar_block_implementation_switch // Case for scalar implementations
{
    template <unsigned int p, typename vector_index_type>
    static void extendSegments(vector_index_type & segments, size_t dataSize)
    {
#ifdef __NVCC__
        // append a trailing element that closes the last segment
        segments.resize(segments.size()+1);
        segments.template get<p>(segments.size() - 1) = dataSize;
        segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
    }
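/* Why extendSegments appends dataSize: segment i is consumed as the half-open
 * range [off[i], off[i+1]), so a trailing element is needed to close the last
 * segment. The same idea in plain host code (hypothetical values):
 *
 * \code
 * #include <vector>
 * std::vector<int> off = {0, 3, 7};  // starts of 3 segments
 * size_t dataSize = 10;
 * off.push_back(dataSize);           // {0, 3, 7, 10}: segment 2 spans [7, 10)
 * \endcode
 */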
    template <unsigned int pSegment, typename vector_reduction, typename T,
              typename vector_data_type, typename vector_index_type, typename vector_index_type2>
    static void segreduce(vector_data_type & vector_data,
            vector_data_type & vector_data_unsorted,
            vector_index_type & vector_data_map,
            vector_index_type2 & segment_offset,
            vector_data_type & vector_data_red,
            block_functor & blf,
            gpu::ofp_context_t & context)
    {
#ifdef __NVCC__
        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
        typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
        typedef typename reduction_type::template op_red<red_type> red_op;
        typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type;
        typename reduction_type::template op_initial_value<red_type> initial_value_functor;

        assert((std::is_same<seg_type,int>::value == true));

        openfpm::segreduce(
            (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
            (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size()-1,
            (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
            red_op(), initial_value_functor(), context);
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
    }
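/* Semantics of the segmented reduction above, written as plain host code:
 * every segment [off[s], off[s+1]) of the value array collapses into one
 * output element. Sketch with a sum reduction and hypothetical data:
 *
 * \code
 * int off[] = {0, 3, 7, 10};               // 3 closed segments
 * double v[] = {1,1,1, 2,2,2,2, 3,3,3};
 * double out[3];
 * for (int s = 0 ; s < 3 ; s++)
 * {
 *     double acc = 0.0;                    // op_initial_value
 *     for (int k = off[s] ; k < off[s+1] ; k++)
 *     {acc += v[k];}                       // op_red
 *     out[s] = acc;                        // out = {3, 8, 9}
 * }
 * \endcode
 */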
    template<typename vector_data_type,
             typename vector_index_type,
             typename vector_index_type2,
             typename vector_index_dtmp_type,
             typename Ti,
             typename ... v_reduce>
    static void solveConflicts(vector_index_type & vct_index_old,
            vector_index_type & vct_index_merge,
            vector_index_type & vct_index_merge_id,
            vector_index_type & vct_index_out,
            vector_index_dtmp_type & vct_index_dtmp,
            vector_index_type & data_map,
            vector_index_type2 & segments_new,
            vector_data_type & vct_data_old,
            vector_data_type & vct_add_data,
            vector_data_type & vct_add_data_unique,
            vector_data_type & vct_data_out,
            ite_gpu<1> & itew,
            block_functor & blf,
            gpu::ofp_context_t & context)
    {
#ifdef __NVCC__
        CUDA_LAUNCH((solve_conflicts<
                decltype(vct_index_merge.toKernel()),
                decltype(vct_data_old.toKernel()),
                decltype(vct_index_dtmp.toKernel()),
                128,
                v_reduce ...>),
                itew,
                vct_index_merge.toKernel(),vct_data_old.toKernel(),
                vct_index_merge_id.toKernel(),vct_add_data_unique.toKernel(),
                vct_index_out.toKernel(),vct_data_out.toKernel(),
                vct_index_dtmp.toKernel(),
                vct_index_old.size());

        // scan the per-block survivor counts to get output offsets and the merged size
        openfpm::scan(
            (Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),
            vct_index_dtmp.size(),
            (Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),
            context);

        vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
        int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);

        vct_index_old.resize(size);
        vct_data_old.resize(size);

        CUDA_LAUNCH(realign,itew,vct_index_out.toKernel(),vct_data_out.toKernel(),
                vct_index_old.toKernel(), vct_data_old.toKernel(),
                vct_index_dtmp.toKernel());
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
    }
};
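/* The pattern above (count, scan, realign) is a standard GPU compaction: each
 * 128-thread block reports how many merged elements it keeps, the exclusive
 * scan turns counts into write offsets, and realign copies each block's
 * survivors to its offset. Host sketch with hypothetical counts:
 *
 * \code
 * int cnt[] = {128, 96, 128, 40};  // survivors per block
 * int off[4];
 * int acc = 0;
 * for (int i = 0 ; i < 4 ; i++)    // exclusive scan
 * {off[i] = acc; acc += cnt[i];}
 * // acc == 392 is the final size; block i writes at off[i]
 * \endcode
 */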
template<typename block_functor>
struct scalar_block_implementation_switch<2, block_functor> // Case for blocked implementations
{
    template <unsigned int p, typename vector_index_type>
    static void extendSegments(vector_index_type & segments, size_t dataSize)
    {
#ifdef __NVCC__
        // append a trailing element that closes the last segment
        segments.resize(segments.size()+1);
        segments.template get<p>(segments.size() - 1) = dataSize;
        segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
    }
    template <unsigned int pSegment, typename vector_reduction, typename T,
              typename vector_data_type, typename vector_index_type, typename vector_index_type2>
    static void segreduce(vector_data_type & vector_data,
            vector_data_type & vector_data_unsorted,
            vector_index_type & vector_data_map,
            vector_index_type2 & segment_offset,
            vector_data_type & vector_data_red,
            block_functor & blf,
            gpu::ofp_context_t & context)
    {
        // the blocked case delegates the segmented reduction to the block functor
        blf.template seg_reduce<pSegment,vector_reduction,T>(segment_offset,vector_data,vector_data_red);
    }
    template<typename vector_data_type,
             typename vector_index_type,
             typename vector_index_type2,
             typename vector_index_dtmp_type,
             typename Ti,
             typename ... v_reduce>
    static void solveConflicts(vector_index_type & vct_index_old,
            vector_index_type & vct_index_merge,
            vector_index_type & vct_index_merge_id,
            vector_index_type & vct_index_out,
            vector_index_dtmp_type & vct_index_dtmp,
            vector_index_type & data_map,
            vector_index_type2 & segments_new,
            vector_data_type & vct_data,
            vector_data_type & vct_add_data,
            vector_data_type & vct_add_data_unique,
            vector_data_type & vct_data_out,
            ite_gpu<1> & itew,
            block_functor & blf,
            gpu::ofp_context_t & context)
    {
#ifdef __NVCC__
        blf.template solve_conflicts<1,
                decltype(vct_index_merge),
                decltype(segments_new),
                v_reduce ...>
                (vct_index_merge, vct_index_merge_id, segments_new, data_map,
                 vct_data, vct_add_data,
                 vct_index_old, vct_data_out,
                 context);

        vct_data_out.swap(vct_data);
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
    }
};
template<typename Ti>
struct reorder
{
    //! inserted key
    Ti id;
    //! original insertion position
    Ti id2;

    bool operator<(const reorder & t) const
    {
        return id < t.id;
    }
};
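/* reorder pairs the inserted key (id) with its original insertion position
 * (id2), so that sorting by key still tells flush_on_cpu where to gather the
 * corresponding data from. Sketch with hypothetical keys:
 *
 * \code
 * #include <algorithm>
 * #include <vector>
 * std::vector<reorder<long int>> r = {{9,0},{2,1},{9,2}};
 * std::stable_sort(r.begin(),r.end());  // uses operator< above: (2,1) (9,0) (9,2)
 * // the data is then gathered as data[1], data[0], data[2]
 * \endcode
 */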
template<typename reduction_type, typename vector_reduction, typename T,
         unsigned int impl, typename red_type>
struct sparse_vector_reduction_cpu_impl
{
    template<typename vector_data_type, typename vector_index_type, typename vector_index_type_reo>
    static inline void red(size_t & i, vector_data_type & vector_data_red,
            vector_data_type & vector_data,
            vector_index_type & vector_index,
            vector_index_type_reo & reorder_add_index_cpu)
    {
        size_t start = reorder_add_index_cpu.get(i).id;
        red_type red = vector_data.template get<reduction_type::prop::value>(i);

        size_t j = 1;
        for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
        {
            red = reduction_type::red(red,vector_data.template get<reduction_type::prop::value>(i+j));
        }

        vector_data_red.add();
        vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1) = red;

        vector_index.add();
        vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;

        i += j;
    }
};
template<typename reduction_type, typename vector_reduction, typename T,
         unsigned int impl, typename red_type, unsigned int N1>
struct sparse_vector_reduction_cpu_impl<reduction_type,vector_reduction,T,impl,red_type[N1]>
{
    template<typename vector_data_type, typename vector_index_type, typename vector_index_type_reo>
    static inline void red(size_t & i, vector_data_type & vector_data_red,
            vector_data_type & vector_data,
            vector_index_type & vector_index,
            vector_index_type_reo & reorder_add_index_cpu)
    {
        size_t start = reorder_add_index_cpu.get(i).id;
        red_type red[N1];

        for (size_t k = 0 ; k < N1 ; k++)
        {
            red[k] = vector_data.template get<reduction_type::prop::value>(i)[k];
        }

        size_t j = 1;
        for ( ; i+j < reorder_add_index_cpu.size() && reorder_add_index_cpu.get(i+j).id == start ; j++)
        {
            auto ev = vector_data.template get<reduction_type::prop::value>(i+j);
            for (size_t k = 0 ; k < N1 ; k++)
            {
                red[k] = reduction_type::red(red[k],ev[k]);
            }
        }

        vector_data_red.add();
        for (size_t k = 0 ; k < N1 ; k++)
        {
            vector_data_red.template get<reduction_type::prop::value>(vector_data_red.size()-1)[k] = red[k];
        }

        vector_index.add();
        vector_index.template get<0>(vector_index.size() - 1) = reorder_add_index_cpu.get(i).id;

        i += j;
    }
};
//! this class is a functor for the "for_each" algorithm: it runs one reduction per property
template<typename vector_data_type,
         typename vector_index_type,
         typename vector_index_type_reo,
         typename vector_reduction>
struct sparse_vector_reduction_cpu
{
    //! vector in which to do the reduction
    vector_data_type & vector_data_red;

    //! new datas
    vector_data_type & vector_data;

    //! index type vector
    vector_index_type & vector_index;

    //! reorder vector index
    vector_index_type_reo & reorder_add_index_cpu;

    //! constructor
    sparse_vector_reduction_cpu(vector_data_type & vector_data_red,
            vector_data_type & vector_data,
            vector_index_type & vector_index,
            vector_index_type_reo & reorder_add_index_cpu)
    :vector_data_red(vector_data_red),vector_data(vector_data),
     vector_index(vector_index),reorder_add_index_cpu(reorder_add_index_cpu)
    {};

    //! It calls the reduction function for each property
    template<typename T>
    void operator()(T& t) const
    {
        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
        typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,
                                        typename reduction_type::prop>::type red_type;

        if (reduction_type::is_special() == false)
        {
            // runs of equal keys are collapsed through sparse_vector_reduction_cpu_impl
        }
    }
};
//! this class is a functor for the "for_each" algorithm: plain assignment of each property
template<typename encap_src, typename encap_dst, typename vector_reduction>
struct sparse_vector_reduction_solve_conflict_assign_cpu
{
    //! source
    encap_src & src;
    //! destination
    encap_dst & dst;

    //! constructor
    sparse_vector_reduction_solve_conflict_assign_cpu(encap_src & src, encap_dst & dst)
    :src(src),dst(dst)
    {};

    //! It calls the copy function for each property
    template<typename T>
    void operator()(T& t) const
    {
        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;

        dst.template get<reduction_type::prop::value>() = src.template get<reduction_type::prop::value>();
    }
};
template<unsigned int impl, typename vector_reduction, typename T, typename red_type>
struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl
{
    template<typename encap_src, typename encap_dst>
    static inline void red(encap_src & src, encap_dst & dst)
    {
        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;

        cpu_block_process<reduction_type,impl>::process(
            src.template get<reduction_type::prop::value>(),
            dst.template get<reduction_type::prop::value>());
    }
};

template<unsigned int impl, typename vector_reduction, typename T, typename red_type, unsigned int N1>
struct sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type[N1]>
{
    template<typename encap_src, typename encap_dst>
    static inline void red(encap_src & src, encap_dst & dst)
    {
        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;

        auto src_e = src.template get<reduction_type::prop::value>();
        auto dst_e = dst.template get<reduction_type::prop::value>();

        // the N1 components are then reduced element-wise (cpu_block_process::process_e)
    }
};
//! this class is a functor for the "for_each" algorithm: reduce src into dst property by property
template<typename encap_src, typename encap_dst, typename vector_reduction, unsigned int impl>
struct sparse_vector_reduction_solve_conflict_reduce_cpu
{
    //! source
    encap_src & src;
    //! destination
    encap_dst & dst;

    //! constructor
    sparse_vector_reduction_solve_conflict_reduce_cpu(encap_src & src, encap_dst & dst)
    :src(src),dst(dst)
    {};

    //! It calls the reduction function for each property
    template<typename T>
    void operator()(T& t) const
    {
        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
        typedef typename boost::mpl::at<typename encap_src::T_type::type, typename reduction_type::prop>::type red_type;

        sparse_vector_reduction_solve_conflict_reduce_cpu_impl<impl,vector_reduction,T,red_type>::red(src,dst);
    }
};
//! this class is a functor for the "for_each" algorithm: GPU segmented reduction per property
template<typename vector_data_type,
         typename vector_index_type,
         typename vector_index_type2,
         typename vector_reduction,
         typename block_functor,
         unsigned int impl2,
         unsigned int pSegment=1>
struct sparse_vector_reduction
{
    //! vector in which to do the reduction
    vector_data_type & vector_data_red;

    //! new datas
    vector_data_type & vector_data;

    //! new data in an unsorted way
    vector_data_type & vector_data_unsorted;

    //! map of the data
    vector_index_type & vector_data_map;

    //! segment of offsets
    vector_index_type2 & segment_offset;

    //! block functor
    block_functor & blf;

    //! gpu context
    gpu::ofp_context_t & context;

    //! constructor
    sparse_vector_reduction(vector_data_type & vector_data_red,
            vector_data_type & vector_data,
            vector_data_type & vector_data_unsorted,
            vector_index_type & vector_data_map,
            vector_index_type2 & segment_offset,
            block_functor & blf,
            gpu::ofp_context_t & context)
    :vector_data_red(vector_data_red),vector_data(vector_data),vector_data_unsorted(vector_data_unsorted),
     vector_data_map(vector_data_map),segment_offset(segment_offset),blf(blf),context(context)
    {};

    //! It calls the reduction function for each property
    template<typename T>
    void operator()(T& t) const
    {
#ifdef __NVCC__
        typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
        typedef typename boost::mpl::at<typename ValueTypeOf<vector_data_type>::type,
                                        typename reduction_type::prop>::type red_type;

        if (reduction_type::is_special() == false)
        {
            scalar_block_implementation_switch<impl2, block_functor>::template segreduce<pSegment,vector_reduction,T>(
                vector_data, vector_data_unsorted, vector_data_map, segment_offset, vector_data_red, blf, context);
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
    }
};
//! stub (no-op) block functor, used when the standard scalar implementation is selected
struct stub_block_functor
{
    template<unsigned int pSegment, typename vector_reduction, typename T,
             typename vector_index_type, typename vector_data_type>
    static bool seg_reduce(vector_index_type & segments, vector_data_type & src, vector_data_type & dst)
    {
        return true;
    }

    template<typename vector_index_type, typename vector_data_type, typename ... v_reduce>
    static bool solve_conflicts(vector_index_type &keys, vector_index_type &merge_indices,
            vector_data_type &data1, vector_data_type &data2,
            vector_index_type &indices_tmp, vector_data_type &data_tmp,
            vector_index_type &keysOut, vector_data_type &dataOut,
            gpu::ofp_context_t & context)
    {
        return true;
    }
};
//! this class is a functor for the "for_each" algorithm: it runs the special reduction operators
template<typename vector_data_type,
         typename vector_index_type,
         typename vector_reduction>
struct sparse_vector_special
{
    //! vector in which to do the reduction
    vector_data_type & vector_data_red;

    //! new datas
    vector_data_type & vector_data;

    //! segment of offsets
    vector_index_type & segment_offset;

    //! gpu context
    gpu::ofp_context_t & context;

    //! constructor
    sparse_vector_special(vector_data_type & vector_data_red,
            vector_data_type & vector_data,
            vector_index_type & segment_offset,
            gpu::ofp_context_t & context)
    :vector_data_red(vector_data_red),vector_data(vector_data),
     segment_offset(segment_offset),context(context)
    {};

    //! It calls the reduction function for each property
    template<typename T>
    void operator()(T& t) const
    {
#ifdef __NVCC__
        typedef typename boost::mpl::at<vector_reduction,T>::type reduction_type;
        typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;

        if (reduction_type::is_special() == true)
        {
            // only the special operators are processed here
        }
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
#endif // __NVCC__
    }
};
template<typename T,
         typename Ti = long int,
         typename Memory = HeapMemory,
         typename layout_base = memory_traits_lin,
         typename grow_p = grow_policy_double,
         unsigned int impl = vect_isel<T>::value,
         unsigned int impl2 = VECTOR_SPARSE_STANDARD,
         typename block_functor = stub_block_functor>
class vector_sparse
{
    int n_gpu_add_block_slot = 0;
    int n_gpu_rem_block_slot = 0;
    /*! \brief get the element i
     *
     * \param x key to search
     * \param id returns the position of the first key >= x
     * \return the value of that key, or -1 past the end
     */
    template<bool prefetch>
    inline Ti _branchfree_search_nobck(Ti x, Ti & id) const
    {
        if (vct_index.size() == 0) {id = 0; return -1;}
        const Ti *base = &vct_index.template get<0>(0);
        const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
        Ti n = vct_data.size()-1;
        while (n > 1)
        {
            Ti half = n / 2;
            if (prefetch)
            {
                __builtin_prefetch(base + half/2, 0, 0);
                __builtin_prefetch(base + half + half/2, 0, 0);
            }
            base = (base[half] < x) ? base+half : base;
            n -= half;
        }

        int off = (*base < x);
        id = base - &vct_index.template get<0>(0) + off;
        return (base + off != end)?*(base + off):-1;
    }
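/* Contract sketch (vs is a hypothetical instance, index type long int): the
 * search returns the first key >= x and reports its slot in id, so callers
 * detect a hit by comparing x with the returned value:
 *
 * \code
 * // vct_index = {2, 5, 9}
 * long int id;
 * long int v = vs._branchfree_search_nobck<false>(5,id);  // v == 5, id == 1 (hit)
 * v = vs._branchfree_search_nobck<false>(7,id);           // v == 9, id == 2 (miss)
 * v = vs._branchfree_search_nobck<false>(12,id);          // v == -1, id == 3 (past the end)
 * \endcode
 */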
    /*! \brief get the element i
     *
     * \param x key to search
     * \param id position of the key, or the background slot when the key is absent
     */
    template<bool prefetch>
    inline void _branchfree_search(Ti x, Ti & id) const
    {
        Ti v = _branchfree_search_nobck<prefetch>(x,id);
        id = (x == v)?id:vct_data.size()-1;
    }
    /*! \brief make the staged insertions a continuos array, scanning the per-block insert counters
     *
     * \return the total number of staged insertions
     */
    template<typename vector_index_type, typename vector_data_type>
    size_t make_continuos(vector_index_type & vct_nadd_index,
            vector_index_type & vct_add_index,
            vector_index_type & vct_add_cont_index,
            vector_index_type & vct_add_cont_index_map,
            vector_data_type & vct_add_data,
            vector_data_type & vct_add_data_cont,
            gpu::ofp_context_t & context)
    {
        // add a 0 at the end of the per-block counters
        vct_nadd_index.resize(vct_nadd_index.size()+1);
        vct_nadd_index.template get<0>(vct_nadd_index.size()-1) = 0;
        vct_nadd_index.template hostToDevice<0>(vct_nadd_index.size()-1,vct_nadd_index.size()-1);

        // merge the lists of inserted points of each block
        vct_index_tmp4.resize(vct_nadd_index.size());

        openfpm::scan(
            (Ti *)vct_nadd_index.template getDeviceBuffer<0>(),
            vct_nadd_index.size(),
            (Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),
            context);

        vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
        size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

        vct_add_cont_index.resize(n_ele);
        vct_add_cont_index_map.resize(n_ele);

        if (impl2 == VECTOR_SPARSE_STANDARD)
        {
            vct_add_data_cont.resize(n_ele);
        }
        else
        {
            vct_segment_index_map.resize(n_ele);
        }

        if (n_gpu_add_block_slot >= 128)
        {
            ite_gpu<1> itew;
            itew.wthr.x = vct_nadd_index.size()-1;
            itew.wthr.y = 1;
            itew.wthr.z = 1;
            itew.thr.x = 128;
            itew.thr.y = 1;
            itew.thr.z = 1;

            CUDA_LAUNCH(construct_insert_list_key_only,itew,vct_add_index.toKernel(),
                    vct_nadd_index.toKernel(),
                    vct_index_tmp4.toKernel(),
                    vct_add_cont_index.toKernel(),
                    vct_add_cont_index_map.toKernel(),
                    n_gpu_add_block_slot);
        }
        else
        {
            auto itew = vct_add_index.getGPUIterator();

            CUDA_LAUNCH(construct_insert_list_key_only_small_pool,itew,vct_add_index.toKernel(),
                    vct_nadd_index.toKernel(),
                    vct_index_tmp4.toKernel(),
                    vct_add_cont_index.toKernel(),
                    vct_add_cont_index_map.toKernel(),
                    n_gpu_add_block_slot);
        }

        return n_ele;
    }
    /*! \brief sort the continuos array of inserted keys
     */
    void reorder_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index,
            vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
            vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
            vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
            gpu::ofp_context_t & context)
    {
        ite_gpu<1> itew;
        itew.wthr.x = vct_nadd_index.size()-1;

        size_t n_ele = vct_add_cont_index.size();

        n_gpu_add_block_slot = 0;

        // sort the inserted keys, carrying the original positions along
        openfpm::sort(
            (Ti *)vct_add_cont_index.template getDeviceBuffer<0>(),
            (Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(),
            vct_add_cont_index.size(),
            gpu::template less_t<Ti>(),
            context);

        auto ite = vct_add_cont_index.getGPUIterator();

        // gather the data in the sorted order
        if (impl2 == VECTOR_SPARSE_STANDARD)
        {
            vct_add_data_reord.resize(n_ele);
            CUDA_LAUNCH(reorder_vector_data,ite,vct_add_cont_index_map.toKernel(),vct_add_data_cont.toKernel(),vct_add_data_reord.toKernel());
        }
    }
    /*! \brief Merge the old sorted indexes with the new (sorted, unique) inserted ones
     */
    template<typename ... v_reduce>
    void merge_indexes(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_sort,
            vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique,
            vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index,
            vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map,
            gpu::ofp_context_t & context)
    {
#ifdef __NVCC__
        typedef boost::mpl::vector<v_reduce...> vv_reduce;

        auto ite = vct_add_index_sort.getGPUIterator();

        // produce the list of unique added indexes
        vct_add_index_unique.resize(vct_add_index_sort.size()+1);

        ite = vct_add_index_sort.getGPUIterator();

        vct_index_tmp4.resize(vct_add_index_sort.size()+1);

        // mark the positions where a new key starts
        CUDA_LAUNCH(
            (find_buffer_offsets_for_scan<0,
                decltype(vct_add_index_sort.toKernel()),
                decltype(vct_index_tmp4.toKernel())>),
            ite,
            vct_add_index_sort.toKernel(),
            vct_index_tmp4.toKernel());

        openfpm::scan((Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_index_tmp4.size(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),context);

        vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
        int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

        vct_add_index_unique.resize(n_ele_unique);

        if (impl2 == VECTOR_SPARSE_STANDARD)
        {
            vct_add_data_unique.resize(n_ele_unique);
        }

        CUDA_LAUNCH(
            (construct_index_unique<0>),
            ite,
            vct_add_index_sort.toKernel(),
            vct_index_tmp4.toKernel(),
            vct_add_index_unique.toKernel());

        // enumerate the old indexes
        vct_m_index.resize(vct_index.size());

        if (vct_m_index.size() != 0)
        {
            ite = vct_m_index.getGPUIterator();
            CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);
        }

        vct_merge_index.resize(vct_index.size() + vct_add_index_unique.size());
        vct_merge_index_map.resize(vct_index.size() + vct_add_index_unique.size());
        vct_index_tmp3.resize(vct_index.size() + vct_add_index_unique.size());

        if (impl2 == VECTOR_SPARSE_STANDARD)
        {
            vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1);
            vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size());
        }

        // tag the new indexes past the end of the old ones
        ite = vct_add_index_unique.getGPUIterator();
        vct_index_tmp4.resize(vct_add_index_unique.size());
        CUDA_LAUNCH((set_indexes<0>),ite,vct_index_tmp4.toKernel(),vct_index.size());

        ite_gpu<1> itew;
        itew.wthr.x = vct_merge_index.size() / 128 + (vct_merge_index.size() % 128 != 0);
        itew.wthr.y = 1;
        itew.wthr.z = 1;
        itew.thr.x = 128;
        itew.thr.y = 1;
        itew.thr.z = 1;

        vct_index_dtmp.resize(itew.wthr.x);

        // merge the two sorted key lists, carrying the origin tags along
        openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
                (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(),
                (Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context);
#endif // __NVCC__
    }
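/* openfpm::merge above behaves like std::merge over (key,tag) pairs: the old
 * keys are tagged with their position, the new keys are tagged past
 * vct_index.size(), and the merged tag array later tells solve_conflicts
 * where every element came from. Host sketch with hypothetical keys:
 *
 * \code
 * #include <algorithm>
 * #include <utility>
 * std::pair<long,long> a[] = {{2,0},{9,1}};  // old keys + positions
 * std::pair<long,long> b[] = {{5,2}};        // new key, tag offset by the old size
 * std::pair<long,long> out[3];
 * std::merge(a,a+2,b,b+1,out);               // out = {2,0} {5,2} {9,1}
 * \endcode
 */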
    template<typename ... v_reduce>
    void merge_datas(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
            vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new,
            vector<T,Memory,layout_base,grow_p> & vct_add_data,
            vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map,
            gpu::ofp_context_t & context)
    {
#ifdef __NVCC__
        ite_gpu<1> itew;
        itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
        itew.wthr.y = 1;
        itew.wthr.z = 1;
        itew.thr.x = 128;
        itew.thr.y = 1;
        itew.thr.z = 1;

        typedef boost::mpl::vector<v_reduce...> vv_reduce;

        // Calculate the number of elements on each segment
        scalar_block_implementation_switch<impl2, block_functor>
            ::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord_map.size());

        if (impl2 == VECTOR_SPARSE_STANDARD)
        {
            sparse_vector_reduction<typename std::remove_reference<decltype(vct_add_data)>::type,
                    decltype(vct_add_data_reord_map),
                    decltype(vct_add_index_unique),vv_reduce,block_functor,impl2>
                svr(vct_add_data_unique,
                    vct_add_data_reord,
                    vct_add_data,
                    vct_add_data_reord_map,
                    vct_add_index_unique,
                    blf,
                    context);

            boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

            vct_add_index_unique.remove(vct_add_index_unique.size()-1);
        }

        // run the special reduction operators
        sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type,
                decltype(vct_add_index_unique),
                vv_reduce> svr2(vct_add_data_unique,vct_add_data_reord,vct_add_index_unique,context);
        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr2);

        scalar_block_implementation_switch<impl2, block_functor>::template solveConflicts<
                decltype(vct_data),
                decltype(vct_index),
                decltype(segments_new),
                decltype(vct_index_dtmp),
                Ti,
                v_reduce ...>
            (
                vct_index,
                vct_index_tmp,
                vct_index_tmp2,
                vct_index_tmp3,
                vct_index_dtmp,
                vct_add_data_reord_map,
                segments_new,
                vct_data,
                vct_add_data_reord,
                vct_add_data_unique,
                vct_add_data_cont,
                itew,
                blf,
                context
            );
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
    template<typename ... v_reduce>
    void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
            vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
            vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
            gpu::ofp_context_t & context)
    {
#ifdef __NVCC__
        // nothing to do if no insert buffer is set or no element was staged
        if (n_gpu_add_block_slot == 0 || vct_add_index.size() == 0)
        {return;}

        size_t n_ele = make_continuos(vct_nadd_index,vct_add_index,vct_add_index_cont_0,vct_add_index_cont_1,
                vct_add_data,vct_add_data_cont,context);

        // nothing was actually inserted
        if (vct_add_index_cont_0.size() == 0)
        {return;}

        reorder_indexes(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,vct_add_data,context);

        merge_indexes<v_reduce ... >(vct_add_index_cont_0,vct_add_index_unique,
                vct_index_tmp,vct_index_tmp2,
                context);

        merge_datas<v_reduce ... >(vct_add_data_reord,vct_add_index_unique,vct_add_data,vct_add_index_cont_1,context);
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
    void flush_on_gpu_remove(gpu::ofp_context_t & context)
    {
#ifdef __NVCC__
        // add a 0 at the end of the per-block remove counters
        vct_nrem_index.resize(vct_nrem_index.size()+1);
        vct_nrem_index.template get<0>(vct_nrem_index.size()-1) = 0;
        vct_nrem_index.template hostToDevice<0>(vct_nrem_index.size()-1,vct_nrem_index.size()-1);

        // merge the list of removed points of each block
        vct_index_tmp4.resize(vct_nrem_index.size());

        openfpm::scan((Ti *)vct_nrem_index.template getDeviceBuffer<0>(), vct_nrem_index.size(), (Ti *)vct_index_tmp4.template getDeviceBuffer<0>(), context);

        vct_index_tmp4.template deviceToHost<0>(vct_index_tmp4.size()-1,vct_index_tmp4.size()-1);
        size_t n_ele = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);

        vct_add_index_cont_0.resize(n_ele);
        vct_add_index_cont_1.resize(n_ele);

        ite_gpu<1> itew;
        itew.wthr.x = vct_nrem_index.size()-1;
        itew.wthr.y = 1;
        itew.wthr.z = 1;
        itew.thr.x = 128;
        itew.thr.y = 1;
        itew.thr.z = 1;

        CUDA_LAUNCH(construct_remove_list,itew,vct_rem_index.toKernel(),
                vct_nrem_index.toKernel(),
                vct_index_tmp4.toKernel(),
                vct_add_index_cont_0.toKernel(),
                vct_add_index_cont_1.toKernel(),
                n_gpu_rem_block_slot);

        // sort the removed keys
        openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
                vct_add_index_cont_0.size(), gpu::template less_t<Ti>(), context);

        auto ite = vct_add_index_cont_0.getGPUIterator();

        mem.allocate(sizeof(int));
        mem.fill(0);
        vct_add_index_unique.resize(vct_add_index_cont_0.size()+1);

        ite = vct_add_index_cont_0.getGPUIterator();

        // produce the list of unique removed keys
        CUDA_LAUNCH((find_buffer_offsets_zero<0,
                decltype(vct_add_index_cont_0.toKernel()),
                decltype(vct_add_index_unique.toKernel())>),
                ite,
                vct_add_index_cont_0.toKernel(),(int *)mem.getDevicePointer(),vct_add_index_unique.toKernel());

        mem.deviceToHost();
        int n_ele_unique = *(int *)mem.getPointer();

        vct_add_index_unique.resize(n_ele_unique);

        openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),
                vct_add_index_unique.size(),gpu::template less_t<Ti>(),context);

        // enumerate the old indexes and tag the removed ones past the end
        vct_m_index.resize(vct_index.size() + vct_add_index_unique.size());

        ite = vct_m_index.getGPUIterator();
        CUDA_LAUNCH((set_indexes<0>),ite,vct_m_index.toKernel(),0);

        ite = vct_add_index_unique.getGPUIterator();
        CUDA_LAUNCH((set_indexes<1>),ite,vct_add_index_unique.toKernel(),vct_index.size());

        vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());
        vct_index_tmp2.resize(vct_index.size() + vct_add_index_unique.size());

        itew.wthr.x = vct_index_tmp.size() / 128 + (vct_index_tmp.size() % 128 != 0);
        itew.wthr.y = 1;
        itew.wthr.z = 1;
        itew.thr.x = 128;
        itew.thr.y = 1;
        itew.thr.z = 1;

        vct_index_dtmp.resize(itew.wthr.x);

        // merge vct_index with vct_add_index_unique into vct_index_tmp;
        // vct_index_tmp2 contains the merging index
        openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
                (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
                (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context);

        vct_index_tmp3.resize(128*itew.wthr.x);

        // resolve the conflicts, dropping the keys marked for removal
        CUDA_LAUNCH((solve_conflicts_remove<
                decltype(vct_index_tmp.toKernel()),
                decltype(vct_index_dtmp.toKernel()),128>),
                itew,
                vct_index_tmp.toKernel(),
                vct_index_tmp2.toKernel(),
                vct_index_tmp3.toKernel(),
                vct_m_index.toKernel(),
                vct_index_dtmp.toKernel(),
                vct_index.size());

        // scan the survivor counts
        openfpm::scan((Ti*)vct_index_dtmp.template getDeviceBuffer<0>(),vct_index_dtmp.size(),(Ti *)vct_index_dtmp.template getDeviceBuffer<1>(),context);

        // read back the size of the compacted array
        vct_index_dtmp.template deviceToHost<0,1>(vct_index_dtmp.size()-1,vct_index_dtmp.size()-1);
        int size = vct_index_dtmp.template get<1>(vct_index_dtmp.size()-1) + vct_index_dtmp.template get<0>(vct_index_dtmp.size()-1);

        vct_add_data_cont.resize(size);
        vct_index.resize(size);

        CUDA_LAUNCH(realign_remove,itew,vct_index_tmp3.toKernel(),vct_m_index.toKernel(),vct_data.toKernel(),
                vct_index.toKernel(),vct_add_data_cont.toKernel(),
                vct_index_dtmp.toKernel());

        vct_data.swap(vct_add_data_cont);
#else // __NVCC__
        std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl;
#endif // __NVCC__
    }
    /*! \brief Re-insert the background value at the end of the data vector
     */
    void resetBck()
    {
        vct_data.resize(vct_data.size()+1);
        vct_data.get(vct_data.size()-1) = bck;

        htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
        boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);
    }
    template<typename ... v_reduce>
    void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
            vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
            vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
            gpu::ofp_context_t & context,
            int i = 0)
    {
        flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);
    }
    template<typename ... v_reduce>
    void flush_on_cpu()
    {
        if (vct_add_index.size() == 0)
        {return;}

        // copy the added indexes aside, remembering the original positions
        reorder_add_index_cpu.resize(vct_add_index.size());
        vct_add_data_cont.resize(vct_add_index.size());

        for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
        {
            reorder_add_index_cpu.get(i).id = vct_add_index.template get<0>(i);
            reorder_add_index_cpu.get(i).id2 = i;
        }

        reorder_add_index_cpu.sort();

        // gather the data in sorted order
        for (size_t i = 0 ; i < reorder_add_index_cpu.size() ; i++)
        {
            vct_add_data_cont.get(i) = vct_add_data.get(reorder_add_index_cpu.get(i).id2);
        }

        typedef boost::mpl::vector<v_reduce...> vv_reduce;

        // reduce runs of duplicated keys
        sparse_vector_reduction_cpu<decltype(vct_add_data),
                decltype(vct_add_index_unique),
                decltype(reorder_add_index_cpu),
                vv_reduce>
            svr(vct_add_data_unique,
                vct_add_data_cont,
                vct_add_index_unique,
                reorder_add_index_cpu);

        boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);

        // merge the reduced insertions with the existing elements
        vector<T,Memory,layout_base,grow_p,impl> vct_data_tmp;
        vector<aggregate<Ti>,Memory,layout_base,grow_p> vct_index_tmp;

        vct_data_tmp.resize(vct_data.size() + vct_add_data_unique.size());
        vct_index_tmp.resize(vct_index.size() + vct_add_index_unique.size());

        size_t ai = 0;
        size_t di = 0;
        size_t i = 0;

        for ( ; i < vct_data_tmp.size() ; i++)
        {
            Ti id_a = (ai < vct_add_index_unique.size())?vct_add_index_unique.template get<0>(ai):std::numeric_limits<Ti>::max();
            Ti id_d = (di < vct_index.size())?vct_index.template get<0>(di):std::numeric_limits<Ti>::max();

            if (id_a == id_d)
            {
                // the key exists on both sides: assign the new value, then reduce the old one in
                vct_index_tmp.template get<0>(i) = id_a;

                auto dst = vct_data_tmp.get(i);
                auto src = vct_add_data_unique.get(ai);

                sparse_vector_reduction_solve_conflict_assign_cpu<decltype(vct_data_tmp.get(i)),
                        decltype(vct_add_data.get(ai)),
                        vv_reduce>
                    sva(src,dst);

                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(sva);
                ai++;

                dst = vct_data_tmp.get(i);
                src = vct_data.get(di);

                sparse_vector_reduction_solve_conflict_reduce_cpu<decltype(vct_data_tmp.get(i)),
                        decltype(vct_data.get(di)),
                        vv_reduce,
                        impl2>
                    svr(src,dst);
                boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
                di++;

                // one slot less in the merged output
                vct_data_tmp.resize(vct_data_tmp.size()-1);
                vct_index_tmp.resize(vct_index_tmp.size()-1);
            }
            else if (id_a < id_d)
            {
                vct_index_tmp.template get<0>(i) = vct_add_index_unique.template get<0>(ai);
                vct_data_tmp.get(i) = vct_add_data_unique.get(ai);
                ai++;
            }
            else
            {
                vct_index_tmp.template get<0>(i) = vct_index.template get<0>(di);
                vct_data_tmp.get(i) = vct_data.get(di);
                di++;
            }
        }

        vct_index.swap(vct_index_tmp);
        vct_data.swap(vct_data_tmp);

        vct_add_data.clear();
        vct_add_index.clear();
        vct_add_index_unique.clear();
        vct_add_data_unique.clear();
    }
    /*! \brief Get the sparse index of the element id
     */
    openfpm::sparse_index<Ti> get_sparse(Ti id) const
    {
        Ti di;
        this->_branchfree_search<false>(id,di);
        openfpm::sparse_index<Ti> sid;
        sid.id = di;

        return sid;
    }

    /*! \brief Get an element of the vector
     */
    template <unsigned int p>
    inline auto get(Ti id) const -> decltype(vct_data.template get<p>(id))
    {
        Ti di;
        this->_branchfree_search<false>(id,di);
        return vct_data.template get<p>(di);
    }

    /*! \brief Get an element of the vector
     */
    inline auto get(Ti id) const -> decltype(vct_data.get(id))
    {
        Ti di;
        this->_branchfree_search<false>(id,di);
        return vct_data.get(di);
    }
    /*! \brief Get the background value (returned by get when a key is absent)
     */
    template <unsigned int p>
    auto getBackground() const -> decltype(vct_data.template get<p>(vct_data.size()-1))
    {
        return vct_data.template get<p>(vct_data.size()-1);
    }

    auto getBackground() const -> decltype(vct_data.get(vct_data.size()-1))
    {
        return vct_data.get(vct_data.size()-1);
    }
    /*! \brief Set the background value for the property p
     */
    template<unsigned int p>
    void setBackground(const typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type & bck_)
    {
        meta_copy_d<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type,
            typename std::remove_reference<decltype(vct_data.template get<p>(vct_data.size()-1))>::type>
            ::meta_copy_d_(bck_,vct_data.template get<p>(vct_data.size()-1));

        vct_data.template hostToDevice<p>(vct_data.size()-1,vct_data.size()-1);

        meta_copy<typename boost::mpl::at<typename T::type, boost::mpl::int_<p>>::type>
            ::meta_copy_(bck_,bck.template get<p>());
    }
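/* Background semantics, as a sketch (vs is a hypothetical instance over
 * aggregate<double>): the background occupies the last slot of vct_data and
 * every miss of the branch-free search lands on that slot, so get() on an
 * absent key returns it:
 *
 * \code
 * vs.template setBackground<0>(-1.0);
 * double v = vs.template get<0>(123);  // -1.0 if key 123 was never inserted
 * \endcode
 */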
    /*! \brief It inserts an element in the sparse vector (staged until flush)
     */
    template <unsigned int p>
    auto insert(Ti ele) -> decltype(vct_data.template get<p>(0))
    {
        vct_add_index.add();
        vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
        vct_add_data.add();
        return vct_add_data.template get<p>(vct_add_data.size()-1);
    }
    /*! \brief It inserts an element in the sparse vector directly in the flushed arrays
     *
     * \param ele key
     * \param is_new set to true if the key had to be created
     */
    template <unsigned int p>
    auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.template get<p>(0))
    {
        is_new = false;
        Ti di;

        // search if the element already exists
        Ti v = _branchfree_search_nobck<true>(ele,di);

        if (v == ele)
        {return vct_data.template get<p>(di);}

        // it does not exist, create it at the insertion point di
        is_new = true;

        vct_index.insert(di);
        vct_data.insert(di);

        vct_index.template get<0>(di) = ele;

        return vct_data.template get<p>(di);
    }

    auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.get(0))
    {
        is_new = false;
        Ti di;

        // search if the element already exists
        Ti v = _branchfree_search_nobck<true>(ele,di);

        if (v == ele)
        {return vct_data.get(di);}

        // it does not exist, create it at the insertion point di
        is_new = true;

        vct_index.insert(di);
        vct_data.insert(di);

        vct_index.template get<0>(di) = ele;

        return vct_data.get(di);
    }
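/* insertFlush edits the flushed arrays in place, while insert (below) only
 * stages into the add buffers until the next flush. Sketch (hypothetical
 * instance vs over aggregate<double>):
 *
 * \code
 * bool is_new;
 * vs.template insertFlush<0>(42,is_new) = 9.0;  // visible immediately, no flush needed
 * // is_new reports whether key 42 had to be created
 * \endcode
 */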
    /*! \brief It inserts an element in the sparse vector (staged until flush)
     */
    auto insert(Ti ele) -> decltype(vct_data.get(0))
    {
        vct_add_index.add();
        vct_add_index.template get<0>(vct_add_index.size()-1) = ele;
        vct_add_data.add();
        return vct_add_data.get(vct_add_data.size()-1);
    }
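/* Typical host-side lifecycle, as a sketch: staged insertions become visible
 * only after flush, and duplicated keys are combined by the chosen reduction.
 * sadd_ is an OpenFPM reduction functor; the values are illustrative.
 *
 * \code
 * openfpm::vector_sparse<aggregate<double>> vs;
 * vs.template insert<0>(7) = 1.5;
 * vs.template insert<0>(7) = 2.0;                 // duplicate key
 * gpu::ofp_context_t ctx;
 * vs.flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_HOST);
 * double v = vs.template get<0>(7);               // 3.5 under sadd_
 * \endcode
 */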
    /*! \brief merge the added elements to the main data array, saving the insert buffer in v
     */
    template<typename ... v_reduce>
    void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
            gpu::ofp_context_t & context,
            flush_type opt = FLUSH_ON_HOST,
            int i = 0)
    {
        // eliminate the background
        vct_data.resize(vct_index.size());

        if (opt & flush_type::FLUSH_ON_DEVICE)
        {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context,i);}
        else
        {this->flush_on_cpu<v_reduce ... >();}

        resetBck();
    }
    /*! \brief merge the added elements to the main data array, saving the insert buffer in v
     */
    template<typename ... v_reduce>
    void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
            gpu::ofp_context_t & context,
            flush_type opt = FLUSH_ON_HOST)
    {
        // eliminate the background
        vct_data.resize(vct_index.size());

        if (opt & flush_type::FLUSH_ON_DEVICE)
        {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
        else
        {this->flush_on_cpu<v_reduce ... >();}

        resetBck();
    }
    /*! \brief merge the added elements to the main data array
     */
    template<typename ... v_reduce>
    void flush(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
    {
        // eliminate the background
        vct_data.resize(vct_index.size());

        if (opt & flush_type::FLUSH_ON_DEVICE)
        {this->flush_on_gpu<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);}
        else
        {this->flush_on_cpu<v_reduce ... >();}

        resetBck();
    }
    /*! \brief merge the removed elements out of the main data array
     */
    void flush_remove(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
    {
        // eliminate the background
        vct_data.resize(vct_data.size()-1);

        if (opt & flush_type::FLUSH_ON_DEVICE)
        {this->flush_on_gpu_remove(context);}
        else
        {
            std::cerr << __FILE__ << ":" << __LINE__ << " error, flush_remove on CPU has not been implemented yet" << std::endl;
        }
    }
    /*! \brief Return how many elements are in this map
     */
    size_t size()
    {
        return vct_index.size();
    }
    /*! \brief Transfer from device to host
     */
    template<unsigned int ... prp>
    void deviceToHost()
    {
        vct_index.template deviceToHost<0>();
        vct_data.template deviceToHost<prp...>();
    }

    /*! \brief Transfer from host to device
     */
    template<unsigned int ... prp>
    void hostToDevice()
    {
        vct_index.template hostToDevice<0>();
        vct_data.template hostToDevice<prp...>();
    }
    /*! \brief transform this structure into one usable on GPU
     */
    vector_sparse_gpu_ker<T,Ti,layout_base> toKernel()
    {
        vector_sparse_gpu_ker<T,Ti,layout_base> mvsck(vct_index.toKernel(),vct_data.toKernel(),
                vct_add_index.toKernel(),
                vct_rem_index.toKernel(),vct_add_data.toKernel(),
                vct_nadd_index.toKernel(),
                vct_nrem_index.toKernel(),
                n_gpu_add_block_slot,
                n_gpu_rem_block_slot);

        return mvsck;
    }
    /*! \brief set the gpu insert buffer for every block
     *
     * \param nblock number of blocks
     * \param nslot insert slots reserved for each block
     */
    void setGPUInsertBuffer(int nblock, int nslot)
    {
        vct_add_index.resize(nblock*nslot);
        vct_nadd_index.resize(nblock);
        vct_add_data.resize(nblock*nslot);
        n_gpu_add_block_slot = nslot;
        vct_nadd_index.template fill<0>(0);
    }
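/* Buffer layout sketch: the staging area holds nblock x nslot entries; block b
 * writes its k-th insertion at slot b*nslot + k and counts it in
 * vct_nadd_index[b]. make_continuos later scans those counters to compact the
 * used slots. Hypothetical sizes:
 *
 * \code
 * vs.setGPUInsertBuffer(4,2);  // 8 staging slots, 2 per block
 * // a kernel block b conceptually does:
 * //   pos = b*2 + (n_inserted_in_block_b++);
 * \endcode
 */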
    /*! \brief In case the added index buffer and the add data buffer have been filled
     *         manually, this function must be called before flush
     */
    void preFlush()
    {
#ifdef __NVCC__
        vct_nadd_index.resize(vct_add_index.size());

        if (vct_nadd_index.size() != 0)
        {
            auto ite = vct_nadd_index.getGPUIterator();
            CUDA_LAUNCH((set_one_insert_buffer),ite,vct_nadd_index.toKernel());
        }
        n_gpu_add_block_slot = 1;
#endif // __NVCC__
    }

    /*! \brief Get the GPU insert buffer
     */
    auto getGPUInsertBuffer() -> decltype(vct_add_data)&
    {
        return vct_add_data;
    }
    /*! \brief set the gpu remove buffer for every block
     */
    void setGPURemoveBuffer(int nblock, int nslot)
    {
        vct_rem_index.resize(nblock*nslot);
        vct_nrem_index.resize(nblock);
        n_gpu_rem_block_slot = nslot;
        vct_nrem_index.template fill<0>(0);
    }
    auto getGPUIterator() -> decltype(vct_index.getGPUIterator())
    {
        return vct_index.getGPUIterator();
    }
    /*! \brief Clear all the elements
     */
    void clear()
    {
        vct_data.clear();
        vct_index.clear();
        vct_add_index.clear();
        vct_add_data.clear();

        // re-add the background
        vct_data.resize(vct_data.size()+1);
        vct_data.get(vct_data.size()-1) = bck;

        htoD<decltype(vct_data)> trf(vct_data,vct_data.size()-1);
        boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(trf);

        n_gpu_add_block_slot = 0;
        n_gpu_rem_block_slot = 0;
    }
    void swap(vector_sparse<T,Ti,Memory,layout_base,grow_p,impl,impl2,block_functor> & sp)
    {
        vct_data.swap(sp.vct_data);
        vct_index.swap(sp.vct_index);
        vct_add_index.swap(sp.vct_add_index);
        vct_add_data.swap(sp.vct_add_data);

        size_t max_ele_ = sp.max_ele;
        sp.max_ele = max_ele;
        this->max_ele = max_ele_;
    }
    vector<T,Memory,layout_base,grow_p> & private_get_vct_add_data()
    {
        return vct_add_data;
    }

    vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index()
    {
        return vct_add_index;
    }

    const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_add_index() const
    {
        return vct_add_index;
    }

    vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index()
    {
        return vct_nadd_index;
    }

    const vector<aggregate<Ti>,Memory,layout_base,grow_p> & private_get_vct_nadd_index() const
    {
        return vct_nadd_index;
    }

    auto getSegmentToOutMap() -> decltype(blf.get_outputMap())
    {
        return blf.get_outputMap();
    }

    auto getSegmentToOutMap() const -> decltype(blf.get_outputMap())
    {
        return blf.get_outputMap();
    }
    /*! \brief Eliminate many internal temporary buffers. You can use this between flushes
     *         if you run out of memory
     */
    void removeUnusedBuffers()
    {
        vct_add_data.resize(0);
        vct_add_data.shrink_to_fit();

        vct_add_data_reord.resize(0);
        vct_add_data_reord.shrink_to_fit();

        vct_add_data_cont.resize(0);
        vct_add_data_cont.shrink_to_fit();

        vct_add_data_unique.resize(0);
        vct_add_data_unique.shrink_to_fit();
    }
    /*! \brief Return the segment-to-merge-index mapping vector
     */
    vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap()
    {
        return vct_add_index_unique;
    }

    vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & getSegmentToMergeIndexMap() const
    {
        return vct_add_index_unique;
    }

    /*! \brief Return the mapping vector
     */
    vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMappingVector()
    {
        return vct_add_index_cont_1;
    }

    /*! \brief Return the merge mapping vector
     */
    vector<aggregate<Ti>,Memory,layout_base,grow_p> & getMergeIndexMapVector()
    {
        return vct_index_tmp2;
    }
};
template<typename T,
         unsigned int blockSwitch = VECTOR_SPARSE_STANDARD,
         typename block_functor = stub_block_functor,
         typename indexT = int>
using vector_sparse_gpu = openfpm::vector_sparse<
        T,
        indexT,
        CudaMemory,
        memory_traits_inte,
        openfpm::grow_policy_double,
        vect_isel<T>::value,
        blockSwitch,
        block_functor>;

template<typename T,
         typename block_functor = stub_block_functor,
         typename indexT = long int>
using vector_sparse_gpu_block = openfpm::vector_sparse<
        T,
        indexT,
        CudaMemory,
        memory_traits_inte,
        openfpm::grow_policy_double,
        vect_isel<T>::value,
        VECTOR_SPARSE_BLOCK,
        block_functor>;