OpenFPM_pdata  4.1.0
Project that contains the implementation of distributed structures
 
main.cu
#ifdef __NVCC__

#define OPENMPI

#define SCAN_WITH_CUB // <------ MODERNGPU is broken on RTX; use the CUB library for scan
//#define EXTERNAL_SET_GPU <----- In case you want to distribute the GPUs differently from the default

#include "Vector/vector_dist.hpp"

template<typename vector_type>
__global__ void translate_fill_prop(vector_type vd)
{
    auto p = GET_PARTICLE(vd);

    vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1];

    vd.template getProp<1>(p)[0] = vd.getPos(p)[0];
    vd.template getProp<1>(p)[1] = vd.getPos(p)[1];

    vd.template getProp<2>(p)[0][0] = vd.getPos(p)[0];
    vd.template getProp<2>(p)[0][1] = vd.getPos(p)[1];
    vd.template getProp<2>(p)[1][0] = vd.getPos(p)[0] + vd.getPos(p)[1];
    vd.template getProp<2>(p)[1][1] = vd.getPos(p)[1] - vd.getPos(p)[0];

    vd.getPos(p)[0] += 0.01f;
    vd.getPos(p)[1] += 0.01f;
}

int main(int argc, char* argv[])
{
    // OpenFPM GPU distribution

    // By default OpenFPM selects GPU 0 for process 0, GPU 1 for process 1, and so on. The multi-node case works the same way:
    // each node hosts a group of processes, and that group is distributed across the GPUs available on that node.

    // If you want to override this behaviour, use #define EXTERNAL_SET_GPU at the very beginning of the program and call
    // cudaSetDevice to select the GPU for that particular process before openfpm_init.
    // Note: to get the process number, call MPI_Init and then MPI_Comm_rank. VCluster is not available before openfpm_init.
    // A code snippet in case we want to skip GPU 0:
    // MPI_Init(&argc,&argv);
    // int rank;
    // MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    // cudaSetDevice(1+rank);

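    // A minimal sketch of that override, mirroring the snippet in the comment above; it is only
    // compiled when EXTERNAL_SET_GPU is defined at the top of the file.
#ifdef EXTERNAL_SET_GPU
    int rank;
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    cudaSetDevice(1+rank); // skip GPU 0: process 0 -> GPU 1, process 1 -> GPU 2, ...
#endif
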
    // initialize the library
    openfpm_init(&argc,&argv);

    // Here we define our domain: a 2D box spanning 0.0 to 1.0 in x and y
    Box<2,float> domain({0.0,0.0},{1.0,1.0});

    // Here we define the boundary conditions of our problem
    size_t bc[2]={PERIODIC,PERIODIC};

    // extended boundary around the domain, and the processor domain
    Ghost<2,float> g(0.05);

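    // The declaration of the particle set is not shown in this listing; a plausible form, assumed
    // here to match the properties used below (a float scalar, a float[2] vector and a float[2][2]
    // tensor), is a GPU-enabled distributed vector. The particle count of 100 is a placeholder.
    vector_dist_gpu<2,float,aggregate<float,float[2],float[2][2]>> vd(100,domain,bc,g);
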
    // the scalar is the element at position 0 in the aggregate
    const int scalar = 0;

    // the vector is the element at position 1 in the aggregate
    const int vector = 1;

    // the tensor is the element at position 2 in the aggregate
    const int tensor = 2;

    auto it = vd.getDomainIterator();

    while (it.isNext())
    {
        auto key = it.get();

        // we define x, assigning a random position between 0.0 and 1.0
        vd.getPos(key)[0] = (float)rand() / RAND_MAX;

        // we define y, assigning a random position between 0.0 and 1.0
        vd.getPos(key)[1] = (float)rand() / RAND_MAX;

        // next particle
        ++it;
    }

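    // Redistribute the particles: each particle is moved to the processor that owns its position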
    vd.map();

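    // Copy the particle positions and the scalar, vector and tensor properties from host to device memory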
    vd.hostToDevicePos();
    vd.template hostToDeviceProp<scalar,vector,tensor>();

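    // Build a GPU launch configuration covering the domain particles and launch the kernel on it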
    auto ite = vd.getDomainIteratorGPU();
    // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
    CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

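    // Copy positions and properties back to the host so they can be written to a file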
    vd.deviceToHostPos();
    vd.deviceToHostProp<0,1,2>();

    // We write to a file
    vd.write("output");

    for (int j = 0 ; j < 100 ; j++)
    {
        auto ite = vd.getDomainIteratorGPU();
        // translate_fill_prop<<<ite.wthr,ite.thr>>>(vd.toKernel());
        CUDA_LAUNCH(translate_fill_prop,ite,vd.toKernel());

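        // With RUN_ON_DEVICE the particle redistribution and the ghost-layer exchange below operate
        // directly on the device buffers, so no host copy is needed inside the time loop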
        vd.map(RUN_ON_DEVICE);
        vd.template ghost_get<0,1,2>(RUN_ON_DEVICE);

        if (j % 10 == 0)
        {
            // offload to host
            vd.deviceToHostPos();
            vd.template deviceToHostProp<0,1,2>();

            // write
            vd.write_frame("output_f",j);
        }
    }

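    // Check whether CUDA-aware MPI (RDMA on GPU buffers) is available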
    bool active = is_mpi_rdma_cuda_active();

    std::cout << "Is MPI rdma active on CUDA " << active << std::endl;

    openfpm_finalize();
}

#else

int main(int argc, char* argv[])
{
    return 0;
}

#endif
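
This is an MPI program: after compiling it with a CUDA-capable toolchain (the example directories shipped with OpenFPM typically provide a Makefile for this), it can be started on several processes with, for instance, mpirun -np 2 ./main (the binary name here is illustrative); one GPU is then assigned to each process as described in the comments above.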
API referenced in this example:

Box: This class represents an N-dimensional box. (Definition: Box.hpp:61)
vect_dist_key_dx get(): Get the actual key.
vector_dist: Distributed vector.
bool write_frame(std::string out, size_t iteration, int opt=VTK_WRITER): Output particle position and properties.
void deviceToHostPos(): Move the position memory from the device to the host.
auto getPos(vect_dist_key_dx vec_key) -> decltype(v_pos.template get<0>(vec_key.getKey())): Get the position of an element.
vector_dist_iterator getDomainIterator() const: Get an iterator that traverses the particles in the domain.
void hostToDevicePos(): Move the position memory from the host to the device.
void map(size_t opt=NONE): Move all the particles that do not belong to the local processor to the respective processor.
bool write(std::string out, int opt=VTK_WRITER): Output particle position and properties.
void deviceToHostProp(): Move the property memory from the device to the host.