Rheolef  7.1
an efficient C++ finite element environment
geo_mpi_dual.cc
Go to the documentation of this file.
1 #include "rheolef/config.h"
22 #ifdef _RHEOLEF_HAVE_MPI
23 // mesh2dual
24 #include "geo_partition_scotch.h"
25 #include <algorithm> // fill, copy
26 
27 namespace rheolef {
28 using namespace std;
29 
30 static void ikeysort(int total_elems, KeyValueType *pbase);
31 
32 // -------------------------------------------------------------------------
33 // This function converts a mesh into a dual graph
34 // -------------------------------------------------------------------------
35 void geo_dual (
36  my_idxtype *elmdist,
37  my_idxtype *eptr,
38  vector<my_idxtype>& eind,
39  int *ncommonnodes,
40  vector<my_idxtype>& xadj,
41  vector<my_idxtype>& adjncy,
42  const mpi::communicator& comm)
43 {
44  int i, j, jj, k, kk, m;
45  int pe, count, mask, pass;
46  int lnns, my_nns, node;
47  int firstelm, firstnode, lnode, nrecv, nsend;
48  my_idxtype maxcount;
49 
50  int npes = comm.size();
51  int mype = comm.rank();
52  srand (mype);
53 
54  int nelms = elmdist[mype+1]-elmdist[mype];
55  check_macro(nelms >= 0, "unexpected nelms <= 0");
56 
57  mask = (1<<11)-1;
58 
59  // ----------------------------
60  // 1) determine number of nodes
61  // ----------------------------
62  int gminnode = mpi::all_reduce (comm, eind[idxamin(eptr[nelms], eind)], mpi::minimum<int>());
63  for (i=0; i<eptr[nelms]; i++)
64  eind[i] -= gminnode;
65 
66  int gmaxnode = mpi::all_reduce (comm, eind[idxamax(eptr[nelms], eind)], mpi::maximum<int>());
67  // ----------------------------------------
68  // 2) construct node distribution array
69  // ----------------------------------------
70  vector<my_idxtype> nodedist (npes+1, 0);
71  for (nodedist[0]=0, i=0, j=gmaxnode+1; i<npes; i++) {
72  k = j/(npes-i);
73  nodedist[i+1] = nodedist[i]+k;
74  j -= k;
75  }
76  my_nns = nodedist[mype+1]-nodedist[mype];
77  firstnode = nodedist[mype];
78 
79  vector<KeyValueType> nodelist (eptr[nelms]);
80  vector<my_idxtype> auxarray (eptr[nelms]);
81  vector<my_idxtype> htable (amax(my_nns, mask+1), -1);
82  vector<int> scounts (4*npes+2);
83  int *rcounts = scounts.begin().operator->() + npes;
84  int *sdispl = scounts.begin().operator->() + 2*npes;
85  int *rdispl = scounts.begin().operator->() + 3*npes+1;
86  // -----------------------------------------------
87  // 3) first find a local numbering of the nodes
88  // -----------------------------------------------
89  for (i=0; i<nelms; i++) {
90  for (j=eptr[i]; j<eptr[i+1]; j++) {
91  nodelist[j].key = eind[j];
92  nodelist[j].val = j;
93  auxarray[j] = i; /* remember the local element ID that uses this node */
94  }
95  }
96  ikeysort(eptr[nelms], nodelist.begin().operator->());
97 
98  for (count=1, i=1; i<eptr[nelms]; i++) {
99  if (nodelist[i].key > nodelist[i-1].key)
100  count++;
101  }
102 
103  lnns = count;
104  vector<my_idxtype> nmap (lnns);
105  // -----------------------------------------------
106  // 4) renumber the nodes of the elements array
107  // -----------------------------------------------
108  count = 1;
109  nmap[0] = nodelist[0].key;
110  eind[nodelist[0].val] = 0;
111  nodelist[0].val = auxarray[nodelist[0].val]; /* Store the local element ID */
112  for (i=1; i<eptr[nelms]; i++) {
113  if (nodelist[i].key > nodelist[i-1].key) {
114  nmap[count] = nodelist[i].key;
115  count++;
116  }
117  eind[nodelist[i].val] = count-1;
118  nodelist[i].val = auxarray[nodelist[i].val]; /* Store the local element ID */
119  }
120  comm.barrier();
121  // --------------------------------------------------------
122  // 5) perform comms necessary to construct node-element list
123  // --------------------------------------------------------
124  fill (scounts.begin(), scounts.begin() + npes, 0);
125  for (pe=i=0; i<eptr[nelms]; i++) {
126  while (nodelist[i].key >= nodedist[pe+1])
127  pe++;
128  scounts[pe] += 2;
129  }
130  check_macro(pe < npes, "unexpected pe");
131 
132  mpi::all_to_all (comm, scounts.begin().operator->(), rcounts);
133 
134  icopy(npes, scounts.begin().operator->(), sdispl);
135  init_csr_ptr(npes, sdispl);
136 
137  icopy(npes, rcounts, rdispl);
138  init_csr_ptr(npes, rdispl);
139 
140  check_macro(sdispl[npes] == eptr[nelms]*2, "unexpected sdispl[]");
141 
142  nrecv = rdispl[npes]/2;
143  vector<KeyValueType> recvbuffer (amax(1, nrecv));
144 
145  // no boost::mpi equivalent:
146  MPI_Alltoallv(nodelist.begin().operator->(), scounts.begin().operator->(), sdispl, IDX_DATATYPE, recvbuffer.begin().operator->(), rcounts, rdispl, IDX_DATATYPE, comm);
147  // --------------------------------------------------------
148  // 6) construct global node-element list
149  // --------------------------------------------------------
150  vector<my_idxtype> gnptr (my_nns+1, 0);
151  for (i=0; i<npes; i++) {
152  for (j=rdispl[i]/2; j<rdispl[i+1]/2; j++) {
153  lnode = recvbuffer[j].key-firstnode;
154  check_macro(lnode >= 0 && lnode < my_nns, "unexpected lnode range")
155 
156  gnptr[lnode]++;
157  }
158  }
159  init_csr_ptr (my_nns, gnptr.begin());
160 
161  vector<my_idxtype> gnind (amax(1, gnptr[my_nns]));
162  for (pe=0; pe<npes; pe++) {
163  firstelm = elmdist[pe];
164  for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
165  lnode = recvbuffer[j].key-firstnode;
166  gnind[gnptr[lnode]++] = recvbuffer[j].val+firstelm;
167  }
168  }
169  SHIFTCSR(i, my_nns, gnptr);
170  // --------------------------------------------------------
171  // 7) send the node-element info to the relevant processors
172  // --------------------------------------------------------
173  fill (scounts.begin(), scounts.begin() + npes, 0);
174 
175  /* use a hash table to ensure that each node is sent to a proc only once */
176  for (pe=0; pe<npes; pe++) {
177  for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
178  lnode = recvbuffer[j].key-firstnode;
179  if (htable[lnode] == -1) {
180  scounts[pe] += gnptr[lnode+1]-gnptr[lnode];
181  htable[lnode] = 1;
182  }
183  }
184 
185  /* now reset the hash table */
186  for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
187  lnode = recvbuffer[j].key-firstnode;
188  htable[lnode] = -1;
189  }
190  }
191  mpi::all_to_all (comm, scounts.begin().operator->(), rcounts);
192  icopy(npes, scounts.begin().operator->(), sdispl);
193  init_csr_ptr(npes, sdispl);
194 
195  // --------------------------------------------------------
196  // 8) create the send buffer
197  // --------------------------------------------------------
198  nsend = sdispl[npes];
199  nodelist.clear();
200  vector<my_idxtype> sbuffer (amax(1, nsend));
201  count = 0;
202  for (pe=0; pe<npes; pe++) {
203  for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
204  lnode = recvbuffer[j].key-firstnode;
205  if (htable[lnode] == -1) {
206  for (k=gnptr[lnode]; k<gnptr[lnode+1]; k++) {
207  if (k == gnptr[lnode])
208  sbuffer[count++] = -1*(gnind[k]+1);
209  else
210  sbuffer[count++] = gnind[k];
211  }
212  htable[lnode] = 1;
213  }
214  }
215  check_macro(count == sdispl[pe+1], "unexpected count");
216 
217  /* now reset the hash table */
218  for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
219  lnode = recvbuffer[j].key-firstnode;
220  htable[lnode] = -1;
221  }
222  }
223 
224  icopy(npes, rcounts, rdispl);
225  init_csr_ptr(npes, rdispl);
226 
227  nrecv = rdispl[npes];
228  recvbuffer.clear();
229  vector<my_idxtype> rbuffer (amax(1, nrecv));
230 
231  // no boost::mpi equivalent:
232  MPI_Alltoallv(sbuffer.begin().operator->(), scounts.begin().operator->(), sdispl, IDX_DATATYPE, rbuffer.begin().operator->(), rcounts, rdispl, IDX_DATATYPE, comm);
233 
234  k = -1;
235  vector<my_idxtype> nptr (lnns+1, 0);
236  my_idxtype *nind = rbuffer.begin().operator->(); // QUOI ??? TODO: comprendre .... re-utilisation de la mem ?
237  for (pe=0; pe<npes; pe++) {
238  for (j=rdispl[pe]; j<rdispl[pe+1]; j++) {
239  if (nind[j] < 0) {
240  k++;
241  nind[j] = (-1*nind[j])-1;
242  }
243  nptr[k]++;
244  }
245  }
246  init_csr_ptr(lnns, nptr.begin());
247 
248  check_macro(k+1 == lnns, "unexpected k+1");
249  check_macro(nptr[lnns] == nrecv, "unexpected nptr[]")
250 
251  xadj.resize(nelms+1);
252  fill (xadj.begin(), xadj.end(), 0);
253  my_idxtype *myxadj = xadj.begin().operator->(); // TODO: suppress ptrs !
254  fill (htable.begin(), htable.begin() + mask+1, -1);
255  firstelm = elmdist[mype];
256 
257  // ---------------------------------------------------------------------------
258  // 9) Two passes -- in first pass, simply find out the memory requirements
259  // ---------------------------------------------------------------------------
260  maxcount = 200;
261  vector<my_idxtype> ind (maxcount);
262  vector<my_idxtype> wgt (maxcount);
263  my_idxtype *myadjncy = NULL;
264 
265  for (pass=0; pass<2; pass++) {
266  for (i=0; i<nelms; i++) {
267  for (count=0, j=eptr[i]; j<eptr[i+1]; j++) {
268  node = eind[j];
269 
270  for (k=nptr[node]; k<nptr[node+1]; k++) {
271  if ((kk=nind[k]) == firstelm+i) continue;
272  m = htable[(kk&mask)];
273  if (m == -1) {
274  ind[count] = kk;
275  wgt[count] = 1;
276  htable[(kk&mask)] = count++;
277  } else {
278  if (ind[m] == kk) {
279  wgt[m]++;
280  } else {
281  for (jj=0; jj<count; jj++) {
282  if (ind[jj] == kk) {
283  wgt[jj]++;
284  break;
285  }
286  }
287  if (jj == count) {
288  ind[count] = kk;
289  wgt[count++] = 1;
290  }
291  }
292  }
293  // Adjust the memory.
294  if (count == maxcount-1) {
295  ind.resize (2*maxcount);
296  wgt.resize (2*maxcount);
297  maxcount *= 2;
298  }
299  }
300  }
301  for (j=0; j<count; j++) {
302  htable[(ind[j]&mask)] = -1;
303  if (wgt[j] >= *ncommonnodes) {
304  if (pass == 0)
305  myxadj[i]++;
306  else
307  myadjncy[myxadj[i]++] = ind[j];
308  }
309  }
310  }
311 
312  if (pass == 0) {
313  init_csr_ptr(nelms, myxadj);
314  adjncy.resize (myxadj[nelms]);
315  myadjncy = adjncy.begin().operator->(); // TODO: avoid pointers !
316  }
317  else {
318  SHIFTCSR(i, nelms, myxadj);
319  }
320  }
321  // ---------------------------------------------------------------------------
322  // 10) correctly renumber the elements array */
323  // ---------------------------------------------------------------------------
324  for (i=0; i<eptr[nelms]; i++)
325  eind[i] = nmap[eind[i]] + gminnode;
326 }
327 // -------------------------------------------------------------------------
328 // quick sort algorithm
329 // -------------------------------------------------------------------------
330 
331 /* Discontinue quicksort algorithm when partition gets below this size.
332  This particular magic number was chosen to work best on a Sun 4/260. */
333 #define MAX_THRESH 20
334 
335 /* Byte-wise swap two items of size SIZE. */
336 #define QSSWAP(a, b, stmp) do { stmp = (a); (a) = (b); (b) = stmp; } while (0)
337 
338 /* Stack node declarations used to store unfulfilled partition obligations. */
339 typedef struct {
340  KeyValueType *lo;
341  KeyValueType *hi;
342 } stack_node;
343 
344 
345 /* The next 4 #defines implement a very fast in-line stack abstraction. */
346 #define STACK_SIZE (8 * sizeof(unsigned long int))
347 #define PUSH(low, high) ((void) ((top->lo = (low)), (top->hi = (high)), ++top))
348 #define POP(low, high) ((void) (--top, (low = top->lo), (high = top->hi)))
349 #define STACK_NOT_EMPTY (stack < top)
350 
351 
352 static
353 void
354 ikeysort(int total_elems, KeyValueType *pbase) {
355  KeyValueType pivot, stmp;
356  if (total_elems == 0)
357  /* Avoid lossage with unsigned arithmetic below. */
358  return;
359 
360  if (total_elems > MAX_THRESH) {
361  KeyValueType *lo = pbase;
362  KeyValueType *hi = &lo[total_elems - 1];
363  stack_node stack[STACK_SIZE]; /* Largest size needed for 32-bit int!!! */
364  stack_node *top = stack + 1;
365 
366  while (STACK_NOT_EMPTY) {
367  KeyValueType *left_ptr;
368  KeyValueType *right_ptr;
369  KeyValueType *mid = lo + ((hi - lo) >> 1);
370 
371  if (mid->key < lo->key)
372  QSSWAP(*mid, *lo, stmp);
373  if (hi->key < mid->key)
374  QSSWAP(*mid, *hi, stmp);
375  else
376  goto jump_over;
377  if (mid->key < lo->key)
378  QSSWAP(*mid, *lo, stmp);
379 
380 jump_over:;
381  pivot = *mid;
382  left_ptr = lo + 1;
383  right_ptr = hi - 1;
384 
385  /* Here's the famous ``collapse the walls'' section of quicksort.
386  Gotta like those tight inner loops! They are the main reason
387  that this algorithm runs much faster than others. */
388  do {
389  while (left_ptr->key < pivot.key)
390  left_ptr++;
391 
392  while (pivot.key < right_ptr->key)
393  right_ptr--;
394 
395  if (left_ptr < right_ptr) {
396  QSSWAP (*left_ptr, *right_ptr, stmp);
397  left_ptr++;
398  right_ptr--;
399  }
400  else if (left_ptr == right_ptr) {
401  left_ptr++;
402  right_ptr--;
403  break;
404  }
405  } while (left_ptr <= right_ptr);
406 
407  /* Set up pointers for next iteration. First determine whether
408  left and right partitions are below the threshold size. If so,
409  ignore one or both. Otherwise, push the larger partition's
410  bounds on the stack and continue sorting the smaller one. */
411 
412  if ((size_t) (right_ptr - lo) <= MAX_THRESH) {
413  if ((size_t) (hi - left_ptr) <= MAX_THRESH)
414  /* Ignore both small partitions. */
415  POP (lo, hi);
416  else
417  /* Ignore small left partition. */
418  lo = left_ptr;
419  }
420  else if ((size_t) (hi - left_ptr) <= MAX_THRESH)
421  /* Ignore small right partition. */
422  hi = right_ptr;
423  else if ((right_ptr - lo) > (hi - left_ptr)) {
424  /* Push larger left partition indices. */
425  PUSH (lo, right_ptr);
426  lo = left_ptr;
427  }
428  else {
429  /* Push larger right partition indices. */
430  PUSH (left_ptr, hi);
431  hi = right_ptr;
432  }
433  }
434  }
435  /* Once the BASE_PTR array is partially sorted by quicksort the rest
436  is completely sorted using insertion sort, since this is efficient
437  for partitions below MAX_THRESH size. BASE_PTR points to the beginning
438  of the array to sort, and END_PTR points at the very last element in
439  the array (*not* one beyond it!). */
440  {
441  KeyValueType *end_ptr = &pbase[total_elems - 1];
442  KeyValueType *tmp_ptr = pbase;
443  KeyValueType *thresh = (end_ptr < pbase + MAX_THRESH ? end_ptr : pbase + MAX_THRESH);
444  KeyValueType *run_ptr;
445 
446  /* Find smallest element in first threshold and place it at the
447  array's beginning. This is the smallest array element,
448  and the operation speeds up insertion sort's inner loop. */
449 
450  for (run_ptr = tmp_ptr + 1; run_ptr <= thresh; run_ptr++)
451  if (run_ptr->key < tmp_ptr->key)
452  tmp_ptr = run_ptr;
453 
454  if (tmp_ptr != pbase)
455  QSSWAP(*tmp_ptr, *pbase, stmp);
456 
457  /* Insertion sort, running from left-hand-side up to right-hand-side. */
458  run_ptr = pbase + 1;
459  while (++run_ptr <= end_ptr) {
460  tmp_ptr = run_ptr - 1;
461  while (run_ptr->key < tmp_ptr->key)
462  tmp_ptr--;
463 
464  tmp_ptr++;
465  if (tmp_ptr != run_ptr) {
466  KeyValueType elmnt = *run_ptr;
467  KeyValueType *mptr;
468 
469  for (mptr=run_ptr; mptr>tmp_ptr; mptr--)
470  *mptr = *(mptr-1);
471  *mptr = elmnt;
472  }
473  }
474  }
475 }
476 
477 } // namespace rheolef
478 #endif // _RHEOLEF_HAVE_MPI
rheolef::idxamin
int idxamin(int n, const std::vector< my_idxtype > &x)
Definition: geo_partition_scotch.h:150
rheolef::geo_dual
void geo_dual(my_idxtype *elmdist, my_idxtype *eptr, vector< my_idxtype > &eind, int *ncommonnodes, vector< my_idxtype > &xadj, vector< my_idxtype > &adjncy, const mpi::communicator &comm)
Definition: geo_mpi_dual.cc:35
rheolef::KeyValueType
struct KeyValueType KeyValueType
Definition: geo_partition_scotch.h:122
STACK_SIZE
#define STACK_SIZE
Definition: geo_mpi_dual.cc:346
POP
#define POP(low, high)
Definition: geo_mpi_dual.cc:348
rheolef::my_idxtype
int my_idxtype
Definition: geo_partition_scotch.h:28
PUSH
#define PUSH(low, high)
Definition: geo_mpi_dual.cc:347
QSSWAP
#define QSSWAP(a, b, stmp)
Definition: geo_mpi_dual.cc:336
IDX_DATATYPE
#define IDX_DATATYPE
Definition: geo_partition_scotch.h:29
mkgeo_sector.m
m
Definition: mkgeo_sector.sh:118
amax
#define amax(a, b)
Definition: geo_partition_scotch.h:124
rheolef
This file is part of Rheolef.
Definition: compiler_eigen.h:37
STACK_NOT_EMPTY
#define STACK_NOT_EMPTY
Definition: geo_mpi_dual.cc:349
rheolef::idxamax
int idxamax(int n, const std::vector< my_idxtype > &x)
Definition: geo_partition_scotch.h:160
MAX_THRESH
#define MAX_THRESH
Definition: geo_mpi_dual.cc:333
geo_partition_scotch.h
icopy
#define icopy(n, a, b)
Definition: geo_partition_scotch.h:125
SHIFTCSR
#define SHIFTCSR(i, n, a)
Definition: geo_partition_scotch.h:127