LLVM OpenMP* Runtime Library
kmp_affinity.h
1/*
2 * kmp_affinity.h -- header for affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef KMP_AFFINITY_H
14#define KMP_AFFINITY_H
15
16#include "kmp.h"
17#include "kmp_os.h"
18#include <limits>
19
20#if KMP_AFFINITY_SUPPORTED
21#if KMP_USE_HWLOC
22class KMPHwlocAffinity : public KMPAffinity {
23public:
24 class Mask : public KMPAffinity::Mask {
25 hwloc_cpuset_t mask;
26
27 public:
28 Mask() {
29 mask = hwloc_bitmap_alloc();
30 this->zero();
31 }
32 ~Mask() { hwloc_bitmap_free(mask); }
33 void set(int i) override { hwloc_bitmap_set(mask, i); }
34 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35 void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36 void zero() override { hwloc_bitmap_zero(mask); }
37 bool empty() const override { return hwloc_bitmap_iszero(mask); }
38 void copy(const KMPAffinity::Mask *src) override {
39 const Mask *convert = static_cast<const Mask *>(src);
40 hwloc_bitmap_copy(mask, convert->mask);
41 }
42 void bitwise_and(const KMPAffinity::Mask *rhs) override {
43 const Mask *convert = static_cast<const Mask *>(rhs);
44 hwloc_bitmap_and(mask, mask, convert->mask);
45 }
46 void bitwise_or(const KMPAffinity::Mask *rhs) override {
47 const Mask *convert = static_cast<const Mask *>(rhs);
48 hwloc_bitmap_or(mask, mask, convert->mask);
49 }
50 void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51 bool is_equal(const KMPAffinity::Mask *rhs) const override {
52 const Mask *convert = static_cast<const Mask *>(rhs);
53 return hwloc_bitmap_isequal(mask, convert->mask);
54 }
55 int begin() const override { return hwloc_bitmap_first(mask); }
56 int end() const override { return -1; }
57 int next(int previous) const override {
58 return hwloc_bitmap_next(mask, previous);
59 }
60 int get_system_affinity(bool abort_on_error) override {
61 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62 "Illegal get affinity operation when not capable");
63 long retval =
64 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65 if (retval >= 0) {
66 return 0;
67 }
68 int error = errno;
69 if (abort_on_error) {
70 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71 KMP_ERR(error), __kmp_msg_null);
72 }
73 return error;
74 }
75 int set_system_affinity(bool abort_on_error) const override {
76 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77 "Illegal set affinity operation when not capable");
78 long retval =
79 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80 if (retval >= 0) {
81 return 0;
82 }
83 int error = errno;
84 if (abort_on_error) {
85 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86 KMP_ERR(error), __kmp_msg_null);
87 }
88 return error;
89 }
90#if KMP_OS_WINDOWS
91 int set_process_affinity(bool abort_on_error) const override {
92 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93 "Illegal set process affinity operation when not capable");
94 int error = 0;
95 const hwloc_topology_support *support =
96 hwloc_topology_get_support(__kmp_hwloc_topology);
97 if (support->cpubind->set_proc_cpubind) {
98 int retval;
99 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100 HWLOC_CPUBIND_PROCESS);
101 if (retval >= 0)
102 return 0;
103 error = errno;
104 if (abort_on_error)
105 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106 KMP_ERR(error), __kmp_msg_null);
107 }
108 return error;
109 }
110#endif
111 int get_proc_group() const override {
112 int group = -1;
113#if KMP_OS_WINDOWS
114 if (__kmp_num_proc_groups == 1) {
115 return 1;
116 }
117 for (int i = 0; i < __kmp_num_proc_groups; i++) {
118 // On Windows, the long type is always 32 bits
119 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120 unsigned long second_32_bits =
121 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122 if (first_32_bits == 0 && second_32_bits == 0) {
123 continue;
124 }
125 if (group >= 0) {
126 return -1;
127 }
128 group = i;
129 }
130#endif /* KMP_OS_WINDOWS */
131 return group;
132 }
133 };
134 void determine_capable(const char *var) override {
135 const hwloc_topology_support *topology_support;
136 if (__kmp_hwloc_topology == NULL) {
137 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138 __kmp_hwloc_error = TRUE;
139 if (__kmp_affinity.flags.verbose) {
140 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141 }
142 }
143 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144 __kmp_hwloc_error = TRUE;
145 if (__kmp_affinity.flags.verbose) {
146 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147 }
148 }
149 }
150 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151 // Is the system capable of setting/getting this thread's affinity?
152 // Also, is topology discovery possible? (pu indicates ability to discover
153 // processing units). And finally, were there no errors when calling any
154 // hwloc_* API functions?
155 if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156 topology_support->cpubind->get_thisthread_cpubind &&
157 topology_support->discovery->pu && !__kmp_hwloc_error) {
158 // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159 KMP_AFFINITY_ENABLE(TRUE);
160 } else {
161 // indicate that hwloc didn't work and disable affinity
162 __kmp_hwloc_error = TRUE;
163 KMP_AFFINITY_DISABLE();
164 }
165 }
166 void bind_thread(int which) override {
167 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168 "Illegal set affinity operation when not capable");
169 KMPAffinity::Mask *mask;
170 KMP_CPU_ALLOC_ON_STACK(mask);
171 KMP_CPU_ZERO(mask);
172 KMP_CPU_SET(which, mask);
173 __kmp_set_system_affinity(mask, TRUE);
174 KMP_CPU_FREE_FROM_STACK(mask);
175 }
176 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178 KMPAffinity::Mask *allocate_mask_array(int num) override {
179 return new Mask[num];
180 }
181 void deallocate_mask_array(KMPAffinity::Mask *array) override {
182 Mask *hwloc_array = static_cast<Mask *>(array);
183 delete[] hwloc_array;
184 }
185 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186 int index) override {
187 Mask *hwloc_array = static_cast<Mask *>(array);
188 return &(hwloc_array[index]);
189 }
190 api_type get_api_type() const override { return HWLOC; }
191};
192#endif /* KMP_USE_HWLOC */
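
For context, here is a minimal standalone hwloc sketch (an editorial illustration, not part of this header; it assumes hwloc is installed and uses only documented hwloc calls) showing the operations the Mask wrapper above builds on: allocate a bitmap, set one PU index, and bind the calling thread.

// Illustrative only -- roughly what KMPHwlocAffinity::Mask does under the hood.
#include <hwloc.h>
#include <cstdio>

int main() {
  hwloc_topology_t topo;
  if (hwloc_topology_init(&topo) < 0)
    return 1;
  if (hwloc_topology_load(topo) < 0)
    return 1;
  hwloc_cpuset_t set = hwloc_bitmap_alloc(); // as in Mask::Mask()
  hwloc_bitmap_zero(set);                    // as in Mask::zero()
  hwloc_bitmap_set(set, 0);                  // as in Mask::set(0)
  // Bind the calling thread, as Mask::set_system_affinity() does.
  if (hwloc_set_cpubind(topo, set, HWLOC_CPUBIND_THREAD) < 0)
    std::perror("hwloc_set_cpubind");
  hwloc_bitmap_free(set);
  hwloc_topology_destroy(topo);
  return 0;
}
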
193
194#if KMP_OS_LINUX || KMP_OS_FREEBSD
195#if KMP_OS_LINUX
196/* On some of the older OSes that we build on, these constants aren't present
197 in <asm/unistd.h>, #included from <sys/syscall.h>. They must be the same on
198 all systems of the same arch where they are defined, and they cannot change;
199 they are set in stone forever. */
200#include <sys/syscall.h>
201#if KMP_ARCH_X86 || KMP_ARCH_ARM
202#ifndef __NR_sched_setaffinity
203#define __NR_sched_setaffinity 241
204#elif __NR_sched_setaffinity != 241
205#error Wrong code for setaffinity system call.
206#endif /* __NR_sched_setaffinity */
207#ifndef __NR_sched_getaffinity
208#define __NR_sched_getaffinity 242
209#elif __NR_sched_getaffinity != 242
210#error Wrong code for getaffinity system call.
211#endif /* __NR_sched_getaffinity */
212#elif KMP_ARCH_AARCH64
213#ifndef __NR_sched_setaffinity
214#define __NR_sched_setaffinity 122
215#elif __NR_sched_setaffinity != 122
216#error Wrong code for setaffinity system call.
217#endif /* __NR_sched_setaffinity */
218#ifndef __NR_sched_getaffinity
219#define __NR_sched_getaffinity 123
220#elif __NR_sched_getaffinity != 123
221#error Wrong code for getaffinity system call.
222#endif /* __NR_sched_getaffinity */
223#elif KMP_ARCH_RISCV64
224#ifndef __NR_sched_setaffinity
225#define __NR_sched_setaffinity 122
226#elif __NR_sched_setaffinity != 122
227#error Wrong code for setaffinity system call.
228#endif /* __NR_sched_setaffinity */
229#ifndef __NR_sched_getaffinity
230#define __NR_sched_getaffinity 123
231#elif __NR_sched_getaffinity != 123
232#error Wrong code for getaffinity system call.
233#endif /* __NR_sched_getaffinity */
234#elif KMP_ARCH_X86_64
235#ifndef __NR_sched_setaffinity
236#define __NR_sched_setaffinity 203
237#elif __NR_sched_setaffinity != 203
238#error Wrong code for setaffinity system call.
239#endif /* __NR_sched_setaffinity */
240#ifndef __NR_sched_getaffinity
241#define __NR_sched_getaffinity 204
242#elif __NR_sched_getaffinity != 204
243#error Wrong code for getaffinity system call.
244#endif /* __NR_sched_getaffinity */
245#elif KMP_ARCH_PPC64
246#ifndef __NR_sched_setaffinity
247#define __NR_sched_setaffinity 222
248#elif __NR_sched_setaffinity != 222
249#error Wrong code for setaffinity system call.
250#endif /* __NR_sched_setaffinity */
251#ifndef __NR_sched_getaffinity
252#define __NR_sched_getaffinity 223
253#elif __NR_sched_getaffinity != 223
254#error Wrong code for getaffinity system call.
255#endif /* __NR_sched_getaffinity */
256#elif KMP_ARCH_MIPS
257#ifndef __NR_sched_setaffinity
258#define __NR_sched_setaffinity 4239
259#elif __NR_sched_setaffinity != 4239
260#error Wrong code for setaffinity system call.
261#endif /* __NR_sched_setaffinity */
262#ifndef __NR_sched_getaffinity
263#define __NR_sched_getaffinity 4240
264#elif __NR_sched_getaffinity != 4240
265#error Wrong code for getaffinity system call.
266#endif /* __NR_sched_getaffinity */
267#elif KMP_ARCH_MIPS64
268#ifndef __NR_sched_setaffinity
269#define __NR_sched_setaffinity 5195
270#elif __NR_sched_setaffinity != 5195
271#error Wrong code for setaffinity system call.
272#endif /* __NR_sched_setaffinity */
273#ifndef __NR_sched_getaffinity
274#define __NR_sched_getaffinity 5196
275#elif __NR_sched_getaffinity != 5196
276#error Wrong code for getaffinity system call.
277#endif /* __NR_sched_getaffinity */
278#elif KMP_ARCH_LOONGARCH64
279#ifndef __NR_sched_setaffinity
280#define __NR_sched_setaffinity 122
281#elif __NR_sched_setaffinity != 122
282#error Wrong code for setaffinity system call.
283#endif /* __NR_sched_setaffinity */
284#ifndef __NR_sched_getaffinity
285#define __NR_sched_getaffinity 123
286#elif __NR_sched_getaffinity != 123
287#error Wrong code for getaffinity system call.
288#endif /* __NR_sched_getaffinity */
300#elif KMP_ARCH_VE
301#ifndef __NR_sched_setaffinity
302#define __NR_sched_setaffinity 203
303#elif __NR_sched_setaffinity != 203
304#error Wrong code for setaffinity system call.
305#endif /* __NR_sched_setaffinity */
306#ifndef __NR_sched_getaffinity
307#define __NR_sched_getaffinity 204
308#elif __NR_sched_getaffinity != 204
309#error Wrong code for getaffinity system call.
310#endif /* __NR_sched_getaffinity */
311#else
312#error Unknown or unsupported architecture
313#endif /* KMP_ARCH_* */
314#elif KMP_OS_FREEBSD
315#include <pthread.h>
316#include <pthread_np.h>
317#endif
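
As a rough illustration of how the syscall numbers above are used (an editorial sketch, Linux only, assuming glibc's syscall() wrapper and a mask buffer large enough for the kernel's cpumask; this is not code from the runtime), the calling thread's affinity mask can be read directly:

// Illustrative only -- the runtime's Mask::get_system_affinity() below does
// essentially this, using __kmp_affin_mask_size bytes of storage.
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <sys/syscall.h>
#include <unistd.h>

int main() {
  unsigned long mask[16] = {0}; // 1024 bits; assumed large enough here
  long ret = syscall(__NR_sched_getaffinity, 0 /* calling thread */,
                     sizeof(mask), mask);
  if (ret < 0) {
    std::fprintf(stderr, "sched_getaffinity: %s\n", std::strerror(errno));
    return 1;
  }
  int cpus = 0;
  for (unsigned long word : mask)
    cpus += __builtin_popcountl(word); // GCC/Clang builtin
  std::printf("this thread may run on %d CPUs\n", cpus);
  return 0;
}
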
318class KMPNativeAffinity : public KMPAffinity {
319 class Mask : public KMPAffinity::Mask {
320 typedef unsigned long mask_t;
321 typedef decltype(__kmp_affin_mask_size) mask_size_type;
322 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
323 static const mask_t ONE = 1;
324 mask_size_type get_num_mask_types() const {
325 return __kmp_affin_mask_size / sizeof(mask_t);
326 }
327
328 public:
329 mask_t *mask;
330 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
331 ~Mask() {
332 if (mask)
333 __kmp_free(mask);
334 }
335 void set(int i) override {
336 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
337 }
338 bool is_set(int i) const override {
339 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
340 }
341 void clear(int i) override {
342 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
343 }
344 void zero() override {
345 mask_size_type e = get_num_mask_types();
346 for (mask_size_type i = 0; i < e; ++i)
347 mask[i] = (mask_t)0;
348 }
349 bool empty() const override {
350 mask_size_type e = get_num_mask_types();
351 for (mask_size_type i = 0; i < e; ++i)
352 if (mask[i] != (mask_t)0)
353 return false;
354 return true;
355 }
356 void copy(const KMPAffinity::Mask *src) override {
357 const Mask *convert = static_cast<const Mask *>(src);
358 mask_size_type e = get_num_mask_types();
359 for (mask_size_type i = 0; i < e; ++i)
360 mask[i] = convert->mask[i];
361 }
362 void bitwise_and(const KMPAffinity::Mask *rhs) override {
363 const Mask *convert = static_cast<const Mask *>(rhs);
364 mask_size_type e = get_num_mask_types();
365 for (mask_size_type i = 0; i < e; ++i)
366 mask[i] &= convert->mask[i];
367 }
368 void bitwise_or(const KMPAffinity::Mask *rhs) override {
369 const Mask *convert = static_cast<const Mask *>(rhs);
370 mask_size_type e = get_num_mask_types();
371 for (mask_size_type i = 0; i < e; ++i)
372 mask[i] |= convert->mask[i];
373 }
374 void bitwise_not() override {
375 mask_size_type e = get_num_mask_types();
376 for (mask_size_type i = 0; i < e; ++i)
377 mask[i] = ~(mask[i]);
378 }
379 bool is_equal(const KMPAffinity::Mask *rhs) const override {
380 const Mask *convert = static_cast<const Mask *>(rhs);
381 mask_size_type e = get_num_mask_types();
382 for (mask_size_type i = 0; i < e; ++i)
383 if (mask[i] != convert->mask[i])
384 return false;
385 return true;
386 }
387 int begin() const override {
388 int retval = 0;
389 while (retval < end() && !is_set(retval))
390 ++retval;
391 return retval;
392 }
393 int end() const override {
394 int e;
395 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
396 return e;
397 }
398 int next(int previous) const override {
399 int retval = previous + 1;
400 while (retval < end() && !is_set(retval))
401 ++retval;
402 return retval;
403 }
404 int get_system_affinity(bool abort_on_error) override {
405 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
406 "Illegal get affinity operation when not capable");
407#if KMP_OS_LINUX
408 long retval =
409 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
410#elif KMP_OS_FREEBSD
411 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
412 reinterpret_cast<cpuset_t *>(mask));
413 int retval = (r == 0 ? 0 : -1);
414#endif
415 if (retval >= 0) {
416 return 0;
417 }
418 int error = errno;
419 if (abort_on_error) {
420 __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
421 KMP_ERR(error), __kmp_msg_null);
422 }
423 return error;
424 }
425 int set_system_affinity(bool abort_on_error) const override {
426 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
427 "Illegal set affinity operation when not capable");
428#if KMP_OS_LINUX
429 long retval =
430 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
431#elif KMP_OS_FREEBSD
432 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
433 reinterpret_cast<cpuset_t *>(mask));
434 int retval = (r == 0 ? 0 : -1);
435#endif
436 if (retval >= 0) {
437 return 0;
438 }
439 int error = errno;
440 if (abort_on_error) {
441 __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
442 KMP_ERR(error), __kmp_msg_null);
443 }
444 return error;
445 }
446 };
447 void determine_capable(const char *env_var) override {
448 __kmp_affinity_determine_capable(env_var);
449 }
450 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
451 KMPAffinity::Mask *allocate_mask() override {
452 KMPNativeAffinity::Mask *retval = new Mask();
453 return retval;
454 }
455 void deallocate_mask(KMPAffinity::Mask *m) override {
456 KMPNativeAffinity::Mask *native_mask =
457 static_cast<KMPNativeAffinity::Mask *>(m);
458 delete native_mask;
459 }
460 KMPAffinity::Mask *allocate_mask_array(int num) override {
461 return new Mask[num];
462 }
463 void deallocate_mask_array(KMPAffinity::Mask *array) override {
464 Mask *linux_array = static_cast<Mask *>(array);
465 delete[] linux_array;
466 }
467 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
468 int index) override {
469 Mask *linux_array = static_cast<Mask *>(array);
470 return &(linux_array[index]);
471 }
472 api_type get_api_type() const override { return NATIVE_OS; }
473};
474#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
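
The Linux/FreeBSD Mask above stores the affinity set as an array of unsigned long words, with bit i living in word i / BITS_PER_MASK_T at position i % BITS_PER_MASK_T. A self-contained sketch of that indexing scheme (illustrative only; a fixed-size buffer stands in for __kmp_affin_mask_size):

#include <climits>
#include <cstdio>

typedef unsigned long mask_t;
static const unsigned BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;

// Same word/bit arithmetic as Mask::set() and Mask::is_set().
static void set_bit(mask_t *mask, int i) {
  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
}
static bool test_bit(const mask_t *mask, int i) {
  return (mask[i / BITS_PER_MASK_T] >> (i % BITS_PER_MASK_T)) & 1;
}

int main() {
  mask_t mask[4] = {0}; // room for 4 * BITS_PER_MASK_T logical CPUs
  set_bit(mask, 3);
  set_bit(mask, 65); // lands in word 1 when mask_t is 64 bits wide
  std::printf("bit 3: %d, bit 65: %d, bit 7: %d\n", test_bit(mask, 3),
              test_bit(mask, 65), test_bit(mask, 7));
  return 0;
}
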
475
476#if KMP_OS_WINDOWS
477class KMPNativeAffinity : public KMPAffinity {
478 class Mask : public KMPAffinity::Mask {
479 typedef ULONG_PTR mask_t;
480 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
481 mask_t *mask;
482
483 public:
484 Mask() {
485 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
486 }
487 ~Mask() {
488 if (mask)
489 __kmp_free(mask);
490 }
491 void set(int i) override {
492 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
493 }
494 bool is_set(int i) const override {
495 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
496 }
497 void clear(int i) override {
498 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
499 }
500 void zero() override {
501 for (int i = 0; i < __kmp_num_proc_groups; ++i)
502 mask[i] = 0;
503 }
504 bool empty() const override {
505 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
506 if (mask[i])
507 return false;
508 return true;
509 }
510 void copy(const KMPAffinity::Mask *src) override {
511 const Mask *convert = static_cast<const Mask *>(src);
512 for (int i = 0; i < __kmp_num_proc_groups; ++i)
513 mask[i] = convert->mask[i];
514 }
515 void bitwise_and(const KMPAffinity::Mask *rhs) override {
516 const Mask *convert = static_cast<const Mask *>(rhs);
517 for (int i = 0; i < __kmp_num_proc_groups; ++i)
518 mask[i] &= convert->mask[i];
519 }
520 void bitwise_or(const KMPAffinity::Mask *rhs) override {
521 const Mask *convert = static_cast<const Mask *>(rhs);
522 for (int i = 0; i < __kmp_num_proc_groups; ++i)
523 mask[i] |= convert->mask[i];
524 }
525 void bitwise_not() override {
526 for (int i = 0; i < __kmp_num_proc_groups; ++i)
527 mask[i] = ~(mask[i]);
528 }
529 bool is_equal(const KMPAffinity::Mask *rhs) const override {
530 const Mask *convert = static_cast<const Mask *>(rhs);
531 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
532 if (mask[i] != convert->mask[i])
533 return false;
534 return true;
535 }
536 int begin() const override {
537 int retval = 0;
538 while (retval < end() && !is_set(retval))
539 ++retval;
540 return retval;
541 }
542 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
543 int next(int previous) const override {
544 int retval = previous + 1;
545 while (retval < end() && !is_set(retval))
546 ++retval;
547 return retval;
548 }
549 int set_process_affinity(bool abort_on_error) const override {
550 if (__kmp_num_proc_groups <= 1) {
551 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
552 DWORD error = GetLastError();
553 if (abort_on_error) {
554 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
555 __kmp_msg_null);
556 }
557 return error;
558 }
559 }
560 return 0;
561 }
562 int set_system_affinity(bool abort_on_error) const override {
563 if (__kmp_num_proc_groups > 1) {
564 // Check for a valid mask.
565 GROUP_AFFINITY ga;
566 int group = get_proc_group();
567 if (group < 0) {
568 if (abort_on_error) {
569 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
570 }
571 return -1;
572 }
573 // Transform the bit vector into a GROUP_AFFINITY struct
574 // and make the system call to set affinity.
575 ga.Group = group;
576 ga.Mask = mask[group];
577 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
578
579 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
580 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
581 DWORD error = GetLastError();
582 if (abort_on_error) {
583 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
584 __kmp_msg_null);
585 }
586 return error;
587 }
588 } else {
589 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
590 DWORD error = GetLastError();
591 if (abort_on_error) {
592 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
593 __kmp_msg_null);
594 }
595 return error;
596 }
597 }
598 return 0;
599 }
600 int get_system_affinity(bool abort_on_error) override {
601 if (__kmp_num_proc_groups > 1) {
602 this->zero();
603 GROUP_AFFINITY ga;
604 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
605 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
606 DWORD error = GetLastError();
607 if (abort_on_error) {
608 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
609 KMP_ERR(error), __kmp_msg_null);
610 }
611 return error;
612 }
613 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
614 (ga.Mask == 0)) {
615 return -1;
616 }
617 mask[ga.Group] = ga.Mask;
618 } else {
619 mask_t newMask, sysMask, retval;
620 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
621 DWORD error = GetLastError();
622 if (abort_on_error) {
623 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
624 KMP_ERR(error), __kmp_msg_null);
625 }
626 return error;
627 }
628 retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
629 if (!retval) {
630 DWORD error = GetLastError();
631 if (abort_on_error) {
632 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
633 KMP_ERR(error), __kmp_msg_null);
634 }
635 return error;
636 }
637 newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
638 if (!newMask) {
639 DWORD error = GetLastError();
640 if (abort_on_error) {
641 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
642 KMP_ERR(error), __kmp_msg_null);
643 }
644 }
645 *mask = retval;
646 }
647 return 0;
648 }
649 int get_proc_group() const override {
650 int group = -1;
651 if (__kmp_num_proc_groups == 1) {
652 return 1;
653 }
654 for (int i = 0; i < __kmp_num_proc_groups; i++) {
655 if (mask[i] == 0)
656 continue;
657 if (group >= 0)
658 return -1;
659 group = i;
660 }
661 return group;
662 }
663 };
664 void determine_capable(const char *env_var) override {
665 __kmp_affinity_determine_capable(env_var);
666 }
667 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
668 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
669 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
670 KMPAffinity::Mask *allocate_mask_array(int num) override {
671 return new Mask[num];
672 }
673 void deallocate_mask_array(KMPAffinity::Mask *array) override {
674 Mask *windows_array = static_cast<Mask *>(array);
675 delete[] windows_array;
676 }
677 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
678 int index) override {
679 Mask *windows_array = static_cast<Mask *>(array);
680 return &(windows_array[index]);
681 }
682 api_type get_api_type() const override { return NATIVE_OS; }
683};
684#endif /* KMP_OS_WINDOWS */
685#endif /* KMP_AFFINITY_SUPPORTED */
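
On Windows with more than one processor group, get_proc_group() above reports the single group the mask touches, or -1 when the bits span several groups (such a mask cannot be expressed as one GROUP_AFFINITY). A portable sketch of that rule (illustrative only; plain 64-bit words stand in for ULONG_PTR):

#include <cstdint>
#include <cstdio>

// Return the index of the only group with any bit set, or -1 if no group or
// more than one group is populated -- the same rule as get_proc_group().
static int single_group(const uint64_t *mask, int num_groups) {
  int group = -1;
  for (int i = 0; i < num_groups; ++i) {
    if (mask[i] == 0)
      continue;
    if (group >= 0)
      return -1; // bits in a second group: not representable
    group = i;
  }
  return group;
}

int main() {
  uint64_t one_group[2] = {0xF0u, 0};
  uint64_t two_groups[2] = {0xF0u, 0x1u};
  std::printf("%d %d\n", single_group(one_group, 2),
              single_group(two_groups, 2)); // prints "0 -1"
  return 0;
}
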
686
687// Describe an attribute for a level in the machine topology
688struct kmp_hw_attr_t {
689 int core_type : 8;
690 int core_eff : 8;
691 unsigned valid : 1;
692 unsigned reserved : 15;
693
694 static const int UNKNOWN_CORE_EFF = -1;
695
696 kmp_hw_attr_t()
697 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
698 valid(0), reserved(0) {}
699 void set_core_type(kmp_hw_core_type_t type) {
700 valid = 1;
701 core_type = type;
702 }
703 void set_core_eff(int eff) {
704 valid = 1;
705 core_eff = eff;
706 }
707 kmp_hw_core_type_t get_core_type() const {
708 return (kmp_hw_core_type_t)core_type;
709 }
710 int get_core_eff() const { return core_eff; }
711 bool is_core_type_valid() const {
712 return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
713 }
714 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
715 operator bool() const { return valid; }
716 void clear() {
717 core_type = KMP_HW_CORE_TYPE_UNKNOWN;
718 core_eff = UNKNOWN_CORE_EFF;
719 valid = 0;
720 }
721 bool contains(const kmp_hw_attr_t &other) const {
722 if (!valid && !other.valid)
723 return true;
724 if (valid && other.valid) {
725 if (other.is_core_type_valid()) {
726 if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
727 return false;
728 }
729 if (other.is_core_eff_valid()) {
730 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
731 return false;
732 }
733 return true;
734 }
735 return false;
736 }
737#if KMP_AFFINITY_SUPPORTED
738 bool contains(const kmp_affinity_attrs_t &attr) const {
739 if (!valid && !attr.valid)
740 return true;
741 if (valid && attr.valid) {
742 if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
743 return (is_core_type_valid() &&
744 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
745 if (attr.core_eff != UNKNOWN_CORE_EFF)
746 return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
747 return true;
748 }
749 return false;
750 }
751#endif // KMP_AFFINITY_SUPPORTED
752 bool operator==(const kmp_hw_attr_t &rhs) const {
753 return (rhs.valid == valid && rhs.core_eff == core_eff &&
754 rhs.core_type == core_type);
755 }
756 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
757};
758
759#if KMP_AFFINITY_SUPPORTED
760KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
761#endif
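
The contains() logic above treats an attribute set A as containing B when every field B actually specifies (core type, core efficiency) is specified with the same value in A; two fully-unspecified attributes also contain each other. A standalone analog of that rule (hypothetical stand-in types, not the runtime's):

#include <cstdio>

struct attr_t { // hypothetical stand-in for kmp_hw_attr_t
  int core_type; // -1 means "not specified"
  int core_eff;  // -1 means "not specified"
  bool valid() const { return core_type != -1 || core_eff != -1; }
  bool contains(const attr_t &o) const {
    if (!valid() && !o.valid())
      return true; // both empty: trivially contained
    if (!valid() || !o.valid())
      return false;
    if (o.core_type != -1 && core_type != o.core_type)
      return false; // o constrains core type and we don't match it
    if (o.core_eff != -1 && core_eff != o.core_eff)
      return false; // o constrains core efficiency and we don't match it
    return true;
  }
};

int main() {
  attr_t a = {/*core_type=*/1, /*core_eff=*/0};
  attr_t b = {/*core_type=*/1, /*core_eff=*/-1}; // only constrains core type
  std::printf("a contains b: %d, b contains a: %d\n", a.contains(b),
              b.contains(a)); // 1 and 0
  return 0;
}
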
762
763class kmp_hw_thread_t {
764public:
765 static const int UNKNOWN_ID = -1;
766 static const int MULTIPLE_ID = -2;
767 static int compare_ids(const void *a, const void *b);
768 static int compare_compact(const void *a, const void *b);
769 int ids[KMP_HW_LAST];
770 int sub_ids[KMP_HW_LAST];
771 bool leader;
772 int os_id;
773 kmp_hw_attr_t attrs;
774
775 void print() const;
776 void clear() {
777 for (int i = 0; i < (int)KMP_HW_LAST; ++i)
778 ids[i] = UNKNOWN_ID;
779 leader = false;
780 attrs.clear();
781 }
782};
783
784class kmp_topology_t {
785
786 struct flags_t {
787 int uniform : 1;
788 int reserved : 31;
789 };
790
791 int depth;
792
793 // The following arrays are all 'depth' long and have been
794 // allocated to hold up to KMP_HW_LAST number of objects if
795 // needed so layers can be added without reallocation of any array
796
797 // Ordered array of the types in the topology
798 kmp_hw_t *types;
799
800 // Keep quick topology ratios; for non-uniform topologies,
801 // each ratio entry holds the max number of itemAs per itemB,
802 // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
803 int *ratio;
804
805 // Storage containing the absolute number of each topology layer
806 int *count;
807
808 // The number of core efficiencies. This is only useful for hybrid
809 // topologies. Core efficiencies will range from 0 to num efficiencies - 1
810 int num_core_efficiencies;
811 int num_core_types;
812 kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
813
814 // The hardware threads array
815 // hw_threads is num_hw_threads long
816 // Each hw_thread's ids and sub_ids are depth deep
817 int num_hw_threads;
818 kmp_hw_thread_t *hw_threads;
819
820 // Equivalence hash where the key is the hardware topology item
821 // and the value is the equivalent hardware topology type in the
822 // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
823 // known equivalence for the topology type
824 kmp_hw_t equivalent[KMP_HW_LAST];
825
826 // Flags describing the topology
827 flags_t flags;
828
829 // Compact value used during sort_compact()
830 int compact;
831
832 // Insert a new topology layer after allocation
833 void _insert_layer(kmp_hw_t type, const int *ids);
834
835#if KMP_GROUP_AFFINITY
836 // Insert topology information about Windows Processor groups
837 void _insert_windows_proc_groups();
838#endif
839
840 // Count each item & get the num x's per y
841 // e.g., get the number of cores and the number of threads per core
842 // for each (x, y) in (KMP_HW_* , KMP_HW_*)
843 void _gather_enumeration_information();
844
845 // Remove layers that don't add information to the topology.
846 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
847 void _remove_radix1_layers();
848
849 // Find out if the topology is uniform
850 void _discover_uniformity();
851
852 // Set all the sub_ids for each hardware thread
853 void _set_sub_ids();
854
855 // Set global affinity variables describing the number of threads per
856 // core, the number of packages, the number of cores per package, and
857 // the number of cores.
858 void _set_globals();
859
860 // Set the last level cache equivalent type
861 void _set_last_level_cache();
862
863 // Return the number of cores with a particular attribute, 'attr'.
864 // If 'find_all' is true, then find all cores on the machine, otherwise find
865 // all cores per the layer 'above'
866 int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
867 bool find_all = false) const;
868
869public:
870 // Force use of allocate()/deallocate()
871 kmp_topology_t() = delete;
872 kmp_topology_t(const kmp_topology_t &t) = delete;
873 kmp_topology_t(kmp_topology_t &&t) = delete;
874 kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
875 kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
876
877 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
878 static void deallocate(kmp_topology_t *);
879
880 // Functions used in create_map() routines
881 kmp_hw_thread_t &at(int index) {
882 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
883 return hw_threads[index];
884 }
885 const kmp_hw_thread_t &at(int index) const {
886 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
887 return hw_threads[index];
888 }
889 int get_num_hw_threads() const { return num_hw_threads; }
890 void sort_ids() {
891 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
892 kmp_hw_thread_t::compare_ids);
893 }
894 // Check if the hardware ids are unique; if they are,
895 // return true, otherwise return false
896 bool check_ids() const;
897
898 // Function to call after the create_map() routine
899 void canonicalize();
900 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
901
902// Functions used after canonicalize() is called
903
904#if KMP_AFFINITY_SUPPORTED
905 // Set the granularity for affinity settings
906 void set_granularity(kmp_affinity_t &stgs) const;
907 bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
908 bool restrict_to_mask(const kmp_affin_mask_t *mask);
909 bool filter_hw_subset();
910#endif
911 bool is_uniform() const { return flags.uniform; }
912 // Return the topology's equivalent type for 'type', or
913 // KMP_HW_UNKNOWN when there is no equivalent type
914 kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
915 if (type == KMP_HW_UNKNOWN)
916 return KMP_HW_UNKNOWN;
917 return equivalent[type];
918 }
919 // Set type1 = type2
920 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
921 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
922 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
923 kmp_hw_t real_type2 = equivalent[type2];
924 if (real_type2 == KMP_HW_UNKNOWN)
925 real_type2 = type2;
926 equivalent[type1] = real_type2;
927 // This loop is required since any of the types may have been set to
928 // be equivalent to type1. They all must be checked and reset to type2.
929 KMP_FOREACH_HW_TYPE(type) {
930 if (equivalent[type] == type1) {
931 equivalent[type] = real_type2;
932 }
933 }
934 }
935 // Calculate number of types corresponding to level1
936 // per types corresponding to level2 (e.g., number of threads per core)
937 int calculate_ratio(int level1, int level2) const {
938 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
939 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
940 int r = 1;
941 for (int level = level1; level > level2; --level)
942 r *= ratio[level];
943 return r;
944 }
945 int get_ratio(int level) const {
946 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
947 return ratio[level];
948 }
949 int get_depth() const { return depth; };
950 kmp_hw_t get_type(int level) const {
951 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
952 return types[level];
953 }
954 int get_level(kmp_hw_t type) const {
955 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
956 int eq_type = equivalent[type];
957 if (eq_type == KMP_HW_UNKNOWN)
958 return -1;
959 for (int i = 0; i < depth; ++i)
960 if (types[i] == eq_type)
961 return i;
962 return -1;
963 }
964 int get_count(int level) const {
965 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
966 return count[level];
967 }
968 // Return the total number of cores with attribute 'attr'
969 int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
970 return _get_ncores_with_attr(attr, -1, true);
971 }
972 // Return the number of cores with attribute
973 // 'attr' per topology level 'above'
974 int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
975 return _get_ncores_with_attr(attr, above, false);
976 }
977
978#if KMP_AFFINITY_SUPPORTED
979 friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
980 void sort_compact(kmp_affinity_t &affinity) {
981 compact = affinity.compact;
982 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
983 kmp_hw_thread_t::compare_compact);
984 }
985#endif
986 void print(const char *env_var = "KMP_AFFINITY") const;
987 void dump() const;
988};
989extern kmp_topology_t *__kmp_topology;
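
calculate_ratio() above multiplies ratio[level] for every level from level1 down to, but not including, level2; with ratio = {4 packages, 6 cores/package, 2 threads/core} the number of threads per package is 6 * 2 = 12. A small sketch of that arithmetic (illustrative only):

#include <cstdio>

// Mirror of the loop in kmp_topology_t::calculate_ratio(): multiply ratio[]
// from level1 down to (but not including) level2.
static int calculate_ratio(const int *ratio, int level1, int level2) {
  int r = 1;
  for (int level = level1; level > level2; --level)
    r *= ratio[level];
  return r;
}

int main() {
  // Level 0 = package (4 of them), 1 = core (6 per package),
  // 2 = thread (2 per core)
  int ratio[3] = {4, 6, 2};
  std::printf("threads per package: %d\n", calculate_ratio(ratio, 2, 0)); // 12
  std::printf("threads per core:    %d\n", calculate_ratio(ratio, 2, 1)); // 2
  return 0;
}
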
990
991class kmp_hw_subset_t {
992 const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
993
994public:
995 // Describe a machine topology item in KMP_HW_SUBSET
996 struct item_t {
997 kmp_hw_t type;
998 int num_attrs;
999 int num[MAX_ATTRS];
1000 int offset[MAX_ATTRS];
1001 kmp_hw_attr_t attr[MAX_ATTRS];
1002 };
1003 // Put parentheses around max to avoid accidental use of the Windows max macro.
1004 const static int USE_ALL = (std::numeric_limits<int>::max)();
1005
1006private:
1007 int depth;
1008 int capacity;
1009 item_t *items;
1010 kmp_uint64 set;
1011 bool absolute;
1012 // The set must be able to handle up to KMP_HW_LAST number of layers
1013 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1014 // Sorting the KMP_HW_SUBSET items to follow topology order
1015 // All unknown topology types will be at the beginning of the subset
1016 static int hw_subset_compare(const void *i1, const void *i2) {
1017 kmp_hw_t type1 = ((const item_t *)i1)->type;
1018 kmp_hw_t type2 = ((const item_t *)i2)->type;
1019 int level1 = __kmp_topology->get_level(type1);
1020 int level2 = __kmp_topology->get_level(type2);
1021 return level1 - level2;
1022 }
1023
1024public:
1025 // Force use of allocate()/deallocate()
1026 kmp_hw_subset_t() = delete;
1027 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1028 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1029 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1030 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1031
1032 static kmp_hw_subset_t *allocate() {
1033 int initial_capacity = 5;
1034 kmp_hw_subset_t *retval =
1035 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1036 retval->depth = 0;
1037 retval->capacity = initial_capacity;
1038 retval->set = 0ull;
1039 retval->absolute = false;
1040 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1041 return retval;
1042 }
1043 static void deallocate(kmp_hw_subset_t *subset) {
1044 __kmp_free(subset->items);
1045 __kmp_free(subset);
1046 }
1047 void set_absolute() { absolute = true; }
1048 bool is_absolute() const { return absolute; }
1049 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1050 for (int i = 0; i < depth; ++i) {
1051 // Found an existing item for this layer type
1052 // Add the num, offset, and attr to this item
1053 if (items[i].type == type) {
1054 int idx = items[i].num_attrs++;
1055 if ((size_t)idx >= MAX_ATTRS)
1056 return;
1057 items[i].num[idx] = num;
1058 items[i].offset[idx] = offset;
1059 items[i].attr[idx] = attr;
1060 return;
1061 }
1062 }
1063 if (depth == capacity - 1) {
1064 capacity *= 2;
1065 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1066 for (int i = 0; i < depth; ++i)
1067 new_items[i] = items[i];
1068 __kmp_free(items);
1069 items = new_items;
1070 }
1071 items[depth].num_attrs = 1;
1072 items[depth].type = type;
1073 items[depth].num[0] = num;
1074 items[depth].offset[0] = offset;
1075 items[depth].attr[0] = attr;
1076 depth++;
1077 set |= (1ull << type);
1078 }
1079 int get_depth() const { return depth; }
1080 const item_t &at(int index) const {
1081 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1082 return items[index];
1083 }
1084 item_t &at(int index) {
1085 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1086 return items[index];
1087 }
1088 void remove(int index) {
1089 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1090 set &= ~(1ull << items[index].type);
1091 for (int j = index + 1; j < depth; ++j) {
1092 items[j - 1] = items[j];
1093 }
1094 depth--;
1095 }
1096 void sort() {
1097 KMP_DEBUG_ASSERT(__kmp_topology);
1098 qsort(items, depth, sizeof(item_t), hw_subset_compare);
1099 }
1100 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1101 void dump() const {
1102 printf("**********************\n");
1103 printf("*** kmp_hw_subset: ***\n");
1104 printf("* depth: %d\n", depth);
1105 printf("* items:\n");
1106 for (int i = 0; i < depth; ++i) {
1107 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1108 for (int j = 0; j < items[i].num_attrs; ++j) {
1109 printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1110 items[i].offset[j]);
1111 if (!items[i].attr[j]) {
1112 printf(" (none)\n");
1113 } else {
1114 printf(
1115 " core_type = %s, core_eff = %d\n",
1116 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1117 items[i].attr[j].get_core_eff());
1118 }
1119 }
1120 }
1121 printf("* set: 0x%llx\n", set);
1122 printf("* absolute: %d\n", absolute);
1123 printf("**********************\n");
1124 }
1125};
1126extern kmp_hw_subset_t *__kmp_hw_subset;
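
kmp_hw_subset_t records which topology layers KMP_HW_SUBSET mentioned in a 64-bit bitmask: push_back() sets bit (1ull << type) and specified() tests it, which is why the build assert above requires KMP_HW_LAST to fit in 64 bits. A standalone sketch of the idiom (hypothetical layer enum, not the runtime's kmp_hw_t):

#include <cstdint>
#include <cstdio>

// Hypothetical layer enum standing in for kmp_hw_t.
enum hw_layer_t { LAYER_SOCKET, LAYER_CORE, LAYER_THREAD, LAYER_LAST };

int main() {
  uint64_t set = 0;              // which layers have been pushed
  set |= (1ull << LAYER_SOCKET); // as push_back() does for a socket item
  set |= (1ull << LAYER_THREAD); // ...and for a thread item
  bool core_given = (set & (1ull << LAYER_CORE)) != 0;     // specified()?
  bool thread_given = (set & (1ull << LAYER_THREAD)) != 0; // specified()?
  std::printf("core specified: %d, thread specified: %d\n", core_given,
              thread_given);
  return 0;
}
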
1127
1128/* A structure for holding machine-specific hierarchy info to be computed once
1129 at init. This structure represents a mapping of threads to the actual machine
1130 hierarchy, or to our best guess at what the hierarchy might be, for the
1131 purpose of performing an efficient barrier. In the worst case, when there is
1132 no machine hierarchy information, it produces a tree suitable for a barrier,
1133 similar to the tree used in the hyper barrier. */
1134class hierarchy_info {
1135public:
1136 /* Good default values for number of leaves and branching factor, given no
1137 affinity information. Behaves a bit like hyper barrier. */
1138 static const kmp_uint32 maxLeaves = 4;
1139 static const kmp_uint32 minBranch = 4;
1145 kmp_uint32 maxLevels;
1146
1151 kmp_uint32 depth;
1152 kmp_uint32 base_num_threads;
1153 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1154 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1155 // 2=initialization in progress
1156 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1157
1162 kmp_uint32 *numPerLevel;
1163 kmp_uint32 *skipPerLevel;
1164
1165 void deriveLevels() {
1166 int hier_depth = __kmp_topology->get_depth();
1167 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1168 numPerLevel[level] = __kmp_topology->get_ratio(i);
1169 }
1170 }
1171
1172 hierarchy_info()
1173 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1174
1175 void fini() {
1176 if (!uninitialized && numPerLevel) {
1177 __kmp_free(numPerLevel);
1178 numPerLevel = NULL;
1179 uninitialized = not_initialized;
1180 }
1181 }
1182
1183 void init(int num_addrs) {
1184 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1185 &uninitialized, not_initialized, initializing);
1186 if (bool_result == 0) { // Wait for initialization
1187 while (TCR_1(uninitialized) != initialized)
1188 KMP_CPU_PAUSE();
1189 return;
1190 }
1191 KMP_DEBUG_ASSERT(bool_result == 1);
1192
1193 /* Added explicit initialization of the data fields here to prevent usage of
1194 dirty value observed when static library is re-initialized multiple times
1195 (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1196 OpenMP). */
1197 depth = 1;
1198 resizing = 0;
1199 maxLevels = 7;
1200 numPerLevel =
1201 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1202 skipPerLevel = &(numPerLevel[maxLevels]);
1203 for (kmp_uint32 i = 0; i < maxLevels;
1204 ++i) { // init numPerLevel[*] to 1 item per level
1205 numPerLevel[i] = 1;
1206 skipPerLevel[i] = 1;
1207 }
1208
1209 // Sort table by physical ID
1210 if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1211 deriveLevels();
1212 } else {
1213 numPerLevel[0] = maxLeaves;
1214 numPerLevel[1] = num_addrs / maxLeaves;
1215 if (num_addrs % maxLeaves)
1216 numPerLevel[1]++;
1217 }
1218
1219 base_num_threads = num_addrs;
1220 for (int i = maxLevels - 1; i >= 0;
1221 --i) // count non-empty levels to get depth
1222 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1223 depth++;
1224
1225 kmp_uint32 branch = minBranch;
1226 if (numPerLevel[0] == 1)
1227 branch = num_addrs / maxLeaves;
1228 if (branch < minBranch)
1229 branch = minBranch;
1230 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1231 while (numPerLevel[d] > branch ||
1232 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1233 if (numPerLevel[d] & 1)
1234 numPerLevel[d]++;
1235 numPerLevel[d] = numPerLevel[d] >> 1;
1236 if (numPerLevel[d + 1] == 1)
1237 depth++;
1238 numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1239 }
1240 if (numPerLevel[0] == 1) {
1241 branch = branch >> 1;
1242 if (branch < 4)
1243 branch = minBranch;
1244 }
1245 }
1246
1247 for (kmp_uint32 i = 1; i < depth; ++i)
1248 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1249 // Fill in hierarchy in the case of oversubscription
1250 for (kmp_uint32 i = depth; i < maxLevels; ++i)
1251 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1252
1253 uninitialized = initialized; // One writer
1254 }
1255
1256 // Resize the hierarchy if nproc changes to something larger than before
1257 void resize(kmp_uint32 nproc) {
1258 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1259 while (bool_result == 0) { // someone else is trying to resize
1260 KMP_CPU_PAUSE();
1261 if (nproc <= base_num_threads) // happy with other thread's resize
1262 return;
1263 else // try to resize
1264 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1265 }
1266 KMP_DEBUG_ASSERT(bool_result != 0);
1267 if (nproc <= base_num_threads)
1268 return; // happy with other thread's resize
1269
1270 // Calculate new maxLevels
1271 kmp_uint32 old_sz = skipPerLevel[depth - 1];
1272 kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1273 // First see if old maxLevels is enough to contain new size
1274 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1275 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1276 numPerLevel[i - 1] *= 2;
1277 old_sz *= 2;
1278 depth++;
1279 }
1280 if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1281 while (nproc > old_sz) {
1282 old_sz *= 2;
1283 incs++;
1284 depth++;
1285 }
1286 maxLevels += incs;
1287
1288 // Resize arrays
1289 kmp_uint32 *old_numPerLevel = numPerLevel;
1290 kmp_uint32 *old_skipPerLevel = skipPerLevel;
1291 numPerLevel = skipPerLevel = NULL;
1292 numPerLevel =
1293 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1294 skipPerLevel = &(numPerLevel[maxLevels]);
1295
1296 // Copy old elements from old arrays
1297 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1298 // init numPerLevel[*] to 1 item per level
1299 numPerLevel[i] = old_numPerLevel[i];
1300 skipPerLevel[i] = old_skipPerLevel[i];
1301 }
1302
1303 // Init new elements in arrays to 1
1304 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1305 // init numPerLevel[*] to 1 item per level
1306 numPerLevel[i] = 1;
1307 skipPerLevel[i] = 1;
1308 }
1309
1310 // Free old arrays
1311 __kmp_free(old_numPerLevel);
1312 }
1313
1314 // Fill in oversubscription levels of hierarchy
1315 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1316 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1317
1318 base_num_threads = nproc;
1319 resizing = 0; // One writer
1320 }
1321};
1322#endif // KMP_AFFINITY_H
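
As a closing worked example of hierarchy_info::init() above (illustrative numbers only): with numPerLevel filled leaves-first, skipPerLevel[0] = 1 and skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1], so each entry is the number of hardware threads covered by one node at that level; levels past the real depth are then doubled for oversubscription.

#include <cstdio>

int main() {
  // Leaves first, as deriveLevels() produces: 2 threads/core, 4 cores/package,
  // 4 packages. (The real arrays are maxLevels long, padded with trailing 1s.)
  const int depth = 3;
  int numPerLevel[depth] = {2, 4, 4};
  int skipPerLevel[depth];
  skipPerLevel[0] = 1;
  for (int i = 1; i < depth; ++i)
    skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
  // skipPerLevel ends up {1, 2, 8}: a node at level i spans that many threads.
  for (int i = 0; i < depth; ++i)
    std::printf("level %d: numPerLevel=%d skipPerLevel=%d\n", i,
                numPerLevel[i], skipPerLevel[i]);
  return 0;
}
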