| /* |
| * cpuset user library implementation. |
| * |
| * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved. |
| * |
| * Paul Jackson <pj@sgi.com> |
| */ |
| |
| /* |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU Lesser General Public License as published by |
| * the Free Software Foundation; either version 2.1 of the License, or |
| * (at your option) any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| */ |
| |
| #define _GNU_SOURCE /* need to see pread() and syscall() */ |
| #include <unistd.h> |
| |
| #include <ctype.h> |
| #include <dirent.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <fts.h> |
| #include <limits.h> |
| #include <signal.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/stat.h> |
| #include <sys/syscall.h> |
| #include <sys/types.h> |
| #include <time.h> |
| #include <utime.h> |
| #include <sys/utsname.h> /* for cpuset_would_crash_kernel() */ |
| |
| #include "bitmask.h" |
| #include "cpuset.h" |
| #include "common.h" |
| #include "test.h" |
| #include "lapi/syscalls.h" |
| #include "config.h" |
| |
| #if HAVE_LINUX_MEMPOLICY_H |
| #include <linux/mempolicy.h> |
| |
| /* Bump version, and update Change History, when libcpuset API changes */ |
| #define CPUSET_VERSION 3 |
| |
| /* |
| * For a history of what changed in each version, see the "Change |
| * History" section, at the end of the libcpuset master document. |
| */ |
| |
| int cpuset_version(void) |
| { |
| return CPUSET_VERSION; |
| } |
| |
| struct cpuset { |
| struct bitmask *cpus; |
| struct bitmask *mems; |
| char cpu_exclusive; |
| char mem_exclusive; |
| char mem_hardwall; |
| char notify_on_release; |
| char memory_migrate; |
| char memory_pressure_enabled; |
| char memory_spread_page; |
| char memory_spread_slab; |
| char sched_load_balance; |
| int sched_relax_domain_level; |
| |
| /* |
| * Each field 'x' above gets an 'x_valid' field below. |
| * The apply_cpuset_settings() routine will only set fields whose |
| * corresponding *_valid flags are set. The cpuset_alloc() |
| * routine clears these flags as part of the clear in calloc(), |
| * and the various cpuset_set*() routines set these flags when |
| * setting the corresponding value. |
| * |
| * The purpose of these valid fields is to ensure that when |
| * we create a new cpuset, we don't accidentally overwrite |
| * some non-zero kernel default, such as an inherited |
| * memory_spread_* flag, just because the user application |
| * code didn't override the default zero settings resulting |
| * from the calloc() call in cpuset_alloc(). |
| * |
| * The choice of 'char' for the type of the flags above, |
| * but a bitfield for the flags below, is somewhat capricious. |
| */ |
| unsigned cpus_valid:1; |
| unsigned mems_valid:1; |
| unsigned cpu_exclusive_valid:1; |
| unsigned mem_exclusive_valid:1; |
| unsigned mem_hardwall_valid:1; |
| unsigned notify_on_release_valid:1; |
| unsigned memory_migrate_valid:1; |
| unsigned memory_pressure_enabled_valid:1; |
| unsigned memory_spread_page_valid:1; |
| unsigned memory_spread_slab_valid:1; |
| unsigned sched_load_balance_valid:1; |
| unsigned sched_relax_domain_level_valid:1; |
| |
| /* |
| * If the corresponding variable above was modified, the following |
| * flags mark it dirty, so only changed settings are written back. |
| */ |
| unsigned cpus_dirty:1; |
| unsigned mems_dirty:1; |
| unsigned cpu_exclusive_dirty:1; |
| unsigned mem_exclusive_dirty:1; |
| unsigned mem_hardwall_dirty:1; |
| unsigned notify_on_release_dirty:1; |
| unsigned memory_migrate_dirty:1; |
| unsigned memory_pressure_enabled_dirty:1; |
| unsigned memory_spread_page_dirty:1; |
| unsigned memory_spread_slab_dirty:1; |
| unsigned sched_load_balance_dirty:1; |
| unsigned sched_relax_domain_level_dirty:1; |
| }; |
| |
| /* Presumed cpuset file system mount point */ |
| static const char *cpusetmnt = "/dev/cpuset"; |
| |
| /* Stashed copy of cpunodemap[], mapping each cpu to its node. */ |
| static const char *mapfile = "/var/run/cpunodemap"; |
| |
| /* The primary source for the cpunodemap[] is available below here. */ |
| static const char *sysdevices = "/sys/devices/system"; |
| |
| /* small buffer size - for reading boolean flags or map file (1 or 2 ints) */ |
| #define SMALL_BUFSZ 16 |
| |
| /* |
| * The 'mask_size_file' is used to ferret out the kernel cpumask_t |
| * and nodemask_t sizes. The lines in this file that begin with the |
| * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask |
| * and nodemask string, respectively. The lengths of these strings |
| * reflect the kernel's internal cpumask_t and nodemask_t sizes, |
| * which are needed to correctly call the sched_setaffinity |
| * and set_mempolicy system calls, and to size user level |
| * bitmasks to match the kernel's. |
| */ |
| |
| static const char *mask_size_file = "/proc/self/status"; |
| static const char *cpumask_prefix = "Cpus_allowed:\t"; |
| static const char *nodemask_prefix = "Mems_allowed:\t"; |
| |
| /* |
| * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits. |
| * |
| * The first time we need these, we parse the Cpus_allowed and |
| * Mems_allowed lines from mask_size_file ("/proc/self/status"). |
| */ |
| |
| static int cpumask_sz; |
| static int nodemask_sz; |
| |
| /* |
| * These defaults only kick in if we fail to size the kernel |
| * cpumask and nodemask by reading the Cpus_allowed and |
| * Mems_allowed fields from the /proc/self/status file. |
| */ |
| |
| #define DEFCPUBITS (512) |
| #define DEFNODEBITS (DEFCPUBITS/2) |
| |
| /* |
| * Arch-neutral API for obtaining NUMA distances between CPUs |
| * and Memory Nodes, via the files: |
| * /sys/devices/system/node/nodeN/distance |
| * which have lines such as: |
| * 46 66 10 20 |
| * which say that for a cpu on node N (from the path above), the |
| * distances to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20, |
| * respectively. |
| */ |
| |
| static const char *distance_directory = "/sys/devices/system/node"; |
| |
| /* |
| * Someday, we should disable, then later discard, the SN code |
| * marked ALTERNATE_SN_DISTMAP. |
| */ |
| |
| #define ALTERNATE_SN_DISTMAP 1 |
| #ifdef ALTERNATE_SN_DISTMAP |
| |
| /* |
| * Alternative SN (SGI ia64) architecture specific API for obtaining |
| * NUMA distances between CPUs and Memory Nodes is via the file |
| * /proc/sgi_sn/sn_topology, which has lines such as: |
| * |
| * node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20 |
| * |
| * which says that for each CPU on node 2, the distances to nodes |
| * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively. |
| * |
| * This file has other lines as well, which start with keywords |
| * other than "node". Ignore these other lines. |
| */ |
| |
| static const char *sn_topology = "/proc/sgi_sn/sn_topology"; |
| static const char *sn_top_node_prefix = "node "; |
| |
| #endif |
| |
| /* |
| * Check that cpusets are supported and /dev/cpuset is mounted. |
| * If ok, return 0. |
| * If not, return -1 and set errno: |
| * ENOSYS - kernel doesn't support cpusets |
| * ENODEV - /dev/cpuset not mounted |
| */ |
| |
| static enum { |
| check_notdone, |
| check_enosys, |
| check_enodev, |
| check_ok |
| } check_state = check_notdone; |
| |
| static int check(void) |
| { |
| if (check_state == check_notdone) { |
| struct stat statbuf; |
| |
| if (stat("/proc/self/cpuset", &statbuf) < 0) { |
| check_state = check_enosys; |
| goto done; |
| } |
| |
| if (stat("/dev/cpuset/tasks", &statbuf) < 0) { |
| check_state = check_enodev; |
| goto done; |
| } |
| |
| check_state = check_ok; |
| } |
| done: |
| switch (check_state) { |
| case check_enosys: |
| errno = ENOSYS; |
| return -1; |
| case check_enodev: |
| errno = ENODEV; |
| return -1; |
| default: |
| break; |
| } |
| return 0; |
| } |
| |
| static void chomp(char *s) |
| { |
| char *t; |
| |
| for (t = s + strlen(s) - 1; t >= s; t--) { |
| if (*t == '\n' || *t == '\r') |
| *t = '\0'; |
| else |
| break; |
| } |
| } |
| |
| /* |
| * Determine number of bytes in a seekable open file, without |
| * assuming that stat(2) on that file has a useful size. |
| * Has the side effect of leaving the file rewound to the beginning. |
| */ |
| static int filesize(FILE * fp) |
| { |
| int sz = 0; |
| rewind(fp); |
| while (fgetc(fp) != EOF) |
| sz++; |
| rewind(fp); |
| return sz; |
| } |
| |
| /* Are strings s1 and s2 equal? */ |
| static int streq(const char *s1, const char *s2) |
| { |
| return strcmp(s1, s2) == 0; |
| } |
| |
| /* Is string 'pre' a prefix of string 's'? */ |
| static int strprefix(const char *s, const char *pre) |
| { |
| return strncmp(s, pre, strlen(pre)) == 0; |
| } |
| |
| /* |
| * char *flgets(char *buf, int buflen, FILE *fp) |
| * |
| * Obtain one line from input file fp. Copy up to first |
| * buflen-1 chars of line into buffer buf, discarding any remainder |
| * of line. Stop reading at newline, discarding newline. |
| * Nul terminate result and return pointer to buffer buf |
| * on success, or NULL if nothing more to read or failure. |
| */ |
| |
| static char *flgets(char *buf, int buflen, FILE * fp) |
| { |
| int c = -1; |
| char *bp; |
| |
| bp = buf; |
| while ((--buflen > 0) && ((c = getc(fp)) >= 0)) { |
| if (c == '\n') |
| goto newline; |
| *bp++ = c; |
| } |
| if ((c < 0) && (bp == buf)) |
| return NULL; |
| |
| if (c > 0) { |
| while ((c = getc(fp)) >= 0) { |
| if (c == '\n') |
| break; |
| } |
| } |
| |
| newline: |
| *bp++ = '\0'; |
| return buf; |
| } |
| |
| /* |
| * sgetc(const char *inputbuf, int *offsetptr) |
| * |
| * Return next char from nul-terminated input buffer inputbuf, |
| * starting at offset *offsetptr. Increment *offsetptr. |
| * If next char would be nul ('\0'), return EOF and don't |
| * increment *offsetptr. |
| */ |
| |
| static int sgetc(const char *inputbuf, int *offsetptr) |
| { |
| char c; |
| |
| if ((c = inputbuf[*offsetptr]) != 0) { |
| *offsetptr = *offsetptr + 1; |
| return c; |
| } else { |
| return EOF; |
| } |
| } |
| |
| /* |
| * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr) |
| * |
| * Obtain next line from nul-terminated input buffer 'inputbuf', |
| * starting at offset *offsetptr. Copy up to first buflen-1 |
| * chars of line into output buffer buf, discarding any remainder |
| * of line. Stop reading at newline, discarding newline. |
| * Nul terminate result and return pointer to output buffer |
| * buf on success, or NULL if nothing more to read. |
| */ |
| |
| static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr) |
| { |
| int c = -1; |
| char *bp; |
| |
| bp = buf; |
| while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) { |
| if (c == '\n') |
| goto newline; |
| *bp++ = c; |
| } |
| if ((c < 0) && (bp == buf)) |
| return NULL; |
| |
| if (c > 0) { |
| while ((c = sgetc(inputbuf, offsetptr)) >= 0) { |
| if (c == '\n') |
| break; |
| } |
| } |
| |
| newline: |
| *bp++ = '\0'; |
| return buf; |
| } |
| |
| /* |
| * time_t get_mtime(char *path) |
| * |
| * Return modtime of file at location path, else return 0. |
| */ |
| |
| static time_t get_mtime(const char *path) |
| { |
| struct stat statbuf; |
| |
| if (stat(path, &statbuf) != 0) |
| return 0; |
| return statbuf.st_mtime; |
| } |
| |
| /* |
| * int set_mtime(const char *path, time_t mtime) |
| * |
| * Set modtime of file 'path' to 'mtime'. Return 0 on success, |
| * or -1 on error, setting errno. |
| */ |
| |
| static int set_mtime(const char *path, time_t mtime) |
| { |
| struct utimbuf times; |
| |
| times.actime = mtime; |
| times.modtime = mtime; |
| return utime(path, ×); |
| } |
| |
| /* |
| * True if two pathnames resolve to same file. |
| * False if either path can not be stat'd, |
| * or if the two paths resolve to a different file. |
| */ |
| |
| static int samefile(const char *path1, const char *path2) |
| { |
| struct stat sb1, sb2; |
| |
| if (stat(path1, &sb1) != 0) |
| return 0; |
| if (stat(path2, &sb2) != 0) |
| return 0; |
| return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev; |
| } |
| |
| #define slash(c) (*(c) == '/') |
| #define eocomp(c) (slash(c) || !*(c)) |
| #define dot1(c) (*(c) == '.' && eocomp(c+1)) |
| |
| /* In place path compression. Remove extra dots and slashes. */ |
| static char *pathcomp(char *p) |
| { |
| char *a = p; |
| char *b = p; |
| |
| if (!p || !*p) |
| return p; |
| if (slash(p)) |
| *b++ = *a++; |
| for (;;) { |
| if (slash(a)) |
| while (slash(++a)) |
| continue; |
| if (!*a) { |
| if (b == p) |
| *b++ = '.'; |
| *b = '\0'; |
| return (p); |
| } else if (dot1(a)) { |
| a++; |
| } else { |
| if ((b != p) && !slash(b - 1)) |
| *b++ = '/'; |
| while (!eocomp(a)) |
| *b++ = *a++; |
| } |
| } |
| } |
| |
| #undef slash |
| #undef eocomp |
| #undef dot1 |
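| |
| /* |
| * A few worked examples of pathcomp(), per the rules above (these |
| * are illustrative hand traces, not output from any particular |
| * build): |
| * |
| * "/dev//cpuset/./foo/" becomes "/dev/cpuset/foo" |
| * "a//b/./c" becomes "a/b/c" |
| * "///" becomes "/", "." stays ".", "" stays "" |
| * |
| * Note that ".." components are deliberately left untouched. |
| */ |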
| |
| /* |
| * pathcat2(buf, buflen, name1, name2) |
| * |
| * Return buf, of length buflen, with name1/name2 stored in it. |
| */ |
| |
| static char *pathcat2(char *buf, int buflen, const char *name1, |
| const char *name2) |
| { |
| (void)snprintf(buf, buflen, "%s/%s", name1, name2); |
| return pathcomp(buf); |
| } |
| |
| /* |
| * pathcat3(buf, buflen, name1, name2, name3) |
| * |
| * Return buf, of length buflen, with name1/name2/name3 stored in it. |
| */ |
| |
| static char *pathcat3(char *buf, int buflen, const char *name1, |
| const char *name2, const char *name3) |
| { |
| (void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3); |
| return pathcomp(buf); |
| } |
| |
| /* |
| * fullpath(buf, buflen, name) |
| * |
| * Put full path of cpuset 'name' in buffer 'buf'. If name |
| * starts with a slash (``/``) character, then this is a path |
| * relative to ``/dev/cpuset``, otherwise it is relative to |
| * the current task's cpuset. Return 0 on success, else |
| * -1 on error, setting errno. |
| */ |
| |
| static int fullpath(char *buf, int buflen, const char *name) |
| { |
| int len; |
| |
| /* easy case */ |
| if (*name == '/') { |
| pathcat2(buf, buflen, cpusetmnt, name); |
| pathcomp(buf); |
| return 0; |
| } |
| |
| /* hard case */ |
| snprintf(buf, buflen, "%s/", cpusetmnt); |
| len = strlen(buf); |
| if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL) |
| return -1; |
| if (strlen(buf) >= buflen - 1 - strlen(name)) { |
| errno = E2BIG; |
| return -1; |
| } |
| strcat(buf, "/"); |
| strcat(buf, name); |
| pathcomp(buf); |
| return 0; |
| } |
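| |
| /* |
| * For example (illustrative, assuming the default mount point): |
| * fullpath(buf, sizeof(buf), "/demo") yields "/dev/cpuset/demo", |
| * while fullpath(buf, sizeof(buf), "sub") appends "sub" to the |
| * current task's cpuset path beneath /dev/cpuset. |
| */ |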
| |
| /* |
| * fullpath2(buf, buflen, name1, name2) |
| * |
| * Like fullpath(), only concatenate two pathname components on end. |
| */ |
| |
| static int fullpath2(char *buf, int buflen, const char *name1, |
| const char *name2) |
| { |
| if (fullpath(buf, buflen, name1) < 0) |
| return -1; |
| if (strlen(buf) >= buflen - 1 - strlen(name2)) { |
| errno = E2BIG; |
| return -1; |
| } |
| strcat(buf, "/"); |
| strcat(buf, name2); |
| pathcomp(buf); |
| return 0; |
| } |
| |
| /* |
| * Convert the string length of an ascii hex mask to the number |
| * of bits represented by that mask. |
| * |
| * The cpumask and nodemask values in /proc/self/status are in an |
| * ascii format that uses 9 characters for each 32 bits of mask. |
| */ |
| static int s2nbits(const char *s) |
| { |
| return strlen(s) * 32 / 9; |
| } |
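| |
| /* |
| * Worked example of the arithmetic above: the kernel prints each |
| * 32-bit mask word as 8 hex digits, with words separated by commas, |
| * so a two word mask reads "ffffffff,ffffffff" - 17 characters, |
| * plus the trailing newline left by fgets() (see update_mask_sizes) |
| * makes 18, and 18 * 32 / 9 == 64 bits. |
| */ |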
| |
| static void update_mask_sizes(void) |
| { |
| FILE *fp = NULL; |
| char *buf = NULL; |
| int fsize; |
| |
| if ((fp = fopen(mask_size_file, "r")) == NULL) |
| goto done; |
| fsize = filesize(fp); |
| if ((buf = malloc(fsize)) == NULL) |
| goto done; |
| |
| /* |
| * Beware: mask sizing arithmetic is fussy. |
| * The trailing newline left by fgets() is required. |
| */ |
| while (fgets(buf, fsize, fp)) { |
| if (strprefix(buf, cpumask_prefix)) |
| cpumask_sz = s2nbits(buf + strlen(cpumask_prefix)); |
| if (strprefix(buf, nodemask_prefix)) |
| nodemask_sz = s2nbits(buf + strlen(nodemask_prefix)); |
| } |
| done: |
| free(buf); |
| if (fp != NULL) |
| fclose(fp); |
| if (cpumask_sz == 0) |
| cpumask_sz = DEFCPUBITS; |
| if (nodemask_sz == 0) |
| nodemask_sz = DEFNODEBITS; |
| } |
| |
| /* Allocate a new struct cpuset */ |
| struct cpuset *cpuset_alloc(void) |
| { |
| struct cpuset *cp = NULL; |
| int nbits; |
| |
| if ((cp = calloc(1, sizeof(struct cpuset))) == NULL) |
| goto err; |
| |
| nbits = cpuset_cpus_nbits(); |
| if ((cp->cpus = bitmask_alloc(nbits)) == NULL) |
| goto err; |
| |
| nbits = cpuset_mems_nbits(); |
| if ((cp->mems = bitmask_alloc(nbits)) == NULL) |
| goto err; |
| |
| return cp; |
| err: |
| if (cp && cp->cpus) |
| bitmask_free(cp->cpus); |
| if (cp && cp->mems) |
| bitmask_free(cp->mems); |
| free(cp); |
| return NULL; |
| } |
| |
| /* Free struct cpuset *cp */ |
| void cpuset_free(struct cpuset *cp) |
| { |
| if (!cp) |
| return; |
| if (cp->cpus) |
| bitmask_free(cp->cpus); |
| if (cp->mems) |
| bitmask_free(cp->mems); |
| free(cp); |
| } |
| |
| /* Number of bits in a CPU bitmask on current system */ |
| int cpuset_cpus_nbits(void) |
| { |
| if (cpumask_sz == 0) |
| update_mask_sizes(); |
| return cpumask_sz; |
| } |
| |
| /* Number of bits in a Memory bitmask on current system */ |
| int cpuset_mems_nbits(void) |
| { |
| if (nodemask_sz == 0) |
| update_mask_sizes(); |
| return nodemask_sz; |
| } |
| |
| /* Set CPUs in cpuset cp to bitmask cpus */ |
| int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus) |
| { |
| if (cp->cpus) |
| bitmask_free(cp->cpus); |
| cp->cpus = bitmask_alloc(bitmask_nbits(cpus)); |
| if (cp->cpus == NULL) |
| return -1; |
| bitmask_copy(cp->cpus, cpus); |
| cp->cpus_valid = 1; |
| cp->cpus_dirty = 1; |
| return 0; |
| } |
| |
| /* Set Memory Nodes in cpuset cp to bitmask mems */ |
| int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems) |
| { |
| if (cp->mems) |
| bitmask_free(cp->mems); |
| cp->mems = bitmask_alloc(bitmask_nbits(mems)); |
| if (cp->mems == NULL) |
| return -1; |
| bitmask_copy(cp->mems, mems); |
| cp->mems_valid = 1; |
| cp->mems_dirty = 1; |
| return 0; |
| } |
| |
| /* Set integer value optname of cpuset cp */ |
| int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value) |
| { |
| if (streq(optionname, "cpu_exclusive")) { |
| cp->cpu_exclusive = !!value; |
| cp->cpu_exclusive_valid = 1; |
| cp->cpu_exclusive_dirty = 1; |
| } else if (streq(optionname, "mem_exclusive")) { |
| cp->mem_exclusive = !!value; |
| cp->mem_exclusive_valid = 1; |
| cp->mem_exclusive_dirty = 1; |
| } else if (streq(optionname, "mem_hardwall")) { |
| cp->mem_hardwall = !!value; |
| cp->mem_hardwall_valid = 1; |
| cp->mem_hardwall_dirty = 1; |
| } else if (streq(optionname, "notify_on_release")) { |
| cp->notify_on_release = !!value; |
| cp->notify_on_release_valid = 1; |
| cp->notify_on_release_dirty = 1; |
| } else if (streq(optionname, "memory_pressure_enabled")) { |
| cp->memory_pressure_enabled = !!value; |
| cp->memory_pressure_enabled_valid = 1; |
| cp->memory_pressure_enabled_dirty = 1; |
| } else if (streq(optionname, "memory_migrate")) { |
| cp->memory_migrate = !!value; |
| cp->memory_migrate_valid = 1; |
| cp->memory_migrate_dirty = 1; |
| } else if (streq(optionname, "memory_spread_page")) { |
| cp->memory_spread_page = !!value; |
| cp->memory_spread_page_valid = 1; |
| cp->memory_spread_page_dirty = 1; |
| } else if (streq(optionname, "memory_spread_slab")) { |
| cp->memory_spread_slab = !!value; |
| cp->memory_spread_slab_valid = 1; |
| cp->memory_spread_slab_dirty = 1; |
| } else if (streq(optionname, "sched_load_balance")) { |
| cp->sched_load_balance = !!value; |
| cp->sched_load_balance_valid = 1; |
| cp->sched_load_balance_dirty = 1; |
| } else if (streq(optionname, "sched_relax_domain_level")) { |
| cp->sched_relax_domain_level = value; |
| cp->sched_relax_domain_level_valid = 1; |
| cp->sched_relax_domain_level_dirty = 1; |
| } else |
| return -2; /* optionname not recognized */ |
| return 0; |
| } |
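| |
| /* |
| * Minimal usage sketch (illustrative only - the cpuset name "/demo" |
| * and the CPU/node numbers are arbitrary, and error handling is |
| * abbreviated): configure and create a cpuset containing CPU 1 and |
| * Memory Node 0 using the routines above plus cpuset_create(): |
| * |
| * struct cpuset *cp = cpuset_alloc(); |
| * struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits()); |
| * struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits()); |
| * |
| * bitmask_setbit(cpus, 1); |
| * bitmask_setbit(mems, 0); |
| * cpuset_setcpus(cp, cpus); |
| * cpuset_setmems(cp, mems); |
| * cpuset_set_iopt(cp, "cpu_exclusive", 1); |
| * if (cpuset_create("/demo", cp) < 0) |
| * perror("cpuset_create"); |
| * |
| * bitmask_free(cpus); |
| * bitmask_free(mems); |
| * cpuset_free(cp); |
| */ |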
| |
| /* [optional] Set string value optname */ |
| int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname, |
| UNUSED const char *value) |
| { |
| return -2; /* For now, all string options unrecognized */ |
| } |
| |
| /* Return handle for reading memory_pressure. */ |
| int cpuset_open_memory_pressure(const char *cpusetpath) |
| { |
| char buf[PATH_MAX]; |
| |
| fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure"); |
| return open(buf, O_RDONLY); |
| } |
| |
| /* Return current memory_pressure of cpuset. */ |
| int cpuset_read_memory_pressure(int han) |
| { |
| char buf[SMALL_BUFSZ]; |
| |
| if (pread(han, buf, sizeof(buf), 0L) < 0) |
| return -1; |
| return atoi(buf); |
| } |
| |
| /* Close handle for reading memory pressure. */ |
| void cpuset_close_memory_pressure(int han) |
| { |
| close(han); |
| } |
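| |
| /* |
| * Sketch of the open/read/close pattern above (illustrative; the |
| * value is only meaningful if memory_pressure_enabled is set in the |
| * top cpuset): |
| * |
| * int han = cpuset_open_memory_pressure("/"); |
| * if (han >= 0) { |
| * int p = cpuset_read_memory_pressure(han); |
| * printf("memory_pressure: %d\n", p); |
| * cpuset_close_memory_pressure(han); |
| * } |
| */ |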
| |
| /* |
| * Resolve cpuset pointer (to that of current task if cp == NULL). |
| * |
| * If cp not NULL, just return it. If cp is NULL, return pointer |
| * to temporary cpuset for current task, and set *cp_tofree to |
| * pointer to that same temporary cpuset, to be freed later. |
| * |
| * Return NULL and set errno on error. Errors can occur when |
| * resolving the current task's cpuset. |
| */ |
| static const struct cpuset *resolve_cp(const struct cpuset *cp, |
| struct cpuset **cp_tofree) |
| { |
| const struct cpuset *rcp; |
| |
| if (cp) { |
| rcp = cp; |
| } else { |
| struct cpuset *cp1 = cpuset_alloc(); |
| if (cp1 == NULL) |
| goto err; |
| if (cpuset_cpusetofpid(cp1, 0) < 0) { |
| cpuset_free(cp1); |
| goto err; |
| } |
| *cp_tofree = cp1; |
| rcp = cp1; |
| } |
| return rcp; |
| err: |
| return NULL; |
| } |
| |
| /* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */ |
| int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| |
| if (!cp1) |
| goto err; |
| if (cp1->cpus == NULL) { |
| errno = EINVAL; |
| goto err; |
| } |
| bitmask_copy(cpus, cp1->cpus); |
| cpuset_free(cp_tofree); |
| return 0; |
| err: |
| cpuset_free(cp_tofree); |
| return -1; |
| } |
| |
| /* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */ |
| int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| |
| if (!cp1) |
| goto err; |
| if (cp1->mems == NULL) { |
| errno = EINVAL; |
| goto err; |
| } |
| bitmask_copy(mems, cp1->mems); |
| cpuset_free(cp_tofree); |
| return 0; |
| err: |
| cpuset_free(cp_tofree); |
| return -1; |
| } |
| |
| /* Return number of CPUs in cpuset cp (current task if cp == NULL) */ |
| int cpuset_cpus_weight(const struct cpuset *cp) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| int w = -1; |
| |
| if (!cp1) |
| goto err; |
| if (cp1->cpus == NULL) { |
| errno = EINVAL; |
| goto err; |
| } |
| w = bitmask_weight(cp1->cpus); |
| /* fall into ... */ |
| err: |
| cpuset_free(cp_tofree); |
| return w; |
| } |
| |
| /* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */ |
| int cpuset_mems_weight(const struct cpuset *cp) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| int w = -1; |
| |
| if (!cp1) |
| goto err; |
| if (cp1->mems == NULL) { |
| errno = EINVAL; |
| goto err; |
| } |
| w = bitmask_weight(cp1->mems); |
| /* fall into ... */ |
| err: |
| cpuset_free(cp_tofree); |
| return w; |
| } |
| |
| /* Return integer value of option optname in cp */ |
| int cpuset_get_iopt(const struct cpuset *cp, const char *optionname) |
| { |
| if (streq(optionname, "cpu_exclusive")) |
| return cp->cpu_exclusive; |
| else if (streq(optionname, "mem_exclusive")) |
| return cp->mem_exclusive; |
| else if (streq(optionname, "mem_hardwall")) |
| return cp->mem_hardwall; |
| else if (streq(optionname, "notify_on_release")) |
| return cp->notify_on_release; |
| else if (streq(optionname, "memory_pressure_enabled")) |
| return cp->memory_pressure_enabled; |
| else if (streq(optionname, "memory_migrate")) |
| return cp->memory_migrate; |
| else if (streq(optionname, "memory_spread_page")) |
| return cp->memory_spread_page; |
| else if (streq(optionname, "memory_spread_slab")) |
| return cp->memory_spread_slab; |
| else if (streq(optionname, "sched_load_balance")) |
| return cp->sched_load_balance; |
| else if (streq(optionname, "sched_relax_domain_level")) |
| return cp->sched_relax_domain_level; |
| else |
| return -2; /* optionname not recognized */ |
| } |
| |
| /* [optional] Return string value of optname */ |
| const char *cpuset_get_sopt(UNUSED const struct cpuset *cp, |
| UNUSED const char *optionname) |
| { |
| return NULL; /* For now, all string options unrecognized */ |
| } |
| |
| static int read_flag(const char *filepath, char *flagp) |
| { |
| char buf[SMALL_BUFSZ]; /* buffer a "0" or "1" flag line */ |
| int fd = -1; |
| |
| if ((fd = open(filepath, O_RDONLY)) < 0) |
| goto err; |
| if (read(fd, buf, sizeof(buf)) < 1) |
| goto err; |
| if (atoi(buf)) |
| *flagp = 1; |
| else |
| *flagp = 0; |
| close(fd); |
| return 0; |
| err: |
| if (fd >= 0) |
| close(fd); |
| return -1; |
| } |
| |
| static int load_flag(const char *path, char *flagp, const char *flag) |
| { |
| char buf[PATH_MAX]; |
| |
| pathcat2(buf, sizeof(buf), path, flag); |
| return read_flag(buf, flagp); |
| } |
| |
| static int read_number(const char *filepath, int *numberp) |
| { |
| char buf[SMALL_BUFSZ]; |
| int fd = -1; |
| |
| if ((fd = open(filepath, O_RDONLY)) < 0) |
| goto err; |
| if (read(fd, buf, sizeof(buf)) < 1) |
| goto err; |
| *numberp = atoi(buf); |
| close(fd); |
| return 0; |
| err: |
| if (fd >= 0) |
| close(fd); |
| return -1; |
| } |
| |
| static int load_number(const char *path, int *numberp, const char *file) |
| { |
| char buf[PATH_MAX]; |
| |
| pathcat2(buf, sizeof(buf), path, file); |
| return read_number(buf, numberp); |
| } |
| |
| static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits) |
| { |
| FILE *fp = NULL; |
| char *buf = NULL; |
| int buflen; |
| struct bitmask *bmp = NULL; |
| |
| if ((fp = fopen(filepath, "r")) == NULL) |
| goto err; |
| buflen = filesize(fp) + 1; /* + 1 for nul term */ |
| if ((buf = malloc(buflen)) == NULL) |
| goto err; |
| if (flgets(buf, buflen, fp) == NULL) |
| goto err; |
| fclose(fp); |
| fp = NULL; |
| |
| if ((bmp = bitmask_alloc(nbits)) == NULL) |
| goto err; |
| if (*buf && bitmask_parselist(buf, bmp) < 0) |
| goto err; |
| if (*bmpp) |
| bitmask_free(*bmpp); |
| *bmpp = bmp; |
| free(buf); |
| buf = NULL; |
| return 0; |
| err: |
| if (buf != NULL) |
| free(buf); |
| if (fp != NULL) |
| fclose(fp); |
| if (bmp != NULL) |
| bitmask_free(bmp); |
| return -1; |
| } |
| |
| static int load_mask(const char *path, struct bitmask **bmpp, |
| int nbits, const char *mask) |
| { |
| char buf[PATH_MAX]; |
| |
| pathcat2(buf, sizeof(buf), path, mask); |
| return read_mask(buf, bmpp, nbits); |
| } |
| |
| /* Write string to file at given filepath. Create or truncate file. */ |
| static int write_string_file(const char *filepath, const char *str) |
| { |
| int fd = -1; |
| |
| if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0) |
| goto err; |
| if (write(fd, str, strlen(str)) < 0) |
| goto err; |
| close(fd); |
| return 0; |
| err: |
| if (fd >= 0) |
| close(fd); |
| return -1; |
| } |
| |
| /* Size and allocate buffer. Write bitmask into it. Caller must free */ |
| static char *sprint_mask_buf(const struct bitmask *bmp) |
| { |
| char *buf = NULL; |
| int buflen; |
| char c; |
| |
| /* First bitmask_displaylist() call just to get the length */ |
| buflen = bitmask_displaylist(&c, 1, bmp) + 1; /* "+ 1" for nul */ |
| if ((buf = malloc(buflen)) == NULL) |
| return NULL; |
| bitmask_displaylist(buf, buflen, bmp); |
| return buf; |
| } |
| |
| static int exists_flag(const char *path, const char *flag) |
| { |
| char buf[PATH_MAX]; |
| struct stat statbuf; |
| int rc; |
| |
| pathcat2(buf, sizeof(buf), path, flag); |
| rc = (stat(buf, &statbuf) == 0); |
| errno = 0; |
| return rc; |
| } |
| |
| static int store_flag(const char *path, const char *flag, int val) |
| { |
| char buf[PATH_MAX]; |
| |
| pathcat2(buf, sizeof(buf), path, flag); |
| return write_string_file(buf, val ? "1" : "0"); |
| } |
| |
| static int store_number(const char *path, const char *file, int val) |
| { |
| char buf[PATH_MAX]; |
| char data[SMALL_BUFSZ]; |
| |
| memset(data, 0, sizeof(data)); |
| pathcat2(buf, sizeof(buf), path, file); |
| snprintf(data, sizeof(data), "%d", val); |
| return write_string_file(buf, data); |
| } |
| |
| static int store_mask(const char *path, const char *mask, |
| const struct bitmask *bmp) |
| { |
| char maskpath[PATH_MAX]; |
| char *bp = NULL; |
| int rc; |
| |
| if (bmp == NULL) |
| return 0; |
| pathcat2(maskpath, sizeof(maskpath), path, mask); |
| if ((bp = sprint_mask_buf(bmp)) == NULL) |
| return -1; |
| rc = write_string_file(maskpath, bp); |
| free(bp); |
| return rc; |
| } |
| |
| /* |
| * Return 1 if 'cpu' is online, else 0 if offline. Tests the file |
| * /sys/devices/system/cpu/cpuN/online for 0 or 1 contents, |
| * where N == cpu number. |
| */ |
| |
| char cpu_online(unsigned int cpu) |
| { |
| char online; |
| char cpupath[PATH_MAX]; |
| |
| (void)snprintf(cpupath, sizeof(cpupath), |
| "/sys/devices/system/cpu/cpu%d/online", cpu); |
| if (read_flag(cpupath, &online) < 0) |
| return 0; /* oops - guess that cpu's not there */ |
| return online; |
| } |
| |
| /* |
| * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()) |
| * to the node on which that cpu resides, or to cpuset_mems_nbits() |
| * if that mapping is unknown. |
| * |
| * To avoid every user having to recalculate this relation |
| * from various clues in the sysfs file system (below the |
| * path /sys/devices/system) a copy of this map is kept at |
| * /var/run/cpunodemap. |
| * |
| * The system automatically cleans out files below |
| * /var/run on each system reboot (see the init script |
| * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry |
| * about stale data in this file across reboots. If the file |
| * is missing, let the first process that needs it, and has |
| * permission to write in the /var/run directory, rebuild it. |
| * |
| * If using this cached data, remember the mtime of the mapfile |
| * the last time we read it in case something like a hotplug |
| * event results in the file being removed and rebuilt, so we |
| * can detect if we're using a stale cache, and need to reload. |
| * |
| * The mtime of this file is set to the time when we did |
| * the recalculation of the map, from the clues beneath |
| * /sys/devices/system. This is done so that a program |
| * won't see the mapfile it just wrote (store_map) as being |
| * newer than its in-memory copy, and needlessly read the same |
| * map back in (load_map). |
| */ |
| |
| /* |
| * Hold flockfile(stdin) while using cpunodemap for posix thread safety. |
| * |
| * Note on locking and flockfile(FILE *): |
| * |
| * We use flockfile() and funlockfile() instead of directly |
| * calling pthread_mutex_lock and pthread_mutex_unlock on |
| * a pthread_mutex_t, because this avoids forcing the app |
| * to link with libpthread. The glibc implementation of |
| * flockfile/funlockfile will fall back to no-ops if libpthread |
| * doesn't happen to be linked. |
| * |
| * Since flockfile already has the moderately convoluted |
| * combination of weak and strong symbols required to accomplish |
| * this, it is easier to use flockfile() on some handy FILE * |
| * stream as a surrogate for pthread locking than it would be |
| * to re-invent that wheel. |
| * |
| * Forcing all apps that use cpusets to link with libpthread |
| * would force non-transparent initialization on apps that |
| * might not be prepared to handle it. |
| * |
| * The application using libcpuset should never notice this |
| * odd use of flockfile(), because we never return to the |
| * application from any libcpuset call with any such lock held. |
| * We just use this locking for guarding some non-atomic cached |
| * data updates and accesses, internal to some libcpuset calls. |
| * Also, flockfile() allows recursive nesting, so if the app |
| * calls libcpuset holding such a file lock, we won't deadlock |
| * if we go to acquire the same lock. We'll just get the lock |
| * and increment its counter while we hold it. |
| */ |
| |
| static struct cpunodemap { |
| int *map; /* map[cpumask_sz]: maps cpu to its node */ |
| time_t mtime; /* modtime of mapfile when last read */ |
| } cpunodemap; |
| |
| /* |
| * rebuild_map() - Rebuild cpunodemap[] from scratch. |
| * |
| * Situation: |
| * Neither our in-memory cpunodemap[] array nor the |
| * cache of it in mapfile is current. |
| * Action: |
| * Rebuild it from first principles and the information |
| * available below /sys/devices/system. |
| */ |
| |
| static void rebuild_map(void) |
| { |
| char buf[PATH_MAX]; |
| DIR *dir1, *dir2; |
| struct dirent *dent1, *dent2; |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| unsigned int cpu, mem; |
| |
| for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) |
| cpunodemap.map[cpu] = -1; |
| pathcat2(buf, sizeof(buf), sysdevices, "node"); |
| if ((dir1 = opendir(buf)) == NULL) |
| return; |
| while ((dent1 = readdir(dir1)) != NULL) { |
| if (sscanf(dent1->d_name, "node%u", &mem) < 1) |
| continue; |
| pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name); |
| if ((dir2 = opendir(buf)) == NULL) |
| continue; |
| while ((dent2 = readdir(dir2)) != NULL) { |
| if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1) |
| continue; |
| if (cpu >= (unsigned int)ncpus |
| || mem >= (unsigned int)nmems) |
| continue; |
| cpunodemap.map[cpu] = mem; |
| } |
| closedir(dir2); |
| } |
| closedir(dir1); |
| cpunodemap.mtime = time(0); |
| } |
| |
| /* |
| * load_map() - Load cpunodemap[] from mapfile. |
| * |
| * Situation: |
| * The cpunodemap in mapfile is more recent than |
| * what we have in the cpunodemap[] array. |
| * Action: |
| * Reload the cpunodemap[] array from the file. |
| */ |
| |
| static void load_map(void) |
| { |
| char buf[SMALL_BUFSZ]; /* buffer 1 line of mapfile */ |
| FILE *mapfp; /* File stream on mapfile */ |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| unsigned int cpu, mem; |
| |
| /* don't leak a map[] already allocated by get_map() */ |
| if (cpunodemap.map == NULL && |
| (cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL) |
| return; |
| cpunodemap.mtime = get_mtime(mapfile); |
| if ((mapfp = fopen(mapfile, "r")) == NULL) |
| return; |
| for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) |
| cpunodemap.map[cpu] = nmems; |
| while (flgets(buf, sizeof(buf), mapfp) != NULL) { |
| if (sscanf(buf, "%u %u", &cpu, &mem) < 2) |
| continue; |
| if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems) |
| continue; |
| cpunodemap.map[cpu] = mem; |
| } |
| fclose(mapfp); |
| } |
| |
| /* |
| * store_map() - Write cpunodemap[] out to mapfile. |
| * |
| * Situation: |
| * The cpunodemap in the cpunodemap[] array is |
| * more recent than the one in mapfile. |
| * Action: |
| * Write cpunodemap[] out to mapfile. |
| */ |
| |
| static void store_map(void) |
| { |
| char buf[PATH_MAX]; |
| int fd = -1; |
| FILE *mapfp = NULL; |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| unsigned int cpu, mem; |
| |
| snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX"); |
| if ((fd = mkstemp(buf)) < 0) |
| goto err; |
| if ((mapfp = fdopen(fd, "w")) == NULL) |
| goto err; |
| for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { |
| mem = cpunodemap.map[cpu]; |
| if (mem < (unsigned int)nmems) |
| fprintf(mapfp, "%u %u\n", cpu, mem); |
| } |
| fclose(mapfp); |
| set_mtime(buf, cpunodemap.mtime); |
| if (rename(buf, mapfile) < 0) |
| goto err; |
| /* mkstemp() creates mode 0600 - change to world readable */ |
| (void)chmod(mapfile, 0444); |
| return; |
| err: |
| if (mapfp != NULL) { |
| fclose(mapfp); |
| fd = -1; |
| } |
| if (fd >= 0) |
| close(fd); |
| (void)unlink(buf); |
| } |
| |
| /* |
| * Load and gain thread safe access to the <cpu, node> map. |
| * |
| * Return 0 on success with flockfile(stdin) held. |
| * Each successful get_map() call must be matched with a |
| * following put_map() call to release the lock. |
| * |
| * On error, return -1 with errno set and no lock held. |
| */ |
| |
| static int get_map(void) |
| { |
| time_t file_mtime; |
| |
| flockfile(stdin); |
| |
| if (cpunodemap.map == NULL) { |
| cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int)); |
| if (cpunodemap.map == NULL) |
| goto err; |
| } |
| |
| /* If no one has a good cpunodemap, rebuild from scratch */ |
| file_mtime = get_mtime(mapfile); |
| if (cpunodemap.mtime == 0 && file_mtime == 0) |
| rebuild_map(); |
| |
| /* If either cpunodemap[] or mapfile newer, update other with it */ |
| file_mtime = get_mtime(mapfile); |
| if (cpunodemap.mtime < file_mtime) |
| load_map(); |
| else if (cpunodemap.mtime > file_mtime) |
| store_map(); |
| return 0; |
| err: |
| funlockfile(stdin); |
| return -1; |
| } |
| |
| static void put_map(void) |
| { |
| funlockfile(stdin); |
| } |
| |
| /* Set cpus to those local to Memory Nodes mems */ |
| int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus) |
| { |
| int ncpus = cpuset_cpus_nbits(); |
| unsigned int cpu; |
| |
| if (check() < 0) |
| return -1; |
| |
| if (get_map() < 0) |
| return -1; |
| bitmask_clearall(cpus); |
| for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { |
| if (bitmask_isbitset(mems, cpunodemap.map[cpu])) |
| bitmask_setbit(cpus, cpu); |
| } |
| put_map(); |
| return 0; |
| } |
| |
| /* Set mems to those local to CPUs cpus */ |
| int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems) |
| { |
| int ncpus = cpuset_cpus_nbits(); |
| unsigned int cpu; |
| |
| if (check() < 0) |
| return -1; |
| |
| if (get_map() < 0) |
| return -1; |
| bitmask_clearall(mems); |
| for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { |
| if (bitmask_isbitset(cpus, cpu)) |
| bitmask_setbit(mems, cpunodemap.map[cpu]); |
| } |
| put_map(); |
| return 0; |
| } |
| |
| /* |
| * distmap[] |
| * |
| * Flat array of distmap_entry_t of size cpumask_sz by nodemask_sz. |
| * |
| * Element distmap[cpu][mem] is the distance between CPU cpu |
| * and Memory Node mem. Distances are weighted to roughly |
| * approximate the cost of memory references, and scaled so that |
| * the distance from a CPU to its local Memory Node is ten (10). |
| * |
| * The first call to cpuset_cpumemdist() builds this map, from |
| * whatever means the kernel provides to obtain these distances. |
| * |
| * These distances derive from ACPI SLIT table entries, which are |
| * eight bits in size. |
| * |
| * Hold flockfile(stdout) while using distmap for posix thread safety. |
| */ |
| |
| typedef unsigned char distmap_entry_t; /* type of distmap[] entries */ |
| |
| static distmap_entry_t *distmap; /* maps <cpu, mem> to distance */ |
| |
| #define DISTMAP_MAX UCHAR_MAX /* maximum value in distmap[] */ |
| |
| #define I(i,j) ((i) * nmems + (j)) /* 2-D array index simulation */ |
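| |
| /* |
| * e.g. with nmems == 4, distmap[I(2, 3)] is flat element |
| * 2 * 4 + 3 == 11 - row-major order, one row per cpu. |
| */ |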
| |
| /* |
| * Parse arch neutral lines from 'distance' files of form: |
| * |
| * 46 66 10 20 |
| * |
| * The lines contain a space separated list of distances, which is parsed |
| * into array dists[] of each node's distance from the specified node. |
| * |
| * Result is placed in distmap[ncpus][nmems]: |
| * |
| * For each cpu c on node: |
| * For each node position n in list of distances: |
| * distmap[c][n] = dists[n] |
| */ |
| |
| static int parse_distmap_line(unsigned int node, char *buf) |
| { |
| char *p, *q; |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| unsigned int c, n; |
| distmap_entry_t *dists = NULL; |
| struct bitmask *cpus = NULL, *mems = NULL; |
| int ret = -1; |
| |
| p = buf; |
| if ((dists = calloc(nmems, sizeof(*dists))) == NULL) |
| goto err; |
| for (n = 0; n < (unsigned int)nmems; n++) |
| dists[n] = DISTMAP_MAX; |
| |
| for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) { |
| unsigned int d; |
| |
| if ((p = strpbrk(p, "0123456789")) == NULL) |
| break; |
| d = strtoul(p, &q, 10); |
| if (p == q) |
| break; |
| if (d < DISTMAP_MAX) |
| dists[n] = (distmap_entry_t) d; |
| } |
| |
| if ((mems = bitmask_alloc(nmems)) == NULL) |
| goto err; |
| bitmask_setbit(mems, node); |
| |
| if ((cpus = bitmask_alloc(ncpus)) == NULL) |
| goto err; |
| cpuset_localcpus(mems, cpus); |
| |
| for (c = bitmask_first(cpus); c < (unsigned int)ncpus; |
| c = bitmask_next(cpus, c + 1)) |
| for (n = 0; n < (unsigned int)nmems; n++) |
| distmap[I(c, n)] = dists[n]; |
| ret = 0; |
| /* fall into ... */ |
| err: |
| bitmask_free(mems); |
| bitmask_free(cpus); |
| free(dists); |
| return ret; |
| } |
| |
| static int parse_distance_file(unsigned int node, const char *path) |
| { |
| FILE *fp; |
| char *buf = NULL; |
| int buflen; |
| |
| if ((fp = fopen(path, "r")) == NULL) |
| goto err; |
| |
| buflen = filesize(fp); |
| |
| if ((buf = malloc(buflen)) == NULL) |
| goto err; |
| |
| if (flgets(buf, buflen, fp) == NULL) |
| goto err; |
| |
| if (parse_distmap_line(node, buf) < 0) |
| goto err; |
| |
| free(buf); |
| fclose(fp); |
| return 0; |
| err: |
| free(buf); |
| if (fp) |
| fclose(fp); |
| return -1; |
| } |
| |
| static void build_distmap(void) |
| { |
| static int tried_before = 0; |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| int c, m; |
| DIR *dir = NULL; |
| struct dirent *dent; |
| |
| if (tried_before) |
| goto err; |
| tried_before = 1; |
| |
| if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL) |
| goto err; |
| |
| for (c = 0; c < ncpus; c++) |
| for (m = 0; m < nmems; m++) |
| distmap[I(c, m)] = DISTMAP_MAX; |
| |
| if ((dir = opendir(distance_directory)) == NULL) |
| goto err; |
| while ((dent = readdir(dir)) != NULL) { |
| char buf[PATH_MAX]; |
| unsigned int node; |
| |
| if (sscanf(dent->d_name, "node%u", &node) < 1) |
| continue; |
| pathcat3(buf, sizeof(buf), distance_directory, dent->d_name, |
| "distance"); |
| if (parse_distance_file(node, buf) < 0) |
| goto err; |
| } |
| closedir(dir); |
| return; |
| err: |
| if (dir) |
| closedir(dir); |
| free(distmap); |
| distmap = NULL; |
| } |
| |
| #ifdef ALTERNATE_SN_DISTMAP |
| |
| /* |
| * Parse SN architecture specific line of form: |
| * |
| * node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10 |
| * |
| * Second field is node number. The "dist" field is the colon separated list |
| * of distances, which is parsed into array dists[] of each node's distance |
| * from that node. |
| * |
| * Result is placed in distmap[ncpus][nmems]: |
| * |
| * For each cpu c on that node: |
| * For each node position n in list of distances: |
| * distmap[c][n] = dists[n] |
| */ |
| |
| static void parse_distmap_line_sn(char *buf) |
| { |
| char *p, *pend, *q; |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| unsigned long c, n, node; |
| distmap_entry_t *dists = NULL; |
| struct bitmask *cpus = NULL, *mems = NULL; |
| |
| if ((p = strchr(buf, ' ')) == NULL) |
| goto err; |
| if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems) |
| goto err; |
| if ((p = strstr(q, " dist ")) == NULL) |
| goto err; |
| p += strlen(" dist "); |
| if ((pend = strchr(p, ' ')) != NULL) |
| *pend = '\0'; |
| if ((dists = calloc(nmems, sizeof(*dists))) == NULL) |
| goto err; |
| for (n = 0; n < (unsigned int)nmems; n++) |
| dists[n] = DISTMAP_MAX; |
| |
| for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) { |
| unsigned long d; |
| |
| if ((p = strpbrk(p, "0123456789")) == NULL) |
| break; |
| d = strtoul(p, &q, 10); |
| if (p == q) |
| break; |
| if (d < DISTMAP_MAX) |
| dists[n] = (distmap_entry_t) d; |
| } |
| |
| if ((mems = bitmask_alloc(nmems)) == NULL) |
| goto err; |
| bitmask_setbit(mems, node); |
| |
| if ((cpus = bitmask_alloc(ncpus)) == NULL) |
| goto err; |
| cpuset_localcpus(mems, cpus); |
| |
| for (c = bitmask_first(cpus); c < (unsigned int)ncpus; |
| c = bitmask_next(cpus, c + 1)) |
| for (n = 0; n < (unsigned int)nmems; n++) |
| distmap[I(c, n)] = dists[n]; |
| /* fall into ... */ |
| err: |
| bitmask_free(mems); |
| bitmask_free(cpus); |
| free(dists); |
| } |
| |
| static void build_distmap_sn(void) |
| { |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| int c, m; |
| static int tried_before = 0; |
| FILE *fp = NULL; |
| char *buf = NULL; |
| int buflen; |
| |
| if (tried_before) |
| goto err; |
| tried_before = 1; |
| |
| if ((fp = fopen(sn_topology, "r")) == NULL) |
| goto err; |
| |
| if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL) |
| goto err; |
| |
| for (c = 0; c < ncpus; c++) |
| for (m = 0; m < nmems; m++) |
| distmap[I(c, m)] = DISTMAP_MAX; |
| |
| buflen = filesize(fp); |
| if ((buf = malloc(buflen)) == NULL) |
| goto err; |
| |
| while (flgets(buf, buflen, fp) != NULL) |
| if (strprefix(buf, sn_top_node_prefix)) |
| parse_distmap_line_sn(buf); |
| |
| free(buf); |
| fclose(fp); |
| return; |
| err: |
| free(buf); |
| free(distmap); |
| distmap = NULL; |
| if (fp) |
| fclose(fp); |
| } |
| |
| #endif |
| |
| /* [optional] Hardware distance from CPU to Memory Node */ |
| unsigned int cpuset_cpumemdist(int cpu, int mem) |
| { |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| distmap_entry_t r = DISTMAP_MAX; |
| |
| flockfile(stdout); |
| |
| if (check() < 0) |
| goto err; |
| |
| if (distmap == NULL) |
| build_distmap(); |
| |
| #ifdef ALTERNATE_SN_DISTMAP |
| if (distmap == NULL) |
| build_distmap_sn(); |
| #endif |
| |
| if (distmap == NULL) |
| goto err; |
| |
| if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems) |
| goto err; |
| |
| r = distmap[I(cpu, mem)]; |
| /* fall into ... */ |
| err: |
| funlockfile(stdout); |
| return r; |
| } |
| |
| /* [optional] Return Memory Node closest to cpu */ |
| int cpuset_cpu2node(int cpu) |
| { |
| int ncpus = cpuset_cpus_nbits(); |
| int nmems = cpuset_mems_nbits(); |
| struct bitmask *cpus = NULL, *mems = NULL; |
| int r = -1; |
| |
| if (check() < 0) |
| goto err; |
| |
| if ((cpus = bitmask_alloc(ncpus)) == NULL) |
| goto err; |
| bitmask_setbit(cpus, cpu); |
| |
| if ((mems = bitmask_alloc(nmems)) == NULL) |
| goto err; |
| cpuset_localmems(cpus, mems); |
| r = bitmask_first(mems); |
| /* fall into ... */ |
| err: |
| bitmask_free(cpus); |
| bitmask_free(mems); |
| return r; |
| } |
| |
| static int apply_cpuset_settings(const char *path, const struct cpuset *cp) |
| { |
| if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) { |
| if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0) |
| goto err; |
| } |
| |
| if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) { |
| if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0) |
| goto err; |
| } |
| |
| if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) { |
| if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0) |
| goto err; |
| } |
| |
| if (cp->notify_on_release_valid && cp->notify_on_release_dirty) { |
| if (store_flag(path, "notify_on_release", cp->notify_on_release) |
| < 0) |
| goto err; |
| } |
| |
| if (cp->memory_migrate_valid && |
| cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) { |
| if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0) |
| goto err; |
| } |
| |
| if (cp->memory_pressure_enabled_valid && |
| cp->memory_pressure_enabled_dirty && |
| exists_flag(path, "memory_pressure_enabled")) { |
| if (store_flag |
| (path, "memory_pressure_enabled", |
| cp->memory_pressure_enabled) < 0) |
| goto err; |
| } |
| |
| if (cp->memory_spread_page_valid && |
| cp->memory_spread_page_dirty && |
| exists_flag(path, "memory_spread_page")) { |
| if (store_flag |
| (path, "memory_spread_page", cp->memory_spread_page) < 0) |
| goto err; |
| } |
| |
| if (cp->memory_spread_slab_valid && |
| cp->memory_spread_slab_dirty && |
| exists_flag(path, "memory_spread_slab")) { |
| if (store_flag |
| (path, "memory_spread_slab", cp->memory_spread_slab) < 0) |
| goto err; |
| } |
| |
| if (cp->sched_load_balance_valid && |
| cp->sched_load_balance_dirty && |
| exists_flag(path, "sched_load_balance")) { |
| if (store_flag |
| (path, "sched_load_balance", cp->sched_load_balance) < 0) |
| goto err; |
| } |
| |
| if (cp->sched_relax_domain_level_valid && |
| cp->sched_relax_domain_level_dirty && |
| exists_flag(path, "sched_relax_domain_level")) { |
| if (store_number |
| (path, "sched_relax_domain_level", |
| cp->sched_relax_domain_level) < 0) |
| goto err; |
| } |
| |
| if (cp->cpus_valid && cp->cpus_dirty) { |
| if (store_mask(path, "cpus", cp->cpus) < 0) |
| goto err; |
| } |
| |
| if (cp->mems_valid && cp->mems_dirty) { |
| if (store_mask(path, "mems", cp->mems) < 0) |
| goto err; |
| } |
| return 0; |
| err: |
| return -1; |
| } |
| |
| /* |
| * get_siblings() - helper routine for cpuset_would_crash_kernel(), below. |
| * |
| * Extract max value of any 'siblings' field in /proc/cpuinfo. |
| * Cache the result - only need to extract once in lifetime of task. |
| * |
| * The siblings field is the number of logical CPUs in a physical |
| * processor package. It is equal to the product of the number of |
| * cores in that package, times the number of hyper-threads per core. |
| * The bug that cpuset_would_crash_kernel() is detecting arises |
| * when a cpu_exclusive cpuset tries to include just some, not all, |
| * of the sibling logical CPUs available in a processor package. |
| * |
| * In the improbable case that a system has mixed values of siblings |
| * (some processor packages have more than others, perhaps due to |
| * partially enabling Hyper-Threading), we take the worst case value, |
| * the largest siblings value. This might be overkill. I don't know |
| * if this kernel bug considers each processor package's siblings |
| * separately or not. But it sure is easier this way ... |
| * |
| * This routine takes about 0.7 msecs on a 4 CPU 2.8 GHz Xeon, from |
| * open to close, the first time called. |
| */ |
| |
| static int get_siblings(void) |
| { |
| static int siblings; |
| char buf[32]; /* big enough for one 'siblings' line */ |
| FILE *fp; |
| |
| if (siblings) |
| return siblings; |
| |
| if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) |
| return 4; /* wing it - /proc not mounted ? */ |
| while (flgets(buf, sizeof(buf), fp) != NULL) { |
| int s; |
| |
| if (sscanf(buf, "siblings : %d", &s) < 1) |
| continue; |
| if (s > siblings) |
| siblings = s; |
| } |
| fclose(fp); |
| if (siblings == 0) |
| siblings = 1; /* old kernel, no siblings, default to 1 */ |
| return siblings; |
| } |
| |
| /* |
| * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic |
| * scheduler domain code invoked for cpu_exclusive cpusets that causes |
| * the kernel to freeze, requiring a hardware reset. |
| * |
| * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive' |
| * cpuset is defined where that cpusets 'cpus' are not on package |
| * boundaries then the kernel will freeze, usually as soon as this |
| * cpuset is created, requiring a hardware reset. |
| * |
| * A cpuset's 'cpus' are not on package boundaries if the cpuset |
| * includes a proper non-empty subset (some, but not all) of the |
| * logical cpus on a processor package. This requires multiple |
| * logical CPUs per package, available with either Hyper-Thread or |
| * Multi-Core support. Without one of these features, there is only |
| * one logical CPU per physical package, and it's not possible to |
| * have a proper, non-empty subset of a set of cardinality one. |
| * |
| * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC |
| * on i386 and x86_64 arch's. |
| * |
| * The objective of this routine cpuset_would_crash_kernel() is to |
| * determine if a proposed cpuset setting would crash the kernel due |
| * to this bug, so that the caller can avoid the crash. |
| * |
| * Ideally we'd check for exactly these conditions here, but computing |
| * the package (identified by the 'physical id' field of /proc/cpuinfo) |
| * of each cpu in a cpuset is more effort than it's worth here. |
| * |
| * Also there is no obvious way to identify exactly whether the kernel |
| * one is executing on has this bug, short of trying it and seeing |
| * whether the kernel crashes. |
| * |
| * So for now, we look for a simpler set of conditions, that meets |
| * our immediate need - avoid this crash on SUSE SLES10 systems that |
| * are susceptible to it. We look for the kernel version 2.6.16.*, |
| * which is the base kernel of SUSE SLES10, and for i386 or x86_64 |
| * processors, which had CONFIG_SCHED_MC enabled. |
| * |
| * If these simpler conditions are met, we further simplify the check, |
| * by presuming that the logical CPUs are numbered on processor |
| * package boundaries. If each package has S siblings, we assume |
| * that CPUs numbered N through N + S - 1 are on the same package, |
| * for any CPU N such that N mod S == 0. |
| * |
| * Yes, this is a hack, focused on avoiding kernel freezes on |
| * susceptible SUSE SLES10 systems. |
| */ |
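| |
| /* |
| * Worked example of the check below, assuming S == 2 siblings: |
| * CPUs {0,1} are presumed to share one package and {2,3} another. |
| * A cpu_exclusive cpuset with cpus == {1,2} sets just one sibling |
| * in each pair (num_set == 1, neither 0 nor S), so it is rejected; |
| * cpus == {2,3} covers a whole package and passes. |
| */ |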
| |
| static int cpuset_would_crash_kernel(const struct cpuset *cp) |
| { |
| static int susceptible_system = -1; |
| |
| if (!cp->cpu_exclusive) |
| goto ok; |
| |
| if (susceptible_system == -1) { |
| struct utsname u; |
| int rel_2_6_16, arch_i386, arch_x86_64; |
| |
| if (uname(&u) < 0) |
| goto fail; |
| rel_2_6_16 = strprefix(u.release, "2.6.16."); |
| arch_i386 = streq(u.machine, "i386"); |
| arch_x86_64 = streq(u.machine, "x86_64"); |
| susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64); |
| } |
| |
| if (susceptible_system) { |
| int ncpus = cpuset_cpus_nbits(); |
| int siblings = get_siblings(); |
| unsigned int cpu; |
| |
| for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) { |
| int s, num_set = 0; |
| |
| for (s = 0; s < siblings; s++) { |
| if (bitmask_isbitset(cp->cpus, cpu + s)) |
| num_set++; |
| } |
| |
| /* If none or all siblings set, we're still ok */ |
| if (num_set == 0 || num_set == siblings) |
| continue; |
| |
| /* Found one that would crash kernel. Fail. */ |
| errno = ENXIO; |
| goto fail; |
| } |
| } |
| /* If not susceptible, or if all ok, fall into "ok" ... */ |
| ok: |
| return 0; /* would not crash */ |
| fail: |
| return 1; /* would crash */ |
| } |
| |
| /* Compare two cpusets and mark the dirty variables */ |
| static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2) |
| { |
| if (cp1->cpu_exclusive_valid && |
| cp1->cpu_exclusive != cp2->cpu_exclusive) |
| cp1->cpu_exclusive_dirty = 1; |
| |
| if (cp1->mem_exclusive_valid && |
| cp1->mem_exclusive != cp2->mem_exclusive) |
| cp1->mem_exclusive_dirty = 1; |
| |
| if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall) |
| cp1->mem_hardwall_dirty = 1; |
| |
| if (cp1->notify_on_release_valid && |
| cp1->notify_on_release != cp2->notify_on_release) |
| cp1->notify_on_release_dirty = 1; |
| |
| if (cp1->memory_migrate_valid && |
| cp1->memory_migrate != cp2->memory_migrate) |
| cp1->memory_migrate_dirty = 1; |
| |
| if (cp1->memory_pressure_enabled_valid && |
| cp1->memory_pressure_enabled != cp2->memory_pressure_enabled) |
| cp1->memory_pressure_enabled_dirty = 1; |
| |
| if (cp1->memory_spread_page_valid && |
| cp1->memory_spread_page != cp2->memory_spread_page) |
| cp1->memory_spread_page_dirty = 1; |
| |
| if (cp1->memory_spread_slab_valid && |
| cp1->memory_spread_slab != cp2->memory_spread_slab) |
| cp1->memory_spread_slab_dirty = 1; |
| |
| if (cp1->sched_load_balance_valid && |
| cp1->sched_load_balance != cp2->sched_load_balance) |
| cp1->sched_load_balance_dirty = 1; |
| |
| if (cp1->sched_relax_domain_level_valid && |
| cp1->sched_relax_domain_level != cp2->sched_relax_domain_level) |
| cp1->sched_relax_domain_level_dirty = 1; |
| |
| if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus)) |
| cp1->cpus_dirty = 1; |
| if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems)) |
| cp1->mems_dirty = 1; |
| } |
| |
| /* Create (if new set) or modify cpuset 'cp' at location 'relpath' */ |
| static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new) |
| { |
| char buf[PATH_MAX]; |
| int do_rmdir_on_err = 0; |
| int do_restore_cp_sav_on_err = 0; |
| struct cpuset *cp_sav = NULL; |
| int sav_errno; |
| |
| if (check() < 0) |
| goto err; |
| |
| if (cpuset_would_crash_kernel(cp)) |
| goto err; |
| |
| fullpath(buf, sizeof(buf), relpath); |
| |
| if (new) { |
| if (mkdir(buf, 0755) < 0) |
| goto err; |
| /* we made it, so we should remove it on error */ |
| do_rmdir_on_err = 1; |
| } |
| |
| if ((cp_sav = cpuset_alloc()) == NULL) |
| goto err; |
| if (cpuset_query(cp_sav, relpath) < 0) |
| goto err; |
| /* we have old settings to restore on error */ |
| do_restore_cp_sav_on_err = 1; |
| |
| /* note which variables would need to be restored on error */ |
| mark_dirty_variable(cp_sav, cp); |
| |
| if (apply_cpuset_settings(buf, cp) < 0) |
| goto err; |
| |
| cpuset_free(cp_sav); |
| return 0; |
| err: |
| sav_errno = errno; |
| if (do_restore_cp_sav_on_err) |
| (void)apply_cpuset_settings(buf, cp_sav); |
| if (cp_sav) |
| cpuset_free(cp_sav); |
| if (do_rmdir_on_err) |
| (void)rmdir(buf); |
| errno = sav_errno; |
| return -1; |
| } |
| |
| /* Create cpuset 'cp' at location 'relpath' */ |
| int cpuset_create(const char *relpath, const struct cpuset *cp) |
| { |
| return cr_or_mod(relpath, cp, 1); |
| } |
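| |
| /* |
| * Illustrative sketch (not part of the library) of typical |
| * cpuset_create() usage, relying on the cpuset_setcpus() and |
| * cpuset_setmems() setters defined earlier in this library; the |
| * "/demo" path and the chosen bits are hypothetical, and error |
| * handling is shortened: |
| * |
| * struct cpuset *cp = cpuset_alloc(); |
| * struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits()); |
| * struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits()); |
| * |
| * bitmask_setbit(cpus, 0); |
| * bitmask_setbit(mems, 0); |
| * cpuset_setcpus(cp, cpus); |
| * cpuset_setmems(cp, mems); |
| * if (cpuset_create("/demo", cp) < 0) |
| * perror("cpuset_create"); |
| * |
| * bitmask_free(cpus); |
| * bitmask_free(mems); |
| * cpuset_free(cp); |
| */ |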
| |
| /* Delete cpuset at location 'relpath' (if empty) */ |
| int cpuset_delete(const char *relpath) |
| { |
| char buf[PATH_MAX]; |
| |
| if (check() < 0) |
| goto err; |
| |
| fullpath(buf, sizeof(buf), relpath); |
| if (rmdir(buf) < 0) |
| goto err; |
| |
| return 0; |
| err: |
| return -1; |
| } |
| |
| /* Set cpuset cp to the cpuset at location 'relpath' */ |
| int cpuset_query(struct cpuset *cp, const char *relpath) |
| { |
| char buf[PATH_MAX]; |
| |
| if (check() < 0) |
| goto err; |
| |
| fullpath(buf, sizeof(buf), relpath); |
| |
| if (load_flag(buf, &cp->cpu_exclusive, "cpuset.cpu_exclusive") < 0) |
| goto err; |
| cp->cpu_exclusive_valid = 1; |
| |
| if (load_flag(buf, &cp->mem_exclusive, "cpuset.mem_exclusive") < 0) |
| goto err; |
| cp->mem_exclusive_valid = 1; |
| |
| if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0) |
| goto err; |
| cp->notify_on_release_valid = 1; |
| |
| if (exists_flag(buf, "cpuset.memory_migrate")) { |
| if (load_flag(buf, &cp->memory_migrate, "cpuset.memory_migrate") < 0) |
| goto err; |
| cp->memory_migrate_valid = 1; |
| } |
| |
| if (exists_flag(buf, "cpuset.mem_hardwall")) { |
| if (load_flag(buf, &cp->mem_hardwall, "cpuset.mem_hardwall") < 0) |
| goto err; |
| cp->mem_hardwall_valid = 1; |
| } |
| |
| if (exists_flag(buf, "cpuset.memory_pressure_enabled")) { |
| if (load_flag |
| (buf, &cp->memory_pressure_enabled, |
| "cpuset.memory_pressure_enabled") < 0) |
| goto err; |
| cp->memory_pressure_enabled_valid = 1; |
| } |
| |
| if (exists_flag(buf, "cpuset.memory_spread_page")) { |
| if (load_flag |
| (buf, &cp->memory_spread_page, "cpuset.memory_spread_page") < 0) |
| goto err; |
| cp->memory_spread_page_valid = 1; |
| } |
| |
| if (exists_flag(buf, "cpuset.memory_spread_slab")) { |
| if (load_flag |
| (buf, &cp->memory_spread_slab, "cpuset.memory_spread_slab") < 0) |
| goto err; |
| cp->memory_spread_slab_valid = 1; |
| } |
| |
| if (exists_flag(buf, "cpuset.sched_load_balance")) { |
| if (load_flag |
| (buf, &cp->sched_load_balance, "cpuset.sched_load_balance") < 0) |
| goto err; |
| cp->sched_load_balance_valid = 1; |
| } |
| |
| if (exists_flag(buf, "cpuset.sched_relax_domain_level")) { |
| if (load_number |
| (buf, &cp->sched_relax_domain_level, |
| "cpuset.sched_relax_domain_level") < 0) |
| goto err; |
| cp->sched_relax_domain_level_valid = 1; |
| } |
| |
| if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpuset.cpus") < 0) |
| goto err; |
| cp->cpus_valid = 1; |
| |
| if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "cpuset.mems") < 0) |
| goto err; |
| cp->mems_valid = 1; |
| |
| return 0; |
| err: |
| return -1; |
| } |
| |
| /* Modify cpuset at location 'relpath' to values of 'cp' */ |
| int cpuset_modify(const char *relpath, const struct cpuset *cp) |
| { |
| return cr_or_mod(relpath, cp, 0); |
| } |
| |
| /* Get cpuset path of pid into buf */ |
| char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size) |
| { |
| int fd; /* dual use: cpuset file for pid and self */ |
| int rc; /* dual use: snprintf and read return codes */ |
| |
| if (check() < 0) |
| return NULL; |
| |
| /* borrow result buf[] to build cpuset file path */ |
| if (pid == 0) |
| rc = snprintf(buf, size, "/proc/self/cpuset"); |
| else |
| rc = snprintf(buf, size, "/proc/%d/cpuset", pid); |
| if (rc >= (int)size) { |
| errno = E2BIG; |
| return NULL; |
| } |
| if ((fd = open(buf, O_RDONLY)) < 0) { |
| int e = errno; |
| if (e == ENOENT) |
| e = ESRCH; |
| if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0) |
| e = ENOSYS; |
| else |
| close(fd); |
| errno = e; |
| return NULL; |
| } |
| rc = read(fd, buf, size); |
| close(fd); |
| if (rc < 0) |
| return NULL; |
| if (rc >= (int)size) { |
| errno = E2BIG; |
| return NULL; |
| } |
| buf[rc] = 0; |
| chomp(buf); |
| return buf; |
| } |
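| |
| /* |
| * Minimal usage sketch (illustrative only; pid 0 means the |
| * current task): |
| * |
| * char path[PATH_MAX]; |
| * |
| * if (cpuset_getcpusetpath(0, path, sizeof(path)) == NULL) |
| * perror("cpuset_getcpusetpath"); |
| * else |
| * printf("current cpuset: %s\n", path); |
| */ |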
| |
| /* Get cpuset 'cp' of pid */ |
| int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid) |
| { |
| char buf[PATH_MAX]; |
| |
| if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL) |
| return -1; |
| if (cpuset_query(cp, buf) < 0) |
| return -1; |
| return 0; |
| } |
| |
| /* [optional] Return mountpoint of cpuset filesystem */ |
| const char *cpuset_mountpoint(void) |
| { |
| if (check() < 0) { |
| switch (errno) { |
| case ENODEV: |
| return "[cpuset filesystem not mounted]"; |
| default: |
| return "[cpuset filesystem not supported]"; |
| } |
| } |
| return cpusetmnt; |
| } |
| |
| /* Return true if path is a directory. */ |
| static int isdir(const char *path) |
| { |
| struct stat statbuf; |
| |
| if (stat(path, &statbuf) < 0) |
| return 0; |
| return S_ISDIR(statbuf.st_mode); |
| } |
| |
| /* |
| * [optional] cpuset_collides_exclusive() - True if would collide exclusive. |
| * |
| * Return true iff the specified cpuset would overlap with any |
| * sibling cpusets in either cpus or mems, where either this |
| * cpuset or the sibling is cpu_exclusive or mem_exclusive. |
| * |
| * cpuset_create() fails with errno == EINVAL if the requested cpuset |
| * would overlap with any sibling, where either one is cpu_exclusive or |
| * mem_exclusive. This is a common, and not obvious, error. The |
| * following routine checks for this particular case, so that code |
| * creating cpusets can better identify the situation, perhaps to issue |
| * a more informative error message. |
| * |
| * Can also be used to diagnose cpuset_modify failures. This |
| * routine ignores any existing cpuset with the same path as the |
| * given 'cpusetpath', and only looks for exclusive collisions with |
| * sibling cpusets of that path. |
| * |
| * In case of any error, returns (0) -- does not collide. Presumably |
| * any actual attempt to create or modify a cpuset will encounter the |
| * same error, and report it usefully. |
| * |
| * This routine is not particularly efficient; most likely code creating or |
| * modifying a cpuset will want to try the operation first, and then if that |
| * fails with errno EINVAL, perhaps call this routine to determine if an |
| * exclusive cpuset collision caused the error. |
| */ |
| |
| int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1) |
| { |
| char parent[PATH_MAX]; |
| char *p; |
| char *pathcopy = NULL; |
| char *base; |
| DIR *dir = NULL; |
| struct dirent *dent; |
| struct cpuset *cp2 = NULL; |
| struct bitmask *cpus1 = NULL, *cpus2 = NULL; |
| struct bitmask *mems1 = NULL, *mems2 = NULL; |
| int ret; |
| |
| if (check() < 0) |
| goto err; |
| |
| fullpath(parent, sizeof(parent), cpusetpath); |
| if (streq(parent, cpusetmnt)) |
| goto err; /* only one cpuset root - can't collide */ |
| pathcopy = strdup(parent); |
| p = strrchr(parent, '/'); |
| if (!p) |
| goto err; /* huh? - impossible - run and hide */ |
| *p = 0; /* now parent is dirname of fullpath */ |
| |
| p = strrchr(pathcopy, '/'); |
| base = p + 1; /* now base is basename of fullpath */ |
| if (!*base) |
| goto err; /* this is also impossible - run away */ |
| |
| if ((dir = opendir(parent)) == NULL) |
| goto err; |
| if ((cp2 = cpuset_alloc()) == NULL) |
| goto err; |
| if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) |
| goto err; |
| if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) |
| goto err; |
| if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL) |
| goto err; |
| if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL) |
| goto err; |
| |
| while ((dent = readdir(dir)) != NULL) { |
| char child[PATH_MAX]; |
| |
| if (streq(dent->d_name, ".") || streq(dent->d_name, "..")) |
| continue; |
| if (streq(dent->d_name, base)) |
| continue; |
| pathcat2(child, sizeof(child), parent, dent->d_name); |
| if (!isdir(child)) |
| continue; |
| if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0) |
| goto err; |
| if (cp1->cpu_exclusive || cp2->cpu_exclusive) { |
| cpuset_getcpus(cp1, cpus1); |
| cpuset_getcpus(cp2, cpus2); |
| if (bitmask_intersects(cpus1, cpus2)) |
| goto collides; |
| } |
| if (cp1->mem_exclusive || cp2->mem_exclusive) { |
| cpuset_getmems(cp1, mems1); |
| cpuset_getmems(cp2, mems2); |
| if (bitmask_intersects(mems1, mems2)) |
| goto collides; |
| } |
| } |
| err: |
| /* error, or did not collide */ |
| ret = 0; |
| goto done; |
| collides: |
| /* collides */ |
| ret = 1; |
| /* fall into ... */ |
| done: |
| if (dir) |
| closedir(dir); |
| cpuset_free(cp2); |
| free(pathcopy); |
| bitmask_free(cpus1); |
| bitmask_free(cpus2); |
| bitmask_free(mems1); |
| bitmask_free(mems2); |
| return ret; |
| } |
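| |
| /* |
| * Sketch of the usage pattern recommended above: attempt the |
| * operation first, and consult this routine only on EINVAL ('cp' |
| * and the "/demo" path are hypothetical): |
| * |
| * if (cpuset_create("/demo", cp) < 0 && errno == EINVAL && |
| * cpuset_collides_exclusive("/demo", cp)) |
| * fprintf(stderr, "exclusive cpuset collision\n"); |
| */ |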
| |
| /* |
| * [optional] cpuset_nuke() - Remove cpuset any way possible |
| * |
| * Remove a cpuset, including killing tasks in it, and |
| * removing any descendant cpusets and killing their tasks. |
| * |
| * Tasks can take a long time (minutes on some configurations) |
| * to exit. Loop up to 'seconds' seconds, trying to kill them. |
| * |
| * How we do it: |
| * 1) First, kill all the pids, looping until there are |
| * no more pids in this cpuset or below, or until the |
| * 'seconds' timeout limit is exceeded. |
| * 2) Then depth first recursively rmdir the cpuset directories. |
| * 3) If by this point the original cpuset is gone, we succeeded. |
| * |
| * If the timeout is exceeded, and tasks still exist, fail with |
| * errno == ETIME. |
| * |
| * We sleep a variable amount of time. After the first attempt to |
| * kill all the tasks in the cpuset or its descendants, we sleep 1 |
| * second, the next time 2 seconds, increasing 1 second each loop |
| * up to a max of 10 seconds. If more loops past 10 are required |
| * to kill all the tasks, we sleep 10 seconds each subsequent loop. |
| * In any case, before the last loop, we sleep however many seconds |
| * remain of the original timeout 'seconds' requested. The total |
| * time of all sleeps will be no more than the requested 'seconds'. |
| * |
| * If the cpuset started out empty of any tasks, or if the passed in |
| * 'seconds' was zero, then this routine will return quickly, having |
| * not slept at all. Otherwise, this routine will at a minimum send |
| * a SIGKILL to all the tasks in this cpuset subtree, then sleep one |
| * second, before looking to see if any tasks remain. If tasks remain |
| * in the cpuset subtree, and a longer 'seconds' timeout was requested |
| * (more than one), it will continue to kill remaining tasks and sleep, |
| * in a loop, for as long as time and tasks remain. |
| * |
| * The signal sent for the kill is hardcoded to SIGKILL (9). If some |
| * other signal should be sent first, use a separate code loop, |
| * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to |
| * scan the task pids in a cpuset. If SIGKILL should -not- be sent, |
| * this cpuset_nuke() routine can still be called to recursively |
| * remove a cpuset subtree, by specifying a timeout of zero 'seconds'. |
| * |
| * On success, returns 0 with errno == 0. |
| * |
| * On failure, returns -1, with errno possibly one of: |
| * EACCES - search permission denied on intervening directory |
| * ETIME - timed out - tasks remain after 'seconds' timeout |
| * EMFILE - too many open files |
| * ENODEV - /dev/cpuset not mounted |
| * ENOENT - component of cpuset path doesn't exist |
| * ENOMEM - out of memory |
| * ENOSYS - kernel doesn't support cpusets |
| * ENOTDIR - component of cpuset path is not a directory |
| * EPERM - lacked permission to kill a task |
| * EPERM - lacked permission to read cpusets or files therein |
| */ |
| |
| void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree); |
| |
| int cpuset_nuke(const char *relpath, unsigned int seconds) |
| { |
| unsigned int secs_left = seconds; /* total sleep seconds left */ |
| unsigned int secs_loop = 1; /* how much sleep next loop */ |
| unsigned int secs_slept; /* seconds slept in sleep() */ |
| struct cpuset_pidlist *pl = NULL; /* pids in cpuset subtree */ |
| struct cpuset_fts_tree *cs_tree; |
| const struct cpuset_fts_entry *cs_entry; |
| int ret, sav_errno = 0; |
| |
| if (check() < 0) |
| return -1; |
| |
| if (seconds == 0) |
| goto rmdir_cpusets; |
| |
| while (1) { |
| int plen, j; |
| |
| if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) { |
| /* missing cpuset is as good as if already nuked */ |
| if (errno == ENOENT) { |
| ret = 0; |
| goto no_more_cpuset; |
| } |
| |
| /* other problems reading cpuset are bad news */ |
| sav_errno = errno; |
| goto failed; |
| } |
| |
| if ((plen = cpuset_pidlist_length(pl)) == 0) |
| goto rmdir_cpusets; |
| |
| for (j = 0; j < plen; j++) { |
| pid_t pid; |
| |
| if ((pid = cpuset_get_pidlist(pl, j)) > 1) { |
| if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { |
| sav_errno = errno; |
| goto failed; |
| } |
| } |
| } |
| |
| if (secs_left == 0) |
| goto took_too_long; |
| |
| cpuset_freepidlist(pl); |
| pl = NULL; |
| |
| secs_slept = secs_loop - sleep(secs_loop); |
| |
| /* Ensure forward progress */ |
| if (secs_slept == 0) |
| secs_slept = 1; |
| |
| /* Ensure sane sleep() return (unnecessary?) */ |
| if (secs_slept > secs_loop) |
| secs_slept = secs_loop; |
| |
| secs_left -= secs_slept; |
| |
| if (secs_loop < 10) |
| secs_loop++; |
| |
| secs_loop = MIN(secs_left, secs_loop); |
| } |
| |
| took_too_long: |
| sav_errno = ETIME; |
| /* fall into ... */ |
| failed: |
| cpuset_freepidlist(pl); |
| errno = sav_errno; |
| return -1; |
| |
| rmdir_cpusets: |
| /* Let's try removing cpuset(s) now. */ |
| cpuset_freepidlist(pl); |
| |
| if ((cs_tree = cpuset_fts_open(relpath)) == NULL) { |
| /* missing cpuset is as good as if already nuked */ |
| if (errno == ENOENT) { |
| ret = 0; |
| goto no_more_cpuset; |
| } |
| return -1; |
| } |
| ret = 0; |
| cpuset_fts_reverse(cs_tree); /* rmdir's must be done bottom up */ |
| while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { |
| char buf[PATH_MAX]; |
| |
| fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry)); |
| if (rmdir(buf) < 0 && errno != ENOENT) { |
| sav_errno = errno; |
| ret = -1; |
| } |
| } |
| cpuset_fts_close(cs_tree); |
| /* fall into ... */ |
| no_more_cpuset: |
| if (ret == 0) |
| errno = 0; |
| else |
| errno = sav_errno; |
| return ret; |
| } |
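| |
| /* |
| * Usage sketch (hypothetical "/demo" subtree): allow up to 30 |
| * seconds for tasks to exit; pass 0 seconds to remove an already |
| * empty subtree without signaling anyone: |
| * |
| * if (cpuset_nuke("/demo", 30) < 0 && errno == ETIME) |
| * fprintf(stderr, "tasks still running\n"); |
| */ |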
| |
| /* |
| * When recursively reading all the tasks files from a subtree, |
| * chain together the read results, one pidblock per tasks file, |
| * containing the raw unprocessed ascii as read(2) in. After |
| * we gather up this raw data, we then go back to count how |
| * many pid's there are in total, allocate an array of pid_t |
| * of that size, and transform the raw ascii data into this |
| * array of pid_t's. |
| */ |
| |
| struct pidblock { |
| char *buf; |
| int buflen; |
| struct pidblock *next; |
| }; |
| |
| /* |
| * Chain the raw contents of a file onto the pbhead list. |
| * |
| * We malloc "+ 1" extra byte for a nul-terminator, so that |
| * the strtoul() loop in pid_transform() won't scan past |
| * the end of pb->buf[] and accidentally find more pids. |
| */ |
| static void add_pidblock(const char *file, struct pidblock **ppbhead) |
| { |
| FILE *fp = NULL; |
| struct pidblock *pb = NULL; |
| int fsz; |
| size_t n; |
| |
| if ((fp = fopen(file, "r")) == NULL) |
| goto err; |
| fsz = filesize(fp); |
| if (fsz == 0) |
| goto err; |
| if ((pb = calloc(1, sizeof(*pb))) == NULL) |
| goto err; |
| pb->buflen = fsz; |
| if ((pb->buf = malloc(pb->buflen + 1)) == NULL) |
| goto err; |
| n = fread(pb->buf, 1, pb->buflen, fp); |
| if (n > 0) { |
| pb->buflen = n; /* a short read is still usable */ |
| pb->buf[n] = '\0'; |
| pb->next = *ppbhead; |
| *ppbhead = pb; |
| } else { |
| /* nothing read - don't leak the unused block */ |
| free(pb->buf); |
| free(pb); |
| } |
| fclose(fp); |
| return; |
| err: |
| if (fp) |
| fclose(fp); |
| free(pb); |
| } |
| |
| static void read_task_file(const char *relpath, struct pidblock **ppbhead) |
| { |
| char buf[PATH_MAX]; |
| |
| fullpath2(buf, sizeof(buf), relpath, "tasks"); |
| add_pidblock(buf, ppbhead); |
| } |
| |
| struct cpuset_pidlist { |
| pid_t *pids; |
| int npids; |
| }; |
| |
| /* Count how many pids are in buf (one per line - just count newlines) */ |
| static int pidcount(const char *buf, int buflen) |
| { |
| int n = 0; |
| const char *cp; |
| |
| for (cp = buf; cp < buf + buflen; cp++) { |
| if (*cp == '\n') |
| n++; |
| } |
| return n; |
| } |
| |
| /* Transform one-per-line ascii pids in pb to pid_t entries in pl */ |
| static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n) |
| { |
| char *a, *b; |
| |
| for (a = pb->buf; a < pb->buf + pb->buflen; a = b) { |
| pid_t p = strtoul(a, &b, 10); |
| if (a == b) |
| break; |
| pl->pids[n++] = p; |
| } |
| return n; |
| } |
| |
| static void free_pidblocks(struct pidblock *pbhead) |
| { |
| struct pidblock *pb, *nextpb; |
| |
| for (pb = pbhead; pb; pb = nextpb) { |
| nextpb = pb->next; |
| free(pb->buf); |
| free(pb); |
| } |
| } |
| |
| /* numeric comparison routine for qsort */ |
| static int numericsort(const void *m1, const void *m2) |
| { |
| pid_t p1 = *(pid_t *) m1; |
| pid_t p2 = *(pid_t *) m2; |
| |
| return p1 - p2; |
| } |
| |
| /* Return list of pids in cpuset 'relpath' */ |
| struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath, |
| int recursiveflag) |
| { |
| struct pidblock *pb = NULL; |
| struct cpuset_pidlist *pl = NULL; |
| struct pidblock *pbhead = NULL; |
| int n; |
| |
| if (check() < 0) |
| goto err; |
| |
| if (recursiveflag) { |
| struct cpuset_fts_tree *cs_tree; |
| const struct cpuset_fts_entry *cs_entry; |
| |
| if ((cs_tree = cpuset_fts_open(relpath)) == NULL) |
| goto err; |
| while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { |
| if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET) |
| continue; |
| read_task_file(cpuset_fts_get_path(cs_entry), &pbhead); |
| } |
| cpuset_fts_close(cs_tree); |
| } else { |
| read_task_file(relpath, &pbhead); |
| } |
| |
| if ((pl = calloc(1, sizeof(*pl))) == NULL) |
| goto err; |
| pl->npids = 0; |
| for (pb = pbhead; pb; pb = pb->next) |
| pl->npids += pidcount(pb->buf, pb->buflen); |
| if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL) |
| goto err; |
| n = 0; |
| for (pb = pbhead; pb; pb = pb->next) |
| n = pid_transform(pb, pl, n); |
| free_pidblocks(pbhead); |
| qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort); |
| return pl; |
| err: |
| cpuset_freepidlist(pl); |
| free_pidblocks(pbhead); |
| return NULL; |
| } |
| |
| /* Return number of elements in pidlist */ |
| int cpuset_pidlist_length(const struct cpuset_pidlist *pl) |
| { |
| if (pl) |
| return pl->npids; |
| else |
| return 0; |
| } |
| |
| /* Return i'th element of pidlist */ |
| pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i) |
| { |
| if (pl && i >= 0 && i < pl->npids) |
| return pl->pids[i]; |
| else |
| return (pid_t) - 1; |
| } |
| |
| /* Free pidlist */ |
| void cpuset_freepidlist(struct cpuset_pidlist *pl) |
| { |
| if (pl && pl->pids) |
| free(pl->pids); |
| free(pl); |
| } |
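| |
| /* |
| * Sketch of the pidlist calls above (illustrative only): list every |
| * task in a hypothetical "/demo" cpuset and all cpusets below it |
| * (recursive flag set): |
| * |
| * struct cpuset_pidlist *pl; |
| * int i; |
| * |
| * if ((pl = cpuset_init_pidlist("/demo", 1)) != NULL) { |
| * for (i = 0; i < cpuset_pidlist_length(pl); i++) |
| * printf("%d\n", (int)cpuset_get_pidlist(pl, i)); |
| * cpuset_freepidlist(pl); |
| * } |
| */ |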
| |
| static int __cpuset_move(pid_t pid, const char *path) |
| { |
| char buf[SMALL_BUFSZ]; |
| |
| snprintf(buf, sizeof(buf), "%d", (int)pid); |
| return write_string_file(path, buf); |
| } |
| |
| /* Move task (pid == 0 for current) to a cpuset */ |
| int cpuset_move(pid_t pid, const char *relpath) |
| { |
| char buf[PATH_MAX]; |
| |
| if (check() < 0) |
| return -1; |
| |
| if (pid == 0) |
| pid = getpid(); |
| |
| fullpath2(buf, sizeof(buf), relpath, "tasks"); |
| return __cpuset_move(pid, buf); |
| } |
| |
| /* Move all tasks in pidlist to a cpuset */ |
| int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath) |
| { |
| int i; |
| char buf[PATH_MAX]; |
| int ret; |
| |
| if (check() < 0) |
| return -1; |
| |
| fullpath2(buf, sizeof(buf), relpath, "tasks"); |
| |
| ret = 0; |
| for (i = 0; i < pl->npids; i++) |
| if (__cpuset_move(pl->pids[i], buf) < 0) |
| ret = -1; |
| return ret; |
| } |
| |
| /* |
| * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a |
| * cpuset to another cpuset |
| * |
| * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may |
| * race with tasks being added to or forking into fromrelpath. Loop |
| * repeatedly, reading the tasks file of cpuset fromrelpath and writing |
| * any task pid's found there to the tasks file of cpuset torelpath, |
| * up to ten attempts, or until the tasks file of cpuset fromrelpath |
| * is empty, or until fromrelpath is no longer present. |
| * |
| * Returns 0 with errno == 0 if able to empty the tasks file of cpuset |
| * fromrelpath. Of course it is still possible that some independent |
| * task could add another task to cpuset fromrelpath at the same time |
| * that such a successful result is being returned, so there can be |
| * no guarantee that a successful return means that fromrelpath is |
| * still empty of tasks. |
| * |
| * We are careful to allow for the possibility that the cpuset |
| * fromrelpath might disappear out from under us, perhaps because it |
| * has notify_on_release set and gets automatically removed as soon |
| * as we detach its last task from it. Consider a missing fromrelpath |
| * to be a successful move. |
| * |
| * If called with fromrelpath and torelpath pathnames that evaluate to |
| * the same cpuset, then treat that as if cpuset_reattach() was called, |
| * rebinding each task in this cpuset one time, and return success or |
| * failure depending on the return of that cpuset_reattach() call. |
| * |
| * On failure, returns -1, with errno possibly one of: |
| * EACCES - search permission denied on intervening directory |
| * ENOTEMPTY - tasks remain after multiple attempts to move them |
| * EMFILE - too many open files |
| * ENODEV - /dev/cpuset not mounted |
| * ENOENT - component of cpuset path doesn't exist |
| * ENOMEM - out of memory |
| * ENOSYS - kernel doesn't support cpusets |
| * ENOTDIR - component of cpuset path is not a directory |
| * EPERM - lacked permission to kill a task |
| * EPERM - lacked permission to read cpusets or files therein |
| * |
| * This is an [optional] function. Use cpuset_function to invoke it. |
| */ |
| |
| #define NUMBER_MOVE_TASK_ATTEMPTS 10 |
| |
| int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath) |
| { |
| char fromfullpath[PATH_MAX]; |
| char tofullpath[PATH_MAX]; |
| int i; |
| struct cpuset_pidlist *pl = NULL; |
| int sav_errno; |
| |
| fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath); |
| fullpath(tofullpath, sizeof(tofullpath), torelpath); |
| |
| if (samefile(fromfullpath, tofullpath)) |
| return cpuset_reattach(fromrelpath); |
| |
| for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) { |
| int plen, j; |
| |
| if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) { |
| /* missing cpuset is as good as if all moved */ |
| if (errno == ENOENT) |
| goto no_more_cpuset; |
| |
| /* other problems reading cpuset are bad news */ |
| sav_errno = errno; |
| goto failed; |
| } |
| |
| if ((plen = cpuset_pidlist_length(pl)) == 0) |
| goto no_more_pids; |
| |
| for (j = 0; j < plen; j++) { |
| pid_t pid; |
| |
| pid = cpuset_get_pidlist(pl, j); |
| if (cpuset_move(pid, torelpath) < 0) { |
| /* missing task is as good as if moved */ |
| if (errno == ESRCH) |
| continue; |
| |
| /* other per-task errors are bad news */ |
| sav_errno = errno; |
| goto failed; |
| } |
| } |
| |
| cpuset_freepidlist(pl); |
| pl = NULL; |
| } |
| |
| sav_errno = ENOTEMPTY; |
| /* fall into ... */ |
| failed: |
| cpuset_freepidlist(pl); |
| errno = sav_errno; |
| return -1; |
| |
| no_more_pids: |
| no_more_cpuset: |
| /* Success - all tasks (or entire cpuset ;) gone. */ |
| cpuset_freepidlist(pl); |
| errno = 0; |
| return 0; |
| } |
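| |
| /* |
| * Usage sketch (hypothetical paths): drain tasks from "/busy" into |
| * "/idle", then remove the emptied cpuset; ENOTEMPTY means tasks |
| * survived all the attempts above: |
| * |
| * if (cpuset_move_cpuset_tasks("/busy", "/idle") == 0) |
| * cpuset_delete("/busy"); |
| */ |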
| |
| /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */ |
| int cpuset_migrate(pid_t pid, const char *relpath) |
| { |
| char buf[PATH_MAX]; |
| char buf2[PATH_MAX]; |
| char memory_migrate_flag; |
| int r; |
| |
| if (check() < 0) |
| return -1; |
| |
| if (pid == 0) |
| pid = getpid(); |
| |
| fullpath(buf2, sizeof(buf2), relpath); |
| |
| if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) |
| return -1; |
| if (store_flag(buf2, "memory_migrate", 1) < 0) |
| return -1; |
| |
| fullpath2(buf, sizeof(buf), relpath, "tasks"); |
| |
| r = __cpuset_move(pid, buf); |
| |
| store_flag(buf2, "memory_migrate", memory_migrate_flag); |
| return r; |
| } |
| |
| /* Migrate all tasks in pidlist to a cpuset (moves task and memory) */ |
| int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath) |
| { |
| int i; |
| char buf[PATH_MAX]; |
| char buf2[PATH_MAX]; |
| char memory_migrate_flag; |
| int ret; |
| |
| if (check() < 0) |
| return -1; |
| |
| fullpath(buf2, sizeof(buf2), relpath); |
| |
| if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) |
| return -1; |
| if (store_flag(buf2, "memory_migrate", 1) < 0) |
| return -1; |
| |
| fullpath2(buf, sizeof(buf), relpath, "tasks"); |
| |
| ret = 0; |
| for (i = 0; i < pl->npids; i++) |
| if (__cpuset_move(pl->pids[i], buf) < 0) |
| ret = -1; |
| |
| if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0) |
| ret = -1; |
| return ret; |
| } |
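| |
| /* |
| * Sketch combining the pidlist and migrate calls (hypothetical |
| * paths): migrate every task now in "/from" to "/to", moving their |
| * memory along with them: |
| * |
| * struct cpuset_pidlist *pl; |
| * |
| * if ((pl = cpuset_init_pidlist("/from", 0)) != NULL) { |
| * if (cpuset_migrate_all(pl, "/to") < 0) |
| * perror("cpuset_migrate_all"); |
| * cpuset_freepidlist(pl); |
| * } |
| */ |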
| |
| /* Rebind cpus_allowed of each task in cpuset 'path' */ |
| int cpuset_reattach(const char *relpath) |
| { |
| struct cpuset_pidlist *pl; |
| int rc; |
| |
| if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL) |
| return -1; |
| rc = cpuset_move_all(pl, relpath); |
| cpuset_freepidlist(pl); |
| return rc; |
| } |
| |
| /* Map cpuset relative cpu number to system wide cpu number */ |
| int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| int pos = -1; |
| |
| if (!cp1) |
| goto err; |
| pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu); |
| /* fall into ... */ |
| err: |
| cpuset_free(cp_tofree); |
| return pos; |
| } |
| |
| /* Map system wide cpu number to cpuset relative cpu number */ |
| int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| int pos = -1; |
| |
| if (!cp1) |
| goto err; |
| pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu); |
| /* fall into ... */ |
| err: |
| cpuset_free(cp_tofree); |
| return pos; |
| } |
| |
| /* Map cpuset relative mem number to system wide mem number */ |
| int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| int pos = -1; |
| |
| if (!cp1) |
| goto err; |
| pos = bitmask_rel_to_abs_pos(cp1->mems, mem); |
| /* fall into ... */ |
| err: |
| cpuset_free(cp_tofree); |
| return pos; |
| } |
| |
| /* Map system wide mem number to cpuset relative mem number */ |
| int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem) |
| { |
| struct cpuset *cp_tofree = NULL; |
| const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); |
| int pos = -1; |
| |
| if (!cp1) |
| goto err; |
| pos = bitmask_abs_to_rel_pos(cp1->mems, mem); |
| /* fall into ... */ |
| err: |
| cpuset_free(cp_tofree); |
| return pos; |
| } |
| |
| /* Map pid's cpuset relative cpu number to system wide cpu number */ |
| int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu) |
| { |
| struct cpuset *cp; |
| int rc = -1; |
| |
| if ((cp = cpuset_alloc()) == NULL) |
| goto done; |
| if (cpuset_cpusetofpid(cp, pid) < 0) |
| goto done; |
| rc = cpuset_c_rel_to_sys_cpu(cp, cpu); |
| done: |
| cpuset_free(cp); |
| return rc; |
| } |
| |
| /* Map system wide cpu number to pid's cpuset relative cpu number */ |
| int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu) |
| { |
| struct cpuset *cp; |
| int rc = -1; |
| |
| if ((cp = cpuset_alloc()) == NULL) |
| goto done; |
| if (cpuset_cpusetofpid(cp, pid) < 0) |
| goto done; |
| rc = cpuset_c_sys_to_rel_cpu(cp, cpu); |
| done: |
| cpuset_free(cp); |
| return rc; |
| } |
| |
| /* Map pid's cpuset relative mem number to system wide mem number */ |
| int cpuset_p_rel_to_sys_mem(pid_t pid, int mem) |
| { |
| struct cpuset *cp; |
| int rc = -1; |
| |
| if ((cp = cpuset_alloc()) == NULL) |
| goto done; |
| if (cpuset_cpusetofpid(cp, pid) < 0) |
| goto done; |
| rc = cpuset_c_rel_to_sys_mem(cp, mem); |
| done: |
| cpuset_free(cp); |
| return rc; |
| } |
| |
| /* Map system wide mem number to pid's cpuset relative mem number */ |
| int cpuset_p_sys_to_rel_mem(pid_t pid, int mem) |
| { |
| struct cpuset *cp; |
| int rc = -1; |
| |
| if ((cp = cpuset_alloc()) == NULL) |
| goto done; |
| if (cpuset_cpusetofpid(cp, pid) < 0) |
| goto done; |
| rc = cpuset_c_sys_to_rel_mem(cp, mem); |
| done: |
| cpuset_free(cp); |
| return rc; |
| } |
| |
| /* |
| * Override glibc's calls for get/set affinity - they have |
| * something using cpu_set_t that will die when NR_CPUS > 1024. |
| * Go directly to the 'real' system calls. Also override calls |
| * for get_mempolicy and set_mempolicy. None of these |
| * calls are yet (July 2004) guaranteed to be in all glibc versions |
| * that we care about. |
| */ |
| |
| static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask) |
| { |
| return ltp_syscall(__NR_sched_setaffinity, pid, len, mask); |
| } |
| |
| static int get_mempolicy(int *policy, unsigned long *nmask, |
| unsigned long maxnode, void *addr, int flags) |
| { |
| return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode, |
| addr, flags); |
| } |
| |
| static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode) |
| { |
| return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode); |
| } |
| |
| struct cpuset_placement { |
| struct bitmask *cpus; |
| struct bitmask *mems; |
| char *path; |
| }; |
| |
| /* Allocate and fill in a placement struct - captures current placement */ |
| struct cpuset_placement *cpuset_get_placement(pid_t pid) |
| { |
| struct cpuset_placement *plc; |
| struct cpuset *cp = NULL; |
| char buf[PATH_MAX]; |
| int nbits; |
| |
| if ((plc = calloc(1, sizeof(*plc))) == NULL) |
| goto err; |
| |
| nbits = cpuset_cpus_nbits(); |
| if ((plc->cpus = bitmask_alloc(nbits)) == NULL) |
| goto err; |
| |
| nbits = cpuset_mems_nbits(); |
| if ((plc->mems = bitmask_alloc(nbits)) == NULL) |
| goto err; |
| |
| if ((cp = cpuset_alloc()) == NULL) |
| goto err; |
| if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL) |
| goto err; |
| if (cpuset_query(cp, buf) < 0) |
| goto err; |
| |
| bitmask_copy(plc->cpus, cp->cpus); |
| bitmask_copy(plc->mems, cp->mems); |
| plc->path = strdup(buf); |
| |
| cpuset_free(cp); |
| return plc; |
| err: |
| cpuset_free(cp); |
| cpuset_free_placement(plc); |
| return NULL; |
| } |
| |
| /* Compare two placement structs - use to detect changes in placement */ |
| int cpuset_equal_placement(const struct cpuset_placement *plc1, |
| const struct cpuset_placement *plc2) |
| { |
| return bitmask_equal(plc1->cpus, plc2->cpus) && |
| bitmask_equal(plc1->mems, plc2->mems) && |
| streq(plc1->path, plc2->path); |
| } |
| |
| /* Free a placement struct */ |
| void cpuset_free_placement(struct cpuset_placement *plc) |
| { |
| if (!plc) |
| return; |
| bitmask_free(plc->cpus); |
| bitmask_free(plc->mems); |
| free(plc->path); |
| free(plc); |
| } |
| |
| /* |
| * A cpuset_fts_open() call constructs a linked list of entries |
| * called a "cpuset_fts_tree", with one entry per cpuset below |
| * the specified path. The cpuset_fts_read() routine returns the |
| * next entry on this list. The various cpuset_fts_get_*() calls |
| * return attributes of the specified entry. The cpuset_fts_close() |
| * call frees the linked list and all associated data. All cpuset |
| * entries and attributes for the cpuset_fts_tree returned from a |
| * given cpuset_fts_open() call remain allocated and unchanged until |
| * that cpuset_fts_tree is closed by a cpuset_fts_close() call. Any |
| * subsequent changes to the cpuset filesystem will go unnoticed |
| * (they will not affect open cpuset_fts_tree's). |
| */ |
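| |
| /* |
| * Walk sketch using the calls described above (hypothetical "/demo" |
| * subtree; entries arrive in pre-order unless cpuset_fts_reverse() |
| * is applied): |
| * |
| * struct cpuset_fts_tree *t; |
| * const struct cpuset_fts_entry *e; |
| * |
| * if ((t = cpuset_fts_open("/demo")) != NULL) { |
| * while ((e = cpuset_fts_read(t)) != NULL) |
| * if (cpuset_fts_get_info(e) == CPUSET_FTS_CPUSET) |
| * printf("%s\n", cpuset_fts_get_path(e)); |
| * cpuset_fts_close(t); |
| * } |
| */ |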
| |
| struct cpuset_fts_entry; |
| void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree); |
| |
| struct cpuset_fts_tree { |
| struct cpuset_fts_entry *head; /* head of linked entry list */ |
| struct cpuset_fts_entry *next; /* cpuset_fts_read() offset */ |
| }; |
| |
| struct cpuset_fts_entry { |
| struct cpuset_fts_entry *next; /* linked entry list chain */ |
| struct cpuset *cpuset; |
| struct stat *stat; |
| char *path; |
| int info; |
| int err; |
| }; |
| |
| /* Open a handle on a cpuset hierarchy. All the real work is done here. */ |
| struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath) |
| { |
| FTS *fts = NULL; |
| FTSENT *ftsent; |
| char *path_argv[2]; |
| char buf[PATH_MAX]; |
| struct cpuset_fts_tree *cs_tree = NULL; |
| struct cpuset_fts_entry *ep; /* the latest new list entry */ |
| struct cpuset_fts_entry **pnlep; /* ptr to next list entry ptr */ |
| char *relpath; |
| int fts_flags; |
| |
| fullpath(buf, sizeof(buf), cpusetpath); |
| path_argv[0] = buf; |
| path_argv[1] = NULL; |
| |
| fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV; |
| fts = fts_open(path_argv, fts_flags, NULL); |
| if (fts == NULL) |
| goto err; |
| |
| cs_tree = malloc(sizeof(*cs_tree)); |
| if (cs_tree == NULL) |
| goto err; |
| pnlep = &cs_tree->head; |
| *pnlep = NULL; |
| |
| while ((ftsent = fts_read(fts)) != NULL) { |
| if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR) |
| continue; |
| |
| /* ftsent is a directory (perhaps unreadable) ==> cpuset */ |
| ep = calloc(1, sizeof(*ep)); |
| if (ep == NULL) |
| goto err; |
| *pnlep = ep; |
| pnlep = &ep->next; |
| |
| /* Set entry's path, and if DNR, error */ |
| relpath = ftsent->fts_path + strlen(cpusetmnt); |
| if (strlen(relpath) == 0) |
| relpath = "/"; |
| ep->path = strdup(relpath); |
| if (ep->path == NULL) |
| goto err; |
| if (ftsent->fts_info == FTS_DNR) { |
| ep->info = CPUSET_FTS_ERR_DNR; |
| ep->err = ftsent->fts_errno; |
| continue; |
| } |
| |
| /* ftsent is a -readable- cpuset: set entry's stat, etc */ |
| ep->stat = calloc(1, sizeof(struct stat)); |
| if (ep->stat == NULL) |
| goto err; |
| if (stat(ftsent->fts_path, ep->stat) < 0) { |
| ep->info = CPUSET_FTS_ERR_STAT; |
| ep->err = ftsent->fts_errno; |
| continue; |
| } |
| |
| ep->cpuset = calloc(1, sizeof(struct cpuset)); |
| if (ep->cpuset == NULL) |
| goto err; |
| if (cpuset_query(ep->cpuset, relpath) < 0) { |
| ep->info = CPUSET_FTS_ERR_CPUSET; |
| ep->err = errno; |
| continue; |
| } |
| ep->info = CPUSET_FTS_CPUSET; |
| } |
| |
| (void)fts_close(fts); |
| cpuset_fts_rewind(cs_tree); |
| return cs_tree; |
| |
| err: |
| if (cs_tree) |
| cpuset_fts_close(cs_tree); |
| if (fts) |
| (void)fts_close(fts); |
| return NULL; |
| } |
| |
| /* Return pointer to next cpuset entry in hierarchy */ |
| const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree) |
| { |
| const struct cpuset_fts_entry *cs_entry = cs_tree->next; |
| if (cs_tree->next != NULL) /* seek to next entry */ |
| cs_tree->next = cs_tree->next->next; |
| return cs_entry; |
| } |
| |
| /* Reverse list of cpusets, in place. Simulates pre-order/post-order flip. */ |
| void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree) |
| { |
| struct cpuset_fts_entry *cs1, *cs2, *cs3; |
| |
| /* |
| * At each step, cs1 < cs2 < cs3 and the cs2->next pointer |
| * is redirected from cs3 to cs1. |
| */ |
| |
| cs1 = cs2 = NULL; |
| cs3 = cs_tree->head; |
| while (cs3) { |
| cs1 = cs2; |
| cs2 = cs3; |
| cs3 = cs3->next; |
| cs2->next = cs1; |
| } |
| cs_tree->head = cs2; |
| cpuset_fts_rewind(cs_tree); |
| } |
| |
| /* Rewind cpuset list to beginning */ |
| void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree) |
| { |
| cs_tree->next = cs_tree->head; |
| } |
| |
| /* Return pointer to nul-terminated cpuset path of entry in hierarchy */ |
| const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry) |
| { |
| return cs_entry->path; |
| } |
| |
| /* Return pointer to stat(2) structure of a cpuset entry's directory */ |
| const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry) |
| { |
| return cs_entry->stat; |
| } |
| |
| /* Return pointer to cpuset structure of a cpuset entry */ |
| const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry |
| *cs_entry) |
| { |
| return cs_entry->cpuset; |
| } |
| |
| /* Return value of errno (0 if no error) on attempted cpuset operations */ |
| int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry) |
| { |
| return cs_entry->err; |
| } |
| |
| /* Return entry's info code: CPUSET_FTS_CPUSET, or which operation failed */ |
| int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry) |
| { |
| return cs_entry->info; |
| } |
| |
| /* Close a cpuset hierarchy handle (free's all associated memory) */ |
| void cpuset_fts_close(struct cpuset_fts_tree *cs_tree) |
| { |
| struct cpuset_fts_entry *cs_entry = cs_tree->head; |
| |
| while (cs_entry) { |
| struct cpuset_fts_entry *ep = cs_entry; |
| |
| cs_entry = cs_entry->next; |
| free(ep->path); |
| free(ep->stat); |
| cpuset_free(ep->cpuset); |
| free(ep); |
| } |
| free(cs_tree); |
| } |
| |
| /* Bind current task to cpu (uses sched_setaffinity(2)) */ |
| int cpuset_cpubind(int cpu) |
| { |
| struct bitmask *bmp; |
| int r; |
| |
| if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL) |
| return -1; |
| bitmask_setbit(bmp, cpu); |
| r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp)); |
| bitmask_free(bmp); |
| return r; |
| } |
| |
| /* |
| * int cpuset_latestcpu(pid_t pid) |
| * |
| * Return most recent CPU on which task pid executed. If pid == 0, |
| * examine current task. |
| * |
| * The last used CPU is visible for a given pid as field #39 (starting |
| * with #1) in the file /proc/pid/stat. Currently this file has 41 |
| * fields, making it the 3rd-from-last field. |
| * |
| * Unfortunately field #2 is a command name and might have embedded |
| * whitespace. So we can't just count white space separated fields. |
| * Fortunately, this command name is surrounded by parentheses, as |
| * for example "(sh)", and that closing parenthesis is the last ')' |
| * character in the line. No remaining fields can have embedded |
| * whitespace or parentheses. So instead of looking for the 39th |
| * white space separated field, we can look for the 37th white space |
| * separated field past the last ')' character on the line. |
| */ |
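| |
| /* |
| * Worked example (fabricated stat line, abbreviated): given |
| * |
| * 1234 (my prog) R 1 ... 5 |
| * |
| * strrchr() finds the ')' closing "(my prog)", and the sscanf() |
| * below skips 36 whitespace-separated fields past it before |
| * converting the 37th - here the CPU number 5. |
| */ |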
| |
| /* Return most recent CPU on which task pid executed */ |
| int cpuset_latestcpu(pid_t pid) |
| { |
| char buf[PATH_MAX]; |
| char *bp; |
| int fd = -1; |
| int cpu = -1; |
| ssize_t n; |
| |
| if (pid == 0) |
| snprintf(buf, sizeof(buf), "/proc/self/stat"); |
| else |
| snprintf(buf, sizeof(buf), "/proc/%d/stat", pid); |
| |
| if ((fd = open(buf, O_RDONLY)) < 0) |
| goto err; |
| if ((n = read(fd, buf, sizeof(buf) - 1)) < 1) |
| goto err; |
| close(fd); |
| buf[n] = '\0'; /* read(2) does not nul-terminate */ |
| |
| bp = strrchr(buf, ')'); |
| if (bp) |
| sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " |
| "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " |
| "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " |
| "%*u %*u %*u %*u %*u %*u %*u %*u %u", |
| &cpu); /* 37th field past the last ')' */ |
| if (cpu < 0) |
| errno = EINVAL; |
| return cpu; |
| err: |
| if (fd >= 0) |
| close(fd); |
| return -1; |
| } |
| |
| /* Bind current task to memory (uses set_mempolicy(2)) */ |
| int cpuset_membind(int mem) |
| { |
| struct bitmask *bmp; |
| int r; |
| |
| if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL) |
| return -1; |
| bitmask_setbit(bmp, mem); |
| r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1); |
| bitmask_free(bmp); |
| return r; |
| } |
| |
| /* [optional] Return Memory Node holding page at specified addr */ |
| int cpuset_addr2node(void *addr) |
| { |
| int node = -1; |
| |
| if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) { |
| /* I realize this seems redundant, but I _want_ to make sure |
| * that this value is -1. */ |
| node = -1; |
| } |
| return node; |
| } |
| |
| /* |
| * Transform cpuset into Text Format Representation in buffer 'buf', |
| * of length 'buflen', nul-terminated if space allows. Return number |
| * of characters that would have been written, if enough space had |
| * been available, in the same way that snprintf() does. |
| */ |
| |
| /* Export cpuset settings in Text Format into buffer 'buf' */ |
| int cpuset_export(const struct cpuset *cp, char *buf, int buflen) |
| { |
| char *tmp = NULL; |
| int n = 0; |
| |
| if (cp->cpu_exclusive) |
| n += snprintf(buf + n, MAX(buflen - n, 0), "cpu_exclusive\n"); |
| |
| if (cp->mem_exclusive) |
| n += snprintf(buf + n, MAX(buflen - n, 0), "mem_exclusive\n"); |
| |
| if (cp->notify_on_release) |
| n += snprintf(buf + n, MAX(buflen - n, 0), |
| "notify_on_release\n"); |
| |
| if (cp->memory_pressure_enabled) |
| n += snprintf(buf + n, MAX(buflen - n, 0), |
| "memory_pressure_enabled\n"); |
| |
| if (cp->memory_migrate) |
| n += snprintf(buf + n, MAX(buflen - n, 0), "memory_migrate\n"); |
| |
| if (cp->memory_spread_page) |
| n += snprintf(buf + n, MAX(buflen - n, 0), |
| "memory_spread_page\n"); |
| |
| if (cp->memory_spread_slab) |
| n += snprintf(buf + n, MAX(buflen - n, 0), |
| "memory_spread_slab\n"); |
| |
| if ((tmp = sprint_mask_buf(cp->cpus)) == NULL) |
| return -1; |
| n += snprintf(buf + n, MAX(buflen - n, 0), "cpus %s\n", tmp); |
| free(tmp); |
| tmp = NULL; |
| |
| if ((tmp = sprint_mask_buf(cp->mems)) == NULL) |
| return -1; |
| n += snprintf(buf + n, MAX(buflen - n, 0), "mems %s\n", tmp); |
| free(tmp); |
| tmp = NULL; |
| |
| return n; |
| } |
| |
| static int import_list(UNUSED const char *tok, const char *arg, |
| struct bitmask *bmp, char *emsg, int elen) |
| { |
| if (bitmask_parselist(arg, bmp) < 0) { |
| if (emsg) |
| snprintf(emsg, elen, "Invalid list format: %s", arg); |
| return -1; |
| } |
| return 0; |
| } |
| |
| static void stolower(char *s) |
| { |
| while (*s) { |
| unsigned char c = *s; |
| *s = tolower(c); |
| s++; |
| } |
| } |
| |
| /* Import cpuset settings from Text Format in buffer 'buf' */ |
| int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum, |
| char *emsg, int elen) |
| { |
| char *linebuf = NULL; |
| int linebuflen; |
| int linenum = 0; |
| int offset = 0; |
| |
| linebuflen = strlen(buf) + 1; |
| if ((linebuf = malloc(linebuflen)) == NULL) { |
| if (emsg) |
| snprintf(emsg, elen, "Insufficient memory"); |
| goto err; |
| } |
| |
| while (slgets(linebuf, linebuflen, buf, &offset)) { |
| char *tok, *arg; |
| char *ptr; /* for strtok_r */ |
| |
| linenum++; |
| if ((tok = strchr(linebuf, '#')) != NULL) |
| *tok = 0; |
| if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL) |
| continue; |
| stolower(tok); |
| |
| arg = strtok_r(0, " \t", &ptr); |
| |
| if (streq(tok, "cpu_exclusive")) { |
| cp->cpu_exclusive = 1; |
| goto eol; |
| } |
| if (streq(tok, "mem_exclusive")) { |
| cp->mem_exclusive = 1; |
| goto eol; |
| } |
| if (streq(tok, "notify_on_release")) { |
| cp->notify_on_release = 1; |
| goto eol; |
| } |
| if (streq(tok, "memory_pressure_enabled")) { |
| cp->memory_pressure_enabled = 1; |
| goto eol; |
| } |
| if (streq(tok, "memory_migrate")) { |
| cp->memory_migrate = 1; |
| goto eol; |
| } |
| if (streq(tok, "memory_spread_page")) { |
| cp->memory_spread_page = 1; |
| goto eol; |
| } |
| if (streq(tok, "memory_spread_slab")) { |
| cp->memory_spread_slab = 1; |
| goto eol; |
| } |
| if (streq(tok, "cpu") || streq(tok, "cpus")) { |
| if (import_list(tok, arg, cp->cpus, emsg, elen) < 0) |
| goto err; |
| goto eol; |
| } |
| if (streq(tok, "mem") || streq(tok, "mems")) { |
| if (import_list(tok, arg, cp->mems, emsg, elen) < 0) |
| goto err; |
| goto eol; |
| } |
| if (emsg) |
| snprintf(emsg, elen, "Unrecognized token: '%s'", tok); |
| goto err; |
| eol: |
| if ((tok = strtok_r(0, " \t", &ptr)) != NULL) { |
| if (emsg) |
| snprintf(emsg, elen, "Surplus token: '%s'", |
| tok); |
| goto err; |
| } |
| continue; |
| } |
| |
| free(linebuf); |
| |
| if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems)) |
| cpuset_localcpus(cp->mems, cp->cpus); |
| else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems)) |
| cpuset_localmems(cp->cpus, cp->mems); |
| |
| /* |
| * All cpuset attributes are determined in an import. |
| * Those that aren't explicitly specified are presumed |
| * to be unchanged (zero, if it's a freshly allocated |
| * struct cpuset.) |
| */ |
| |
| cp->cpus_valid = 1; |
| cp->mems_valid = 1; |
| cp->cpu_exclusive_valid = 1; |
| cp->mem_exclusive_valid = 1; |
| cp->notify_on_release_valid = 1; |
| cp->memory_migrate_valid = 1; |
| cp->memory_pressure_enabled_valid = 1; |
| cp->memory_spread_page_valid = 1; |
| cp->memory_spread_slab_valid = 1; |
| |
| return 0; |
| err: |
| if (elinenum) |
| *elinenum = linenum; |
| free(linebuf); |
| return -1; |
| } |
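| |
| /* |
| * Import sketch (illustrative buffer contents): the text accepted |
| * here mirrors what cpuset_export() writes; flags not listed keep |
| * their prior (zero, if freshly allocated) values: |
| * |
| * struct cpuset *cp = cpuset_alloc(); |
| * char emsg[80]; |
| * int line; |
| * |
| * if (cpuset_import(cp, "cpus 0-3\nmems 0\n", &line, emsg, |
| * sizeof(emsg)) < 0) |
| * fprintf(stderr, "line %d: %s\n", line, emsg); |
| */ |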
| |
| /* Pin current task CPU (and memory) */ |
| int cpuset_pin(int relcpu) |
| { |
| struct cpuset_placement *plc1 = NULL, *plc2 = NULL; |
| int cpu, r; |
| |
| if (check() < 0) |
| return -1; |
| |
| do { |
| cpuset_free_placement(plc1); |
| plc1 = cpuset_get_placement(0); |
| |
| r = 0; |
| if (cpuset_unpin() < 0) |
| r = -1; |
| cpu = cpuset_p_rel_to_sys_cpu(0, relcpu); |
| if (cpuset_cpubind(cpu) < 0) |
| r = -1; |
| |
| cpuset_free_placement(plc2); |
| plc2 = cpuset_get_placement(0); |
| } while (!cpuset_equal_placement(plc1, plc2)); |
| |
| cpuset_free_placement(plc1); |
| cpuset_free_placement(plc2); |
| return r; |
| } |
| |
| /* Return number of CPUs in current task's cpuset */ |
| int cpuset_size(void) |
| { |
| struct cpuset_placement *plc1 = NULL, *plc2 = NULL; |
| int r; |
| |
| if (check() < 0) |
| return -1; |
| |
| do { |
| cpuset_free_placement(plc1); |
| plc1 = cpuset_get_placement(0); |
| |
| r = cpuset_cpus_weight(0); |
| |
| cpuset_free_placement(plc2); |
| plc2 = cpuset_get_placement(0); |
| } while (!cpuset_equal_placement(plc1, plc2)); |
| |
| cpuset_free_placement(plc1); |
| cpuset_free_placement(plc2); |
| return r; |
| } |
| |
| /* Return relative CPU number, within current cpuset, last executed on */ |
| int cpuset_where(void) |
| { |
| struct cpuset_placement *plc1 = NULL, *plc2 = NULL; |
| int r; |
| |
| if (check() < 0) |
| return -1; |
| |
| do { |
| cpuset_free_placement(plc1); |
| plc1 = cpuset_get_placement(0); |
| |
| r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0)); |
| |
| cpuset_free_placement(plc2); |
| plc2 = cpuset_get_placement(0); |
| } while (!cpuset_equal_placement(plc1, plc2)); |
| |
| cpuset_free_placement(plc1); |
| cpuset_free_placement(plc2); |
| return r; |
| } |
| |
| /* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */ |
| int cpuset_unpin(void) |
| { |
| struct bitmask *cpus = NULL, *mems = NULL; |
| int r = -1; |
| |
| if (check() < 0) |
| goto err; |
| |
| /* |
| * Don't need cpuset_*_placement() guard against concurrent |
| * cpuset migration, because none of the following depends |
| * on the task's cpuset placement. |
| */ |
| |
| if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL) |
| goto err; |
| bitmask_setall(cpus); |
| if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0) |
| goto err; |
| |
| if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL) |
| goto err; |
| if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems), |
| bitmask_nbits(mems) + 1) < 0) |
| goto err; |
| r = 0; |
| /* fall into ... */ |
| err: |
| bitmask_free(cpus); |
| bitmask_free(mems); |
| return r; |
| } |
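| |
| /* |
| * Sketch tying cpuset_pin(), cpuset_where(), cpuset_size() and |
| * cpuset_unpin() together: pin to the first CPU of the current |
| * cpuset, report placement, then release (illustrative only): |
| * |
| * if (cpuset_pin(0) == 0) { |
| * printf("on relative CPU %d of %d\n", |
| * cpuset_where(), cpuset_size()); |
| * cpuset_unpin(); |
| * } |
| */ |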
| |
| struct cpuset_function_list { |
| const char *fname; |
| void *func; |
| } flist[] = { |
| { "cpuset_version", cpuset_version }, |
| { "cpuset_alloc", cpuset_alloc }, |
| { "cpuset_free", cpuset_free }, |
| { "cpuset_cpus_nbits", cpuset_cpus_nbits }, |
| { "cpuset_mems_nbits", cpuset_mems_nbits }, |
| { "cpuset_setcpus", cpuset_setcpus }, |
| { "cpuset_setmems", cpuset_setmems }, |
| { "cpuset_set_iopt", cpuset_set_iopt }, |
| { "cpuset_set_sopt", cpuset_set_sopt }, |
| { "cpuset_getcpus", cpuset_getcpus }, |
| { "cpuset_getmems", cpuset_getmems }, |
| { "cpuset_cpus_weight", cpuset_cpus_weight }, |
| { "cpuset_mems_weight", cpuset_mems_weight }, |
| { "cpuset_get_iopt", cpuset_get_iopt }, |
| { "cpuset_get_sopt", cpuset_get_sopt }, |
| { "cpuset_localcpus", cpuset_localcpus }, |
| { "cpuset_localmems", cpuset_localmems }, |
| { "cpuset_cpumemdist", cpuset_cpumemdist }, |
| { "cpuset_cpu2node", cpuset_cpu2node }, |
| { "cpuset_addr2node", cpuset_addr2node }, |
| { "cpuset_create", cpuset_create }, |
| { "cpuset_delete", cpuset_delete }, |
| { "cpuset_query", cpuset_query }, |
| { "cpuset_modify", cpuset_modify }, |
| { "cpuset_getcpusetpath", cpuset_getcpusetpath }, |
| { "cpuset_cpusetofpid", cpuset_cpusetofpid }, |
| { "cpuset_mountpoint", cpuset_mountpoint }, |
| { "cpuset_collides_exclusive", cpuset_collides_exclusive }, |
| { "cpuset_nuke", cpuset_nuke }, |
| { "cpuset_init_pidlist", cpuset_init_pidlist }, |
| { "cpuset_pidlist_length", cpuset_pidlist_length }, |
| { "cpuset_get_pidlist", cpuset_get_pidlist }, |
| { "cpuset_freepidlist", cpuset_freepidlist }, |
| { "cpuset_move", cpuset_move }, |
| { "cpuset_move_all", cpuset_move_all }, |
| { "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks }, |
| { "cpuset_migrate", cpuset_migrate }, |
| { "cpuset_migrate_all", cpuset_migrate_all }, |
| { "cpuset_reattach", cpuset_reattach }, |
| { "cpuset_open_memory_pressure", cpuset_open_memory_pressure }, |
| { "cpuset_read_memory_pressure", cpuset_read_memory_pressure }, |
| { "cpuset_close_memory_pressure", cpuset_close_memory_pressure }, |
| { "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu }, |
| { "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu }, |
| { "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem }, |
| { "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem }, |
| { "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu }, |
| { "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu }, |
| { "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem }, |
| { "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem }, |
| { "cpuset_get_placement", cpuset_get_placement }, |
| { "cpuset_equal_placement", cpuset_equal_placement }, |
| { "cpuset_free_placement", cpuset_free_placement }, |
| { "cpuset_fts_open", cpuset_fts_open }, |
| { "cpuset_fts_read", cpuset_fts_read }, |
| { "cpuset_fts_reverse", cpuset_fts_reverse }, |
| { "cpuset_fts_rewind", cpuset_fts_rewind }, |
| { "cpuset_fts_get_path", cpuset_fts_get_path }, |
| { "cpuset_fts_get_stat", cpuset_fts_get_stat }, |
| { "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset }, |
| { "cpuset_fts_get_errno", cpuset_fts_get_errno }, |
| { "cpuset_fts_get_info", cpuset_fts_get_info }, |
| { "cpuset_fts_close", cpuset_fts_close }, |
| { "cpuset_cpubind", cpuset_cpubind }, |
| { "cpuset_latestcpu", cpuset_latestcpu }, |
| { "cpuset_membind", cpuset_membind }, |
| { "cpuset_export", cpuset_export }, |
| { "cpuset_import", cpuset_import }, |
| { "cpuset_function", cpuset_function }, |
| { "cpuset_pin", cpuset_pin }, |
| { "cpuset_size", cpuset_size }, |
| { "cpuset_where", cpuset_where }, |
| { "cpuset_unpin", cpuset_unpin }, |
| }; |
| |
| /* Return pointer to a libcpuset.so function, or NULL */ |
| void *cpuset_function(const char *function_name) |
| { |
| unsigned int i; |
| |
| for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++) |
| if (streq(function_name, flist[i].fname)) |
| return flist[i].func; |
| return NULL; |
| } |
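| |
| /* |
| * Sketch of reaching an [optional] routine through |
| * cpuset_function(), as the comments above suggest; the cast is to |
| * the routine's known signature (illustrative only): |
| * |
| * int (*nuke)(const char *, unsigned int); |
| * |
| * nuke = (int (*)(const char *, unsigned int)) |
| * cpuset_function("cpuset_nuke"); |
| * if (nuke) |
| * (*nuke)("/demo", 10); |
| */ |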
| |
| /* Fortran interface to basic cpuset routines */ |
| int cpuset_pin_(int *ptr_relcpu) |
| { |
| return cpuset_pin(*ptr_relcpu); |
| } |
| |
| int cpuset_size_(void) |
| { |
| return cpuset_size(); |
| } |
| |
| int cpuset_where_(void) |
| { |
| return cpuset_where(); |
| } |
| |
| int cpuset_unpin_(void) |
| { |
| return cpuset_unpin(); |
| } |
| |
| #endif /* HAVE_LINUX_MEMPOLICY_H */ |