Back to index

numactl  2.0.8~rc4
affinity.c
Go to the documentation of this file.
00001 /* Support for specifying IO affinity by various means.
00002    Copyright 2010 Intel Corporation
00003    Author: Andi Kleen
00004 
00005    libnuma is free software; you can redistribute it and/or
00006    modify it under the terms of the GNU Lesser General Public
00007    License as published by the Free Software Foundation; version
00008    2.1.
00009 
00010    libnuma is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013    Lesser General Public License for more details.
00014 
00015    You should find a copy of v2.1 of the GNU Lesser General Public License
00016    somewhere on your Linux system; if not, write to the Free Software
00017    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
00018 
00019 /* Notebook:
00020    - Separate real errors from no NUMA with fallback
00021    - Infiniband
00022    - FCoE?
00023    - Support for other special IO devices
00024    - Specifying cpu subsets inside the IO node?
00025    - Handle multiple IO nodes (needs kernel changes)
00026    - Better support for multi-path IO?
00027  */
00028 #define _GNU_SOURCE 1
00029 #include <string.h>
00030 #include <errno.h>
00031 #include <sys/stat.h>
00032 #include <netdb.h>
00033 #include <unistd.h>
00034 #include <stdio.h>
00035 #include <stdlib.h>
00036 #include <sys/socket.h>
00037 #include <sys/ioctl.h>
00038 #include <net/if.h>
00039 #include <dirent.h>
00040 #include <linux/rtnetlink.h>
00041 #include <linux/netlink.h>
00042 #include <sys/types.h>
00043 #include <ctype.h>
00044 #include <assert.h>
00045 #include <regex.h>
00046 #include "numa.h"
00047 #include "numaint.h"
00048 #include "sysfs.h"
00049 #include "affinity.h"
00050 #include "rtnetlink.h"
00051 
/* Return 1 when S contains a '/' or '.' character, 0 otherwise.
   Used to reject device names that could escape the sysfs directory
   they are pasted into. */
static int badchar(char *s)
{
        return strpbrk(s, "/.") != NULL;
}
00058 
00059 static int node_parse_failure(int ret, char *cls, char *dev)
00060 {
00061        if (!cls)
00062               cls = "";
00063        if (ret == -2)
00064               numa_warn(W_node_parse1,
00065                        "Kernel does not know node mask for%s%s device `%s'",
00066                             *cls ? " " : "", cls, dev);
00067        else
00068               numa_warn(W_node_parse2,
00069                        "Cannot read node mask for %s device `%s'",
00070                        cls, dev);
00071        return -1;
00072 }
00073 
00074 /* Generic sysfs class lookup */
00075 static int affinity_class(struct bitmask *mask, char *cls, char *dev)
00076 {
00077        int ret;
00078        while (isspace(*dev))
00079               dev++;
00080        if (badchar(dev)) {
00081               numa_warn(W_badchar, "Illegal characters in `%s' specification",
00082                        dev);
00083               return -1;
00084        }
00085 
00086        /* Somewhat hackish: extract device from symlink path.
00087           Better would be a direct backlink. This knows slightly too
00088           much about the actual sysfs layout. */
00089        char path[1024];
00090        char *fn = NULL;
00091        if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) > 0 &&
00092            readlink(fn, path, sizeof path) > 0) {
00093               regex_t re;
00094               regmatch_t match[2];
00095               char *p;
00096 
00097               regcomp(&re, "(/devices/pci[0-9a-fA-F:/]+\\.[0-9]+)/",
00098                      REG_EXTENDED);
00099               ret = regexec(&re, path, 2, match, 0);
00100               regfree(&re);
00101               if (ret == 0) {
00102                      free(fn);
00103                      assert(match[0].rm_so > 0);
00104                      assert(match[0].rm_eo > 0);
00105                      path[match[1].rm_eo + 1] = 0;
00106                      p = path + match[0].rm_so;
00107                      ret = sysfs_node_read(mask, "/sys/%s/numa_node", p);
00108                      if (ret < 0)
00109                             return node_parse_failure(ret, NULL, p);
00110                      return ret;
00111               }
00112        }
00113        free(fn);
00114 
00115        ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node",
00116                            cls, dev);
00117        if (ret < 0)
00118               return node_parse_failure(ret, cls, dev);
00119        return 0;
00120 }
00121 
00122 
00123 /* Turn file (or device node) into class name */
00124 static int affinity_file(struct bitmask *mask, char *cls, char *file)
00125 {
00126        struct stat st;
00127        DIR *dir;
00128        int n;
00129        unsigned maj = 0, min = 0;
00130        dev_t d;
00131        struct dirent de, *dep;
00132 
00133        cls = "block";
00134        char fn[sizeof("/sys/class/") + strlen(cls)];
00135        if (stat(file, &st) < 0) {
00136               numa_warn(W_blockdev1, "Cannot stat file %s", file);
00137               return -1;
00138        }
00139        d = st.st_dev;
00140        if (S_ISCHR(st.st_mode)) {
00141               /* Better choice than misc? Most likely misc will not work
00142                  anyways unless the kernel is fixed. */
00143               cls = "misc";
00144               d = st.st_rdev;
00145        } else if (S_ISBLK(st.st_mode))
00146               d = st.st_rdev;
00147 
00148        sprintf(fn, "/sys/class/%s", cls);
00149        dir = opendir(fn);
00150        if (!dir) {
00151               numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs",
00152                        cls);
00153               return -1;
00154        }
00155        while (readdir_r(dir, &de, &dep) == 0 && dep) {
00156               char *name = dep->d_name;
00157               if (*name == '.')
00158                      continue;
00159               char *dev;
00160               char fn2[sizeof("/sys/class/block//dev") + strlen(name)];
00161 
00162               n = -1;
00163               if (sprintf(fn2, "/sys/class/block/%s/dev", name) < 0)
00164                      break;
00165               dev = sysfs_read(fn2);
00166               if (dev) {
00167                      n = sscanf(dev, "%u:%u", &maj, &min);
00168                      free(dev);
00169               }
00170               if (n != 2) {
00171                      numa_warn(W_blockdev3, "Cannot parse sysfs device %s",
00172                               name);
00173                      continue;
00174               }
00175 
00176               if (major(d) != maj || minor(d) != min)
00177                      continue;
00178 
00179               closedir(dir);
00180               return affinity_class(mask, "block", name);
00181        }
00182        closedir(dir);
00183        numa_warn(W_blockdev5, "Cannot find block device %x:%x in sysfs for `%s'",
00184                 maj, min, file);
00185        return -1;
00186 }
00187 
00188 /* Look up interface of route using rtnetlink. */
00189 static int find_route(struct sockaddr *dst, int *iifp)
00190 {
00191        struct rtattr *rta;
00192        const int hdrlen = NLMSG_LENGTH(sizeof(struct rtmsg));
00193        struct {
00194               struct nlmsghdr msg;
00195               struct rtmsg rt;
00196               char buf[256];
00197        } req = {
00198               .msg = {
00199                      .nlmsg_len = hdrlen,
00200                      .nlmsg_type = RTM_GETROUTE,
00201                      .nlmsg_flags = NLM_F_REQUEST,
00202               },
00203               .rt = {
00204                      .rtm_family = dst->sa_family,
00205               },
00206        };
00207        struct sockaddr_nl adr = {
00208               .nl_family = AF_NETLINK,
00209        };
00210 
00211        if (rta_put_address(&req.msg, RTA_DST, dst) < 0) {
00212               numa_warn(W_netlink1, "Cannot handle network family %x",
00213                        dst->sa_family);
00214               return -1;
00215        }
00216 
00217        if (rtnetlink_request(&req.msg, sizeof req, &adr) < 0) {
00218               numa_warn(W_netlink2, "Cannot request rtnetlink route: %s",
00219                        strerror(errno));
00220               return -1;
00221        }
00222 
00223        /* Fish the interface out of the netlink soup. */
00224        rta = NULL;
00225        while ((rta = rta_get(&req.msg, rta, hdrlen)) != NULL) {
00226               if (rta->rta_type == RTA_OIF) {
00227                      memcpy(iifp, RTA_DATA(rta), sizeof(int));
00228                      return 0;
00229               }
00230        }
00231 
00232        numa_warn(W_netlink3, "rtnetlink query did not return interface");
00233        return -1;
00234 }
00235 
/* Ask the kernel (SIOCGIFNAME) to fill IFR for interface index IIF; the
   caller reads the name from ifr->ifr_name afterwards.
   Returns the ioctl() result, or -1 when no socket could be created. */
static int iif_to_name(int iif, struct ifreq *ifr)
{
        int fd;
        int rc;

        fd = socket(PF_INET, SOCK_DGRAM, 0);
        if (fd < 0)
                return -1;

        ifr->ifr_ifindex = iif;
        rc = ioctl(fd, SIOCGIFNAME, ifr);
        close(fd);
        return rc;
}
00247 
/* Resolve an IP address (or host name) ID to the nodes of the network
   device its route goes through.
   Only simple setups are handled: no multi-path, no bonding etc.  In
   those cases only the first interface or none is chosen.
   CLS is unused; the "net" class is always used for the final lookup.
   Returns the affinity_class() result, or -1 on failure. */
static int affinity_ip(struct bitmask *mask, char *cls, char *id)
{
        struct addrinfo *res;
        struct ifreq ifr;
        int ifindex;
        int have_name = 0;
        int err;

        err = getaddrinfo(id, NULL, NULL, &res);
        if (err != 0) {
                numa_warn(W_net1, "Cannot resolve %s: %s",
                          id, gai_strerror(err));
                return -1;
        }

        /* Only the first resolved address is considered. */
        if (find_route(res->ai_addr, &ifindex) >= 0) {
                if (iif_to_name(ifindex, &ifr) >= 0)
                        have_name = 1;
                else
                        numa_warn(W_net2, "Cannot resolve network interface %d",
                                  ifindex);
        }

        freeaddrinfo(res);
        if (!have_name)
                return -1;
        return affinity_class(mask, "net", ifr.ifr_name);
}
00280 
00281 /* Look up affinity for a PCI device */
00282 static int affinity_pci(struct bitmask *mask, char *cls, char *id)
00283 {
00284        unsigned seg, bus, dev, func;
00285        int n, ret;
00286 
00287        /* Func is optional. */
00288        if ((n = sscanf(id, "%x:%x:%x.%x",&seg,&bus,&dev,&func)) == 4 || n == 3) {
00289               if (n == 3)
00290                      func = 0;
00291        }
00292        /* Segment is optional too */
00293        else if ((n = sscanf(id, "%x:%x.%x",&bus,&dev,&func)) == 3 || n == 2) {
00294               seg = 0;
00295               if (n == 2)
00296                      func = 0;
00297        } else {
00298               numa_warn(W_pci1, "Cannot parse PCI device `%s'", id);
00299               return -1;
00300        }
00301        ret = sysfs_node_read(mask,
00302                      "/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node",
00303                            seg, bus, seg, bus, dev, func);
00304        if (ret < 0)
00305               return node_parse_failure(ret, cls, id);
00306        return 0;
00307 }
00308 
00309 static struct handler {
00310        char first;
00311        char *name;
00312        char *cls;
00313        int (*handler)(struct bitmask *mask, char *cls, char *desc);
00314 } handlers[] = {
00315        { 'n', "netdev:", "net",   affinity_class },
00316        { 'i', "ip:",     NULL,    affinity_ip    },
00317        { 'f', "file:",   NULL,    affinity_file  },
00318        { 'b', "block:",  "block", affinity_class },
00319        { 'p', "pci:",    NULL,        affinity_pci   },
00320        {}
00321 };
00322 
00323 hidden int resolve_affinity(char *id, struct bitmask *mask)
00324 {
00325        struct handler *h;
00326 
00327        for (h = &handlers[0]; h->first; h++) {
00328               int len;
00329               if (id[0] != h->first)
00330                      continue;
00331               len = strlen(h->name);
00332               if (!strncmp(id, h->name, len)) {
00333                      int ret = h->handler(mask, h->cls, id + len);
00334                      if (ret == -2) {
00335                             numa_warn(W_nonode, "Kernel does not know node for %s\n",
00336                                      id + len);
00337                      }
00338                      return ret;
00339               }
00340        }
00341        return NO_IO_AFFINITY;
00342 }
00343