tools/lguest/lguest.c

   1 /*P:100
   2  * This is the Launcher code, a simple program which lays out the "physical"
   3  * memory for the new Guest by mapping the kernel image and the virtual
   4  * devices, then opens /dev/lguest to tell the kernel about the Guest and
   5  * control it.
   6 :*/
   7 #define _LARGEFILE64_SOURCE
   8 #define _GNU_SOURCE
   9 #include <stdio.h>
  10 #include <string.h>
  11 #include <unistd.h>
  12 #include <err.h>
  13 #include <stdint.h>
  14 #include <stdlib.h>
  15 #include <elf.h>
  16 #include <sys/mman.h>
  17 #include <sys/param.h>
  18 #include <sys/types.h>
  19 #include <sys/stat.h>
  20 #include <sys/wait.h>
  21 #include <sys/eventfd.h>
  22 #include <fcntl.h>
  23 #include <stdbool.h>
  24 #include <errno.h>
  25 #include <ctype.h>
  26 #include <sys/socket.h>
  27 #include <sys/ioctl.h>
  28 #include <sys/time.h>
  29 #include <time.h>
  30 #include <netinet/in.h>
  31 #include <net/if.h>
  32 #include <linux/sockios.h>
  33 #include <linux/if_tun.h>
  34 #include <sys/uio.h>
  35 #include <termios.h>
  36 #include <getopt.h>
  37 #include <assert.h>
  38 #include <sched.h>
  39 #include <limits.h>
  40 #include <stddef.h>
  41 #include <signal.h>
  42 #include <pwd.h>
  43 #include <grp.h>
  44 #include <sys/user.h>
  45 #include <linux/pci_regs.h>
  46
  47 #ifndef VIRTIO_F_ANY_LAYOUT
  48 #define VIRTIO_F_ANY_LAYOUT             27
  49 #endif
  50
  51 /*L:110
  52  * We can ignore the 43 include files we need for this program, but I do want
  53  * to draw attention to the use of kernel-style types.
  54  *
  55  * As Linus said, "C is a Spartan language, and so should your naming be."  I
  56  * like these abbreviations, so we define them here.  Note that u64 is always
  57  * unsigned long long, which works on all Linux systems: this means that we can
  58  * use %llu in printf for any u64.
  59  */
  60 typedef unsigned long long u64;
  61 typedef uint32_t u32;
  62 typedef uint16_t u16;
  63 typedef uint8_t u8;
  64 /*:*/
  65
  66 #define VIRTIO_CONFIG_NO_LEGACY
  67 #define VIRTIO_PCI_NO_LEGACY
  68 #define VIRTIO_BLK_NO_LEGACY
  69
  70 /* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */
  71 #include "../../include/uapi/linux/virtio_config.h"
  72 #include "../../include/uapi/linux/virtio_net.h"
  73 #include "../../include/uapi/linux/virtio_blk.h"
  74 #include "../../include/uapi/linux/virtio_console.h"
  75 #include "../../include/uapi/linux/virtio_rng.h"
  76 #include <linux/virtio_ring.h>
  77 #include "../../include/uapi/linux/virtio_pci.h"
  78 #include <asm/bootparam.h>
  79 #include "../../include/linux/lguest_launcher.h"
  80
  81 #define BRIDGE_PFX "bridge:"
  82 #ifndef SIOCBRADDIF
  83 #define SIOCBRADDIF     0x89a2          /* add interface to bridge      */
  84 #endif
  85 /* We can have up to 256 pages for devices. */
  86 #define DEVICE_PAGES 256
  87 /* This will occupy 3 pages: it must be a power of 2. */
  88 #define VIRTQUEUE_NUM 256
  89
  90 /*L:120
  91  * verbose is both a global flag and a macro.  The C preprocessor allows
  92  * this, and although I wouldn't recommend it, it works quite nicely here.
  93  */
  94 static bool verbose;
  95 #define verbose(args...) \
  96         do { if (verbose) printf(args); } while(0)
  97 /*:*/
  98
  99 /* The pointer to the start of guest memory. */
 100 static void *guest_base;
 101 /* The maximum guest physical address allowed, and maximum possible. */
 102 static unsigned long guest_limit, guest_max, guest_mmio;
 103 /* The /dev/lguest file descriptor. */
 104 static int lguest_fd;
 105
 106 /* a per-cpu variable indicating whose vcpu is currently running */
 107 static unsigned int __thread cpu_id;
 108
 109 /* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
 110 #define MAX_PCI_DEVICES 32
 111
 112 /* This is our list of devices. */
 113 struct device_list {
 114         /* Counter to assign interrupt numbers. */
 115         unsigned int next_irq;
 116
 117         /* Counter to print out convenient device numbers. */
 118         unsigned int device_num;
 119
 120         /* PCI devices. */
 121         struct device *pci[MAX_PCI_DEVICES];
 122 };
 123
 124 /* The list of Guest devices, based on command line arguments. */
 125 static struct device_list devices;
 126
 127 struct virtio_pci_cfg_cap {
 128         struct virtio_pci_cap cap;
 129         u32 pci_cfg_data; /* Data for BAR access. */
 130 };
 131
 132 struct virtio_pci_mmio {
 133         struct virtio_pci_common_cfg cfg;
 134         u16 notify;
 135         u8 isr;
 136         u8 padding;
 137         /* Device-specific configuration follows this. */
 138 };
 139
 140 /* This is the layout (little-endian) of the PCI config space. */
 141 struct pci_config {
 142         u16 vendor_id, device_id;
 143         u16 command, status;
 144         u8 revid, prog_if, subclass, class;
 145         u8 cacheline_size, lat_timer, header_type, bist;
 146         u32 bar[6];
 147         u32 cardbus_cis_ptr;
 148         u16 subsystem_vendor_id, subsystem_device_id;
 149         u32 expansion_rom_addr;
 150         u8 capabilities, reserved1[3];
 151         u32 reserved2;
 152         u8 irq_line, irq_pin, min_grant, max_latency;
 153
 154         /* Now, this is the linked capability list. */
 155         struct virtio_pci_cap common;
 156         struct virtio_pci_notify_cap notify;
 157         struct virtio_pci_cap isr;
 158         struct virtio_pci_cap device;
 159         struct virtio_pci_cfg_cap cfg_access;
 160 };
 161
 162 /* The device structure describes a single device. */
 163 struct device {
 164         /* The name of this device, for --verbose. */
 165         const char *name;
 166
 167         /* Any queues attached to this device */
 168         struct virtqueue *vq;
 169
 170         /* Is it operational */
 171         bool running;
 172
 173         /* Has it written FEATURES_OK but not re-checked it? */
 174         bool wrote_features_ok;
 175
 176         /* PCI configuration */
 177         union {
 178                 struct pci_config config;
 179                 u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
 180         };
 181
 182         /* Features we offer, and those accepted. */
 183         u64 features, features_accepted;
 184
 185         /* Device-specific config hangs off the end of this. */
 186         struct virtio_pci_mmio *mmio;
 187
 188         /* PCI MMIO resources (all in BAR0) */
 189         size_t mmio_size;
 190         u32 mmio_addr;
 191
 192         /* Device-specific data. */
 193         void *priv;
 194 };
 195
 196 /* The virtqueue structure describes a queue attached to a device. */
 197 struct virtqueue {
 198         struct virtqueue *next;
 199
 200         /* Which device owns me. */
 201         struct device *dev;
 202
 203         /* Name for printing errors. */
 204         const char *name;
 205
 206         /* The actual ring of buffers. */
 207         struct vring vring;
 208
 209         /* The information about this virtqueue (we only use queue_size on) */
 210         struct virtio_pci_common_cfg pci_config;
 211
 212         /* Last available index we saw. */
 213         u16 last_avail_idx;
 214
 215         /* How many are used since we sent last irq? */
 216         unsigned int pending_used;
 217
 218         /* Eventfd where Guest notifications arrive. */
 219         int eventfd;
 220
 221         /* Function for the thread which is servicing this virtqueue. */
 222         void (*service)(struct virtqueue *vq);
 223         pid_t thread;
 224 };
 225
 226 /* Remember the arguments to the program so we can "reboot" */
 227 static char **main_args;
 228
 229 /* The original tty settings to restore on exit. */
 230 static struct termios orig_term;
 231
 232 /*
 233  * We have to be careful with barriers: our devices are all run in separate
 234  * threads and so we need to make sure that changes visible to the Guest happen
 235  * in precise order.
 236  */
 237 #define wmb() __asm__ __volatile__("" : : : "memory")
 238 #define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
 239 #define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
 240
 241 /* Wrapper for the last available index.  Makes it easier to change. */
 242 #define lg_last_avail(vq)       ((vq)->last_avail_idx)
 243
 244 /*
 245  * The virtio configuration space is defined to be little-endian.  x86 is
 246  * little-endian too, but it's nice to be explicit so we have these helpers.
 247  */
 248 #define cpu_to_le16(v16) (v16)
 249 #define cpu_to_le32(v32) (v32)
 250 #define cpu_to_le64(v64) (v64)
 251 #define le16_to_cpu(v16) (v16)
 252 #define le32_to_cpu(v32) (v32)
 253 #define le64_to_cpu(v64) (v64)
 254
 255 /*
 256  * A real device would ignore weird/non-compliant driver behaviour.  We
 257  * stop and flag it, to help debugging Linux problems.
 258  */
 259 #define bad_driver(d, fmt, ...) \
 260         errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
 261 #define bad_driver_vq(vq, fmt, ...)                            \
 262         errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
 263              vq->name, ## __VA_ARGS__)
 264
 265 /* Is this iovec empty? */
 266 static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
 267 {
 268         unsigned int i;
 269
 270         for (i = 0; i < num_iov; i++)
 271                 if (iov[i].iov_len)
 272                         return false;
 273         return true;
 274 }
 275
 276 /* Take len bytes from the front of this iovec. */
 277 static void iov_consume(struct device *d,
 278                         struct iovec iov[], unsigned num_iov,
 279                         void *dest, unsigned len)
 280 {
 281         unsigned int i;
 282
 283         for (i = 0; i < num_iov; i++) {
 284                 unsigned int used;
 285
 286                 used = iov[i].iov_len < len ? iov[i].iov_len : len;
 287                 if (dest) {
 288                         memcpy(dest, iov[i].iov_base, used);
 289                         dest += used;
 290                 }
 291                 iov[i].iov_base += used;
 292                 iov[i].iov_len -= used;
 293                 len -= used;
 294         }
 295         if (len != 0)
 296                 bad_driver(d, "iovec too short!");
 297 }
 298
 299 /*L:100
 300  * The Launcher code itself takes us out into userspace, that scary place where
 301  * pointers run wild and free!  Unfortunately, like most userspace programs,
 302  * it's quite boring (which is why everyone likes to hack on the kernel!).
 303  * Perhaps if you make up an Lguest Drinking Game at this point, it will get
 304  * you through this section.  Or, maybe not.
 305  *
 306  * The Launcher sets up a big chunk of memory to be the Guest's "physical"
 307  * memory and stores it in "guest_base".  In other words, Guest physical ==
 308  * Launcher virtual with an offset.
 309  *
 310  * This can be tough to get your head around, but usually it just means that we
 311  * use these trivial conversion functions when the Guest gives us its
 312  * "physical" addresses:
 313  */
 314 static void *from_guest_phys(unsigned long addr)
 315 {
 316         return guest_base + addr;
 317 }
 318
 319 static unsigned long to_guest_phys(const void *addr)
 320 {
 321         return (addr - guest_base);
 322 }
 323
 324 /*L:130
 325  * Loading the Kernel.
 326  *
  327  * We start with a couple of simple helper routines.  open_or_die() avoids
 328  * error-checking code cluttering the callers:
 329  */
 330 static int open_or_die(const char *name, int flags)
 331 {
 332         int fd = open(name, flags);
 333         if (fd < 0)
 334                 err(1, "Failed to open %s", name);
 335         return fd;
 336 }
 337
 338 /* map_zeroed_pages() takes a number of pages. */
 339 static void *map_zeroed_pages(unsigned int num)
 340 {
 341         int fd = open_or_die("/dev/zero", O_RDONLY);
 342         void *addr;
 343
 344         /*
 345          * We use a private mapping (ie. if we write to the page, it will be
 346          * copied). We allocate an extra two pages PROT_NONE to act as guard
 347          * pages against read/write attempts that exceed allocated space.
 348          */
 349         addr = mmap(NULL, getpagesize() * (num+2),
 350                     PROT_NONE, MAP_PRIVATE, fd, 0);
 351
 352         if (addr == MAP_FAILED)
 353                 err(1, "Mmapping %u pages of /dev/zero", num);
 354
 355         if (mprotect(addr + getpagesize(), getpagesize() * num,
 356                      PROT_READ|PROT_WRITE) == -1)
 357                 err(1, "mprotect rw %u pages failed", num);
 358
 359         /*
 360          * One neat mmap feature is that you can close the fd, and it
 361          * stays mapped.
 362          */
 363         close(fd);
 364
 365         /* Return address after PROT_NONE page */
 366         return addr + getpagesize();
 367 }
 368
 369 /* Get some bytes which won't be mapped into the guest. */
 370 static unsigned long get_mmio_region(size_t size)
 371 {
 372         unsigned long addr = guest_mmio;
 373         size_t i;
 374
 375         if (!size)
 376                 return addr;
 377
 378         /* Size has to be a power of 2 (and multiple of 16) */
 379         for (i = 1; i < size; i <<= 1);
 380
 381         guest_mmio += i;
 382
 383         return addr;
 384 }
 385
 386 /*
 387  * This routine is used to load the kernel or initrd.  It tries mmap, but if
 388  * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
 389  * it falls back to reading the memory in.
 390  */
 391 static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
 392 {
 393         ssize_t r;
 394
 395         /*
 396          * We map writable even though for some segments are marked read-only.
 397          * The kernel really wants to be writable: it patches its own
 398          * instructions.
 399          *
 400          * MAP_PRIVATE means that the page won't be copied until a write is
 401          * done to it.  This allows us to share untouched memory between
 402          * Guests.
 403          */
 404         if (mmap(addr, len, PROT_READ|PROT_WRITE,
 405                  MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
 406                 return;
 407
 408         /* pread does a seek and a read in one shot: saves a few lines. */
 409         r = pread(fd, addr, len, offset);
 410         if (r != len)
 411                 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
 412 }
 413
 414 /*
 415  * This routine takes an open vmlinux image, which is in ELF, and maps it into
  416  * the Guest memory.  ELF = Executable and Linkable Format, the format used
 417  * by all modern binaries on Linux including the kernel.
 418  *
 419  * The ELF headers give *two* addresses: a physical address, and a virtual
 420  * address.  We use the physical address; the Guest will map itself to the
 421  * virtual address.
 422  *
 423  * We return the starting address.
 424  */
 425 static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
 426 {
 427         Elf32_Phdr phdr[ehdr->e_phnum];
 428         unsigned int i;
 429
 430         /*
 431          * Sanity checks on the main ELF header: an x86 executable with a
 432          * reasonable number of correctly-sized program headers.
 433          */
 434         if (ehdr->e_type != ET_EXEC
 435             || ehdr->e_machine != EM_386
 436             || ehdr->e_phentsize != sizeof(Elf32_Phdr)
 437             || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
 438                 errx(1, "Malformed elf header");
 439
 440         /*
 441          * An ELF executable contains an ELF header and a number of "program"
 442          * headers which indicate which parts ("segments") of the program to
 443          * load where.
 444          */
 445
 446         /* We read in all the program headers at once: */
 447         if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
 448                 err(1, "Seeking to program headers");
 449         if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
 450                 err(1, "Reading program headers");
 451
 452         /*
 453          * Try all the headers: there are usually only three.  A read-only one,
 454          * a read-write one, and a "note" section which we don't load.
 455          */
 456         for (i = 0; i < ehdr->e_phnum; i++) {
 457                 /* If this isn't a loadable segment, we ignore it */
 458                 if (phdr[i].p_type != PT_LOAD)
 459                         continue;
 460
 461                 verbose("Section %i: size %i addr %p\n",
 462                         i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 463
 464                 /* We map this section of the file at its physical address. */
 465                 map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
 466                        phdr[i].p_offset, phdr[i].p_filesz);
 467         }
 468
 469         /* The entry point is given in the ELF header. */
 470         return ehdr->e_entry;
 471 }
 472
 473 /*L:150
 474  * A bzImage, unlike an ELF file, is not meant to be loaded.  You're supposed
 475  * to jump into it and it will unpack itself.  We used to have to perform some
 476  * hairy magic because the unpacking code scared me.
 477  *
 478  * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
 479  * a small patch to jump over the tricky bits in the Guest, so now we just read
 480  * the funky header so we know where in the file to load, and away we go!
 481  */
 482 static unsigned long load_bzimage(int fd)
 483 {
 484         struct boot_params boot;
 485         int r;
 486         /* Modern bzImages get loaded at 1M. */
 487         void *p = from_guest_phys(0x100000);
 488
 489         /*
 490          * Go back to the start of the file and read the header.  It should be
 491          * a Linux boot header (see Documentation/x86/boot.txt)
 492          */
 493         lseek(fd, 0, SEEK_SET);
 494         read(fd, &boot, sizeof(boot));
 495
 496         /* Inside the setup_hdr, we expect the magic "HdrS" */
 497         if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
 498                 errx(1, "This doesn't look like a bzImage to me");
 499
 500         /* Skip over the extra sectors of the header. */
 501         lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
 502
 503         /* Now read everything into memory. in nice big chunks. */
 504         while ((r = read(fd, p, 65536)) > 0)
 505                 p += r;
 506
 507         /* Finally, code32_start tells us where to enter the kernel. */
 508         return boot.hdr.code32_start;
 509 }
 510
 511 /*L:140
 512  * Loading the kernel is easy when it's a "vmlinux", but most kernels
 513  * come wrapped up in the self-decompressing "bzImage" format.  With a little
 514  * work, we can load those, too.
 515  */
 516 static unsigned long load_kernel(int fd)
 517 {
 518         Elf32_Ehdr hdr;
 519
 520         /* Read in the first few bytes. */
 521         if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
 522                 err(1, "Reading kernel");
 523
 524         /* If it's an ELF file, it starts with "\177ELF" */
 525         if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
 526                 return map_elf(fd, &hdr);
 527
 528         /* Otherwise we assume it's a bzImage, and try to load it. */
 529         return load_bzimage(fd);
 530 }
 531
 532 /*
 533  * This is a trivial little helper to align pages.  Andi Kleen hated it because
 534  * it calls getpagesize() twice: "it's dumb code."
 535  *
 536  * Kernel guys get really het up about optimization, even when it's not
 537  * necessary.  I leave this code as a reaction against that.
 538  */
 539 static inline unsigned long page_align(unsigned long addr)
 540 {
 541         /* Add upwards and truncate downwards. */
 542         return ((addr + getpagesize()-1) & ~(getpagesize()-1));
 543 }
 544
 545 /*L:180
 546  * An "initial ram disk" is a disk image loaded into memory along with the
 547  * kernel which the kernel can use to boot from without needing any drivers.
 548  * Most distributions now use this as standard: the initrd contains the code to
 549  * load the appropriate driver modules for the current machine.
 550  *
 551  * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
 552  * kernels.  He sent me this (and tells me when I break it).
 553  */
 554 static unsigned long load_initrd(const char *name, unsigned long mem)
 555 {
 556         int ifd;
 557         struct stat st;
 558         unsigned long len;
 559
 560         ifd = open_or_die(name, O_RDONLY);
 561         /* fstat() is needed to get the file size. */
 562         if (fstat(ifd, &st) < 0)
 563                 err(1, "fstat() on initrd '%s'", name);
 564
 565         /*
 566          * We map the initrd at the top of memory, but mmap wants it to be
 567          * page-aligned, so we round the size up for that.
 568          */
 569         len = page_align(st.st_size);
 570         map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
 571         /*
 572          * Once a file is mapped, you can close the file descriptor.  It's a
 573          * little odd, but quite useful.
 574          */
 575         close(ifd);
 576         verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
 577
 578         /* We return the initrd size. */
 579         return len;
 580 }
 581 /*:*/
 582
 583 /*
 584  * Simple routine to roll all the commandline arguments together with spaces
 585  * between them.
 586  */
 587 static void concat(char *dst, char *args[])
 588 {
 589         unsigned int i, len = 0;
 590
 591         for (i = 0; args[i]; i++) {
 592                 if (i) {
 593                         strcat(dst+len, " ");
 594                         len++;
 595                 }
 596                 strcpy(dst+len, args[i]);
 597                 len += strlen(args[i]);
 598         }
 599         /* In case it's empty. */
 600         dst[len] = '\0';
 601 }
 602
 603 /*L:185
 604  * This is where we actually tell the kernel to initialize the Guest.  We
 605  * saw the arguments it expects when we looked at initialize() in lguest_user.c:
 606  * the base of Guest "physical" memory, the top physical page to allow and the
 607  * entry point for the Guest.
 608  */
 609 static void tell_kernel(unsigned long start)
 610 {
 611         unsigned long args[] = { LHREQ_INITIALIZE,
 612                                  (unsigned long)guest_base,
 613                                  guest_limit / getpagesize(), start,
 614                                  (guest_mmio+getpagesize()-1) / getpagesize() };
 615         verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
 616                 guest_base, guest_base + guest_limit,
 617                 guest_limit, guest_mmio);
 618         lguest_fd = open_or_die("/dev/lguest", O_RDWR);
 619         if (write(lguest_fd, args, sizeof(args)) < 0)
 620                 err(1, "Writing to /dev/lguest");
 621 }
 622 /*:*/
 623
 624 /*L:200
 625  * Device Handling.
 626  *
 627  * When the Guest gives us a buffer, it sends an array of addresses and sizes.
 628  * We need to make sure it's not trying to reach into the Launcher itself, so
 629  * we have a convenient routine which checks it and exits with an error message
 630  * if something funny is going on:
 631  */
 632 static void *_check_pointer(struct device *d,
 633                             unsigned long addr, unsigned int size,
 634                             unsigned int line)
 635 {
 636         /*
 637          * Check if the requested address and size exceeds the allocated memory,
 638          * or addr + size wraps around.
 639          */
 640         if ((addr + size) > guest_limit || (addr + size) < addr)
 641                 bad_driver(d, "%s:%i: Invalid address %#lx",
 642                            __FILE__, line, addr);
 643         /*
 644          * We return a pointer for the caller's convenience, now we know it's
 645          * safe to use.
 646          */
 647         return from_guest_phys(addr);
 648 }
 649 /* A macro which transparently hands the line number to the real function. */
 650 #define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
 651
 652 /*
 653  * Each buffer in the virtqueues is actually a chain of descriptors.  This
 654  * function returns the next descriptor in the chain, or vq->vring.num if we're
 655  * at the end.
 656  */
 657 static unsigned next_desc(struct device *d, struct vring_desc *desc,
 658                           unsigned int i, unsigned int max)
 659 {
 660         unsigned int next;
 661
 662         /* If this descriptor says it doesn't chain, we're done. */
 663         if (!(desc[i].flags & VRING_DESC_F_NEXT))
 664                 return max;
 665
 666         /* Check they're not leading us off end of descriptors. */
 667         next = desc[i].next;
 668         /* Make sure compiler knows to grab that: we don't want it changing! */
 669         wmb();
 670
 671         if (next >= max)
 672                 bad_driver(d, "Desc next is %u", next);
 673
 674         return next;
 675 }
 676
 677 /*
 678  * This actually sends the interrupt for this virtqueue, if we've used a
 679  * buffer.
 680  */
 681 static void trigger_irq(struct virtqueue *vq)
 682 {
 683         unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
 684
 685         /* Don't inform them if nothing used. */
 686         if (!vq->pending_used)
 687                 return;
 688         vq->pending_used = 0;
 689
 690         /*
 691          * 2.4.7.1:
 692          *
 693          *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
 694          *    The driver MUST set flags to 0 or 1.
 695          */
 696         if (vq->vring.avail->flags > 1)
 697                 bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
 698
 699         /*
 700          * 2.4.7.2:
 701          *
 702          *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
 703          *
 704          *     - The device MUST ignore the used_event value.
 705          *     - After the device writes a descriptor index into the used ring:
 706          *         - If flags is 1, the device SHOULD NOT send an interrupt.
 707          *         - If flags is 0, the device MUST send an interrupt.
 708          */
 709         if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
 710                 return;
 711         }
 712
 713         /*
 714          * 4.1.4.5.1:
 715          *
 716          *  If MSI-X capability is disabled, the device MUST set the Queue
 717          *  Interrupt bit in ISR status before sending a virtqueue notification
 718          *  to the driver.
 719          */
 720         vq->dev->mmio->isr = 0x1;
 721
 722         /* Send the Guest an interrupt tell them we used something up. */
 723         if (write(lguest_fd, buf, sizeof(buf)) != 0)
 724                 err(1, "Triggering irq %i", vq->dev->config.irq_line);
 725 }
 726
 727 /*
 728  * This looks in the virtqueue for the first available buffer, and converts
 729  * it to an iovec for convenient access.  Since descriptors consist of some
 730  * number of output then some number of input descriptors, it's actually two
 731  * iovecs, but we pack them into one and note how many of each there were.
 732  *
 733  * This function waits if necessary, and returns the descriptor number found.
 734  */
 735 static unsigned wait_for_vq_desc(struct virtqueue *vq,
 736                                  struct iovec iov[],
 737                                  unsigned int *out_num, unsigned int *in_num)
 738 {
 739         unsigned int i, head, max;
 740         struct vring_desc *desc;
 741         u16 last_avail = lg_last_avail(vq);
 742
 743         /*
 744          * 2.4.7.1:
 745          *
 746          *   The driver MUST handle spurious interrupts from the device.
 747          *
 748          * That's why this is a while loop.
 749          */
 750
 751         /* There's nothing available? */
 752         while (last_avail == vq->vring.avail->idx) {
 753                 u64 event;
 754
 755                 /*
 756                  * Since we're about to sleep, now is a good time to tell the
 757                  * Guest about what we've used up to now.
 758                  */
 759                 trigger_irq(vq);
 760
 761                 /* OK, now we need to know about added descriptors. */
 762                 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
 763
 764                 /*
 765                  * They could have slipped one in as we were doing that: make
 766                  * sure it's written, then check again.
 767                  */
 768                 mb();
 769                 if (last_avail != vq->vring.avail->idx) {
 770                         vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
 771                         break;
 772                 }
 773
 774                 /* Nothing new?  Wait for eventfd to tell us they refilled. */
 775                 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
 776                         errx(1, "Event read failed?");
 777
 778                 /* We don't need to be notified again. */
 779                 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
 780         }
 781
 782         /* Check it isn't doing very strange things with descriptor numbers. */
 783         if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
 784                 bad_driver_vq(vq, "Guest moved used index from %u to %u",
 785                               last_avail, vq->vring.avail->idx);
 786
 787         /*
 788          * Make sure we read the descriptor number *after* we read the ring
 789          * update; don't let the cpu or compiler change the order.
 790          */
 791         rmb();
 792
 793         /*
 794          * Grab the next descriptor number they're advertising, and increment
 795          * the index we've seen.
 796          */
 797         head = vq->vring.avail->ring[last_avail % vq->vring.num];
 798         lg_last_avail(vq)++;
 799
 800         /* If their number is silly, that's a fatal mistake. */
 801         if (head >= vq->vring.num)
 802                 bad_driver_vq(vq, "Guest says index %u is available", head);
 803
 804         /* When we start there are none of either input nor output. */
 805         *out_num = *in_num = 0;
 806
 807         max = vq->vring.num;
 808         desc = vq->vring.desc;
 809         i = head;
 810
 811         /*
 812          * We have to read the descriptor after we read the descriptor number,
 813          * but there's a data dependency there so the CPU shouldn't reorder
 814          * that: no rmb() required.
 815          */
 816
 817         do {
 818                 /*
 819                  * If this is an indirect entry, then this buffer contains a
 820                  * descriptor table which we handle as if it's any normal
 821                  * descriptor chain.
 822                  */
 823                 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
 824                         /* 2.4.5.3.1:
 825                          *
 826                          *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
 827                          *  flag unless the VIRTIO_F_INDIRECT_DESC feature was
 828                          *  negotiated.
 829                          */
 830                         if (!(vq->dev->features_accepted &
 831                               (1<<VIRTIO_RING_F_INDIRECT_DESC)))
 832                                 bad_driver_vq(vq, "vq indirect not negotiated");
 833
 834                         /*
 835                          * 2.4.5.3.1:
 836                          *
 837                          *   The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
 838                          *   flag within an indirect descriptor (ie. only one
 839                          *   table per descriptor).
 840                          */
 841                         if (desc != vq->vring.desc)
 842                                 bad_driver_vq(vq, "Indirect within indirect");
 843
 844                         /*
 845                          * Proposed update VIRTIO-134 spells this out:
 846                          *
 847                          *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
 848                          *   and VIRTQ_DESC_F_NEXT in flags.
 849                          */
 850                         if (desc[i].flags & VRING_DESC_F_NEXT)
 851                                 bad_driver_vq(vq, "indirect and next together");
 852
 853                         if (desc[i].len % sizeof(struct vring_desc))
 854                                 bad_driver_vq(vq,
 855                                               "Invalid size for indirect table");
 856                         /*
 857                          * 2.4.5.3.2:
 858                          *
 859                          *  The device MUST ignore the write-only flag
 860                          *  (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
 861                          *  refers to an indirect table.
 862                          *
 863                          * We ignore it here: :)
 864                          */
 865
 866                         max = desc[i].len / sizeof(struct vring_desc);
 867                         desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
 868                         i = 0;
 869
 870                         /* 2.4.5.3.1:
 871                          *
 872                          *  A driver MUST NOT create a descriptor chain longer
 873                          *  than the Queue Size of the device.
 874                          */
 875                         if (max > vq->pci_config.queue_size)
 876                                 bad_driver_vq(vq,
 877                                               "indirect has too many entries");
 878                 }
 879
 880                 /* Grab the first descriptor, and check it's OK. */
 881                 iov[*out_num + *in_num].iov_len = desc[i].len;
 882                 iov[*out_num + *in_num].iov_base
 883                         = check_pointer(vq->dev, desc[i].addr, desc[i].len);
 884                 /* If this is an input descriptor, increment that count. */
 885                 if (desc[i].flags & VRING_DESC_F_WRITE)
 886                         (*in_num)++;
 887                 else {
 888                         /*
 889                          * If it's an output descriptor, they're all supposed
 890                          * to come before any input descriptors.
 891                          */
 892                         if (*in_num)
 893                                 bad_driver_vq(vq,
 894                                               "Descriptor has out after in");
 895                         (*out_num)++;
 896                 }
 897
 898                 /* If we've got too many, that implies a descriptor loop. */
 899                 if (*out_num + *in_num > max)
 900                         bad_driver_vq(vq, "Looped descriptor");
 901         } while ((i = next_desc(vq->dev, desc, i, max)) != max);
 902
 903         return head;
 904 }
 905
 906 /*
 907  * After we've used one of their buffers, we tell the Guest about it.  Sometime
 908  * later we'll want to send them an interrupt using trigger_irq(); note that
 909  * wait_for_vq_desc() does that for us if it has to wait.
 910  */
 911 static void add_used(struct virtqueue *vq, unsigned int head, int len)
 912 {
 913         struct vring_used_elem *used;
 914
 915         /*
 916          * The virtqueue contains a ring of used buffers.  Get a pointer to the
 917          * next entry in that used ring.
 918          */
 919         used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
 920         used->id = head;
 921         used->len = len;
 922         /* Make sure buffer is written before we update index. */
 923         wmb();
 924         vq->vring.used->idx++;
 925         vq->pending_used++;
 926 }
 927
/*
 * And here's the combo meal deal.  Supersize me!
 *
 * Marks the buffer as used via add_used(), then immediately calls
 * trigger_irq() on the same virtqueue.
 */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
        add_used(vq, head, len);
        trigger_irq(vq);
}
 934
/*
 * The Console
 *
 * We associate some data with the console for our exit hack:
 * console_input() finds this through the device's priv pointer and uses it
 * to notice three ^C in quick succession.
 */
struct console_abort {
        /* How many times have they hit ^C?  Reset by any other input. */
        int count;
        /* When did they start?  Stamped when count goes to 1. */
        struct timeval start;
};
 946
 947 /* This is the routine which handles console input (ie. stdin). */
 948 static void console_input(struct virtqueue *vq)
 949 {
 950         int len;
 951         unsigned int head, in_num, out_num;
 952         struct console_abort *abort = vq->dev->priv;
 953         struct iovec iov[vq->vring.num];
 954
 955         /* Make sure there's a descriptor available. */
 956         head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 957         if (out_num)
 958                 bad_driver_vq(vq, "Output buffers in console in queue?");
 959
 960         /* Read into it.  This is where we usually wait. */
 961         len = readv(STDIN_FILENO, iov, in_num);
 962         if (len <= 0) {
 963                 /* Ran out of input? */
 964                 warnx("Failed to get console input, ignoring console.");
 965                 /*
 966                  * For simplicity, dying threads kill the whole Launcher.  So
 967                  * just nap here.
 968                  */
 969                 for (;;)
 970                         pause();
 971         }
 972
 973         /* Tell the Guest we used a buffer. */
 974         add_used_and_trigger(vq, head, len);
 975
 976         /*
 977          * Three ^C within one second?  Exit.
 978          *
 979          * This is such a hack, but works surprisingly well.  Each ^C has to
 980          * be in a buffer by itself, so they can't be too fast.  But we check
 981          * that we get three within about a second, so they can't be too
 982          * slow.
 983          */
 984         if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
 985                 abort->count = 0;
 986                 return;
 987         }
 988
 989         abort->count++;
 990         if (abort->count == 1)
 991                 gettimeofday(&abort->start, NULL);
 992         else if (abort->count == 3) {
 993                 struct timeval now;
 994                 gettimeofday(&now, NULL);
 995                 /* Kill all Launcher processes with SIGINT, like normal ^C */
 996                 if (now.tv_sec <= abort->start.tv_sec+1)
 997                         kill(0, SIGINT);
 998                 abort->count = 0;
 999         }
1000 }
1001
1002 /* This is the routine which handles console output (ie. stdout). */
1003 static void console_output(struct virtqueue *vq)
1004 {
1005         unsigned int head, out, in;
1006         struct iovec iov[vq->vring.num];
1007
1008         /* We usually wait in here, for the Guest to give us something. */
1009         head = wait_for_vq_desc(vq, iov, &out, &in);
1010         if (in)
1011                 bad_driver_vq(vq, "Input buffers in console output queue?");
1012
1013         /* writev can return a partial write, so we loop here. */
1014         while (!iov_empty(iov, out)) {
1015                 int len = writev(STDOUT_FILENO, iov, out);
1016                 if (len <= 0) {
1017                         warn("Write to stdout gave %i (%d)", len, errno);
1018                         break;
1019                 }
1020                 iov_consume(vq->dev, iov, out, NULL, len);
1021         }
1022
1023         /*
1024          * We're finished with that buffer: if we're going to sleep,
1025          * wait_for_vq_desc() will prod the Guest with an interrupt.
1026          */
1027         add_used(vq, head, 0);
1028 }
1029
/*
 * The Network
 *
 * Handling output for network is also simple: we get all the output buffers
 * and write them to /dev/net/tun.
 *
 * The network routines find this structure through the device's priv pointer.
 */
struct net_info {
        /* Descriptor net_output() writes packets to and net_input() reads from. */
        int tunfd;
};
1039
1040 static void net_output(struct virtqueue *vq)
1041 {
1042         struct net_info *net_info = vq->dev->priv;
1043         unsigned int head, out, in;
1044         struct iovec iov[vq->vring.num];
1045
1046         /* We usually wait in here for the Guest to give us a packet. */
1047         head = wait_for_vq_desc(vq, iov, &out, &in);
1048         if (in)
1049                 bad_driver_vq(vq, "Input buffers in net output queue?");
1050         /*
1051          * Send the whole thing through to /dev/net/tun.  It expects the exact
1052          * same format: what a coincidence!
1053          */
1054         if (writev(net_info->tunfd, iov, out) < 0)
1055                 warnx("Write to tun failed (%d)?", errno);
1056
1057         /*
1058          * Done with that one; wait_for_vq_desc() will send the interrupt if
1059          * all packets are processed.
1060          */
1061         add_used(vq, head, 0);
1062 }
1063
/*
 * Handling network input is a bit trickier, because I've tried to optimize it.
 *
 * First we have a helper routine which tells us if reading from this file
 * descriptor (ie. the /dev/net/tun device) will block.  It polls with a zero
 * timeout, and answers "yes" unless select() reports exactly one readable
 * descriptor.
 */
static bool will_block(int fd)
{
        struct timeval no_wait = { .tv_sec = 0, .tv_usec = 0 };
        fd_set readable;
        int nready;

        FD_ZERO(&readable);
        FD_SET(fd, &readable);

        nready = select(fd + 1, &readable, NULL, NULL, &no_wait);
        return nready != 1;
}
1078
1079 /*
1080  * This handles packets coming in from the tun device to our Guest.  Like all
1081  * service routines, it gets called again as soon as it returns, so you don't
1082  * see a while(1) loop here.
1083  */
1084 static void net_input(struct virtqueue *vq)
1085 {
1086         int len;
1087         unsigned int head, out, in;
1088         struct iovec iov[vq->vring.num];
1089         struct net_info *net_info = vq->dev->priv;
1090
1091         /*
1092          * Get a descriptor to write an incoming packet into.  This will also
1093          * send an interrupt if they're out of descriptors.
1094          */
1095         head = wait_for_vq_desc(vq, iov, &out, &in);
1096         if (out)
1097                 bad_driver_vq(vq, "Output buffers in net input queue?");
1098
1099         /*
1100          * If it looks like we'll block reading from the tun device, send them
1101          * an interrupt.
1102          */
1103         if (vq->pending_used && will_block(net_info->tunfd))
1104                 trigger_irq(vq);
1105
1106         /*
1107          * Read in the packet.  This is where we normally wait (when there's no
1108          * incoming network traffic).
1109          */
1110         len = readv(net_info->tunfd, iov, in);
1111         if (len <= 0)
1112                 warn("Failed to read from tun (%d).", errno);
1113
1114         /*
1115          * Mark that packet buffer as used, but don't interrupt here.  We want
1116          * to wait until we've done as much work as we can.
1117          */
1118         add_used(vq, head, len);
1119 }
1120 /*:*/
1121
1122 /* This is the helper to create threads: run the service routine in a loop. */
1123 static int do_thread(void *_vq)
1124 {
1125         struct virtqueue *vq = _vq;
1126
1127         for (;;)
1128                 vq->service(vq);
1129         return 0;
1130 }
1131
/*
 * When a child dies, we kill our entire process group with SIGTERM.  This
 * also has the side effect that the shell restores the console for us!
 *
 * Installed as the SIGCHLD handler by reset_device(); the signal number
 * argument is not used.
 */
static void kill_launcher(int signal)
{
        kill(0, SIGTERM);
}
1140
1141 static void reset_vq_pci_config(struct virtqueue *vq)
1142 {
1143         vq->pci_config.queue_size = VIRTQUEUE_NUM;
1144         vq->pci_config.queue_enable = 0;
1145 }
1146
1147 static void reset_device(struct device *dev)
1148 {
1149         struct virtqueue *vq;
1150
1151         verbose("Resetting device %s\n", dev->name);
1152
1153         /* Clear any features they've acked. */
1154         dev->features_accepted = 0;
1155
1156         /* We're going to be explicitly killing threads, so ignore them. */
1157         signal(SIGCHLD, SIG_IGN);
1158
1159         /*
1160          * 4.1.4.3.1:
1161          *
1162          *   The device MUST present a 0 in queue_enable on reset.
1163          *
1164          * This means we set it here, and reset the saved ones in every vq.
1165          */
1166         dev->mmio->cfg.queue_enable = 0;
1167
1168         /* Get rid of the virtqueue threads */
1169         for (vq = dev->vq; vq; vq = vq->next) {
1170                 vq->last_avail_idx = 0;
1171                 reset_vq_pci_config(vq);
1172                 if (vq->thread != (pid_t)-1) {
1173                         kill(vq->thread, SIGTERM);
1174                         waitpid(vq->thread, NULL, 0);
1175                         vq->thread = (pid_t)-1;
1176                 }
1177         }
1178         dev->running = false;
1179         dev->wrote_features_ok = false;
1180
1181         /* Now we care if threads die. */
1182         signal(SIGCHLD, (void *)kill_launcher);
1183 }
1184
1185 static void cleanup_devices(void)
1186 {
1187         unsigned int i;
1188
1189         for (i = 1; i < MAX_PCI_DEVICES; i++) {
1190                 struct device *d = devices.pci[i];
1191                 if (!d)
1192                         continue;
1193                 reset_device(d);
1194         }
1195
1196         /* If we saved off the original terminal settings, restore them now. */
1197         if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
1198                 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
1199 }
1200
1201 /*L:217
1202  * We do PCI.  This is mainly done to let us test the kernel virtio PCI
1203  * code.
1204  */
1205
/*
 * Linux expects a PCI host bridge: ours is a dummy, and first on the bus.
 * init_pci_host_bridge() fills it in and installs it as devices.pci[0].
 */
static struct device pci_host_bridge;
1208
1209 static void init_pci_host_bridge(void)
1210 {
1211         pci_host_bridge.name = "PCI Host Bridge";
1212         pci_host_bridge.config.class = 0x06; /* bridge */
1213         pci_host_bridge.config.subclass = 0; /* host bridge */
1214         devices.pci[0] = &pci_host_bridge;
1215 }
1216
/*
 * The IO ports used to read the PCI config space.  Each names the first port
 * of a four-port window: see is_pci_addr_port() and is_pci_data_port().
 */
#define PCI_CONFIG_ADDR 0xCF8
#define PCI_CONFIG_DATA 0xCFC
1220
/*
 * Not really portable, but does help readability: this is what the Guest
 * writes to the PCI_CONFIG_ADDR IO port.  The bitfield view relies on the
 * compiler's bitfield layout, hence "not really portable".
 */
union pci_config_addr {
        struct {
                /* Not examined by dev_and_reg(); presumably "must be zero". */
                unsigned mbz: 2;
                /* Index of the 32-bit config word being addressed. */
                unsigned offset: 6;
                /* dev_and_reg() only accepts function 0. */
                unsigned funcnum: 3;
                /* Slot number: the index passed to find_pci_device(). */
                unsigned devnum: 5;
                /* dev_and_reg() only accepts bus 0. */
                unsigned busnum: 8;
                unsigned reserved: 7;
                /* Must be set, or data port accesses find no device. */
                unsigned enabled : 1;
        } bits;
        /* The same thing as the raw 32-bit value the Guest reads and writes. */
        u32 val;
};
1237
/*
 * We cache what they wrote to the address port, so we know what they're
 * talking about when they access the data port.  Updated by
 * pci_addr_iowrite() and decoded by dev_and_reg().
 */
static union pci_config_addr pci_config_addr;
1243
/*
 * Look up the device in PCI slot "index".  The result is whatever
 * devices.pci[] holds for that slot (callers such as cleanup_devices() treat
 * NULL as "no device").  The index is not range-checked here.
 */
static struct device *find_pci_device(unsigned int index)
{
        return devices.pci[index];
}
1248
/*
 * PCI can do 1, 2 and 4 byte reads; we handle that here.
 *
 * Extracts from the 32-bit word "v" the bytes starting at byte offset "off"
 * (0-3), keeps those selected by "mask" (0xFF, 0xFFFF or 0xFFFFFFFF) and
 * stores the result in *val.
 */
static void ioread(u16 off, u32 v, u32 mask, u32 *val)
{
        u32 shifted;

        assert(off < 4);
        assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);

        shifted = v >> (off * 8);
        *val = shifted & mask;
}
1256
/*
 * PCI can do 1, 2 and 4 byte writes; we handle that here.
 *
 * Replaces the bytes of *dst selected by "mask" (0xFF, 0xFFFF or 0xFFFFFFFF)
 * at byte offset "off" (0-3) with the matching low bytes of "v"; the other
 * bytes of *dst are left alone.
 */
static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
{
        u32 keep, fresh;

        assert(off < 4);
        assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);

        keep = *dst & ~(mask << (off * 8));
        fresh = (v & mask) << (off * 8);
        *dst = keep | fresh;
}
1265
1266 /*
1267  * Where PCI_CONFIG_DATA accesses depends on the previous write to
1268  * PCI_CONFIG_ADDR.
1269  */
1270 static struct device *dev_and_reg(u32 *reg)
1271 {
1272         if (!pci_config_addr.bits.enabled)
1273                 return NULL;
1274
1275         if (pci_config_addr.bits.funcnum != 0)
1276                 return NULL;
1277
1278         if (pci_config_addr.bits.busnum != 0)
1279                 return NULL;
1280
1281         if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
1282                 return NULL;
1283
1284         *reg = pci_config_addr.bits.offset;
1285         return find_pci_device(pci_config_addr.bits.devnum);
1286 }
1287
1288 /*
1289  * We can get invalid combinations of values while they're writing, so we
1290  * only fault if they try to write with some invalid bar/offset/length.
1291  */
1292 static bool valid_bar_access(struct device *d,
1293                              struct virtio_pci_cfg_cap *cfg_access)
1294 {
1295         /* We only have 1 bar (BAR0) */
1296         if (cfg_access->cap.bar != 0)
1297                 return false;
1298
1299         /* Check it's within BAR0. */
1300         if (cfg_access->cap.offset >= d->mmio_size
1301             || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
1302                 return false;
1303
1304         /* Check length is 1, 2 or 4. */
1305         if (cfg_access->cap.length != 1
1306             && cfg_access->cap.length != 2
1307             && cfg_access->cap.length != 4)
1308                 return false;
1309
1310         /*
1311          * 4.1.4.7.2:
1312          *
1313          *  The driver MUST NOT write a cap.offset which is not a multiple of
1314          *  cap.length (ie. all accesses MUST be aligned).
1315          */
1316         if (cfg_access->cap.offset % cfg_access->cap.length != 0)
1317                 return false;
1318
1319         /* Return pointer into word in BAR0. */
1320         return true;
1321 }
1322
1323 /* Is this accessing the PCI config address port?. */
1324 static bool is_pci_addr_port(u16 port)
1325 {
1326         return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
1327 }
1328
1329 static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
1330 {
1331         iowrite(port - PCI_CONFIG_ADDR, val, mask,
1332                 &pci_config_addr.val);
1333         verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
1334                 pci_config_addr.bits.enabled ? "" : " DISABLED",
1335                 val, mask,
1336                 pci_config_addr.bits.busnum,
1337                 pci_config_addr.bits.devnum,
1338                 pci_config_addr.bits.funcnum,
1339                 pci_config_addr.bits.offset);
1340         return true;
1341 }
1342
/*
 * The Guest read 1, 2 or 4 bytes (as given by "mask") from the PCI config
 * address register: answer from our cached copy, storing the result in *val.
 */
static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
{
        ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
}
1347
1348 /* Is this accessing the PCI config data port?. */
1349 static bool is_pci_data_port(u16 port)
1350 {
1351         return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
1352 }
1353
/* Defined further down; the cfg_access window below needs it. */
static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);

/*
 * Handle a Guest write of "val" (1, 2 or 4 bytes, as given by "mask") to the
 * PCI config data port "port".
 *
 * Which config word is hit was selected by the earlier write to the address
 * port; we work out which field that word is by comparing addresses within
 * the device's config space.  Only a few fields are writable.
 *
 * Returns true if the write was accepted (or deliberately ignored), false if
 * there is no such device or the field may not be written.
 */
static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
{
        u32 reg, portoff;
        struct device *d = dev_and_reg(&reg);

        /* Complain if they don't belong to a device. */
        if (!d)
                return false;

        /* They can do 1 byte writes, etc. */
        portoff = port - PCI_CONFIG_DATA;

        /*
         * PCI uses a weird way to determine the BAR size: the OS
         * writes all 1's, and sees which ones stick.
         */
        if (&d->config_words[reg] == &d->config.bar[0]) {
                int i;

                iowrite(portoff, val, mask, &d->config.bar[0]);
                /* Clear every bit below mmio_size, so those never stick. */
                for (i = 0; (1 << i) < d->mmio_size; i++)
                        d->config.bar[0] &= ~(1 << i);
                return true;
        } else if ((&d->config_words[reg] > &d->config.bar[0]
                    && &d->config_words[reg] <= &d->config.bar[6])
                   || &d->config_words[reg] == &d->config.expansion_rom_addr) {
                /*
                 * Allow writing to any other BAR, or expansion ROM.
                 *
                 * NOTE(review): the upper bound is &bar[6] inclusive; if
                 * bar[] has six entries this also admits the word right
                 * after the array -- confirm that is intended.
                 */
                iowrite(portoff, val, mask, &d->config_words[reg]);
                return true;
        } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
                /* We let them override latency timer and cacheline size. */
                /* Only let them change the first two fields. */
                if (mask == 0xFFFFFFFF)
                        mask = 0xFFFF;
                iowrite(portoff, val, mask, &d->config_words[reg]);
                return true;
        } else if (&d->config_words[reg] == (void *)&d->config.command
                   && mask == 0xFFFF) {
                /* Ignore command writes. */
                return true;
        } else if (&d->config_words[reg]
                   == (void *)&d->config.cfg_access.cap.bar
                   || &d->config_words[reg]
                   == &d->config.cfg_access.cap.length
                   || &d->config_words[reg]
                   == &d->config.cfg_access.cap.offset) {

                /*
                 * The VIRTIO_PCI_CAP_PCI_CFG capability
                 * provides a backdoor to access the MMIO
                 * regions without mapping them.  Weird, but
                 * useful.
                 *
                 * Here they are only setting up the window (bar, offset or
                 * length); nothing is validated until the data is touched.
                 */
                iowrite(portoff, val, mask, &d->config_words[reg]);
                return true;
        } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
                u32 write_mask;

                /*
                 * 4.1.4.7.1:
                 *
                 *  Upon detecting driver write access to pci_cfg_data, the
                 *  device MUST execute a write access at offset cap.offset at
                 *  BAR selected by cap.bar using the first cap.length bytes
                 *  from pci_cfg_data.
                 */

                /* Must be bar 0, in range, aligned, and 1, 2 or 4 bytes. */
                if (!valid_bar_access(d, &d->config.cfg_access))
                        return false;

                /* Store what they wrote in the window's data word first. */
                iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);

                /*
                 * Now emulate a write.  The mask we use is set by
                 * len, *not* this write!
                 */
                write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
                verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
                        d->config.cfg_access.pci_cfg_data, write_mask,
                        d->config.cfg_access.cap.bar,
                        d->config.cfg_access.cap.offset,
                        d->config.cfg_access.cap.length);

                emulate_mmio_write(d, d->config.cfg_access.cap.offset,
                                   d->config.cfg_access.pci_cfg_data,
                                   write_mask);
                return true;
        }

        /*
         * 4.1.4.1:
         *
         *  The driver MUST NOT write into any field of the capability
         *  structure, with the exception of those with cap_type
         *  VIRTIO_PCI_CAP_PCI_CFG...
         */
        return false;
}
1455
1456 static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
1457
1458 static void pci_data_ioread(u16 port, u32 mask, u32 *val)
1459 {
1460         u32 reg;
1461         struct device *d = dev_and_reg(&reg);
1462
1463         if (!d)
1464                 return;
1465
1466         /* Read through the PCI MMIO access window is special */
1467         if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
1468                 u32 read_mask;
1469
1470                 /*
1471                  * 4.1.4.7.1:
1472                  *
1473                  *  Upon detecting driver read access to pci_cfg_data, the
1474                  *  device MUST execute a read access of length cap.length at
1475                  *  offset cap.offset at BAR selected by cap.bar and store the
1476                  *  first cap.length bytes in pci_cfg_data.
1477                  */
1478                 /* Must be bar 0 */
1479                 if (!valid_bar_access(d, &d->config.cfg_access))
1480                         bad_driver(d,
1481                              "Invalid cfg_access to bar%u, offset %u len %u",
1482                              d->config.cfg_access.cap.bar,
1483                              d->config.cfg_access.cap.offset,
1484                              d->config.cfg_access.cap.length);
1485
1486                 /*
1487                  * Read into the window.  The mask we use is set by
1488                  * len, *not* this read!
1489                  */
1490                 read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
1491                 d->config.cfg_access.pci_cfg_data
1492                         = emulate_mmio_read(d,
1493                                             d->config.cfg_access.cap.offset,
1494                                             read_mask);
1495                 verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
1496                         d->config.cfg_access.pci_cfg_data, read_mask,
1497                         d->config.cfg_access.cap.bar,
1498                         d->config.cfg_access.cap.offset,
1499                         d->config.cfg_access.cap.length);
1500         }
1501         ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
1502 }
1503
1504 /*L:216
1505  * This is where we emulate a handful of Guest instructions.  It's ugly
1506  * and we used to do it in the kernel but it grew over time.
1507  */
1508
/*
 * We use the ptrace syscall's pt_regs struct to talk about registers
 * to lguest: these macros convert the names to the offsets.
 *
 * "name" is a member of struct user_regs_struct, eg. getreg(eax) or
 * setreg(eax, val); the work is done by getreg_off() and setreg_off().
 */
#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
#define setreg(name, val) \
        setreg_off(offsetof(struct user_regs_struct, name), (val))
1516
1517 static u32 getreg_off(size_t offset)
1518 {
1519         u32 r;
1520         unsigned long args[] = { LHREQ_GETREG, offset };
1521
1522         if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1523                 err(1, "Getting register %u", offset);
1524         if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
1525                 err(1, "Reading register %u", offset);
1526
1527         return r;
1528 }
1529
1530 static void setreg_off(size_t offset, u32 val)
1531 {
1532         unsigned long args[] = { LHREQ_SETREG, offset, val };
1533
1534         if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1535                 err(1, "Setting register %u", offset);
1536 }
1537
1538 /* Get register by instruction encoding */
1539 static u32 getreg_num(unsigned regnum, u32 mask)
1540 {
1541         /* 8 bit ops use regnums 4-7 for high parts of word */
1542         if (mask == 0xFF && (regnum & 0x4))
1543                 return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
1544
1545         switch (regnum) {
1546         case 0: return getreg(eax) & mask;
1547         case 1: return getreg(ecx) & mask;
1548         case 2: return getreg(edx) & mask;
1549         case 3: return getreg(ebx) & mask;
1550         case 4: return getreg(esp) & mask;
1551         case 5: return getreg(ebp) & mask;
1552         case 6: return getreg(esi) & mask;
1553         case 7: return getreg(edi) & mask;
1554         }
1555         abort();
1556 }
1557
1558 /* Set register by instruction encoding */
1559 static void setreg_num(unsigned regnum, u32 val, u32 mask)
1560 {
1561         /* Don't try to set bits out of range */
1562         assert(~(val & ~mask));
1563
1564         /* 8 bit ops use regnums 4-7 for high parts of word */
1565         if (mask == 0xFF && (regnum & 0x4)) {
1566                 /* Construct the 16 bits we want. */
1567                 val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
1568                 setreg_num(regnum & 0x3, val, 0xFFFF);
1569                 return;
1570         }
1571
1572         switch (regnum) {
1573         case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
1574         case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
1575         case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
1576         case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
1577         case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
1578         case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
1579         case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
1580         case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
1581         }
1582         abort();
1583 }
1584
/*
 * Get bytes of displacement appended to instruction, from r/m encoding.
 *
 * "mod_reg_rm" is the instruction's ModR/M byte, decoded for 32-bit
 * addressing (mod 2 is already treated as a four-byte displacement).
 *
 * NOTE(review): an r/m of 100 selects a SIB byte, which is not accounted for
 * here -- confirm no caller needs it.
 */
static u32 insn_displacement_len(u8 mod_reg_rm)
{
        /* Switch on the mod bits */
        switch (mod_reg_rm >> 6) {
        case 0:
                /*
                 * If mod == 0, and r/m == 101, there is no base register:
                 * a 32-bit absolute displacement follows.  (The two-byte
                 * form belongs to 16-bit addressing, where it is r/m == 110.)
                 */
                if ((mod_reg_rm & 0x7) == 0x5)
                        return 4;
                /* Normally, mod == 0 means no literal displacement */
                return 0;
        case 1:
                /* One byte displacement */
                return 1;
        case 2:
                /* Four byte displacement */
                return 4;
        case 3:
                /* Register mode */
                return 0;
        }
        abort();
}
1608
1609 static void emulate_insn(const u8 insn[])
1610 {
1611         unsigned long args[] = { LHREQ_TRAP, 13 };
1612         unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
1613         unsigned int eax, port, mask;
1614         /*
1615          * Default is to return all-ones on IO port reads, which traditionally
1616          * means "there's nothing there".
1617          */
1618         u32 val = 0xFFFFFFFF;
1619
1620         /*
1621          * This must be the Guest kernel trying to do something, not userspace!
1622          * The bottom two bits of the CS segment register are the privilege
1623          * level.
1624          */
1625         if ((getreg(xcs) & 3) != 0x1)
1626                 goto no_emulate;
1627
1628         /* Decoding x86 instructions is icky. */
1629
1630         /*
1631          * Around 2.6.33, the kernel started using an emulation for the
1632          * cmpxchg8b instruction in early boot on many configurations.  This
1633          * code isn't paravirtualized, and it tries to disable interrupts.
1634          * Ignore it, which will Mostly Work.
1635          */
1636         if (insn[insnlen] == 0xfa) {
1637                 /* "cli", or Clear Interrupt Enable instruction.  Skip it. */
1638                 insnlen = 1;
1639                 goto skip_insn;
1640         }
1641
1642         /*
1643          * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
1644          */
1645         if (insn[insnlen] == 0x66) {
1646                 small_operand = 1;
1647                 /* The instruction is 1 byte so far, read the next byte. */
1648                 insnlen = 1;
1649         }
1650
1651         /* If the lower bit isn't set, it's a single byte access */
1652         byte_access = !(insn[insnlen] & 1);
1653
1654         /*
1655          * Now we can ignore the lower bit and decode the 4 opcodes
1656          * we need to emulate.
1657          */
1658         switch (insn[insnlen] & 0xFE) {
1659         case 0xE4: /* in     <next byte>,%al */
1660                 port = insn[insnlen+1];
1661                 insnlen += 2;
1662                 in = 1;
1663                 break;
1664         case 0xEC: /* in     (%dx),%al */
1665                 port = getreg(edx) & 0xFFFF;
1666                 insnlen += 1;
1667                 in = 1;
1668                 break;
1669         case 0xE6: /* out    %al,<next byte> */
1670                 port = insn[insnlen+1];
1671                 insnlen += 2;
1672                 break;
1673         case 0xEE: /* out    %al,(%dx) */
1674                 port = getreg(edx) & 0xFFFF;
1675                 insnlen += 1;
1676                 break;
1677         default:
1678                 /* OK, we don't know what this is, can't emulate. */
1679                 goto no_emulate;
1680         }
1681
1682         /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
1683         if (byte_access)
1684                 mask = 0xFF;
1685         else if (small_operand)
1686                 mask = 0xFFFF;
1687         else
1688                 mask = 0xFFFFFFFF;
1689
1690         /*
1691          * If it was an "IN" instruction, they expect the result to be read
1692          * into %eax, so we change %eax.
1693          */
1694         eax = getreg(eax);
1695
1696         if (in) {
1697                 /* This is the PS/2 keyboard status; 1 means ready for output */
1698                 if (port == 0x64)
1699                         val = 1;
1700                 else if (is_pci_addr_port(port))
1701                         pci_addr_ioread(port, mask, &val);
1702                 else if (is_pci_data_port(port))
1703                         pci_data_ioread(port, mask, &val);
1704
1705                 /* Clear the bits we're about to read */
1706                 eax &= ~mask;
1707                 /* Copy bits in from val. */
1708                 eax |= val & mask;
1709                 /* Now update the register. */
1710                 setreg(eax, eax);
1711         } else {
1712                 if (is_pci_addr_port(port)) {
1713                         if (!pci_addr_iowrite(port, mask, eax))
1714                                 goto bad_io;
1715                 } else if (is_pci_data_port(port)) {
1716                         if (!pci_data_iowrite(port, mask, eax))
1717                                 goto bad_io;
1718                 }
1719                 /* There are many other ports, eg. CMOS clock, serial
1720                  * and parallel ports, so we ignore them all. */
1721         }
1722
1723         verbose("IO %s of %x to %u: %#08x\n",
1724                 in ? "IN" : "OUT", mask, port, eax);
1725 skip_insn:
1726         /* Finally, we've "done" the instruction, so move past it. */
1727         setreg(eip, getreg(eip) + insnlen);
1728         return;
1729
1730 bad_io:
1731         warnx("Attempt to %s port %u (%#x mask)",
1732               in ? "read from" : "write to", port, mask);
1733
1734 no_emulate:
1735         /* Inject trap into Guest. */
1736         if (write(lguest_fd, args, sizeof(args)) < 0)
1737                 err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
1738 }
1739
1740 static struct device *find_mmio_region(unsigned long paddr, u32 *off)
1741 {
1742         unsigned int i;
1743
1744         for (i = 1; i < MAX_PCI_DEVICES; i++) {
1745                 struct device *d = devices.pci[i];
1746
1747                 if (!d)
1748                         continue;
1749                 if (paddr < d->mmio_addr)
1750                         continue;
1751                 if (paddr >= d->mmio_addr + d->mmio_size)
1752                         continue;
1753                 *off = paddr - d->mmio_addr;
1754                 return d;
1755         }
1756         return NULL;
1757 }
1758
1759 /* FIXME: Use vq array. */
1760 static struct virtqueue *vq_by_num(struct device *d, u32 num)
1761 {
1762         struct virtqueue *vq = d->vq;
1763
1764         while (num-- && vq)
1765                 vq = vq->next;
1766
1767         return vq;
1768 }
1769
1770 static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
1771                            struct virtqueue *vq)
1772 {
1773         vq->pci_config = *cfg;
1774 }
1775
1776 static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
1777                               struct virtqueue *vq)
1778 {
1779         /* Only restore the per-vq part */
1780         size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
1781
1782         memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
1783                sizeof(*cfg) - off);
1784 }
1785
1786 /*
1787  * 4.1.4.3.2:
1788  *
1789  *  The driver MUST configure the other virtqueue fields before
1790  *  enabling the virtqueue with queue_enable.
1791  *
1792  * When they enable the virtqueue, we check that their setup is valid.
1793  */
1794 static void check_virtqueue(struct device *d, struct virtqueue *vq)
1795 {
1796         /* Because lguest is 32 bit, all the descriptor high bits must be 0 */
1797         if (vq->pci_config.queue_desc_hi
1798             || vq->pci_config.queue_avail_hi
1799             || vq->pci_config.queue_used_hi)
1800                 bad_driver_vq(vq, "invalid 64-bit queue address");
1801
1802         /*
1803          * 2.4.1:
1804          *
1805          *  The driver MUST ensure that the physical address of the first byte
1806          *  of each virtqueue part is a multiple of the specified alignment
1807          *  value in the above table.
1808          */
1809         if (vq->pci_config.queue_desc_lo % 16
1810             || vq->pci_config.queue_avail_lo % 2
1811             || vq->pci_config.queue_used_lo % 4)
1812                 bad_driver_vq(vq, "invalid alignment in queue addresses");
1813
1814         /* Initialize the virtqueue and check they're all in range. */
1815         vq->vring.num = vq->pci_config.queue_size;
1816         vq->vring.desc = check_pointer(vq->dev,
1817                                        vq->pci_config.queue_desc_lo,
1818                                        sizeof(*vq->vring.desc) * vq->vring.num);
1819         vq->vring.avail = check_pointer(vq->dev,
1820                                         vq->pci_config.queue_avail_lo,
1821                                         sizeof(*vq->vring.avail)
1822                                         + (sizeof(vq->vring.avail->ring[0])
1823                                            * vq->vring.num));
1824         vq->vring.used = check_pointer(vq->dev,
1825                                        vq->pci_config.queue_used_lo,
1826                                        sizeof(*vq->vring.used)
1827                                        + (sizeof(vq->vring.used->ring[0])
1828                                           * vq->vring.num));
1829
1830         /*
1831          * 2.4.9.1:
1832          *
1833          *   The driver MUST initialize flags in the used ring to 0
1834          *   when allocating the used ring.
1835          */
1836         if (vq->vring.used->flags != 0)
1837                 bad_driver_vq(vq, "invalid initial used.flags %#x",
1838                               vq->vring.used->flags);
1839 }
1840
1841 static void start_virtqueue(struct virtqueue *vq)
1842 {
1843         /*
1844          * Create stack for thread.  Since the stack grows upwards, we point
1845          * the stack pointer to the end of this region.
1846          */
1847         char *stack = malloc(32768);
1848
1849         /* Create a zero-initialized eventfd. */
1850         vq->eventfd = eventfd(0, 0);
1851         if (vq->eventfd < 0)
1852                 err(1, "Creating eventfd");
1853
1854         /*
1855          * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
1856          * we get a signal if it dies.
1857          */
1858         vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
1859         if (vq->thread == (pid_t)-1)
1860                 err(1, "Creating clone");
1861 }
1862
1863 static void start_virtqueues(struct device *d)
1864 {
1865         struct virtqueue *vq;
1866
1867         for (vq = d->vq; vq; vq = vq->next) {
1868                 if (vq->pci_config.queue_enable)
1869                         start_virtqueue(vq);
1870         }
1871 }
1872
1873 static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
1874 {
1875         struct virtqueue *vq;
1876
1877         switch (off) {
1878         case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
1879                 /*
1880                  * 4.1.4.3.1:
1881                  *
1882                  * The device MUST present the feature bits it is offering in
1883                  * device_feature, starting at bit device_feature_select ∗ 32
1884                  * for any device_feature_select written by the driver
1885                  */
1886                 if (val == 0)
1887                         d->mmio->cfg.device_feature = d->features;
1888                 else if (val == 1)
1889                         d->mmio->cfg.device_feature = (d->features >> 32);
1890                 else
1891                         d->mmio->cfg.device_feature = 0;
1892                 goto feature_write_through32;
1893         case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
1894                 if (val > 1)
1895                         bad_driver(d, "Unexpected driver select %u", val);
1896                 goto feature_write_through32;
1897         case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
1898                 if (d->mmio->cfg.guest_feature_select == 0) {
1899                         d->features_accepted &= ~((u64)0xFFFFFFFF);
1900                         d->features_accepted |= val;
1901                 } else {
1902                         assert(d->mmio->cfg.guest_feature_select == 1);
1903                         d->features_accepted &= 0xFFFFFFFF;
1904                         d->features_accepted |= ((u64)val) << 32;
1905                 }
1906                 /*
1907                  * 2.2.1:
1908                  *
1909                  *   The driver MUST NOT accept a feature which the device did
1910                  *   not offer
1911                  */
1912                 if (d->features_accepted & ~d->features)
1913                         bad_driver(d, "over-accepted features %#llx of %#llx",
1914                                    d->features_accepted, d->features);
1915                 goto feature_write_through32;
1916         case offsetof(struct virtio_pci_mmio, cfg.device_status): {
1917                 u8 prev;
1918
1919                 verbose("%s: device status -> %#x\n", d->name, val);
1920                 /*
1921                  * 4.1.4.3.1:
1922                  *
1923                  *  The device MUST reset when 0 is written to device_status,
1924                  *  and present a 0 in device_status once that is done.
1925                  */
1926                 if (val == 0) {
1927                         reset_device(d);
1928                         goto write_through8;
1929                 }
1930
1931                 /* 2.1.1: The driver MUST NOT clear a device status bit. */
1932                 if (d->mmio->cfg.device_status & ~val)
1933                         bad_driver(d, "unset of device status bit %#x -> %#x",
1934                                    d->mmio->cfg.device_status, val);
1935
1936                 /*
1937                  * 2.1.2:
1938                  *
1939                  *  The device MUST NOT consume buffers or notify the driver
1940                  *  before DRIVER_OK.
1941                  */
1942                 if (val & VIRTIO_CONFIG_S_DRIVER_OK
1943                     && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
1944                         start_virtqueues(d);
1945
1946                 /*
1947                  * 3.1.1:
1948                  *
1949                  *   The driver MUST follow this sequence to initialize a device:
1950                  *   - Reset the device.
1951                  *   - Set the ACKNOWLEDGE status bit: the guest OS has
1952                  *     notice the device.
1953                  *   - Set the DRIVER status bit: the guest OS knows how
1954                  *     to drive the device.
1955                  *   - Read device feature bits, and write the subset
1956                  *     of feature bits understood by the OS and driver
1957                  *     to the device. During this step the driver MAY
1958                  *     read (but MUST NOT write) the device-specific
1959                  *     configuration fields to check that it can
1960                  *     support the device before accepting it.
1961                  *   - Set the FEATURES_OK status bit.  The driver
1962                  *     MUST not accept new feature bits after this
1963                  *     step.
1964                  *   - Re-read device status to ensure the FEATURES_OK
1965                  *     bit is still set: otherwise, the device does
1966                  *     not support our subset of features and the
1967                  *     device is unusable.
1968                  *   - Perform device-specific setup, including
1969                  *     discovery of virtqueues for the device,
1970                  *     optional per-bus setup, reading and possibly
1971                  *     writing the device’s virtio configuration
1972                  *     space, and population of virtqueues.
1973                  *   - Set the DRIVER_OK status bit. At this point the
1974                  *     device is “live”.
1975                  */
1976                 prev = 0;
1977                 switch (val & ~d->mmio->cfg.device_status) {
1978                 case VIRTIO_CONFIG_S_DRIVER_OK:
1979                         prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
1980                 case VIRTIO_CONFIG_S_FEATURES_OK:
1981                         prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
1982                 case VIRTIO_CONFIG_S_DRIVER:
1983                         prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
1984                 case VIRTIO_CONFIG_S_ACKNOWLEDGE:
1985                         break;
1986                 default:
1987                         bad_driver(d, "unknown device status bit %#x -> %#x",
1988                                    d->mmio->cfg.device_status, val);
1989                 }
1990                 if (d->mmio->cfg.device_status != prev)
1991                         bad_driver(d, "unexpected status transition %#x -> %#x",
1992                                    d->mmio->cfg.device_status, val);
1993
1994                 /* If they just wrote FEATURES_OK, we make sure they read */
1995                 switch (val & ~d->mmio->cfg.device_status) {
1996                 case VIRTIO_CONFIG_S_FEATURES_OK:
1997                         d->wrote_features_ok = true;
1998                         break;
1999                 case VIRTIO_CONFIG_S_DRIVER_OK:
2000                         if (d->wrote_features_ok)
2001                                 bad_driver(d, "did not re-read FEATURES_OK");
2002                         break;
2003                 }
2004                 goto write_through8;
2005         }
2006         case offsetof(struct virtio_pci_mmio, cfg.queue_select):
2007                 vq = vq_by_num(d, val);
2008                 /*
2009                  * 4.1.4.3.1:
2010                  *
2011                  *  The device MUST present a 0 in queue_size if the virtqueue
2012                  *  corresponding to the current queue_select is unavailable.
2013                  */
2014                 if (!vq) {
2015                         d->mmio->cfg.queue_size = 0;
2016                         goto write_through16;
2017                 }
2018                 /* Save registers for old vq, if it was a valid vq */
2019                 if (d->mmio->cfg.queue_size)
2020                         save_vq_config(&d->mmio->cfg,
2021                                        vq_by_num(d, d->mmio->cfg.queue_select));
2022                 /* Restore the registers for the queue they asked for */
2023                 restore_vq_config(&d->mmio->cfg, vq);
2024                 goto write_through16;
2025         case offsetof(struct virtio_pci_mmio, cfg.queue_size):
2026                 /*
2027                  * 4.1.4.3.2:
2028                  *
2029                  *  The driver MUST NOT write a value which is not a power of 2
2030                  *  to queue_size.
2031                  */
2032                 if (val & (val-1))
2033                         bad_driver(d, "invalid queue size %u", val);
2034                 if (d->mmio->cfg.queue_enable)
2035                         bad_driver(d, "changing queue size on live device");
2036                 goto write_through16;
2037         case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
2038                 bad_driver(d, "attempt to set MSIX vector to %u", val);
2039         case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
2040                 struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
2041
2042                 /*
2043                  * 4.1.4.3.2:
2044                  *
2045                  *  The driver MUST NOT write a 0 to queue_enable.
2046                  */
2047                 if (val != 1)
2048                         bad_driver(d, "setting queue_enable to %u", val);
2049
2050                 /*
2051                  * 3.1.1:
2052                  *
2053                  *  7. Perform device-specific setup, including discovery of
2054                  *     virtqueues for the device, optional per-bus setup,
2055                  *     reading and possibly writing the device’s virtio
2056                  *     configuration space, and population of virtqueues.
2057                  *  8. Set the DRIVER_OK status bit.
2058                  *
2059                  * All our devices require all virtqueues to be enabled, so
2060                  * they should have done that before setting DRIVER_OK.
2061                  */
2062                 if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
2063                         bad_driver(d, "enabling vq after DRIVER_OK");
2064
2065                 d->mmio->cfg.queue_enable = val;
2066                 save_vq_config(&d->mmio->cfg, vq);
2067                 check_virtqueue(d, vq);
2068                 goto write_through16;
2069         }
2070         case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
2071                 bad_driver(d, "attempt to write to queue_notify_off");
2072         case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
2073         case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
2074         case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
2075         case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
2076         case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
2077         case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
2078                 /*
2079                  * 4.1.4.3.2:
2080                  *
2081                  *  The driver MUST configure the other virtqueue fields before
2082                  *  enabling the virtqueue with queue_enable.
2083                  */
2084                 if (d->mmio->cfg.queue_enable)
2085                         bad_driver(d, "changing queue on live device");
2086
2087                 /*
2088                  * 3.1.1:
2089                  *
2090                  *  The driver MUST follow this sequence to initialize a device:
2091                  *...
2092                  *  5. Set the FEATURES_OK status bit. The driver MUST not
2093                  *  accept new feature bits after this step.
2094                  */
2095                 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
2096                         bad_driver(d, "setting up vq before FEATURES_OK");
2097
2098                 /*
2099                  *  6. Re-read device status to ensure the FEATURES_OK bit is
2100                  *     still set...
2101                  */
2102                 if (d->wrote_features_ok)
2103                         bad_driver(d, "didn't re-read FEATURES_OK before setup");
2104
2105                 goto write_through32;
2106         case offsetof(struct virtio_pci_mmio, notify):
2107                 vq = vq_by_num(d, val);
2108                 if (!vq)
2109                         bad_driver(d, "Invalid vq notification on %u", val);
2110                 /* Notify the process handling this vq by adding 1 to eventfd */
2111                 write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
2112                 goto write_through16;
2113         case offsetof(struct virtio_pci_mmio, isr):
2114                 bad_driver(d, "Unexpected write to isr");
2115         /* Weird corner case: write to emerg_wr of console */
2116         case sizeof(struct virtio_pci_mmio)
2117                 + offsetof(struct virtio_console_config, emerg_wr):
2118                 if (strcmp(d->name, "console") == 0) {
2119                         char c = val;
2120                         write(STDOUT_FILENO, &c, 1);
2121                         goto write_through32;
2122                 }
2123                 /* Fall through... */
2124         default:
2125                 /*
2126                  * 4.1.4.3.2:
2127                  *
2128                  *   The driver MUST NOT write to device_feature, num_queues,
2129                  *   config_generation or queue_notify_off.
2130                  */
2131                 bad_driver(d, "Unexpected write to offset %u", off);
2132         }
2133
2134 feature_write_through32:
2135         /*
2136          * 3.1.1:
2137          *
2138          *   The driver MUST follow this sequence to initialize a device:
2139          *...
2140          *   - Set the DRIVER status bit: the guest OS knows how
2141          *     to drive the device.
2142          *   - Read device feature bits, and write the subset
2143          *     of feature bits understood by the OS and driver
2144          *     to the device.
2145          *...
2146          *   - Set the FEATURES_OK status bit. The driver MUST not
2147          *     accept new feature bits after this step.
2148          */
2149         if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
2150                 bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
2151         if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
2152                 bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
2153
2154         /*
2155          * 4.1.3.1:
2156          *
2157          *  The driver MUST access each field using the “natural” access
2158          *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
2159          *  16-bit fields and 8-bit accesses for 8-bit fields.
2160          */
2161 write_through32:
2162         if (mask != 0xFFFFFFFF) {
2163                 bad_driver(d, "non-32-bit write to offset %u (%#x)",
2164                            off, getreg(eip));
2165                 return;
2166         }
2167         memcpy((char *)d->mmio + off, &val, 4);
2168         return;
2169
2170 write_through16:
2171         if (mask != 0xFFFF)
2172                 bad_driver(d, "non-16-bit write to offset %u (%#x)",
2173                            off, getreg(eip));
2174         memcpy((char *)d->mmio + off, &val, 2);
2175         return;
2176
2177 write_through8:
2178         if (mask != 0xFF)
2179                 bad_driver(d, "non-8-bit write to offset %u (%#x)",
2180                            off, getreg(eip));
2181         memcpy((char *)d->mmio + off, &val, 1);
2182         return;
2183 }
2184
2185 static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
2186 {
2187         u8 isr;
2188         u32 val = 0;
2189
2190         switch (off) {
2191         case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
2192         case offsetof(struct virtio_pci_mmio, cfg.device_feature):
2193         case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
2194         case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
2195                 /*
2196                  * 3.1.1:
2197                  *
2198                  *   The driver MUST follow this sequence to initialize a device:
2199                  *...
2200                  *   - Set the DRIVER status bit: the guest OS knows how
2201                  *     to drive the device.
2202                  *   - Read device feature bits, and write the subset
2203                  *     of feature bits understood by the OS and driver
2204                  *     to the device.
2205                  */
2206                 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
2207                         bad_driver(d,
2208                                    "feature read before VIRTIO_CONFIG_S_DRIVER");
2209                 goto read_through32;
2210         case offsetof(struct virtio_pci_mmio, cfg.msix_config):
2211                 bad_driver(d, "read of msix_config");
2212         case offsetof(struct virtio_pci_mmio, cfg.num_queues):
2213                 goto read_through16;
2214         case offsetof(struct virtio_pci_mmio, cfg.device_status):
2215                 /* As they did read, any write of FEATURES_OK is now fine. */
2216                 d->wrote_features_ok = false;
2217                 goto read_through8;
2218         case offsetof(struct virtio_pci_mmio, cfg.config_generation):
2219                 /*
2220                  * 4.1.4.3.1:
2221                  *
2222                  *  The device MUST present a changed config_generation after
2223                  *  the driver has read a device-specific configuration value
2224                  *  which has changed since any part of the device-specific
2225                  *  configuration was last read.
2226                  *
2227                  * This is simple: none of our devices change config, so this
2228                  * is always 0.
2229                  */
2230                 goto read_through8;
2231         case offsetof(struct virtio_pci_mmio, notify):
2232                 /*
2233                  * 3.1.1:
2234                  *
2235                  *   The driver MUST NOT notify the device before setting
2236                  *   DRIVER_OK.
2237                  */
2238                 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
2239                         bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
2240                 goto read_through16;
2241         case offsetof(struct virtio_pci_mmio, isr):
2242                 if (mask != 0xFF)
2243                         bad_driver(d, "non-8-bit read from offset %u (%#x)",
2244                                    off, getreg(eip));
2245                 isr = d->mmio->isr;
2246                 /*
2247                  * 4.1.4.5.1:
2248                  *
2249                  *  The device MUST reset ISR status to 0 on driver read.
2250                  */
2251                 d->mmio->isr = 0;
2252                 return isr;
2253         case offsetof(struct virtio_pci_mmio, padding):
2254                 bad_driver(d, "read from padding (%#x)", getreg(eip));
2255         default:
2256                 /* Read from device config space, beware unaligned overflow */
2257                 if (off > d->mmio_size - 4)
2258                         bad_driver(d, "read past end (%#x)", getreg(eip));
2259
2260                 /*
2261                  * 3.1.1:
2262                  *  The driver MUST follow this sequence to initialize a device:
2263                  *...
2264                  *  3. Set the DRIVER status bit: the guest OS knows how to
2265                  *  drive the device.
2266                  *  4. Read device feature bits, and write the subset of
2267                  *  feature bits understood by the OS and driver to the
2268                  *  device. During this step the driver MAY read (but MUST NOT
2269                  *  write) the device-specific configuration fields to check
2270                  *  that it can support the device before accepting it.
2271                  */
2272                 if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
2273                         bad_driver(d,
2274                                    "config read before VIRTIO_CONFIG_S_DRIVER");
2275
2276                 if (mask == 0xFFFFFFFF)
2277                         goto read_through32;
2278                 else if (mask == 0xFFFF)
2279                         goto read_through16;
2280                 else
2281                         goto read_through8;
2282         }
2283
2284         /*
2285          * 4.1.3.1:
2286          *
2287          *  The driver MUST access each field using the “natural” access
2288          *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
2289          *  16-bit fields and 8-bit accesses for 8-bit fields.
2290          */
2291 read_through32:
2292         if (mask != 0xFFFFFFFF)
2293                 bad_driver(d, "non-32-bit read to offset %u (%#x)",
2294                            off, getreg(eip));
2295         memcpy(&val, (char *)d->mmio + off, 4);
2296         return val;
2297
2298 read_through16:
2299         if (mask != 0xFFFF)
2300                 bad_driver(d, "non-16-bit read to offset %u (%#x)",
2301                            off, getreg(eip));
2302         memcpy(&val, (char *)d->mmio + off, 2);
2303         return val;
2304
2305 read_through8:
2306         if (mask != 0xFF)
2307                 bad_driver(d, "non-8-bit read to offset %u (%#x)",
2308                            off, getreg(eip));
2309         memcpy(&val, (char *)d->mmio + off, 1);
2310         return val;
2311 }
2312
2313 static void emulate_mmio(unsigned long paddr, const u8 *insn)
2314 {
2315         u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
2316         struct device *d = find_mmio_region(paddr, &off);
2317         unsigned long args[] = { LHREQ_TRAP, 14 };
2318
2319         if (!d) {
2320                 warnx("MMIO touching %#08lx (not a device)", paddr);
2321                 goto reinject;
2322         }
2323
2324         /* Prefix makes it a 16 bit op */
2325         if (insn[0] == 0x66) {
2326                 mask = 0xFFFF;
2327                 insnlen++;
2328         }
2329
2330         /* iowrite */
2331         if (insn[insnlen] == 0x89) {
2332                 /* Next byte is r/m byte: bits 3-5 are register. */
2333                 val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
2334                 emulate_mmio_write(d, off, val, mask);
2335                 insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
2336         } else if (insn[insnlen] == 0x8b) { /* ioread */
2337                 /* Next byte is r/m byte: bits 3-5 are register. */
2338                 val = emulate_mmio_read(d, off, mask);
2339                 setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
2340                 insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
2341         } else if (insn[0] == 0x88) { /* 8-bit iowrite */
2342                 mask = 0xff;
2343                 /* Next byte is r/m byte: bits 3-5 are register. */
2344                 val = getreg_num((insn[1] >> 3) & 0x7, mask);
2345                 emulate_mmio_write(d, off, val, mask);
2346                 insnlen = 2 + insn_displacement_len(insn[1]);
2347         } else if (insn[0] == 0x8a) { /* 8-bit ioread */
2348                 mask = 0xff;
2349                 val = emulate_mmio_read(d, off, mask);
2350                 setreg_num((insn[1] >> 3) & 0x7, val, mask);
2351                 insnlen = 2 + insn_displacement_len(insn[1]);
2352         } else {
2353                 warnx("Unknown MMIO instruction touching %#08lx:"
2354                      " %02x %02x %02x %02x at %u",
2355                      paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
2356         reinject:
2357                 /* Inject trap into Guest. */
2358                 if (write(lguest_fd, args, sizeof(args)) < 0)
2359                         err(1, "Reinjecting trap 14 for fault at %#x",
2360                             getreg(eip));
2361                 return;
2362         }
2363
2364         /* Finally, we've "done" the instruction, so move past it. */
2365         setreg(eip, getreg(eip) + insnlen);
2366 }
2367
2368 /*L:190
2369  * Device Setup
2370  *
2371  * All devices need a descriptor so the Guest knows it exists, and a "struct
2372  * device" so the Launcher can keep track of it.  We have common helper
2373  * routines to allocate and manage them.
2374  */
2375 static void add_pci_virtqueue(struct device *dev,
2376                               void (*service)(struct virtqueue *),
2377                               const char *name)
2378 {
2379         struct virtqueue **i, *vq = malloc(sizeof(*vq));
2380
2381         /* Initialize the virtqueue */
2382         vq->next = NULL;
2383         vq->last_avail_idx = 0;
2384         vq->dev = dev;
2385         vq->name = name;
2386
2387         /*
2388          * This is the routine the service thread will run, and its Process ID
2389          * once it's running.
2390          */
2391         vq->service = service;
2392         vq->thread = (pid_t)-1;
2393
2394         /* Initialize the configuration. */
2395         reset_vq_pci_config(vq);
2396         vq->pci_config.queue_notify_off = 0;
2397
2398         /* Add one to the number of queues */
2399         vq->dev->mmio->cfg.num_queues++;
2400
2401         /*
2402          * Add to tail of list, so dev->vq is first vq, dev->vq->next is
2403          * second.
2404          */
2405         for (i = &dev->vq; *i; i = &(*i)->next);
2406         *i = vq;
2407 }
2408
2409 /* The Guest accesses the feature bits via the PCI common config MMIO region */
2410 static void add_pci_feature(struct device *dev, unsigned bit)
2411 {
2412         dev->features |= (1ULL << bit);
2413 }
2414
2415 /* For devices with no config. */
2416 static void no_device_config(struct device *dev)
2417 {
2418         dev->mmio_addr = get_mmio_region(dev->mmio_size);
2419
2420         dev->config.bar[0] = dev->mmio_addr;
2421         /* Bottom 4 bits must be zero */
2422         assert(~(dev->config.bar[0] & 0xF));
2423 }
2424
2425 /* This puts the device config into BAR0 */
2426 static void set_device_config(struct device *dev, const void *conf, size_t len)
2427 {
2428         /* Set up BAR 0 */
2429         dev->mmio_size += len;
2430         dev->mmio = realloc(dev->mmio, dev->mmio_size);
2431         memcpy(dev->mmio + 1, conf, len);
2432
2433         /*
2434          * 4.1.4.6:
2435          *
2436          *  The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
2437          *  capability for any device type which has a device-specific
2438          *  configuration.
2439          */
2440         /* Hook up device cfg */
2441         dev->config.cfg_access.cap.cap_next
2442                 = offsetof(struct pci_config, device);
2443
2444         /*
2445          * 4.1.4.6.1:
2446          *
2447          *  The offset for the device-specific configuration MUST be 4-byte
2448          *  aligned.
2449          */
2450         assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
2451
2452         /* Fix up device cfg field length. */
2453         dev->config.device.length = len;
2454
2455         /* The rest is the same as the no-config case */
2456         no_device_config(dev);
2457 }
2458
2459 static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
2460                      size_t bar_offset, size_t bar_bytes, u8 next)
2461 {
2462         cap->cap_vndr = PCI_CAP_ID_VNDR;
2463         cap->cap_next = next;
2464         cap->cap_len = caplen;
2465         cap->cfg_type = type;
2466         cap->bar = 0;
2467         memset(cap->padding, 0, sizeof(cap->padding));
2468         cap->offset = bar_offset;
2469         cap->length = bar_bytes;
2470 }
2471
2472 /*
2473  * This sets up the pci_config structure, as defined in the virtio 1.0
2474  * standard (and PCI standard).
2475  */
2476 static void init_pci_config(struct pci_config *pci, u16 type,
2477                             u8 class, u8 subclass)
2478 {
2479         size_t bar_offset, bar_len;
2480
2481         /*
2482          * 4.1.4.4.1:
2483          *
2484          *  The device MUST either present notify_off_multiplier as an even
2485          *  power of 2, or present notify_off_multiplier as 0.
2486          *
2487          * 2.1.2:
2488          *
2489          *   The device MUST initialize device status to 0 upon reset.
2490          */
2491         memset(pci, 0, sizeof(*pci));
2492
2493         /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
2494         pci->vendor_id = 0x1AF4;
2495         /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
2496         pci->device_id = 0x1040 + type;
2497
2498         /*
2499          * PCI have specific codes for different types of devices.
2500          * Linux doesn't care, but it's a good clue for people looking
2501          * at the device.
2502          */
2503         pci->class = class;
2504         pci->subclass = subclass;
2505
2506         /*
2507          * 4.1.2.1:
2508          *
2509          *  Non-transitional devices SHOULD have a PCI Revision ID of 1 or
2510          *  higher
2511          */
2512         pci->revid = 1;
2513
2514         /*
2515          * 4.1.2.1:
2516          *
2517          *  Non-transitional devices SHOULD have a PCI Subsystem Device ID of
2518          *  0x40 or higher.
2519          */
2520         pci->subsystem_device_id = 0x40;
2521
2522         /* We use our dummy interrupt controller, and irq_line is the irq */
2523         pci->irq_line = devices.next_irq++;
2524         pci->irq_pin = 0;
2525
2526         /* Support for extended capabilities. */
2527         pci->status = (1 << 4);
2528
2529         /* Link them in. */
2530         /*
2531          * 4.1.4.3.1:
2532          *
2533          *  The device MUST present at least one common configuration
2534          *  capability.
2535          */
2536         pci->capabilities = offsetof(struct pci_config, common);
2537
2538         /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
2539         assert(pci->capabilities % 4 == 0);
2540
2541         bar_offset = offsetof(struct virtio_pci_mmio, cfg);
2542         bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
2543         init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
2544                  bar_offset, bar_len,
2545                  offsetof(struct pci_config, notify));
2546
2547         /*
2548          * 4.1.4.4.1:
2549          *
2550          *  The device MUST present at least one notification capability.
2551          */
2552         bar_offset += bar_len;
2553         bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
2554
2555         /*
2556          * 4.1.4.4.1:
2557          *
2558          *  The cap.offset MUST be 2-byte aligned.
2559          */
2560         assert(pci->common.cap_next % 2 == 0);
2561
2562         /* FIXME: Use a non-zero notify_off, for per-queue notification? */
2563         /*
2564          * 4.1.4.4.1:
2565          *
2566          *  The value cap.length presented by the device MUST be at least 2 and
2567          *  MUST be large enough to support queue notification offsets for all
2568          *  supported queues in all possible configurations.
2569          */
2570         assert(bar_len >= 2);
2571
2572         init_cap(&pci->notify.cap, sizeof(pci->notify),
2573                  VIRTIO_PCI_CAP_NOTIFY_CFG,
2574                  bar_offset, bar_len,
2575                  offsetof(struct pci_config, isr));
2576
2577         bar_offset += bar_len;
2578         bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
2579         /*
2580          * 4.1.4.5.1:
2581          *
2582          *  The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
2583          *  capability.
2584          */
2585         init_cap(&pci->isr, sizeof(pci->isr),
2586                  VIRTIO_PCI_CAP_ISR_CFG,
2587                  bar_offset, bar_len,
2588                  offsetof(struct pci_config, cfg_access));
2589
2590         /*
2591          * 4.1.4.7.1:
2592          *
2593          * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
2594          * capability.
2595          */
2596         /* This doesn't have any presence in the BAR */
2597         init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
2598                  VIRTIO_PCI_CAP_PCI_CFG,
2599                  0, 0, 0);
2600
2601         bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
2602         assert(bar_offset == sizeof(struct virtio_pci_mmio));
2603
2604         /*
2605          * This gets sewn in and length set in set_device_config().
2606          * Some devices don't have a device configuration interface, so
2607          * we never expose this if we don't call set_device_config().
2608          */
2609         init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
2610                  bar_offset, 0, 0);
2611 }
2612
2613 /*
2614  * This routine does all the creation and setup of a new device, but we don't
2615  * actually place the MMIO region until we know the size (if any) of the
2616  * device-specific config.  And we don't actually start the service threads
2617  * until later.
2618  *
2619  * See what I mean about userspace being boring?
2620  */
2621 static struct device *new_pci_device(const char *name, u16 type,
2622                                      u8 class, u8 subclass)
2623 {
2624         struct device *dev = malloc(sizeof(*dev));
2625
2626         /* Now we populate the fields one at a time. */
2627         dev->name = name;
2628         dev->vq = NULL;
2629         dev->running = false;
2630         dev->wrote_features_ok = false;
2631         dev->mmio_size = sizeof(struct virtio_pci_mmio);
2632         dev->mmio = calloc(1, dev->mmio_size);
2633         dev->features = (u64)1 << VIRTIO_F_VERSION_1;
2634         dev->features_accepted = 0;
2635
2636         if (devices.device_num + 1 >= MAX_PCI_DEVICES)
2637                 errx(1, "Can only handle 31 PCI devices");
2638
2639         init_pci_config(&dev->config, type, class, subclass);
2640         assert(!devices.pci[devices.device_num+1]);
2641         devices.pci[++devices.device_num] = dev;
2642
2643         return dev;
2644 }
2645
2646 /*
2647  * Our first setup routine is the console.  It's a fairly simple device, but
2648  * UNIX tty handling makes it uglier than it could be.
2649  */
2650 static void setup_console(void)
2651 {
2652         struct device *dev;
2653         struct virtio_console_config conf;
2654
2655         /* If we can save the initial standard input settings... */
2656         if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
2657                 struct termios term = orig_term;
2658                 /*
2659                  * Then we turn off echo, line buffering and ^C etc: We want a
2660                  * raw input stream to the Guest.
2661                  */
2662                 term.c_lflag &= ~(ISIG|ICANON|ECHO);
2663                 tcsetattr(STDIN_FILENO, TCSANOW, &term);
2664         }
2665
2666         dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
2667
2668         /* We store the console state in dev->priv, and initialize it. */
2669         dev->priv = malloc(sizeof(struct console_abort));
2670         ((struct console_abort *)dev->priv)->count = 0;
2671
2672         /*
2673          * The console needs two virtqueues: the input then the output.  When
2674          * they put something the input queue, we make sure we're listening to
2675          * stdin.  When they put something in the output queue, we write it to
2676          * stdout.
2677          */
2678         add_pci_virtqueue(dev, console_input, "input");
2679         add_pci_virtqueue(dev, console_output, "output");
2680
2681         /* We need a configuration area for the emerg_wr early writes. */
2682         add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
2683         set_device_config(dev, &conf, sizeof(conf));
2684
2685         verbose("device %u: console\n", devices.device_num);
2686 }
2687 /*:*/
2688
2689 /*M:010
2690  * Inter-guest networking is an interesting area.  Simplest is to have a
2691  * --sharenet=<name> option which opens or creates a named pipe.  This can be
2692  * used to send packets to another guest in a 1:1 manner.
2693  *
 * More sophisticated is to use one of the tools developed for projects like UML
2695  * to do networking.
2696  *
2697  * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
2698  * completely generic ("here's my vring, attach to your vring") and would work
2699  * for any traffic.  Of course, namespace and permissions issues need to be
2700  * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
2701  * multiple inter-guest channels behind one interface, although it would
2702  * require some manner of hotplugging new virtio channels.
2703  *
2704  * Finally, we could use a virtio network switch in the kernel, ie. vhost.
2705 :*/
2706
2707 static u32 str2ip(const char *ipaddr)
2708 {
2709         unsigned int b[4];
2710
2711         if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
2712                 errx(1, "Failed to parse IP address '%s'", ipaddr);
2713         return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
2714 }
2715
/*
 * Parse six colon-separated hex bytes from @macaddr into @mac, exiting with
 * an error message if fewer than six can be read.
 */
static void str2mac(const char *macaddr, unsigned char mac[6])
{
        unsigned int octet[6];
        int fields, idx;

        /* sscanf wants unsigned ints, so parse into those first... */
        fields = sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
                        &octet[0], &octet[1], &octet[2],
                        &octet[3], &octet[4], &octet[5]);
        if (fields != 6)
                errx(1, "Failed to parse mac address '%s'", macaddr);

        /* ... then narrow each one down into the caller's byte array. */
        for (idx = 0; idx < 6; idx++)
                mac[idx] = octet[idx];
}
2729
2730 /*
2731  * This code is "adapted" from libbridge: it attaches the Host end of the
2732  * network device to the bridge device specified by the command line.
2733  *
2734  * This is yet another James Morris contribution (I'm an IP-level guy, so I
2735  * dislike bridging), and I just try not to break it.
2736  */
2737 static void add_to_bridge(int fd, const char *if_name, const char *br_name)
2738 {
2739         int ifidx;
2740         struct ifreq ifr;
2741
2742         if (!*br_name)
2743                 errx(1, "must specify bridge name");
2744
2745         ifidx = if_nametoindex(if_name);
2746         if (!ifidx)
2747                 errx(1, "interface %s does not exist!", if_name);
2748
2749         strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
2750         ifr.ifr_name[IFNAMSIZ-1] = '\0';
2751         ifr.ifr_ifindex = ifidx;
2752         if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
2753                 err(1, "can't add %s to bridge %s", if_name, br_name);
2754 }
2755
2756 /*
2757  * This sets up the Host end of the network device with an IP address, brings
2758  * it up so packets will flow, the copies the MAC address into the hwaddr
2759  * pointer.
2760  */
2761 static void configure_device(int fd, const char *tapif, u32 ipaddr)
2762 {
2763         struct ifreq ifr;
2764         struct sockaddr_in sin;
2765
2766         memset(&ifr, 0, sizeof(ifr));
2767         strcpy(ifr.ifr_name, tapif);
2768
2769         /* Don't read these incantations.  Just cut & paste them like I did! */
2770         sin.sin_family = AF_INET;
2771         sin.sin_addr.s_addr = htonl(ipaddr);
2772         memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
2773         if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
2774                 err(1, "Setting %s interface address", tapif);
2775         ifr.ifr_flags = IFF_UP;
2776         if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
2777                 err(1, "Bringing interface %s up", tapif);
2778 }
2779
2780 static int get_tun_device(char tapif[IFNAMSIZ])
2781 {
2782         struct ifreq ifr;
2783         int vnet_hdr_sz;
2784         int netfd;
2785
2786         /* Start with this zeroed.  Messy but sure. */
2787         memset(&ifr, 0, sizeof(ifr));
2788
2789         /*
2790          * We open the /dev/net/tun device and tell it we want a tap device.  A
2791          * tap device is like a tun device, only somehow different.  To tell
2792          * the truth, I completely blundered my way through this code, but it
2793          * works now!
2794          */
2795         netfd = open_or_die("/dev/net/tun", O_RDWR);
2796         ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
2797         strcpy(ifr.ifr_name, "tap%d");
2798         if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
2799                 err(1, "configuring /dev/net/tun");
2800
2801         if (ioctl(netfd, TUNSETOFFLOAD,
2802                   TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
2803                 err(1, "Could not set features for tun device");
2804
2805         /*
2806          * We don't need checksums calculated for packets coming in this
2807          * device: trust us!
2808          */
2809         ioctl(netfd, TUNSETNOCSUM, 1);
2810
2811         /*
2812          * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
2813          * field at the end of the network header iff
2814          * VIRTIO_NET_F_MRG_RXBUF was negotiated.  For virtio 1.0,
2815          * that became the norm, but we need to tell the tun device
2816          * about our expanded header (which is called
2817          * virtio_net_hdr_mrg_rxbuf in the legacy system).
2818          */
2819         vnet_hdr_sz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2820         if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
2821                 err(1, "Setting tun header size to %u", vnet_hdr_sz);
2822
2823         memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
2824         return netfd;
2825 }
2826
2827 /*L:195
2828  * Our network is a Host<->Guest network.  This can either use bridging or
2829  * routing, but the principle is the same: it uses the "tun" device to inject
2830  * packets into the Host as if they came in from a normal network card.  We
2831  * just shunt packets between the Guest and the tun device.
2832  */
2833 static void setup_tun_net(char *arg)
2834 {
2835         struct device *dev;
2836         struct net_info *net_info = malloc(sizeof(*net_info));
2837         int ipfd;
2838         u32 ip = INADDR_ANY;
2839         bool bridging = false;
2840         char tapif[IFNAMSIZ], *p;
2841         struct virtio_net_config conf;
2842
2843         net_info->tunfd = get_tun_device(tapif);
2844
2845         /* First we create a new network device. */
2846         dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
2847         dev->priv = net_info;
2848
2849         /* Network devices need a recv and a send queue, just like console. */
2850         add_pci_virtqueue(dev, net_input, "rx");
2851         add_pci_virtqueue(dev, net_output, "tx");
2852
2853         /*
2854          * We need a socket to perform the magic network ioctls to bring up the
2855          * tap interface, connect to the bridge etc.  Any socket will do!
2856          */
2857         ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
2858         if (ipfd < 0)
2859                 err(1, "opening IP socket");
2860
2861         /* If the command line was --tunnet=bridge:<name> do bridging. */
2862         if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
2863                 arg += strlen(BRIDGE_PFX);
2864                 bridging = true;
2865         }
2866
2867         /* A mac address may follow the bridge name or IP address */
2868         p = strchr(arg, ':');
2869         if (p) {
2870                 str2mac(p+1, conf.mac);
2871                 add_pci_feature(dev, VIRTIO_NET_F_MAC);
2872                 *p = '\0';
2873         }
2874
2875         /* arg is now either an IP address or a bridge name */
2876         if (bridging)
2877                 add_to_bridge(ipfd, tapif, arg);
2878         else
2879                 ip = str2ip(arg);
2880
2881         /* Set up the tun device. */
2882         configure_device(ipfd, tapif, ip);
2883
2884         /* Expect Guest to handle everything except UFO */
2885         add_pci_feature(dev, VIRTIO_NET_F_CSUM);
2886         add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
2887         add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
2888         add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
2889         add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
2890         add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
2891         add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
2892         add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
2893         /* We handle indirect ring entries */
2894         add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
2895         set_device_config(dev, &conf, sizeof(conf));
2896
2897         /* We don't need the socket any more; setup is done. */
2898         close(ipfd);
2899
2900         if (bridging)
2901                 verbose("device %u: tun %s attached to bridge: %s\n",
2902                         devices.device_num, tapif, arg);
2903         else
2904                 verbose("device %u: tun %s: %s\n",
2905                         devices.device_num, tapif, arg);
2906 }
2907 /*:*/
2908
2909 /* This hangs off device->priv. */
2910 struct vblk_info {
2911         /* The size of the file. */
2912         off64_t len;
2913
2914         /* The file descriptor for the file. */
2915         int fd;
2916
2917 };
2918
2919 /*L:210
2920  * The Disk
2921  *
2922  * The disk only has one virtqueue, so it only has one thread.  It is really
2923  * simple: the Guest asks for a block number and we read or write that position
2924  * in the file.
2925  *
2926  * Before we serviced each virtqueue in a separate thread, that was unacceptably
2927  * slow: the Guest waits until the read is finished before running anything
2928  * else, even if it could have been doing useful work.
2929  *
2930  * We could have used async I/O, except it's reputed to suck so hard that
2931  * characters actually go missing from your code when you try to use it.
2932  */
2933 static void blk_request(struct virtqueue *vq)
2934 {
2935         struct vblk_info *vblk = vq->dev->priv;
2936         unsigned int head, out_num, in_num, wlen;
2937         int ret, i;
2938         u8 *in;
2939         struct virtio_blk_outhdr out;
2940         struct iovec iov[vq->vring.num];
2941         off64_t off;
2942
2943         /*
2944          * Get the next request, where we normally wait.  It triggers the
2945          * interrupt to acknowledge previously serviced requests (if any).
2946          */
2947         head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
2948
2949         /* Copy the output header from the front of the iov (adjusts iov) */
2950         iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
2951
2952         /* Find and trim end of iov input array, for our status byte. */
2953         in = NULL;
2954         for (i = out_num + in_num - 1; i >= out_num; i--) {
2955                 if (iov[i].iov_len > 0) {
2956                         in = iov[i].iov_base + iov[i].iov_len - 1;
2957                         iov[i].iov_len--;
2958                         break;
2959                 }
2960         }
2961         if (!in)
2962                 bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
2963
2964         /*
2965          * For historical reasons, block operations are expressed in 512 byte
2966          * "sectors".
2967          */
2968         off = out.sector * 512;
2969
2970         if (out.type & VIRTIO_BLK_T_OUT) {
2971                 /*
2972                  * Write
2973                  *
2974                  * Move to the right location in the block file.  This can fail
2975                  * if they try to write past end.
2976                  */
2977                 if (lseek64(vblk->fd, off, SEEK_SET) != off)
2978                         err(1, "Bad seek to sector %llu", out.sector);
2979
2980                 ret = writev(vblk->fd, iov, out_num);
2981                 verbose("WRITE to sector %llu: %i\n", out.sector, ret);
2982
2983                 /*
2984                  * Grr... Now we know how long the descriptor they sent was, we
2985                  * make sure they didn't try to write over the end of the block
2986                  * file (possibly extending it).
2987                  */
2988                 if (ret > 0 && off + ret > vblk->len) {
2989                         /* Trim it back to the correct length */
2990                         ftruncate64(vblk->fd, vblk->len);
2991                         /* Die, bad Guest, die. */
2992                         bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
2993                 }
2994
2995                 wlen = sizeof(*in);
2996                 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
2997         } else if (out.type & VIRTIO_BLK_T_FLUSH) {
2998                 /* Flush */
2999                 ret = fdatasync(vblk->fd);
3000                 verbose("FLUSH fdatasync: %i\n", ret);
3001                 wlen = sizeof(*in);
3002                 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
3003         } else {
3004                 /*
3005                  * Read
3006                  *
3007                  * Move to the right location in the block file.  This can fail
3008                  * if they try to read past end.
3009                  */
3010                 if (lseek64(vblk->fd, off, SEEK_SET) != off)
3011                         err(1, "Bad seek to sector %llu", out.sector);
3012
3013                 ret = readv(vblk->fd, iov + out_num, in_num);
3014                 if (ret >= 0) {
3015                         wlen = sizeof(*in) + ret;
3016                         *in = VIRTIO_BLK_S_OK;
3017                 } else {
3018                         wlen = sizeof(*in);
3019                         *in = VIRTIO_BLK_S_IOERR;
3020                 }
3021         }
3022
3023         /* Finished that request. */
3024         add_used(vq, head, wlen);
3025 }
3026
3027 /*L:198 This actually sets up a virtual block device. */
3028 static void setup_block_file(const char *filename)
3029 {
3030         struct device *dev;
3031         struct vblk_info *vblk;
3032         struct virtio_blk_config conf;
3033
3034         /* Create the device. */
3035         dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
3036
3037         /* The device has one virtqueue, where the Guest places requests. */
3038         add_pci_virtqueue(dev, blk_request, "request");
3039
3040         /* Allocate the room for our own bookkeeping */
3041         vblk = dev->priv = malloc(sizeof(*vblk));
3042
3043         /* First we open the file and store the length. */
3044         vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
3045         vblk->len = lseek64(vblk->fd, 0, SEEK_END);
3046
3047         /* Tell Guest how many sectors this device has. */
3048         conf.capacity = cpu_to_le64(vblk->len / 512);
3049
3050         /*
3051          * Tell Guest not to put in too many descriptors at once: two are used
3052          * for the in and out elements.
3053          */
3054         add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
3055         conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
3056
3057         set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
3058
3059         verbose("device %u: virtblock %llu sectors\n",
3060                 devices.device_num, le64_to_cpu(conf.capacity));
3061 }
3062
3063 /*L:211
3064  * Our random number generator device reads from /dev/urandom into the Guest's
3065  * input buffers.  The usual case is that the Guest doesn't want random numbers
3066  * and so has no buffers although /dev/urandom is still readable, whereas
3067  * console is the reverse.
3068  *
3069  * The same logic applies, however.
3070  */
struct rng_info {
        /* Descriptor we read the random bytes from. */
        int rfd;
};
3074
3075 static void rng_input(struct virtqueue *vq)
3076 {
3077         int len;
3078         unsigned int head, in_num, out_num, totlen = 0;
3079         struct rng_info *rng_info = vq->dev->priv;
3080         struct iovec iov[vq->vring.num];
3081
3082         /* First we need a buffer from the Guests's virtqueue. */
3083         head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
3084         if (out_num)
3085                 bad_driver_vq(vq, "Output buffers in rng?");
3086
3087         /*
3088          * Just like the console write, we loop to cover the whole iovec.
3089          * In this case, short reads actually happen quite a bit.
3090          */
3091         while (!iov_empty(iov, in_num)) {
3092                 len = readv(rng_info->rfd, iov, in_num);
3093                 if (len <= 0)
3094                         err(1, "Read from /dev/urandom gave %i", len);
3095                 iov_consume(vq->dev, iov, in_num, NULL, len);
3096                 totlen += len;
3097         }
3098
3099         /* Tell the Guest about the new input. */
3100         add_used(vq, head, totlen);
3101 }
3102
3103 /*L:199
3104  * This creates a "hardware" random number device for the Guest.
3105  */
3106 static void setup_rng(void)
3107 {
3108         struct device *dev;
3109         struct rng_info *rng_info = malloc(sizeof(*rng_info));
3110
3111         /* Our device's private info simply contains the /dev/urandom fd. */
3112         rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
3113
3114         /* Create the new device. */
3115         dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
3116         dev->priv = rng_info;
3117
3118         /* The device has one virtqueue, where the Guest places inbufs. */
3119         add_pci_virtqueue(dev, rng_input, "input");
3120
3121         /* We don't have any configuration space */
3122         no_device_config(dev);
3123
3124         verbose("device %u: rng\n", devices.device_num);
3125 }
3126 /* That's the end of device setup. */
3127
3128 /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
3129 static void __attribute__((noreturn)) restart_guest(void)
3130 {
3131         unsigned int i;
3132
3133         /*
3134          * Since we don't track all open fds, we simply close everything beyond
3135          * stderr.
3136          */
3137         for (i = 3; i < FD_SETSIZE; i++)
3138                 close(i);
3139
3140         /* Reset all the devices (kills all threads). */
3141         cleanup_devices();
3142
3143         execv(main_args[0], main_args);
3144         err(1, "Could not exec %s", main_args[0]);
3145 }
3146
3147 /*L:220
3148  * Finally we reach the core of the Launcher which runs the Guest, serves
3149  * its input and output, and finally, lays it to rest.
3150  */
3151 static void __attribute__((noreturn)) run_guest(void)
3152 {
3153         for (;;) {
3154                 struct lguest_pending notify;
3155                 int readval;
3156
3157                 /* We read from the /dev/lguest device to run the Guest. */
3158                 readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
3159                 if (readval == sizeof(notify)) {
3160                         if (notify.trap == 13) {
3161                                 verbose("Emulating instruction at %#x\n",
3162                                         getreg(eip));
3163                                 emulate_insn(notify.insn);
3164                         } else if (notify.trap == 14) {
3165                                 verbose("Emulating MMIO at %#x\n",
3166                                         getreg(eip));
3167                                 emulate_mmio(notify.addr, notify.insn);
3168                         } else
3169                                 errx(1, "Unknown trap %i addr %#08x\n",
3170                                      notify.trap, notify.addr);
3171                 /* ENOENT means the Guest died.  Reading tells us why. */
3172                 } else if (errno == ENOENT) {
3173                         char reason[1024] = { 0 };
3174                         pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
3175                         errx(1, "%s", reason);
3176                 /* ERESTART means that we need to reboot the guest */
3177                 } else if (errno == ERESTART) {
3178                         restart_guest();
3179                 /* Anything else means a bug or incompatible change. */
3180                 } else
3181                         err(1, "Running guest failed");
3182         }
3183 }
3184 /*L:240
3185  * This is the end of the Launcher.  The good news: we are over halfway
3186  * through!  The bad news: the most fiendish part of the code still lies ahead
3187  * of us.
3188  *
3189  * Are you ready?  Take a deep breath and join me in the core of the Host, in
3190  * "make Host".
3191 :*/
3192
/*
 * The long options the Launcher understands.  The last field of each entry
 * is the value getopt_long() returns for it; the second says whether the
 * option takes a value.  The table ends with an all-zero entry.
 */
static struct option opts[] = {
        { "verbose",  no_argument,       NULL, 'v' },
        { "tunnet",   required_argument, NULL, 't' },
        { "block",    required_argument, NULL, 'b' },
        { "rng",      no_argument,       NULL, 'r' },
        { "initrd",   required_argument, NULL, 'i' },
        { "username", required_argument, NULL, 'u' },
        { "chroot",   required_argument, NULL, 'c' },
        { NULL, 0, NULL, 0 },
};
/*
 * Print the command-line synopsis on stderr and exit with status 1 (errx()
 * does not return).  Every option in the opts[] table is listed here.
 */
static void usage(void)
{
        errx(1, "Usage: lguest [--verbose] "
             "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
             "|--block=<filename>|--initrd=<filename>|--rng\n"
             "|--username=<username>|--chroot=<directory>]...\n"
             "<mem-in-mb> vmlinux [args...]");
}
3210
3211 /*L:105 The main routine is where the real work begins: */
3212 int main(int argc, char *argv[])
3213 {
3214         /* Memory, code startpoint and size of the (optional) initrd. */
3215         unsigned long mem = 0, start, initrd_size = 0;
3216         /* Two temporaries. */
3217         int i, c;
3218         /* The boot information for the Guest. */
3219         struct boot_params *boot;
3220         /* If they specify an initrd file to load. */
3221         const char *initrd_name = NULL;
3222
3223         /* Password structure for initgroups/setres[gu]id */
3224         struct passwd *user_details = NULL;
3225
3226         /* Directory to chroot to */
3227         char *chroot_path = NULL;
3228
3229         /* Save the args: we "reboot" by execing ourselves again. */
3230         main_args = argv;
3231
3232         /*
3233          * First we initialize the device list.  We remember next interrupt
3234          * number to use for devices (1: remember that 0 is used by the timer).
3235          */
3236         devices.next_irq = 1;
3237
3238         /* We're CPU 0.  In fact, that's the only CPU possible right now. */
3239         cpu_id = 0;
3240
3241         /*
3242          * We need to know how much memory so we can set up the device
3243          * descriptor and memory pages for the devices as we parse the command
3244          * line.  So we quickly look through the arguments to find the amount
3245          * of memory now.
3246          */
3247         for (i = 1; i < argc; i++) {
3248                 if (argv[i][0] != '-') {
3249                         mem = atoi(argv[i]) * 1024 * 1024;
3250                         /*
3251                          * We start by mapping anonymous pages over all of
3252                          * guest-physical memory range.  This fills it with 0,
3253                          * and ensures that the Guest won't be killed when it
3254                          * tries to access it.
3255                          */
3256                         guest_base = map_zeroed_pages(mem / getpagesize()
3257                                                       + DEVICE_PAGES);
3258                         guest_limit = mem;
3259                         guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
3260                         break;
3261                 }
3262         }
3263
3264         /* We always have a console device, and it's always device 1. */
3265         setup_console();
3266
3267         /* The options are fairly straight-forward */
3268         while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
3269                 switch (c) {
3270                 case 'v':
3271                         verbose = true;
3272                         break;
3273                 case 't':
3274                         setup_tun_net(optarg);
3275                         break;
3276                 case 'b':
3277                         setup_block_file(optarg);
3278                         break;
3279                 case 'r':
3280                         setup_rng();
3281                         break;
3282                 case 'i':
3283                         initrd_name = optarg;
3284                         break;
3285                 case 'u':
3286                         user_details = getpwnam(optarg);
3287                         if (!user_details)
3288                                 err(1, "getpwnam failed, incorrect username?");
3289                         break;
3290                 case 'c':
3291                         chroot_path = optarg;
3292                         break;
3293                 default:
3294                         warnx("Unknown argument %s", argv[optind]);
3295                         usage();
3296                 }
3297         }
3298         /*
3299          * After the other arguments we expect memory and kernel image name,
3300          * followed by command line arguments for the kernel.
3301          */
3302         if (optind + 2 > argc)
3303                 usage();
3304
3305         verbose("Guest base is at %p\n", guest_base);
3306
3307         /* Initialize the (fake) PCI host bridge device. */
3308         init_pci_host_bridge();
3309
3310         /* Now we load the kernel */
3311         start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
3312
3313         /* Boot information is stashed at physical address 0 */
3314         boot = from_guest_phys(0);
3315
3316         /* Map the initrd image if requested (at top of physical memory) */
3317         if (initrd_name) {
3318                 initrd_size = load_initrd(initrd_name, mem);
3319                 /*
3320                  * These are the location in the Linux boot header where the
3321                  * start and size of the initrd are expected to be found.
3322                  */
3323                 boot->hdr.ramdisk_image = mem - initrd_size;
3324                 boot->hdr.ramdisk_size = initrd_size;
3325                 /* The bootloader type 0xFF means "unknown"; that's OK. */
3326                 boot->hdr.type_of_loader = 0xFF;
3327         }
3328
3329         /*
3330          * The Linux boot header contains an "E820" memory map: ours is a
3331          * simple, single region.
3332          */
3333         boot->e820_entries = 1;
3334         boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
3335         /*
3336          * The boot header contains a command line pointer: we put the command
3337          * line after the boot header.
3338          */
3339         boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
3340         /* We use a simple helper to copy the arguments separated by spaces. */
3341         concat((char *)(boot + 1), argv+optind+2);
3342
3343         /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
3344         boot->hdr.kernel_alignment = 0x1000000;
3345
3346         /* Boot protocol version: 2.07 supports the fields for lguest. */
3347         boot->hdr.version = 0x207;
3348
3349         /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
3350         boot->hdr.hardware_subarch = 1;
3351
3352         /* Tell the entry path not to try to reload segment registers. */
3353         boot->hdr.loadflags |= KEEP_SEGMENTS;
3354
3355         /* We tell the kernel to initialize the Guest. */
3356         tell_kernel(start);
3357
3358         /* Ensure that we terminate if a device-servicing child dies. */
3359         signal(SIGCHLD, kill_launcher);
3360
3361         /* If we exit via err(), this kills all the threads, restores tty. */
3362         atexit(cleanup_devices);
3363
3364         /* If requested, chroot to a directory */
3365         if (chroot_path) {
3366                 if (chroot(chroot_path) != 0)
3367                         err(1, "chroot(\"%s\") failed", chroot_path);
3368
3369                 if (chdir("/") != 0)
3370                         err(1, "chdir(\"/\") failed");
3371
3372                 verbose("chroot done\n");
3373         }
3374
3375         /* If requested, drop privileges */
3376         if (user_details) {
3377                 uid_t u;
3378                 gid_t g;
3379
3380                 u = user_details->pw_uid;
3381                 g = user_details->pw_gid;
3382
3383                 if (initgroups(user_details->pw_name, g) != 0)
3384                         err(1, "initgroups failed");
3385
3386                 if (setresgid(g, g, g) != 0)
3387                         err(1, "setresgid failed");
3388
3389                 if (setresuid(u, u, u) != 0)
3390                         err(1, "setresuid failed");
3391
3392                 verbose("Dropping privileges completed\n");
3393         }
3394
3395         /* Finally, run the Guest.  This doesn't return. */
3396         run_guest();
3397 }
3398 /*:*/
3399
3400 /*M:999
3401  * Mastery is done: you now know everything I do.
3402  *
3403  * But surely you have seen code, features and bugs in your wanderings which
3404  * you now yearn to attack?  That is the real game, and I look forward to you
3405  * patching and forking lguest into the Your-Name-Here-visor.
3406  *
3407  * Farewell, and good coding!
3408  * Rusty Russell.
3409  */