Update to aosp/crosvm-master 4cdfa9a264c

* merge in 4cdfa9a264c144ecb8ee63d99cd01495fe56f34b

Bug: 151628085
Test: make
Test: atest --host -c --include-subdirs external/crosvm
Change-Id: Ie19dc6114d542e22979a4f5c4dfcd3da54d85a95
diff --git a/Android.bp b/Android.bp
index 6d243df..4659e6b 100644
--- a/Android.bp
+++ b/Android.bp
@@ -24,6 +24,9 @@
             relative_install_path: "x86_64-linux-gnu",
             rlibs: ["libx86_64_rust"],
         },
+        darwin: {
+            enabled: false,
+        },
     },
 
     // Install the crosvm prebuilts as requirements until we're building these as well.
@@ -36,7 +39,9 @@
     features: [
         "default-no-sandbox",
         "gpu",
+        "x",
     ],
+
     flags: [
         "-C overflow-checks=y",
         "-C panic=abort",
@@ -52,9 +57,10 @@
         "libassertions",
         "libaudio_streams",
         "libbit_field",
-        "libbyteorder",
+        "libcrosvm",
         "libdata_model",
         "libdevices",
+        "libdisk",
         "libio_jail",
         "libkernel_cmdline",
         "libkernel_loader",
@@ -62,10 +68,10 @@
         "libkvm_sys",
         "liblibc",
         "liblibcras",
+        "libminijail_sys",
         "libmsg_socket",
         "libnet_util",
         "libp9",
-        "libqcow",
         "librand_ish",
         "libresources",
         "libsync_rust",
@@ -76,7 +82,7 @@
     ],
     proc_macros: [
         "libenumn",
-        "libremain-0.1.3",
+        "libremain",
     ],
     shared_libs: [
         "libminijail",
@@ -90,3 +96,46 @@
         "libfdt",
     ],
 }
+
+rust_library_host_rlib {
+    name: "libcrosvm",
+    defaults: ["crosvm_defaults"],
+    crate_name: "crosvm",
+    srcs: ["src/crosvm.rs"],
+    features: [
+        "default-no-sandbox",
+        "gpu",
+        "x",
+    ],
+    rlibs: [
+        "libarch",
+        "libassertions",
+        "libaudio_streams",
+        "libbit_field",
+        "libdata_model",
+        "libdevices",
+        "libdisk",
+        "libio_jail",
+        "libkernel_cmdline",
+        "libkernel_loader",
+        "libkvm",
+        "libkvm_sys",
+        "liblibc",
+        "liblibcras",
+        "libminijail_sys",
+        "libmsg_socket",
+        "libnet_util",
+        "libp9",
+        "librand_ish",
+        "libresources",
+        "libsync_rust",
+        "libsys_util",
+        "libvhost",
+        "libvm_control",
+        "libx86_64_rust",
+    ],
+    proc_macros: [
+        "libenumn",
+        "libremain",
+    ],
+}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..63ed15d
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,40 @@
+## Intro
+
+This document covers several areas of interest to contributors, including reviewers, developers, and integrators, all of whom share an interest in guiding crosvm's direction.
+
+## Guidelines
+
+The following is high-level guidance for producing contributions to crosvm.
+
+- Prefer mechanism to policy.
+- Use existing protocols when they are adequate, such as virtio.
+- Prefer security over code re-use and speed of development.
+- Only the version of Rust in use by the Chrome OS toolchain is supported. This is ordinarily the stable version of Rust, but it can lag behind the latest stable release by a few weeks.
+- Avoid distribution-specific code.
+
+## Code Health
+
+### Scripts
+
+In the `bin/` directory of the crosvm repository, the `clippy` script lints the Rust code and the `fmt` script formats the crosvm Rust code in place. When a change is submitted, Kokoro, the internal Google-run cloud builder, runs the `bin/smoke_test` script, which checks Rust formatting and runs the unit tests, and posts the results to the change. Kokoro is informational only, so a change can still be submitted even if Kokoro rejects it.
+
+### Submitting Code
+
+See also the [Chrome OS Contributing Guide](https://chromium.googlesource.com/chromiumos/docs/+/master/contributing.md).
+
+When a change is approved, verified, and added to the [commit queue](https://chromium.googlesource.com/chromiumos/docs/+/master/contributing.md#send-your-changes-to-the-commit-queue), the Chrome OS infrastructure builds crosvm and runs the unit tests (with some exceptions). Only if that passes will the change be submitted. Failures here cause the commit queue to reject the change until it is re-added (CQ+2). Unfortunately, false negatives are very common, so be ready to re-apply the CQ+2 label if you own a ready-to-submit change.
+
+### Style guidelines
+
+To format all code, crosvm defers to rustfmt. In addition, the code adheres to the following rules:
+
+The `use` statements in each module should be grouped in this order, as shown in the example after the list:
+
+1. `std`
+2. third-party crates
+3. Chrome OS crates
+4. crosvm crates
+5. `crate`
+
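+For example, a module's imports might be ordered like this (the crate and type names below are illustrative, not a required set):
+
+```rust
+use std::collections::BTreeMap; // 1. std
+
+use libc::EINVAL; // 2. third-party crates
+
+use libcras::CrasClient; // 3. Chrome OS crates
+
+use devices::virtio::VirtioDevice; // 4. crosvm crates
+
+use crate::Error; // 5. crate
+```
+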
+crosvm uses the [remain](https://github.com/dtolnay/remain) crate to keep error enums sorted, along with the `#[sorted]` attribute to keep their corresponding match statements in the same order.
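+
+A minimal sketch of that pattern (the error type and its variants here are made up for illustration; only the attributes come from `remain`):
+
+```rust
+use std::fmt::{self, Display};
+use std::io;
+
+// Variants must stay in sorted order; `remain` enforces this at compile time.
+#[remain::sorted]
+#[derive(Debug)]
+pub enum Error {
+    CreateSocket(io::Error),
+    OpenFile(io::Error),
+    ReadConfig(io::Error),
+}
+
+impl Display for Error {
+    #[remain::check]
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+
+        // The match arms are kept in the same sorted order as the enum.
+        #[sorted]
+        match self {
+            CreateSocket(e) => write!(f, "failed to create socket: {}", e),
+            OpenFile(e) => write!(f, "failed to open file: {}", e),
+            ReadConfig(e) => write!(f, "failed to read config: {}", e),
+        }
+    }
+}
+```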
+
diff --git a/Cargo.lock b/Cargo.lock
index 8446532..0ec0a39 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -22,7 +22,6 @@
 name = "arch"
 version = "0.1.0"
 dependencies = [
- "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "devices 0.1.0",
  "io_jail 0.1.0",
  "kernel_cmdline 0.1.0",
@@ -40,6 +39,10 @@
 [[package]]
 name = "audio_streams"
 version = "0.1.0"
+dependencies = [
+ "sync 0.1.0",
+ "sys_util 0.1.0",
+]
 
 [[package]]
 name = "bit_field"
@@ -52,13 +55,13 @@
 name = "bit_field_derive"
 version = "0.1.0"
 dependencies = [
- "proc-macro2 0.4.21 (registry+https://github.com/rust-lang/crates.io-index)",
- "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)",
+ "proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "syn 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
-name = "byteorder"
+name = "bitflags"
 version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
@@ -76,6 +79,7 @@
 name = "cras-sys"
 version = "0.1.0"
 dependencies = [
+ "audio_streams 0.1.0",
  "data_model 0.1.0",
 ]
 
@@ -88,12 +92,13 @@
  "assertions 0.1.0",
  "audio_streams 0.1.0",
  "bit_field 0.1.0",
- "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "crosvm_plugin 0.17.0",
  "data_model 0.1.0",
  "devices 0.1.0",
+ "disk 0.1.0",
  "enumn 0.1.0",
  "gpu_buffer 0.1.0",
+ "gpu_renderer 0.1.0",
  "io_jail 0.1.0",
  "kernel_cmdline 0.1.0",
  "kernel_loader 0.1.0",
@@ -101,15 +106,14 @@
  "kvm_sys 0.1.0",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
  "libcras 0.1.0",
+ "minijail-sys 0.0.11",
  "msg_socket 0.1.0",
  "net_util 0.1.0",
  "p9 0.1.0",
- "protobuf 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protobuf 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "protos 0.1.0",
- "qcow 0.1.0",
  "rand_ish 0.1.0",
  "remain 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "render_node_forward 0.1.0",
  "resources 0.1.0",
  "sync 0.1.0",
  "sys_util 0.1.0",
@@ -125,7 +129,7 @@
  "kvm 0.1.0",
  "kvm_sys 0.1.0",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
- "protobuf 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protobuf 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "protos 0.1.0",
  "sys_util 0.1.0",
 ]
@@ -143,15 +147,18 @@
 dependencies = [
  "audio_streams 0.1.0",
  "bit_field 0.1.0",
- "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "data_model 0.1.0",
+ "disk 0.1.0",
  "enumn 0.1.0",
  "gpu_buffer 0.1.0",
  "gpu_display 0.1.0",
  "gpu_renderer 0.1.0",
  "io_jail 0.1.0",
  "kvm 0.1.0",
+ "kvm_sys 0.1.0",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
+ "linux_input_sys 0.1.0",
  "msg_on_socket_derive 0.1.0",
  "msg_socket 0.1.0",
  "net_sys 0.1.0",
@@ -162,20 +169,35 @@
  "resources 0.1.0",
  "sync 0.1.0",
  "sys_util 0.1.0",
+ "syscall_defines 0.1.0",
+ "tempfile 3.0.7",
  "tpm2 0.1.0",
  "usb_util 0.1.0",
+ "vfio_sys 0.1.0",
  "vhost 0.1.0",
  "virtio_sys 0.1.0",
  "vm_control 0.1.0",
 ]
 
 [[package]]
+name = "disk"
+version = "0.1.0"
+dependencies = [
+ "data_model 0.1.0",
+ "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protobuf 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protos 0.1.0",
+ "remain 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "sys_util 0.1.0",
+]
+
+[[package]]
 name = "enumn"
 version = "0.1.0"
 dependencies = [
- "proc-macro2 0.4.21 (registry+https://github.com/rust-lang/crates.io-index)",
- "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)",
+ "proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "syn 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -201,6 +223,7 @@
  "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
  "data_model 0.1.0",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
+ "linux_input_sys 0.1.0",
  "sys_util 0.1.0",
 ]
 
@@ -218,6 +241,7 @@
 version = "0.1.0"
 dependencies = [
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
+ "minijail-sys 0.0.11",
 ]
 
 [[package]]
@@ -239,6 +263,7 @@
 name = "kvm"
 version = "0.1.0"
 dependencies = [
+ "data_model 0.1.0",
  "kvm_sys 0.1.0",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
  "msg_socket 0.1.0",
@@ -270,6 +295,15 @@
 ]
 
 [[package]]
+name = "linux_input_sys"
+version = "0.1.0"
+dependencies = [
+ "data_model 0.1.0",
+ "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
+ "sys_util 0.1.0",
+]
+
+[[package]]
 name = "log"
 version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -278,12 +312,20 @@
 ]
 
 [[package]]
+name = "minijail-sys"
+version = "0.0.11"
+dependencies = [
+ "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
+ "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
 name = "msg_on_socket_derive"
 version = "0.1.0"
 dependencies = [
- "proc-macro2 0.4.21 (registry+https://github.com/rust-lang/crates.io-index)",
- "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)",
+ "proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "syn 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -306,6 +348,7 @@
 name = "net_util"
 version = "0.1.0"
 dependencies = [
+ "data_model 0.1.0",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
  "net_sys 0.1.0",
  "sys_util 0.1.0",
@@ -336,9 +379,9 @@
 name = "poll_token_derive"
 version = "0.1.0"
 dependencies = [
- "proc-macro2 0.4.21 (registry+https://github.com/rust-lang/crates.io-index)",
- "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)",
+ "proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "syn 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -350,21 +393,29 @@
 ]
 
 [[package]]
+name = "proc-macro2"
+version = "1.0.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
 name = "protobuf"
-version = "2.4.2"
+version = "2.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
 name = "protobuf-codegen"
-version = "2.4.2"
+version = "2.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "protobuf 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protobuf 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "protoc"
-version = "2.4.2"
+version = "2.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "log 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -372,12 +423,12 @@
 
 [[package]]
 name = "protoc-rust"
-version = "2.4.2"
+version = "2.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "protobuf 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "protobuf-codegen 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "protoc 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protobuf 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protobuf-codegen 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protoc 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "tempfile 3.0.7",
 ]
 
@@ -386,28 +437,17 @@
 version = "0.1.0"
 dependencies = [
  "kvm_sys 0.1.0",
- "protobuf 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "protoc-rust 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
-]
-
-[[package]]
-name = "qcow"
-version = "0.1.0"
-dependencies = [
- "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
- "data_model 0.1.0",
- "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
- "remain 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "sys_util 0.1.0",
+ "protobuf 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "protoc-rust 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "qcow_utils"
 version = "0.1.0"
 dependencies = [
+ "disk 0.1.0",
  "getopts 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
- "qcow 0.1.0",
  "sys_util 0.1.0",
 ]
 
@@ -420,6 +460,14 @@
 ]
 
 [[package]]
+name = "quote"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
 name = "rand_ish"
 version = "0.1.0"
 
@@ -434,13 +482,6 @@
 ]
 
 [[package]]
-name = "render_node_forward"
-version = "0.1.0"
-dependencies = [
- "sys_util 0.1.0",
-]
-
-[[package]]
 name = "resources"
 version = "0.1.0"
 dependencies = [
@@ -461,6 +502,16 @@
 ]
 
 [[package]]
+name = "syn"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
 name = "sync"
 version = "0.1.0"
 
@@ -473,6 +524,7 @@
  "poll_token_derive 0.1.0",
  "sync 0.1.0",
  "syscall_defines 0.1.0",
+ "tempfile 3.0.7",
 ]
 
 [[package]]
@@ -483,7 +535,7 @@
 name = "tempfile"
 version = "3.0.7"
 dependencies = [
- "rand_ish 0.1.0",
+ "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -512,13 +564,34 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
+name = "unicode-xid"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "usb_sys"
+version = "0.1.0"
+dependencies = [
+ "sys_util 0.1.0",
+]
+
+[[package]]
 name = "usb_util"
 version = "0.1.0"
 dependencies = [
  "assertions 0.1.0",
  "data_model 0.1.0",
- "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)",
- "sync 0.1.0",
+ "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
+ "remain 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "sys_util 0.1.0",
+ "usb_sys 0.1.0",
+]
+
+[[package]]
+name = "vfio_sys"
+version = "0.1.0"
+dependencies = [
+ "sys_util 0.1.0",
 ]
 
 [[package]]
@@ -543,7 +616,6 @@
 name = "vm_control"
 version = "0.1.0"
 dependencies = [
- "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "data_model 0.1.0",
  "kvm 0.1.0",
  "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -556,9 +628,9 @@
 name = "wire_format_derive"
 version = "0.1.0"
 dependencies = [
- "proc-macro2 0.4.21 (registry+https://github.com/rust-lang/crates.io-index)",
- "quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)",
+ "proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "syn 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -567,7 +639,6 @@
 dependencies = [
  "arch 0.1.0",
  "assertions 0.1.0",
- "byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
  "data_model 0.1.0",
  "devices 0.1.0",
@@ -584,7 +655,7 @@
 ]
 
 [metadata]
-"checksum byteorder 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff81738b726f5d099632ceaffe7fb65b90212e8dce59d518729e7e8634032d3d"
+"checksum bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3d155346769a6855b86399e9bc3814ab343cd3d62c7e985113d46a0ec3c281fd"
 "checksum cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "f159dfd43363c4d08055a07703eb7a3406b0dac4d0584d96965a3262db3c9d16"
 "checksum cfg-if 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4e7bb64a8ebb0d856483e1e682ea3422f883c5f5615a90d51a2c82fe87fdd3"
 "checksum getopts 0.2.18 (registry+https://github.com/rust-lang/crates.io-index)" = "0a7292d30132fb5424b354f5dc02512a86e4c516fe544bb7a25e7f266951b797"
@@ -593,12 +664,16 @@
 "checksum num_cpus 1.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5a69d464bdc213aaaff628444e99578ede64e9c854025aa43b9796530afa9238"
 "checksum pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)" = "110d5ee3593dbb73f56294327fe5668bcc997897097cbc76b51e7aed3f52452f"
 "checksum proc-macro2 0.4.21 (registry+https://github.com/rust-lang/crates.io-index)" = "ab2fc21ba78ac73e4ff6b3818ece00be4e175ffbef4d0a717d978b48b24150c4"
-"checksum protobuf 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "524d165d95627ddebba768db728216c4429bbb62882f7e6ab1a6c3c54a7ed830"
-"checksum protobuf-codegen 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e142c5972a0736674d647714ac7a454f20aef31b09902d330583b8d8a96401a1"
-"checksum protoc 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "82ac4c59bf852f415c62a1d30da3348f977322dc66bdb283c92b3df9bee2073a"
-"checksum protoc-rust 2.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0e9dc0547688715431c954528a3dabe7559b4d53b3161426981e19419ea7b1f0"
+"checksum proc-macro2 1.0.8 (registry+https://github.com/rust-lang/crates.io-index)" = "3acb317c6ff86a4e579dfa00fc5e6cca91ecbb4e7eb2df0468805b674eb88548"
+"checksum protobuf 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "40361836defdd5871ff7e84096c6f6444af7fc157f8ef1789f54f147687caa20"
+"checksum protobuf-codegen 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "12c6abd78435445fc86898ebbd0521a68438063d4a73e23527b7134e6bf58b4a"
+"checksum protoc 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3998c4bc0af8ccbd3cc68245ee9f72663c5ae2fb78bc48ff7719aef11562edea"
+"checksum protoc-rust 2.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "234c97039c32bb58a883d0deafa57db37e59428ce536f3bdfe1c46cffec04113"
 "checksum quote 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "53fa22a1994bd0f9372d7a816207d8a2677ad0325b073f5c5332760f0fb62b5c"
+"checksum quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "053a8c8bcc71fcce321828dc897a98ab9760bef03a4fc36693c231e5b3216cfe"
 "checksum remain 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3bec2543b50be4539fdc27fde082e218cf4c3895358ca77f5c52fe930589e209"
 "checksum syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)" = "f92e629aa1d9c827b2bb8297046c1ccffc57c99b947a680d3ccff1f136a3bee9"
+"checksum syn 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)" = "af6f3550d8dff9ef7dc34d384ac6f107e5d31c8f57d9f28e0081503f547ac8f5"
 "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526"
 "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
+"checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
diff --git a/Cargo.toml b/Cargo.toml
index 048681f..7eb7215 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,13 @@
 authors = ["The Chromium OS Authors"]
 edition = "2018"
 
+[lib]
+path = "src/crosvm.rs"
+
+[[bin]]
+name = "crosvm"
+path = "src/main.rs"
+
 [profile.release]
 panic = 'abort'
 overflow-checks = true
@@ -12,6 +19,8 @@
 members = ["qcow_utils"]
 exclude = [
     "assertions",
+    "async_core",
+    "cros_async",
     "data_model",
     "rand_ish",
     "sync",
@@ -23,39 +32,41 @@
 [features]
 default-no-sandbox = []
 gpu = ["devices/gpu"]
-gpu-forward = ["render_node_forward"]
 plugin = ["protos/plugin", "crosvm_plugin", "protobuf"]
-sandboxed-libusb = ["devices/sandboxed-libusb", "vm_control/sandboxed-libusb"]
 tpm = ["devices/tpm"]
 wl-dmabuf = ["devices/wl-dmabuf", "gpu_buffer", "resources/wl-dmabuf"]
+x = ["devices/x"]
+virtio-gpu-next = ["gpu_renderer/virtio-gpu-next"]
+composite-disk = ["protos/composite-disk", "protobuf", "disk/composite-disk"]
+gfxstream = ["devices/gfxstream"]
 
 [dependencies]
 arch = { path = "arch" }
 assertions = { path = "assertions" }
 audio_streams = "*"
 bit_field = { path = "bit_field" }
-byteorder = "=1.1.0"
 crosvm_plugin = { path = "crosvm_plugin", optional = true }
 data_model = "*"
 devices = { path = "devices" }
+disk = { path = "disk" }
 enumn = { path = "enumn" }
 gpu_buffer = { path = "gpu_buffer", optional = true }
+gpu_renderer = { path = "gpu_renderer", optional = true }
 io_jail = { path = "io_jail" }
 kernel_cmdline = { path = "kernel_cmdline" }
 kernel_loader = { path = "kernel_loader" }
 kvm = { path = "kvm" }
 kvm_sys = { path = "kvm_sys" }
-libc = "=0.2.44"
+libc = "0.2.44"
 libcras = "*"
+minijail-sys = "*" # provided by ebuild
 msg_socket = { path = "msg_socket" }
 net_util = { path = "net_util" }
 p9 = { path = "p9" }
 protobuf = { version = "2.3", optional = true }
 protos = { path = "protos", optional = true }
-qcow = { path = "qcow" }
 rand_ish = { path = "rand_ish" }
 remain = "*"
-render_node_forward = { path = "render_node_forward", optional = true }
 resources = { path = "resources" }
 sync = { path = "sync" }
 sys_util = "*"
@@ -76,6 +87,7 @@
 audio_streams = { path = "../../third_party/adhd/audio_streams" } # ignored by ebuild
 data_model = { path = "data_model" }
 libcras = { path = "../../third_party/adhd/cras/client/libcras" } # ignored by ebuild
+minijail-sys = { path = "../../aosp/external/minijail" } # ignored by ebuild
 poll_token_derive = { path = "sys_util/poll_token_derive" }
 sync = { path = "sync" }
 sys_util = { path = "sys_util" }
diff --git a/OWNERS b/OWNERS
index fecc05c..11582f1 100644
--- a/OWNERS
+++ b/OWNERS
@@ -3,3 +3,6 @@
 dgreid@google.com
 smbarber@chromium.org
 zachr@chromium.org
+
+# So any team member can +2
+*
diff --git a/README.md b/README.md
index d287af8..51c200d 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,15 @@
 # crosvm - The Chrome OS Virtual Machine Monitor
 
 This component, known as crosvm, runs untrusted operating systems along with
-virtualized devices. No actual hardware is emulated. This only runs VMs
-through the Linux's KVM interface. What makes crosvm unique is a focus on
-safety within the programming language and a sandbox around the virtual
-devices to protect the kernel from attack in case of an exploit in the
-devices.
+virtualized devices. It runs VMs only through Linux's KVM interface. What
+makes crosvm unique is a focus on safety within the programming language and a
+sandbox around the virtual devices to protect the kernel from attack in case of
+an exploit in the devices.
+
+## IRC
+
+The channel #crosvm on [freenode](https://webchat.freenode.net/#crosvm) is used
+for technical discussion related to crosvm development and integration.
 
 ## Building with Docker
 
@@ -157,7 +161,6 @@
 dependencies also must be reviewed to ensure their suitability to the crosvm
 project. Currently allowed crates are:
 
-* `byteorder` - A very small library used for endian swaps.
 * `cc` - Build time dependency needed to build C source code used in crosvm.
 * `libc` - Required to use the standard library, this crate is a simple wrapper around `libc`'s symbols.
 
diff --git a/aarch64/src/fdt.rs b/aarch64/src/fdt.rs
index 753ca0f..9dcafa5 100644
--- a/aarch64/src/fdt.rs
+++ b/aarch64/src/fdt.rs
@@ -4,6 +4,7 @@
 
 use std::ffi::CStr;
 use std::fs::File;
+use std::io::Read;
 
 use arch::fdt::{
     begin_node, end_node, finish_fdt, generate_prop32, generate_prop64, property, property_cstring,
@@ -20,6 +21,7 @@
 use crate::AARCH64_GIC_CPUI_SIZE;
 use crate::AARCH64_GIC_DIST_BASE;
 use crate::AARCH64_GIC_DIST_SIZE;
+use crate::AARCH64_GIC_REDIST_SIZE;
 
 // These are RTC related constants
 use crate::AARCH64_RTC_ADDR;
@@ -86,16 +88,20 @@
     Ok(())
 }
 
-fn create_gic_node(fdt: &mut Vec<u8>) -> Result<()> {
-    let gic_reg_prop = generate_prop64(&[
-        AARCH64_GIC_DIST_BASE,
-        AARCH64_GIC_DIST_SIZE,
-        AARCH64_GIC_CPUI_BASE,
-        AARCH64_GIC_CPUI_SIZE,
-    ]);
+fn create_gic_node(fdt: &mut Vec<u8>, is_gicv3: bool, num_cpus: u64) -> Result<()> {
+    let mut gic_reg_prop = [AARCH64_GIC_DIST_BASE, AARCH64_GIC_DIST_SIZE, 0, 0];
 
     begin_node(fdt, "intc")?;
-    property_string(fdt, "compatible", "arm,cortex-a15-gic")?;
+    if is_gicv3 {
+        property_string(fdt, "compatible", "arm,gic-v3")?;
+        gic_reg_prop[2] = AARCH64_GIC_DIST_BASE - (AARCH64_GIC_REDIST_SIZE * num_cpus);
+        gic_reg_prop[3] = AARCH64_GIC_REDIST_SIZE * num_cpus;
+    } else {
+        property_string(fdt, "compatible", "arm,cortex-a15-gic")?;
+        gic_reg_prop[2] = AARCH64_GIC_CPUI_BASE;
+        gic_reg_prop[3] = AARCH64_GIC_CPUI_SIZE;
+    }
+    let gic_reg_prop = generate_prop64(&gic_reg_prop);
     property_u32(fdt, "#interrupt-cells", GIC_FDT_IRQ_NUM_CELLS)?;
     property_null(fdt, "interrupt-controller")?;
     property(fdt, "reg", &gic_reg_prop)?;
@@ -182,7 +188,21 @@
     begin_node(fdt, "chosen")?;
     property_u32(fdt, "linux,pci-probe-only", 1)?;
     property_cstring(fdt, "bootargs", cmdline)?;
-    property_u64(fdt, "kaslr", 0)?;
+
+    let mut random_file = File::open("/dev/urandom").map_err(Error::FdtIoError)?;
+    let mut kaslr_seed_bytes = [0u8; 8];
+    random_file
+        .read_exact(&mut kaslr_seed_bytes)
+        .map_err(Error::FdtIoError)?;
+    let kaslr_seed = u64::from_le_bytes(kaslr_seed_bytes);
+    property_u64(fdt, "kaslr-seed", kaslr_seed)?;
+
+    let mut rng_seed_bytes = [0u8; 256];
+    random_file
+        .read_exact(&mut rng_seed_bytes)
+        .map_err(Error::FdtIoError)?;
+    property(fdt, "rng-seed", &rng_seed_bytes)?;
+
     if let Some((initrd_addr, initrd_size)) = initrd {
         let initrd_start = initrd_addr.offset() as u32;
         let initrd_end = initrd_start + initrd_size as u32;
@@ -318,6 +338,7 @@
 /// * `cmdline` - The kernel commandline
 /// * `initrd` - An optional tuple of initrd guest physical address and size
 /// * `android_fstab` - An optional file holding Android fstab entries
+/// * `is_gicv3` - True for a GICv3 interrupt controller, false for GICv2
 pub fn create_fdt(
     fdt_max_size: usize,
     guest_mem: &GuestMemory,
@@ -329,6 +350,7 @@
     cmdline: &CStr,
     initrd: Option<(GuestAddress, usize)>,
     android_fstab: Option<File>,
+    is_gicv3: bool,
 ) -> Result<()> {
     let mut fdt = vec![0; fdt_max_size];
     start_fdt(&mut fdt, fdt_max_size)?;
@@ -345,7 +367,7 @@
     create_chosen_node(&mut fdt, cmdline, initrd)?;
     create_memory_node(&mut fdt, guest_mem)?;
     create_cpu_nodes(&mut fdt, num_cpus)?;
-    create_gic_node(&mut fdt)?;
+    create_gic_node(&mut fdt, is_gicv3, num_cpus as u64)?;
     create_timer_node(&mut fdt, num_cpus)?;
     create_serial_nodes(&mut fdt)?;
     create_psci_node(&mut fdt)?;
diff --git a/aarch64/src/lib.rs b/aarch64/src/lib.rs
index d21d70d..67fad72 100644
--- a/aarch64/src/lib.rs
+++ b/aarch64/src/lib.rs
@@ -44,6 +44,7 @@
 // address space.
 const AARCH64_GIC_DIST_BASE: u64 = AARCH64_AXI_BASE - AARCH64_GIC_DIST_SIZE;
 const AARCH64_GIC_CPUI_BASE: u64 = AARCH64_GIC_DIST_BASE - AARCH64_GIC_CPUI_SIZE;
+const AARCH64_GIC_REDIST_SIZE: u64 = 0x20000;
 
 // This is the minimum number of SPI interrupts aligned to 32 + 32 for the
 // PPI (16) and GSI (16).
@@ -195,6 +196,7 @@
         mut components: VmComponents,
         _split_irqchip: bool,
         serial_parameters: &BTreeMap<u8, SerialParameters>,
+        serial_jail: Option<Minijail>,
         create_devices: F,
     ) -> Result<RunnableLinuxVm>
     where
@@ -229,12 +231,16 @@
 
         let vcpu_affinity = components.vcpu_affinity;
 
-        let irq_chip = Self::create_irq_chip(&vm)?;
+        let (irq_chip, is_gicv3) = Self::create_irq_chip(&vm, vcpu_count as u64)?;
 
         let mut mmio_bus = devices::Bus::new();
 
         let exit_evt = EventFd::new().map_err(Error::CreateEventFd)?;
 
+        // Event used by PMDevice to notify crosvm that
+        // the guest OS is trying to suspend.
+        let suspend_evt = EventFd::new().map_err(Error::CreateEventFd)?;
+
         let pci_devices = create_devices(&mem, &mut vm, &mut resources, &exit_evt)
             .map_err(|e| Error::CreateDevices(Box::new(e)))?;
         let (pci, pci_irqs, pid_debug_label_map) =
@@ -249,11 +255,12 @@
 
         let com_evt_1_3 = EventFd::new().map_err(Error::CreateEventFd)?;
         let com_evt_2_4 = EventFd::new().map_err(Error::CreateEventFd)?;
-        let (stdio_serial_num, stdio_serial) = arch::add_serial_devices(
+        let stdio_serial_num = arch::add_serial_devices(
             &mut mmio_bus,
             &com_evt_1_3,
             &com_evt_2_4,
             &serial_parameters,
+            serial_jail,
         )
         .map_err(Error::CreateSerialDevices)?;
 
@@ -296,13 +303,13 @@
             pci_irqs,
             components.android_fstab,
             kernel_end,
+            is_gicv3,
         )?;
 
         Ok(RunnableLinuxVm {
             vm,
             kvm,
             resources,
-            stdio_serial,
             exit_evt,
             vcpus,
             vcpu_affinity,
@@ -310,6 +317,7 @@
             io_bus,
             mmio_bus,
             pid_debug_label_map,
+            suspend_evt,
         })
     }
 }
@@ -324,6 +332,7 @@
         pci_irqs: Vec<(u32, PciInterruptPin)>,
         android_fstab: Option<File>,
         kernel_end: u64,
+        is_gicv3: bool,
     ) -> Result<()> {
         let initrd = match initrd_file {
             Some(initrd_file) => {
@@ -339,7 +348,7 @@
             }
             None => None,
         };
-        let (pci_device_base, pci_device_size) = Self::get_device_addr_base_size(mem_size);
+        let (pci_device_base, pci_device_size) = Self::get_high_mmio_base_size(mem_size);
         fdt::create_fdt(
             AARCH64_FDT_MAX_SIZE as usize,
             mem,
@@ -351,6 +360,7 @@
             cmdline,
             initrd,
             android_fstab,
+            is_gicv3,
         )
         .map_err(Error::CreateFdt)?;
         Ok(())
@@ -362,7 +372,7 @@
         Ok(mem)
     }
 
-    fn get_device_addr_base_size(mem_size: u64) -> (u64, u64) {
+    fn get_high_mmio_base_size(mem_size: u64) -> (u64, u64) {
         let base = AARCH64_PHYS_MEM_START + mem_size;
         let size = u64::max_value() - base;
         (base, size)
@@ -371,8 +381,8 @@
     /// This returns a base part of the kernel command for this architecture
     fn get_base_linux_cmdline(stdio_serial_num: Option<u8>) -> kernel_cmdline::Cmdline {
         let mut cmdline = kernel_cmdline::Cmdline::new(sys_util::pagesize());
-        if stdio_serial_num.is_some() {
-            let tty_string = get_serial_tty_string(stdio_serial_num.unwrap());
+        if let Some(stdio_serial_num) = stdio_serial_num {
+            let tty_string = get_serial_tty_string(stdio_serial_num);
             cmdline.insert("console", &tty_string).unwrap();
         }
         cmdline.insert_str("panic=-1").unwrap();
@@ -381,10 +391,10 @@
 
     /// Returns a system resource allocator.
     fn get_resource_allocator(mem_size: u64, gpu_allocation: bool) -> SystemAllocator {
-        let (device_addr_base, device_addr_size) = Self::get_device_addr_base_size(mem_size);
+        let (high_mmio_base, high_mmio_size) = Self::get_high_mmio_base_size(mem_size);
         SystemAllocator::builder()
-            .add_device_addresses(device_addr_base, device_addr_size)
-            .add_mmio_addresses(AARCH64_MMIO_BASE, AARCH64_MMIO_SIZE)
+            .add_high_mmio_addresses(high_mmio_base, high_mmio_size)
+            .add_low_mmio_addresses(AARCH64_MMIO_BASE, AARCH64_MMIO_SIZE)
             .create_allocator(AARCH64_IRQ_BASE, gpu_allocation)
             .unwrap()
     }
@@ -414,11 +424,14 @@
     /// # Arguments
     ///
     /// * `vm` - the vm object
-    fn create_irq_chip(vm: &Vm) -> Result<Option<File>> {
+    /// * `vcpu_count` - the number of vCPUs
+    fn create_irq_chip(vm: &Vm, vcpu_count: u64) -> Result<(Option<File>, bool)> {
         let cpu_if_addr: u64 = AARCH64_GIC_CPUI_BASE;
         let dist_if_addr: u64 = AARCH64_GIC_DIST_BASE;
+        let redist_addr: u64 = dist_if_addr - (AARCH64_GIC_REDIST_SIZE * vcpu_count);
         let raw_cpu_if_addr = &cpu_if_addr as *const u64;
         let raw_dist_if_addr = &dist_if_addr as *const u64;
+        let raw_redist_addr = &redist_addr as *const u64;
 
         let cpu_if_attr = kvm_device_attr {
             group: kvm_sys::KVM_DEV_ARM_VGIC_GRP_ADDR,
@@ -426,19 +439,40 @@
             addr: raw_cpu_if_addr as u64,
             flags: 0,
         };
-        let dist_attr = kvm_device_attr {
+        let redist_attr = kvm_device_attr {
             group: kvm_sys::KVM_DEV_ARM_VGIC_GRP_ADDR,
-            attr: kvm_sys::KVM_VGIC_V2_ADDR_TYPE_DIST as u64,
-            addr: raw_dist_if_addr as u64,
+            attr: kvm_sys::KVM_VGIC_V3_ADDR_TYPE_REDIST as u64,
+            addr: raw_redist_addr as u64,
             flags: 0,
         };
+        let mut dist_attr = kvm_device_attr {
+            group: kvm_sys::KVM_DEV_ARM_VGIC_GRP_ADDR,
+            addr: raw_dist_if_addr as u64,
+            attr: 0,
+            flags: 0,
+        };
+
         let mut kcd = kvm_sys::kvm_create_device {
-            type_: kvm_sys::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V2,
+            type_: kvm_sys::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3,
             fd: 0,
             flags: 0,
         };
-        vm.create_device(&mut kcd)
-            .map_err(|e| Error::CreateGICFailure(e))?;
+
+        let mut cpu_redist_attr = redist_attr;
+        let mut is_gicv3 = true;
+        dist_attr.attr = kvm_sys::KVM_VGIC_V3_ADDR_TYPE_DIST as u64;
+        if vm.create_device(&mut kcd).is_err() {
+            is_gicv3 = false;
+            cpu_redist_attr = cpu_if_attr;
+            kcd.type_ = kvm_sys::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V2;
+            dist_attr.attr = kvm_sys::KVM_VGIC_V2_ADDR_TYPE_DIST as u64;
+            vm.create_device(&mut kcd)
+                .map_err(|e| Error::CreateGICFailure(e))?;
+        }
+
+        let is_gicv3 = is_gicv3;
+        let cpu_redist_attr = cpu_redist_attr;
+        let dist_attr = dist_attr;
 
         // Safe because the kernel is passing us an FD back inside
         // the struct after we successfully did the create_device ioctl
@@ -446,7 +480,7 @@
 
         // Safe because we allocated the struct that's being passed in
         let ret = unsafe {
-            sys_util::ioctl_with_ref(&vgic_fd, kvm_sys::KVM_SET_DEVICE_ATTR(), &cpu_if_attr)
+            sys_util::ioctl_with_ref(&vgic_fd, kvm_sys::KVM_SET_DEVICE_ATTR(), &cpu_redist_attr)
         };
         if ret != 0 {
             return Err(Error::CreateGICFailure(sys_util::Error::new(ret)));
@@ -492,7 +526,7 @@
         if ret != 0 {
             return Err(Error::SetDeviceAttr(sys_util::Error::new(ret)));
         }
-        Ok(Some(vgic_fd))
+        Ok((Some(vgic_fd), is_gicv3))
     }
 
     fn configure_vcpu(
diff --git a/arch/Android.bp b/arch/Android.bp
index 5bcf664..489c7e2 100644
--- a/arch/Android.bp
+++ b/arch/Android.bp
@@ -2,11 +2,10 @@
 
 rust_library_host_rlib {
     name: "libarch",
+    defaults: ["crosvm_defaults"],
     crate_name: "arch",
     srcs: ["src/lib.rs"],
-    defaults: ["crosvm_defaults"],
     rlibs: [
-        "libbyteorder",
         "libdevices",
         "libio_jail",
         "libkernel_cmdline",
diff --git a/arch/Cargo.toml b/arch/Cargo.toml
index f562fca..bf28560 100644
--- a/arch/Cargo.toml
+++ b/arch/Cargo.toml
@@ -5,7 +5,6 @@
 edition = "2018"
 
 [dependencies]
-byteorder = "*"
 devices = { path = "../devices" }
 io_jail = { path = "../io_jail" }
 kernel_cmdline = { path = "../kernel_cmdline" }
diff --git a/arch/src/fdt.rs b/arch/src/fdt.rs
index 71c791d..a3861d5 100644
--- a/arch/src/fdt.rs
+++ b/arch/src/fdt.rs
@@ -2,7 +2,6 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-use byteorder::{BigEndian, ByteOrder};
 use libc::{c_char, c_int, c_void};
 use std::ffi::{CStr, CString};
 use std::fmt::{self, Display};
@@ -106,15 +105,11 @@
 }
 
 fn cpu_to_fdt32(input: u32) -> [u8; 4] {
-    let mut buf = [0; 4];
-    BigEndian::write_u32(&mut buf, input);
-    buf
+    input.to_be_bytes()
 }
 
 fn cpu_to_fdt64(input: u64) -> [u8; 8] {
-    let mut buf = [0; 8];
-    BigEndian::write_u64(&mut buf, input);
-    buf
+    input.to_be_bytes()
 }
 
 pub fn property_u32(fdt: &mut Vec<u8>, name: &str, val: u32) -> Result<()> {
diff --git a/arch/src/lib.rs b/arch/src/lib.rs
index 71e4327..6af78e3 100644
--- a/arch/src/lib.rs
+++ b/arch/src/lib.rs
@@ -4,6 +4,7 @@
 
 pub mod android;
 pub mod fdt;
+pub mod pstore;
 
 use std::collections::BTreeMap;
 use std::error::Error as StdError;
@@ -11,12 +12,13 @@
 use std::fs::File;
 use std::io::{self, Read, Seek, SeekFrom};
 use std::os::unix::io::AsRawFd;
+use std::path::PathBuf;
 use std::sync::Arc;
 
 use devices::virtio::VirtioDevice;
 use devices::{
     Bus, BusDevice, BusError, PciDevice, PciDeviceError, PciInterruptPin, PciRoot, ProxyDevice,
-    Serial, SerialParameters, DEFAULT_SERIAL_PARAMS, SERIAL_ADDR,
+    SerialParameters, DEFAULT_SERIAL_PARAMS, SERIAL_ADDR,
 };
 use io_jail::Minijail;
 use kvm::{IoeventAddress, Kvm, Vcpu, Vm};
@@ -29,6 +31,12 @@
     Bios(File),
 }
 
+#[derive(Clone)]
+pub struct Pstore {
+    pub path: PathBuf,
+    pub size: u32,
+}
+
 /// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to
 /// create a `RunnableLinuxVm`.
 pub struct VmComponents {
@@ -37,6 +45,7 @@
     pub vcpu_affinity: Vec<usize>,
     pub vm_image: VmImage,
     pub android_fstab: Option<File>,
+    pub pstore: Option<Pstore>,
     pub initrd_image: Option<File>,
     pub extra_kernel_params: Vec<String>,
     pub wayland_dmabuf: bool,
@@ -47,7 +56,6 @@
     pub vm: Vm,
     pub kvm: Kvm,
     pub resources: SystemAllocator,
-    pub stdio_serial: Option<Arc<Mutex<Serial>>>,
     pub exit_evt: EventFd,
     pub vcpus: Vec<Vcpu>,
     pub vcpu_affinity: Vec<usize>,
@@ -55,6 +63,7 @@
     pub io_bus: Bus,
     pub mmio_bus: Bus,
     pub pid_debug_label_map: BTreeMap<u32, String>,
+    pub suspend_evt: EventFd,
 }
 
 /// The device and optional jail.
@@ -80,6 +89,7 @@
         components: VmComponents,
         split_irqchip: bool,
         serial_parameters: &BTreeMap<u8, SerialParameters>,
+        serial_jail: Option<Minijail>,
         create_devices: F,
     ) -> Result<RunnableLinuxVm, Self::Error>
     where
@@ -101,9 +111,9 @@
     AllocateDeviceAddrs(PciDeviceError),
     /// Could not allocate an IRQ number.
     AllocateIrq,
-    /// Could not create the mmio device to wrap a VirtioDevice.
-    CreateMmioDevice(sys_util::Error),
-    //  Unable to create serial device from serial parameters
+    /// Unable to create a pipe.
+    CreatePipe(sys_util::Error),
+    /// Unable to create serial device from serial parameters.
     CreateSerialDevice(devices::SerialError),
     /// Could not create an event fd.
     EventFdCreate(sys_util::Error),
@@ -133,7 +143,7 @@
             AllocateIoAddrs(e) => write!(f, "Allocating IO addresses: {}", e),
             AllocateDeviceAddrs(e) => write!(f, "Allocating device addresses: {}", e),
             AllocateIrq => write!(f, "Allocating IRQ number"),
-            CreateMmioDevice(e) => write!(f, "failed to create mmio device: {}", e),
+            CreatePipe(e) => write!(f, "failed to create pipe: {}", e),
             CreateSerialDevice(e) => write!(f, "failed to create serial device: {}", e),
             Cmdline(e) => write!(f, "unable to add device to kernel command line: {}", e),
             EventFdCreate(e) => write!(f, "failed to create eventfd: {}", e),
@@ -243,9 +253,9 @@
     com_evt_1_3: &EventFd,
     com_evt_2_4: &EventFd,
     serial_parameters: &BTreeMap<u8, SerialParameters>,
-) -> Result<(Option<u8>, Option<Arc<Mutex<Serial>>>), DeviceRegistrationError> {
+    serial_jail: Option<Minijail>,
+) -> Result<Option<u8>, DeviceRegistrationError> {
     let mut stdio_serial_num = None;
-    let mut stdio_serial = None;
 
     for x in 0..=3 {
         let com_evt = match x {
@@ -260,22 +270,35 @@
             .get(&(x + 1))
             .unwrap_or(&DEFAULT_SERIAL_PARAMS[x as usize]);
 
-        let com = Arc::new(Mutex::new(
-            param
-                .create_serial_device(&com_evt)
-                .map_err(DeviceRegistrationError::CreateSerialDevice)?,
-        ));
-        io_bus
-            .insert(com.clone(), SERIAL_ADDR[x as usize], 0x8, false)
-            .unwrap();
-
         if param.console {
             stdio_serial_num = Some(x + 1);
-            stdio_serial = Some(com.clone());
+        }
+
+        let mut preserved_fds = Vec::new();
+        let com = param
+            .create_serial_device(&com_evt, &mut preserved_fds)
+            .map_err(DeviceRegistrationError::CreateSerialDevice)?;
+
+        match serial_jail.as_ref() {
+            Some(jail) => {
+                let com = Arc::new(Mutex::new(
+                    ProxyDevice::new(com, &jail, preserved_fds)
+                        .map_err(DeviceRegistrationError::ProxyDeviceCreation)?,
+                ));
+                io_bus
+                    .insert(com.clone(), SERIAL_ADDR[x as usize], 0x8, false)
+                    .unwrap();
+            }
+            None => {
+                let com = Arc::new(Mutex::new(com));
+                io_bus
+                    .insert(com.clone(), SERIAL_ADDR[x as usize], 0x8, false)
+                    .unwrap();
+            }
         }
     }
 
-    Ok((stdio_serial_num, stdio_serial))
+    Ok(stdio_serial_num)
 }
 
 /// Errors for image loading.
diff --git a/arch/src/pstore.rs b/arch/src/pstore.rs
new file mode 100644
index 0000000..a06ea1b
--- /dev/null
+++ b/arch/src/pstore.rs
@@ -0,0 +1,74 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::fmt::{self, Display};
+use std::fs::OpenOptions;
+use std::io;
+
+use crate::Pstore;
+use kvm::Vm;
+use resources::SystemAllocator;
+use resources::{Alloc, MmioType};
+use sys_util::{GuestAddress, MemoryMapping};
+
+/// Error for pstore.
+#[derive(Debug)]
+pub enum Error {
+    IoError(io::Error),
+    MmapError(sys_util::MmapError),
+    ResourcesError(resources::Error),
+    SysUtilError(sys_util::Error),
+}
+
+impl Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+
+        match self {
+            IoError(e) => write!(f, "failed to create pstore backend file: {}", e),
+            MmapError(e) => write!(f, "failed to get file mapped address: {}", e),
+            ResourcesError(e) => write!(f, "failed to allocate pstore region: {}", e),
+            SysUtilError(e) => write!(f, "failed to add pstore region to mmio: {}", e),
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+type Result<T> = std::result::Result<T, Error>;
+
+pub struct RamoopsRegion {
+    pub address: u64,
+    pub size: u32,
+}
+
+/// Creates a mmio memory region for pstore.
+pub fn create_memory_region(
+    vm: &mut Vm,
+    resources: &mut SystemAllocator,
+    pstore: &Pstore,
+) -> Result<RamoopsRegion> {
+    let file = OpenOptions::new()
+        .read(true)
+        .write(true)
+        .create(true)
+        .open(&pstore.path)
+        .map_err(Error::IoError)?;
+    file.set_len(pstore.size as u64).map_err(Error::IoError)?;
+
+    let address = resources
+        .mmio_allocator(MmioType::High)
+        .allocate(pstore.size as u64, Alloc::Pstore, "pstore".to_owned())
+        .map_err(Error::ResourcesError)?;
+
+    let memory_mapping =
+        MemoryMapping::from_fd(&file, pstore.size as usize).map_err(Error::MmapError)?;
+
+    vm.add_mmio_memory(GuestAddress(address), memory_mapping, false, false)
+        .map_err(Error::SysUtilError)?;
+
+    Ok(RamoopsRegion {
+        address,
+        size: pstore.size,
+    })
+}
diff --git a/async_core/Cargo.toml b/async_core/Cargo.toml
new file mode 100644
index 0000000..1555bd9
--- /dev/null
+++ b/async_core/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "async_core"
+version = "0.1.0"
+authors = ["The Chromium OS Authors"]
+edition = "2018"
+
+[dependencies]
+libc = "*"
+cros_async = { path = "../cros_async" }
+sys_util = { path = "../sys_util" }
+syscall_defines = { path = "../syscall_defines" }
+
+[dependencies.futures]
+version = "*"
+default-features = false
diff --git a/async_core/src/eventfd.rs b/async_core/src/eventfd.rs
new file mode 100644
index 0000000..b930f07
--- /dev/null
+++ b/async_core/src/eventfd.rs
@@ -0,0 +1,166 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use futures::Stream;
+use std::convert::TryFrom;
+use std::fmt::{self, Display};
+use std::os::unix::io::AsRawFd;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+
+use libc::{EWOULDBLOCK, O_NONBLOCK};
+
+use sys_util::{self, add_fd_flags};
+
+use cros_async::fd_executor::{self, add_read_waker};
+
+/// Errors generated while polling for events.
+#[derive(Debug)]
+pub enum Error {
+    /// An error occurred attempting to register a waker with the executor.
+    AddingWaker(fd_executor::Error),
+    /// Failure creating the event FD.
+    EventFdCreate(sys_util::Error),
+    /// An error occurred when reading the event FD.
+    EventFdRead(sys_util::Error),
+    /// An error occurred when setting the event FD non-blocking.
+    SettingNonBlocking(sys_util::Error),
+}
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl std::error::Error for Error {}
+
+impl Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+
+        match self {
+            AddingWaker(e) => write!(
+                f,
+                "An error occurred attempting to register a waker with the executor: {}.",
+                e
+            ),
+            EventFdCreate(e) => write!(f, "An error occurred when creating the event FD: {}.", e),
+            EventFdRead(e) => write!(f, "An error occurred when reading the event FD: {}.", e),
+            SettingNonBlocking(e) => {
+                write!(f, "An error occurred setting the FD non-blocking: {}.", e)
+            }
+        }
+    }
+}
+
+/// Asynchronous version of `sys_util::EventFd`. Provides an implementation of `futures::Stream` so
+/// that events can be consumed in an async context.
+///
+/// # Example
+///
+/// ```
+/// use std::convert::TryInto;
+///
+/// use async_core::EventFd;
+/// use futures::StreamExt;
+/// use sys_util::{self};
+///
+/// async fn process_events() -> std::result::Result<(), Box<dyn std::error::Error>> {
+///     let mut async_events: EventFd = sys_util::EventFd::new()?.try_into()?;
+///     while let Some(e) = async_events.next().await {
+///         // Handle event here.
+///     }
+///     Ok(())
+/// }
+/// ```
+pub struct EventFd {
+    inner: sys_util::EventFd,
+    done: bool,
+}
+
+impl EventFd {
+    pub fn new() -> Result<EventFd> {
+        Self::try_from(sys_util::EventFd::new().map_err(Error::EventFdCreate)?)
+    }
+}
+
+impl TryFrom<sys_util::EventFd> for EventFd {
+    type Error = crate::eventfd::Error;
+
+    fn try_from(eventfd: sys_util::EventFd) -> Result<EventFd> {
+        let fd = eventfd.as_raw_fd();
+        add_fd_flags(fd, O_NONBLOCK).map_err(Error::SettingNonBlocking)?;
+        Ok(EventFd {
+            inner: eventfd,
+            done: false,
+        })
+    }
+}
+
+impl Stream for EventFd {
+    type Item = Result<u64>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<Self::Item>> {
+        if self.done {
+            return Poll::Ready(None);
+        }
+
+        let res = self
+            .inner
+            .read()
+            .map(|v| Poll::Ready(Some(Ok(v))))
+            .or_else(|e| {
+                if e.errno() == EWOULDBLOCK {
+                    add_read_waker(self.inner.as_raw_fd(), cx.waker().clone())
+                        .map(|()| Poll::Pending)
+                        .map_err(Error::AddingWaker)
+                } else {
+                    Err(Error::EventFdRead(e))
+                }
+            });
+
+        match res {
+            Ok(v) => v,
+            Err(e) => {
+                self.done = true;
+                Poll::Ready(Some(Err(e)))
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use cros_async::{select2, SelectResult};
+    use futures::future::pending;
+    use futures::pin_mut;
+    use futures::stream::StreamExt;
+
+    #[test]
+    fn eventfd_write_read() {
+        let evt = EventFd::new().unwrap();
+        async fn read_one(mut evt: EventFd) -> u64 {
+            if let Some(Ok(e)) = evt.next().await {
+                e
+            } else {
+                66
+            }
+        }
+        async fn write_pend(evt: sys_util::EventFd) {
+            evt.write(55).unwrap();
+            let () = pending().await;
+        }
+        let write_evt = evt.inner.try_clone().unwrap();
+
+        let r = read_one(evt);
+        pin_mut!(r);
+        let w = write_pend(write_evt);
+        pin_mut!(w);
+
+        if let Ok((SelectResult::Finished(read_res), SelectResult::Pending(_pend_fut))) =
+            select2(r, w)
+        {
+            assert_eq!(read_res, 55);
+        } else {
+            panic!("wrong futures returned from select2");
+        }
+    }
+}
diff --git a/async_core/src/lib.rs b/async_core/src/lib.rs
new file mode 100644
index 0000000..96cdcf1
--- /dev/null
+++ b/async_core/src/lib.rs
@@ -0,0 +1,11 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+//! Extensions using cros_async and futures-rs to add asynchronous operations to sys_util features.
+//! Provides basic `Futures` implementations for some of the interfaces provided by the `sys_util`
+//! crate.
+
+mod eventfd;
+
+pub use eventfd::EventFd;
diff --git a/bin/clippy b/bin/clippy
index 9f31816..66df660 100755
--- a/bin/clippy
+++ b/bin/clippy
@@ -20,7 +20,6 @@
     let_unit_value
     question_mark
     range_plus_one
-    unit_arg
 
     # We don't care about these lints. Okay to remain suppressed globally.
     blacklisted_name
diff --git a/bin/fmt b/bin/fmt
index 39eac2c..60070dd 100755
--- a/bin/fmt
+++ b/bin/fmt
@@ -4,7 +4,7 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
-# Run `cargo fmt` on all Rust code contained in crosvm. This is different from
+# Run `rustfmt` on all Rust code contained in crosvm. This is different from
 # `cargo fmt --all` which formats multiple crates but a single workspace only.
 # Crosvm consists of multiple workspaces.
 #
@@ -26,19 +26,4 @@
 # Jump up to root directory of crosvm repo.
 cd ..
 
-# Keep track of whether any cargo fmt invocation exited with error.
-EXIT=0
-
-FIND_CARGO_TOMLS="$(find "$PWD" -name Cargo.toml)"
-
-while read path_to_cargo_toml; do
-    cd "$(dirname "$path_to_cargo_toml")"
-
-    if grep --quiet '\[workspace\]' Cargo.toml; then
-        if ! cargo fmt --all -- "$@"; then
-            EXIT=1
-        fi
-    fi
-done <<< "$FIND_CARGO_TOMLS"
-
-exit $EXIT
+find . -name '*.rs' -print0 | grep -vz '^./target/' | xargs -0 rustfmt --edition=2018 "$@" --
diff --git a/bin/smoke_test b/bin/smoke_test
index e488e0e..b2d71f3 100755
--- a/bin/smoke_test
+++ b/bin/smoke_test
@@ -12,7 +12,7 @@
 rustup component add rustfmt-preview
 cargo --version && rustc --version && rustfmt --version
 echo "Running cargo test"
-cargo test --no-fail-fast --features plugin,default-no-sandbox,wl-dmabuf,gpu,tpm,gpu-forward \
+cargo test --no-fail-fast --features plugin,default-no-sandbox,wl-dmabuf,gpu,tpm \
     --all --exclude aarch64 $TEST_FLAGS -- \
     --test-threads=1 $TEST_RUNNER_FLAGS
 echo "Running cargo fmt"
diff --git a/bit_field/bit_field_derive/Android.bp b/bit_field/bit_field_derive/Android.bp
index bdd83bf..7d3d711 100644
--- a/bit_field/bit_field_derive/Android.bp
+++ b/bit_field/bit_field_derive/Android.bp
@@ -8,6 +8,6 @@
     rlibs: [
         "libproc_macro2",
         "libquote",
-        "libsyn-0.15.42",
+        "libsyn",
     ],
 }
diff --git a/bit_field/bit_field_derive/Cargo.toml b/bit_field/bit_field_derive/Cargo.toml
index df84a2a..4642b93 100644
--- a/bit_field/bit_field_derive/Cargo.toml
+++ b/bit_field/bit_field_derive/Cargo.toml
@@ -5,9 +5,9 @@
 edition = "2018"
 
 [dependencies]
-proc-macro2 = "=0.4"
-quote = "=0.6"
-syn = "=0.15"
+proc-macro2 = "^1"
+quote = "^1"
+syn = "^1"
 
 [lib]
 proc-macro = true
diff --git a/bit_field/bit_field_derive/bit_field_derive.rs b/bit_field/bit_field_derive/bit_field_derive.rs
index 7084e22..92fea94 100644
--- a/bit_field/bit_field_derive/bit_field_derive.rs
+++ b/bit_field/bit_field_derive/bit_field_derive.rs
@@ -75,14 +75,14 @@
 
     let ident = &ast.ident;
 
-    if width.value() > 64 {
+    if width > 64 {
         return Err(Error::new(
             Span::call_site(),
             "max width of bitfield field is 64",
         ));
     }
 
-    let bits = width.value() as u8;
+    let bits = width as u8;
 
     if fields.unnamed.len() != 1 {
         return Err(Error::new(
@@ -91,7 +91,7 @@
         ));
     }
 
-    let field_type = match &fields.unnamed.first().unwrap().value().ty {
+    let field_type = match &fields.unnamed.first().unwrap().ty {
         Type::Path(t) => t,
         _ => {
             return Err(Error::new(
@@ -100,14 +100,7 @@
             ));
         }
     };
-    let span = field_type
-        .path
-        .segments
-        .first()
-        .unwrap()
-        .value()
-        .ident
-        .span();
+    let span = field_type.path.segments.first().unwrap().ident.span();
 
     let from_u64 = quote_spanned! {
         span => val as #field_type
@@ -145,22 +138,22 @@
     let width = parse_remove_bits_attr(&mut ast)?;
     match width {
         None => bitfield_enum_without_width_impl(&ast, data),
-        Some(width) => bitfield_enum_with_width_impl(&ast, data, &width),
+        Some(width) => bitfield_enum_with_width_impl(&ast, data, width),
     }
 }
 
 fn bitfield_enum_with_width_impl(
     ast: &DeriveInput,
     data: &DataEnum,
-    width: &LitInt,
+    width: u64,
 ) -> Result<TokenStream> {
-    if width.value() > 64 {
+    if width > 64 {
         return Err(Error::new(
             Span::call_site(),
             "max width of bitfield enum is 64",
         ));
     }
-    let bits = width.value() as u8;
+    let bits = width as u8;
     let declare_discriminants = get_declare_discriminants_for_enum(bits, ast, data);
 
     let ident = &ast.ident;
@@ -409,14 +402,14 @@
     Ok(None)
 }
 
-fn parse_remove_bits_attr(ast: &mut DeriveInput) -> Result<Option<LitInt>> {
+fn parse_remove_bits_attr(ast: &mut DeriveInput) -> Result<Option<u64>> {
     let mut width = None;
     let mut bits_idx = 0;
 
     for (i, attr) in ast.attrs.iter().enumerate() {
         if let Some(w) = try_parse_bits_attr(attr)? {
             bits_idx = i;
-            width = Some(w);
+            width = Some(w.base10_parse()?);
         }
     }
 
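Not part of the patch: a minimal standalone sketch of the two syn 0.15 -> 1.0 API changes the hunks above migrate to, assuming `syn = "1"` with default features. `LitInt` no longer exposes `value()`, so widths are read with `base10_parse()`, and `Punctuated::first()` now returns `Option<&T>` directly rather than a pair wrapper.

```rust
// Sketch only (assumes syn 1.x); mirrors the migration in bit_field_derive.rs.
use syn::{parse_quote, Data, DeriveInput, Fields, LitInt};

fn main() -> syn::Result<()> {
    // parse_remove_bits_attr() now converts the literal with base10_parse()
    // instead of the removed LitInt::value().
    let lit: LitInt = parse_quote!(12);
    let width: u64 = lit.base10_parse()?;
    assert_eq!(width, 12);

    // Punctuated::first() yields Option<&Field>, so the old .value() call goes away.
    let input: DeriveInput = parse_quote!(struct Bits(u16););
    if let Data::Struct(s) = &input.data {
        if let Fields::Unnamed(fields) = &s.fields {
            let _ty = &fields.unnamed.first().unwrap().ty;
        }
    }
    Ok(())
}
```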
diff --git a/build_test.py b/build_test.py
index 21d87df..81d09db 100755
--- a/build_test.py
+++ b/build_test.py
@@ -26,16 +26,30 @@
 X86_64_TRIPLE = os.getenv('X86_64_TRIPLE', 'x86_64-cros-linux-gnu')
 
 TEST_MODULES_PARALLEL = [
+    'arch',
+    'assertions',
+    'bit_field',
     'crosvm',
     'data_model',
+    'devices',
+    'disk',
+    'enumn',
+    'kernel_cmdline',
     'kernel_loader',
     'kvm',
     'kvm_sys',
+    'msg_socket',
     'net_sys',
     'net_util',
+    'qcow_utils',
+    'rand_ish',
+    'resources',
+    'sync',
     'syscall_defines',
+    'tpm2',
     'vhost',
     'virtio_sys',
+    'vm_control',
     'x86_64',
 ]
 
diff --git a/cros_async/Cargo.toml b/cros_async/Cargo.toml
new file mode 100644
index 0000000..4e62732
--- /dev/null
+++ b/cros_async/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "cros_async"
+version = "0.1.0"
+authors = ["The Chromium OS Authors"]
+edition = "2018"
+
+[dependencies]
+libc = "*"
+paste = "*"
+pin-utils = "0.1.0-alpha.4"
+sys_util = { path = "../sys_util" }
+syscall_defines = { path = "../syscall_defines" }
+
+[dependencies.futures]
+version = "*"
+default-features = false
diff --git a/cros_async/src/complete.rs b/cros_async/src/complete.rs
new file mode 100644
index 0000000..8455066
--- /dev/null
+++ b/cros_async/src/complete.rs
@@ -0,0 +1,101 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Need non-snake case so the macro can re-use type names for variables.
+#![allow(non_snake_case)]
+
+use std::future::Future;
+use std::pin::Pin;
+use std::task::Context;
+
+use futures::future::{maybe_done, FutureExt, MaybeDone};
+
+use crate::executor::{FutureList, FutureState, UnitFutures};
+
+// Macro-generate future combinators to allow for running different numbers of top-level futures in
+// this FutureList. Generates the implementation of `FutureList` for the completion types. For an
+// explicit example of what this is modeled after, see `UnitFutures`.
+macro_rules! generate {
+    ($(
+        $(#[$doc:meta])*
+        ($Complete:ident, <$($Fut:ident),*>),
+    )*) => ($(
+        $(#[$doc])*
+        #[must_use = "Combinations of futures don't do anything unless run in an executor."]
+        paste::item! {
+            pub(crate) struct $Complete<$($Fut: Future + Unpin),*> {
+                added_futures: UnitFutures,
+                $($Fut: MaybeDone<$Fut>,)*
+                $([<$Fut _state>]: FutureState,)*
+            }
+        }
+
+        impl<$($Fut: Future + Unpin),*> $Complete<$($Fut),*> {
+            paste::item! {
+                pub(crate) fn new($($Fut: $Fut),*) -> $Complete<$($Fut),*> {
+                    $Complete {
+                        added_futures: UnitFutures::new(),
+                        $($Fut: maybe_done($Fut),)*
+                        $([<$Fut _state>]: FutureState::new(),)*
+                    }
+                }
+            }
+        }
+
+        impl<$($Fut: Future + Unpin),*> FutureList for $Complete<$($Fut),*> {
+            type Output = ($($Fut::Output),*);
+
+            fn futures_mut(&mut self) -> &mut UnitFutures {
+                &mut self.added_futures
+            }
+
+            paste::item! {
+                fn poll_results(&mut self) -> Option<Self::Output> {
+                    let _ = self.added_futures.poll_results();
+
+                    let mut complete = true;
+                    $(
+                        if self.[<$Fut _state>].needs_poll.replace(false) {
+                            let mut ctx = Context::from_waker(&self.[<$Fut _state>].waker);
+                            // The future impls `Unpin`, use `poll_unpin` to avoid wrapping it in
+                            // `Pin` to call `poll`.
+                            complete &= self.$Fut.poll_unpin(&mut ctx).is_ready();
+                        }
+                    )*
+
+                    if complete {
+                        $(
+                            let $Fut = Pin::new(&mut self.$Fut);
+                        )*
+                        Some(($($Fut.take_output().unwrap()), *))
+                    } else {
+                        None
+                    }
+                }
+
+                fn any_ready(&self) -> bool {
+                    let mut ready = self.added_futures.any_ready();
+                    $(
+                        ready |= self.[<$Fut _state>].needs_poll.get();
+                    )*
+                    ready
+                }
+            }
+        }
+    )*)
+}
+
+generate! {
+    /// _Future for the [`complete2`] function.
+    (Complete2, <_Fut1, _Fut2>),
+
+    /// _Future for the [`complete3`] function.
+    (Complete3, <_Fut1, _Fut2, _Fut3>),
+
+    /// _Future for the [`complete4`] function.
+    (Complete4, <_Fut1, _Fut2, _Fut3, _Fut4>),
+
+    /// _Future for the [`complete5`] function.
+    (Complete5, <_Fut1, _Fut2, _Fut3, _Fut4, _Fut5>),
+}
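Not part of the patch: the generated `Complete*` combinators above lean on `futures::future::MaybeDone` to hold each output until every future has finished. A minimal sketch of that building block, assuming the `futures` crate with its default features (the `cros_async` Cargo.toml added here disables them):

```rust
// Sketch only: maybe_done() stores the output once the wrapped future is
// ready, and take_output() then yields it exactly once.
use futures::executor::block_on;
use futures::future::maybe_done;
use futures::pin_mut;

fn main() {
    let fut = maybe_done(async { 5u32 });
    pin_mut!(fut);
    // Driving the MaybeDone to completion records the output internally.
    block_on(fut.as_mut());
    assert_eq!(fut.as_mut().take_output(), Some(5));
    assert_eq!(fut.as_mut().take_output(), None);
}
```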
diff --git a/cros_async/src/executor.rs b/cros_async/src/executor.rs
new file mode 100644
index 0000000..ac788f1
--- /dev/null
+++ b/cros_async/src/executor.rs
@@ -0,0 +1,179 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::cell::Cell;
+use std::collections::VecDeque;
+use std::future::Future;
+use std::pin::Pin;
+use std::rc::Rc;
+use std::task::Waker;
+use std::task::{Context, Poll};
+
+use crate::waker::create_waker;
+
+/// Represents a future executor that can be run. Implementers of the trait will take a list of
+/// futures and poll them until completed.
+pub trait Executor {
+    /// The type returned by the executor. This is normally `()` or a combination of the output the
+    /// futures produce.
+    type Output;
+
+    /// Run the executor; it returns once the exit criterion is met. The exit criterion is
+    /// specified when the executor is created, for example running until all futures are complete.
+    fn run(&mut self) -> Self::Output;
+}
+
+// Tracks if a future needs to be polled and the waker to use.
+pub(crate) struct FutureState {
+    pub needs_poll: Rc<Cell<bool>>,
+    pub waker: Waker,
+}
+
+impl FutureState {
+    pub fn new() -> FutureState {
+        let needs_poll = Rc::new(Cell::new(true));
+        // Safe because a valid pointer is passed to `create_waker` and the valid result is
+        // passed to `Waker::from_raw`. The reference count of `needs_poll` is incremented by
+        // cloning it, so it can't be dropped before the waker.
+        let waker = unsafe {
+            let clone = needs_poll.clone();
+            let raw_waker = create_waker(Rc::into_raw(clone) as *const _);
+            Waker::from_raw(raw_waker)
+        };
+        FutureState { needs_poll, waker }
+    }
+}
+
+// Couples a future owned by the executor with a flag that indicates the future is ready to be
+// polled. Futures will start with the flag set. After blocking by returning `Poll::Pending`, the
+// flag will be false until the waker is triggered and sets the flag to true, signalling the
+// executor to poll the future again.
+pub(crate) struct ExecutableFuture<T> {
+    future: Pin<Box<dyn Future<Output = T>>>,
+    state: FutureState,
+}
+
+impl<T> ExecutableFuture<T> {
+    // Creates an `ExecutableFuture` from the future. The returned struct is used to track when the
+    // future should be polled again.
+    pub fn new(future: Pin<Box<dyn Future<Output = T>>>) -> ExecutableFuture<T> {
+        ExecutableFuture {
+            future,
+            state: FutureState::new(),
+        }
+    }
+
+    // Polls the future if needed and returns the result.
+    // Covers setting up the waker and context before calling the future.
+    fn poll(&mut self) -> Poll<T> {
+        let mut ctx = Context::from_waker(&self.state.waker);
+        let f = self.future.as_mut();
+        f.poll(&mut ctx)
+    }
+}
+
+// Private trait used to allow one executor to behave differently. Using FutureList allows the
+// executor code to be common across different collections of futures and different termination
+// behavior. For example, one list can decide to exit after the first future completes, others can
+// wait until all are complete.
+pub(crate) trait FutureList {
+    type Output;
+
+    // Return a mutable reference to the list of futures that can be added or removed from this
+    // List.
+    fn futures_mut(&mut self) -> &mut UnitFutures;
+    // Polls all futures that are ready. Returns the results if this list has completed.
+    fn poll_results(&mut self) -> Option<Self::Output>;
+    // Returns true if any future in the list is ready to be polled.
+    fn any_ready(&self) -> bool;
+}
+
+// `UnitFutures` is the simplest implementor of `FutureList`. It runs all futures added to it until
+// there are none left to poll. The futures must all return `()`.
+pub(crate) struct UnitFutures {
+    futures: VecDeque<ExecutableFuture<()>>,
+}
+
+impl UnitFutures {
+    // Creates a new, empty list of futures.
+    pub fn new() -> UnitFutures {
+        UnitFutures {
+            futures: VecDeque::new(),
+        }
+    }
+
+    // Adds a future to the list of futures to be polled.
+    pub fn append(&mut self, futures: &mut VecDeque<ExecutableFuture<()>>) {
+        self.futures.append(futures);
+    }
+
+    // Polls all futures that are ready to be polled. Removes any futures that indicate they are
+    // completed.
+    pub fn poll_all(&mut self) {
+        let mut i = 0;
+        while i < self.futures.len() {
+            let fut = &mut self.futures[i];
+            let remove = if fut.state.needs_poll.replace(false) {
+                fut.poll().is_ready()
+            } else {
+                false
+            };
+            if remove {
+                self.futures.remove(i);
+            } else {
+                i += 1;
+            }
+        }
+    }
+}
+
+impl FutureList for UnitFutures {
+    type Output = ();
+
+    fn futures_mut(&mut self) -> &mut UnitFutures {
+        self
+    }
+
+    fn poll_results(&mut self) -> Option<Self::Output> {
+        self.poll_all();
+        if self.futures.is_empty() {
+            Some(())
+        } else {
+            None
+        }
+    }
+
+    fn any_ready(&self) -> bool {
+        self.futures.iter().any(|fut| fut.state.needs_poll.get())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    #[test]
+    fn basic_run() {
+        async fn f(called: Rc<AtomicUsize>) {
+            called.fetch_add(1, Ordering::Relaxed);
+        }
+
+        let f1_called = Rc::new(AtomicUsize::new(0));
+        let f2_called = Rc::new(AtomicUsize::new(0));
+
+        let fut1 = Box::pin(f(f1_called.clone()));
+        let fut2 = Box::pin(f(f2_called.clone()));
+
+        let mut futures = VecDeque::new();
+        futures.push_back(ExecutableFuture::new(fut1));
+        futures.push_back(ExecutableFuture::new(fut2));
+
+        let mut uf = UnitFutures::new();
+        uf.append(&mut futures);
+        assert!(uf.poll_results().is_some());
+        assert_eq!(f1_called.load(Ordering::Relaxed), 1);
+        assert_eq!(f2_called.load(Ordering::Relaxed), 1);
+    }
+}
diff --git a/cros_async/src/fd_executor.rs b/cros_async/src/fd_executor.rs
new file mode 100644
index 0000000..58d6013
--- /dev/null
+++ b/cros_async/src/fd_executor.rs
@@ -0,0 +1,250 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+//! The executor runs all given futures to completion. Futures register wakers associated with file
+//! descriptors. The wakers will be called when the FD becomes readable or writable depending on
+//! the situation.
+//!
+//! `FdExecutor` is meant to be used with the `futures-rs` crate that provides combinators and
+//! utility functions to combine futures.
+//!
+//! # Example of starting the framework and running a future:
+//!
+//! ```
+//! # use std::rc::Rc;
+//! # use std::cell::RefCell;
+//! use cros_async::Executor;
+//! async fn my_async(mut x: Rc<RefCell<u64>>) {
+//!     x.replace(4);
+//! }
+//!
+//! let mut ex = cros_async::empty_executor().expect("Failed creating executor");
+//! let x = Rc::new(RefCell::new(0));
+//! cros_async::fd_executor::add_future(Box::pin(my_async(x.clone())));
+//! ex.run();
+//! assert_eq!(*x.borrow(), 4);
+//! ```
+
+use std::cell::RefCell;
+use std::collections::{BTreeMap, VecDeque};
+use std::fmt::{self, Display};
+use std::fs::File;
+use std::future::Future;
+use std::os::unix::io::FromRawFd;
+use std::os::unix::io::RawFd;
+use std::pin::Pin;
+use std::task::Waker;
+
+use sys_util::{PollContext, WatchingEvents};
+
+use crate::executor::{ExecutableFuture, Executor, FutureList};
+
+#[derive(Debug, PartialEq)]
+pub enum Error {
+    /// Attempts to create two Executors on the same thread fail.
+    AttemptedDuplicateExecutor,
+    /// Failed to copy the FD for the polling context.
+    DuplicatingFd(sys_util::Error),
+    /// Failed accessing the thread local storage for wakers.
+    InvalidContext,
+    /// Creating a context to wait on FDs failed.
+    CreatingContext(sys_util::Error),
+    /// PollContext failure.
+    PollContextError(sys_util::Error),
+    /// Failed to submit the waker to the polling context.
+    SubmittingWaker(sys_util::Error),
+}
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+
+        match self {
+            AttemptedDuplicateExecutor => write!(f, "Cannot have two executors on one thread."),
+            DuplicatingFd(e) => write!(f, "Failed to copy the FD for the polling context: {}", e),
+            InvalidContext => write!(
+                f,
+                "Invalid context, was the Fd executor created successfully?"
+            ),
+            CreatingContext(e) => write!(f, "An error creating the fd waiting context: {}.", e),
+            PollContextError(e) => write!(f, "PollContext failure: {}", e),
+            SubmittingWaker(e) => write!(f, "An error adding to the Aio context: {}.", e),
+        }
+    }
+}
+
+// Temporary vectors of new additions to the executor.
+
+// Tracks active wakers and the futures they are associated with.
+thread_local!(static STATE: RefCell<Option<FdWakerState>> = RefCell::new(None));
+
+fn add_waker(fd: RawFd, waker: Waker, events: WatchingEvents) -> Result<()> {
+    STATE.with(|state| {
+        let mut state = state.borrow_mut();
+        if let Some(state) = state.as_mut() {
+            state.add_waker(fd, waker, events)
+        } else {
+            Err(Error::InvalidContext)
+        }
+    })
+}
+
+/// Tells the waking system to wake `waker` when `fd` becomes readable.
+/// The 'fd' must be fully owned by the future adding the waker, and must not be closed until the
+/// next time the future is polled. If the fd is closed, there is a race where another FD can be
+/// opened on top of it causing the next poll to access the new target file.
+pub fn add_read_waker(fd: RawFd, waker: Waker) -> Result<()> {
+    add_waker(fd, waker, WatchingEvents::empty().set_read())
+}
+
+/// Tells the waking system to wake `waker` when `fd` becomes writable.
+/// The 'fd' must be fully owned by the future adding the waker, and must not be closed until the
+/// next time the future is polled. If the fd is closed, there is a race where another FD can be
+/// opened on top of it causing the next poll to access the new target file.
+pub fn add_write_waker(fd: RawFd, waker: Waker) -> Result<()> {
+    add_waker(fd, waker, WatchingEvents::empty().set_write())
+}
+
+/// Adds a new top level future to the Executor.
+/// These futures must return `()`, indicating they are intended to create side-effects only.
+pub fn add_future(future: Pin<Box<dyn Future<Output = ()>>>) -> Result<()> {
+    STATE.with(|state| {
+        let mut state = state.borrow_mut();
+        if let Some(state) = state.as_mut() {
+            state.new_futures.push_back(ExecutableFuture::new(future));
+            Ok(())
+        } else {
+            Err(Error::InvalidContext)
+        }
+    })
+}
+
+// Tracks active wakers and associates wakers with the futures that registered them.
+struct FdWakerState {
+    poll_ctx: PollContext<u64>,
+    token_map: BTreeMap<u64, (File, Waker)>,
+    next_token: u64, // Next token for adding to the context.
+    new_futures: VecDeque<ExecutableFuture<()>>,
+}
+
+impl FdWakerState {
+    fn new() -> Result<Self> {
+        Ok(FdWakerState {
+            poll_ctx: PollContext::new().map_err(Error::CreatingContext)?,
+            token_map: BTreeMap::new(),
+            next_token: 0,
+            new_futures: VecDeque::new(),
+        })
+    }
+
+    // Adds an fd that, when signaled, will trigger the given waker.
+    fn add_waker(&mut self, fd: RawFd, waker: Waker, events: WatchingEvents) -> Result<()> {
+        let duped_fd = unsafe {
+            // Safe because duplicating an FD doesn't affect memory safety, and the dup'd FD
+            // will only be added to the poll loop.
+            File::from_raw_fd(dup_fd(fd)?)
+        };
+        self.poll_ctx
+            .add_fd_with_events(&duped_fd, events, self.next_token)
+            .map_err(Error::SubmittingWaker)?;
+        let next_token = self.next_token;
+        self.token_map.insert(next_token, (duped_fd, waker));
+        self.next_token += 1;
+        Ok(())
+    }
+
+    // Waits until one of the FDs is readable and wakes the associated waker.
+    fn wait_wake_event(&mut self) -> Result<()> {
+        let events = self.poll_ctx.wait().map_err(Error::PollContextError)?;
+        for e in events.iter() {
+            if let Some((fd, waker)) = self.token_map.remove(&e.token()) {
+                self.poll_ctx.delete(&fd).map_err(Error::PollContextError)?;
+                waker.wake_by_ref();
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Runs futures to completion on a single thread. Futures are allowed to block on file descriptors
+/// only. Futures can only block on FDs becoming readable or writable. `FdExecutor` is meant to be
+/// used where a poll or select loop would be used otherwise.
+pub(crate) struct FdExecutor<T: FutureList> {
+    futures: T,
+}
+
+impl<T: FutureList> Executor for FdExecutor<T> {
+    type Output = Result<T::Output>;
+
+    fn run(&mut self) -> Self::Output {
+        self.append_futures();
+
+        loop {
+            if let Some(output) = self.futures.poll_results() {
+                return Ok(output);
+            }
+
+            self.append_futures();
+
+            // If no futures are ready, sleep until a waker is signaled.
+            if !self.futures.any_ready() {
+                STATE.with(|state| {
+                    let mut state = state.borrow_mut();
+                    if let Some(state) = state.as_mut() {
+                        state.wait_wake_event()?;
+                    } else {
+                        unreachable!("Can't get here without a context being created");
+                    }
+                    Ok(())
+                })?;
+            }
+        }
+    }
+}
+
+impl<T: FutureList> FdExecutor<T> {
+    /// Create a new executor.
+    pub fn new(futures: T) -> Result<FdExecutor<T>> {
+        STATE.with(|state| {
+            if state.borrow().is_some() {
+                return Err(Error::AttemptedDuplicateExecutor);
+            }
+            state.replace(Some(FdWakerState::new()?));
+            Ok(())
+        })?;
+        Ok(FdExecutor { futures })
+    }
+
+    // Add any new futures and wakers to the lists.
+    fn append_futures(&mut self) {
+        STATE.with(|state| {
+            let mut state = state.borrow_mut();
+            if let Some(state) = state.as_mut() {
+                self.futures.futures_mut().append(&mut state.new_futures);
+            } else {
+                unreachable!("Can't get here without a context being created");
+            }
+        });
+    }
+}
+
+impl<T: FutureList> Drop for FdExecutor<T> {
+    fn drop(&mut self) {
+        STATE.with(|state| {
+            state.replace(None);
+        });
+    }
+}
+
+// Used to dup the FDs passed to the executor so there is a guarantee they aren't closed while
+// waiting in TLS to be added to the main polling context.
+unsafe fn dup_fd(fd: RawFd) -> Result<RawFd> {
+    let ret = libc::dup(fd);
+    if ret < 0 {
+        Err(Error::DuplicatingFd(sys_util::Error::last()))
+    } else {
+        Ok(ret)
+    }
+}
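Not part of the patch: a minimal sketch of the leaf-future pattern the module docs above describe, assuming the `cros_async` crate added here plus `libc`. The future registers the task's waker with `add_read_waker()` and yields; the `FdExecutor` re-polls it only after the fd reports readable.

```rust
// Sketch only: wait until an fd is readable, then complete.
use std::future::Future;
use std::os::unix::io::RawFd;
use std::pin::Pin;
use std::task::{Context, Poll};

use cros_async::fd_executor::{add_future, add_read_waker};
use cros_async::{empty_executor, Executor};

struct WaitReadable {
    fd: RawFd,
    registered: bool,
}

impl Future for WaitReadable {
    type Output = ();

    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
        if self.registered {
            // Only re-polled after the executor saw the fd become readable.
            Poll::Ready(())
        } else {
            // Register interest and yield until woken.
            add_read_waker(self.fd, cx.waker().clone()).expect("adding read waker failed");
            self.registered = true;
            Poll::Pending
        }
    }
}

fn main() {
    // A pipe whose read end is made readable immediately.
    let mut fds = [0 as RawFd; 2];
    unsafe {
        libc::pipe(fds.as_mut_ptr());
        libc::write(fds[1], b"x".as_ptr() as *const _, 1);
    }

    let mut ex = empty_executor().expect("failed to create executor");
    add_future(Box::pin(WaitReadable { fd: fds[0], registered: false }))
        .expect("failed to add future");
    let _ = ex.run();
}
```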
diff --git a/cros_async/src/lib.rs b/cros_async/src/lib.rs
new file mode 100644
index 0000000..f671a0c
--- /dev/null
+++ b/cros_async/src/lib.rs
@@ -0,0 +1,354 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+//! An Executor and future combinators based on operations that block on file descriptors.
+//!
+//! This crate is meant to be used with the `futures-rs` crate that provides further combinators
+//! and utility functions to combine and manage futures. All futures will run until they block on a
+//! file descriptor becoming readable or writable. Facilities are provided to register future
+//! wakers based on such events.
+//!
+//! # Running top-level futures.
+//!
+//! Use helper functions based on the desired behavior of your application.
+//!
+//! ## Completing one of several futures.
+//!
+//! If there are several top level tasks that should run until any one completes, use the "select"
+//! family of executor constructors. These return an [`Executor`](trait.Executor.html) whose `run`
+//! function will return when the first future completes. The uncompleted futures will also be
+//! returned so they can be run further or otherwise cleaned up. These functions are inspired by
+//! the `select_all` function from futures-rs, but built to be run inside an FD based executor and
+//! to poll only when necessary. See the docs for [`select2`](fn.select2.html),
+//! [`select3`](fn.select3.html), [`select4`](fn.select4.html), and [`select5`](fn.select5.html).
+//!
+//! ## Completing all of several futures.
+//!
+//! If there are several top level tasks that all need to be completed, use the "complete" family
+//! of executor constructors. These return an [`Executor`](trait.Executor.html) whose `run`
+//! function will return only once all the futures passed to it have completed. These functions are
+//! inspired by the `join_all` function from futures-rs, but built to be run inside an FD based
+//! executor and to poll only when necessary. See the docs for [`complete2`](fn.complete2.html),
+//! [`complete3`](fn.complete3.html), [`complete4`](fn.complete4.html), and
+//! [`complete5`](fn.complete5.html).
+//!
+//! ## Many futures all returning `()`
+//!
+//! If there are futures that produce side effects and return `()`, the
+//! [`empty_executor`](fn.empty_executor.html) function provides an Executor that runs futures
+//! returning `()`. Futures are added using the [`add_future`](fn.add_future.html) function.
+//!
+//! # Implementing new FD-based futures.
+//!
+//! When building futures to be run in an `FdExecutor` framework, use the following helper
+//! functions to perform common tasks:
+//!
+//! [`add_read_waker`](fn.add_read_waker.html) - Used to associate a provided FD becoming readable
+//! with the future being woken. Used before returning Poll::Pending from a future that waits until
+//! an FD is readable.
+//!
+//! [`add_write_waker`](fn.add_write_waker.html) - Used to associate a provided FD becoming
+//! writable with the future being woken. Used before returning Poll::Pending from a future that
+//! waits until an FD is writable.
+//!
+//! [`add_future`](fn.add_future.html) - Used to add a new future to the top-level list of running
+//! futures.
+
+mod complete;
+mod executor;
+pub mod fd_executor;
+mod select;
+mod waker;
+
+pub use executor::Executor;
+pub use select::SelectResult;
+
+use executor::UnitFutures;
+use fd_executor::{FdExecutor, Result};
+use std::future::Future;
+
+/// Creates an empty FdExecutor that can have futures returning `()` added via
+/// [`add_future`](fn.add_future.html).
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor};
+///    use cros_async::fd_executor::add_future;
+///    use futures::future::pending;
+///
+///    let fut = async { () };
+///    let mut ex = empty_executor().expect("Failed to create executor");
+///
+///    add_future(Box::pin(fut));
+///    ex.run();
+///    ```
+pub fn empty_executor() -> Result<impl Executor> {
+    FdExecutor::new(UnitFutures::new())
+}
+
+// Select helpers to run until any future completes.
+
+/// Creates an executor that runs the two given futures until one completes, returning a tuple
+/// containing the result of the finished future and the still pending future.
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, select2, SelectResult};
+///    use cros_async::fd_executor::add_future;
+///    use futures::future::pending;
+///    use futures::pin_mut;
+///
+///    let first = async {5};
+///    let second = async {let () = pending().await;};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    match select2(first, second) {
+///        Ok((SelectResult::Finished(5), SelectResult::Pending(_second))) => (),
+///        _ => panic!("Select didn't return the first future"),
+///    };
+///    ```
+pub fn select2<F1: Future + Unpin, F2: Future + Unpin>(
+    f1: F1,
+    f2: F2,
+) -> Result<(SelectResult<F1>, SelectResult<F2>)> {
+    FdExecutor::new(select::Select2::new(f1, f2)).and_then(|mut f| f.run())
+}
+
+/// Creates an executor that runs the three given futures until one or more completes, returning a
+/// tuple containing the result of the finished future(s) and the still pending future(s).
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, select3, SelectResult};
+///    use cros_async::fd_executor::add_future;
+///    use futures::future::pending;
+///    use futures::pin_mut;
+///
+///    let first = async {4};
+///    let second = async {let () = pending().await;};
+///    let third = async {5};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    pin_mut!(third);
+///    match select3(first, second, third) {
+///        Ok((SelectResult::Finished(4),
+///            SelectResult::Pending(_second),
+///            SelectResult::Finished(5))) => (),
+///        _ => panic!("Select didn't return the futures"),
+///    };
+///    ```
+pub fn select3<F1: Future + Unpin, F2: Future + Unpin, F3: Future + Unpin>(
+    f1: F1,
+    f2: F2,
+    f3: F3,
+) -> Result<(SelectResult<F1>, SelectResult<F2>, SelectResult<F3>)> {
+    FdExecutor::new(select::Select3::new(f1, f2, f3)).and_then(|mut f| f.run())
+}
+
+/// Creates an executor that runs the four given futures until one or more completes, returning a
+/// tuple containing the result of the finished future(s) and the still pending future(s).
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, select4, SelectResult};
+///    use cros_async::fd_executor::add_future;
+///    use futures::future::pending;
+///    use futures::pin_mut;
+///
+///    let first = async {4};
+///    let second = async {let () = pending().await;};
+///    let third = async {5};
+///    let fourth = async {let () = pending().await;};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    pin_mut!(third);
+///    pin_mut!(fourth);
+///    match select4(first, second, third, fourth) {
+///        Ok((SelectResult::Finished(4), SelectResult::Pending(_second),
+///            SelectResult::Finished(5), SelectResult::Pending(_fourth))) => (),
+///        _ => panic!("Select didn't return the futures"),
+///    };
+///    ```
+pub fn select4<F1: Future + Unpin, F2: Future + Unpin, F3: Future + Unpin, F4: Future + Unpin>(
+    f1: F1,
+    f2: F2,
+    f3: F3,
+    f4: F4,
+) -> Result<(
+    SelectResult<F1>,
+    SelectResult<F2>,
+    SelectResult<F3>,
+    SelectResult<F4>,
+)> {
+    FdExecutor::new(select::Select4::new(f1, f2, f3, f4)).and_then(|mut f| f.run())
+}
+
+/// Creates an executor that runs the five given futures until one or more completes, returning a
+/// tuple containing the result of the finished future(s) and the still pending future(s).
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, select5, SelectResult};
+///    use cros_async::fd_executor::add_future;
+///    use futures::future::pending;
+///    use futures::pin_mut;
+///
+///    let first = async {4};
+///    let second = async {let () = pending().await;};
+///    let third = async {5};
+///    let fourth = async {let () = pending().await;};
+///    let fifth = async {6};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    pin_mut!(third);
+///    pin_mut!(fourth);
+///    pin_mut!(fifth);
+///    match select5(first, second, third, fourth, fifth) {
+///        Ok((SelectResult::Finished(4), SelectResult::Pending(_second),
+///            SelectResult::Finished(5), SelectResult::Pending(_fourth),
+///            SelectResult::Finished(6))) => (),
+///        _ => panic!("Select didn't return the futures"),
+///    };
+///    ```
+pub fn select5<
+    F1: Future + Unpin,
+    F2: Future + Unpin,
+    F3: Future + Unpin,
+    F4: Future + Unpin,
+    F5: Future + Unpin,
+>(
+    f1: F1,
+    f2: F2,
+    f3: F3,
+    f4: F4,
+    f5: F5,
+) -> Result<(
+    SelectResult<F1>,
+    SelectResult<F2>,
+    SelectResult<F3>,
+    SelectResult<F4>,
+    SelectResult<F5>,
+)> {
+    FdExecutor::new(select::Select5::new(f1, f2, f3, f4, f5)).and_then(|mut f| f.run())
+}
+
+// Combination helpers to run until all futures are complete.
+
+/// Creates an executor that runs the two given futures to completion, returning a tuple of the
+/// outputs each yields.
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, complete2};
+///    use futures::pin_mut;
+///
+///    let first = async {5};
+///    let second = async {6};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    assert_eq!(complete2(first, second).unwrap_or((0,0)), (5,6));
+///    ```
+pub fn complete2<F1: Future + Unpin, F2: Future + Unpin>(
+    f1: F1,
+    f2: F2,
+) -> Result<(F1::Output, F2::Output)> {
+    FdExecutor::new(complete::Complete2::new(f1, f2)).and_then(|mut f| f.run())
+}
+
+/// Creates an executor that runs the three given futures to completion, returning a tuple of the
+/// outputs each yields.
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, complete3};
+///    use futures::pin_mut;
+///
+///    let first = async {5};
+///    let second = async {6};
+///    let third = async {7};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    pin_mut!(third);
+///    assert_eq!(complete3(first, second, third).unwrap_or((0,0,0)), (5,6,7));
+///    ```
+pub fn complete3<F1: Future + Unpin, F2: Future + Unpin, F3: Future + Unpin>(
+    f1: F1,
+    f2: F2,
+    f3: F3,
+) -> Result<(F1::Output, F2::Output, F3::Output)> {
+    FdExecutor::new(complete::Complete3::new(f1, f2, f3)).and_then(|mut f| f.run())
+}
+
+/// Creates an executor that runs the four given futures to completion, returning a tuple of the
+/// outputs each yields.
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, complete4};
+///    use futures::pin_mut;
+///
+///    let first = async {5};
+///    let second = async {6};
+///    let third = async {7};
+///    let fourth = async {8};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    pin_mut!(third);
+///    pin_mut!(fourth);
+///    assert_eq!(complete4(first, second, third, fourth).unwrap_or((0,0,0,0)), (5,6,7,8));
+///    ```
+pub fn complete4<F1: Future + Unpin, F2: Future + Unpin, F3: Future + Unpin, F4: Future + Unpin>(
+    f1: F1,
+    f2: F2,
+    f3: F3,
+    f4: F4,
+) -> Result<(F1::Output, F2::Output, F3::Output, F4::Output)> {
+    FdExecutor::new(complete::Complete4::new(f1, f2, f3, f4)).and_then(|mut f| f.run())
+}
+
+/// Creates an executor that runs the five given futures to completion, returning a tuple of the
+/// outputs each yields.
+///
+///  # Example
+///
+///    ```
+///    use cros_async::{empty_executor, Executor, complete5};
+///    use futures::pin_mut;
+///
+///    let first = async {5};
+///    let second = async {6};
+///    let third = async {7};
+///    let fourth = async {8};
+///    let fifth = async {9};
+///    pin_mut!(first);
+///    pin_mut!(second);
+///    pin_mut!(third);
+///    pin_mut!(fourth);
+///    pin_mut!(fifth);
+///    assert_eq!(complete5(first, second, third, fourth, fifth).unwrap_or((0,0,0,0,0)),
+///               (5,6,7,8,9));
+///    ```
+pub fn complete5<
+    F1: Future + Unpin,
+    F2: Future + Unpin,
+    F3: Future + Unpin,
+    F4: Future + Unpin,
+    F5: Future + Unpin,
+>(
+    f1: F1,
+    f2: F2,
+    f3: F3,
+    f4: F4,
+    f5: F5,
+) -> Result<(F1::Output, F2::Output, F3::Output, F4::Output, F5::Output)> {
+    FdExecutor::new(complete::Complete5::new(f1, f2, f3, f4, f5)).and_then(|mut f| f.run())
+}
diff --git a/cros_async/src/select.rs b/cros_async/src/select.rs
new file mode 100644
index 0000000..8eef317
--- /dev/null
+++ b/cros_async/src/select.rs
@@ -0,0 +1,110 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Need non-snake case so the macro can re-use type names for variables.
+#![allow(non_snake_case)]
+
+use std::future::Future;
+use std::pin::Pin;
+use std::task::Context;
+
+use futures::future::{maybe_done, FutureExt, MaybeDone};
+
+use crate::executor::{FutureList, FutureState, UnitFutures};
+
+pub enum SelectResult<F: Future> {
+    Pending(F),
+    Finished(F::Output),
+}
+
+// Macro-generate future combinators to allow for running different numbers of top-level futures in
+// this FutureList. Generates the implementation of `FutureList` for the select types. For an
+// explicit example of what this is modeled after, see `UnitFutures`.
+macro_rules! generate {
+    ($(
+        $(#[$doc:meta])*
+        ($Select:ident, <$($Fut:ident),*>),
+    )*) => ($(
+        $(#[$doc])*
+        #[must_use = "Combinations of futures don't do anything unless run in an executor."]
+        paste::item! {
+            pub(crate) struct $Select<$($Fut: Future + Unpin),*> {
+                added_futures: UnitFutures,
+                $($Fut: MaybeDone<$Fut>,)*
+                $([<$Fut _state>]: FutureState,)*
+            }
+        }
+
+        impl<$($Fut: Future + Unpin),*> $Select<$($Fut),*> {
+            paste::item! {
+                pub(crate) fn new($($Fut: $Fut),*) -> $Select<$($Fut),*> {
+                    $Select {
+                        added_futures: UnitFutures::new(),
+                        $($Fut: maybe_done($Fut),)*
+                        $([<$Fut _state>]: FutureState::new(),)*
+                    }
+                }
+            }
+        }
+
+        impl<$($Fut: Future + Unpin),*> FutureList for $Select<$($Fut),*> {
+            type Output = ($(SelectResult<$Fut>),*);
+
+            fn futures_mut(&mut self) -> &mut UnitFutures {
+                &mut self.added_futures
+            }
+
+            paste::item! {
+                fn poll_results(&mut self) -> Option<Self::Output> {
+                    let _ = self.added_futures.poll_results();
+
+                    let mut complete = false;
+                    $(
+                        let $Fut = Pin::new(&mut self.$Fut);
+                        if self.[<$Fut _state>].needs_poll.replace(false) {
+                            let mut ctx = Context::from_waker(&self.[<$Fut _state>].waker);
+                            // The future impls `Unpin`, use `poll_unpin` to avoid wrapping it in
+                            // `Pin` to call `poll`.
+                            complete |= self.$Fut.poll_unpin(&mut ctx).is_ready();
+                        }
+                    )*
+
+                    if complete {
+                        Some(($(
+                                    match std::mem::replace(&mut self.$Fut, MaybeDone::Gone) {
+                                        MaybeDone::Future(f) => SelectResult::Pending(f),
+                                        MaybeDone::Done(o) => SelectResult::Finished(o),
+                                        MaybeDone::Gone=>unreachable!(),
+                                    }
+                               ), *))
+                    } else {
+                        None
+                    }
+                }
+
+                fn any_ready(&self) -> bool {
+                    let mut ready = self.added_futures.any_ready();
+                    $(
+                        ready |= self.[<$Fut _state>].needs_poll.get();
+                    )*
+                    ready
+                }
+            }
+        }
+    )*)
+}
+
+generate! {
+    /// _Future for the [`select2`] function.
+    (Select2, <_Fut1, _Fut2>),
+
+    /// _Future for the [`select3`] function.
+    (Select3, <_Fut1, _Fut2, _Fut3>),
+
+    /// _Future for the [`select4`] function.
+    (Select4, <_Fut1, _Fut2, _Fut3, _Fut4>),
+
+    /// _Future for the [`select5`] function.
+    (Select5, <_Fut1, _Fut2, _Fut3, _Fut4, _Fut5>),
+}
diff --git a/cros_async/src/waker.rs b/cros_async/src/waker.rs
new file mode 100644
index 0000000..f0dac0f
--- /dev/null
+++ b/cros_async/src/waker.rs
@@ -0,0 +1,42 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::rc::Rc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::task::{RawWaker, RawWakerVTable};
+
+// Boiler-plate for creating a waker with function pointers.
+// This waker sets the atomic bool it is passed to true.
+// The bool will be used by the executor to know which futures to poll.
+
+// Convert the pointer back to the Rc it was created from and drop it.
+unsafe fn waker_drop(data_ptr: *const ()) {
+    // from_raw, then drop
+    let _rc_bool = Rc::<AtomicBool>::from_raw(data_ptr as *const _);
+}
+
+unsafe fn waker_wake(_: *const ()) {}
+
+// Called when the bool should be set to true to wake the waker.
+unsafe fn waker_wake_by_ref(data_ptr: *const ()) {
+    let bool_atomic_ptr = data_ptr as *const AtomicBool;
+    let bool_atomic_ref = bool_atomic_ptr.as_ref().unwrap();
+    bool_atomic_ref.store(true, Ordering::Relaxed);
+}
+
+// The data_ptr will be a pointer to an Rc<AtomicBool>.
+unsafe fn waker_clone(data_ptr: *const ()) -> RawWaker {
+    let rc_bool = Rc::<AtomicBool>::from_raw(data_ptr as *const _);
+    let new_ptr = rc_bool.clone();
+    Rc::into_raw(rc_bool); // Don't decrement the ref count of the original, so back to raw.
+    create_waker(Rc::into_raw(new_ptr) as *const _)
+}
+
+static WAKER_VTABLE: RawWakerVTable =
+    RawWakerVTable::new(waker_clone, waker_wake, waker_wake_by_ref, waker_drop);
+
+/// To use safely, data_ptr must be from Rc<AtomicBool>::from_raw().
+pub unsafe fn create_waker(data_ptr: *const ()) -> RawWaker {
+    RawWaker::new(data_ptr, &WAKER_VTABLE)
+}
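Not part of the patch: a standalone sketch of the same `RawWaker` pattern used above, written with the flag as an `Rc<Cell<bool>>` to match what `FutureState` passes in. The data pointer is a leaked `Rc`, `wake_by_ref` flips the shared flag, and `clone`/`drop` keep the reference count balanced.

```rust
// Sketch only: a waker whose data pointer is a leaked Rc<Cell<bool>>;
// waking sets the flag the executor checks before re-polling.
use std::cell::Cell;
use std::rc::Rc;
use std::task::{RawWaker, RawWakerVTable, Waker};

unsafe fn clone_waker(data: *const ()) -> RawWaker {
    let rc = Rc::<Cell<bool>>::from_raw(data as *const _);
    let cloned = rc.clone();
    let _ = Rc::into_raw(rc); // keep the original reference alive
    RawWaker::new(Rc::into_raw(cloned) as *const (), &VTABLE)
}

unsafe fn wake(data: *const ()) {
    wake_by_ref(data);
    drop_waker(data); // wake() consumes the waker's reference
}

unsafe fn wake_by_ref(data: *const ()) {
    (*(data as *const Cell<bool>)).set(true);
}

unsafe fn drop_waker(data: *const ()) {
    drop(Rc::<Cell<bool>>::from_raw(data as *const _));
}

static VTABLE: RawWakerVTable = RawWakerVTable::new(clone_waker, wake, wake_by_ref, drop_waker);

fn main() {
    let needs_poll = Rc::new(Cell::new(false));
    // Safe because the pointer comes from Rc::into_raw and the clone keeps
    // needs_poll alive at least as long as the waker.
    let waker = unsafe {
        Waker::from_raw(RawWaker::new(
            Rc::into_raw(needs_poll.clone()) as *const (),
            &VTABLE,
        ))
    };
    waker.wake_by_ref();
    assert!(needs_poll.get()); // the executor would now re-poll this future
}
```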
diff --git a/crosvm_plugin/Cargo.toml b/crosvm_plugin/Cargo.toml
index 6805898..b9abc02 100644
--- a/crosvm_plugin/Cargo.toml
+++ b/crosvm_plugin/Cargo.toml
@@ -4,6 +4,9 @@
 authors = ["The Chromium OS Authors"]
 edition = "2018"
 
+[features]
+stats = []
+
 [lib]
 crate-type = ["cdylib"]
 
diff --git a/crosvm_plugin/crosvm.h b/crosvm_plugin/crosvm.h
index d7a036c..63763f1 100644
--- a/crosvm_plugin/crosvm.h
+++ b/crosvm_plugin/crosvm.h
@@ -47,7 +47,7 @@
  * do not indicate anything about what version of crosvm is running.
  */
 #define CROSVM_API_MAJOR 0
-#define CROSVM_API_MINOR 17
+#define CROSVM_API_MINOR 19
 #define CROSVM_API_PATCH 0
 
 enum crosvm_address_space {
@@ -175,6 +175,23 @@
                          uint64_t __length);
 
 /*
+ * Registers a range in the given address space that, when accessed via write,
+ * will cause a notification in crosvm_vcpu_wait() but the VM will continue
+ * running.
+ * For this type of notification (where |no_resume| is set) the next call
+ * should be crosvm_vcpu_wait() (without an intervening call to
+ * crosvm_vcpu_resume()).
+ *
+ * The requested range must not overlap any prior (and currently active)
+ * reservation to crosvm_reserve_range() or crosvm_reserve_async_write_range().
+ *
+ * To unreserve a range previously reserved by this function, pass the |__space|
+ * and |__start| of the old reservation with a 0 |__length|.
+ */
+int crosvm_reserve_async_write_range(struct crosvm*, uint32_t __space,
+                                     uint64_t __start, uint64_t __length);
+
+/*
  * Sets the state of the given irq pin.
  */
 int crosvm_set_irq(struct crosvm*, uint32_t __irq_id, bool __active);
@@ -229,6 +246,79 @@
 int crosvm_set_irq_routing(struct crosvm*, uint32_t __route_count,
                            const struct crosvm_irq_route* __routes);
 
+/* Register-matching details for a hypercall hint; see crosvm_set_hypercall_hint(). */
+struct crosvm_hint_detail {
+  bool match_rax;
+  bool match_rbx;
+  bool match_rcx;
+  bool match_rdx;
+  uint8_t _reserved[4];
+  uint64_t rax;
+  uint64_t rbx;
+  uint64_t rcx;
+  uint64_t rdx;
+  bool send_sregs;
+  bool send_debugregs;
+  uint8_t _reserved2[6];
+};
+
+#ifdef static_assert
+static_assert(sizeof(struct crosvm_hint_detail) == 48,
+              "extra padding in struct crosvm_hint_detail");
+#endif
+
+/* Maximum # of hints that can be passed to crosvm_set_hypercall_hint(). */
+#define CROSVM_MAX_HINT_COUNT 1
+
+/* Maximum # of hint details that can be provided for a hint. */
+#define CROSVM_MAX_HINT_DETAIL_COUNT 32
+
+#define CROSVM_HINT_ON_WRITE 0x1
+
+/* Hint on what information is queried for a particular hypercall. */
+struct crosvm_hint {
+  uint32_t hint_version;  /* For now only 0 is defined. */
+  uint32_t _reserved;     /* Must be zero. */
+  uint32_t address_space; /* Value from crosvm_address_space. */
+  uint16_t address_flags; /* 0: read/in; CROSVM_HINT_ON_WRITE: write/out. */
+  uint16_t details_count; /* # of elements in |details|. */
+  uint64_t address;
+  union {
+    struct crosvm_hint_detail *details;
+    uint64_t _reserved2; /* forcing pointer length to 64-bit */
+  };
+};
+
+#ifdef static_assert
+static_assert(sizeof(struct crosvm_hint) == 32,
+              "extra padding in struct crosvm_hint");
+#endif
+
+/*
+ * Sets performance hint(s) for a hypercall port.
+ *
+ * If a VM does an io access to the specified |address_space|, |address|
+ * (|address| must be non-zero), and direction (|address_flags|), then
+ * crosvm will assume the plugin is likely to call crosvm_vcpu_get_regs()
+ * (and thus utilize a cache to improve performance).
+ *
+ * Additional hints can be provided via |details| (the element length of
+ * |details| is limited to CROSVM_MAX_HINT_DETAIL_COUNT) on when to also cache
+ * information for crosvm_vcpu_get_sregs() and crosvm_vcpu_get_debugregs()
+ * based on values in the vcpu registers.  |match_XXX| indicates which of
+ * 1 or more of |XXX| needs to be equal to the vcpu registers to be a match.
+ * On a match |send_sregs| and |send_debugregs| are used to determine what
+ * data to proactively cache for the plugin's use.  Once a match is found
+ * the remaining hints are not consulted.
+ *
+ * To remove all hints, pass 0 for |__hint_count|.  The value of
+ * |__hint_count| can be at most CROSVM_MAX_HINT_COUNT.  Currently the API
+ * is limited to 1 hint (i.e., |__hint_count| must be 0 or 1).  Each call
+ * to this API will replace the values specified by any prior call to this API.
+ */
+int crosvm_set_hypercall_hint(struct crosvm *, uint32_t __hints_count,
+                              const struct crosvm_hint* __hints);
+
 /* Gets the state of interrupt controller in a VM. */
 int crosvm_get_pic_state(struct crosvm *, bool __primary,
                          struct kvm_pic_state *__pic_state);
@@ -436,7 +526,13 @@
        */
       uint8_t is_write;
 
-      uint8_t _reserved[3];
+      /*
+       * Valid when |is_write| is true -- indicates that VM has continued
+       * to run.  The only next valid call for the vcpu is crosvm_vcpu_wait().
+       */
+      uint8_t no_resume;
+
+      uint8_t _reserved[2];
     } io_access;
 
     /* CROSVM_VCPU_EVENT_KIND_PAUSED */
diff --git a/crosvm_plugin/src/lib.rs b/crosvm_plugin/src/lib.rs
index a6fd4df..eb30e4b 100644
--- a/crosvm_plugin/src/lib.rs
+++ b/crosvm_plugin/src/lib.rs
@@ -28,7 +28,6 @@
 use std::slice::{from_raw_parts, from_raw_parts_mut};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
-use std::time::Instant;
 
 use libc::{E2BIG, EINVAL, ENOENT, ENOTCONN, EPROTO};
 
@@ -46,6 +45,9 @@
 
 use protos::plugin::*;
 
+#[cfg(feature = "stats")]
+mod stats;
+
 // Needs to be large enough to receive all the VCPU sockets.
 const MAX_DATAGRAM_FD: usize = 32;
 // Needs to be large enough for a sizable dirty log.
@@ -96,6 +98,37 @@
     route: anon_route,
 }
 
+const CROSVM_MAX_HINT_COUNT: u32 = 1;
+const CROSVM_MAX_HINT_DETAIL_COUNT: u32 = 32;
+const CROSVM_HINT_ON_WRITE: u16 = 1;
+
+#[repr(C)]
+pub struct crosvm_hint {
+    hint_version: u32,
+    reserved: u32,
+    address_space: u32,
+    address_flags: u16,
+    details_count: u16,
+    address: u64,
+    details: *const crosvm_hint_detail,
+}
+
+#[repr(C)]
+pub struct crosvm_hint_detail {
+    match_rax: bool,
+    match_rbx: bool,
+    match_rcx: bool,
+    match_rdx: bool,
+    reserved1: [u8; 4],
+    rax: u64,
+    rbx: u64,
+    rcx: u64,
+    rdx: u64,
+    send_sregs: bool,
+    send_debugregs: bool,
+    reserved2: [u8; 6],
+}
+
 fn proto_error_to_int(e: protobuf::ProtobufError) -> c_int {
     match e {
         protobuf::ProtobufError::IoError(e) => e.raw_os_error().unwrap_or(EINVAL),
@@ -124,7 +157,7 @@
 
 #[repr(u8)]
 #[derive(Debug, Clone, Copy)]
-enum Stat {
+pub enum Stat {
     IoEventFd,
     MemoryGetDirtyLog,
     IrqEventGetFd,
@@ -138,6 +171,7 @@
     GetMsrIndexList,
     NetGetConfig,
     ReserveRange,
+    ReserveAsyncWriteRange,
     SetIrq,
     SetIrqRouting,
     GetPicState,
@@ -174,88 +208,34 @@
     VcpuGetVcpuEvents,
     VcpuSetVcpuEvents,
     NewConnection,
+    SetHypercallHint,
 
     Count,
 }
 
-#[derive(Clone, Copy)]
-struct StatEntry {
-    count: u64,
-    total: u64,
-    max: u64,
+#[cfg(feature = "stats")]
+fn record(a: Stat) -> stats::StatUpdater {
+    unsafe { stats::STATS.record(a) }
 }
 
-struct StatUpdater {
-    idx: usize,
-    start: Instant,
+#[cfg(not(feature = "stats"))]
+fn record(_a: Stat) -> u32 {
+    0
 }
 
-struct GlobalStats {
-    entries: [StatEntry; Stat::Count as usize],
-}
-
-static mut STATS: GlobalStats = GlobalStats {
-    entries: [StatEntry {
-        count: 0,
-        total: 0,
-        max: 0,
-    }; Stat::Count as usize],
-};
-
-impl GlobalStats {
-    // Record latency from this call until the end of block/function
-    // Example:
-    // pub fn foo() {
-    //     let _u = STATS.record(Stat::Foo);
-    //     // ... some operation ...
-    // }
-    // The added STATS.record will record latency of "some operation" and will
-    // update max and average latencies for it under Stats::Foo. Subsequent
-    // call to STATS.print() will print out max and average latencies for all
-    // operations that were performed.
-    fn record(&mut self, idx: Stat) -> StatUpdater {
-        StatUpdater {
-            idx: idx as usize,
-            start: Instant::now(),
-        }
-    }
-
-    fn print(&self) {
-        for idx in 0..Stat::Count as usize {
-            let e = &self.entries[idx as usize];
-            let stat = unsafe { std::mem::transmute::<u8, Stat>(idx as u8) };
-            if e.count > 0 {
-                println!(
-                    "Stat::{:?}: avg {}ns max {}ns",
-                    stat,
-                    e.total / e.count,
-                    e.max
-                );
-            }
-        }
-    }
-
-    fn update(&mut self, idx: usize, elapsed_nanos: u64) {
-        let e = &mut self.entries[idx as usize];
-        e.total += elapsed_nanos;
-        if e.max < elapsed_nanos {
-            e.max = elapsed_nanos;
-        }
-        e.count += 1;
-    }
-}
-
-impl Drop for StatUpdater {
-    fn drop(&mut self) {
-        let elapsed = self.start.elapsed();
-        let elapsed_nanos = elapsed.as_secs() * 1000000000 + elapsed.subsec_nanos() as u64;
-        // Unsafe due to racy access - OK for stats
+#[cfg(feature = "stats")]
+fn printstats() {
+    // Unsafe due to racy access - OK for stats
+    if std::env::var("CROSVM_STATS").is_ok() {
         unsafe {
-            STATS.update(self.idx, elapsed_nanos);
+            stats::STATS.print();
         }
     }
 }
 
+#[cfg(not(feature = "stats"))]
+fn printstats() {}
+
 pub struct crosvm {
     id_allocator: Arc<IdAllocator>,
     socket: UnixDatagram,
@@ -346,12 +326,7 @@
         r.mut_destroy().id = id;
         self.main_transaction(&r, &[])?;
         self.get_id_allocator().free(id);
-        // Unsafe due to racy access - OK for stats
-        if std::env::var("CROSVM_STATS").is_ok() {
-            unsafe {
-                STATS.print();
-            }
-        }
+        printstats();
         Ok(())
     }
 
@@ -492,12 +467,19 @@
         Ok(())
     }
 
-    fn reserve_range(&mut self, space: u32, start: u64, length: u64) -> result::Result<(), c_int> {
+    fn reserve_range(
+        &mut self,
+        space: u32,
+        start: u64,
+        length: u64,
+        async_write: bool,
+    ) -> result::Result<(), c_int> {
         let mut r = MainRequest::new();
         let reserve: &mut MainRequest_ReserveRange = r.mut_reserve_range();
         reserve.space = AddressSpace::from_i32(space as i32).ok_or(EINVAL)?;
         reserve.start = start;
         reserve.length = length;
+        reserve.async_write = async_write;
 
         self.main_transaction(&r, &[])?;
         Ok(())
@@ -543,6 +525,38 @@
         Ok(())
     }
 
+    fn set_hint(
+        &mut self,
+        space: u32,
+        addr: u64,
+        on_write: bool,
+        hints: &[crosvm_hint_detail],
+    ) -> result::Result<(), c_int> {
+        let mut r = MainRequest::new();
+        let req: &mut MainRequest_SetCallHint = r.mut_set_call_hint();
+        let set_hints: &mut RepeatedField<MainRequest_SetCallHint_RegHint> = req.mut_hints();
+        for hint in hints {
+            let mut entry = MainRequest_SetCallHint_RegHint::new();
+            entry.match_rax = hint.match_rax;
+            entry.match_rbx = hint.match_rbx;
+            entry.match_rcx = hint.match_rcx;
+            entry.match_rdx = hint.match_rdx;
+            entry.rax = hint.rax;
+            entry.rbx = hint.rbx;
+            entry.rcx = hint.rcx;
+            entry.rdx = hint.rdx;
+            entry.send_sregs = hint.send_sregs;
+            entry.send_debugregs = hint.send_debugregs;
+            set_hints.push(entry);
+        }
+        req.space = AddressSpace::from_i32(space as i32).ok_or(EINVAL)?;
+        req.address = addr;
+        req.on_write = on_write;
+
+        self.main_transaction(&r, &[])?;
+        Ok(())
+    }
+
     fn get_state(
         &mut self,
         state_set: MainRequest_StateSet,
@@ -749,7 +763,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_io_event_fd(this: *mut crosvm_io_event) -> c_int {
-    let _u = STATS.record(Stat::IoEventFd);
+    let _u = record(Stat::IoEventFd);
     (*this).evt.as_raw_fd()
 }
 
@@ -822,7 +836,7 @@
     this: *mut crosvm_memory,
     log: *mut u8,
 ) -> c_int {
-    let _u = STATS.record(Stat::MemoryGetDirtyLog);
+    let _u = record(Stat::MemoryGetDirtyLog);
     let crosvm = &mut *crosvm;
     let this = &mut *this;
     let log_slice = slice::from_raw_parts_mut(log, dirty_log_bitmap_size(this.length as usize));
@@ -884,13 +898,13 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_irq_event_get_fd(this: *mut crosvm_irq_event) -> c_int {
-    let _u = STATS.record(Stat::IrqEventGetFd);
+    let _u = record(Stat::IrqEventGetFd);
     (*this).trigger_evt.as_raw_fd()
 }
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_irq_event_get_resample_fd(this: *mut crosvm_irq_event) -> c_int {
-    let _u = STATS.record(Stat::IrqEventGetResampleFd);
+    let _u = record(Stat::IrqEventGetResampleFd);
     (*this).resample_evt.as_raw_fd()
 }
 
@@ -904,7 +918,8 @@
     data: *mut u8,
     length: u32,
     is_write: u8,
-    __reserved1: u8,
+    no_resume: u8,
+    __reserved1: [u8; 2],
 }
 
 #[repr(C)]
@@ -922,13 +937,53 @@
     event: anon_vcpu_event,
 }
 
+// |get| tracks if the |cache| contains a cached value that can service get()
+// requests.  A set() call will populate |cache| and |set| to true to record
+// that the next resume() should apply the state.  We've got two choices on
+// what to do about |get| on a set(): 1) leave it as true, or 2) clear it and
+// have any call to get() first apply any pending set.  Currently #2 is used
+// to favor correctness over performance (it gives KVM a chance to
+// modify/massage the values input to the set call). A plugin will rarely
+// (if ever) issue a get() after a set() on the same vcpu exit, so opting for
+// #1 is unlikely to provide a tangible performance gain.
+pub struct crosvm_vcpu_reg_cache {
+    get: bool,
+    set: bool,
+    cache: Vec<u8>,
+}
+
 pub struct crosvm_vcpu {
     read_pipe: File,
     write_pipe: File,
     send_init: bool,
     request_buffer: Vec<u8>,
     response_buffer: Vec<u8>,
+    response_base: usize,
+    response_length: usize,
     resume_data: Vec<u8>,
+
+    regs: crosvm_vcpu_reg_cache,
+    sregs: crosvm_vcpu_reg_cache,
+    debugregs: crosvm_vcpu_reg_cache,
+}
+
+fn read_varint32(data: &[u8]) -> (u32, usize) {
+    let mut value: u32 = 0;
+    let mut shift: u32 = 0;
+    for (i, &b) in data.iter().enumerate() {
+        if b < 0x80 {
+            return match (b as u32).checked_shl(shift) {
+                None => (0, 0),
+                Some(b) => (value | b, i + 1),
+            };
+        }
+        match ((b as u32) & 0x7F).checked_shl(shift) {
+            None => return (0, 0),
+            Some(b) => value |= b,
+        }
+        shift += 7;
+    }
+    (0, 0)
 }
 
 impl crosvm_vcpu {
@@ -939,7 +994,24 @@
             send_init: true,
             request_buffer: Vec::new(),
             response_buffer: vec![0; MAX_DATAGRAM_SIZE],
+            response_base: 0,
+            response_length: 0,
             resume_data: Vec::new(),
+            regs: crosvm_vcpu_reg_cache {
+                get: false,
+                set: false,
+                cache: vec![],
+            },
+            sregs: crosvm_vcpu_reg_cache {
+                get: false,
+                set: false,
+                cache: vec![],
+            },
+            debugregs: crosvm_vcpu_reg_cache {
+                get: false,
+                set: false,
+                cache: vec![],
+            },
         }
     }
     fn vcpu_send(&mut self, request: &VcpuRequest) -> result::Result<(), c_int> {
@@ -954,13 +1026,30 @@
     }
 
     fn vcpu_recv(&mut self) -> result::Result<VcpuResponse, c_int> {
-        let msg_size = self
-            .read_pipe
-            .read(&mut self.response_buffer)
-            .map_err(|e| -e.raw_os_error().unwrap_or(EINVAL))?;
-
-        let response: VcpuResponse =
-            parse_from_bytes(&self.response_buffer[..msg_size]).map_err(proto_error_to_int)?;
+        if self.response_length == 0 {
+            let msg_size = self
+                .read_pipe
+                .read(&mut self.response_buffer)
+                .map_err(|e| -e.raw_os_error().unwrap_or(EINVAL))?;
+            self.response_base = 0;
+            self.response_length = msg_size;
+        }
+        if self.response_length == 0 {
+            return Err(EINVAL);
+        }
+        let (value, bytes) = read_varint32(
+            &self.response_buffer[self.response_base..self.response_base + self.response_length],
+        );
+        let total_size: usize = bytes + value as usize;
+        if bytes == 0 || total_size > self.response_length {
+            return Err(EINVAL);
+        }
+        let response: VcpuResponse = parse_from_bytes(
+            &self.response_buffer[self.response_base + bytes..self.response_base + total_size],
+        )
+        .map_err(proto_error_to_int)?;
+        self.response_base += total_size;
+        self.response_length -= total_size;
         if response.errno != 0 {
             return Err(response.errno);
         }
@@ -987,6 +1076,9 @@
         let wait: &mut VcpuResponse_Wait = response.mut_wait();
         if wait.has_init() {
             event.kind = CROSVM_VCPU_EVENT_KIND_INIT;
+            self.regs.get = false;
+            self.sregs.get = false;
+            self.debugregs.get = false;
             Ok(())
         } else if wait.has_io() {
             let mut io: VcpuResponse_Wait_Io = wait.take_io();
@@ -998,14 +1090,30 @@
                 data: io.data.as_mut_ptr(),
                 length: io.data.len() as u32,
                 is_write: io.is_write as u8,
+                no_resume: io.no_resume as u8,
                 __reserved1: Default::default(),
             };
             self.resume_data = io.data;
+            self.regs.get = !io.regs.is_empty();
+            if self.regs.get {
+                swap(&mut self.regs.cache, &mut io.regs);
+            }
+            self.sregs.get = !io.sregs.is_empty();
+            if self.sregs.get {
+                swap(&mut self.sregs.cache, &mut io.sregs);
+            }
+            self.debugregs.get = !io.debugregs.is_empty();
+            if self.debugregs.get {
+                swap(&mut self.debugregs.cache, &mut io.debugregs);
+            }
             Ok(())
         } else if wait.has_user() {
             let user: &VcpuResponse_Wait_User = wait.get_user();
             event.kind = CROSVM_VCPU_EVENT_KIND_PAUSED;
             event.event.user = user.user as *mut c_void;
+            self.regs.get = false;
+            self.sregs.get = false;
+            self.debugregs.get = false;
             Ok(())
         } else {
             Err(EPROTO)
@@ -1017,6 +1125,19 @@
         let resume: &mut VcpuRequest_Resume = r.mut_resume();
         swap(&mut resume.data, &mut self.resume_data);
 
+        if self.regs.set {
+            swap(&mut resume.regs, &mut self.regs.cache);
+            self.regs.set = false;
+        }
+        if self.sregs.set {
+            swap(&mut resume.sregs, &mut self.sregs.cache);
+            self.sregs.set = false;
+        }
+        if self.debugregs.set {
+            swap(&mut resume.debugregs, &mut self.debugregs.cache);
+            self.debugregs.set = false;
+        }
+
         self.vcpu_send(&r)?;
         Ok(())
     }
@@ -1054,6 +1175,33 @@
         Ok(())
     }
 
+    fn set_state_from_cache(
+        &mut self,
+        state_set: VcpuRequest_StateSet,
+    ) -> result::Result<(), c_int> {
+        let mut r = VcpuRequest::new();
+        let set_state: &mut VcpuRequest_SetState = r.mut_set_state();
+        set_state.set = state_set;
+        match state_set {
+            VcpuRequest_StateSet::REGS => {
+                swap(&mut set_state.state, &mut self.regs.cache);
+                self.regs.set = false;
+            }
+            VcpuRequest_StateSet::SREGS => {
+                swap(&mut set_state.state, &mut self.sregs.cache);
+                self.sregs.set = false;
+            }
+            VcpuRequest_StateSet::DEBUGREGS => {
+                swap(&mut set_state.state, &mut self.debugregs.cache);
+                self.debugregs.set = false;
+            }
+            _ => return Err(EINVAL),
+        }
+
+        self.vcpu_transaction(&r)?;
+        Ok(())
+    }
+
     fn get_msrs(
         &mut self,
         msr_entries: &mut [kvm_msr_entry],
@@ -1120,7 +1268,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_connect(out: *mut *mut crosvm) -> c_int {
-    let _u = STATS.record(Stat::Connect);
+    let _u = record(Stat::Connect);
     let socket_name = match env::var("CROSVM_SOCKET") {
         Ok(v) => v,
         _ => return -ENOTCONN,
@@ -1143,7 +1291,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_new_connection(self_: *mut crosvm, out: *mut *mut crosvm) -> c_int {
-    let _u = STATS.record(Stat::NewConnection);
+    let _u = record(Stat::NewConnection);
     let self_ = &mut (*self_);
     match self_.try_clone() {
         Ok(cloned) => {
@@ -1156,7 +1304,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_destroy_connection(self_: *mut *mut crosvm) -> c_int {
-    let _u = STATS.record(Stat::DestroyConnection);
+    let _u = record(Stat::DestroyConnection);
     Box::from_raw(*self_);
     *self_ = null_mut();
     0
@@ -1164,7 +1312,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_get_shutdown_eventfd(self_: *mut crosvm) -> c_int {
-    let _u = STATS.record(Stat::GetShutdownEventFd);
+    let _u = record(Stat::GetShutdownEventFd);
     let self_ = &mut (*self_);
     match self_.get_shutdown_eventfd() {
         Ok(f) => f.into_raw_fd(),
@@ -1178,7 +1326,7 @@
     extension: u32,
     has_extension: *mut bool,
 ) -> c_int {
-    let _u = STATS.record(Stat::CheckExtentsion);
+    let _u = record(Stat::CheckExtentsion);
     let self_ = &mut (*self_);
     let ret = self_.check_extension(extension);
 
@@ -1195,7 +1343,7 @@
     cpuid_entries: *mut kvm_cpuid_entry2,
     out_count: *mut u32,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetSupportedCpuid);
+    let _u = record(Stat::GetSupportedCpuid);
     let this = &mut *this;
     let cpuid_entries = from_raw_parts_mut(cpuid_entries, entry_count as usize);
     let mut cpuid_count: usize = 0;
@@ -1211,7 +1359,7 @@
     cpuid_entries: *mut kvm_cpuid_entry2,
     out_count: *mut u32,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetEmulatedCpuid);
+    let _u = record(Stat::GetEmulatedCpuid);
     let this = &mut *this;
     let cpuid_entries = from_raw_parts_mut(cpuid_entries, entry_count as usize);
     let mut cpuid_count: usize = 0;
@@ -1227,7 +1375,7 @@
     msr_indices: *mut u32,
     out_count: *mut u32,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetMsrIndexList);
+    let _u = record(Stat::GetMsrIndexList);
     let this = &mut *this;
     let msr_indices = from_raw_parts_mut(msr_indices, entry_count as usize);
     let mut msr_count: usize = 0;
@@ -1241,7 +1389,7 @@
     self_: *mut crosvm,
     config: *mut crosvm_net_config,
 ) -> c_int {
-    let _u = STATS.record(Stat::NetGetConfig);
+    let _u = record(Stat::NetGetConfig);
     let self_ = &mut (*self_);
     let ret = self_.get_net_config();
 
@@ -1259,15 +1407,28 @@
     start: u64,
     length: u64,
 ) -> c_int {
-    let _u = STATS.record(Stat::ReserveRange);
+    let _u = record(Stat::ReserveRange);
     let self_ = &mut (*self_);
-    let ret = self_.reserve_range(space, start, length);
+    let ret = self_.reserve_range(space, start, length, false);
+    to_crosvm_rc(ret)
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn crosvm_reserve_async_write_range(
+    self_: *mut crosvm,
+    space: u32,
+    start: u64,
+    length: u64,
+) -> c_int {
+    let _u = record(Stat::ReserveAsyncWriteRange);
+    let self_ = &mut (*self_);
+    let ret = self_.reserve_range(space, start, length, true);
     to_crosvm_rc(ret)
 }
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_set_irq(self_: *mut crosvm, irq_id: u32, active: bool) -> c_int {
-    let _u = STATS.record(Stat::SetIrq);
+    let _u = record(Stat::SetIrq);
     let self_ = &mut (*self_);
     let ret = self_.set_irq(irq_id, active);
     to_crosvm_rc(ret)
@@ -1279,19 +1440,54 @@
     route_count: u32,
     routes: *const crosvm_irq_route,
 ) -> c_int {
-    let _u = STATS.record(Stat::SetIrqRouting);
+    let _u = record(Stat::SetIrqRouting);
     let self_ = &mut (*self_);
     let ret = self_.set_irq_routing(slice::from_raw_parts(routes, route_count as usize));
     to_crosvm_rc(ret)
 }
 
 #[no_mangle]
+pub unsafe extern "C" fn crosvm_set_hypercall_hint(
+    self_: *mut crosvm,
+    hints_count: u32,
+    hints: *const crosvm_hint,
+) -> c_int {
+    let _u = record(Stat::SetHypercallHint);
+    let self_ = &mut (*self_);
+
+    if hints_count < 1 {
+        let ret = self_.set_hint(0, 0, false, &[]);
+        return to_crosvm_rc(ret);
+    }
+    if hints_count > CROSVM_MAX_HINT_COUNT {
+        return -EINVAL;
+    }
+    let hints = slice::from_raw_parts(hints, hints_count as usize);
+    let hint = &hints[0];
+    if hint.hint_version != 0
+        || hint.reserved != 0
+        || hint.address == 0
+        || (hint.address_flags != 0 && hint.address_flags != CROSVM_HINT_ON_WRITE)
+        || hint.details_count > CROSVM_MAX_HINT_DETAIL_COUNT as u16
+    {
+        return -EINVAL;
+    }
+    let ret = self_.set_hint(
+        hint.address_space,
+        hint.address,
+        hint.address_flags == CROSVM_HINT_ON_WRITE,
+        slice::from_raw_parts(hint.details, hint.details_count as usize),
+    );
+    to_crosvm_rc(ret)
+}
+
+#[no_mangle]
 pub unsafe extern "C" fn crosvm_get_pic_state(
     this: *mut crosvm,
     primary: bool,
     state: *mut kvm_pic_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetPicState);
+    let _u = record(Stat::GetPicState);
     let this = &mut *this;
     let state_set = if primary {
         MainRequest_StateSet::PIC0
@@ -1309,7 +1505,7 @@
     primary: bool,
     state: *mut kvm_pic_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::SetPicState);
+    let _u = record(Stat::SetPicState);
     let this = &mut *this;
     let state_set = if primary {
         MainRequest_StateSet::PIC0
@@ -1326,7 +1522,7 @@
     this: *mut crosvm,
     state: *mut kvm_ioapic_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetIoapicState);
+    let _u = record(Stat::GetIoapicState);
     let this = &mut *this;
     let state = from_raw_parts_mut(state as *mut u8, size_of::<kvm_ioapic_state>());
     let ret = this.get_state(MainRequest_StateSet::IOAPIC, state);
@@ -1338,7 +1534,7 @@
     this: *mut crosvm,
     state: *const kvm_ioapic_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::SetIoapicState);
+    let _u = record(Stat::SetIoapicState);
     let this = &mut *this;
     let state = from_raw_parts(state as *mut u8, size_of::<kvm_ioapic_state>());
     let ret = this.set_state(MainRequest_StateSet::IOAPIC, state);
@@ -1350,7 +1546,7 @@
     this: *mut crosvm,
     state: *mut kvm_pit_state2,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetPitState);
+    let _u = record(Stat::GetPitState);
     let this = &mut *this;
     let state = from_raw_parts_mut(state as *mut u8, size_of::<kvm_pit_state2>());
     let ret = this.get_state(MainRequest_StateSet::PIT, state);
@@ -1362,7 +1558,7 @@
     this: *mut crosvm,
     state: *const kvm_pit_state2,
 ) -> c_int {
-    let _u = STATS.record(Stat::SetPitState);
+    let _u = record(Stat::SetPitState);
     let this = &mut *this;
     let state = from_raw_parts(state as *mut u8, size_of::<kvm_pit_state2>());
     let ret = this.set_state(MainRequest_StateSet::PIT, state);
@@ -1374,7 +1570,7 @@
     this: *mut crosvm,
     clock_data: *mut kvm_clock_data,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetClock);
+    let _u = record(Stat::GetClock);
     let this = &mut *this;
     let state = from_raw_parts_mut(clock_data as *mut u8, size_of::<kvm_clock_data>());
     let ret = this.get_state(MainRequest_StateSet::CLOCK, state);
@@ -1386,7 +1582,7 @@
     this: *mut crosvm,
     clock_data: *const kvm_clock_data,
 ) -> c_int {
-    let _u = STATS.record(Stat::SetClock);
+    let _u = record(Stat::SetClock);
     let this = &mut *this;
     let state = from_raw_parts(clock_data as *mut u8, size_of::<kvm_clock_data>());
     let ret = this.set_state(MainRequest_StateSet::CLOCK, state);
@@ -1395,7 +1591,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_set_identity_map_addr(self_: *mut crosvm, addr: u32) -> c_int {
-    let _u = STATS.record(Stat::SetIdentityMapAddr);
+    let _u = record(Stat::SetIdentityMapAddr);
     let self_ = &mut (*self_);
     let ret = self_.set_identity_map_addr(addr);
     to_crosvm_rc(ret)
@@ -1407,7 +1603,7 @@
     cpu_mask: u64,
     user: *mut c_void,
 ) -> c_int {
-    let _u = STATS.record(Stat::PauseVcpus);
+    let _u = record(Stat::PauseVcpus);
     let self_ = &mut (*self_);
     let ret = self_.pause_vcpus(cpu_mask, user);
     to_crosvm_rc(ret)
@@ -1415,7 +1611,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_start(self_: *mut crosvm) -> c_int {
-    let _u = STATS.record(Stat::Start);
+    let _u = record(Stat::Start);
     let self_ = &mut (*self_);
     let ret = self_.start();
     to_crosvm_rc(ret)
@@ -1427,7 +1623,7 @@
     cpu_id: u32,
     out: *mut *mut crosvm_vcpu,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetVcpu);
+    let _u = record(Stat::GetVcpu);
     let self_ = &mut (*self_);
     let ret = self_.get_vcpu(cpu_id);
 
@@ -1442,7 +1638,7 @@
     this: *mut crosvm_vcpu,
     event: *mut crosvm_vcpu_event,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuWait);
+    let _u = record(Stat::VcpuWait);
     let this = &mut *this;
     let event = &mut *event;
     let ret = this.wait(event);
@@ -1451,7 +1647,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_vcpu_resume(this: *mut crosvm_vcpu) -> c_int {
-    let _u = STATS.record(Stat::VcpuResume);
+    let _u = record(Stat::VcpuResume);
     let this = &mut *this;
     let ret = this.resume();
     to_crosvm_rc(ret)
@@ -1462,11 +1658,21 @@
     this: *mut crosvm_vcpu,
     regs: *mut kvm_regs,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuGetRegs);
+    let _u = record(Stat::VcpuGetRegs);
     let this = &mut *this;
+    if this.regs.set {
+        if let Err(e) = this.set_state_from_cache(VcpuRequest_StateSet::REGS) {
+            return -e;
+        }
+    }
     let regs = from_raw_parts_mut(regs as *mut u8, size_of::<kvm_regs>());
-    let ret = this.get_state(VcpuRequest_StateSet::REGS, regs);
-    to_crosvm_rc(ret)
+    if this.regs.get {
+        regs.copy_from_slice(&this.regs.cache);
+        0
+    } else {
+        let ret = this.get_state(VcpuRequest_StateSet::REGS, regs);
+        to_crosvm_rc(ret)
+    }
 }
 
 #[no_mangle]
@@ -1474,11 +1680,13 @@
     this: *mut crosvm_vcpu,
     regs: *const kvm_regs,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuSetRegs);
+    let _u = record(Stat::VcpuSetRegs);
     let this = &mut *this;
+    this.regs.get = false;
     let regs = from_raw_parts(regs as *mut u8, size_of::<kvm_regs>());
-    let ret = this.set_state(VcpuRequest_StateSet::REGS, regs);
-    to_crosvm_rc(ret)
+    this.regs.set = true;
+    this.regs.cache = regs.to_vec();
+    0
 }
 
 #[no_mangle]
@@ -1486,11 +1694,21 @@
     this: *mut crosvm_vcpu,
     sregs: *mut kvm_sregs,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuGetSregs);
+    let _u = record(Stat::VcpuGetSregs);
     let this = &mut *this;
+    if this.sregs.set {
+        if let Err(e) = this.set_state_from_cache(VcpuRequest_StateSet::SREGS) {
+            return -e;
+        }
+    }
     let sregs = from_raw_parts_mut(sregs as *mut u8, size_of::<kvm_sregs>());
-    let ret = this.get_state(VcpuRequest_StateSet::SREGS, sregs);
-    to_crosvm_rc(ret)
+    if this.sregs.get {
+        sregs.copy_from_slice(&this.sregs.cache);
+        0
+    } else {
+        let ret = this.get_state(VcpuRequest_StateSet::SREGS, sregs);
+        to_crosvm_rc(ret)
+    }
 }
 
 #[no_mangle]
@@ -1498,16 +1716,18 @@
     this: *mut crosvm_vcpu,
     sregs: *const kvm_sregs,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuSetSregs);
+    let _u = record(Stat::VcpuSetSregs);
     let this = &mut *this;
+    this.sregs.get = false;
     let sregs = from_raw_parts(sregs as *mut u8, size_of::<kvm_sregs>());
-    let ret = this.set_state(VcpuRequest_StateSet::SREGS, sregs);
-    to_crosvm_rc(ret)
+    this.sregs.set = true;
+    this.sregs.cache = sregs.to_vec();
+    0
 }
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_vcpu_get_fpu(this: *mut crosvm_vcpu, fpu: *mut kvm_fpu) -> c_int {
-    let _u = STATS.record(Stat::GetFpu);
+    let _u = record(Stat::GetFpu);
     let this = &mut *this;
     let fpu = from_raw_parts_mut(fpu as *mut u8, size_of::<kvm_fpu>());
     let ret = this.get_state(VcpuRequest_StateSet::FPU, fpu);
@@ -1516,7 +1736,7 @@
 
 #[no_mangle]
 pub unsafe extern "C" fn crosvm_vcpu_set_fpu(this: *mut crosvm_vcpu, fpu: *const kvm_fpu) -> c_int {
-    let _u = STATS.record(Stat::SetFpu);
+    let _u = record(Stat::SetFpu);
     let this = &mut *this;
     let fpu = from_raw_parts(fpu as *mut u8, size_of::<kvm_fpu>());
     let ret = this.set_state(VcpuRequest_StateSet::FPU, fpu);
@@ -1528,11 +1748,21 @@
     this: *mut crosvm_vcpu,
     dregs: *mut kvm_debugregs,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetDebugRegs);
+    let _u = record(Stat::GetDebugRegs);
     let this = &mut *this;
+    if this.debugregs.set {
+        if let Err(e) = this.set_state_from_cache(VcpuRequest_StateSet::DEBUGREGS) {
+            return -e;
+        }
+    }
     let dregs = from_raw_parts_mut(dregs as *mut u8, size_of::<kvm_debugregs>());
-    let ret = this.get_state(VcpuRequest_StateSet::DEBUGREGS, dregs);
-    to_crosvm_rc(ret)
+    if this.debugregs.get {
+        dregs.copy_from_slice(&this.debugregs.cache);
+        0
+    } else {
+        let ret = this.get_state(VcpuRequest_StateSet::DEBUGREGS, dregs);
+        to_crosvm_rc(ret)
+    }
 }
 
 #[no_mangle]
@@ -1540,11 +1770,13 @@
     this: *mut crosvm_vcpu,
     dregs: *const kvm_debugregs,
 ) -> c_int {
-    let _u = STATS.record(Stat::SetDebugRegs);
+    let _u = record(Stat::SetDebugRegs);
     let this = &mut *this;
+    this.debugregs.get = false;
     let dregs = from_raw_parts(dregs as *mut u8, size_of::<kvm_debugregs>());
-    let ret = this.set_state(VcpuRequest_StateSet::DEBUGREGS, dregs);
-    to_crosvm_rc(ret)
+    this.debugregs.set = true;
+    this.debugregs.cache = dregs.to_vec();
+    0
 }
 
 #[no_mangle]
@@ -1552,7 +1784,7 @@
     this: *mut crosvm_vcpu,
     xcrs: *mut kvm_xcrs,
 ) -> c_int {
-    let _u = STATS.record(Stat::GetXCRegs);
+    let _u = record(Stat::GetXCRegs);
     let this = &mut *this;
     let xcrs = from_raw_parts_mut(xcrs as *mut u8, size_of::<kvm_xcrs>());
     let ret = this.get_state(VcpuRequest_StateSet::XCREGS, xcrs);
@@ -1564,7 +1796,7 @@
     this: *mut crosvm_vcpu,
     xcrs: *const kvm_xcrs,
 ) -> c_int {
-    let _u = STATS.record(Stat::SetXCRegs);
+    let _u = record(Stat::SetXCRegs);
     let this = &mut *this;
     let xcrs = from_raw_parts(xcrs as *mut u8, size_of::<kvm_xcrs>());
     let ret = this.set_state(VcpuRequest_StateSet::XCREGS, xcrs);
@@ -1578,7 +1810,7 @@
     msr_entries: *mut kvm_msr_entry,
     out_count: *mut u32,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuGetMsrs);
+    let _u = record(Stat::VcpuGetMsrs);
     let this = &mut *this;
     let msr_entries = from_raw_parts_mut(msr_entries, msr_count as usize);
     let mut count: usize = 0;
@@ -1593,7 +1825,7 @@
     msr_count: u32,
     msr_entries: *const kvm_msr_entry,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuSetMsrs);
+    let _u = record(Stat::VcpuSetMsrs);
     let this = &mut *this;
     let msr_entries = from_raw_parts(msr_entries, msr_count as usize);
     let ret = this.set_msrs(msr_entries);
@@ -1606,7 +1838,7 @@
     cpuid_count: u32,
     cpuid_entries: *const kvm_cpuid_entry2,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuSetCpuid);
+    let _u = record(Stat::VcpuSetCpuid);
     let this = &mut *this;
     let cpuid_entries = from_raw_parts(cpuid_entries, cpuid_count as usize);
     let ret = this.set_cpuid(cpuid_entries);
@@ -1618,7 +1850,7 @@
     this: *mut crosvm_vcpu,
     state: *mut kvm_lapic_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuGetLapicState);
+    let _u = record(Stat::VcpuGetLapicState);
     let this = &mut *this;
     let state = from_raw_parts_mut(state as *mut u8, size_of::<kvm_lapic_state>());
     let ret = this.get_state(VcpuRequest_StateSet::LAPIC, state);
@@ -1630,7 +1862,7 @@
     this: *mut crosvm_vcpu,
     state: *const kvm_lapic_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuSetLapicState);
+    let _u = record(Stat::VcpuSetLapicState);
     let this = &mut *this;
     let state = from_raw_parts(state as *mut u8, size_of::<kvm_lapic_state>());
     let ret = this.set_state(VcpuRequest_StateSet::LAPIC, state);
@@ -1642,7 +1874,7 @@
     this: *mut crosvm_vcpu,
     state: *mut kvm_mp_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuGetMpState);
+    let _u = record(Stat::VcpuGetMpState);
     let this = &mut *this;
     let state = from_raw_parts_mut(state as *mut u8, size_of::<kvm_mp_state>());
     let ret = this.get_state(VcpuRequest_StateSet::MP, state);
@@ -1654,7 +1886,7 @@
     this: *mut crosvm_vcpu,
     state: *const kvm_mp_state,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuSetMpState);
+    let _u = record(Stat::VcpuSetMpState);
     let this = &mut *this;
     let state = from_raw_parts(state as *mut u8, size_of::<kvm_mp_state>());
     let ret = this.set_state(VcpuRequest_StateSet::MP, state);
@@ -1666,7 +1898,7 @@
     this: *mut crosvm_vcpu,
     events: *mut kvm_vcpu_events,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuGetVcpuEvents);
+    let _u = record(Stat::VcpuGetVcpuEvents);
     let this = &mut *this;
     let events = from_raw_parts_mut(events as *mut u8, size_of::<kvm_vcpu_events>());
     let ret = this.get_state(VcpuRequest_StateSet::EVENTS, events);
@@ -1678,7 +1910,7 @@
     this: *mut crosvm_vcpu,
     events: *const kvm_vcpu_events,
 ) -> c_int {
-    let _u = STATS.record(Stat::VcpuSetVcpuEvents);
+    let _u = record(Stat::VcpuSetVcpuEvents);
     let this = &mut *this;
     let events = from_raw_parts(events as *mut u8, size_of::<kvm_vcpu_events>());
     let ret = this.set_state(VcpuRequest_StateSet::EVENTS, events);
diff --git a/crosvm_plugin/src/stats.rs b/crosvm_plugin/src/stats.rs
new file mode 100644
index 0000000..0406e2f
--- /dev/null
+++ b/crosvm_plugin/src/stats.rs
@@ -0,0 +1,84 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use crate::Stat;
+use std::time::Instant;
+
+#[derive(Clone, Copy)]
+struct StatEntry {
+    count: u64,
+    total: u64,
+    max: u64,
+}
+
+pub struct StatUpdater {
+    idx: usize,
+    start: Instant,
+}
+
+pub struct GlobalStats {
+    entries: [StatEntry; Stat::Count as usize],
+}
+
+pub static mut STATS: GlobalStats = GlobalStats {
+    entries: [StatEntry {
+        count: 0,
+        total: 0,
+        max: 0,
+    }; Stat::Count as usize],
+};
+
+impl GlobalStats {
+    // Record latency from this call until the end of block/function
+    // Example:
+    // pub fn foo() {
+    //     let _u = STATS.record(Stat::Foo);
+    //     // ... some operation ...
+    // }
+    // The STATS.record call above records the latency of "some operation" and
+    // updates the max and average latencies for it under Stat::Foo. A subsequent
+    // call to STATS.print() prints the max and average latencies for all
+    // operations that were performed.
+    pub fn record(&mut self, idx: Stat) -> StatUpdater {
+        StatUpdater {
+            idx: idx as usize,
+            start: Instant::now(),
+        }
+    }
+
+    pub fn print(&self) {
+        for idx in 0..Stat::Count as usize {
+            let e = &self.entries[idx as usize];
+            let stat = unsafe { std::mem::transmute::<u8, Stat>(idx as u8) };
+            if e.count > 0 {
+                println!(
+                    "Stat::{:?}: avg {}ns max {}ns",
+                    stat,
+                    e.total / e.count,
+                    e.max
+                );
+            }
+        }
+    }
+
+    fn update(&mut self, idx: usize, elapsed_nanos: u64) {
+        let e = &mut self.entries[idx as usize];
+        e.total += elapsed_nanos;
+        if e.max < elapsed_nanos {
+            e.max = elapsed_nanos;
+        }
+        e.count += 1;
+    }
+}
+
+impl Drop for StatUpdater {
+    fn drop(&mut self) {
+        let elapsed = self.start.elapsed();
+        let elapsed_nanos = elapsed.as_secs() * 1000000000 + elapsed.subsec_nanos() as u64;
+        // Unsafe due to racy access - OK for stats
+        unsafe {
+            STATS.update(self.idx, elapsed_nanos);
+        }
+    }
+}
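// A minimal usage sketch of the RAII timing pattern above, using the existing
// Stat::Connect variant purely as an example. Direct access to the static mut
// STATS needs an unsafe block, which the record() helper called elsewhere in
// this change presumably encapsulates.
fn timed_section_sketch() {
    let _timer = unsafe { STATS.record(Stat::Connect) };
    // ... the work being measured ...
}   // `_timer` is dropped here and the elapsed nanoseconds are recorded.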
diff --git a/data_model/Android.bp b/data_model/Android.bp
index 332471f..c8e178b 100644
--- a/data_model/Android.bp
+++ b/data_model/Android.bp
@@ -2,7 +2,6 @@
 
 rust_library_host_rlib {
     name: "libdata_model",
-    deny_warnings: false,
     defaults: ["crosvm_defaults"],
     crate_name: "data_model",
     srcs: ["src/lib.rs"],
diff --git a/data_model/src/flexible_array.rs b/data_model/src/flexible_array.rs
new file mode 100644
index 0000000..35ac15a
--- /dev/null
+++ b/data_model/src/flexible_array.rs
@@ -0,0 +1,39 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+//! A wrapper for structures that contain flexible arrays.
+
+use std::mem::size_of;
+
+// Returns a `Vec<T>` with a size in bytes at least as large as `size_in_bytes`.
+fn vec_with_size_in_bytes<T: Default>(size_in_bytes: usize) -> Vec<T> {
+    let rounded_size = (size_in_bytes + size_of::<T>() - 1) / size_of::<T>();
+    let mut v = Vec::with_capacity(rounded_size);
+    for _ in 0..rounded_size {
+        v.push(T::default())
+    }
+    v
+}
+
+/// The kernel API has many structs that resemble the following `Foo` structure:
+///
+/// ```ignore
+/// #[repr(C)]
+/// struct Foo {
+///    some_data: u32,
+///    entries: __IncompleteArrayField<__u32>,
+/// }
+/// ```
+///
+/// In order to allocate such a structure, `size_of::<Foo>()` would be too small because it would
+/// not include any space for `entries`. To make the allocation large enough while still being
+/// aligned for `Foo`, a `Vec<Foo>` is created. Only the first element of `Vec<Foo>` would actually
+/// be used as a `Foo`. The remaining memory in the `Vec<Foo>` is for `entries`, which must be
+/// contiguous with `Foo`. This function is used to make the `Vec<Foo>` with enough space for
+/// `count` entries.
+pub fn vec_with_array_field<T: Default, F>(count: usize) -> Vec<T> {
+    let element_space = count * size_of::<F>();
+    let vec_size_bytes = size_of::<T>() + element_space;
+    vec_with_size_in_bytes(vec_size_bytes)
+}
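// A minimal sketch of the intended use. `Foo` is a hypothetical stand-in for a
// kernel struct whose flexible array member is modelled as a zero-length array
// (the doc example above uses bindgen's __IncompleteArrayField for this role).
#[repr(C)]
#[derive(Default)]
struct Foo {
    some_data: u32,
    entries: [u32; 0],
}

fn alloc_foo_with_entries() -> Vec<Foo> {
    // Backing storage for one Foo header plus four trailing u32 entries,
    // allocated as a Vec<Foo> so the whole block is aligned for Foo.
    let v = vec_with_array_field::<Foo, u32>(4);
    assert!(v.len() * size_of::<Foo>() >= size_of::<Foo>() + 4 * size_of::<u32>());
    v
}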
diff --git a/data_model/src/lib.rs b/data_model/src/lib.rs
index 8288c84..3f75377 100644
--- a/data_model/src/lib.rs
+++ b/data_model/src/lib.rs
@@ -2,7 +2,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-use std::mem::size_of;
+use std::io;
+use std::mem::{align_of, size_of};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
 
 /// Types for which it is safe to initialize from raw data.
@@ -64,6 +65,25 @@
         }
     }
 
+    /// Creates an instance of `Self` by copying raw data from an io::Read stream.
+    fn from_reader<R: io::Read>(mut read: R) -> io::Result<Self> {
+        // Allocate a Vec<u8> with enough extra space for the worst-case alignment offset.
+        let mut data = vec![0u8; size_of::<Self>() + align_of::<Self>()];
+
+        // Get a u8 slice within data with sufficient alignment for Self.
+        let align_offset = data.as_ptr().align_offset(align_of::<Self>());
+        let mut aligned_data = &mut data[align_offset..align_offset + size_of::<Self>()];
+
+        read.read_exact(&mut aligned_data)?;
+        match Self::from_slice(&aligned_data) {
+            Some(obj) => Ok(*obj),
+            None => Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "from_slice failed",
+            )),
+        }
+    }
+
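// A minimal usage sketch for from_reader, assuming the DataInit impl this crate
// already provides for u32: the reader's bytes are copied into an aligned buffer
// and reinterpreted as the target type in native byte order.
fn read_u32_sketch() -> io::Result<u32> {
    let bytes = [0xDEu8, 0xAD, 0xBE, 0xEF];
    let value = u32::from_reader(io::Cursor::new(&bytes[..]))?;
    assert_eq!(value, u32::from_ne_bytes(bytes));
    Ok(value)
}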
     /// Converts a reference to `self` into a slice of bytes.
     ///
     /// The value of `self` is not copied. Instead, the slice is made from a reference to `self`.
@@ -160,3 +180,6 @@
 
 pub mod volatile_memory;
 pub use crate::volatile_memory::*;
+
+mod flexible_array;
+pub use flexible_array::vec_with_array_field;
diff --git a/data_model/src/volatile_memory.rs b/data_model/src/volatile_memory.rs
index 6ce230b..d834f0b 100644
--- a/data_model/src/volatile_memory.rs
+++ b/data_model/src/volatile_memory.rs
@@ -25,7 +25,7 @@
 use std::mem::size_of;
 use std::ptr::{copy, null_mut, read_volatile, write_bytes, write_volatile};
 use std::result;
-use std::{isize, usize};
+use std::usize;
 
 use crate::DataInit;
 
@@ -163,7 +163,7 @@
             return Err(VolatileMemoryError::Overflow {
                 base: self.addr as u64,
                 offset: count,
-            })?;
+            });
         }
         let new_size = self
             .size
@@ -174,6 +174,21 @@
         unsafe { Ok(VolatileSlice::new(new_addr as *mut u8, new_size)) }
     }
 
+    /// Similar to `get_slice` but the returned slice outlives this slice.
+    ///
+    /// The returned slice's lifetime is still limited by the underlying data's lifetime.
+    pub fn sub_slice(self, offset: u64, count: u64) -> Result<VolatileSlice<'a>> {
+        let mem_end = calc_offset(offset, count)?;
+        if mem_end > self.size {
+            return Err(Error::OutOfBounds { addr: mem_end });
+        }
+        Ok(VolatileSlice {
+            addr: (self.addr as u64 + offset) as *mut _,
+            size: count,
+            phantom: PhantomData,
+        })
+    }
+
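// A minimal sketch of why sub_slice takes self by value: the result is bounded
// only by the lifetime of the underlying memory, not by a borrow of the parent
// slice, so it can be stored or returned independently. VolatileSlice::new is
// used here the same way as in get_slice above.
fn split_off_tail_sketch() {
    let mut backing = [0u8; 32];
    // Unsafe: `backing` outlives the slice and the length matches the buffer.
    let whole = unsafe { VolatileSlice::new(backing.as_mut_ptr(), 32) };
    let _tail = whole.sub_slice(16, 16).unwrap();
    // `_tail` remains valid here without holding a borrow of `whole`.
}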
     /// Sets each byte of this slice with the given byte, similar to `memset`.
     ///
     /// The bytes of this slice are accessed in an arbitrary order.
@@ -386,9 +401,8 @@
 mod tests {
     use super::*;
 
-    use std::sync::Arc;
-    use std::thread::{sleep, spawn};
-    use std::time::Duration;
+    use std::sync::{Arc, Barrier};
+    use std::thread::spawn;
 
     #[derive(Clone)]
     struct VecMem {
@@ -457,35 +471,23 @@
         let a_clone = a.clone();
         let v_ref = a.get_ref::<u8>(0).unwrap();
         v_ref.store(99);
+
+        let start_barrier = Arc::new(Barrier::new(2));
+        let thread_start_barrier = start_barrier.clone();
+        let end_barrier = Arc::new(Barrier::new(2));
+        let thread_end_barrier = end_barrier.clone();
         spawn(move || {
-            sleep(Duration::from_millis(10));
+            thread_start_barrier.wait();
             let clone_v_ref = a_clone.get_ref::<u8>(0).unwrap();
             clone_v_ref.store(0);
+            thread_end_barrier.wait();
         });
 
-        // Technically this is a race condition but we have to observe the v_ref's value changing
-        // somehow and this helps to ensure the sleep actually happens before the store rather then
-        // being reordered by the compiler.
         assert_eq!(v_ref.load(), 99);
 
-        // Granted we could have a machine that manages to perform this many volatile loads in the
-        // amount of time the spawned thread sleeps, but the most likely reason the retry limit will
-        // get reached is because v_ref.load() is not actually performing the required volatile read
-        // or v_ref.store() is not doing a volatile write. A timer based solution was avoided
-        // because that might use a syscall which could hint the optimizer to reload v_ref's pointer
-        // regardless of volatile status. Note that we use a longer retry duration for optimized
-        // builds.
-        #[cfg(debug_assertions)]
-        const RETRY_MAX: u64 = 500_000_000;
-        #[cfg(not(debug_assertions))]
-        const RETRY_MAX: u64 = 10_000_000_000;
+        start_barrier.wait();
+        end_barrier.wait();
 
-        let mut retry = 0;
-        while v_ref.load() == 99 && retry < RETRY_MAX {
-            retry += 1;
-        }
-
-        assert_ne!(retry, RETRY_MAX, "maximum retry exceeded");
         assert_eq!(v_ref.load(), 0);
     }
 
diff --git a/devices/Android.bp b/devices/Android.bp
index 3f3332b..3b6a819 100644
--- a/devices/Android.bp
+++ b/devices/Android.bp
@@ -3,7 +3,6 @@
 rust_library_host_rlib {
     name: "libdevices",
     defaults: ["crosvm_defaults"],
-    deny_warnings: false,
     crate_name: "devices",
     srcs: ["src/lib.rs"],
     features: [
@@ -11,18 +10,22 @@
         "gpu_buffer",
         "gpu_display",
         "gpu_renderer",
+        "x",
     ],
     rlibs: [
         "libaudio_streams",
         "libbit_field",
-        "libbyteorder",
+        "libbitflags",
         "libdata_model",
+        "libdisk",
         "libgpu_buffer",
         "libgpu_display",
         "libgpu_renderer",
         "libio_jail",
         "libkvm",
+        "libkvm_sys",
         "liblibc",
+        "liblinux_input_sys",
         "libmsg_socket",
         "libnet_sys",
         "libnet_util",
@@ -30,7 +33,9 @@
         "libresources",
         "libsync_rust",
         "libsys_util",
+        "libsyscall_defines",
         "libusb_util",
+        "libvfio_sys",
         "libvhost",
         "libvirtio_sys",
         "libvm_control",
@@ -38,6 +43,6 @@
     proc_macros: [
         "libenumn",
         "libmsg_on_socket_derive",
-        "libremain-0.1.3",
+        "libremain",
     ],
 }
diff --git a/devices/Cargo.toml b/devices/Cargo.toml
index 62b570e..83aa406 100644
--- a/devices/Cargo.toml
+++ b/devices/Cargo.toml
@@ -6,22 +6,26 @@
 
 [features]
 gpu = ["gpu_buffer", "gpu_display", "gpu_renderer"]
-sandboxed-libusb = ["usb_util/sandboxed-libusb"]
 tpm = ["protos/trunks", "tpm2"]
 wl-dmabuf = []
+x = ["gpu_display/x"]
+gfxstream = ["gpu"]
 
 [dependencies]
 audio_streams = "*"
 bit_field = { path = "../bit_field" }
-byteorder = "*"
+bitflags = "1"
 data_model = { path = "../data_model" }
+disk = { path = "../disk" }
 enumn = { path = "../enumn" }
 gpu_buffer = { path = "../gpu_buffer", optional = true }
 gpu_display = { path = "../gpu_display", optional = true }
 gpu_renderer = { path = "../gpu_renderer", optional = true }
 io_jail = { path = "../io_jail" }
 kvm = { path = "../kvm" }
+kvm_sys = { path = "../kvm_sys" }
 libc = "*"
+linux_input_sys = { path = "../linux_input_sys" }
 msg_on_socket_derive = { path = "../msg_socket/msg_on_socket_derive" }
 msg_socket = { path = "../msg_socket" }
 net_sys = { path = "../net_sys" }
@@ -32,8 +36,13 @@
 resources = { path = "../resources" }
 sync = { path = "../sync" }
 sys_util = { path = "../sys_util" }
+syscall_defines = { path = "../syscall_defines" }
 tpm2 = { path = "../tpm2", optional = true }
 usb_util = { path = "../usb_util" }
+vfio_sys = { path = "../vfio_sys" }
 vhost = { path = "../vhost" }
 virtio_sys = { path = "../virtio_sys" }
 vm_control = { path = "../vm_control" }
+
+[dev-dependencies]
+tempfile = { path = "../tempfile" }
diff --git a/devices/src/acpi.rs b/devices/src/acpi.rs
new file mode 100644
index 0000000..6d36353
--- /dev/null
+++ b/devices/src/acpi.rs
@@ -0,0 +1,127 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use crate::{BusDevice, BusResumeDevice};
+use sys_util::{error, warn, EventFd};
+
+/// ACPI PM resource for handling OS suspend/resume requests
+pub struct ACPIPMResource {
+    suspend_evt: EventFd,
+    pm1_status: u16,
+    pm1_enable: u16,
+    pm1_control: u16,
+    sleep_control: u8,
+    sleep_status: u8,
+}
+
+impl ACPIPMResource {
+    /// Constructs the ACPI Power Management Resource.
+    pub fn new(suspend_evt: EventFd) -> ACPIPMResource {
+        ACPIPMResource {
+            suspend_evt,
+            pm1_status: 0,
+            pm1_enable: 0,
+            pm1_control: 0,
+            sleep_control: 0,
+            sleep_status: 0,
+        }
+    }
+}
+
+/// The ACPI PM register block's base address and length.
+pub const ACPIPM_RESOURCE_BASE: u64 = 0x600;
+pub const ACPIPM_RESOURCE_LEN: u64 = 8;
+
+/// ACPI PM register value definitions
+const PM1_STATUS: u16 = 0;
+const PM1_ENABLE: u16 = 2;
+const PM1_CONTROL: u16 = 4;
+const SLEEP_CONTROL: u16 = 6;
+const SLEEP_STATUS: u16 = 7;
+const BITMASK_PM1CNT_SLEEP_ENABLE: u16 = 0x2000;
+const BITMASK_SLEEPCNT_SLEEP_ENABLE: u8 = 0x20;
+const BITMASK_PM1CNT_WAKE_STATUS: u16 = 0x8000;
+const BITMASK_SLEEPCNT_WAKE_STATUS: u8 = 0x80;
+
+impl BusDevice for ACPIPMResource {
+    fn debug_label(&self) -> String {
+        "ACPIPMResource".to_owned()
+    }
+
+    fn read(&mut self, offset: u64, data: &mut [u8]) {
+        let val = match offset as u16 {
+            PM1_STATUS => self.pm1_status,
+            PM1_ENABLE => self.pm1_enable,
+            PM1_CONTROL => self.pm1_control,
+            SLEEP_CONTROL => self.sleep_control as u16,
+            SLEEP_STATUS => self.sleep_status as u16,
+            _ => {
+                warn!("ACPIPM: Bad read from offset {}", offset);
+                return;
+            }
+        };
+
+        let val_arr = val.to_ne_bytes();
+        for i in 0..std::mem::size_of::<u16>() {
+            if i < data.len() {
+                data[i] = val_arr[i];
+            }
+        }
+    }
+
+    fn write(&mut self, offset: u64, data: &[u8]) {
+        let max_bytes = std::mem::size_of::<u16>();
+
+        // Only allow writes of at most max_bytes bytes.
+        if data.len() > max_bytes {
+            warn!("ACPIPM: bad write size: {}", data.len());
+            return;
+        }
+
+        let mut val_arr = u16::to_ne_bytes(0 as u16);
+        for i in 0..std::mem::size_of::<u16>() {
+            if i < data.len() {
+                val_arr[i] = data[i];
+            }
+        }
+        let val = u16::from_ne_bytes(val_arr);
+
+        match offset as u16 {
+            PM1_STATUS => self.pm1_status &= !val,
+            PM1_ENABLE => self.pm1_enable = val,
+            PM1_CONTROL => {
+                if (val & BITMASK_PM1CNT_SLEEP_ENABLE) == BITMASK_PM1CNT_SLEEP_ENABLE {
+                    if let Err(e) = self.suspend_evt.write(1) {
+                        error!("ACPIPM: failed to trigger suspend event: {}", e);
+                    }
+                }
+                self.pm1_control = val & !BITMASK_PM1CNT_SLEEP_ENABLE;
+            }
+            SLEEP_CONTROL => {
+                let sleep_control = val as u8;
+                if (sleep_control & BITMASK_SLEEPCNT_SLEEP_ENABLE) == BITMASK_SLEEPCNT_SLEEP_ENABLE
+                {
+                    if let Err(e) = self.suspend_evt.write(1) {
+                        error!("ACPIPM: failed to trigger suspend event: {}", e);
+                    }
+                }
+                self.sleep_control = sleep_control as u8 & !BITMASK_SLEEPCNT_SLEEP_ENABLE;
+            }
+            SLEEP_STATUS => self.sleep_status &= !val as u8,
+            _ => {
+                warn!("ACPIPM: Bad write to offset {}", offset);
+            }
+        };
+    }
+}
+
+impl BusResumeDevice for ACPIPMResource {
+    fn resume_imminent(&mut self) {
+        let val = self.pm1_status;
+        self.pm1_status = val | BITMASK_PM1CNT_WAKE_STATUS;
+
+        let val = self.sleep_status;
+        self.sleep_status = val | BITMASK_SLEEPCNT_WAKE_STATUS;
+    }
+}
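// A minimal sketch of the suspend path, assuming sys_util::EventFd's
// new()/try_clone()/read() behave as they do elsewhere in crosvm: a guest write
// of the SLP_EN bit (0x2000) to PM1_CONTROL (offset 4 within the PM register
// block at 0x600) signals the suspend EventFd that the VM loop waits on.
#[cfg(test)]
mod sketch {
    use super::ACPIPMResource;
    use crate::BusDevice;
    use sys_util::EventFd;

    #[test]
    fn sleep_enable_triggers_suspend_event() {
        let suspend_evt = EventFd::new().unwrap();
        let mut pm = ACPIPMResource::new(suspend_evt.try_clone().unwrap());
        pm.write(4, &0x2000u16.to_ne_bytes());
        assert_eq!(suspend_evt.read().unwrap(), 1);
    }
}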
diff --git a/devices/src/bus.rs b/devices/src/bus.rs
index d4b46eb..3f93974 100644
--- a/devices/src/bus.rs
+++ b/devices/src/bus.rs
@@ -37,6 +37,12 @@
     fn on_sandboxed(&mut self) {}
 }
 
+pub trait BusResumeDevice: Send {
+    /// Notifies the device that the VM is about
+    /// to resume from suspend.
+    fn resume_imminent(&mut self) {}
+}
+
 #[derive(Debug)]
 pub enum Error {
     /// The insertion failed because the new device overlapped with an old device.
@@ -104,9 +110,13 @@
 ///
 /// This doesn't have any restrictions on what kind of device or address space this applies to. The
 /// only restriction is that no two devices can overlap in this address space.
+///
+/// `resume_notify_devices` contains the devices that must be notified before the system
+/// resumes from the S3 suspended state.
 #[derive(Clone)]
 pub struct Bus {
     devices: BTreeMap<BusRange, Arc<Mutex<dyn BusDevice>>>,
+    resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>,
 }
 
 impl Bus {
@@ -114,6 +124,7 @@
     pub fn new() -> Bus {
         Bus {
             devices: BTreeMap::new(),
+            resume_notify_devices: Vec::new(),
         }
     }
 
@@ -208,6 +219,19 @@
             false
         }
     }
+
+    /// Register `device` for notifications of VM resume from suspend.
+    pub fn notify_on_resume(&mut self, device: Arc<Mutex<dyn BusResumeDevice>>) {
+        self.resume_notify_devices.push(device);
+    }
+
+    /// Notifies all registered devices that a resume from suspend is imminent.
+    pub fn notify_resume(&mut self) {
+        let devices = self.resume_notify_devices.clone();
+        for dev in devices {
+            dev.lock().resume_imminent();
+        }
+    }
 }
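// A minimal sketch of the resume-notification flow; where the registration and
// the notify_resume() call actually happen is an assumption here, not part of
// this change. A device such as ACPIPMResource implements BusResumeDevice, is
// registered once, and is then notified each time the VM is about to leave
// suspend.
fn register_resume_device(bus: &mut Bus, dev: Arc<Mutex<dyn BusResumeDevice>>) {
    bus.notify_on_resume(dev);
}
// Later, immediately before resuming the guest from S3:
//     bus.notify_resume();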
 
 #[cfg(test)]
diff --git a/devices/src/i8042.rs b/devices/src/i8042.rs
index 1f8a837..dfedbf2 100644
--- a/devices/src/i8042.rs
+++ b/devices/src/i8042.rs
@@ -19,17 +19,16 @@
 }
 
 // i8042 device is mapped I/O address 0x61. We partially implement two 8-bit
-// registers: port 0x61 (I8042_PORT_B_REG, offset 0 from base of 0x61), and
-// port 0x64 (I8042_COMMAND_REG, offset 3 from base of 0x61).
+// registers: port 0x61 (I8042_PORT_B_REG), and port 0x64 (I8042_COMMAND_REG).
 impl BusDevice for I8042Device {
     fn debug_label(&self) -> String {
         "i8042".to_owned()
     }
 
     fn read(&mut self, offset: u64, data: &mut [u8]) {
-        if data.len() == 1 && offset == 3 {
+        if data.len() == 1 && offset == 0x64 {
             data[0] = 0x0;
-        } else if data.len() == 1 && offset == 0 {
+        } else if data.len() == 1 && offset == 0x61 {
             // Like kvmtool, we return bit 5 set in I8042_PORT_B_REG to
             // avoid hang in pit_calibrate_tsc() in Linux kernel.
             data[0] = 0x20;
@@ -37,7 +36,7 @@
     }
 
     fn write(&mut self, offset: u64, data: &[u8]) {
-        if data.len() == 1 && data[0] == 0xfe && offset == 3 {
+        if data.len() == 1 && data[0] == 0xfe && offset == 0x64 {
             if let Err(e) = self.reset_evt.write(1) {
                 error!("failed to trigger i8042 reset event: {}", e);
             }
diff --git a/devices/src/ioapic.rs b/devices/src/ioapic.rs
index ce9f2b2..6f8e358 100644
--- a/devices/src/ioapic.rs
+++ b/devices/src/ioapic.rs
@@ -142,7 +142,6 @@
             }
             _ => {
                 warn!("IOAPIC: Bad write to offset {}", offset);
-                return;
             }
         }
     }
@@ -187,14 +186,9 @@
 
     pub fn service_irq(&mut self, irq: usize, level: bool) -> bool {
         let entry = &mut self.redirect_table[irq];
-        let line_status = if entry.get_polarity() == 1 {
-            !level
-        } else {
-            level
-        };
 
         // De-assert the interrupt.
-        if !line_status {
+        if !level {
             self.current_interrupt_level_bitmap &= !(1 << irq);
             return true;
         }
@@ -227,7 +221,7 @@
         // TODO(mutexlox): Pulse (assert and deassert) interrupt
         let injected = true;
 
-        if entry.get_trigger_mode() == TriggerMode::Level && line_status && injected {
+        if entry.get_trigger_mode() == TriggerMode::Level && level && injected {
             entry.set_remote_irr(true);
         } else if irq == RTC_IRQ && injected {
             self.rtc_remote_irr = true;
diff --git a/devices/src/lib.rs b/devices/src/lib.rs
index bc9c8c1..174b956 100644
--- a/devices/src/lib.rs
+++ b/devices/src/lib.rs
@@ -15,19 +15,23 @@
 mod proxy;
 #[macro_use]
 mod register_space;
+pub mod acpi;
 mod serial;
 pub mod split_irqchip_common;
 pub mod usb;
 mod utils;
+pub mod vfio;
 pub mod virtio;
 
+pub use self::acpi::ACPIPMResource;
 pub use self::bus::Error as BusError;
-pub use self::bus::{Bus, BusDevice, BusRange};
+pub use self::bus::{Bus, BusDevice, BusRange, BusResumeDevice};
 pub use self::cmos::Cmos;
 pub use self::i8042::I8042Device;
 pub use self::ioapic::Ioapic;
 pub use self::pci::{
     Ac97Dev, PciConfigIo, PciConfigMmio, PciDevice, PciDeviceError, PciInterruptPin, PciRoot,
+    VfioPciDevice,
 };
 pub use self::pic::Pic;
 pub use self::pit::{Pit, PitError};
@@ -40,4 +44,5 @@
 };
 pub use self::usb::host_backend::host_backend_device_provider::HostBackendDeviceProvider;
 pub use self::usb::xhci::xhci_controller::XhciController;
+pub use self::vfio::VfioDevice;
 pub use self::virtio::VirtioPciDevice;
diff --git a/devices/src/pci/ac97.rs b/devices/src/pci/ac97.rs
index aeab1f1..eb19b5f 100644
--- a/devices/src/pci/ac97.rs
+++ b/devices/src/pci/ac97.rs
@@ -5,7 +5,7 @@
 use std::os::unix::io::RawFd;
 
 use audio_streams::StreamSource;
-use resources::{Alloc, SystemAllocator};
+use resources::{Alloc, MmioType, SystemAllocator};
 use sys_util::{error, EventFd, GuestMemory};
 
 use crate::pci::ac97_bus_master::Ac97BusMaster;
@@ -149,11 +149,12 @@
             .expect("assign_bus_dev must be called prior to allocate_io_bars");
         let mut ranges = Vec::new();
         let mixer_regs_addr = resources
-            .mmio_allocator()
-            .allocate(
+            .mmio_allocator(MmioType::Low)
+            .allocate_with_align(
                 MIXER_REGS_SIZE,
                 Alloc::PciBar { bus, dev, bar: 0 },
                 "ac97-mixer_regs".to_string(),
+                MIXER_REGS_SIZE,
             )
             .map_err(|e| pci_device::Error::IoAllocationFailed(MIXER_REGS_SIZE, e))?;
         let mixer_config = PciBarConfiguration::default()
@@ -166,11 +167,12 @@
         ranges.push((mixer_regs_addr, MIXER_REGS_SIZE));
 
         let master_regs_addr = resources
-            .mmio_allocator()
-            .allocate(
+            .mmio_allocator(MmioType::Low)
+            .allocate_with_align(
                 MASTER_REGS_SIZE,
                 Alloc::PciBar { bus, dev, bar: 1 },
                 "ac97-master_regs".to_string(),
+                MASTER_REGS_SIZE,
             )
             .map_err(|e| pci_device::Error::IoAllocationFailed(MASTER_REGS_SIZE, e))?;
         let master_config = PciBarConfiguration::default()
@@ -184,12 +186,12 @@
         Ok(ranges)
     }
 
-    fn config_registers(&self) -> &PciConfiguration {
-        &self.config_regs
+    fn read_config_register(&self, reg_idx: usize) -> u32 {
+        self.config_regs.read_reg(reg_idx)
     }
 
-    fn config_registers_mut(&mut self) -> &mut PciConfiguration {
-        &mut self.config_regs
+    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
+        (&mut self.config_regs).write_reg(reg_idx, offset, data)
     }
 
     fn keep_fds(&self) -> Vec<RawFd> {
@@ -201,8 +203,8 @@
     }
 
     fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
-        let bar0 = u64::from(self.config_regs.get_bar_addr(0));
-        let bar1 = u64::from(self.config_regs.get_bar_addr(1));
+        let bar0 = self.config_regs.get_bar_addr(0);
+        let bar1 = self.config_regs.get_bar_addr(1);
         match addr {
             a if a >= bar0 && a < bar0 + MIXER_REGS_SIZE => self.read_mixer(addr - bar0, data),
             a if a >= bar1 && a < bar1 + MASTER_REGS_SIZE => {
@@ -213,8 +215,8 @@
     }
 
     fn write_bar(&mut self, addr: u64, data: &[u8]) {
-        let bar0 = u64::from(self.config_regs.get_bar_addr(0));
-        let bar1 = u64::from(self.config_regs.get_bar_addr(1));
+        let bar0 = self.config_regs.get_bar_addr(0);
+        let bar1 = self.config_regs.get_bar_addr(1);
         match addr {
             a if a >= bar0 && a < bar0 + MIXER_REGS_SIZE => self.write_mixer(addr - bar0, data),
             a if a >= bar1 && a < bar1 + MASTER_REGS_SIZE => {
@@ -243,8 +245,8 @@
         let mut ac97_dev = Ac97Dev::new(mem, Box::new(DummyStreamSource::new()));
         let mut allocator = SystemAllocator::builder()
             .add_io_addresses(0x1000_0000, 0x1000_0000)
-            .add_mmio_addresses(0x2000_0000, 0x1000_0000)
-            .add_device_addresses(0x3000_0000, 0x1000_0000)
+            .add_low_mmio_addresses(0x2000_0000, 0x1000_0000)
+            .add_high_mmio_addresses(0x3000_0000, 0x1000_0000)
             .create_allocator(5, false)
             .unwrap();
         ac97_dev.assign_bus_dev(0, 0);
diff --git a/devices/src/pci/ac97_bus_master.rs b/devices/src/pci/ac97_bus_master.rs
index bde5c90..d3d2f85 100644
--- a/devices/src/pci/ac97_bus_master.rs
+++ b/devices/src/pci/ac97_bus_master.rs
@@ -14,7 +14,7 @@
 
 use audio_streams::{
     capture::{CaptureBuffer, CaptureBufferStream},
-    PlaybackBuffer, PlaybackBufferStream, StreamControl, StreamSource,
+    PlaybackBuffer, PlaybackBufferStream, SampleFormat, StreamControl, StreamSource,
 };
 use data_model::{VolatileMemory, VolatileSlice};
 use sync::Mutex;
@@ -95,23 +95,17 @@
     }
 }
 
-impl From<GuestMemoryError> for PlaybackError {
+impl From<GuestMemoryError> for AudioError {
     fn from(err: GuestMemoryError) -> Self {
-        PlaybackError::ReadingGuestError(err)
-    }
-}
-
-impl From<GuestMemoryError> for CaptureError {
-    fn from(err: GuestMemoryError) -> Self {
-        CaptureError::ReadingGuestError(err)
+        AudioError::ReadingGuestError(err)
     }
 }
 
 type GuestMemoryResult<T> = std::result::Result<T, GuestMemoryError>;
 
-// Internal error type used for reporting errors from the audio playback thread.
+// Internal error type used for reporting errors from the audio thread.
 #[derive(Debug)]
-enum PlaybackError {
+enum AudioError {
     // Failure to read guest memory.
     ReadingGuestError(GuestMemoryError),
     // Failure to get a buffer from the stream.
@@ -120,11 +114,11 @@
     WritingOutput(std::io::Error),
 }
 
-impl std::error::Error for PlaybackError {}
+impl std::error::Error for AudioError {}
 
-impl Display for PlaybackError {
+impl Display for AudioError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        use self::PlaybackError::*;
+        use self::AudioError::*;
 
         match self {
             ReadingGuestError(e) => write!(f, "Failed to read guest memory: {}.", e),
@@ -134,32 +128,25 @@
     }
 }
 
-type PlaybackResult<T> = std::result::Result<T, PlaybackError>;
+type AudioResult<T> = std::result::Result<T, AudioError>;
 
-// Internal error type used for reporting errors from the audio capture thread.
-#[derive(Debug)]
-enum CaptureError {
-    // Failure to read guest memory.
-    ReadingGuestError(GuestMemoryError),
-    // Failure to get an buffer from the stream.
-    StreamError(Box<dyn Error>),
+// Audio thread book-keeping data
+struct AudioThreadInfo {
+    thread: Option<thread::JoinHandle<()>>,
+    thread_run: Arc<AtomicBool>,
+    stream_control: Option<Box<dyn StreamControl>>,
 }
 
-impl std::error::Error for CaptureError {}
-
-impl Display for CaptureError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        use self::CaptureError::*;
-
-        match self {
-            ReadingGuestError(e) => write!(f, "Failed to read guest memory: {}.", e),
-            StreamError(e) => write!(f, "Failed to get a buffer from the stream: {}", e),
+impl AudioThreadInfo {
+    fn new() -> Self {
+        Self {
+            thread: None,
+            thread_run: Arc::new(AtomicBool::new(false)),
+            stream_control: None,
         }
     }
 }
 
-type CaptureResult<T> = std::result::Result<T, CaptureError>;
-
 /// `Ac97BusMaster` emulates the bus master portion of AC97. It exposes a register read/write
 /// interface compliant with the ICH bus master.
 pub struct Ac97BusMaster {
@@ -168,15 +155,9 @@
     regs: Arc<Mutex<Ac97BusMasterRegs>>,
     acc_sema: u8,
 
-    // Audio thread for capture stream.
-    audio_thread_pi: Option<thread::JoinHandle<()>>,
-    audio_thread_pi_run: Arc<AtomicBool>,
-    pi_stream_control: Option<Box<dyn StreamControl>>,
-
-    // Audio thread book keeping.
-    audio_thread_po: Option<thread::JoinHandle<()>>,
-    audio_thread_po_run: Arc<AtomicBool>,
-    po_stream_control: Option<Box<dyn StreamControl>>,
+    // Bookkeeping info for the playback and capture streams.
+    po_info: AudioThreadInfo,
+    pi_info: AudioThreadInfo,
 
     // Audio server used to create playback or capture streams.
     audio_server: Box<dyn StreamSource>,
@@ -194,13 +175,8 @@
             regs: Arc::new(Mutex::new(Ac97BusMasterRegs::new())),
             acc_sema: 0,
 
-            audio_thread_pi: None,
-            audio_thread_pi_run: Arc::new(AtomicBool::new(false)),
-            pi_stream_control: None,
-
-            audio_thread_po: None,
-            audio_thread_po_run: Arc::new(AtomicBool::new(false)),
-            po_stream_control: None,
+            po_info: AudioThreadInfo::new(),
+            pi_info: AudioThreadInfo::new(),
 
             audio_server,
 
@@ -229,19 +205,12 @@
                 {
                     // Scope for the lock on thread_regs.
                     let regs = thread_regs.lock();
-                    // Check output irq
+                    // Check output and input irq
                     let po_int_mask = regs.func_regs(Ac97Function::Output).int_mask();
-                    if regs.func_regs(Ac97Function::Output).sr & po_int_mask != 0 {
-                        if let Some(irq_evt) = regs.irq_evt.as_ref() {
-                            if let Err(e) = irq_evt.write(1) {
-                                error!("Failed to set the irq from the resample thread: {}.", e);
-                                break;
-                            }
-                        }
-                    }
-                    // Check input irq
                     let pi_int_mask = regs.func_regs(Ac97Function::Input).int_mask();
-                    if regs.func_regs(Ac97Function::Input).sr & pi_int_mask != 0 {
+                    if regs.func_regs(Ac97Function::Output).sr & po_int_mask != 0
+                        || regs.func_regs(Ac97Function::Input).sr & pi_int_mask != 0
+                    {
                         if let Some(irq_evt) = regs.irq_evt.as_ref() {
                             if let Err(e) = irq_evt.write(1) {
                                 error!("Failed to set the irq from the resample thread: {}.", e);
@@ -257,7 +226,7 @@
     /// Called when `mixer` has been changed and the new values should be applied to currently
     /// active streams.
     pub fn update_mixer_settings(&mut self, mixer: &Ac97Mixer) {
-        if let Some(control) = self.po_stream_control.as_mut() {
+        if let Some(control) = self.po_info.stream_control.as_mut() {
             // The audio server only supports one volume, not separate left and right.
             let (muted, left_volume, _right_volume) = mixer.get_master_volume();
             control.set_volume(left_volume);
@@ -285,9 +254,9 @@
 
         let regs = self.regs.lock();
         match offset {
-            PI_BASE_00...PI_CR_0B => readb_func_regs(&regs.pi_regs, offset - PI_BASE_00),
-            PO_BASE_10...PO_CR_1B => readb_func_regs(&regs.po_regs, offset - PO_BASE_10),
-            MC_BASE_20...MC_CR_2B => readb_func_regs(&regs.mc_regs, offset - MC_BASE_20),
+            PI_BASE_00..=PI_CR_0B => readb_func_regs(&regs.pi_regs, offset - PI_BASE_00),
+            PO_BASE_10..=PO_CR_1B => readb_func_regs(&regs.po_regs, offset - PO_BASE_10),
+            MC_BASE_20..=MC_CR_2B => readb_func_regs(&regs.mc_regs, offset - MC_BASE_20),
             ACC_SEMA_34 => self.acc_sema,
             _ => 0,
         }
@@ -302,7 +271,7 @@
             PO_SR_16 => regs.po_regs.sr,
             PO_PICB_18 => {
                 // PO PICB
-                if !self.audio_thread_po_run.load(Ordering::Relaxed) {
+                if !self.po_info.thread_run.load(Ordering::Relaxed) {
                     // Not running, no need to estimate what has been consumed.
                     regs.po_regs.picb
                 } else {
@@ -417,7 +386,7 @@
             && func_regs.sr & SR_DCH == SR_DCH
             && func_regs.civ != func_regs.lvi
         {
-            func_regs.sr &= !SR_DCH;
+            func_regs.sr &= !(SR_DCH | SR_CELV);
         }
     }
 
@@ -437,9 +406,8 @@
 
     fn set_cr(&mut self, func: Ac97Function, val: u8, mixer: &Ac97Mixer) {
         if val & CR_RR != 0 {
-            self.stop_audio(func);
             let mut regs = self.regs.lock();
-            regs.func_regs_mut(func).do_reset();
+            Self::reset_func_regs(&mut regs, func);
         } else {
             let cr = self.regs.lock().func_regs(func).cr;
             if val & CR_RPBM == 0 {
@@ -457,8 +425,8 @@
                     func_regs.civ = 0;
                     func_regs.sr &= !SR_DCH;
                 }
-                if self.start_audio(func, mixer).is_err() {
-                    warn!("Failed to start audio");
+                if let Err(e) = self.start_audio(func, mixer) {
+                    warn!("Failed to start audio: {}", e);
                 }
             }
             let mut regs = self.regs.lock();
@@ -479,7 +447,9 @@
         if new_glob_cnt & GLOB_CNT_WARM_RESET != 0 {
             // Check if running and if so, ignore. Warm reset is specified to no-op when the device
             // is playing or recording audio.
-            if !self.audio_thread_po_run.load(Ordering::Relaxed) {
+            if !self.po_info.thread_run.load(Ordering::Relaxed)
+                && !self.pi_info.thread_run.load(Ordering::Relaxed)
+            {
                 self.stop_all_audio();
                 let mut regs = self.regs.lock();
                 regs.glob_cnt = new_glob_cnt & !GLOB_CNT_WARM_RESET; // Auto-cleared reset bit.
@@ -490,28 +460,34 @@
     }
 
     fn start_audio(&mut self, func: Ac97Function, mixer: &Ac97Mixer) -> Result<(), Box<dyn Error>> {
-        const AUDIO_THREAD_RTPRIO: u16 = 12; // Matches other cros audio clients.
+        const AUDIO_THREAD_RTPRIO: u16 = 10; // Matches other cros audio clients.
+
+        let thread_info = match func {
+            Ac97Function::Microphone => return Ok(()),
+            Ac97Function::Input => &mut self.pi_info,
+            Ac97Function::Output => &mut self.po_info,
+        };
+
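+        // Size the stream to match the guest's current buffer: sample count from the
+        // buffer descriptor, converted to frames for the 2-channel stream.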
+        let num_channels = 2;
+        let buffer_samples = current_buffer_size(self.regs.lock().func_regs(func), &self.mem)?;
+        let buffer_frames = buffer_samples / num_channels;
+        thread_info.thread_run.store(true, Ordering::Relaxed);
+        let thread_run = thread_info.thread_run.clone();
+        let thread_mem = self.mem.clone();
+        let thread_regs = self.regs.clone();
 
         match func {
             Ac97Function::Input => {
-                let num_channels = 2;
-                let buffer_samples =
-                    current_buffer_size(self.regs.lock().func_regs(func), &self.mem)?;
-                let buffer_frames = buffer_samples / num_channels;
                 let (stream_control, input_stream) = self.audio_server.new_capture_stream(
                     num_channels,
+                    SampleFormat::S16LE,
                     DEVICE_SAMPLE_RATE,
                     buffer_frames,
                 )?;
-                self.pi_stream_control = Some(stream_control);
+                self.pi_info.stream_control = Some(stream_control);
                 self.update_mixer_settings(mixer);
 
-                self.audio_thread_pi_run.store(true, Ordering::Relaxed);
-                let thread_run = self.audio_thread_pi_run.clone();
-                let thread_mem = self.mem.clone();
-                let thread_regs = self.regs.clone();
-
-                self.audio_thread_pi = Some(thread::spawn(move || {
+                self.pi_info.thread = Some(thread::spawn(move || {
                     if set_rt_prio_limit(u64::from(AUDIO_THREAD_RTPRIO)).is_err()
                         || set_rt_round_robin(i32::from(AUDIO_THREAD_RTPRIO)).is_err()
                     {
@@ -526,27 +502,16 @@
                 }));
             }
             Ac97Function::Output => {
-                let num_channels = 2;
-
-                let buffer_samples =
-                    current_buffer_size(self.regs.lock().func_regs(func), &self.mem)?;
-
-                let buffer_frames = buffer_samples / num_channels;
                 let (stream_control, output_stream) = self.audio_server.new_playback_stream(
                     num_channels,
+                    SampleFormat::S16LE,
                     DEVICE_SAMPLE_RATE,
                     buffer_frames,
                 )?;
-                self.po_stream_control = Some(stream_control);
-
+                self.po_info.stream_control = Some(stream_control);
                 self.update_mixer_settings(mixer);
 
-                self.audio_thread_po_run.store(true, Ordering::Relaxed);
-                let thread_run = self.audio_thread_po_run.clone();
-                let thread_mem = self.mem.clone();
-                let thread_regs = self.regs.clone();
-
-                self.audio_thread_po = Some(thread::spawn(move || {
+                self.po_info.thread = Some(thread::spawn(move || {
                     if set_rt_prio_limit(u64::from(AUDIO_THREAD_RTPRIO)).is_err()
                         || set_rt_round_robin(i32::from(AUDIO_THREAD_RTPRIO)).is_err()
                     {
@@ -561,30 +526,22 @@
                 }));
             }
             Ac97Function::Microphone => (),
-        }
+        };
         Ok(())
     }
 
     fn stop_audio(&mut self, func: Ac97Function) {
-        match func {
-            Ac97Function::Input => {
-                self.audio_thread_pi_run.store(false, Ordering::Relaxed);
-                if let Some(thread) = self.audio_thread_pi.take() {
-                    if let Err(e) = thread.join() {
-                        error!("Failed to join the capture thread: {:?}.", e);
-                    }
-                }
-            }
-            Ac97Function::Output => {
-                self.audio_thread_po_run.store(false, Ordering::Relaxed);
-                if let Some(thread) = self.audio_thread_po.take() {
-                    if let Err(e) = thread.join() {
-                        error!("Failed to join the playback thread: {:?}.", e);
-                    }
-                }
-            }
-            Ac97Function::Microphone => (),
+        let thread_info = match func {
+            Ac97Function::Microphone => return,
+            Ac97Function::Input => &mut self.pi_info,
+            Ac97Function::Output => &mut self.po_info,
         };
+        thread_info.thread_run.store(false, Ordering::Relaxed);
+        if let Some(thread) = thread_info.thread.take() {
+            if let Err(e) = thread.join() {
+                error!("Failed to join {:?} thread: {:?}.", func, e);
+            }
+        }
     }
 
     fn stop_all_audio(&mut self) {
@@ -593,12 +550,18 @@
         self.stop_audio(Ac97Function::Microphone);
     }
 
+    // Helper function for resetting function registers.
+    fn reset_func_regs(regs: &mut Ac97BusMasterRegs, func: Ac97Function) {
+        regs.func_regs_mut(func).do_reset();
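+        // do_reset() no longer touches SR; set the DMA-halted bit through update_sr()
+        // so the interrupt state stays consistent.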
+        update_sr(regs, func, SR_DCH);
+    }
+
     fn reset_audio_regs(&mut self) {
         self.stop_all_audio();
         let mut regs = self.regs.lock();
-        regs.pi_regs.do_reset();
-        regs.po_regs.do_reset();
-        regs.mc_regs.do_reset();
+        Self::reset_func_regs(&mut regs, Ac97Function::Input);
+        Self::reset_func_regs(&mut regs, Ac97Function::Output);
+        Self::reset_func_regs(&mut regs, Ac97Function::Microphone);
     }
 }
 
@@ -642,7 +605,7 @@
     regs: &mut Ac97BusMasterRegs,
     mem: &GuestMemory,
     out_buffer: &mut PlaybackBuffer,
-) -> PlaybackResult<()> {
+) -> AudioResult<()> {
     // If the current buffer had any samples in it, mark it as done.
     if regs.func_regs_mut(Ac97Function::Output).picb > 0 {
         buffer_completed(regs, mem, Ac97Function::Output)?
@@ -655,12 +618,13 @@
         let zeros = vec![0u8; buffer_len as usize];
         out_buffer
             .write(&zeros)
-            .map_err(PlaybackError::WritingOutput)?;
+            .map_err(AudioError::WritingOutput)?;
     }
     Ok(())
 }
 
-// Moves to the next buffer for the given function and registers.
+// Marks the current buffer completed and moves to the next buffer for the given
+// function and registers.
 fn buffer_completed(
     regs: &mut Ac97BusMasterRegs,
     mem: &GuestMemory,
@@ -673,8 +637,7 @@
         .read_obj_from_addr(GuestAddress(u64::from(descriptor_addr) + 4))
         .map_err(GuestMemoryError::ReadingGuestBufferAddress)?;
 
-    let mut new_sr = regs.func_regs(func).sr;
-
+    let mut new_sr = regs.func_regs(func).sr & !SR_CELV;
     if control_reg & BD_IOC != 0 {
         new_sr |= SR_BCIS;
     }
@@ -685,16 +648,15 @@
     if civ == lvi {
         new_sr |= SR_DCH | SR_CELV | SR_LVBCI;
     } else {
-        let func_regs = regs.func_regs_mut(func);
-        func_regs.civ = func_regs.piv;
-        func_regs.piv = (func_regs.piv + 1) % 32; // move piv to the next buffer.
+        regs.func_regs_mut(func).move_to_next_buffer();
     }
 
-    if new_sr != regs.func_regs(func).sr {
-        update_sr(regs, func, new_sr);
-    }
+    update_sr(regs, func, new_sr);
 
-    regs.po_pointer_update_time = Instant::now();
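+    // Reload PICB with the sample count of the new current buffer.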
+    regs.func_regs_mut(func).picb = current_buffer_size(regs.func_regs(func), &mem)? as u16;
+    if func == Ac97Function::Output {
+        regs.po_pointer_update_time = Instant::now();
+    }
 
     Ok(())
 }
@@ -705,11 +667,11 @@
     mem: GuestMemory,
     thread_run: &AtomicBool,
     mut output_stream: Box<dyn PlaybackBufferStream>,
-) -> PlaybackResult<()> {
+) -> AudioResult<()> {
     while thread_run.load(Ordering::Relaxed) {
         output_stream
             .next_playback_buffer()
-            .map_err(PlaybackError::StreamError)
+            .map_err(AudioError::StreamError)
             .and_then(|mut pb_buf| play_buffer(&mut regs.lock(), &mem, &mut pb_buf))?;
     }
     Ok(())
@@ -720,7 +682,7 @@
     regs: &mut Ac97BusMasterRegs,
     mem: &GuestMemory,
     in_buffer: &mut CaptureBuffer,
-) -> CaptureResult<()> {
+) -> AudioResult<()> {
     // If the current buffer had any samples in it, mark it as done.
     if regs.func_regs_mut(Ac97Function::Input).picb > 0 {
         buffer_completed(regs, mem, Ac97Function::Input)?
@@ -738,11 +700,11 @@
     mem: GuestMemory,
     thread_run: &AtomicBool,
     mut input_stream: Box<dyn CaptureBufferStream>,
-) -> CaptureResult<()> {
+) -> AudioResult<()> {
     while thread_run.load(Ordering::Relaxed) {
         input_stream
             .next_capture_buffer()
-            .map_err(CaptureError::StreamError)
+            .map_err(AudioError::StreamError)
             .and_then(|mut cp_buf| capture_buffer(&mut regs.lock(), &mem, &mut cp_buf))?;
     }
     Ok(())
@@ -760,14 +722,17 @@
 
     {
         let func_regs = regs.func_regs_mut(func);
+        let old_sr = func_regs.sr;
         func_regs.sr = val;
-        if val & SR_INT_MASK != 0 {
+        if (old_sr ^ val) & SR_INT_MASK != 0 {
             if (val & SR_LVBCI) != 0 && (func_regs.cr & CR_LVBIE) != 0 {
                 interrupt_high = true;
             }
             if (val & SR_BCIS) != 0 && (func_regs.cr & CR_IOCE) != 0 {
                 interrupt_high = true;
             }
+        } else {
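+            // None of the interrupt-relevant status bits changed; leave the IRQ line as-is.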
+            return;
         }
     }
 
@@ -779,12 +744,6 @@
         }
     } else {
         regs.glob_sta &= !int_mask;
-        if regs.glob_sta & (GS_PIINT | GS_POINT | GS_MINT) == 0 {
-            if let Some(irq_evt) = regs.irq_evt.as_ref() {
-                // Ignore write failure, nothing can be done about it from here.
-                let _ = irq_evt.write(0);
-            }
-        }
     }
 }
 
@@ -914,7 +873,7 @@
         bm.writeb(PO_LVI_15, LVI_MASK, &mixer);
 
         // Start.
-        bm.writeb(PO_CR_1B, CR_RPBM, &mixer);
+        bm.writeb(PO_CR_1B, CR_IOCE | CR_RPBM, &mixer);
 
         std::thread::sleep(time::Duration::from_millis(50));
         let picb = bm.readw(PO_PICB_18);
@@ -951,13 +910,19 @@
         std::thread::sleep(time::Duration::from_millis(500));
         assert!(bm.readw(PO_SR_16) & SR_LVBCI != 0); // Hit last buffer
         assert!(bm.readw(PO_SR_16) & SR_DCH == SR_DCH); // DMA stopped because of lack of buffers.
+        assert_eq!(bm.readw(PO_SR_16) & SR_CELV, SR_CELV);
         assert_eq!(bm.readb(PO_LVI_15), bm.readb(PO_CIV_14));
+        assert!(
+            bm.readl(GLOB_STA_30) & GS_POINT != 0,
+            "POINT bit should be set."
+        );
         // Clear the LVB bit
         bm.writeb(PO_SR_16, SR_LVBCI as u8, &mixer);
         assert!(bm.readw(PO_SR_16) & SR_LVBCI == 0);
         // Reset the LVI to the last buffer and check that playback resumes
         bm.writeb(PO_LVI_15, LVI_MASK, &mixer);
         assert!(bm.readw(PO_SR_16) & SR_DCH == 0); // DMA restarts.
+        assert_eq!(bm.readw(PO_SR_16) & SR_CELV, 0);
 
         let (restart_civ, restart_picb) = (bm.readb(PO_CIV_14), bm.readw(PO_PICB_18));
         std::thread::sleep(time::Duration::from_millis(20));
@@ -966,6 +931,11 @@
         // Stop.
         bm.writeb(PO_CR_1B, 0, &mixer);
         assert!(bm.readw(PO_SR_16) & 0x01 != 0); // DMA is not running.
+        bm.writeb(PO_CR_1B, CR_RR, &mixer);
+        assert!(
+            bm.readl(GLOB_STA_30) & GS_POINT == 0,
+            "POINT bit should be disabled."
+        );
     }
 
     #[test]
@@ -999,7 +969,7 @@
         bm.writeb(PI_LVI_05, LVI_MASK, &mixer);
 
         // Start.
-        bm.writeb(PI_CR_0B, CR_RPBM, &mixer);
+        bm.writeb(PI_CR_0B, CR_IOCE | CR_RPBM, &mixer);
         assert_eq!(bm.readw(PI_PICB_08), 0);
 
         std::thread::sleep(time::Duration::from_millis(50));
@@ -1023,7 +993,12 @@
         std::thread::sleep(time::Duration::from_millis(5000));
         assert_ne!(bm.readw(PI_SR_06) & SR_LVBCI, 0); // Hit last buffer
         assert_eq!(bm.readw(PI_SR_06) & SR_DCH, SR_DCH); // DMA stopped because of lack of buffers.
+        assert_eq!(bm.readw(PI_SR_06) & SR_CELV, SR_CELV);
         assert_eq!(bm.readb(PI_LVI_05), bm.readb(PI_CIV_04));
+        assert!(
+            bm.readl(GLOB_STA_30) & GS_PIINT != 0,
+            "PIINT bit should be set."
+        );
 
         // Clear the LVB bit
         bm.writeb(PI_SR_06, SR_LVBCI as u8, &mixer);
@@ -1031,6 +1006,7 @@
         // Reset the LVI to the last buffer and check that playback resumes
         bm.writeb(PI_LVI_05, LVI_MASK, &mixer);
         assert!(bm.readw(PI_SR_06) & SR_DCH == 0); // DMA restarts.
+        assert_eq!(bm.readw(PI_SR_06) & SR_CELV, 0);
 
         let restart_civ = bm.readb(PI_CIV_04);
         std::thread::sleep(time::Duration::from_millis(200));
@@ -1039,5 +1015,10 @@
         // Stop.
         bm.writeb(PI_CR_0B, 0, &mixer);
         assert!(bm.readw(PI_SR_06) & 0x01 != 0); // DMA is not running.
+        bm.writeb(PI_CR_0B, CR_RR, &mixer);
+        assert!(
+            bm.readl(GLOB_STA_30) & GS_PIINT == 0,
+            "PIINT bit should be disabled."
+        );
     }
 }
diff --git a/devices/src/pci/ac97_regs.rs b/devices/src/pci/ac97_regs.rs
index bcca05b..20b35ec 100644
--- a/devices/src/pci/ac97_regs.rs
+++ b/devices/src/pci/ac97_regs.rs
@@ -183,7 +183,7 @@
 pub const BD_IOC: u32 = 1 << 31;
 
 /// The functions that are supported by the Ac97 subsystem.
-#[derive(Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq)]
 pub enum Ac97Function {
     Input,
     Output,
@@ -215,12 +215,11 @@
         regs
     }
 
-    /// Reset all the registers to the PoR defaults.
+    /// Reset all the registers to the PoR defaults. `sr` should be updated by `update_sr`.
     pub fn do_reset(&mut self) {
         self.bdbar = 0;
         self.civ = 0;
         self.lvi = 0;
-        self.sr = SR_DCH;
         self.picb = 0;
         self.piv = 0;
         self.cr &= CR_DONT_CLEAR_MASK;
@@ -244,4 +243,11 @@
         }
         int_mask
     }
+
+    /// Sets the current buffer to the next buffer by updating CIV to PIV, and
+    /// updates related fields.
+    pub fn move_to_next_buffer(&mut self) {
+        self.civ = self.piv;
+        self.piv = (self.piv + 1) % 32; // move piv to the next buffer.
+    }
 }
diff --git a/devices/src/pci/mod.rs b/devices/src/pci/mod.rs
index 791161a..a9e8749 100644
--- a/devices/src/pci/mod.rs
+++ b/devices/src/pci/mod.rs
@@ -8,11 +8,14 @@
 mod ac97_bus_master;
 mod ac97_mixer;
 mod ac97_regs;
+mod msix;
 mod pci_configuration;
 mod pci_device;
 mod pci_root;
+mod vfio_pci;
 
 pub use self::ac97::Ac97Dev;
+pub use self::msix::{MsixCap, MsixConfig};
 pub use self::pci_configuration::{
     PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityID,
     PciClassCode, PciConfiguration, PciHeaderType, PciProgrammingInterface, PciSerialBusSubClass,
@@ -21,6 +24,7 @@
 pub use self::pci_device::Error as PciDeviceError;
 pub use self::pci_device::PciDevice;
 pub use self::pci_root::{PciConfigIo, PciConfigMmio, PciRoot};
+pub use self::vfio_pci::VfioPciDevice;
 
 /// PCI has four interrupt pins A->D.
 #[derive(Copy, Clone)]
diff --git a/devices/src/pci/msix.rs b/devices/src/pci/msix.rs
new file mode 100644
index 0000000..0b9f391
--- /dev/null
+++ b/devices/src/pci/msix.rs
@@ -0,0 +1,496 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use crate::pci::{PciCapability, PciCapabilityID};
+use msg_socket::{MsgReceiver, MsgSender};
+use std::convert::TryInto;
+use std::os::unix::io::{AsRawFd, RawFd};
+use std::sync::Arc;
+use sys_util::{error, EventFd};
+use vm_control::{MaybeOwnedFd, VmIrqRequest, VmIrqRequestSocket, VmIrqResponse};
+
+use data_model::DataInit;
+
+const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048;
+const MSIX_TABLE_ENTRIES_MODULO: u64 = 16;
+const MSIX_PBA_ENTRIES_MODULO: u64 = 8;
+const BITS_PER_PBA_ENTRY: usize = 64;
+const FUNCTION_MASK_BIT: u16 = 0x4000;
+const MSIX_ENABLE_BIT: u16 = 0x8000;
+
+#[derive(Clone)]
+struct MsixTableEntry {
+    msg_addr_lo: u32,
+    msg_addr_hi: u32,
+    msg_data: u32,
+    vector_ctl: u32,
+}
+
+impl MsixTableEntry {
+    fn masked(&self) -> bool {
+        self.vector_ctl & 0x1 == 0x1
+    }
+}
+
+impl Default for MsixTableEntry {
+    fn default() -> Self {
+        MsixTableEntry {
+            msg_addr_lo: 0,
+            msg_addr_hi: 0,
+            msg_data: 0,
+            vector_ctl: 0,
+        }
+    }
+}
+
+struct IrqfdGsi {
+    irqfd: EventFd,
+    gsi: u32,
+}
+
+/// Wrapper over MSI-X Capability Structure and MSI-X Tables
+pub struct MsixConfig {
+    table_entries: Vec<MsixTableEntry>,
+    pba_entries: Vec<u64>,
+    irq_vec: Vec<IrqfdGsi>,
+    masked: bool,
+    enabled: bool,
+    msi_device_socket: Arc<VmIrqRequestSocket>,
+    msix_num: u16,
+}
+
+impl MsixConfig {
+    pub fn new(msix_vectors: u16, vm_socket: Arc<VmIrqRequestSocket>) -> Self {
+        assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE);
+
+        let mut table_entries: Vec<MsixTableEntry> = Vec::new();
+        table_entries.resize_with(msix_vectors as usize, Default::default);
+        let mut pba_entries: Vec<u64> = Vec::new();
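+        // Each 64-bit PBA entry tracks the pending bits for up to 64 vectors.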
+        let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1;
+        pba_entries.resize_with(num_pba_entries, Default::default);
+
+        MsixConfig {
+            table_entries,
+            pba_entries,
+            irq_vec: Vec::new(),
+            masked: false,
+            enabled: false,
+            msi_device_socket: vm_socket,
+            msix_num: msix_vectors,
+        }
+    }
+
+    /// Get the number of MSI-X vectors in this configuration.
+    pub fn num_vectors(&self) -> u16 {
+        self.msix_num
+    }
+
+    /// Check whether the Function Mask bit in the Message Control word is set.
+    /// If 1, all of the vectors associated with the function are masked,
+    /// regardless of their per-vector Mask bit states.
+    /// If 0, each vector's Mask bit determines whether the vector is masked or not.
+    pub fn masked(&self) -> bool {
+        self.masked
+    }
+
+    /// Check whether the MSI-X Enable bit in the Message Control word is set.
+    /// If 1, the function is permitted to use MSI-X to request service.
+    pub fn enabled(&self) -> bool {
+        self.enabled
+    }
+
+    /// Read the MSI-X Capability Structure.
+    /// The top 2 bits in Message Control word are emulated and all other
+    /// bits are read only.
+    pub fn read_msix_capability(&self, data: u32) -> u32 {
+        let mut msg_ctl = (data >> 16) as u16;
+        msg_ctl &= !(MSIX_ENABLE_BIT | FUNCTION_MASK_BIT);
+
+        if self.enabled {
+            msg_ctl |= MSIX_ENABLE_BIT;
+        }
+        if self.masked {
+            msg_ctl |= FUNCTION_MASK_BIT;
+        }
+        (msg_ctl as u32) << 16 | (data & u16::max_value() as u32)
+    }
+
+    /// Write to the MSI-X Capability Structure.
+    /// Only the top 2 bits in Message Control Word are writable.
+    pub fn write_msix_capability(&mut self, offset: u64, data: &[u8]) {
+        if offset == 2 && data.len() == 2 {
+            let reg = u16::from_le_bytes([data[0], data[1]]);
+            let old_masked = self.masked;
+            let old_enabled = self.enabled;
+
+            self.masked = (reg & FUNCTION_MASK_BIT) == FUNCTION_MASK_BIT;
+            self.enabled = (reg & MSIX_ENABLE_BIT) == MSIX_ENABLE_BIT;
+
+            if !old_enabled && self.enabled {
+                self.msix_enable();
+            }
+
+            // If the Function Mask bit was set, and has just been cleared, it's
+            // important to go through the entire PBA to check if there was any
+            // pending MSI-X message to inject, given that the vector is not
+            // masked.
+            if old_masked && !self.masked {
+                for (index, entry) in self.table_entries.clone().iter().enumerate() {
+                    if !entry.masked() && self.get_pba_bit(index as u16) == 1 {
+                        self.inject_msix_and_clear_pba(index);
+                    }
+                }
+            }
+        } else {
+            error!(
+                "invalid write to MSI-X Capability Structure offset {:x}",
+                offset
+            );
+        }
+    }
+
+    fn add_msi_route(&self, index: u16, gsi: u32) {
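+        // Pull the 64-bit message address and 32-bit message data for this vector out of
+        // its 16-byte MSI-X table entry.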
+        let mut data: [u8; 8] = [0, 0, 0, 0, 0, 0, 0, 0];
+        self.read_msix_table((index * 16).into(), data.as_mut());
+        let msi_address: u64 = u64::from_le_bytes(data);
+        let mut data: [u8; 4] = [0, 0, 0, 0];
+        self.read_msix_table((index * 16 + 8).into(), data.as_mut());
+        let msi_data: u32 = u32::from_le_bytes(data);
+
+        if msi_address == 0 {
+            return;
+        }
+
+        if let Err(e) = self.msi_device_socket.send(&VmIrqRequest::AddMsiRoute {
+            gsi,
+            msi_address,
+            msi_data,
+        }) {
+            error!("failed to send AddMsiRoute request: {:?}", e);
+            return;
+        }
+        if self.msi_device_socket.recv().is_err() {
+            error!("Faied to receive AddMsiRoute Response");
+        }
+    }
+
+    fn msix_enable(&mut self) {
+        self.irq_vec.clear();
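+        // Allocate an irqfd/GSI pair from the VMM for every vector and program its MSI route.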
+        for i in 0..self.msix_num {
+            let irqfd = EventFd::new().unwrap();
+            if let Err(e) = self.msi_device_socket.send(&VmIrqRequest::AllocateOneMsi {
+                irqfd: MaybeOwnedFd::Borrowed(irqfd.as_raw_fd()),
+            }) {
+                error!("failed to send AllocateOneMsi request: {:?}", e);
+                continue;
+            }
+            let irq_num: u32;
+            match self.msi_device_socket.recv() {
+                Ok(VmIrqResponse::AllocateOneMsi { gsi }) => irq_num = gsi,
+                _ => continue,
+            }
+            self.irq_vec.push(IrqfdGsi {
+                irqfd,
+                gsi: irq_num,
+            });
+
+            self.add_msi_route(i, irq_num);
+        }
+    }
+
+    /// Read MSI-X table
+    ///  # Arguments
+    ///  * 'offset' - the offset within the MSI-X Table
+    ///  * 'data' - used to store the read results
+    ///
+    /// For all accesses to MSI-X Table and MSI-X PBA fields, software must use aligned full
+    /// DWORD or aligned full QWORD transactions; otherwise, the result is undefined.
+    ///
+    ///   location: DWORD3            DWORD2      DWORD1            DWORD0
+    ///   entry 0:  Vector Control    Msg Data    Msg Upper Addr    Msg Addr
+    ///   entry 1:  Vector Control    Msg Data    Msg Upper Addr    Msg Addr
+    ///   entry 2:  Vector Control    Msg Data    Msg Upper Addr    Msg Addr
+    ///   ...
+    pub fn read_msix_table(&self, offset: u64, data: &mut [u8]) {
+        let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize;
+        let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO;
+
+        match data.len() {
+            4 => {
+                let value = match modulo_offset {
+                    0x0 => self.table_entries[index].msg_addr_lo,
+                    0x4 => self.table_entries[index].msg_addr_hi,
+                    0x8 => self.table_entries[index].msg_data,
+                    0xc => self.table_entries[index].vector_ctl,
+                    _ => {
+                        error!("invalid offset");
+                        0
+                    }
+                };
+
+                data.copy_from_slice(&value.to_le_bytes());
+            }
+            8 => {
+                let value = match modulo_offset {
+                    0x0 => {
+                        (u64::from(self.table_entries[index].msg_addr_hi) << 32)
+                            | u64::from(self.table_entries[index].msg_addr_lo)
+                    }
+                    0x8 => {
+                        (u64::from(self.table_entries[index].vector_ctl) << 32)
+                            | u64::from(self.table_entries[index].msg_data)
+                    }
+                    _ => {
+                        error!("invalid offset");
+                        0
+                    }
+                };
+
+                data.copy_from_slice(&value.to_le_bytes());
+            }
+            _ => error!("invalid data length"),
+        };
+    }
+
+    /// Write to MSI-X table
+    ///
+    /// Message Address: the contents of this field specifies the address
+    ///     for the memory write transaction; different MSI-X vectors have
+    ///     different Message Address values
+    /// Message Data: the contents of this field specifies the data driven
+    ///     on AD[31::00] during the memory write transaction's data phase.
+    /// Vector Control: only bit 0 (Mask Bit) is not reserved: when this bit
+    ///     is set, the function is prohibited from sending a message using
+    ///     this MSI-X Table entry.
+    pub fn write_msix_table(&mut self, offset: u64, data: &[u8]) {
+        let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize;
+        let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO;
+
+        // Store the value of the entry before modification
+        let old_entry = self.table_entries[index].clone();
+
+        match data.len() {
+            4 => {
+                let value = u32::from_le_bytes(data.try_into().unwrap());
+                match modulo_offset {
+                    0x0 => self.table_entries[index].msg_addr_lo = value,
+                    0x4 => self.table_entries[index].msg_addr_hi = value,
+                    0x8 => self.table_entries[index].msg_data = value,
+                    0xc => self.table_entries[index].vector_ctl = value,
+                    _ => error!("invalid offset"),
+                };
+            }
+            8 => {
+                let value = u64::from_le_bytes(data.try_into().unwrap());
+                match modulo_offset {
+                    0x0 => {
+                        self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32;
+                        self.table_entries[index].msg_addr_hi = (value >> 32) as u32;
+                    }
+                    0x8 => {
+                        self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32;
+                        self.table_entries[index].vector_ctl = (value >> 32) as u32;
+                    }
+                    _ => error!("invalid offset"),
+                };
+            }
+            _ => error!("invalid data length"),
+        };
+
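+        // If MSI-X is enabled and the address or data changed, reprogram this vector's route.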
+        let new_entry = self.table_entries[index].clone();
+        if self.enabled()
+            && (old_entry.msg_addr_lo != new_entry.msg_addr_lo
+                || old_entry.msg_addr_hi != new_entry.msg_addr_hi
+                || old_entry.msg_data != new_entry.msg_data)
+        {
+            let irq_num = self.irq_vec[index].gsi;
+            self.add_msi_route(index as u16, irq_num);
+        }
+
+        // After the MSI-X table entry has been updated, it is necessary to
+        // check if the vector control masking bit has changed. In case the
+        // bit has been flipped from 1 to 0, we need to inject a MSI message
+        // if the corresponding pending bit from the PBA is set. Once the MSI
+        // has been injected, the pending bit in the PBA needs to be cleared.
+        // All of this is valid only if MSI-X has not been masked for the whole
+        // device.
+
+        // Check if bit has been flipped
+        if !self.masked()
+            && old_entry.masked()
+            && !self.table_entries[index].masked()
+            && self.get_pba_bit(index as u16) == 1
+        {
+            self.inject_msix_and_clear_pba(index);
+        }
+    }
+
+    /// Read PBA Entries
+    ///  # Arguments
+    ///  * 'offset' - the offset within the PBA entries
+    ///  * 'data' - used to store the read results
+    ///
+    /// Pending Bits[63::00]: For each Pending Bit that is set, the function
+    /// has a pending message for the associated MSI-X Table entry.
+    pub fn read_pba_entries(&self, offset: u64, data: &mut [u8]) {
+        let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize;
+        let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO;
+
+        match data.len() {
+            4 => {
+                let value: u32 = match modulo_offset {
+                    0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32,
+                    0x4 => (self.pba_entries[index] >> 32) as u32,
+                    _ => {
+                        error!("invalid offset");
+                        0
+                    }
+                };
+
+                data.copy_from_slice(&value.to_le_bytes());
+            }
+            8 => {
+                let value: u64 = match modulo_offset {
+                    0x0 => self.pba_entries[index],
+                    _ => {
+                        error!("invalid offset");
+                        0
+                    }
+                };
+
+                data.copy_from_slice(&value.to_le_bytes());
+            }
+            _ => error!("invalid data length"),
+        }
+    }
+
+    /// Write to PBA Entries
+    ///
+    /// Software should never write, and should only read Pending Bits.
+    /// If software writes to Pending Bits, the result is undefined.
+    pub fn write_pba_entries(&mut self, _offset: u64, _data: &[u8]) {
+        error!("Pending Bit Array is read only");
+    }
+
+    fn set_pba_bit(&mut self, vector: u16, set: bool) {
+        assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE);
+
+        let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY;
+        let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY;
+        let mut mask: u64 = (1 << shift) as u64;
+
+        if set {
+            self.pba_entries[index] |= mask;
+        } else {
+            mask = !mask;
+            self.pba_entries[index] &= mask;
+        }
+    }
+
+    fn get_pba_bit(&self, vector: u16) -> u8 {
+        assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE);
+
+        let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY;
+        let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY;
+
+        ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8
+    }
+
+    fn inject_msix_and_clear_pba(&mut self, vector: usize) {
+        if let Some(irq) = self.irq_vec.get(vector) {
+            irq.irqfd.write(1).unwrap();
+        }
+
+        // Clear the bit from PBA
+        self.set_pba_bit(vector as u16, false);
+    }
+
+    /// Inject virtual interrupt to the guest
+    ///
+    ///  # Arguments
+    ///  * 'vector' - the index to the MSI-X Table entry
+    ///
+    /// PCI Spec 3.0 6.8.3.5: while a vector is masked, the function is
+    /// prohibited from sending the associated message, and the function
+    /// must set the associated Pending bit whenever the function would
+    /// otherwise send the message. When software unmasks a vector whose
+    /// associated Pending bit is set, the function must schedule sending
+    /// the associated message, and clear the Pending bit as soon as the
+    /// message has been sent.
+    ///
+    /// If the vector is unmasked, write to the irqfd, which wakes up KVM to
+    /// inject a virtual interrupt into the guest.
+    pub fn trigger(&mut self, vector: u16) {
+        if self.table_entries[vector as usize].masked() || self.masked() {
+            self.set_pba_bit(vector, true);
+        } else if let Some(irq) = self.irq_vec.get(vector as usize) {
+            irq.irqfd.write(1).unwrap();
+        }
+    }
+
+    /// Return the raw fd of the MSI device socket
+    pub fn get_msi_socket(&self) -> RawFd {
+        self.msi_device_socket.as_ref().as_raw_fd()
+    }
+}
+
+// It is safe to implement DataInit; all members are simple numbers and any value is valid.
+unsafe impl DataInit for MsixCap {}
+
+#[allow(dead_code)]
+#[repr(C)]
+#[derive(Clone, Copy, Default)]
+/// MSI-X Capability Structure
+pub struct MsixCap {
+    // To make add_capability() happy
+    _cap_vndr: u8,
+    _cap_next: u8,
+    // Message Control Register
+    //   10-0:  MSI-X Table size
+    //   13-11: Reserved
+    //   14:    Mask. Mask all MSI-X when set.
+    //   15:    Enable. Enable all MSI-X when set.
+    msg_ctl: u16,
+    // Table. Contains the offset and the BAR indicator (BIR)
+    //   2-0:  Table BAR indicator (BIR). Can be 0 to 5.
+    //   31-3: Table offset in the BAR pointed by the BIR.
+    table: u32,
+    // Pending Bit Array. Contains the offset and the BAR indicator (BIR)
+    //   2-0:  PBA BAR indicator (BIR). Can be 0 to 5.
+    //   31-3: PBA offset in the BAR pointed by the BIR.
+    pba: u32,
+}
+
+impl PciCapability for MsixCap {
+    fn bytes(&self) -> &[u8] {
+        self.as_slice()
+    }
+
+    fn id(&self) -> PciCapabilityID {
+        PciCapabilityID::MSIX
+    }
+}
+
+impl MsixCap {
+    pub fn new(
+        table_pci_bar: u8,
+        table_size: u16,
+        table_off: u32,
+        pba_pci_bar: u8,
+        pba_off: u32,
+    ) -> Self {
+        assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE);
+
+        // Set the table size and enable MSI-X.
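+        // The Table Size field is encoded as N - 1, so subtract one from table_size.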
+        let msg_ctl: u16 = MSIX_ENABLE_BIT + table_size - 1;
+
+        MsixCap {
+            _cap_vndr: 0,
+            _cap_next: 0,
+            msg_ctl,
+            table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8),
+            pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8),
+        }
+    }
+}
diff --git a/devices/src/pci/pci_configuration.rs b/devices/src/pci/pci_configuration.rs
index ab969f7..ebfa2a6 100644
--- a/devices/src/pci/pci_configuration.rs
+++ b/devices/src/pci/pci_configuration.rs
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+use std::convert::TryInto;
 use std::fmt::{self, Display};
 
 use crate::pci::PciInterruptPin;
@@ -23,6 +24,7 @@
 const INTERRUPT_LINE_PIN_REG: usize = 15;
 
 /// Represents the types of PCI headers allowed in the configuration registers.
+#[allow(dead_code)]
 #[derive(Copy, Clone)]
 pub enum PciHeaderType {
     Device,
@@ -175,7 +177,7 @@
 }
 
 /// See pci_regs.h in kernel
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, Debug, PartialEq)]
 pub enum PciBarRegionType {
     Memory32BitRegion = 0,
     IORegion = 0x01,
@@ -200,6 +202,7 @@
 #[derive(Debug)]
 pub enum Error {
     BarAddressInvalid(u64, u64),
+    BarAlignmentInvalid(u64, u64),
     BarInUse(usize),
     BarInUse64(usize),
     BarInvalid(usize),
@@ -218,6 +221,7 @@
         use self::Error::*;
         match self {
             BarAddressInvalid(a, s) => write!(f, "address {} size {} too big", a, s),
+            BarAlignmentInvalid(a, s) => write!(f, "address {} is not aligned to size {}", a, s),
             BarInUse(b) => write!(f, "bar {} already used", b),
             BarInUse64(b) => write!(f, "64bit bar {} already used(requires two regs)", b),
             BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1),
@@ -286,22 +290,42 @@
         *(self.registers.get(reg_idx).unwrap_or(&0xffff_ffff))
     }
 
-    /// Writes a 32bit register to `reg_idx` in the register map.
-    pub fn write_reg(&mut self, reg_idx: usize, value: u32) {
+    /// Writes data to PciConfiguration.registers.
+    /// `reg_idx` - index into PciConfiguration.registers.
+    /// `offset`  - PciConfiguration.registers is in units of DWords; `offset` is the byte
+    ///             offset within the DWord.
+    /// `data`    - The data to write.
+    pub fn write_reg(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
+        let reg_offset = reg_idx * 4 + offset as usize;
+        match data.len() {
+            1 => self.write_byte(reg_offset, data[0]),
+            2 => self.write_word(reg_offset, u16::from_le_bytes(data.try_into().unwrap())),
+            4 => self.write_dword(reg_offset, u32::from_le_bytes(data.try_into().unwrap())),
+            _ => (),
+        }
+    }
+
+    /// Writes a 32bit dword to `offset`. `offset` must be 32bit aligned.
+    fn write_dword(&mut self, offset: usize, value: u32) {
+        if offset % 4 != 0 {
+            warn!("bad PCI config dword write offset {}", offset);
+            return;
+        }
+        let reg_idx = offset / 4;
         if let Some(r) = self.registers.get_mut(reg_idx) {
-            *r = value & self.writable_bits[reg_idx];
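+            // Preserve the read-only bits and update only the writable ones.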
+            *r = (*r & !self.writable_bits[reg_idx]) | (value & self.writable_bits[reg_idx]);
         } else {
-            warn!("bad PCI register write {}", reg_idx);
+            warn!("bad PCI dword write {}", offset);
         }
     }
 
     /// Writes a 16bit word to `offset`. `offset` must be 16bit aligned.
-    pub fn write_word(&mut self, offset: usize, value: u16) {
+    fn write_word(&mut self, offset: usize, value: u16) {
         let shift = match offset % 4 {
             0 => 0,
             2 => 16,
             _ => {
-                warn!("bad PCI config write offset {}", offset);
+                warn!("bad PCI config word write offset {}", offset);
                 return;
             }
         };
@@ -313,12 +337,12 @@
             let shifted_value = (u32::from(value) << shift) & writable_mask;
             *r = *r & !mask | shifted_value;
         } else {
-            warn!("bad PCI config write offset {}", offset);
+            warn!("bad PCI config word write offset {}", offset);
         }
     }
 
     /// Writes a byte to `offset`.
-    pub fn write_byte(&mut self, offset: usize, value: u8) {
+    fn write_byte(&mut self, offset: usize, value: u8) {
         self.write_byte_internal(offset, value, true);
     }
 
@@ -337,7 +361,7 @@
             let shifted_value = (u32::from(value) << shift) & writable_mask;
             *r = *r & !mask | shifted_value;
         } else {
-            warn!("bad PCI config write offset {}", offset);
+            warn!("bad PCI config byte write offset {}", offset);
         }
     }
 
@@ -358,6 +382,10 @@
             return Err(Error::BarInvalid(config.reg_idx));
         }
 
+        if config.addr % config.size != 0 {
+            return Err(Error::BarAlignmentInvalid(config.addr, config.size));
+        }
+
         let bar_idx = BAR0_REG + config.reg_idx;
         let end_addr = config
             .addr
@@ -383,7 +411,7 @@
                 }
 
                 self.registers[bar_idx + 1] = (config.addr >> 32) as u32;
-                self.writable_bits[bar_idx + 1] = !((config.size >> 32).wrapping_sub(1)) as u32;
+                self.writable_bits[bar_idx + 1] = !((config.size - 1) >> 32) as u32;
                 self.bar_used[config.reg_idx + 1] = true;
             }
         }
@@ -402,11 +430,38 @@
         Ok(config.reg_idx)
     }
 
+    /// Returns the type of the given BAR region.
+    pub fn get_bar_type(&self, bar_num: usize) -> Option<PciBarRegionType> {
+        let reg_idx = BAR0_REG + bar_num;
+        let reg_value = self.registers.get(reg_idx)?;
+
+        match (reg_value & 1, (reg_value >> 1u32) & 3) {
+            (1, _) => Some(PciBarRegionType::IORegion),
+            (0, 0b00) => Some(PciBarRegionType::Memory32BitRegion),
+            (0, 0b10) => Some(PciBarRegionType::Memory64BitRegion),
+            _ => None,
+        }
+    }
+
     /// Returns the address of the given BAR region.
-    pub fn get_bar_addr(&self, bar_num: usize) -> u32 {
+    pub fn get_bar_addr(&self, bar_num: usize) -> u64 {
         let bar_idx = BAR0_REG + bar_num;
 
-        self.registers[bar_idx] & BAR_MEM_ADDR_MASK
+        let bar_type = match self.get_bar_type(bar_num) {
+            Some(t) => t,
+            None => return 0,
+        };
+
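+        // For 64-bit memory BARs, the upper 32 bits live in the following BAR register.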
+        match bar_type {
+            PciBarRegionType::IORegion => u64::from(self.registers[bar_idx] & BAR_IO_ADDR_MASK),
+            PciBarRegionType::Memory32BitRegion => {
+                u64::from(self.registers[bar_idx] & BAR_MEM_ADDR_MASK)
+            }
+            PciBarRegionType::Memory64BitRegion => {
+                u64::from(self.registers[bar_idx] & BAR_MEM_ADDR_MASK)
+                    | u64::from(self.registers[bar_idx + 1]) << 32
+            }
+        }
     }
 
     /// Configures the IRQ line and pin used by this device.
@@ -620,4 +675,117 @@
         assert_eq!(subclass, 0x01);
         assert_eq!(prog_if, 0x5a);
     }
+
+    #[test]
+    fn read_only_bits() {
+        let mut cfg = PciConfiguration::new(
+            0x1234,
+            0x5678,
+            PciClassCode::MultimediaController,
+            &PciMultimediaSubclass::AudioController,
+            Some(&TestPI::Test),
+            PciHeaderType::Device,
+            0xABCD,
+            0x2468,
+        );
+
+        // Attempt to overwrite vendor ID and device ID, which are read-only
+        cfg.write_reg(0, 0, &[0xBA, 0xAD, 0xF0, 0x0D]);
+        // The original vendor and device ID should remain.
+        assert_eq!(cfg.read_reg(0), 0x56781234);
+    }
+
+    #[test]
+    fn add_pci_bar_mem_64bit() {
+        let mut cfg = PciConfiguration::new(
+            0x1234,
+            0x5678,
+            PciClassCode::MultimediaController,
+            &PciMultimediaSubclass::AudioController,
+            Some(&TestPI::Test),
+            PciHeaderType::Device,
+            0xABCD,
+            0x2468,
+        );
+
+        cfg.add_pci_bar(
+            &PciBarConfiguration::new(
+                0,
+                0x4,
+                PciBarRegionType::Memory64BitRegion,
+                PciBarPrefetchable::NotPrefetchable,
+            )
+            .set_address(0x01234567_89ABCDE0),
+        )
+        .expect("add_pci_bar failed");
+
+        assert_eq!(
+            cfg.get_bar_type(0),
+            Some(PciBarRegionType::Memory64BitRegion)
+        );
+        assert_eq!(cfg.get_bar_addr(0), 0x01234567_89ABCDE0);
+        assert_eq!(cfg.writable_bits[BAR0_REG + 1], 0xFFFFFFFF);
+        assert_eq!(cfg.writable_bits[BAR0_REG + 0], 0xFFFFFFFC);
+    }
+
+    #[test]
+    fn add_pci_bar_mem_32bit() {
+        let mut cfg = PciConfiguration::new(
+            0x1234,
+            0x5678,
+            PciClassCode::MultimediaController,
+            &PciMultimediaSubclass::AudioController,
+            Some(&TestPI::Test),
+            PciHeaderType::Device,
+            0xABCD,
+            0x2468,
+        );
+
+        cfg.add_pci_bar(
+            &PciBarConfiguration::new(
+                0,
+                0x4,
+                PciBarRegionType::Memory32BitRegion,
+                PciBarPrefetchable::NotPrefetchable,
+            )
+            .set_address(0x12345670),
+        )
+        .expect("add_pci_bar failed");
+
+        assert_eq!(
+            cfg.get_bar_type(0),
+            Some(PciBarRegionType::Memory32BitRegion)
+        );
+        assert_eq!(cfg.get_bar_addr(0), 0x12345670);
+        assert_eq!(cfg.writable_bits[BAR0_REG], 0xFFFFFFFC);
+    }
+
+    #[test]
+    fn add_pci_bar_io() {
+        let mut cfg = PciConfiguration::new(
+            0x1234,
+            0x5678,
+            PciClassCode::MultimediaController,
+            &PciMultimediaSubclass::AudioController,
+            Some(&TestPI::Test),
+            PciHeaderType::Device,
+            0xABCD,
+            0x2468,
+        );
+
+        cfg.add_pci_bar(
+            &PciBarConfiguration::new(
+                0,
+                0x4,
+                PciBarRegionType::IORegion,
+                PciBarPrefetchable::NotPrefetchable,
+            )
+            .set_address(0x1230),
+        )
+        .expect("add_pci_bar failed");
+
+        assert_eq!(cfg.get_bar_type(0), Some(PciBarRegionType::IORegion));
+        assert_eq!(cfg.get_bar_addr(0), 0x1230);
+        assert_eq!(cfg.writable_bits[BAR0_REG], 0xFFFFFFFC);
+    }
 }
diff --git a/devices/src/pci/pci_device.rs b/devices/src/pci/pci_device.rs
index 8ea3548..4c62e05 100644
--- a/devices/src/pci/pci_device.rs
+++ b/devices/src/pci/pci_device.rs
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-use byteorder::{ByteOrder, LittleEndian};
-
 use std;
 use std::fmt::{self, Display};
 use std::os::unix::io::RawFd;
@@ -12,7 +10,7 @@
 use resources::{Error as SystemAllocatorFaliure, SystemAllocator};
 use sys_util::EventFd;
 
-use crate::pci::pci_configuration::{self, PciConfiguration};
+use crate::pci::pci_configuration;
 use crate::pci::PciInterruptPin;
 use crate::BusDevice;
 
@@ -90,10 +88,17 @@
     fn ioeventfds(&self) -> Vec<(&EventFd, u64, Datamatch)> {
         Vec::new()
     }
-    /// Gets the configuration registers of the Pci Device.
-    fn config_registers(&self) -> &PciConfiguration; // TODO - remove these
-    /// Gets the configuration registers of the Pci Device for modification.
-    fn config_registers_mut(&mut self) -> &mut PciConfiguration;
+
+    /// Reads from a PCI configuration register.
+    /// * `reg_idx` - PCI register index (in units of 4 bytes).
+    fn read_config_register(&self, reg_idx: usize) -> u32;
+
+    /// Writes to a PCI configuration register.
+    /// * `reg_idx` - PCI register index (in units of 4 bytes).
+    /// * `offset`  - byte offset within 4-byte register.
+    /// * `data`    - The data to write.
+    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]);
+
     /// Reads from a BAR region mapped in to the device.
     /// * `addr` - The guest address inside the BAR.
     /// * `data` - Filled with the data from `addr`.
@@ -124,21 +129,11 @@
             return;
         }
 
-        let regs = self.config_registers_mut();
-
-        match data.len() {
-            1 => regs.write_byte(reg_idx * 4 + offset as usize, data[0]),
-            2 => regs.write_word(
-                reg_idx * 4 + offset as usize,
-                (data[0] as u16) | (data[1] as u16) << 8,
-            ),
-            4 => regs.write_reg(reg_idx, LittleEndian::read_u32(data)),
-            _ => (),
-        }
+        self.write_config_register(reg_idx, offset, data)
     }
 
     fn config_register_read(&self, reg_idx: usize) -> u32 {
-        self.config_registers().read_reg(reg_idx)
+        self.read_config_register(reg_idx)
     }
 
     fn on_sandboxed(&mut self) {
@@ -178,11 +173,11 @@
     fn ioeventfds(&self) -> Vec<(&EventFd, u64, Datamatch)> {
         (**self).ioeventfds()
     }
-    fn config_registers(&self) -> &PciConfiguration {
-        (**self).config_registers()
+    fn read_config_register(&self, reg_idx: usize) -> u32 {
+        (**self).read_config_register(reg_idx)
     }
-    fn config_registers_mut(&mut self) -> &mut PciConfiguration {
-        (**self).config_registers_mut()
+    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
+        (**self).write_config_register(reg_idx, offset, data)
     }
     fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
         (**self).read_bar(addr, data)
diff --git a/devices/src/pci/pci_root.rs b/devices/src/pci/pci_root.rs
index 187b564..eef642e 100644
--- a/devices/src/pci/pci_root.rs
+++ b/devices/src/pci/pci_root.rs
@@ -2,10 +2,10 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+use std::convert::TryInto;
 use std::os::unix::io::RawFd;
 use std::sync::Arc;
 
-use byteorder::{ByteOrder, LittleEndian};
 use sync::Mutex;
 
 use crate::pci::pci_configuration::{
@@ -26,12 +26,12 @@
     fn keep_fds(&self) -> Vec<RawFd> {
         Vec::new()
     }
-    fn config_registers(&self) -> &PciConfiguration {
-        &self.config
+    fn read_config_register(&self, reg_idx: usize) -> u32 {
+        self.config.read_reg(reg_idx)
     }
 
-    fn config_registers_mut(&mut self) -> &mut PciConfiguration {
-        &mut self.config
+    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
+        (&mut self.config).write_reg(reg_idx, offset, data)
     }
 
     fn read_bar(&mut self, _addr: u64, _data: &mut [u8]) {}
@@ -47,18 +47,21 @@
     devices: Vec<Arc<Mutex<dyn BusDevice>>>,
 }
 
+const PCI_VENDOR_ID_INTEL: u16 = 0x8086;
+const PCI_DEVICE_ID_INTEL_82441: u16 = 0x1237;
+
 impl PciRoot {
     /// Create an empty PCI root bus.
     pub fn new() -> Self {
         PciRoot {
             root_configuration: PciRootConfiguration {
                 config: PciConfiguration::new(
-                    0,
-                    0,
+                    PCI_VENDOR_ID_INTEL,
+                    PCI_DEVICE_ID_INTEL_82441,
                     PciClassCode::BridgeDevice,
                     &PciBridgeSubclass::HostBridge,
                     None,
-                    PciHeaderType::Bridge,
+                    PciHeaderType::Device,
                     0,
                     0,
                 ),
@@ -181,9 +184,9 @@
             ),
             2 => (
                 0x0000_ffff << (offset * 16),
-                ((data[1] as u32) << 8 | data[0] as u32) << (offset * 16),
+                u32::from(u16::from_le_bytes(data.try_into().unwrap())) << (offset * 16),
             ),
-            4 => (0xffff_ffff, LittleEndian::read_u32(data)),
+            4 => (0xffff_ffff, u32::from_le_bytes(data.try_into().unwrap())),
             _ => return,
         };
         self.config_address = (self.config_address & !mask) | value;
@@ -198,8 +201,8 @@
     fn read(&mut self, offset: u64, data: &mut [u8]) {
         // `offset` is relative to 0xcf8
         let value = match offset {
-            0...3 => self.config_address,
-            4...7 => self.config_space_read(),
+            0..=3 => self.config_address,
+            4..=7 => self.config_space_read(),
             _ => 0xffff_ffff,
         };
 
@@ -220,8 +223,8 @@
     fn write(&mut self, offset: u64, data: &[u8]) {
         // `offset` is relative to 0xcf8
         match offset {
-            o @ 0...3 => self.set_config_address(o, data),
-            o @ 4...7 => self.config_space_write(o - 4, data),
+            o @ 0..=3 => self.set_config_address(o, data),
+            o @ 4..=7 => self.config_space_write(o - 4, data),
             _ => (),
         };
     }
diff --git a/devices/src/pci/vfio_pci.rs b/devices/src/pci/vfio_pci.rs
new file mode 100644
index 0000000..766cbcf
--- /dev/null
+++ b/devices/src/pci/vfio_pci.rs
@@ -0,0 +1,895 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::os::unix::io::{AsRawFd, RawFd};
+use std::sync::Arc;
+use std::u32;
+
+use kvm::Datamatch;
+use msg_socket::{MsgReceiver, MsgSender};
+use resources::{Alloc, MmioType, SystemAllocator};
+use sys_util::{error, EventFd, MemoryMapping};
+
+use vfio_sys::*;
+use vm_control::{
+    MaybeOwnedFd, VmIrqRequest, VmIrqRequestSocket, VmIrqResponse, VmMemoryControlRequestSocket,
+    VmMemoryRequest, VmMemoryResponse,
+};
+
+use crate::pci::msix::MsixConfig;
+
+use crate::pci::pci_device::{Error as PciDeviceError, PciDevice};
+use crate::pci::{PciClassCode, PciInterruptPin};
+
+use crate::vfio::{VfioDevice, VfioIrqType};
+
+const PCI_VENDOR_ID: u32 = 0x0;
+const INTEL_VENDOR_ID: u16 = 0x8086;
+const PCI_COMMAND: u32 = 0x4;
+const PCI_COMMAND_MEMORY: u8 = 0x2;
+const PCI_BASE_CLASS_CODE: u32 = 0x0B;
+const PCI_HEADER_TYPE: usize = 0x0E;
+const PCI_MULTI_FLAG: u32 = 0x0080_0000;
+
+const PCI_INTERRUPT_PIN: u32 = 0x3D;
+
+struct VfioPciConfig {
+    device: Arc<VfioDevice>,
+}
+
+impl VfioPciConfig {
+    fn new(device: Arc<VfioDevice>) -> Self {
+        VfioPciConfig { device }
+    }
+
+    #[allow(dead_code)]
+    fn read_config_byte(&self, offset: u32) -> u8 {
+        let mut data: [u8; 1] = [0];
+        self.device
+            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
+
+        data[0]
+    }
+
+    #[allow(dead_code)]
+    fn read_config_word(&self, offset: u32) -> u16 {
+        let mut data: [u8; 2] = [0, 0];
+        self.device
+            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
+
+        u16::from_le_bytes(data)
+    }
+
+    #[allow(dead_code)]
+    fn read_config_dword(&self, offset: u32) -> u32 {
+        let mut data: [u8; 4] = [0, 0, 0, 0];
+        self.device
+            .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into());
+
+        u32::from_le_bytes(data)
+    }
+
+    #[allow(dead_code)]
+    fn write_config_byte(&self, buf: u8, offset: u32) {
+        self.device.region_write(
+            VFIO_PCI_CONFIG_REGION_INDEX,
+            ::std::slice::from_ref(&buf),
+            offset.into(),
+        )
+    }
+
+    #[allow(dead_code)]
+    fn write_config_word(&self, buf: u16, offset: u32) {
+        let data: [u8; 2] = buf.to_le_bytes();
+        self.device
+            .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into())
+    }
+
+    #[allow(dead_code)]
+    fn write_config_dword(&self, buf: u32, offset: u32) {
+        let data: [u8; 4] = buf.to_le_bytes();
+        self.device
+            .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into())
+    }
+}
+
+const PCI_CAPABILITY_LIST: u32 = 0x34;
+const PCI_CAP_ID_MSI: u8 = 0x05;
+const PCI_CAP_ID_MSIX: u8 = 0x11;
+
+// MSI registers
+const PCI_MSI_NEXT_POINTER: u32 = 0x1; // Next cap pointer
+const PCI_MSI_FLAGS: u32 = 0x2; // Message Control
+const PCI_MSI_FLAGS_ENABLE: u16 = 0x0001; // MSI feature enabled
+const PCI_MSI_FLAGS_64BIT: u16 = 0x0080; // 64-bit addresses allowed
+const PCI_MSI_FLAGS_MASKBIT: u16 = 0x0100; // Per-vector masking capable
+const PCI_MSI_ADDRESS_LO: u32 = 0x4; // MSI address lower 32 bits
+const PCI_MSI_ADDRESS_HI: u32 = 0x8; // MSI address upper 32 bits (if 64 bit allowed)
+const PCI_MSI_DATA_32: u32 = 0x8; // 16 bits of data for 32-bit message address
+const PCI_MSI_DATA_64: u32 = 0xC; // 16 bits of data for 64-bit message address
+
+// MSI length
+const MSI_LENGTH_32BIT: u32 = 0xA;
+const MSI_LENGTH_64BIT_WITHOUT_MASK: u32 = 0xE;
+const MSI_LENGTH_64BIT_WITH_MASK: u32 = 0x18;
+
+enum VfioMsiChange {
+    Disable,
+    Enable,
+}
+
+struct VfioMsiCap {
+    offset: u32,
+    size: u32,
+    ctl: u16,
+    address: u64,
+    data: u16,
+    vm_socket_irq: Arc<VmIrqRequestSocket>,
+    irqfd: Option<EventFd>,
+    gsi: Option<u32>,
+}
+
+impl VfioMsiCap {
+    fn new(
+        config: &VfioPciConfig,
+        msi_cap_start: u32,
+        vm_socket_irq: Arc<VmIrqRequestSocket>,
+    ) -> Self {
+        // msi minimum size is 0xa
+        let mut msi_len: u32 = MSI_LENGTH_32BIT;
+        let msi_ctl = config.read_config_word(msi_cap_start + PCI_MSI_FLAGS);
+        if msi_ctl & PCI_MSI_FLAGS_64BIT != 0 {
+            msi_len = MSI_LENGTH_64BIT_WITHOUT_MASK;
+        }
+        if msi_ctl & PCI_MSI_FLAGS_MASKBIT != 0 {
+            msi_len = MSI_LENGTH_64BIT_WITH_MASK;
+        }
+
+        VfioMsiCap {
+            offset: msi_cap_start,
+            size: msi_len,
+            ctl: 0,
+            address: 0,
+            data: 0,
+            vm_socket_irq,
+            irqfd: None,
+            gsi: None,
+        }
+    }
+
+    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
+        index >= self.offset as u64
+            && index + len as u64 <= (self.offset + self.size) as u64
+            && len as u32 <= self.size
+    }
+
+    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
+        let len = data.len();
+        let offset = index as u32 - self.offset;
+        let mut ret: Option<VfioMsiChange> = None;
+        let old_address = self.address;
+        let old_data = self.data;
+
+        // write msi ctl
+        if len == 2 && offset == PCI_MSI_FLAGS {
+            let was_enabled = self.is_msi_enabled();
+            let value: [u8; 2] = [data[0], data[1]];
+            self.ctl = u16::from_le_bytes(value);
+            let is_enabled = self.is_msi_enabled();
+            if !was_enabled && is_enabled {
+                self.enable();
+                ret = Some(VfioMsiChange::Enable);
+            } else if was_enabled && !is_enabled {
+                ret = Some(VfioMsiChange::Disable)
+            }
+        } else if len == 4 && offset == PCI_MSI_ADDRESS_LO && self.size == MSI_LENGTH_32BIT {
+            // write 32 bit message address
+            let value: [u8; 8] = [data[0], data[1], data[2], data[3], 0, 0, 0, 0];
+            self.address = u64::from_le_bytes(value);
+        } else if len == 4 && offset == PCI_MSI_ADDRESS_LO && self.size != MSI_LENGTH_32BIT {
+            // write 64 bit message address low part
+            let value: [u8; 8] = [data[0], data[1], data[2], data[3], 0, 0, 0, 0];
+            self.address &= !0xffffffff;
+            self.address |= u64::from_le_bytes(value);
+        } else if len == 4 && offset == PCI_MSI_ADDRESS_HI && self.size != MSI_LENGTH_32BIT {
+            // write 64 bit message address high part
+            let value: [u8; 8] = [0, 0, 0, 0, data[0], data[1], data[2], data[3]];
+            self.address &= 0xffffffff;
+            self.address |= u64::from_le_bytes(value);
+        } else if len == 8 && offset == PCI_MSI_ADDRESS_LO && self.size != MSI_LENGTH_32BIT {
+            // write 64 bit message address
+            let value: [u8; 8] = [
+                data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
+            ];
+            self.address = u64::from_le_bytes(value);
+        } else if len == 2
+            && ((offset == PCI_MSI_DATA_32 && self.size == MSI_LENGTH_32BIT)
+                || (offset == PCI_MSI_DATA_64 && self.size == MSI_LENGTH_64BIT_WITH_MASK)
+                || (offset == PCI_MSI_DATA_64 && self.size == MSI_LENGTH_64BIT_WITHOUT_MASK))
+        {
+            // write message data
+            let value: [u8; 2] = [data[0], data[1]];
+            self.data = u16::from_le_bytes(value);
+        }
+
+        if self.is_msi_enabled() && (old_address != self.address || old_data != self.data) {
+            self.add_msi_route();
+        }
+
+        ret
+    }
+
+    fn is_msi_enabled(&self) -> bool {
+        self.ctl & PCI_MSI_FLAGS_ENABLE == PCI_MSI_FLAGS_ENABLE
+    }
+
+    fn add_msi_route(&self) {
+        let gsi = match self.gsi {
+            Some(g) => g,
+            None => {
+                error!("Add msi route but gsi is none");
+                return;
+            }
+        };
+        if let Err(e) = self.vm_socket_irq.send(&VmIrqRequest::AddMsiRoute {
+            gsi,
+            msi_address: self.address,
+            msi_data: self.data.into(),
+        }) {
+            error!("failed to send AddMsiRoute request at {:?}", e);
+            return;
+        }
+        match self.vm_socket_irq.recv() {
+            Ok(VmIrqResponse::Err(e)) => error!("failed to call AddMsiRoute request {:?}", e),
+            Ok(_) => {}
+            Err(e) => error!("failed to receive AddMsiRoute response {:?}", e),
+        }
+    }
+
+    fn allocate_one_msi(&mut self) {
+        if self.irqfd.is_none() {
+            match EventFd::new() {
+                Ok(fd) => self.irqfd = Some(fd),
+                Err(e) => {
+                    error!("failed to create eventfd: {:?}", e);
+                    return;
+                }
+            };
+        }
+
+        if let Err(e) = self.vm_socket_irq.send(&VmIrqRequest::AllocateOneMsi {
+            irqfd: MaybeOwnedFd::Borrowed(self.irqfd.as_ref().unwrap().as_raw_fd()),
+        }) {
+            error!("failed to send AllocateOneMsi request: {:?}", e);
+            return;
+        }
+
+        match self.vm_socket_irq.recv() {
+            Ok(VmIrqResponse::AllocateOneMsi { gsi }) => self.gsi = Some(gsi),
+            _ => error!("failed to receive AllocateOneMsi Response"),
+        }
+    }
+
+    fn enable(&mut self) {
+        if self.gsi.is_none() || self.irqfd.is_none() {
+            self.allocate_one_msi();
+        }
+
+        self.add_msi_route();
+    }
+
+    fn get_msi_irqfd(&self) -> Option<&EventFd> {
+        self.irqfd.as_ref()
+    }
+}
+
+// MSI-X registers in MSI-X capability
+const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
+const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
+const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
+const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
+const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
+const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset
+const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
+const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
+
+#[allow(dead_code)]
+struct VfioMsixCap {
+    config: MsixConfig,
+    offset: u32,
+    table_size: u16,
+    table_pci_bar: u32,
+    table_offset: u64,
+    pba_pci_bar: u32,
+    pba_offset: u64,
+}
+
+impl VfioMsixCap {
+    fn new(
+        config: &VfioPciConfig,
+        msix_cap_start: u32,
+        vm_socket_irq: Arc<VmIrqRequestSocket>,
+    ) -> Self {
+        let msix_ctl = config.read_config_word(msix_cap_start + PCI_MSIX_FLAGS);
+        let table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) + 1;
+        let table = config.read_config_dword(msix_cap_start + PCI_MSIX_TABLE);
+        let table_pci_bar = table & PCI_MSIX_TABLE_BIR;
+        let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
+        let pba = config.read_config_dword(msix_cap_start + PCI_MSIX_PBA);
+        let pba_pci_bar = pba & PCI_MSIX_PBA_BIR;
+        let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;
+
+        VfioMsixCap {
+            config: MsixConfig::new(table_size, vm_socket_irq),
+            offset: msix_cap_start,
+            table_size,
+            table_pci_bar,
+            table_offset,
+            pba_pci_bar,
+            pba_offset,
+        }
+    }
+}
+
+struct MmioInfo {
+    bar_index: u32,
+    start: u64,
+    length: u64,
+}
+
+struct IoInfo {
+    bar_index: u32,
+}
+
+enum DeviceData {
+    IntelGfxData { opregion_index: u32 },
+}
+
+/// Implements a VFIO PCI device that can then be added to the VM as a PCI device.
+#[allow(dead_code)]
+pub struct VfioPciDevice {
+    device: Arc<VfioDevice>,
+    config: VfioPciConfig,
+    pci_bus_dev: Option<(u8, u8)>,
+    interrupt_evt: Option<EventFd>,
+    interrupt_resample_evt: Option<EventFd>,
+    mmio_regions: Vec<MmioInfo>,
+    io_regions: Vec<IoInfo>,
+    msi_cap: Option<VfioMsiCap>,
+    msix_cap: Option<VfioMsixCap>,
+    irq_type: Option<VfioIrqType>,
+    vm_socket_mem: VmMemoryControlRequestSocket,
+    vm_socket_irq: Arc<VmIrqRequestSocket>,
+    device_data: Option<DeviceData>,
+
+    // Scratch MemoryMappings kept here to avoid them being unmapped before VM exit.
+    mem: Vec<MemoryMapping>,
+}
+
+impl VfioPciDevice {
+    /// Constructs a new VFIO PCI device for the given VFIO device.
+    pub fn new(
+        device: VfioDevice,
+        vfio_device_socket_irq: VmIrqRequestSocket,
+        vfio_device_socket_mem: VmMemoryControlRequestSocket,
+    ) -> Self {
+        let dev = Arc::new(device);
+        let config = VfioPciConfig::new(Arc::clone(&dev));
+        let vm_socket_irq = Arc::new(vfio_device_socket_irq);
+        let mut msi_cap: Option<VfioMsiCap> = None;
+        let mut msix_cap: Option<VfioMsixCap> = None;
+
+        let mut cap_next: u32 = config.read_config_byte(PCI_CAPABILITY_LIST).into();
+        while cap_next != 0 {
+            let cap_id = config.read_config_byte(cap_next);
+            if cap_id == PCI_CAP_ID_MSI {
+                msi_cap = Some(VfioMsiCap::new(
+                    &config,
+                    cap_next,
+                    Arc::clone(&vm_socket_irq),
+                ));
+            } else if cap_id == PCI_CAP_ID_MSIX {
+                msix_cap = Some(VfioMsixCap::new(
+                    &config,
+                    cap_next,
+                    Arc::clone(&vm_socket_irq),
+                ));
+            }
+            let offset = cap_next + PCI_MSI_NEXT_POINTER;
+            cap_next = config.read_config_byte(offset).into();
+        }
+
+        let vendor_id = config.read_config_word(PCI_VENDOR_ID);
+        let class_code = config.read_config_byte(PCI_BASE_CLASS_CODE);
+
+        let is_intel_gfx = vendor_id == INTEL_VENDOR_ID
+            && class_code == PciClassCode::DisplayController.get_register_value();
+        let device_data = if is_intel_gfx {
+            Some(DeviceData::IntelGfxData {
+                opregion_index: u32::max_value(),
+            })
+        } else {
+            None
+        };
+
+        VfioPciDevice {
+            device: dev,
+            config,
+            pci_bus_dev: None,
+            interrupt_evt: None,
+            interrupt_resample_evt: None,
+            mmio_regions: Vec::new(),
+            io_regions: Vec::new(),
+            msi_cap,
+            msix_cap,
+            irq_type: None,
+            vm_socket_mem: vfio_device_socket_mem,
+            vm_socket_irq,
+            device_data,
+            mem: Vec::new(),
+        }
+    }
+
+    fn is_intel_gfx(&self) -> bool {
+        let mut ret = false;
+
+        if let Some(device_data) = &self.device_data {
+            match *device_data {
+                DeviceData::IntelGfxData { .. } => ret = true,
+            }
+        }
+
+        ret
+    }
+
+    fn find_region(&self, addr: u64) -> Option<MmioInfo> {
+        for mmio_info in self.mmio_regions.iter() {
+            if addr >= mmio_info.start && addr < mmio_info.start + mmio_info.length {
+                return Some(MmioInfo {
+                    bar_index: mmio_info.bar_index,
+                    start: mmio_info.start,
+                    length: mmio_info.length,
+                });
+            }
+        }
+
+        None
+    }
+
+    fn enable_intx(&mut self) {
+        if self.interrupt_evt.is_none() || self.interrupt_resample_evt.is_none() {
+            return;
+        }
+
+        if let Some(ref interrupt_evt) = self.interrupt_evt {
+            let mut fds = Vec::new();
+            fds.push(interrupt_evt);
+            if let Err(e) = self.device.irq_enable(fds, VfioIrqType::Intx) {
+                error!("Intx enable failed: {}", e);
+                return;
+            }
+            if let Some(ref irq_resample_evt) = self.interrupt_resample_evt {
+                if let Err(e) = self.device.irq_mask(VfioIrqType::Intx) {
+                    error!("Intx mask failed: {}", e);
+                    self.disable_intx();
+                    return;
+                }
+                if let Err(e) = self.device.resample_virq_enable(irq_resample_evt) {
+                    error!("resample enable failed: {}", e);
+                    self.disable_intx();
+                    return;
+                }
+                if let Err(e) = self.device.irq_unmask(VfioIrqType::Intx) {
+                    error!("Intx unmask failed: {}", e);
+                    self.disable_intx();
+                    return;
+                }
+            }
+        }
+
+        self.irq_type = Some(VfioIrqType::Intx);
+    }
+
+    fn disable_intx(&mut self) {
+        if let Err(e) = self.device.irq_disable(VfioIrqType::Intx) {
+            error!("Intx disable failed: {}", e);
+        }
+        self.irq_type = None;
+    }
+
+    fn enable_msi(&mut self) {
+        if let Some(irq_type) = &self.irq_type {
+            match irq_type {
+                VfioIrqType::Intx => self.disable_intx(),
+                _ => return,
+            }
+        }
+
+        let irqfd = match &self.msi_cap {
+            Some(cap) => {
+                if let Some(fd) = cap.get_msi_irqfd() {
+                    fd
+                } else {
+                    self.enable_intx();
+                    return;
+                }
+            }
+            None => {
+                self.enable_intx();
+                return;
+            }
+        };
+
+        let mut fds = Vec::new();
+        fds.push(irqfd);
+        if let Err(e) = self.device.irq_enable(fds, VfioIrqType::Msi) {
+            error!("failed to enable msi: {}", e);
+            self.enable_intx();
+            return;
+        }
+
+        self.irq_type = Some(VfioIrqType::Msi);
+    }
+
+    fn disable_msi(&mut self) {
+        if let Err(e) = self.device.irq_disable(VfioIrqType::Msi) {
+            error!("failed to disable msi: {}", e);
+            return;
+        }
+
+        self.enable_intx();
+    }
+
+    fn add_bar_mmap(&self, index: u32, bar_addr: u64) -> Vec<MemoryMapping> {
+        let mut mem_map: Vec<MemoryMapping> = Vec::new();
+        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
+            let mmaps = self.device.get_region_mmap(index);
+            if mmaps.is_empty() {
+                return mem_map;
+            }
+
+            for mmap in mmaps.iter() {
+                let mmap_offset = mmap.offset;
+                let mmap_size = mmap.size;
+                let guest_map_start = bar_addr + mmap_offset;
+                let region_offset = self.device.get_region_offset(index);
+                let offset: usize = (region_offset + mmap_offset) as usize;
+                if self
+                    .vm_socket_mem
+                    .send(&VmMemoryRequest::RegisterMmapMemory {
+                        fd: MaybeOwnedFd::Borrowed(self.device.as_raw_fd()),
+                        size: mmap_size as usize,
+                        offset,
+                        gpa: guest_map_start,
+                    })
+                    .is_err()
+                {
+                    break;
+                }
+
+                let response = match self.vm_socket_mem.recv() {
+                    Ok(res) => res,
+                    Err(_) => break,
+                };
+                match response {
+                    VmMemoryResponse::Ok => {
+                        // Even if the VM has already mapped this region, that mapping belongs to
+                        // the main VM process; the device process doesn't have it, and
+                        // vfio_dma_map() needs it in the device process, so map it again here.
+                        let mmap = match MemoryMapping::from_fd_offset(
+                            self.device.as_ref(),
+                            mmap_size as usize,
+                            offset,
+                        ) {
+                            Ok(v) => v,
+                            Err(_e) => break,
+                        };
+                        let host = (&mmap).as_ptr() as u64;
+                        // Safe because the given guest_map_start is a valid guest BAR address and
+                        // the host pointer is correct and valid, as guaranteed by the MemoryMapping interface.
+                        match unsafe { self.device.vfio_dma_map(guest_map_start, mmap_size, host) }
+                        {
+                            Ok(_) => mem_map.push(mmap),
+                            Err(e) => {
+                                error!(
+                                    "{}, index: {}, bar_addr:0x{:x}, host:0x{:x}",
+                                    e, index, bar_addr, host
+                                );
+                                break;
+                            }
+                        }
+                    }
+                    _ => break,
+                }
+            }
+        }
+
+        mem_map
+    }
+
+    fn enable_bars_mmap(&mut self) {
+        for mmio_info in self.mmio_regions.iter() {
+            let mut mem_map = self.add_bar_mmap(mmio_info.bar_index, mmio_info.start);
+            self.mem.append(&mut mem_map);
+        }
+    }
+}
+
+impl PciDevice for VfioPciDevice {
+    fn debug_label(&self) -> String {
+        "vfio pci device".to_string()
+    }
+
+    fn assign_bus_dev(&mut self, bus: u8, device: u8) {
+        self.pci_bus_dev = Some((bus, device));
+    }
+
+    fn keep_fds(&self) -> Vec<RawFd> {
+        let mut fds = self.device.keep_fds();
+        if let Some(ref interrupt_evt) = self.interrupt_evt {
+            fds.push(interrupt_evt.as_raw_fd());
+        }
+        if let Some(ref interrupt_resample_evt) = self.interrupt_resample_evt {
+            fds.push(interrupt_resample_evt.as_raw_fd());
+        }
+        fds.push(self.vm_socket_mem.as_raw_fd());
+        fds.push(self.vm_socket_irq.as_ref().as_raw_fd());
+        fds
+    }
+
+    fn assign_irq(
+        &mut self,
+        irq_evt: EventFd,
+        irq_resample_evt: EventFd,
+        irq_num: u32,
+        _irq_pin: PciInterruptPin,
+    ) {
+        self.config.write_config_byte(irq_num as u8, 0x3C);
+        self.interrupt_evt = Some(irq_evt);
+        self.interrupt_resample_evt = Some(irq_resample_evt);
+
+        // enable INTX
+        if self.config.read_config_byte(PCI_INTERRUPT_PIN) > 0 {
+            self.enable_intx();
+        }
+    }
+
+    fn allocate_io_bars(
+        &mut self,
+        resources: &mut SystemAllocator,
+    ) -> Result<Vec<(u64, u64)>, PciDeviceError> {
+        let mut ranges = Vec::new();
+        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
+        let (bus, dev) = self
+            .pci_bus_dev
+            .expect("assign_bus_dev must be called prior to allocate_io_bars");
+
+        while i <= VFIO_PCI_ROM_REGION_INDEX {
+            let mut low: u32 = 0xffffffff;
+            let offset: u32;
+            if i == VFIO_PCI_ROM_REGION_INDEX {
+                offset = 0x30;
+            } else {
+                offset = 0x10 + i * 4;
+            }
+            self.config.write_config_dword(low, offset);
+            low = self.config.read_config_dword(offset);
+
+            let low_flag = low & 0xf;
+            let is_64bit = match low_flag & 0x4 {
+                0x4 => true,
+                _ => false,
+            };
+            if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
+                let mut upper: u32 = 0xffffffff;
+                if is_64bit {
+                    self.config.write_config_dword(upper, offset + 4);
+                    upper = self.config.read_config_dword(offset + 4);
+                }
+
+                low &= 0xffff_fff0;
+                let mut size: u64 = u64::from(upper);
+                size <<= 32;
+                size |= u64::from(low);
+                size = !size + 1;
+                let mmio_type = match is_64bit {
+                    false => MmioType::Low,
+                    true => MmioType::High,
+                };
+                let bar_addr = resources
+                    .mmio_allocator(mmio_type)
+                    .allocate_with_align(
+                        size,
+                        Alloc::PciBar {
+                            bus,
+                            dev,
+                            bar: i as u8,
+                        },
+                        "vfio_bar".to_string(),
+                        size,
+                    )
+                    .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
+                ranges.push((bar_addr, size));
+                self.mmio_regions.push(MmioInfo {
+                    bar_index: i,
+                    start: bar_addr,
+                    length: size,
+                });
+
+                low = bar_addr as u32;
+                low |= low_flag;
+                self.config.write_config_dword(low, offset);
+                if is_64bit {
+                    upper = (bar_addr >> 32) as u32;
+                    self.config.write_config_dword(upper, offset + 4);
+                }
+            } else if low_flag & 0x1 == 0x1 {
+                self.io_regions.push(IoInfo { bar_index: i });
+            }
+
+            if is_64bit {
+                i += 2;
+            } else {
+                i += 1;
+            }
+        }
+
+        if let Err(e) = self.device.setup_dma_map() {
+            error!(
+                "failed to add all guest memory regions into iommu table: {}",
+                e
+            );
+        }
+
+        // Quirk: enable IGD memory for guest VGA arbitration; otherwise the kernel VGA
+        // arbiter driver doesn't claim this VGA device and Xorg can't start.
+        if self.is_intel_gfx() {
+            let mut cmd = self.config.read_config_byte(PCI_COMMAND);
+            cmd |= PCI_COMMAND_MEMORY;
+            self.config.write_config_byte(cmd, PCI_COMMAND);
+        }
+
+        Ok(ranges)
+    }
+
+    fn allocate_device_bars(
+        &mut self,
+        resources: &mut SystemAllocator,
+    ) -> Result<Vec<(u64, u64)>, PciDeviceError> {
+        let mut ranges = Vec::new();
+
+        if !self.is_intel_gfx() {
+            return Ok(ranges);
+        }
+
+        // Expose the Intel graphics OpRegion as an MMIO BAR: allocate a GPA for it,
+        // then write that GPA into the PCI config register.
+        if let Some((index, size)) = self.device.get_cap_type_info(
+            VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (INTEL_VENDOR_ID as u32),
+            VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
+        ) {
+            let (bus, dev) = self
+                .pci_bus_dev
+                .expect("assign_bus_dev must be called prior to allocate_device_bars");
+            let bar_addr = resources
+                .mmio_allocator(MmioType::Low)
+                .allocate(
+                    size,
+                    Alloc::PciBar {
+                        bus,
+                        dev,
+                        bar: (index * 4) as u8,
+                    },
+                    "vfio_bar".to_string(),
+                )
+                .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
+            ranges.push((bar_addr, size));
+            self.device_data = Some(DeviceData::IntelGfxData {
+                opregion_index: index,
+            });
+
+            self.mmio_regions.push(MmioInfo {
+                bar_index: index,
+                start: bar_addr,
+                length: size,
+            });
+            self.config.write_config_dword(bar_addr as u32, 0xFC);
+        }
+
+        Ok(ranges)
+    }
+
+    fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
+        Ok(())
+    }
+
+    fn ioeventfds(&self) -> Vec<(&EventFd, u64, Datamatch)> {
+        Vec::new()
+    }
+
+    fn read_config_register(&self, reg_idx: usize) -> u32 {
+        let reg: u32 = (reg_idx * 4) as u32;
+
+        let mut config = self.config.read_config_dword(reg);
+
+        // Ignore IO bar
+        if reg >= 0x10 && reg <= 0x24 {
+            for io_info in self.io_regions.iter() {
+                if io_info.bar_index * 4 + 0x10 == reg {
+                    config = 0;
+                }
+            }
+        } else if reg_idx == PCI_HEADER_TYPE / 4 {
+            // Clear multifunction flags as pci_root doesn't
+            // support multifunction.
+            config &= !PCI_MULTI_FLAG;
+        }
+
+        // Quirk for Intel graphics: set the stolen memory size to 0 in pci_cfg[0x51].
+        if self.is_intel_gfx() && reg == 0x50 {
+            config &= 0xffff00ff;
+        }
+
+        config
+    }
+
+    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
+        let start = (reg_idx * 4) as u64 + offset;
+
+        let mut msi_change: Option<VfioMsiChange> = None;
+        if let Some(msi_cap) = self.msi_cap.as_mut() {
+            if msi_cap.is_msi_reg(start, data.len()) {
+                msi_change = msi_cap.write_msi_reg(start, data);
+            }
+        }
+
+        match msi_change {
+            Some(VfioMsiChange::Enable) => self.enable_msi(),
+            Some(VfioMsiChange::Disable) => self.disable_msi(),
+            None => (),
+        }
+
+        // If the guest enables memory access, map the BARs (only done once).
+        if start == PCI_COMMAND as u64
+            && data.len() == 2
+            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
+            && self.mem.is_empty()
+        {
+            self.enable_bars_mmap();
+        }
+
+        self.device
+            .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, start);
+    }
+
+    fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
+        if let Some(mmio_info) = self.find_region(addr) {
+            let offset = addr - mmio_info.start;
+            self.device.region_read(mmio_info.bar_index, data, offset);
+        }
+    }
+
+    fn write_bar(&mut self, addr: u64, data: &[u8]) {
+        if let Some(mmio_info) = self.find_region(addr) {
+            // Ignore writes to the IGD OpRegion.
+            if let Some(device_data) = &self.device_data {
+                match *device_data {
+                    DeviceData::IntelGfxData { opregion_index } => {
+                        if opregion_index == mmio_info.bar_index {
+                            return;
+                        }
+                    }
+                }
+            }
+
+            let offset = addr - mmio_info.start;
+            self.device.region_write(mmio_info.bar_index, data, offset);
+        }
+    }
+}
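Note (illustrative, not part of the patch): write_msi_reg above assembles the 64-bit MSI
message address from 4-byte little-endian config writes to PCI_MSI_ADDRESS_LO/HI. A
self-contained Rust sketch of just that assembly, with a hypothetical helper name:

    // Combine the low and high MSI address dwords, mirroring the masking done in
    // write_msi_reg for the 64-bit capability layout.
    fn assemble_msi_address(low: [u8; 4], high: [u8; 4]) -> u64 {
        let lo = u32::from_le_bytes(low) as u64;
        let hi = u32::from_le_bytes(high) as u64;
        (hi << 32) | lo
    }

    fn main() {
        // Example: a typical x86 MSI address base in the low dword, zero in the high dword.
        assert_eq!(
            assemble_msi_address(0xfee0_0000u32.to_le_bytes(), 0u32.to_le_bytes()),
            0xfee0_0000
        );
    }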
diff --git a/devices/src/pit.rs b/devices/src/pit.rs
index 63f31f5..86ad4c5 100644
--- a/devices/src/pit.rs
+++ b/devices/src/pit.rs
@@ -745,10 +745,9 @@
             Kill,
         }
 
-        let poll_ctx: PollContext<Token> = PollContext::new()
-            .and_then(|pc| pc.add(&self.fd, Token::TimerExpire).and(Ok(pc)))
-            .and_then(|pc| pc.add(&kill_evt, Token::Kill).and(Ok(pc)))
-            .map_err(PitError::CreatePollContext)?;
+        let poll_ctx: PollContext<Token> =
+            PollContext::build_with(&[(&self.fd, Token::TimerExpire), (&kill_evt, Token::Kill)])
+                .map_err(PitError::CreatePollContext)?;
 
         loop {
             let events = poll_ctx.wait().map_err(PitError::PollError)?;
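Note (illustrative, not part of the patch): the pit.rs hunk above swaps the chained
and_then construction for PollContext::build_with, which registers every (pollable,
token) pair while building the context. A rough sketch of how the resulting context is
typically consumed, assuming the PollEvents iteration API (iter_readable/token) used
elsewhere in crosvm:

    let poll_ctx: PollContext<Token> =
        PollContext::build_with(&[(&self.fd, Token::TimerExpire), (&kill_evt, Token::Kill)])
            .map_err(PitError::CreatePollContext)?;

    loop {
        let events = poll_ctx.wait().map_err(PitError::PollError)?;
        for event in events.iter_readable() {
            match event.token() {
                Token::TimerExpire => { /* handle the timer tick */ }
                Token::Kill => return Ok(()),
            }
        }
    }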
diff --git a/devices/src/proxy.rs b/devices/src/proxy.rs
index dc64212..cb7ebd6 100644
--- a/devices/src/proxy.rs
+++ b/devices/src/proxy.rs
@@ -6,12 +6,11 @@
 
 use std::fmt::{self, Display};
 use std::os::unix::io::{AsRawFd, RawFd};
-use std::process;
 use std::time::Duration;
 use std::{self, io};
 
 use io_jail::{self, Minijail};
-use libc::pid_t;
+use libc::{self, pid_t};
 use msg_socket::{MsgOnSocket, MsgReceiver, MsgSender, MsgSocket};
 use sys_util::{error, net::UnixSeqpacket};
 
@@ -38,7 +37,7 @@
 
 const SOCKET_TIMEOUT_MS: u64 = 2000;
 
-#[derive(MsgOnSocket)]
+#[derive(Debug, MsgOnSocket)]
 enum Command {
     Read {
         len: u32,
@@ -66,7 +65,7 @@
     ReadConfigResult(u32),
 }
 
-fn child_proc(sock: UnixSeqpacket, device: &mut dyn BusDevice) {
+fn child_proc<D: BusDevice>(sock: UnixSeqpacket, device: &mut D) {
     let mut running = true;
     let sock = MsgSocket::<CommandResult, Command>::new(sock);
 
@@ -88,7 +87,8 @@
             Command::Write { len, offset, data } => {
                 let len = len as usize;
                 device.write(offset, &data[0..len]);
-                sock.send(&CommandResult::Ok)
+                // Command::Write does not have a result.
+                Ok(())
             }
             Command::ReadConfig(idx) => {
                 let val = device.config_register_read(idx as usize);
@@ -102,7 +102,8 @@
             } => {
                 let len = len as usize;
                 device.config_register_write(reg_idx as usize, offset as u64, &data[0..len]);
-                sock.send(&CommandResult::Ok)
+                // Command::WriteConfig does not have a result.
+                Ok(())
             }
             Command::Shutdown => {
                 running = false;
@@ -133,7 +134,8 @@
     ///
     /// # Arguments
     /// * `device` - The device to isolate to another process.
-    /// * `keep_fds` - File descriptors that will be kept open in the child
+    /// * `jail` - The jail to use for isolating the given device.
+    /// * `keep_fds` - File descriptors that will be kept open in the child.
     pub fn new<D: BusDevice>(
         mut device: D,
         jail: &Minijail,
@@ -149,8 +151,16 @@
                 0 => {
                     device.on_sandboxed();
                     child_proc(child_sock, &mut device);
+
+                    // We're explicitly not using std::process::exit here to avoid the cleanup of
+                    // stdout/stderr globals. This can cause cascading panics and SIGILL if a worker
+                    // thread attempts to log to stderr after at_exit handlers have been run.
+                    // TODO(crbug.com/992494): Remove this once device shutdown ordering is clearly
+                    // defined.
+                    //
+                    // exit() is trivially safe.
                     // ! Never returns
-                    process::exit(0);
+                    unsafe { libc::exit(0) };
                 }
                 p => p,
             }
@@ -173,19 +183,25 @@
         self.pid
     }
 
-    fn sync_send(&self, cmd: Command) -> Option<CommandResult> {
-        let res = self.sock.send(&cmd);
+    /// Send a command that does not expect a response from the child device process.
+    fn send_no_result(&self, cmd: &Command) {
+        let res = self.sock.send(cmd);
         if let Err(e) = res {
             error!(
                 "failed write to child device process {}: {}",
                 self.debug_label, e,
             );
-        };
+        }
+    }
+
+    /// Send a command and read its response from the child device process.
+    fn sync_send(&self, cmd: &Command) -> Option<CommandResult> {
+        self.send_no_result(cmd);
         match self.sock.recv() {
             Err(e) => {
                 error!(
-                    "failed read from child device process {}: {}",
-                    self.debug_label, e,
+                    "failed to read result of {:?} from child device process {}: {}",
+                    cmd, self.debug_label, e,
                 );
                 None
             }
@@ -205,7 +221,7 @@
         buffer[0..data.len()].clone_from_slice(data);
         let reg_idx = reg_idx as u32;
         let offset = offset as u32;
-        self.sync_send(Command::WriteConfig {
+        self.send_no_result(&Command::WriteConfig {
             reg_idx,
             offset,
             len,
@@ -214,7 +230,7 @@
     }
 
     fn config_register_read(&self, reg_idx: usize) -> u32 {
-        let res = self.sync_send(Command::ReadConfig(reg_idx as u32));
+        let res = self.sync_send(&Command::ReadConfig(reg_idx as u32));
         if let Some(CommandResult::ReadConfigResult(val)) = res {
             val
         } else {
@@ -225,7 +241,7 @@
     fn read(&mut self, offset: u64, data: &mut [u8]) {
         let len = data.len() as u32;
         if let Some(CommandResult::ReadResult(buffer)) =
-            self.sync_send(Command::Read { len, offset })
+            self.sync_send(&Command::Read { len, offset })
         {
             let len = data.len();
             data.clone_from_slice(&buffer[0..len]);
@@ -236,7 +252,7 @@
         let mut buffer = [0u8; 8];
         let len = data.len() as u32;
         buffer[0..data.len()].clone_from_slice(data);
-        self.sync_send(Command::Write {
+        self.send_no_result(&Command::Write {
             len,
             offset,
             data: buffer,
@@ -246,6 +262,6 @@
 
 impl Drop for ProxyDevice {
     fn drop(&mut self) {
-        self.sync_send(Command::Shutdown);
+        self.sync_send(&Command::Shutdown);
     }
 }
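Note (illustrative, not part of the patch): with the proxy.rs change above, Write and
WriteConfig become fire-and-forget via send_no_result, while Read, ReadConfig, and
Shutdown still round-trip through sync_send. The two call shapes on the parent side,
using the Command/CommandResult types from the hunk (error handling abbreviated):

    // Request/response: the child replies, so block on recv().
    sock.send(&Command::ReadConfig(0))?;
    if let CommandResult::ReadConfigResult(val) = sock.recv()? {
        // use val
    }

    // Fire-and-forget: the child sends no reply for writes, so a recv() here would
    // stall until the 2000 ms SOCKET_TIMEOUT_MS expires.
    sock.send(&Command::Write { len: 1, offset: 0, data: [0u8; 8] })?;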
diff --git a/devices/src/register_space/register.rs b/devices/src/register_space/register.rs
index 268b945..c2f5184 100644
--- a/devices/src/register_space/register.rs
+++ b/devices/src/register_space/register.rs
@@ -448,7 +448,7 @@
 
     #[test]
     fn static_register_interface_test() {
-        let r: Box<RegisterInterface> = Box::new(static_register! {
+        let r: Box<dyn RegisterInterface> = Box::new(static_register! {
             ty: u8,
             offset: 3,
             value: 32,
diff --git a/devices/src/register_space/register_space.rs b/devices/src/register_space/register_space.rs
index 892961a..3f30ac0 100644
--- a/devices/src/register_space/register_space.rs
+++ b/devices/src/register_space/register_space.rs
@@ -293,5 +293,4 @@
         regs.read(8, &mut data);
         assert_eq!([0, 0, 0xff, 0xee, 0xff, 0xee, 0xff, 0xee], data);
     }
-
 }
diff --git a/devices/src/serial.rs b/devices/src/serial.rs
index 0bf5642..98f39dc 100644
--- a/devices/src/serial.rs
+++ b/devices/src/serial.rs
@@ -5,11 +5,16 @@
 use std::collections::VecDeque;
 use std::fmt::{self, Display};
 use std::fs::File;
-use std::io::{self, stdout};
+use std::io::{self, stdin, stdout, Read, Write};
+use std::os::unix::io::{AsRawFd, RawFd};
 use std::path::PathBuf;
 use std::str::FromStr;
+use std::sync::atomic::{AtomicU8, Ordering};
+use std::sync::mpsc::{channel, Receiver, TryRecvError};
+use std::sync::Arc;
+use std::thread::{self};
 
-use sys_util::{error, syslog, EventFd, Result};
+use sys_util::{error, read_raw_stdin, syslog, EventFd, Result};
 
 use crate::BusDevice;
 
@@ -127,6 +132,7 @@
     pub path: Option<PathBuf>,
     pub num: u8,
     pub console: bool,
+    pub stdin: bool,
 }
 
 impl SerialParameters {
@@ -134,28 +140,55 @@
     ///
     /// # Arguments
     /// * `evt_fd` - eventfd used for interrupt events
-    pub fn create_serial_device(&self, evt_fd: &EventFd) -> std::result::Result<Serial, Error> {
+    /// * `keep_fds` - Vector of FDs required by this device if it were sandboxed in a child
+    ///                process. `evt_fd` will always be added to this vector by this function.
+    pub fn create_serial_device(
+        &self,
+        evt_fd: &EventFd,
+        keep_fds: &mut Vec<RawFd>,
+    ) -> std::result::Result<Serial, Error> {
+        let evt_fd = evt_fd.try_clone().map_err(Error::CloneEventFd)?;
+        keep_fds.push(evt_fd.as_raw_fd());
         match self.type_ {
-            SerialType::Stdout => Ok(Serial::new_out(
-                evt_fd.try_clone().map_err(Error::CloneEventFd)?,
-                Box::new(stdout()),
-            )),
-            SerialType::Sink => Ok(Serial::new_sink(
-                evt_fd.try_clone().map_err(Error::CloneEventFd)?,
-            )),
-            SerialType::Syslog => Ok(Serial::new_out(
-                evt_fd.try_clone().map_err(Error::CloneEventFd)?,
-                Box::new(syslog::Syslogger::new(
-                    syslog::Priority::Info,
-                    syslog::Facility::Daemon,
-                )),
-            )),
+            SerialType::Stdout => {
+                keep_fds.push(stdout().as_raw_fd());
+                if self.stdin {
+                    keep_fds.push(stdin().as_raw_fd());
+                    // This wrapper is used in place of the libstd native version because we don't
+                    // want buffering for stdin.
+                    struct StdinWrapper;
+                    impl io::Read for StdinWrapper {
+                        fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
+                            read_raw_stdin(out).map_err(|e| e.into())
+                        }
+                    }
+                    Ok(Serial::new_in_out(
+                        evt_fd,
+                        Box::new(StdinWrapper),
+                        Box::new(stdout()),
+                    ))
+                } else {
+                    Ok(Serial::new_out(evt_fd, Box::new(stdout())))
+                }
+            }
+            SerialType::Sink => Ok(Serial::new_sink(evt_fd)),
+            SerialType::Syslog => {
+                syslog::push_fds(keep_fds);
+                Ok(Serial::new_out(
+                    evt_fd,
+                    Box::new(syslog::Syslogger::new(
+                        syslog::Priority::Info,
+                        syslog::Facility::Daemon,
+                    )),
+                ))
+            }
             SerialType::File => match &self.path {
                 None => Err(Error::PathRequired),
-                Some(path) => Ok(Serial::new_out(
-                    evt_fd.try_clone().map_err(Error::CloneEventFd)?,
-                    Box::new(File::create(path.as_path()).map_err(Error::FileError)?),
-                )),
+                Some(path) => {
+                    let file = File::create(path.as_path()).map_err(Error::FileError)?;
+                    keep_fds.push(file.as_raw_fd());
+                    Ok(Serial::new_out(evt_fd, Box::new(file)))
+                }
             },
             SerialType::UnixSocket => Err(Error::Unimplemented(SerialType::UnixSocket)),
         }
@@ -169,24 +202,28 @@
         path: None,
         num: 1,
         console: true,
+        stdin: true,
     },
     SerialParameters {
         type_: SerialType::Sink,
         path: None,
         num: 2,
         console: false,
+        stdin: false,
     },
     SerialParameters {
         type_: SerialType::Sink,
         path: None,
         num: 3,
         console: false,
+        stdin: false,
     },
     SerialParameters {
         type_: SerialType::Sink,
         path: None,
         num: 4,
         console: false,
+        stdin: false,
     },
 ];
 
@@ -211,9 +248,11 @@
 /// Emulates serial COM ports commonly seen on x86 I/O ports 0x3f8/0x2f8/0x3e8/0x2e8.
 ///
 /// This can optionally write the guest's output to a Write trait object. To send input to the
-/// guest, use `queue_input_bytes`.
+/// guest, use `queue_input_bytes` directly, or give a Read trait object which will be used to
+/// queue bytes when `used_command` is called.
 pub struct Serial {
-    interrupt_enable: u8,
+    // Serial port registers
+    interrupt_enable: Arc<AtomicU8>,
     interrupt_identification: u8,
     interrupt_evt: EventFd,
     line_control: u8,
@@ -222,14 +261,22 @@
     modem_status: u8,
     scratch: u8,
     baud_divisor: u16,
+
+    // Host input/output
     in_buffer: VecDeque<u8>,
+    in_channel: Option<Receiver<u8>>,
+    input: Option<Box<dyn io::Read + Send>>,
     out: Option<Box<dyn io::Write + Send>>,
 }
 
 impl Serial {
-    fn new(interrupt_evt: EventFd, out: Option<Box<dyn io::Write + Send>>) -> Serial {
+    fn new(
+        interrupt_evt: EventFd,
+        input: Option<Box<dyn io::Read + Send>>,
+        out: Option<Box<dyn io::Write + Send>>,
+    ) -> Serial {
         Serial {
-            interrupt_enable: 0,
+            interrupt_enable: Default::default(),
             interrupt_identification: DEFAULT_INTERRUPT_IDENTIFICATION,
             interrupt_evt,
             line_control: DEFAULT_LINE_CONTROL,
@@ -238,41 +285,146 @@
             modem_status: DEFAULT_MODEM_STATUS,
             scratch: 0,
             baud_divisor: DEFAULT_BAUD_DIVISOR,
-            in_buffer: VecDeque::new(),
+            in_buffer: Default::default(),
+            in_channel: None,
+            input,
             out,
         }
     }
 
-    /// Constructs a Serial port ready for output.
-    pub fn new_out(interrupt_evt: EventFd, out: Box<dyn io::Write + Send>) -> Serial {
-        Self::new(interrupt_evt, Some(out))
+    /// Constructs a Serial port ready for input and output.
+    ///
+    /// The stream `input` should not block; instead it should return 0 bytes if there are no bytes available.
+    pub fn new_in_out(
+        interrupt_evt: EventFd,
+        input: Box<dyn io::Read + Send>,
+        out: Box<dyn io::Write + Send>,
+    ) -> Serial {
+        Self::new(interrupt_evt, Some(input), Some(out))
     }
 
-    /// Constructs a Serial port with no connected output.
+    /// Constructs a Serial port ready for output but not input.
+    pub fn new_out(interrupt_evt: EventFd, out: Box<dyn io::Write + Send>) -> Serial {
+        Self::new(interrupt_evt, None, Some(out))
+    }
+
+    /// Constructs a Serial port with no connected input or output.
     pub fn new_sink(interrupt_evt: EventFd) -> Serial {
-        Self::new(interrupt_evt, None)
+        Self::new(interrupt_evt, None, None)
     }
 
     /// Queues raw bytes for the guest to read and signals the interrupt if the line status would
-    /// change.
+    /// change. These bytes will be read by the guest before any bytes from the input stream that
+    /// have not already been queued.
     pub fn queue_input_bytes(&mut self, c: &[u8]) -> Result<()> {
-        if !self.is_loop() {
+        if !c.is_empty() && !self.is_loop() {
             self.in_buffer.extend(c);
-            self.recv_data()?;
+            self.set_data_bit();
+            self.trigger_recv_interrupt()?;
         }
+
         Ok(())
     }
 
+    fn spawn_input_thread(&mut self) {
+        let mut rx = match self.input.take() {
+            Some(input) => input,
+            None => return,
+        };
+
+        let (send_channel, recv_channel) = channel();
+
+        // The interrupt enable and interrupt event are used to trigger the guest serial driver to
+        // read the serial device, which will give the VCPU threads time to queue input bytes from
+        // the input thread's buffer, changing the serial device state accordingly.
+        let interrupt_enable = self.interrupt_enable.clone();
+        let interrupt_evt = match self.interrupt_evt.try_clone() {
+            Ok(e) => e,
+            Err(e) => {
+                error!("failed to clone interrupt eventfd: {}", e);
+                return;
+            }
+        };
+
+        // The input thread runs detached and will exit when the channel is disconnected because
+        // the serial device has been dropped. Earlier versions kept a `JoinHandle` and had
+        // Serial's drop implementation join on that thread, but the input thread can block
+        // indefinitely depending on the `Box<io::Read>` implementation.
+        let res = thread::Builder::new()
+            .name(format!("{} input thread", self.debug_label()))
+            .spawn(move || {
+                let mut rx_buf = [0u8; 1];
+                loop {
+                    match rx.read(&mut rx_buf) {
+                        Ok(0) => break, // Assume the stream of input has ended.
+                        Ok(_) => {
+                            if send_channel.send(rx_buf[0]).is_err() {
+                                // The receiver has disconnected.
+                                break;
+                            }
+                            if (interrupt_enable.load(Ordering::SeqCst) & IER_RECV_BIT) != 0 {
+                                interrupt_evt.write(1).unwrap();
+                            }
+                        }
+                        Err(e) => {
+                            // Being interrupted is not an error, but everything else is.
+                            if e.kind() != io::ErrorKind::Interrupted {
+                                error!(
+                                    "failed to read for bytes to queue into serial device: {}",
+                                    e
+                                );
+                                break;
+                            }
+                        }
+                    }
+                }
+            });
+        if let Err(e) = res {
+            error!("failed to spawn input thread: {}", e);
+            return;
+        }
+        self.in_channel = Some(recv_channel);
+    }
+
+    fn handle_input_thread(&mut self) {
+        if self.input.is_some() {
+            self.spawn_input_thread();
+        }
+
+        loop {
+            let in_channel = match self.in_channel.as_ref() {
+                Some(v) => v,
+                None => return,
+            };
+            match in_channel.try_recv() {
+                Ok(byte) => {
+                    self.queue_input_bytes(&[byte]).unwrap();
+                }
+                Err(TryRecvError::Empty) => break,
+                Err(TryRecvError::Disconnected) => {
+                    self.in_channel = None;
+                    return;
+                }
+            }
+        }
+    }
+
+    /// Gets the interrupt eventfd used to interrupt the driver when it needs to respond to this
+    /// device.
+    pub fn interrupt_eventfd(&self) -> &EventFd {
+        &self.interrupt_evt
+    }
+
     fn is_dlab_set(&self) -> bool {
         (self.line_control & 0x80) != 0
     }
 
     fn is_recv_intr_enabled(&self) -> bool {
-        (self.interrupt_enable & IER_RECV_BIT) != 0
+        (self.interrupt_enable.load(Ordering::SeqCst) & IER_RECV_BIT) != 0
     }
 
     fn is_thr_intr_enabled(&self) -> bool {
-        (self.interrupt_enable & IER_THR_BIT) != 0
+        (self.interrupt_enable.load(Ordering::SeqCst) & IER_THR_BIT) != 0
     }
 
     fn is_loop(&self) -> bool {
@@ -291,7 +443,7 @@
         }
     }
 
-    fn thr_empty(&mut self) -> Result<()> {
+    fn trigger_thr_empty(&mut self) -> Result<()> {
         if self.is_thr_intr_enabled() {
             self.add_intr_bit(IIR_THR_BIT);
             self.trigger_interrupt()?
@@ -299,12 +451,15 @@
         Ok(())
     }
 
-    fn recv_data(&mut self) -> Result<()> {
+    fn trigger_recv_interrupt(&mut self) -> Result<()> {
         if self.is_recv_intr_enabled() {
-            self.add_intr_bit(IIR_RECV_BIT);
-            self.trigger_interrupt()?
+            // Only bother triggering the interrupt if the identification bit wasn't set or
+            // acknowledged.
+            if self.interrupt_identification & IIR_RECV_BIT == 0 {
+                self.add_intr_bit(IIR_RECV_BIT);
+                self.trigger_interrupt()?
+            }
         }
-        self.line_status |= LSR_DATA_BIT;
         Ok(())
     }
 
@@ -312,6 +467,10 @@
         self.interrupt_evt.write(1)
     }
 
+    fn set_data_bit(&mut self) {
+        self.line_status |= LSR_DATA_BIT;
+    }
+
     fn iir_reset(&mut self) {
         self.interrupt_identification = DEFAULT_INTERRUPT_IDENTIFICATION;
     }
@@ -328,17 +487,20 @@
                 if self.is_loop() {
                     if self.in_buffer.len() < LOOP_SIZE {
                         self.in_buffer.push_back(v);
-                        self.recv_data()?;
+                        self.set_data_bit();
+                        self.trigger_recv_interrupt()?;
                     }
                 } else {
                     if let Some(out) = self.out.as_mut() {
                         out.write_all(&[v])?;
                         out.flush()?;
                     }
-                    self.thr_empty()?;
+                    self.trigger_thr_empty()?;
                 }
             }
-            IER => self.interrupt_enable = v & IER_FIFO_BITS,
+            IER => self
+                .interrupt_enable
+                .store(v & IER_FIFO_BITS, Ordering::SeqCst),
             LCR => self.line_control = v,
             MCR => self.modem_control = v,
             SCR => self.scratch = v,
@@ -368,6 +530,8 @@
             return;
         }
 
+        self.handle_input_thread();
+
         data[0] = match offset as u8 {
             DLAB_LOW if self.is_dlab_set() => self.baud_divisor as u8,
             DLAB_HIGH if self.is_dlab_set() => (self.baud_divisor >> 8) as u8,
@@ -378,7 +542,7 @@
                 }
                 self.in_buffer.pop_front().unwrap_or_default()
             }
-            IER => self.interrupt_enable,
+            IER => self.interrupt_enable.load(Ordering::SeqCst),
             IIR => {
                 let v = self.interrupt_identification | IIR_FIFO_BITS;
                 self.iir_reset();
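Note (illustrative, not part of the patch): the serial.rs changes above service host input
from a detached thread that forwards bytes over an mpsc channel; the VCPU thread drains
the channel in handle_input_thread when the guest reads the port. A self-contained sketch
of just the thread-plus-channel pattern (the hypothetical spawn_reader below omits the
interrupt_enable check and eventfd write that the real code also performs):

    use std::io::Read;
    use std::sync::mpsc::{channel, Receiver};
    use std::thread;

    // Spawn a detached reader that forwards bytes one at a time; it exits when the
    // input ends or the receiving side (the device) has been dropped.
    fn spawn_reader(mut input: Box<dyn Read + Send>) -> Receiver<u8> {
        let (tx, rx) = channel();
        thread::spawn(move || {
            let mut buf = [0u8; 1];
            loop {
                match input.read(&mut buf) {
                    Ok(0) => break, // input stream ended
                    Ok(_) => {
                        if tx.send(buf[0]).is_err() {
                            break; // receiver dropped
                        }
                    }
                    Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                    Err(_) => break,
                }
            }
        });
        rx
    }

    fn main() {
        let rx = spawn_reader(Box::new(std::io::Cursor::new(b"hi".to_vec())));
        while let Ok(byte) = rx.recv() {
            println!("got byte {:#x}", byte);
        }
    }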
diff --git a/devices/src/usb/host_backend/context.rs b/devices/src/usb/host_backend/context.rs
deleted file mode 100644
index 960d066..0000000
--- a/devices/src/usb/host_backend/context.rs
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright 2019 The Chromium OS Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-use super::error::*;
-use crate::utils::{EventHandler, EventLoop};
-use std::os::raw::c_short;
-use std::os::unix::io::RawFd;
-use std::sync::{Arc, Weak};
-use sys_util::{error, WatchingEvents};
-use usb_util::hotplug::UsbHotplugHandler;
-use usb_util::libusb_context::{LibUsbContext, LibUsbPollfdChangeHandler};
-use usb_util::libusb_device::LibUsbDevice;
-use vm_control::MaybeOwnedFd;
-
-/// Context wraps libusb context with libusb event handling.
-pub struct Context {
-    context: LibUsbContext,
-    event_loop: Arc<EventLoop>,
-    event_handler: Arc<dyn EventHandler>,
-}
-
-impl Context {
-    /// Create a new context.
-    #[cfg(not(feature = "sandboxed-libusb"))]
-    pub fn new(event_loop: Arc<EventLoop>) -> Result<Context> {
-        let context = LibUsbContext::new().map_err(Error::CreateLibUsbContext)?;
-        let ctx = Context {
-            context: context.clone(),
-            event_loop,
-            event_handler: Arc::new(LibUsbEventHandler {
-                context: context.clone(),
-            }),
-        };
-        ctx.init_event_handler()?;
-        Ok(ctx)
-    }
-
-    #[cfg(feature = "sandboxed-libusb")]
-    pub fn new(event_loop: Arc<EventLoop>) -> Result<Context> {
-        let context = LibUsbContext::new_jailed().map_err(Error::CreateLibUsbContext)?;
-        let ctx = Context {
-            context: context.clone(),
-            event_loop,
-            event_handler: Arc::new(LibUsbEventHandler {
-                context: context.clone(),
-            }),
-        };
-        ctx.init_event_handler()?;
-        Ok(ctx)
-    }
-
-    pub fn set_hotplug_handler<H: UsbHotplugHandler + Sized>(&self, handler: H) {
-        if let Err(e) = self.context.set_hotplug_cb(handler) {
-            error!("cannot set hotplug handler: {:?}", e);
-        }
-    }
-
-    fn init_event_handler(&self) -> Result<()> {
-        for pollfd in self.context.get_pollfd_iter() {
-            usb_debug!("event loop add event {} events handler", pollfd.fd);
-            self.event_loop
-                .add_event(
-                    &MaybeOwnedFd::Borrowed(pollfd.fd),
-                    WatchingEvents::new(pollfd.events as u32),
-                    Arc::downgrade(&self.event_handler),
-                )
-                .map_err(Error::AddToEventLoop)?;
-        }
-
-        self.context
-            .set_pollfd_notifiers(Box::new(PollfdChangeHandler {
-                event_loop: self.event_loop.clone(),
-                event_handler: Arc::downgrade(&self.event_handler),
-            }));
-        Ok(())
-    }
-
-    /// Get libusb device with matching bus, addr, vid and pid.
-    #[cfg(not(feature = "sandboxed-libusb"))]
-    pub fn get_device(&self, bus: u8, addr: u8, vid: u16, pid: u16) -> Option<LibUsbDevice> {
-        let device_iter = match self.context.get_device_iter() {
-            Ok(iter) => iter,
-            Err(e) => {
-                error!("could not get libusb device iterator: {:?}", e);
-                return None;
-            }
-        };
-        for device in device_iter {
-            if device.get_bus_number() == bus && device.get_address() == addr {
-                if let Ok(descriptor) = device.get_device_descriptor() {
-                    if descriptor.idProduct == pid && descriptor.idVendor == vid {
-                        return Some(device);
-                    }
-                }
-            }
-        }
-        error!("device not found bus {}, addr {}", bus, addr);
-        None
-    }
-
-    #[cfg(feature = "sandboxed-libusb")]
-    pub fn get_device(&self, fd: std::fs::File) -> Option<LibUsbDevice> {
-        match self.context.get_device_from_fd(fd) {
-            Ok(dev) => Some(dev),
-            Err(e) => {
-                error!("could not build device from fd: {:?}", e);
-                None
-            }
-        }
-    }
-}
-
-struct LibUsbEventHandler {
-    context: LibUsbContext,
-}
-
-impl EventHandler for LibUsbEventHandler {
-    fn on_event(&self) -> std::result::Result<(), ()> {
-        self.context.handle_events_nonblock();
-        Ok(())
-    }
-}
-
-struct PollfdChangeHandler {
-    event_loop: Arc<EventLoop>,
-    event_handler: Weak<dyn EventHandler>,
-}
-
-impl LibUsbPollfdChangeHandler for PollfdChangeHandler {
-    fn add_poll_fd(&self, fd: RawFd, events: c_short) {
-        if let Err(e) = self.event_loop.add_event(
-            &MaybeOwnedFd::Borrowed(fd),
-            WatchingEvents::new(events as u32),
-            self.event_handler.clone(),
-        ) {
-            error!("cannot add event to event loop: {}", e);
-        }
-    }
-
-    fn remove_poll_fd(&self, fd: RawFd) {
-        if let Some(h) = self.event_handler.upgrade() {
-            if let Err(e) = h.on_event() {
-                error!("cannot handle event: {:?}", e);
-            }
-        }
-        if let Err(e) = self
-            .event_loop
-            .remove_event_for_fd(&MaybeOwnedFd::Borrowed(fd))
-        {
-            error!(
-                "failed to remove poll change handler from event loop: {}",
-                e
-            );
-        }
-    }
-}
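Note (illustrative, not part of the patch): context.rs, deleted above, existed to wrap a
LibUsbContext and register libusb's pollfds on the event loop. With the libusb dependency
removed, each attached device's usbdevfs fd is registered directly, as handle_attach_device
in host_backend_device_provider.rs below shows. The shape of the replacement, with names
taken from that hunk and error handling elided:

    let raw_fd = usb_file.as_raw_fd();
    let device = Arc::new(Mutex::new(Device::new(usb_file).expect("open USB device")));
    let handler: Arc<dyn EventHandler> = Arc::new(UsbUtilEventHandler {
        device: device.clone(),
    });
    event_loop
        .add_event(
            &MaybeOwnedFd::Borrowed(raw_fd),
            WatchingEvents::empty().set_read().set_write(),
            Arc::downgrade(&handler),
        )
        .expect("register device fd on the event loop");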
diff --git a/devices/src/usb/host_backend/error.rs b/devices/src/usb/host_backend/error.rs
index ef097f9..3f1fed4 100644
--- a/devices/src/usb/host_backend/error.rs
+++ b/devices/src/usb/host_backend/error.rs
@@ -7,7 +7,7 @@
 use crate::utils::Error as UtilsError;
 use msg_socket::MsgError;
 use std::fmt::{self, Display};
-use usb_util::error::Error as UsbUtilError;
+use usb_util::Error as UsbUtilError;
 
 #[derive(Debug)]
 pub enum Error {
@@ -19,6 +19,7 @@
     SetActiveConfig(UsbUtilError),
     SetInterfaceAltSetting(UsbUtilError),
     ClearHalt(UsbUtilError),
+    CreateTransfer(UsbUtilError),
     GetEndpointType,
     CreateControlSock(std::io::Error),
     SetupControlSock(std::io::Error),
@@ -30,7 +31,7 @@
     WriteBuffer(BufferError),
     BufferLen(BufferError),
     /// Cannot get interface descriptor for (interface, altsetting).
-    GetInterfaceDescriptor((i32, u16)),
+    GetInterfaceDescriptor((u8, u8)),
     GetEndpointDescriptor(u8),
     BadXhciTransferState,
     BadBackendProviderState,
@@ -49,6 +50,7 @@
             SetActiveConfig(e) => write!(f, "failed to set active config: {:?}", e),
             SetInterfaceAltSetting(e) => write!(f, "failed to set interface alt setting: {:?}", e),
             ClearHalt(e) => write!(f, "failed to clear halt: {:?}", e),
+            CreateTransfer(e) => write!(f, "failed to create transfer: {:?}", e),
             GetEndpointType => write!(f, "failed to get endpoint type"),
             CreateControlSock(e) => write!(f, "failed to create contro sock: {}", e),
             SetupControlSock(e) => write!(f, "failed to setup control sock: {}", e),
diff --git a/devices/src/usb/host_backend/host_backend_device_provider.rs b/devices/src/usb/host_backend/host_backend_device_provider.rs
index 01fd84c..f60983f 100644
--- a/devices/src/usb/host_backend/host_backend_device_provider.rs
+++ b/devices/src/usb/host_backend/host_backend_device_provider.rs
@@ -4,22 +4,23 @@
 
 use std::sync::Arc;
 
-use super::context::Context;
 use super::error::*;
 use super::host_device::HostDevice;
-use super::hotplug::HotplugHandler;
 use crate::usb::xhci::usb_hub::UsbHub;
 use crate::usb::xhci::xhci_backend_device_provider::XhciBackendDeviceProvider;
 use crate::utils::AsyncJobQueue;
 use crate::utils::{EventHandler, EventLoop, FailHandle};
 use msg_socket::{MsgReceiver, MsgSender, MsgSocket};
+use std::collections::HashMap;
 use std::mem;
 use std::os::unix::io::{AsRawFd, RawFd};
 use std::time::Duration;
+use sync::Mutex;
 use sys_util::net::UnixSeqpacket;
 use sys_util::{error, WatchingEvents};
+use usb_util::Device;
 use vm_control::{
-    UsbControlAttachedDevice, UsbControlCommand, UsbControlResult, UsbControlSocket,
+    MaybeOwnedFd, UsbControlAttachedDevice, UsbControlCommand, UsbControlResult, UsbControlSocket,
     USB_CONTROL_MAX_PORTS,
 };
 
@@ -64,12 +65,15 @@
     ) -> Result<()> {
         match mem::replace(self, HostBackendDeviceProvider::Failed) {
             HostBackendDeviceProvider::Created { sock } => {
-                let ctx = Context::new(event_loop.clone())?;
-                let hotplug_handler = HotplugHandler::new(hub.clone());
-                ctx.set_hotplug_handler(hotplug_handler);
                 let job_queue =
                     AsyncJobQueue::init(&event_loop).map_err(Error::StartAsyncJobQueue)?;
-                let inner = Arc::new(ProviderInner::new(fail_handle, job_queue, ctx, sock, hub));
+                let inner = Arc::new(ProviderInner::new(
+                    fail_handle,
+                    job_queue,
+                    event_loop.clone(),
+                    sock,
+                    hub,
+                ));
                 let handler: Arc<dyn EventHandler> = inner.clone();
                 event_loop
                     .add_event(
@@ -123,25 +127,125 @@
 pub struct ProviderInner {
     fail_handle: Arc<dyn FailHandle>,
     job_queue: Arc<AsyncJobQueue>,
-    ctx: Context,
+    event_loop: Arc<EventLoop>,
     sock: MsgSocket<UsbControlResult, UsbControlCommand>,
     usb_hub: Arc<UsbHub>,
+
+    // Map of USB hub port number to per-device context.
+    devices: Mutex<HashMap<u8, HostDeviceContext>>,
+}
+
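+// Keeps strong references to the per-device event handler and device handle for
+// as long as the device is attached, since the event loop only holds weak
+// references to its handlers.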
+struct HostDeviceContext {
+    event_handler: Arc<dyn EventHandler>,
+    device: Arc<Mutex<Device>>,
 }
 
 impl ProviderInner {
     fn new(
         fail_handle: Arc<dyn FailHandle>,
         job_queue: Arc<AsyncJobQueue>,
-        ctx: Context,
+        event_loop: Arc<EventLoop>,
         sock: MsgSocket<UsbControlResult, UsbControlCommand>,
         usb_hub: Arc<UsbHub>,
     ) -> ProviderInner {
         ProviderInner {
             fail_handle,
             job_queue,
-            ctx,
+            event_loop,
             sock,
             usb_hub,
+            devices: Mutex::new(HashMap::new()),
+        }
+    }
+
+    /// Open a usbdevfs file to create a host USB device object.
+    /// `fd` should be an open file descriptor for a file in `/dev/bus/usb`.
+    fn handle_attach_device(&self, fd: Option<MaybeOwnedFd>) -> UsbControlResult {
+        let usb_file = match fd {
+            Some(MaybeOwnedFd::Owned(file)) => file,
+            _ => {
+                error!("missing fd in UsbControlCommand::AttachDevice message");
+                return UsbControlResult::FailedToOpenDevice;
+            }
+        };
+
+        let raw_fd = usb_file.as_raw_fd();
+        let device = match Device::new(usb_file) {
+            Ok(d) => d,
+            Err(e) => {
+                error!("could not construct USB device from fd: {}", e);
+                return UsbControlResult::NoSuchDevice;
+            }
+        };
+
+        let arc_mutex_device = Arc::new(Mutex::new(device));
+
+        let event_handler: Arc<dyn EventHandler> = Arc::new(UsbUtilEventHandler {
+            device: arc_mutex_device.clone(),
+        });
+
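+        // Watch the device fd for read/write readiness so the event loop can
+        // poll the device for completed transfers (see UsbUtilEventHandler below).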
+        if let Err(e) = self.event_loop.add_event(
+            &MaybeOwnedFd::Borrowed(raw_fd),
+            WatchingEvents::empty().set_read().set_write(),
+            Arc::downgrade(&event_handler),
+        ) {
+            error!("failed to add USB device fd to event handler: {}", e);
+            return UsbControlResult::FailedToOpenDevice;
+        }
+
+        let device_ctx = HostDeviceContext {
+            event_handler,
+            device: arc_mutex_device.clone(),
+        };
+
+        // Resetting the device is used to make sure it is in a known state, but it may
+        // still function if the reset fails.
+        if let Err(e) = arc_mutex_device.lock().reset() {
+            error!("failed to reset device after attach: {:?}", e);
+        }
+
+        let host_device = Box::new(HostDevice::new(
+            self.fail_handle.clone(),
+            self.job_queue.clone(),
+            arc_mutex_device,
+        ));
+        let port = self.usb_hub.connect_backend(host_device);
+        match port {
+            Ok(port) => {
+                self.devices.lock().insert(port, device_ctx);
+                UsbControlResult::Ok { port }
+            }
+            Err(e) => {
+                error!("failed to connect device to hub: {}", e);
+                UsbControlResult::NoAvailablePort
+            }
+        }
+    }
+
+    fn handle_detach_device(&self, port: u8) -> UsbControlResult {
+        match self.usb_hub.disconnect_port(port) {
+            Ok(()) => {
+                if let Some(device_ctx) = self.devices.lock().remove(&port) {
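+                    // Let the handler process any outstanding completions, then
+                    // stop watching the device fd before dropping its context.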
+                    let _ = device_ctx.event_handler.on_event();
+                    let device = device_ctx.device.lock();
+                    let fd = device.fd();
+
+                    if let Err(e) = self
+                        .event_loop
+                        .remove_event_for_fd(&MaybeOwnedFd::Borrowed(fd.as_raw_fd()))
+                    {
+                        error!(
+                            "failed to remove poll change handler from event loop: {}",
+                            e
+                        );
+                    }
+                }
+                UsbControlResult::Ok { port }
+            }
+            Err(e) => {
+                error!("failed to disconnect device from port {}: {}", port, e);
+                UsbControlResult::NoSuchDevice
+            }
         }
     }
 
@@ -168,163 +272,13 @@
 
     fn on_event_helper(&self) -> Result<()> {
         let cmd = self.sock.recv().map_err(Error::ReadControlSock)?;
-        match cmd {
-            UsbControlCommand::AttachDevice {
-                bus,
-                addr,
-                vid,
-                pid,
-                fd: usb_fd,
-            } => {
-                let _ = usb_fd;
-                #[cfg(not(feature = "sandboxed-libusb"))]
-                let device = match self.ctx.get_device(bus, addr, vid, pid) {
-                    Some(d) => d,
-                    None => {
-                        error!(
-                            "cannot get device bus: {}, addr: {}, vid: {}, pid: {}",
-                            bus, addr, vid, pid
-                        );
-                        // The send failure will be logged, but event loop still think the event is
-                        // handled.
-                        let _ = self
-                            .sock
-                            .send(&UsbControlResult::NoSuchDevice)
-                            .map_err(Error::WriteControlSock)?;
-                        return Ok(());
-                    }
-                };
-                #[cfg(feature = "sandboxed-libusb")]
-                let (device, device_handle) = {
-                    use vm_control::MaybeOwnedFd;
-
-                    let usb_file = match usb_fd {
-                        Some(MaybeOwnedFd::Owned(file)) => file,
-                        _ => {
-                            let _ = self
-                                .sock
-                                .send(&UsbControlResult::FailedToOpenDevice)
-                                .map_err(Error::WriteControlSock);
-                            return Ok(());
-                        }
-                    };
-
-                    let device_fd = usb_file.as_raw_fd();
-
-                    let device = match self.ctx.get_device(usb_file) {
-                        Some(d) => d,
-                        None => {
-                            error!(
-                                "cannot get device bus: {}, addr: {}, vid: {}, pid: {}",
-                                bus, addr, vid, pid
-                            );
-                            // The send failure will be logged, but event loop still think the event
-                            // is handled.
-                            let _ = self
-                                .sock
-                                .send(&UsbControlResult::NoSuchDevice)
-                                .map_err(Error::WriteControlSock);
-                            return Ok(());
-                        }
-                    };
-
-                    let device_handle = {
-                        // This is safe only when fd is an fd of the current device.
-                        match unsafe { device.open_fd(device_fd) } {
-                            Ok(handle) => handle,
-                            Err(e) => {
-                                error!("fail to open device: {:?}", e);
-                                // The send failure will be logged, but event loop still think
-                                // the event is handled.
-                                let _ = self
-                                    .sock
-                                    .send(&UsbControlResult::FailedToOpenDevice)
-                                    .map_err(Error::WriteControlSock);
-                                return Ok(());
-                            }
-                        }
-                    };
-
-                    // Resetting the device is used to make sure it is in a known state, but it may
-                    // still function if the reset fails.
-                    if let Err(e) = device_handle.reset() {
-                        error!("failed to reset device after attach: {:?}", e);
-                    }
-                    (device, device_handle)
-                };
-
-                #[cfg(not(feature = "sandboxed-libusb"))]
-                let device_handle = match device.open() {
-                    Ok(handle) => handle,
-                    Err(e) => {
-                        error!("fail to open device: {:?}", e);
-                        // The send failure will be logged, but event loop still think the event is
-                        // handled.
-                        let _ = self
-                            .sock
-                            .send(&UsbControlResult::FailedToOpenDevice)
-                            .map_err(Error::WriteControlSock);
-                        return Ok(());
-                    }
-                };
-                let device = Box::new(HostDevice::new(
-                    self.fail_handle.clone(),
-                    self.job_queue.clone(),
-                    device,
-                    device_handle,
-                ));
-                let port = self.usb_hub.connect_backend(device);
-                match port {
-                    Ok(port) => {
-                        // The send failure will be logged, but event loop still think the event is
-                        // handled.
-                        let _ = self
-                            .sock
-                            .send(&UsbControlResult::Ok { port })
-                            .map_err(Error::WriteControlSock);
-                    }
-                    Err(e) => {
-                        error!("failed to connect device to hub: {}", e);
-                        // The send failure will be logged, but event loop still think the event is
-                        // handled.
-                        let _ = self
-                            .sock
-                            .send(&UsbControlResult::NoAvailablePort)
-                            .map_err(Error::WriteControlSock);
-                    }
-                }
-                Ok(())
-            }
-            UsbControlCommand::DetachDevice { port } => {
-                match self.usb_hub.disconnect_port(port) {
-                    Ok(()) => {
-                        // The send failure will be logged, but event loop still think the event is
-                        // handled.
-                        let _ = self
-                            .sock
-                            .send(&UsbControlResult::Ok { port })
-                            .map_err(Error::WriteControlSock);
-                    }
-                    Err(e) => {
-                        error!("failed to disconnect device from port {}: {}", port, e);
-                        // The send failure will be logged, but event loop still think the event is
-                        // handled.
-                        let _ = self
-                            .sock
-                            .send(&UsbControlResult::NoSuchDevice)
-                            .map_err(Error::WriteControlSock);
-                    }
-                }
-                Ok(())
-            }
-            UsbControlCommand::ListDevice { ports } => {
-                let result = self.handle_list_devices(ports);
-                // The send failure will be logged, but event loop still think the event is
-                // handled.
-                let _ = self.sock.send(&result).map_err(Error::WriteControlSock);
-                Ok(())
-            }
-        }
+        let result = match cmd {
+            UsbControlCommand::AttachDevice { fd, .. } => self.handle_attach_device(fd),
+            UsbControlCommand::DetachDevice { port } => self.handle_detach_device(port),
+            UsbControlCommand::ListDevice { ports } => self.handle_list_devices(ports),
+        };
+        self.sock.send(&result).map_err(Error::WriteControlSock)?;
+        Ok(())
     }
 }
 
@@ -335,3 +289,13 @@
         })
     }
 }
+
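+// Event handler that polls a host USB device for transfer completions whenever
+// its file descriptor becomes ready.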
+struct UsbUtilEventHandler {
+    device: Arc<Mutex<Device>>,
+}
+
+impl EventHandler for UsbUtilEventHandler {
+    fn on_event(&self) -> std::result::Result<(), ()> {
+        self.device.lock().poll_transfers().map_err(|_e| ())
+    }
+}
diff --git a/devices/src/usb/host_backend/host_device.rs b/devices/src/usb/host_backend/host_device.rs
index 8841f00..3b85ea2 100644
--- a/devices/src/usb/host_backend/host_device.rs
+++ b/devices/src/usb/host_backend/host_device.rs
@@ -4,7 +4,6 @@
 
 use std::mem::drop;
 use std::sync::Arc;
-use sync::Mutex;
 
 use super::error::*;
 use super::usb_endpoint::UsbEndpoint;
@@ -14,17 +13,14 @@
 use crate::usb::xhci::xhci_transfer::{XhciTransfer, XhciTransferState, XhciTransferType};
 use crate::utils::AsyncJobQueue;
 use crate::utils::FailHandle;
+use data_model::DataInit;
 use std::collections::HashMap;
+use std::mem;
+use sync::Mutex;
 use sys_util::{error, warn};
-use usb_util::device_handle::DeviceHandle;
-use usb_util::error::Error as LibUsbError;
-use usb_util::libusb_device::LibUsbDevice;
-use usb_util::types::{
-    ControlRequestDataPhaseTransferDirection, ControlRequestRecipient, StandardControlRequest,
-    UsbRequestSetup,
-};
-use usb_util::usb_transfer::{
-    control_transfer, ControlTransferBuffer, TransferStatus, UsbTransfer,
+use usb_util::{
+    ConfigDescriptorTree, ControlRequestDataPhaseTransferDirection, ControlRequestRecipient,
+    Device, StandardControlRequest, Transfer, TransferStatus, UsbRequestSetup,
 };
 
 #[derive(PartialEq)]
@@ -43,11 +39,10 @@
     // Endpoints only contains data endpoints (1 to 30). Control transfers are handled at device
     // level.
     endpoints: Vec<UsbEndpoint>,
-    device: LibUsbDevice,
-    device_handle: Arc<Mutex<DeviceHandle>>,
+    device: Arc<Mutex<Device>>,
     ctl_ep_state: ControlEndpointState,
-    alt_settings: HashMap<u16, u16>,
-    claimed_interfaces: Vec<i32>,
+    alt_settings: HashMap<u8, u8>,
+    claimed_interfaces: Vec<u8>,
     control_request_setup: UsbRequestSetup,
     executed: bool,
     job_queue: Arc<AsyncJobQueue>,
@@ -64,14 +59,12 @@
     pub fn new(
         fail_handle: Arc<dyn FailHandle>,
         job_queue: Arc<AsyncJobQueue>,
-        device: LibUsbDevice,
-        device_handle: DeviceHandle,
+        device: Arc<Mutex<Device>>,
     ) -> HostDevice {
         HostDevice {
             fail_handle,
             endpoints: vec![],
             device,
-            device_handle: Arc::new(Mutex::new(device_handle)),
             ctl_ep_state: ControlEndpointState::SetupStage,
             alt_settings: HashMap::new(),
             claimed_interfaces: vec![],
@@ -81,30 +74,6 @@
         }
     }
 
-    fn get_interface_number_of_active_config(&self) -> i32 {
-        match self.device.get_active_config_descriptor() {
-            Err(LibUsbError::NotFound) => {
-                usb_debug!("device is in unconfigured state");
-                0
-            }
-            Err(e) => {
-                // device might be disconnected now.
-                error!("unexpected error: {:?}", e);
-                0
-            }
-            Ok(descriptor) => descriptor.bNumInterfaces as i32,
-        }
-    }
-
-    fn release_interfaces(&mut self) {
-        for i in &self.claimed_interfaces {
-            if let Err(e) = self.device_handle.lock().release_interface(*i) {
-                error!("could not release interface: {:?}", e);
-            }
-        }
-        self.claimed_interfaces = Vec::new();
-    }
-
     // Check for requests that should be intercepted and emulated using libusb
     // functions rather than passed directly to the device.
     // Returns true if the request has been intercepted or false if the request
@@ -167,20 +136,34 @@
         xhci_transfer: Arc<XhciTransfer>,
         buffer: Option<ScatterGatherBuffer>,
     ) -> Result<()> {
-        let mut control_transfer = control_transfer(0);
-        control_transfer
-            .buffer_mut()
-            .set_request_setup(&self.control_request_setup);
-
         if self.intercepted_control_transfer(&xhci_transfer)? {
             return Ok(());
         }
 
+        // Default buffer size for control data transfer.
+        const CONTROL_DATA_BUFFER_SIZE: usize = 1024;
+
+        // Buffer type for control transfer. The first 8 bytes is a UsbRequestSetup struct.
+        #[derive(Copy, Clone)]
+        #[repr(C, packed)]
+        struct ControlTransferBuffer {
+            pub setup: UsbRequestSetup,
+            pub data: [u8; CONTROL_DATA_BUFFER_SIZE],
+        }
+
+        // Safe because it only has data and has no implicit padding.
+        unsafe impl DataInit for ControlTransferBuffer {}
+
+        let mut control_request = ControlTransferBuffer {
+            setup: self.control_request_setup,
+            data: [0; CONTROL_DATA_BUFFER_SIZE],
+        };
+
         let direction = self.control_request_setup.get_direction();
         let buffer = if direction == ControlRequestDataPhaseTransferDirection::HostToDevice {
             if let Some(buffer) = buffer {
                 buffer
-                    .read(&mut control_transfer.buffer_mut().data_buffer)
+                    .read(&mut control_request.data)
                     .map_err(Error::ReadBuffer)?;
             }
             // buffer is consumed here for HostToDevice transfers.
@@ -190,8 +173,13 @@
             buffer
         };
 
+        let control_buffer = control_request.as_slice().to_vec();
+
+        let mut control_transfer =
+            Transfer::new_control(control_buffer).map_err(Error::CreateTransfer)?;
+
         let tmp_transfer = xhci_transfer.clone();
-        let callback = move |t: UsbTransfer<ControlTransferBuffer>| {
+        let callback = move |t: Transfer| {
             usb_debug!("setup token control transfer callback invoked");
             update_transfer_state(&xhci_transfer, &t)?;
             let state = xhci_transfer.state().lock();
@@ -207,10 +195,14 @@
                     let status = t.status();
                     let actual_length = t.actual_length();
                     if direction == ControlRequestDataPhaseTransferDirection::DeviceToHost {
-                        if let Some(buffer) = &buffer {
-                            buffer
-                                .write(&t.buffer().data_buffer)
-                                .map_err(Error::WriteBuffer)?;
+                        if let Some(control_request_data) =
+                            t.buffer.get(mem::size_of::<UsbRequestSetup>()..)
+                        {
+                            if let Some(buffer) = &buffer {
+                                buffer
+                                    .write(&control_request_data)
+                                    .map_err(Error::WriteBuffer)?;
+                            }
                         }
                     }
                     drop(state);
@@ -231,20 +223,18 @@
         };
 
         let fail_handle = self.fail_handle.clone();
-        control_transfer.set_callback(
-            move |t: UsbTransfer<ControlTransferBuffer>| match callback(t) {
-                Ok(_) => {}
-                Err(e) => {
-                    error!("control transfer callback failed {:?}", e);
-                    fail_handle.fail();
-                }
-            },
-        );
+        control_transfer.set_callback(move |t: Transfer| match callback(t) {
+            Ok(_) => {}
+            Err(e) => {
+                error!("control transfer callback failed {:?}", e);
+                fail_handle.fail();
+            }
+        });
         submit_transfer(
             self.fail_handle.clone(),
             &self.job_queue,
             tmp_transfer,
-            &self.device_handle,
+            &mut self.device.lock(),
             control_transfer,
         )
     }
@@ -309,40 +299,50 @@
 
     fn set_config(&mut self) -> Result<TransferStatus> {
         // It's a standard, set_config, device request.
-        let config = (self.control_request_setup.value & 0xff) as i32;
+        let config = (self.control_request_setup.value & 0xff) as u8;
         usb_debug!(
             "Set config control transfer is received with config: {}",
             config
         );
         self.release_interfaces();
         let cur_config = self
-            .device_handle
+            .device
             .lock()
             .get_active_configuration()
             .map_err(Error::GetActiveConfig)?;
         usb_debug!("current config is: {}", cur_config);
         if config != cur_config {
-            self.device_handle
+            self.device
                 .lock()
                 .set_active_configuration(config)
                 .map_err(Error::SetActiveConfig)?;
         }
-        self.claim_interfaces();
-        self.create_endpoints()?;
+        let config_descriptor = self
+            .device
+            .lock()
+            .get_active_config_descriptor()
+            .map_err(Error::GetActiveConfig)?;
+        self.claim_interfaces(&config_descriptor);
+        self.create_endpoints(&config_descriptor)?;
         Ok(TransferStatus::Completed)
     }
 
     fn set_interface(&mut self) -> Result<TransferStatus> {
         usb_debug!("set interface");
         // It's a standard, set_interface, interface request.
-        let interface = self.control_request_setup.index;
-        let alt_setting = self.control_request_setup.value;
-        self.device_handle
+        let interface = self.control_request_setup.index as u8;
+        let alt_setting = self.control_request_setup.value as u8;
+        self.device
             .lock()
-            .set_interface_alt_setting(interface as i32, alt_setting as i32)
+            .set_interface_alt_setting(interface, alt_setting)
             .map_err(Error::SetInterfaceAltSetting)?;
         self.alt_settings.insert(interface, alt_setting);
-        self.create_endpoints()?;
+        let config_descriptor = self
+            .device
+            .lock()
+            .get_active_config_descriptor()
+            .map_err(Error::GetActiveConfig)?;
+        self.create_endpoints(&config_descriptor)?;
         Ok(TransferStatus::Completed)
     }
 
@@ -352,7 +352,7 @@
         // It's a standard, clear_feature, endpoint request.
         const STD_FEATURE_ENDPOINT_HALT: u16 = 0;
         if request_setup.value == STD_FEATURE_ENDPOINT_HALT {
-            self.device_handle
+            self.device
                 .lock()
                 .clear_halt(request_setup.index as u8)
                 .map_err(Error::ClearHalt)?;
@@ -360,9 +360,9 @@
         Ok(TransferStatus::Completed)
     }
 
-    fn claim_interfaces(&mut self) {
-        for i in 0..self.get_interface_number_of_active_config() {
-            match self.device_handle.lock().claim_interface(i) {
+    fn claim_interfaces(&mut self, config_descriptor: &ConfigDescriptorTree) {
+        for i in 0..config_descriptor.num_interfaces() {
+            match self.device.lock().claim_interface(i) {
                 Ok(()) => {
                     usb_debug!("claimed interface {}", i);
                     self.claimed_interfaces.push(i);
@@ -374,23 +374,16 @@
         }
     }
 
-    fn create_endpoints(&mut self) -> Result<()> {
+    fn create_endpoints(&mut self, config_descriptor: &ConfigDescriptorTree) -> Result<()> {
         self.endpoints = Vec::new();
-        let config_descriptor = match self.device.get_active_config_descriptor() {
-            Err(e) => {
-                error!("device might be disconnected: {:?}", e);
-                return Ok(());
-            }
-            Ok(descriptor) => descriptor,
-        };
         for i in &self.claimed_interfaces {
-            let alt_setting = self.alt_settings.get(&(*i as u16)).unwrap_or(&0);
+            let alt_setting = self.alt_settings.get(i).unwrap_or(&0);
             let interface = config_descriptor
-                .get_interface_descriptor(*i as u8, *alt_setting as i32)
+                .get_interface_descriptor(*i, *alt_setting)
                 .ok_or(Error::GetInterfaceDescriptor((*i, *alt_setting)))?;
             for ep_idx in 0..interface.bNumEndpoints {
                 let ep_dp = interface
-                    .endpoint_descriptor(ep_idx)
+                    .get_endpoint_descriptor(ep_idx)
                     .ok_or(Error::GetEndpointDescriptor(ep_idx))?;
                 let ep_num = ep_dp.get_endpoint_number();
                 if ep_num == 0 {
@@ -402,7 +395,7 @@
                 self.endpoints.push(UsbEndpoint::new(
                     self.fail_handle.clone(),
                     self.job_queue.clone(),
-                    self.device_handle.clone(),
+                    self.device.clone(),
                     ep_num,
                     direction,
                     ty,
@@ -412,6 +405,15 @@
         Ok(())
     }
 
+    fn release_interfaces(&mut self) {
+        for i in &self.claimed_interfaces {
+            if let Err(e) = self.device.lock().release_interface(*i) {
+                error!("could not release interface: {:?}", e);
+            }
+        }
+        self.claimed_interfaces = Vec::new();
+    }
+
     fn submit_transfer_helper(&mut self, transfer: XhciTransfer) -> Result<()> {
         if transfer.get_endpoint_number() == 0 {
             return self.handle_control_transfer(transfer);
@@ -430,7 +432,7 @@
 
 impl XhciBackendDevice for HostDevice {
     fn get_backend_type(&self) -> BackendType {
-        let d = match self.device.get_device_descriptor() {
+        let d = match self.device.lock().get_device_descriptor() {
             Ok(d) => d,
             Err(_) => return BackendType::Usb2,
         };
@@ -443,16 +445,8 @@
         }
     }
 
-    fn host_bus(&self) -> u8 {
-        self.device.get_bus_number()
-    }
-
-    fn host_address(&self) -> u8 {
-        self.device.get_address()
-    }
-
     fn get_vid(&self) -> u16 {
-        match self.device.get_device_descriptor() {
+        match self.device.lock().get_device_descriptor() {
             Ok(d) => d.idVendor,
             Err(e) => {
                 error!("cannot get device descriptor: {:?}", e);
@@ -462,7 +456,7 @@
     }
 
     fn get_pid(&self) -> u16 {
-        match self.device.get_device_descriptor() {
+        match self.device.lock().get_device_descriptor() {
             Ok(d) => d.idProduct,
             Err(e) => {
                 error!("cannot get device descriptor: {:?}", e);
@@ -488,16 +482,9 @@
 
     fn reset(&mut self) -> std::result::Result<(), ()> {
         usb_debug!("resetting host device");
-        let result = self.device_handle.lock().reset();
-        match result {
-            Err(LibUsbError::NotFound) => {
-                // libusb will return NotFound if it fails to re-claim
-                // the interface after the reset.
-                Ok(())
-            }
-            _ => result.map_err(|e| {
-                error!("failed to reset device: {:?}", e);
-            }),
-        }
+        self.device
+            .lock()
+            .reset()
+            .map_err(|e| error!("failed to reset device: {:?}", e))
     }
 }
diff --git a/devices/src/usb/host_backend/hotplug.rs b/devices/src/usb/host_backend/hotplug.rs
deleted file mode 100644
index 0764660..0000000
--- a/devices/src/usb/host_backend/hotplug.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2019 The Chromium OS Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-use std::sync::Arc;
-
-use crate::usb::xhci::usb_hub::UsbHub;
-use sys_util::error;
-use usb_util::hotplug::{HotplugEvent, UsbHotplugHandler};
-use usb_util::libusb_device::LibUsbDevice;
-
-pub struct HotplugHandler {
-    hub: Arc<UsbHub>,
-}
-
-impl HotplugHandler {
-    pub fn new(hub: Arc<UsbHub>) -> Self {
-        HotplugHandler { hub }
-    }
-}
-
-impl UsbHotplugHandler for HotplugHandler {
-    fn hotplug_event(&self, device: LibUsbDevice, event: HotplugEvent) {
-        if event != HotplugEvent::DeviceLeft {
-            return;
-        }
-
-        let bus = device.get_bus_number();
-        let address = device.get_address();
-        let descriptor = match device.get_device_descriptor() {
-            Ok(d) => d,
-            Err(e) => {
-                error!("cannot get device descriptor: {:?}", e);
-                return;
-            }
-        };
-        let vid = descriptor.idVendor;
-        let pid = descriptor.idProduct;
-
-        if let Err(e) = self.hub.try_detach(bus, address, vid, pid) {
-            error!("device left event triggered failed detach from hub: {}", e);
-            return;
-        }
-    }
-}
diff --git a/devices/src/usb/host_backend/mod.rs b/devices/src/usb/host_backend/mod.rs
index ea0f44c..aa0f568 100644
--- a/devices/src/usb/host_backend/mod.rs
+++ b/devices/src/usb/host_backend/mod.rs
@@ -2,10 +2,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-pub mod context;
 pub mod error;
 pub mod host_backend_device_provider;
 pub mod host_device;
-mod hotplug;
 pub mod usb_endpoint;
 mod utils;
diff --git a/devices/src/usb/host_backend/usb_endpoint.rs b/devices/src/usb/host_backend/usb_endpoint.rs
index 329d6ff..f9f06ff 100644
--- a/devices/src/usb/host_backend/usb_endpoint.rs
+++ b/devices/src/usb/host_backend/usb_endpoint.rs
@@ -4,7 +4,6 @@
 
 use std::cmp;
 use std::sync::Arc;
-use sync::Mutex;
 
 use super::error::*;
 use super::utils::{submit_transfer, update_transfer_state};
@@ -14,18 +13,17 @@
 };
 use crate::utils::AsyncJobQueue;
 use crate::utils::FailHandle;
+use sync::Mutex;
 use sys_util::error;
-use usb_util::device_handle::DeviceHandle;
-use usb_util::types::{EndpointDirection, EndpointType, ENDPOINT_DIRECTION_OFFSET};
-use usb_util::usb_transfer::{
-    bulk_transfer, interrupt_transfer, BulkTransferBuffer, TransferStatus, UsbTransfer,
+use usb_util::{
+    Device, EndpointDirection, EndpointType, Transfer, TransferStatus, ENDPOINT_DIRECTION_OFFSET,
 };
 
 /// Isochronous, Bulk or Interrupt endpoint.
 pub struct UsbEndpoint {
     fail_handle: Arc<dyn FailHandle>,
     job_queue: Arc<AsyncJobQueue>,
-    device_handle: Arc<Mutex<DeviceHandle>>,
+    device: Arc<Mutex<Device>>,
     endpoint_number: u8,
     direction: EndpointDirection,
     ty: EndpointType,
@@ -36,7 +34,7 @@
     pub fn new(
         fail_handle: Arc<dyn FailHandle>,
         job_queue: Arc<AsyncJobQueue>,
-        device_handle: Arc<Mutex<DeviceHandle>>,
+        device: Arc<Mutex<Device>>,
         endpoint_number: u8,
         direction: EndpointDirection,
         ty: EndpointType,
@@ -45,7 +43,7 @@
         UsbEndpoint {
             fail_handle,
             job_queue,
-            device_handle,
+            device,
             endpoint_number,
             direction,
             ty,
@@ -101,13 +99,23 @@
         Ok(())
     }
 
+    fn get_transfer_buffer(&self, buffer: &ScatterGatherBuffer) -> Result<Vec<u8>> {
+        let mut v = vec![0u8; buffer.len().map_err(Error::BufferLen)?];
+        if self.direction == EndpointDirection::HostToDevice {
+            // Read data from the ScatterGatherBuffer into contiguous memory.
+            buffer.read(v.as_mut_slice()).map_err(Error::ReadBuffer)?;
+        }
+        Ok(v)
+    }
+
     fn handle_bulk_transfer(
         &self,
         xhci_transfer: XhciTransfer,
         buffer: ScatterGatherBuffer,
     ) -> Result<()> {
+        let transfer_buffer = self.get_transfer_buffer(&buffer)?;
         let usb_transfer =
-            bulk_transfer(self.ep_addr(), 0, buffer.len().map_err(Error::BufferLen)?);
+            Transfer::new_bulk(self.ep_addr(), transfer_buffer).map_err(Error::CreateTransfer)?;
         self.do_handle_transfer(xhci_transfer, usb_transfer, buffer)
     }
 
@@ -116,32 +124,28 @@
         xhci_transfer: XhciTransfer,
         buffer: ScatterGatherBuffer,
     ) -> Result<()> {
-        let usb_transfer =
-            interrupt_transfer(self.ep_addr(), 0, buffer.len().map_err(Error::BufferLen)?);
+        let transfer_buffer = self.get_transfer_buffer(&buffer)?;
+        let usb_transfer = Transfer::new_interrupt(self.ep_addr(), transfer_buffer)
+            .map_err(Error::CreateTransfer)?;
         self.do_handle_transfer(xhci_transfer, usb_transfer, buffer)
     }
 
     fn do_handle_transfer(
         &self,
         xhci_transfer: XhciTransfer,
-        mut usb_transfer: UsbTransfer<BulkTransferBuffer>,
+        mut usb_transfer: Transfer,
         buffer: ScatterGatherBuffer,
     ) -> Result<()> {
         let xhci_transfer = Arc::new(xhci_transfer);
         let tmp_transfer = xhci_transfer.clone();
         match self.direction {
             EndpointDirection::HostToDevice => {
-                // Read data from ScatterGatherBuffer to a continuous memory.
-                buffer
-                    .read(usb_transfer.buffer_mut().as_mut_slice())
-                    .map_err(Error::ReadBuffer)?;
                 usb_debug!(
-                    "out transfer ep_addr {:#x}, buffer len {:?}, data {:#x?}",
+                    "out transfer ep_addr {:#x}, buffer len {:?}",
                     self.ep_addr(),
                     buffer.len(),
-                    usb_transfer.buffer_mut().as_mut_slice()
                 );
-                let callback = move |t: UsbTransfer<BulkTransferBuffer>| {
+                let callback = move |t: Transfer| {
                     usb_debug!("out transfer callback");
                     update_transfer_state(&xhci_transfer, &t)?;
                     let state = xhci_transfer.state().lock();
@@ -168,20 +172,18 @@
                     }
                 };
                 let fail_handle = self.fail_handle.clone();
-                usb_transfer.set_callback(
-                    move |t: UsbTransfer<BulkTransferBuffer>| match callback(t) {
-                        Ok(_) => {}
-                        Err(e) => {
-                            error!("bulk transfer callback failed: {:?}", e);
-                            fail_handle.fail();
-                        }
-                    },
-                );
+                usb_transfer.set_callback(move |t: Transfer| match callback(t) {
+                    Ok(_) => {}
+                    Err(e) => {
+                        error!("bulk transfer callback failed: {:?}", e);
+                        fail_handle.fail();
+                    }
+                });
                 submit_transfer(
                     self.fail_handle.clone(),
                     &self.job_queue,
                     tmp_transfer,
-                    &self.device_handle,
+                    &mut self.device.lock(),
                     usb_transfer,
                 )?;
             }
@@ -192,12 +194,8 @@
                     buffer.len()
                 );
                 let _addr = self.ep_addr();
-                let callback = move |t: UsbTransfer<BulkTransferBuffer>| {
-                    usb_debug!(
-                        "ep {:#x} in transfer data {:?}",
-                        _addr,
-                        t.buffer().as_slice()
-                    );
+                let callback = move |t: Transfer| {
+                    usb_debug!("ep {:#x} in transfer data {:?}", _addr, t.buffer.as_slice());
                     update_transfer_state(&xhci_transfer, &t)?;
                     let state = xhci_transfer.state().lock();
                     match *state {
@@ -212,7 +210,7 @@
                             let status = t.status();
                             let actual_length = t.actual_length() as usize;
                             let copied_length = buffer
-                                .write(t.buffer().as_slice())
+                                .write(t.buffer.as_slice())
                                 .map_err(Error::WriteBuffer)?;
                             let actual_length = cmp::min(actual_length, copied_length);
                             drop(state);
@@ -230,21 +228,19 @@
                 };
                 let fail_handle = self.fail_handle.clone();
 
-                usb_transfer.set_callback(
-                    move |t: UsbTransfer<BulkTransferBuffer>| match callback(t) {
-                        Ok(_) => {}
-                        Err(e) => {
-                            error!("bulk transfer callback {:?}", e);
-                            fail_handle.fail();
-                        }
-                    },
-                );
+                usb_transfer.set_callback(move |t: Transfer| match callback(t) {
+                    Ok(_) => {}
+                    Err(e) => {
+                        error!("bulk transfer callback {:?}", e);
+                        fail_handle.fail();
+                    }
+                });
 
                 submit_transfer(
                     self.fail_handle.clone(),
                     &self.job_queue,
                     tmp_transfer,
-                    &self.device_handle,
+                    &mut self.device.lock(),
                     usb_transfer,
                 )?;
             }
diff --git a/devices/src/usb/host_backend/utils.rs b/devices/src/usb/host_backend/utils.rs
index 7d23bd4..ac506e7 100644
--- a/devices/src/usb/host_backend/utils.rs
+++ b/devices/src/usb/host_backend/utils.rs
@@ -4,20 +4,18 @@
 
 use std::mem;
 use std::sync::Arc;
-use sync::Mutex;
 
 use super::error::*;
 use crate::usb::xhci::xhci_transfer::{XhciTransfer, XhciTransferState};
 use crate::utils::AsyncJobQueue;
 use crate::utils::FailHandle;
 use sys_util::{error, warn};
-use usb_util::device_handle::DeviceHandle;
-use usb_util::usb_transfer::{TransferStatus, UsbTransfer, UsbTransferBuffer};
+use usb_util::{Device, Transfer, TransferStatus};
 
 /// Helper function to update xhci_transfer state.
-pub fn update_transfer_state<T: UsbTransferBuffer>(
+pub fn update_transfer_state(
     xhci_transfer: &Arc<XhciTransfer>,
-    usb_transfer: &UsbTransfer<T>,
+    usb_transfer: &Transfer,
 ) -> Result<()> {
     let status = usb_transfer.status();
     let mut state = xhci_transfer.state().lock();
@@ -44,12 +42,12 @@
 }
 
 /// Helper function to submit usb_transfer to device handle.
-pub fn submit_transfer<T: UsbTransferBuffer>(
+pub fn submit_transfer(
     fail_handle: Arc<dyn FailHandle>,
     job_queue: &Arc<AsyncJobQueue>,
     xhci_transfer: Arc<XhciTransfer>,
-    device_handle: &Arc<Mutex<DeviceHandle>>,
-    usb_transfer: UsbTransfer<T>,
+    device: &mut Device,
+    usb_transfer: Transfer,
 ) -> Result<()> {
     let transfer_status = {
         // We need to hold the lock to avoid race condition.
@@ -58,25 +56,25 @@
         let mut state = xhci_transfer.state().lock();
         match mem::replace(&mut *state, XhciTransferState::Cancelled) {
             XhciTransferState::Created => {
-                let canceller = usb_transfer.get_canceller();
-                // TODO(jkwang) refactor canceller to return Cancel::Ok or Cancel::Err.
-                let cancel_callback = Box::new(move || match canceller.try_cancel() {
-                    true => {
-                        usb_debug!("cancel issued to libusb backend");
-                    }
-                    false => {
-                        usb_debug!("fail to cancel");
-                    }
-                });
-                *state = XhciTransferState::Submitted { cancel_callback };
-                match device_handle.lock().submit_async_transfer(usb_transfer) {
+                match device.submit_transfer(usb_transfer) {
                     Err(e) => {
                         error!("fail to submit transfer {:?}", e);
                         *state = XhciTransferState::Completed;
                         TransferStatus::NoDevice
                     }
                     // If it's submitted, we don't need to send on_transfer_complete now.
-                    Ok(_) => return Ok(()),
+                    Ok(canceller) => {
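+                        // Keep the returned canceller in a callback so the xhci
+                        // layer can abort this transfer after it has been submitted.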
+                        let cancel_callback = Box::new(move || match canceller.cancel() {
+                            Ok(()) => {
+                                usb_debug!("cancel issued to kernel");
+                            }
+                            Err(e) => {
+                                usb_debug!("fail to cancel: {}", e);
+                            }
+                        });
+                        *state = XhciTransferState::Submitted { cancel_callback };
+                        return Ok(());
+                    }
                 }
             }
             XhciTransferState::Cancelled => {
diff --git a/devices/src/usb/xhci/device_slot.rs b/devices/src/usb/xhci/device_slot.rs
index e2b5e5d..22cfb1d 100644
--- a/devices/src/usb/xhci/device_slot.rs
+++ b/devices/src/usb/xhci/device_slot.rs
@@ -229,7 +229,7 @@
         *self.0.lock() = 0;
     }
 
-    fn get(&self) -> Result<(u8)> {
+    fn get(&self) -> Result<u8> {
         let val = *self.0.lock();
         if val == 0 {
             return Err(Error::BadPortId(val));
diff --git a/devices/src/usb/xhci/ring_buffer.rs b/devices/src/usb/xhci/ring_buffer.rs
index a178ebb..3033b0e 100644
--- a/devices/src/usb/xhci/ring_buffer.rs
+++ b/devices/src/usb/xhci/ring_buffer.rs
@@ -264,5 +264,4 @@
         let descriptor = transfer_ring.dequeue_transfer_descriptor().unwrap();
         assert_eq!(descriptor.is_none(), true);
     }
-
 }
diff --git a/devices/src/usb/xhci/scatter_gather_buffer.rs b/devices/src/usb/xhci/scatter_gather_buffer.rs
index 0150e4f..f64f778 100644
--- a/devices/src/usb/xhci/scatter_gather_buffer.rs
+++ b/devices/src/usb/xhci/scatter_gather_buffer.rs
@@ -2,7 +2,9 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-use super::xhci_abi::{Error as TrbError, NormalTrb, TransferDescriptor, TrbCast, TrbType};
+use super::xhci_abi::{
+    AddressedTrb, Error as TrbError, NormalTrb, TransferDescriptor, TrbCast, TrbType,
+};
 use bit_field::Error as BitFieldError;
 use std::fmt::{self, Display};
 use sys_util::{GuestAddress, GuestMemory, GuestMemoryError};
@@ -14,6 +16,7 @@
     UnknownTrbType(BitFieldError),
     CastTrb(TrbError),
     BadTrbType(TrbType),
+    ImmediateDataTooLong(usize),
 }
 
 type Result<T> = std::result::Result<T, Error>;
@@ -28,6 +31,7 @@
             UnknownTrbType(e) => write!(f, "unknown trb type: {}", e),
             CastTrb(e) => write!(f, "cannot cast trb: {}", e),
             BadTrbType(t) => write!(f, "should not build buffer from trb type: {:?}", t),
+            ImmediateDataTooLong(l) => write!(f, "immediate data longer than allowed: {}", l),
         }
     }
 }
@@ -67,13 +71,30 @@
         Ok(total_len)
     }
 
+    /// Get the guest address and length of the TRB's data buffer.
+    /// This is usually a separate buffer pointed to by the TRB,
+    /// but it can also be within the TRB itself in the case of immediate data.
+    fn get_trb_data(&self, atrb: &AddressedTrb) -> Result<(GuestAddress, usize)> {
+        let normal_trb = atrb.trb.cast::<NormalTrb>().map_err(Error::CastTrb)?;
+        let len = normal_trb.get_trb_transfer_length() as usize;
+        let addr = if normal_trb.get_immediate_data() == 1 {
+            // If the Immediate Data flag is set, the first <= 8 bytes of the TRB hold the data.
+            if len > 8 {
+                return Err(Error::ImmediateDataTooLong(len));
+            }
+            atrb.gpa
+        } else {
+            normal_trb.get_data_buffer()
+        };
+        Ok((GuestAddress(addr), len))
+    }
+
     /// Read content to buffer, return number of bytes read.
     pub fn read(&self, buffer: &mut [u8]) -> Result<usize> {
         let mut total_size = 0usize;
         let mut offset = 0;
         for atrb in &self.td {
-            let normal_trb = atrb.trb.cast::<NormalTrb>().map_err(Error::CastTrb)?;
-            let len = normal_trb.get_trb_transfer_length() as usize;
+            let (guest_address, len) = self.get_trb_data(&atrb)?;
             let buffer_len = {
                 if offset == buffer.len() {
                     return Ok(total_size);
@@ -89,7 +110,7 @@
             offset = buffer_end;
             total_size += self
                 .mem
-                .read_at_addr(cur_buffer, GuestAddress(normal_trb.get_data_buffer()))
+                .read_at_addr(cur_buffer, guest_address)
                 .map_err(Error::ReadGuestMemory)?;
         }
         Ok(total_size)
@@ -100,8 +121,7 @@
         let mut total_size = 0usize;
         let mut offset = 0;
         for atrb in &self.td {
-            let normal_trb = atrb.trb.cast::<NormalTrb>().map_err(Error::CastTrb)?;
-            let len = normal_trb.get_trb_transfer_length() as usize;
+            let (guest_address, len) = self.get_trb_data(&atrb)?;
             let buffer_len = {
                 if offset == buffer.len() {
                     return Ok(total_size);
@@ -117,7 +137,7 @@
             offset = buffer_end;
             total_size += self
                 .mem
-                .write_at_addr(cur_buffer, GuestAddress(normal_trb.get_data_buffer()))
+                .write_at_addr(cur_buffer, guest_address)
                 .map_err(Error::WriteGuestMemory)?;
         }
         Ok(total_size)
@@ -166,14 +186,38 @@
 
         let mut d = [0; 4];
         gm.read_exact_at_addr(&mut d, GuestAddress(0x100)).unwrap();
-        assert_eq!(d, [7, 6, 5, 4]);;
+        assert_eq!(d, [7, 6, 5, 4]);
         gm.read_exact_at_addr(&mut d, GuestAddress(0x200)).unwrap();
-        assert_eq!(d, [3, 2, 0, 0]);;
+        assert_eq!(d, [3, 2, 0, 0]);
         gm.read_exact_at_addr(&mut d, GuestAddress(0x300)).unwrap();
-        assert_eq!(d, [1, 0, 0, 0]);;
+        assert_eq!(d, [1, 0, 0, 0]);
 
         let mut data_read = [0; 7];
         buffer.read(&mut data_read).unwrap();
         assert_eq!(data_to_write, data_read);
     }
+
+    #[test]
+    fn immediate_data_test() {
+        let gm = GuestMemory::new(&vec![(GuestAddress(0), 0x1000)]).unwrap();
+        let mut td = TransferDescriptor::new();
+
+        let expected_immediate_data: [u8; 8] = [0xDE, 0xAD, 0xBE, 0xEF, 0xF0, 0x0D, 0xCA, 0xFE];
+
+        let mut trb = Trb::new();
+        let ntrb = trb.cast_mut::<NormalTrb>().unwrap();
+        ntrb.set_trb_type(TrbType::Normal);
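+        // With the immediate data flag set, the data_buffer field carries the
+        // transfer data itself rather than a guest address.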
+        ntrb.set_data_buffer(u64::from_le_bytes(expected_immediate_data));
+        ntrb.set_trb_transfer_length(8);
+        ntrb.set_immediate_data(1);
+        td.push(AddressedTrb { trb, gpa: 0xC00 });
+
+        gm.write_obj_at_addr(trb, GuestAddress(0xc00)).unwrap();
+
+        let buffer = ScatterGatherBuffer::new(gm.clone(), td).unwrap();
+
+        let mut data_read = [0; 8];
+        buffer.read(&mut data_read).unwrap();
+        assert_eq!(data_read, expected_immediate_data);
+    }
 }
diff --git a/devices/src/usb/xhci/usb_hub.rs b/devices/src/usb/xhci/usb_hub.rs
index 28c7f19..d54eaa9 100644
--- a/devices/src/usb/xhci/usb_hub.rs
+++ b/devices/src/usb/xhci/usb_hub.rs
@@ -205,41 +205,6 @@
         UsbHub { ports }
     }
 
-    /// Try to detach device of bus, addr, vid, pid
-    pub fn try_detach(&self, bus: u8, addr: u8, vid: u16, pid: u16) -> Result<()> {
-        for port in &self.ports {
-            // This block exists so that we only hold the backend device
-            // lock while checking the address. It needs to be dropped before
-            // calling port.detach(), because that acquires the backend
-            // device lock again.
-            {
-                let backend_device = port.get_backend_device();
-
-                let d = match backend_device.as_ref() {
-                    None => continue,
-                    Some(d) => d,
-                };
-
-                if d.host_bus() != bus
-                    || d.host_address() != addr
-                    || d.get_vid() != vid
-                    || d.get_pid() != pid
-                {
-                    continue;
-                }
-            }
-
-            return port.detach();
-        }
-
-        Err(Error::NoSuchDevice {
-            bus,
-            addr,
-            vid,
-            pid,
-        })
-    }
-
     /// Reset all ports.
     pub fn reset(&self) -> Result<()> {
         usb_debug!("reseting usb hub");
diff --git a/devices/src/usb/xhci/xhci.rs b/devices/src/usb/xhci/xhci.rs
index 47ebe85..bd8e217 100644
--- a/devices/src/usb/xhci/xhci.rs
+++ b/devices/src/usb/xhci/xhci.rs
@@ -14,6 +14,7 @@
 use crate::utils::{Error as UtilsError, EventLoop, FailHandle};
 use std::fmt::{self, Display};
 use std::sync::Arc;
+use std::thread;
 use sync::Mutex;
 use sys_util::{error, EventFd, GuestAddress, GuestMemory};
 
@@ -65,6 +66,8 @@
     interrupter: Arc<Mutex<Interrupter>>,
     command_ring_controller: Arc<CommandRingController>,
     device_slots: DeviceSlots,
+    event_loop: Arc<EventLoop>,
+    event_loop_join_handle: Option<thread::JoinHandle<()>>,
     // resample handler and device provider only lives on EventLoop to handle corresponding events.
     // By design, event loop only hold weak reference. We need to keep a strong reference here to
     // keep it alive.
@@ -84,7 +87,7 @@
         irq_resample_evt: EventFd,
         regs: XhciRegs,
     ) -> Result<Arc<Self>> {
-        let (event_loop, _join_handle) =
+        let (event_loop, join_handle) =
             EventLoop::start("xhci".to_string(), Some(fail_handle.clone()))
                 .map_err(Error::StartEventLoop)?;
         let interrupter = Arc::new(Mutex::new(Interrupter::new(mem.clone(), irq_evt, &regs)));
@@ -122,6 +125,8 @@
             command_ring_controller,
             device_slots,
             device_provider,
+            event_loop,
+            event_loop_join_handle: Some(join_handle),
         });
         Self::init_reg_callbacks(&xhci);
         Ok(xhci)
@@ -390,3 +395,12 @@
         Ok(())
     }
 }
+
+impl Drop for Xhci {
+    fn drop(&mut self) {
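+        // Stop the xhci event loop and wait for its thread so no event handlers
+        // run after the controller has been dropped.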
+        self.event_loop.stop();
+        if let Some(join_handle) = self.event_loop_join_handle.take() {
+            let _ = join_handle.join();
+        }
+    }
+}
diff --git a/devices/src/usb/xhci/xhci_backend_device.rs b/devices/src/usb/xhci/xhci_backend_device.rs
index e104cdc..a3d9e66 100644
--- a/devices/src/usb/xhci/xhci_backend_device.rs
+++ b/devices/src/usb/xhci/xhci_backend_device.rs
@@ -18,10 +18,6 @@
 pub trait XhciBackendDevice: Send {
     /// Returns the type of USB device provided by this device.
     fn get_backend_type(&self) -> BackendType;
-    /// Returns host bus number of this device.
-    fn host_bus(&self) -> u8;
-    /// Returns host address of this device.
-    fn host_address(&self) -> u8;
     /// Get vendor id of this device.
     fn get_vid(&self) -> u16;
     /// Get product id of this device.
diff --git a/devices/src/usb/xhci/xhci_controller.rs b/devices/src/usb/xhci/xhci_controller.rs
index 76e1d4a..b77a73a 100644
--- a/devices/src/usb/xhci/xhci_controller.rs
+++ b/devices/src/usb/xhci/xhci_controller.rs
@@ -12,7 +12,7 @@
 use crate::usb::xhci::xhci_backend_device_provider::XhciBackendDeviceProvider;
 use crate::usb::xhci::xhci_regs::{init_xhci_mmio_space_and_regs, XhciRegs};
 use crate::utils::FailHandle;
-use resources::{Alloc, SystemAllocator};
+use resources::{Alloc, MmioType, SystemAllocator};
 use std::mem;
 use std::os::unix::io::RawFd;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -96,7 +96,6 @@
     config_regs: PciConfiguration,
     pci_bus_dev: Option<(u8, u8)>,
     mem: GuestMemory,
-    bar0: u64, // bar0 in config_regs will be changed by guest. Not sure why.
     state: XhciControllerState,
 }
 
@@ -117,7 +116,6 @@
             config_regs,
             pci_bus_dev: None,
             mem,
-            bar0: 0,
             state: XhciControllerState::Created {
                 device_provider: usb_provider,
             },
@@ -158,7 +156,6 @@
             }
             _ => {
                 error!("xhci controller is in a wrong state");
-                return;
             }
         }
     }
@@ -201,7 +198,6 @@
             }
             _ => {
                 error!("xhci controller is in a wrong state");
-                return;
             }
         }
     }
@@ -215,11 +211,12 @@
             .expect("assign_bus_dev must be called prior to allocate_io_bars");
         // xHCI spec 5.2.1.
         let bar0_addr = resources
-            .mmio_allocator()
-            .allocate(
+            .mmio_allocator(MmioType::Low)
+            .allocate_with_align(
                 XHCI_BAR0_SIZE,
                 Alloc::PciBar { bus, dev, bar: 0 },
                 "xhci_bar0".to_string(),
+                XHCI_BAR0_SIZE,
             )
             .map_err(|e| PciDeviceError::IoAllocationFailed(XHCI_BAR0_SIZE, e))?;
         let bar0_config = PciBarConfiguration::default()
@@ -229,20 +226,19 @@
         self.config_regs
             .add_pci_bar(&bar0_config)
             .map_err(|e| PciDeviceError::IoRegistrationFailed(bar0_addr, e))?;
-        self.bar0 = bar0_addr;
         Ok(vec![(bar0_addr, XHCI_BAR0_SIZE)])
     }
 
-    fn config_registers(&self) -> &PciConfiguration {
-        &self.config_regs
+    fn read_config_register(&self, reg_idx: usize) -> u32 {
+        self.config_regs.read_reg(reg_idx)
     }
 
-    fn config_registers_mut(&mut self) -> &mut PciConfiguration {
-        &mut self.config_regs
+    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
+        (&mut self.config_regs).write_reg(reg_idx, offset, data)
     }
 
     fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
-        let bar0 = self.bar0;
+        let bar0 = self.config_regs.get_bar_addr(0);
         if addr < bar0 || addr > bar0 + XHCI_BAR0_SIZE {
             return;
         }
@@ -253,13 +249,12 @@
             }
             _ => {
                 error!("xhci controller is in a wrong state");
-                return;
             }
         }
     }
 
     fn write_bar(&mut self, addr: u64, data: &[u8]) {
-        let bar0 = self.bar0;
+        let bar0 = self.config_regs.get_bar_addr(0);
         if addr < bar0 || addr > bar0 + XHCI_BAR0_SIZE {
             return;
         }
@@ -273,10 +268,10 @@
             }
             _ => {
                 error!("xhci controller is in a wrong state");
-                return;
             }
         }
     }
+
     fn on_device_sandboxed(&mut self) {
         self.init_when_forked();
     }
diff --git a/devices/src/usb/xhci/xhci_transfer.rs b/devices/src/usb/xhci/xhci_transfer.rs
index e166c1a..625f97e 100644
--- a/devices/src/usb/xhci/xhci_transfer.rs
+++ b/devices/src/usb/xhci/xhci_transfer.rs
@@ -17,8 +17,7 @@
 use std::sync::{Arc, Weak};
 use sync::Mutex;
 use sys_util::{error, Error as SysError, EventFd, GuestMemory};
-use usb_util::types::UsbRequestSetup;
-use usb_util::usb_transfer::TransferStatus;
+use usb_util::{TransferStatus, UsbRequestSetup};
 
 #[derive(Debug)]
 pub enum Error {
@@ -67,7 +66,7 @@
     /// When transfer is submitted, it will contain a transfer callback, which should be invoked
     /// when the transfer is cancelled.
     Submitted {
-        cancel_callback: Box<dyn FnMut() + Send>,
+        cancel_callback: Box<dyn FnOnce() + Send>,
     },
     Cancelling,
     Cancelled,
@@ -78,9 +77,7 @@
     /// Try to cancel this transfer, if it's possible.
     pub fn try_cancel(&mut self) {
         match mem::replace(self, XhciTransferState::Created) {
-            XhciTransferState::Submitted {
-                mut cancel_callback,
-            } => {
+            XhciTransferState::Submitted { cancel_callback } => {
                 *self = XhciTransferState::Cancelling;
                 cancel_callback();
             }
@@ -290,8 +287,13 @@
                 usb_debug!("device disconnected, detaching from port");
                 // If the device is gone, we don't need to send transfer completion event, cause we
                 // are going to destroy everything related to this device anyway.
-                self.port.detach().map_err(Error::DetachPort)?;
-                return Ok(());
+                return match self.port.detach() {
+                    Ok(()) => Ok(()),
+                    // It's acceptable for the port to be already disconnected
+                    // as asynchronous transfer completions are processed.
+                    Err(HubError::AlreadyDetached(_e)) => Ok(()),
+                    Err(e) => Err(Error::DetachPort(e)),
+                };
             }
             TransferStatus::Cancelled => {
                 // TODO(jkwang) According to the spec, we should send a stopped event here. But
diff --git a/devices/src/utils/event_loop.rs b/devices/src/utils/event_loop.rs
index 405c543..6db8d0e 100644
--- a/devices/src/utils/event_loop.rs
+++ b/devices/src/utils/event_loop.rs
@@ -236,7 +236,7 @@
             cvar: Condvar::new(),
             evt,
         });
-        let t: Arc<EventHandler> = h.clone();
+        let t: Arc<dyn EventHandler> = h.clone();
         l.add_event(
             &h.evt,
             WatchingEvents::empty().set_read(),
diff --git a/devices/src/vfio.rs b/devices/src/vfio.rs
new file mode 100644
index 0000000..88d7066
--- /dev/null
+++ b/devices/src/vfio.rs
@@ -0,0 +1,779 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use data_model::vec_with_array_field;
+use std::ffi::CString;
+use std::fmt;
+use std::fs::{File, OpenOptions};
+use std::io;
+use std::mem;
+use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
+use std::os::unix::prelude::FileExt;
+use std::path::{Path, PathBuf};
+use std::u32;
+
+use kvm::Vm;
+use sys_util::{
+    ioctl, ioctl_with_mut_ref, ioctl_with_ptr, ioctl_with_ref, ioctl_with_val, warn, Error,
+    EventFd, GuestMemory,
+};
+
+use vfio_sys::*;
+
+#[derive(Debug)]
+pub enum VfioError {
+    OpenContainer(io::Error),
+    OpenGroup(io::Error),
+    GetGroupStatus(Error),
+    GroupViable,
+    VfioApiVersion,
+    VfioType1V2,
+    GroupSetContainer(Error),
+    ContainerSetIOMMU(Error),
+    GroupGetDeviceFD(Error),
+    CreateVfioKvmDevice(Error),
+    KvmSetDeviceAttr(Error),
+    VfioDeviceGetInfo(Error),
+    VfioDeviceGetRegionInfo(Error),
+    InvalidPath,
+    IommuDmaMap(Error),
+    IommuDmaUnmap(Error),
+    VfioIrqEnable(Error),
+    VfioIrqDisable(Error),
+    VfioIrqUnmask(Error),
+    VfioIrqMask(Error),
+}
+
+impl fmt::Display for VfioError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            VfioError::OpenContainer(e) => write!(f, "failed to open /dev/vfio/vfio container: {}", e),
+            VfioError::OpenGroup(e) => write!(f, "failed to open /dev/vfio/$group_num group: {}", e),
+            VfioError::GetGroupStatus(e) => write!(f, "failed to get Group Status: {}", e),
+            VfioError::GroupViable => write!(f, "group is inviable"),
+            VfioError::VfioApiVersion => write!(f, "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"),
+            VfioError::VfioType1V2 => write!(f, "container doesn't support VfioType1V2 IOMMU driver type"),
+            VfioError::GroupSetContainer(e) => write!(f, "failed to add vfio group into vfio container: {}", e),
+            VfioError::ContainerSetIOMMU(e) => write!(f, "failed to set container's IOMMU driver type as VfioType1V2: {}", e),
+            VfioError::GroupGetDeviceFD(e) => write!(f, "failed to get vfio device fd: {}", e),
+            VfioError::CreateVfioKvmDevice(e) => write!(f, "failed to create KVM vfio device: {}", e),
+            VfioError::KvmSetDeviceAttr(e) => write!(f, "failed to set KVM vfio device's attribute: {}", e),
+            VfioError::VfioDeviceGetInfo(e) => write!(f, "failed to get vfio device's info or info doesn't match: {}", e),
+            VfioError::VfioDeviceGetRegionInfo(e) => write!(f, "failed to get vfio device's region info: {}", e),
+            VfioError::InvalidPath => write!(f, "invalid file path"),
+            VfioError::IommuDmaMap(e) => write!(f, "failed to add guest memory map into iommu table: {}", e),
+            VfioError::IommuDmaUnmap(e) => write!(f, "failed to remove guest memory map from iommu table: {}", e),
+            VfioError::VfioIrqEnable(e) => write!(f, "failed to enable vfio device's irq: {}", e),
+            VfioError::VfioIrqDisable(e) => write!(f, "failed to disable vfio device's irq: {}", e),
+            VfioError::VfioIrqUnmask(e) => write!(f, "failed to unmask vfio device's irq: {}", e),
+            VfioError::VfioIrqMask(e) => write!(f, "failed to mask vfio device's irq: {}", e),
+        }
+    }
+}
+
+fn get_error() -> Error {
+    Error::last()
+}
+
+struct VfioContainer {
+    container: File,
+}
+
+const VFIO_API_VERSION: u8 = 0;
+impl VfioContainer {
+    fn new() -> Result<Self, VfioError> {
+        let container = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .open("/dev/vfio/vfio")
+            .map_err(VfioError::OpenContainer)?;
+
+        Ok(VfioContainer { container })
+    }
+
+    fn get_api_version(&self) -> i32 {
+        // Safe as file is vfio container fd and ioctl is defined by kernel.
+        unsafe { ioctl(self, VFIO_GET_API_VERSION()) }
+    }
+
+    fn check_extension(&self, val: u32) -> bool {
+        if val != VFIO_TYPE1_IOMMU && val != VFIO_TYPE1v2_IOMMU {
+            panic!("IOMMU type error");
+        }
+
+        // Safe as file is a vfio container fd and val has been validated above.
+        let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION(), val.into()) };
+        ret == 1
+    }
+
+    fn set_iommu(&self, val: u32) -> i32 {
+        if val != VFIO_TYPE1_IOMMU && val != VFIO_TYPE1v2_IOMMU {
+            panic!("IOMMU type error");
+        }
+
+        // Safe as file is a vfio container fd and val has been validated above.
+        unsafe { ioctl_with_val(self, VFIO_SET_IOMMU(), val.into()) }
+    }
+
+    unsafe fn vfio_dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioError> {
+        let dma_map = vfio_iommu_type1_dma_map {
+            argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
+            flags: VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+            vaddr: user_addr,
+            iova,
+            size,
+        };
+
+        let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA(), &dma_map);
+        if ret != 0 {
+            return Err(VfioError::IommuDmaMap(get_error()));
+        }
+
+        Ok(())
+    }
+
+    fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioError> {
+        let mut dma_unmap = vfio_iommu_type1_dma_unmap {
+            argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
+            flags: 0,
+            iova,
+            size,
+        };
+
+        // Safe as file is vfio container, dma_unmap is constructed by us, and
+        // we check the return value
+        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA(), &mut dma_unmap) };
+        if ret != 0 || dma_unmap.size != size {
+            return Err(VfioError::IommuDmaUnmap(get_error()));
+        }
+
+        Ok(())
+    }
+}
+
+impl AsRawFd for VfioContainer {
+    fn as_raw_fd(&self) -> RawFd {
+        self.container.as_raw_fd()
+    }
+}
+
+struct VfioGroup {
+    group: File,
+    container: VfioContainer,
+}
+
+impl VfioGroup {
+    fn new(id: u32, vm: &Vm) -> Result<Self, VfioError> {
+        let mut group_path = String::from("/dev/vfio/");
+        let s_id = &id;
+        group_path.push_str(s_id.to_string().as_str());
+
+        let group_file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .open(Path::new(&group_path))
+            .map_err(VfioError::OpenGroup)?;
+
+        let mut group_status = vfio_group_status {
+            argsz: mem::size_of::<vfio_group_status>() as u32,
+            flags: 0,
+        };
+        // Safe as we are the owner of group_file and group_status which are valid values.
+        let mut ret =
+            unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS(), &mut group_status) };
+        if ret < 0 {
+            return Err(VfioError::GetGroupStatus(get_error()));
+        }
+
+        if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
+            return Err(VfioError::GroupViable);
+        }
+
+        let container = VfioContainer::new()?;
+        if container.get_api_version() as u8 != VFIO_API_VERSION {
+            return Err(VfioError::VfioApiVersion);
+        }
+        if !container.check_extension(VFIO_TYPE1v2_IOMMU) {
+            return Err(VfioError::VfioType1V2);
+        }
+
+        // Safe as we are the owner of group_file and container_raw_fd which are valid values,
+        // and we verify the return value.
+        let container_raw_fd = container.as_raw_fd();
+        ret = unsafe { ioctl_with_ref(&group_file, VFIO_GROUP_SET_CONTAINER(), &container_raw_fd) };
+        if ret < 0 {
+            return Err(VfioError::GroupSetContainer(get_error()));
+        }
+
+        ret = container.set_iommu(VFIO_TYPE1v2_IOMMU);
+        if ret < 0 {
+            return Err(VfioError::ContainerSetIOMMU(get_error()));
+        }
+
+        Self::kvm_device_add_group(vm, &group_file)?;
+
+        Ok(VfioGroup {
+            group: group_file,
+            container,
+        })
+    }
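+    // The sequence above walks the standard VFIO bring-up: open /dev/vfio/$group_num,
+    // check that the group is viable, open the /dev/vfio/vfio container, verify the
+    // API version and the Type1v2 IOMMU extension, attach the group to the container,
+    // select the IOMMU driver, and register the group with KVM's VFIO device (below).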
+
+    fn kvm_device_add_group(vm: &Vm, group: &File) -> Result<File, VfioError> {
+        let mut vfio_dev = kvm_sys::kvm_create_device {
+            type_: kvm_sys::kvm_device_type_KVM_DEV_TYPE_VFIO,
+            fd: 0,
+            flags: 0,
+        };
+        vm.create_device(&mut vfio_dev)
+            .map_err(VfioError::CreateVfioKvmDevice)?;
+
+        // Safe as we are the owner of vfio_dev.fd which is a valid value.
+        let vfio_dev_fd = unsafe { File::from_raw_fd(vfio_dev.fd as i32) };
+
+        let group_fd = group.as_raw_fd();
+        let group_fd_ptr = &group_fd as *const i32;
+        let vfio_dev_attr = kvm_sys::kvm_device_attr {
+            flags: 0,
+            group: kvm_sys::KVM_DEV_VFIO_GROUP,
+            attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
+            addr: group_fd_ptr as u64,
+        };
+
+        // Safe as we are the owner of vfio_dev_fd and vfio_dev_attr which are valid values,
+        // and we verify the return value.
+        if 0 != unsafe {
+            ioctl_with_ref(&vfio_dev_fd, kvm_sys::KVM_SET_DEVICE_ATTR(), &vfio_dev_attr)
+        } {
+            return Err(VfioError::KvmSetDeviceAttr(get_error()));
+        }
+
+        Ok(vfio_dev_fd)
+    }
+
+    fn get_device(&self, name: &Path) -> Result<File, VfioError> {
+        let uuid_osstr = name.file_name().ok_or(VfioError::InvalidPath)?;
+        let uuid_str = uuid_osstr.to_str().ok_or(VfioError::InvalidPath)?;
+        let path: CString = CString::new(uuid_str.as_bytes()).expect("CString::new() failed");
+        let path_ptr = path.as_ptr();
+
+        // Safe as we are the owner of self and path_ptr which are valid values.
+        let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD(), path_ptr) };
+        if ret < 0 {
+            return Err(VfioError::GroupGetDeviceFD(get_error()));
+        }
+
+        // Safe as ret is valid FD
+        Ok(unsafe { File::from_raw_fd(ret) })
+    }
+}
+
+impl AsRawFd for VfioGroup {
+    fn as_raw_fd(&self) -> RawFd {
+        self.group.as_raw_fd()
+    }
+}
+
+/// Vfio Irq type used to enable/disable/mask/unmask vfio irq
+pub enum VfioIrqType {
+    Intx,
+    Msi,
+    Msix,
+}
+
+struct VfioRegion {
+    // flags for this region: read/write/mmap
+    flags: u32,
+    size: u64,
+    // region offset used to read/write with vfio device fd
+    offset: u64,
+    // vectors for mmap offset and size
+    mmaps: Vec<vfio_region_sparse_mmap_area>,
+    // type and subtype for cap type
+    cap_info: Option<(u32, u32)>,
+}
+
+/// Vfio device exposing regions which can be read from or written to via the kernel vfio device.
+pub struct VfioDevice {
+    dev: File,
+    group: VfioGroup,
+    // vec for vfio device's regions
+    regions: Vec<VfioRegion>,
+    guest_mem: GuestMemory,
+}
+
+impl VfioDevice {
+    /// Create a new vfio device; guest reads/writes on this device are then
+    /// forwarded to the kernel vfio driver.
+    /// sysfspath specifies the vfio device's path in the sys file system.
+    pub fn new(sysfspath: &Path, vm: &Vm, guest_mem: GuestMemory) -> Result<Self, VfioError> {
+        let mut uuid_path = PathBuf::new();
+        uuid_path.push(sysfspath);
+        uuid_path.push("iommu_group");
+        let group_path = uuid_path.read_link().map_err(|_| VfioError::InvalidPath)?;
+        let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
+        let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
+        let group_id = group_str
+            .parse::<u32>()
+            .map_err(|_| VfioError::InvalidPath)?;
+
+        let group = VfioGroup::new(group_id, vm)?;
+        let new_dev = group.get_device(sysfspath)?;
+        let dev_regions = Self::get_regions(&new_dev)?;
+
+        Ok(VfioDevice {
+            dev: new_dev,
+            group,
+            regions: dev_regions,
+            guest_mem,
+        })
+    }
+
+    /// Enable vfio device's irq and associate irqfd EventFds with the device.
+    /// When MSI-X is enabled, multiple vectors are supported, so fds is a vector whose
+    /// length equals the number of MSI-X vectors.
+    pub fn irq_enable(&self, fds: Vec<&EventFd>, irq_type: VfioIrqType) -> Result<(), VfioError> {
+        let count = fds.len();
+        let u32_size = mem::size_of::<u32>();
+        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
+        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
+        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+        match irq_type {
+            VfioIrqType::Intx => irq_set[0].index = VFIO_PCI_INTX_IRQ_INDEX,
+            VfioIrqType::Msi => irq_set[0].index = VFIO_PCI_MSI_IRQ_INDEX,
+            VfioIrqType::Msix => irq_set[0].index = VFIO_PCI_MSIX_IRQ_INDEX,
+        }
+        irq_set[0].start = 0;
+        irq_set[0].count = count as u32;
+
+        // irq_set.data may hold no data, bools, or fds depending on the flags, so its
+        // elements default to u8; here each fd is passed as a u32, so 4 u8 elements are
+        // combined into each u32. It is safe as enough space is reserved through
+        // vec_with_array_field::<vfio_irq_set, u32>(count).
+        let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
+        for fd in fds.iter().take(count) {
+            let (left, right) = data.split_at_mut(u32_size);
+            left.copy_from_slice(&fd.as_raw_fd().to_ne_bytes()[..]);
+            data = right;
+        }
+
+        // Safe as we are the owner of self and irq_set which are valid values
+        let ret = unsafe { ioctl_with_ref(self, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
+        if ret < 0 {
+            Err(VfioError::VfioIrqEnable(get_error()))
+        } else {
+            Ok(())
+        }
+    }
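+    // Illustrative usage sketch for irq_enable() (hypothetical names): for INTx a
+    // single EventFd is passed; for MSI-X, one EventFd per vector would be passed.
+    //
+    //     let intx_evt = EventFd::new().expect("failed to create EventFd");
+    //     vfio_dev.irq_enable(vec![&intx_evt], VfioIrqType::Intx)?;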
+
+    /// When INTx is enabled, an irqfd is used to trigger a level interrupt into the guest, and a
+    /// resample irqfd is used to receive the guest's EOI notification.
+    /// When the host hardware generates an interrupt, the vfio irq handler in the host kernel
+    /// receives and handles it: the handler first disables the hardware irq, then triggers the
+    /// irqfd to inject the interrupt into the guest. When the resample irqfd is triggered by the
+    /// guest EOI, the vfio kernel driver re-enables the hardware irq so it can generate further
+    /// interrupts.
+    /// This function enables the resample irqfd so the vfio kernel driver gets EOI notifications.
+    ///
+    /// fd: should be the resample IrqFd.
+    pub fn resample_virq_enable(&self, fd: &EventFd) -> Result<(), VfioError> {
+        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
+        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
+        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
+        irq_set[0].index = VFIO_PCI_INTX_IRQ_INDEX;
+        irq_set[0].start = 0;
+        irq_set[0].count = 1;
+
+        {
+            // irq_set.data may hold no data, bools, or fds depending on the flags, so its
+            // elements default to u8; here the fd is passed as a u32, so 4 u8 elements are
+            // combined into the u32. It is safe as enough space is reserved through
+            // vec_with_array_field::<vfio_irq_set, u32>(1).
+            let fds = unsafe { irq_set[0].data.as_mut_slice(4) };
+            fds.copy_from_slice(&fd.as_raw_fd().to_le_bytes()[..]);
+        }
+
+        // Safe as we are the owner of self and irq_set which are valid values
+        let ret = unsafe { ioctl_with_ref(self, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
+        if ret < 0 {
+            Err(VfioError::VfioIrqEnable(get_error()))
+        } else {
+            Ok(())
+        }
+    }
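+    // Illustrative sketch of INTx wiring (hypothetical names): the trigger EventFd is
+    // registered with irq_enable() and the resample EventFd with resample_virq_enable(),
+    // so the hardware irq is only re-enabled after the guest EOI.
+    //
+    //     vfio_dev.irq_enable(vec![&trigger_evt], VfioIrqType::Intx)?;
+    //     vfio_dev.resample_virq_enable(&resample_evt)?;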
+
+    /// Disable vfio device's irq and disconnect the irqfd EventFd from the device.
+    pub fn irq_disable(&self, irq_type: VfioIrqType) -> Result<(), VfioError> {
+        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
+        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
+        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+        match irq_type {
+            VfioIrqType::Intx => irq_set[0].index = VFIO_PCI_INTX_IRQ_INDEX,
+            VfioIrqType::Msi => irq_set[0].index = VFIO_PCI_MSI_IRQ_INDEX,
+            VfioIrqType::Msix => irq_set[0].index = VFIO_PCI_MSIX_IRQ_INDEX,
+        }
+        irq_set[0].start = 0;
+        irq_set[0].count = 0;
+
+        // Safe as we are the owner of self and irq_set which are valid values
+        let ret = unsafe { ioctl_with_ref(self, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
+        if ret < 0 {
+            Err(VfioError::VfioIrqDisable(get_error()))
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Unmask vfio device irq
+    pub fn irq_unmask(&self, irq_type: VfioIrqType) -> Result<(), VfioError> {
+        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
+        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
+        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+        match irq_type {
+            VfioIrqType::Intx => irq_set[0].index = VFIO_PCI_INTX_IRQ_INDEX,
+            VfioIrqType::Msi => irq_set[0].index = VFIO_PCI_MSI_IRQ_INDEX,
+            VfioIrqType::Msix => irq_set[0].index = VFIO_PCI_MSIX_IRQ_INDEX,
+        }
+        irq_set[0].start = 0;
+        irq_set[0].count = 1;
+
+        // Safe as we are the owner of self and irq_set which are valid values
+        let ret = unsafe { ioctl_with_ref(self, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
+        if ret < 0 {
+            Err(VfioError::VfioIrqUnmask(get_error()))
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Mask vfio device irq
+    pub fn irq_mask(&self, irq_type: VfioIrqType) -> Result<(), VfioError> {
+        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
+        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
+        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
+        match irq_type {
+            VfioIrqType::Intx => irq_set[0].index = VFIO_PCI_INTX_IRQ_INDEX,
+            VfioIrqType::Msi => irq_set[0].index = VFIO_PCI_MSI_IRQ_INDEX,
+            VfioIrqType::Msix => irq_set[0].index = VFIO_PCI_MSIX_IRQ_INDEX,
+        }
+        irq_set[0].start = 0;
+        irq_set[0].count = 1;
+
+        // Safe as we are the owner of self and irq_set which are valid values
+        let ret = unsafe { ioctl_with_ref(self, VFIO_DEVICE_SET_IRQS(), &irq_set[0]) };
+        if ret < 0 {
+            Err(VfioError::VfioIrqMask(get_error()))
+        } else {
+            Ok(())
+        }
+    }
+
+    #[allow(clippy::cast_ptr_alignment)]
+    fn get_regions(dev: &File) -> Result<Vec<VfioRegion>, VfioError> {
+        let mut regions: Vec<VfioRegion> = Vec::new();
+        let mut dev_info = vfio_device_info {
+            argsz: mem::size_of::<vfio_device_info>() as u32,
+            flags: 0,
+            num_regions: 0,
+            num_irqs: 0,
+        };
+        // Safe as we are the owner of dev and dev_info which are valid values,
+        // and we verify the return value.
+        let mut ret = unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_INFO(), &mut dev_info) };
+        if ret < 0
+            || (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) == 0
+            || dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
+            || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
+        {
+            return Err(VfioError::VfioDeviceGetInfo(get_error()));
+        }
+
+        for i in VFIO_PCI_BAR0_REGION_INDEX..dev_info.num_regions {
+            let argsz = mem::size_of::<vfio_region_info>() as u32;
+            let mut reg_info = vfio_region_info {
+                argsz,
+                flags: 0,
+                index: i,
+                cap_offset: 0,
+                size: 0,
+                offset: 0,
+            };
+            // Safe as we are the owner of dev and reg_info which are valid values,
+            // and we verify the return value.
+            ret = unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO(), &mut reg_info) };
+            if ret < 0 {
+                continue;
+            }
+
+            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
+            let mut cap_info: Option<(u32, u32)> = None;
+            if reg_info.argsz > argsz {
+                let cap_len: usize = (reg_info.argsz - argsz) as usize;
+                let mut region_with_cap =
+                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
+                region_with_cap[0].region_info.argsz = reg_info.argsz;
+                region_with_cap[0].region_info.flags = 0;
+                region_with_cap[0].region_info.index = i;
+                region_with_cap[0].region_info.cap_offset = 0;
+                region_with_cap[0].region_info.size = 0;
+                region_with_cap[0].region_info.offset = 0;
+                // Safe as we are the owner of dev and region_info which are valid values,
+                // and we verify the return value.
+                ret = unsafe {
+                    ioctl_with_mut_ref(
+                        dev,
+                        VFIO_DEVICE_GET_REGION_INFO(),
+                        &mut (region_with_cap[0].region_info),
+                    )
+                };
+                if ret < 0 {
+                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
+                }
+
+                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
+                    continue;
+                }
+
+                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
+                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
+                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
+                let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
+                let region_info_sz = reg_info.argsz;
+
+                // region_with_cap[0].cap_info may contain several structures, such as
+                // vfio_region_info_cap_sparse_mmap or vfio_region_info_cap_type.
+                // Both begin with a vfio_info_cap_header, so each individual cap is
+                // located through its vfio_info_cap_header.
+                // Go through all the cap structs.
+                let info_ptr = region_with_cap.as_ptr() as *mut u8;
+                let mut offset = region_with_cap[0].region_info.cap_offset;
+                while offset != 0 {
+                    if offset + cap_header_sz >= region_info_sz {
+                        break;
+                    }
+                    // Safe, as cap_header struct is in this function allocated region_with_cap
+                    // vec.
+                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
+                    let cap_header =
+                        unsafe { &*(cap_ptr as *mut u8 as *const vfio_info_cap_header) };
+                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
+                        if offset + mmap_cap_sz >= region_info_sz {
+                            break;
+                        }
+                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
+                        // Safe, this vfio_region_info_cap_sparse_mmap is in this function allocated
+                        // region_with_cap vec.
+                        let sparse_mmap = unsafe {
+                            &*(cap_ptr as *mut u8 as *const vfio_region_info_cap_sparse_mmap)
+                        };
+
+                        let area_num = sparse_mmap.nr_areas;
+                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
+                            break;
+                        }
+                        // Safe, these vfio_region_sparse_mmap_area are in this function allocated
+                        // region_with_cap vec.
+                        let areas =
+                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
+                        for area in areas.iter() {
+                            mmaps.push(area.clone());
+                        }
+                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
+                        if offset + type_cap_sz > region_info_sz {
+                            break;
+                        }
+                        // cap_ptr is vfio_region_info_cap_type here
+                        // Safe, this vfio_region_info_cap_type is in this function allocated
+                        // region_with_cap vec
+                        let cap_type_info =
+                            unsafe { &*(cap_ptr as *mut u8 as *const vfio_region_info_cap_type) };
+
+                        cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
+                    }
+
+                    offset = cap_header.next;
+                }
+            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
+                mmaps.push(vfio_region_sparse_mmap_area {
+                    offset: 0,
+                    size: reg_info.size,
+                });
+            }
+
+            let region = VfioRegion {
+                flags: reg_info.flags,
+                size: reg_info.size,
+                offset: reg_info.offset,
+                mmaps,
+                cap_info,
+            };
+            regions.push(region);
+        }
+
+        Ok(regions)
+    }
+
+    /// get a region's flags
+    /// the returned value may contain:
+    ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
+    ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
+    ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
+    ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
+    pub fn get_region_flags(&self, index: u32) -> u32 {
+        match self.regions.get(index as usize) {
+            Some(v) => v.flags,
+            None => {
+                warn!("get_region_flags() with invalid index: {}", index);
+                0
+            }
+        }
+    }
+
+    /// get a region's offset
+    /// return: Region offset from the start of vfio device fd
+    pub fn get_region_offset(&self, index: u32) -> u64 {
+        match self.regions.get(index as usize) {
+            Some(v) => v.offset,
+            None => {
+                warn!("get_region_offset with invalid index: {}", index);
+                0
+            }
+        }
+    }
+
+    /// get a region's mmap info vector
+    pub fn get_region_mmap(&self, index: u32) -> Vec<vfio_region_sparse_mmap_area> {
+        match self.regions.get(index as usize) {
+            Some(v) => v.mmaps.clone(),
+            None => {
+                warn!("get_region_mmap with invalid index: {}", index);
+                Vec::new()
+            }
+        }
+    }
+
+    /// find the specified cap type in device regions
+    /// Input:
+    ///      type_:  cap type
+    ///      sub_type: cap sub_type
+    /// Output:
+    ///     None: device doesn't have the specified cap type
+    ///     Some((bar_index, region_size)): device has the specified cap type, return region's
+    ///                                     index and size
+    pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
+        for (index, region) in self.regions.iter().enumerate() {
+            if let Some(cap_info) = &region.cap_info {
+                if cap_info.0 == type_ && cap_info.1 == sub_type {
+                    return Some((index as u32, region.size));
+                }
+            }
+        }
+
+        None
+    }
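+    // Illustrative sketch (hypothetical type/subtype values): a caller that knows a
+    // device-specific capability pair can locate and size the matching region.
+    //
+    //     if let Some((index, size)) = vfio_dev.get_cap_type_info(cap_type, cap_subtype) {
+    //         let offset = vfio_dev.get_region_offset(index);
+    //         // map or read `size` bytes of that region starting at `offset`.
+    //     }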
+
+    /// Read region's data from VFIO device into buf
+    /// index: region num
+    /// buf: data destination; the read size is buf's length
+    /// addr: offset in the region
+    pub fn region_read(&self, index: u32, buf: &mut [u8], addr: u64) {
+        let stub: &VfioRegion;
+        match self.regions.get(index as usize) {
+            Some(v) => stub = v,
+            None => {
+                warn!("region read with invalid index: {}", index);
+                return;
+            }
+        }
+
+        let size = buf.len() as u64;
+        if size > stub.size || addr + size > stub.size {
+            warn!(
+                "region read with invalid parameter, index: {}, add: {:x}, size: {:x}",
+                index, addr, size
+            );
+            return;
+        }
+
+        if let Err(e) = self.dev.read_exact_at(buf, stub.offset + addr) {
+            warn!(
+                "Failed to read region in index: {}, addr: {:x}, error: {}",
+                index, addr, e
+            );
+        }
+    }
+
+    /// write the data from buf into a vfio device region
+    /// index: region num
+    /// buf: data source; the write size is buf's length
+    /// addr: offset in the region
+    pub fn region_write(&self, index: u32, buf: &[u8], addr: u64) {
+        let stub: &VfioRegion;
+        match self.regions.get(index as usize) {
+            Some(v) => stub = v,
+            None => {
+                warn!("region write with invalid index: {}", index);
+                return;
+            }
+        }
+
+        let size = buf.len() as u64;
+        if size > stub.size
+            || addr + size > stub.size
+            || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
+        {
+            warn!(
+                "region write with invalid parameter,indxe: {}, add: {:x}, size: {:x}",
+                index, addr, size
+            );
+            return;
+        }
+
+        if let Err(e) = self.dev.write_all_at(buf, stub.offset + addr) {
+            warn!(
+                "Failed to write region in index: {}, addr: {:x}, error: {}",
+                index, addr, e
+            );
+        }
+    }
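+    // Illustrative sketch: reading the first 4 bytes of PCI config space
+    // (vendor and device ID) through the config region exposed by the kernel.
+    //
+    //     let mut id = [0u8; 4];
+    //     vfio_dev.region_read(VFIO_PCI_CONFIG_REGION_INDEX, &mut id, 0);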
+
+    /// get vfio device's fds which are passed into minijail process
+    pub fn keep_fds(&self) -> Vec<RawFd> {
+        let mut fds = Vec::new();
+        fds.push(self.as_raw_fd());
+        fds.push(self.group.as_raw_fd());
+        fds.push(self.group.container.as_raw_fd());
+        fds
+    }
+
+    /// Add (iova, user_addr) map into vfio container iommu table
+    pub unsafe fn vfio_dma_map(
+        &self,
+        iova: u64,
+        size: u64,
+        user_addr: u64,
+    ) -> Result<(), VfioError> {
+        self.group.container.vfio_dma_map(iova, size, user_addr)
+    }
+
+    /// Remove (iova, user_addr) map from vfio container iommu table
+    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioError> {
+        self.group.container.vfio_dma_unmap(iova, size)
+    }
+
+    /// Add all guest memory regions into the vfio container's iommu table,
+    /// so that the vfio kernel driver can access guest memory by gfn.
+    pub fn setup_dma_map(&self) -> Result<(), VfioError> {
+        self.guest_mem
+            .with_regions(|_index, guest_addr, size, host_addr, _fd_offset| {
+                // Safe because the guest regions are guaranteed not to overlap
+                unsafe { self.vfio_dma_map(guest_addr.0, size as u64, host_addr as u64) }
+            })?;
+        Ok(())
+    }
+}
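+// Illustrative construction sketch (hypothetical sysfs path): create the device from
+// its sysfs node, then map all guest memory into the container's IOMMU tables.
+//
+//     let vfio_dev = VfioDevice::new(Path::new("/sys/bus/pci/devices/0000:00:1f.6"),
+//                                    &vm, guest_mem.clone())?;
+//     vfio_dev.setup_dma_map()?;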
+
+impl AsRawFd for VfioDevice {
+    fn as_raw_fd(&self) -> RawFd {
+        self.dev.as_raw_fd()
+    }
+}
diff --git a/devices/src/virtio/balloon.rs b/devices/src/virtio/balloon.rs
index 451421b..83bcdcf 100644
--- a/devices/src/virtio/balloon.rs
+++ b/devices/src/virtio/balloon.rs
@@ -3,15 +3,13 @@
 // found in the LICENSE file.
 
 use std;
-use std::cmp;
 use std::fmt::{self, Display};
-use std::io::Write;
 use std::os::unix::io::{AsRawFd, RawFd};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 use std::thread;
 
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use data_model::{DataInit, Le32};
 use msg_socket::MsgReceiver;
 use sys_util::{
     self, error, info, warn, EventFd, GuestAddress, GuestMemory, PollContext, PollToken,
@@ -19,8 +17,7 @@
 use vm_control::{BalloonControlCommand, BalloonControlResponseSocket};
 
 use super::{
-    DescriptorChain, Queue, VirtioDevice, INTERRUPT_STATUS_CONFIG_CHANGED,
-    INTERRUPT_STATUS_USED_RING, TYPE_BALLOON, VIRTIO_F_VERSION_1,
+    copy_config, Interrupt, Queue, Reader, VirtioDevice, TYPE_BALLOON, VIRTIO_F_VERSION_1,
 };
 
 #[derive(Debug)]
@@ -54,6 +51,17 @@
 const VIRTIO_BALLOON_F_MUST_TELL_HOST: u32 = 0; // Tell before reclaiming pages
 const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u32 = 2; // Deflate balloon on OOM
 
+// virtio_balloon_config is the balloon device configuration space defined by the virtio spec.
+#[derive(Copy, Clone, Debug, Default)]
+#[repr(C)]
+struct virtio_balloon_config {
+    num_pages: Le32,
+    actual: Le32,
+}
+
+// Safe because it only has data and has no implicit padding.
+unsafe impl DataInit for virtio_balloon_config {}
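+// read_config()/write_config() below expose this struct to the guest byte-by-byte via
+// copy_config(), so num_pages and actual stay in little-endian (Le32) form.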
+
 // BalloonConfig is modified by the worker and read from the device thread.
 #[derive(Default)]
 struct BalloonConfig {
@@ -62,20 +70,14 @@
 }
 
 struct Worker {
+    interrupt: Interrupt,
     mem: GuestMemory,
     inflate_queue: Queue,
     deflate_queue: Queue,
-    interrupt_status: Arc<AtomicUsize>,
-    interrupt_evt: EventFd,
-    interrupt_resample_evt: EventFd,
     config: Arc<BalloonConfig>,
     command_socket: BalloonControlResponseSocket,
 }
 
-fn valid_inflate_desc(desc: &DescriptorChain) -> bool {
-    !desc.is_write_only() && desc.len % 4 == 0
-}
-
 impl Worker {
     fn process_inflate_deflate(&mut self, inflate: bool) -> bool {
         let queue = if inflate {
@@ -86,19 +88,38 @@
 
         let mut needs_interrupt = false;
         while let Some(avail_desc) = queue.pop(&self.mem) {
-            if inflate && valid_inflate_desc(&avail_desc) {
-                let num_addrs = avail_desc.len / 4;
-                for i in 0..num_addrs as usize {
-                    let addr = match avail_desc.addr.checked_add((i * 4) as u64) {
-                        Some(a) => a,
-                        None => break,
-                    };
-                    let guest_input: u32 = match self.mem.read_obj_from_addr(addr) {
-                        Ok(a) => a,
-                        Err(_) => continue,
+            let index = avail_desc.index;
+
+            if inflate {
+                let mut reader = match Reader::new(&self.mem, avail_desc) {
+                    Ok(r) => r,
+                    Err(e) => {
+                        error!("balloon: failed to create reader: {}", e);
+                        queue.add_used(&self.mem, index, 0);
+                        needs_interrupt = true;
+                        continue;
+                    }
+                };
+                let data_length = reader.available_bytes();
+
+                if data_length % 4 != 0 {
+                    error!("invalid inflate buffer size: {}", data_length);
+                    queue.add_used(&self.mem, index, 0);
+                    needs_interrupt = true;
+                    continue;
+                }
+
+                let num_addrs = data_length / 4;
+                for _ in 0..num_addrs as usize {
+                    let guest_input = match reader.read_obj::<Le32>() {
+                        Ok(a) => a.to_native(),
+                        Err(err) => {
+                            error!("error while reading unused pages: {}", err);
+                            break;
+                        }
                     };
                     let guest_address =
-                        GuestAddress((guest_input as u64) << VIRTIO_BALLOON_PFN_SHIFT);
+                        GuestAddress((u64::from(guest_input)) << VIRTIO_BALLOON_PFN_SHIFT);
 
                     if self
                         .mem
@@ -111,25 +132,13 @@
                 }
             }
 
-            queue.add_used(&self.mem, avail_desc.index, 0);
+            queue.add_used(&self.mem, index, 0);
             needs_interrupt = true;
         }
 
         needs_interrupt
     }
 
-    fn signal_used_queue(&self) {
-        self.interrupt_status
-            .fetch_or(INTERRUPT_STATUS_USED_RING as usize, Ordering::SeqCst);
-        self.interrupt_evt.write(1).unwrap();
-    }
-
-    fn signal_config_changed(&self) {
-        self.interrupt_status
-            .fetch_or(INTERRUPT_STATUS_CONFIG_CHANGED as usize, Ordering::SeqCst);
-        self.interrupt_evt.write(1).unwrap();
-    }
-
     fn run(&mut self, mut queue_evts: Vec<EventFd>, kill_evt: EventFd) {
         #[derive(PartialEq, PollToken)]
         enum Token {
@@ -143,19 +152,13 @@
         let inflate_queue_evt = queue_evts.remove(0);
         let deflate_queue_evt = queue_evts.remove(0);
 
-        let poll_ctx: PollContext<Token> = match PollContext::new()
-            .and_then(|pc| pc.add(&inflate_queue_evt, Token::Inflate).and(Ok(pc)))
-            .and_then(|pc| pc.add(&deflate_queue_evt, Token::Deflate).and(Ok(pc)))
-            .and_then(|pc| {
-                pc.add(&self.command_socket, Token::CommandSocket)
-                    .and(Ok(pc))
-            })
-            .and_then(|pc| {
-                pc.add(&self.interrupt_resample_evt, Token::InterruptResample)
-                    .and(Ok(pc))
-            })
-            .and_then(|pc| pc.add(&kill_evt, Token::Kill).and(Ok(pc)))
-        {
+        let poll_ctx: PollContext<Token> = match PollContext::build_with(&[
+            (&inflate_queue_evt, Token::Inflate),
+            (&deflate_queue_evt, Token::Deflate),
+            (&self.command_socket, Token::CommandSocket),
+            (self.interrupt.get_resample_evt(), Token::InterruptResample),
+            (&kill_evt, Token::Kill),
+        ]) {
             Ok(pc) => pc,
             Err(e) => {
                 error!("failed creating PollContext: {}", e);
@@ -172,7 +175,8 @@
                 }
             };
 
-            let mut needs_interrupt = false;
+            let mut needs_interrupt_inflate = false;
+            let mut needs_interrupt_deflate = false;
             for event in events.iter_readable() {
                 match event.token() {
                     Token::Inflate => {
@@ -180,14 +184,14 @@
                             error!("failed reading inflate queue EventFd: {}", e);
                             break 'poll;
                         }
-                        needs_interrupt |= self.process_inflate_deflate(true);
+                        needs_interrupt_inflate |= self.process_inflate_deflate(true);
                     }
                     Token::Deflate => {
                         if let Err(e) = deflate_queue_evt.read() {
                             error!("failed reading deflate queue EventFd: {}", e);
                             break 'poll;
                         }
-                        needs_interrupt |= self.process_inflate_deflate(false);
+                        needs_interrupt_deflate |= self.process_inflate_deflate(false);
                     }
                     Token::CommandSocket => {
                         if let Ok(req) = self.command_socket.recv() {
@@ -198,16 +202,13 @@
                                     info!("ballon config changed to consume {} pages", num_pages);
 
                                     self.config.num_pages.store(num_pages, Ordering::Relaxed);
-                                    self.signal_config_changed();
+                                    self.interrupt.signal_config_changed();
                                 }
                             };
                         }
                     }
                     Token::InterruptResample => {
-                        let _ = self.interrupt_resample_evt.read();
-                        if self.interrupt_status.load(Ordering::SeqCst) != 0 {
-                            self.interrupt_evt.write(1).unwrap();
-                        }
+                        self.interrupt.interrupt_resample();
                     }
                     Token::Kill => break 'poll,
                 }
@@ -219,8 +220,13 @@
                     let _ = poll_ctx.delete(&self.command_socket);
                 }
             }
-            if needs_interrupt {
-                self.signal_used_queue();
+
+            if needs_interrupt_inflate {
+                self.interrupt.signal_used_queue(self.inflate_queue.vector);
+            }
+
+            if needs_interrupt_deflate {
+                self.interrupt.signal_used_queue(self.deflate_queue.vector);
             }
         }
     }
@@ -232,6 +238,7 @@
     config: Arc<BalloonConfig>,
     features: u64,
     kill_evt: Option<EventFd>,
+    worker_thread: Option<thread::JoinHandle<Worker>>,
 }
 
 impl Balloon {
@@ -244,10 +251,20 @@
                 actual_pages: AtomicUsize::new(0),
             }),
             kill_evt: None,
+            worker_thread: None,
             // TODO(dgreid) - Add stats queue feature.
             features: 1 << VIRTIO_BALLOON_F_MUST_TELL_HOST | 1 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
         })
     }
+
+    fn get_config(&self) -> virtio_balloon_config {
+        let num_pages = self.config.num_pages.load(Ordering::Relaxed) as u32;
+        let actual_pages = self.config.actual_pages.load(Ordering::Relaxed) as u32;
+        virtio_balloon_config {
+            num_pages: num_pages.into(),
+            actual: actual_pages.into(),
+        }
+    }
 }
 
 impl Drop for Balloon {
@@ -256,6 +273,10 @@
             // Ignore the result because there is nothing we can do with a failure.
             let _ = kill_evt.write(1);
         }
+
+        if let Some(worker_thread) = self.worker_thread.take() {
+            let _ = worker_thread.join();
+        }
     }
 }
 
@@ -272,37 +293,16 @@
         QUEUE_SIZES
     }
 
-    fn read_config(&self, offset: u64, mut data: &mut [u8]) {
-        if offset >= 8 {
-            return;
-        }
-        let num_pages = self.config.num_pages.load(Ordering::Relaxed) as u32;
-        let actual_pages = self.config.actual_pages.load(Ordering::Relaxed) as u32;
-        let mut config = [0u8; 8];
-        // These writes can't fail as they fit in the declared array so unwrap is fine.
-        (&mut config[0..])
-            .write_u32::<LittleEndian>(num_pages)
-            .unwrap();
-        (&mut config[4..])
-            .write_u32::<LittleEndian>(actual_pages)
-            .unwrap();
-        if let Some(end) = offset.checked_add(data.len() as u64) {
-            // This write can't fail, offset and end are checked against the length of config.
-            data.write_all(&config[offset as usize..cmp::min(end, 8) as usize])
-                .unwrap();
-        }
+    fn read_config(&self, offset: u64, data: &mut [u8]) {
+        copy_config(data, 0, self.get_config().as_slice(), offset);
     }
 
-    fn write_config(&mut self, offset: u64, mut data: &[u8]) {
-        // Only allow writing to `actual` pages from the guest.
-        if offset != 4 || data.len() != 4 {
-            return;
-        }
-        // This read can't fail as it fits in the declared array so unwrap is fine.
-        let new_actual: u32 = data.read_u32::<LittleEndian>().unwrap();
+    fn write_config(&mut self, offset: u64, data: &[u8]) {
+        let mut config = self.get_config();
+        copy_config(config.as_mut_slice(), offset, data, 0);
         self.config
             .actual_pages
-            .store(new_actual as usize, Ordering::Relaxed);
+            .store(config.actual.to_native() as usize, Ordering::Relaxed);
     }
 
     fn features(&self) -> u64 {
@@ -318,9 +318,7 @@
     fn activate(
         &mut self,
         mem: GuestMemory,
-        interrupt_evt: EventFd,
-        interrupt_resample_evt: EventFd,
-        status: Arc<AtomicUsize>,
+        interrupt: Interrupt,
         mut queues: Vec<Queue>,
         queue_evts: Vec<EventFd>,
     ) {
@@ -343,20 +341,47 @@
             .name("virtio_balloon".to_string())
             .spawn(move || {
                 let mut worker = Worker {
+                    interrupt,
                     mem,
                     inflate_queue: queues.remove(0),
                     deflate_queue: queues.remove(0),
-                    interrupt_status: status,
-                    interrupt_evt,
-                    interrupt_resample_evt,
                     command_socket,
                     config,
                 };
                 worker.run(queue_evts, kill_evt);
+                worker
             });
-        if let Err(e) = worker_result {
-            error!("failed to spawn virtio_balloon worker: {}", e);
-            return;
+
+        match worker_result {
+            Err(e) => {
+                error!("failed to spawn virtio_balloon worker: {}", e);
+            }
+            Ok(join_handle) => {
+                self.worker_thread = Some(join_handle);
+            }
         }
     }
+
+    fn reset(&mut self) -> bool {
+        if let Some(kill_evt) = self.kill_evt.take() {
+            if kill_evt.write(1).is_err() {
+                error!("{}: failed to notify the kill event", self.debug_label());
+                return false;
+            }
+        }
+
+        if let Some(worker_thread) = self.worker_thread.take() {
+            match worker_thread.join() {
+                Err(_) => {
+                    error!("{}: failed to get back resources", self.debug_label());
+                    return false;
+                }
+                Ok(worker) => {
+                    self.command_socket = Some(worker.command_socket);
+                    return true;
+                }
+            }
+        }
+        false
+    }
 }
diff --git a/devices/src/virtio/block.rs b/devices/src/virtio/block.rs
index 59cc51a..1e9c63e 100644
--- a/devices/src/virtio/block.rs
+++ b/devices/src/virtio/block.rs
@@ -2,33 +2,29 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-use std::cmp;
+use std::cmp::{max, min};
 use std::fmt::{self, Display};
-use std::io::{self, Seek, SeekFrom, Write};
-use std::mem::{size_of, size_of_val};
+use std::io::{self, Write};
+use std::mem::size_of;
 use std::os::unix::io::{AsRawFd, RawFd};
 use std::result;
-use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 use std::thread;
 use std::time::Duration;
 use std::u32;
 
+use data_model::{DataInit, Le16, Le32, Le64};
+use disk::DiskFile;
+use msg_socket::{MsgReceiver, MsgSender};
 use sync::Mutex;
 use sys_util::Error as SysError;
 use sys_util::Result as SysResult;
-use sys_util::{
-    error, info, warn, EventFd, FileReadWriteVolatile, FileSetLen, FileSync, GuestAddress,
-    GuestMemory, GuestMemoryError, PollContext, PollToken, PunchHole, TimerFd, WriteZeroes,
-};
-
-use data_model::{DataInit, Le16, Le32, Le64, VolatileMemory, VolatileMemoryError};
-use msg_socket::{MsgReceiver, MsgSender};
+use sys_util::{error, info, iov_max, warn, EventFd, GuestMemory, PollContext, PollToken, TimerFd};
 use vm_control::{DiskControlCommand, DiskControlResponseSocket, DiskControlResult};
 
 use super::{
-    DescriptorChain, Queue, VirtioDevice, INTERRUPT_STATUS_CONFIG_CHANGED,
-    INTERRUPT_STATUS_USED_RING, TYPE_BLOCK, VIRTIO_F_VERSION_1,
+    copy_config, DescriptorChain, DescriptorError, Interrupt, Queue, Reader, VirtioDevice, Writer,
+    TYPE_BLOCK, VIRTIO_F_VERSION_1,
 };
 
 const QUEUE_SIZE: u16 = 256;
@@ -37,6 +33,9 @@
 const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT;
 const MAX_DISCARD_SECTORS: u32 = u32::MAX;
 const MAX_WRITE_ZEROES_SECTORS: u32 = u32::MAX;
+// Arbitrary limits for number of discard/write zeroes segments.
+const MAX_DISCARD_SEG: u32 = 32;
+const MAX_WRITE_ZEROES_SEG: u32 = 32;
 // Hard-coded to 64 KiB (in 512-byte sectors) for now,
 // but this should probably be based on cluster size for qcow.
 const DISCARD_SECTOR_ALIGNMENT: u32 = 128;
@@ -51,6 +50,7 @@
 const VIRTIO_BLK_S_IOERR: u8 = 1;
 const VIRTIO_BLK_S_UNSUPP: u8 = 2;
 
+const VIRTIO_BLK_F_SEG_MAX: u32 = 2;
 const VIRTIO_BLK_F_RO: u32 = 5;
 const VIRTIO_BLK_F_BLK_SIZE: u32 = 6;
 const VIRTIO_BLK_F_FLUSH: u32 = 9;
@@ -101,6 +101,17 @@
 }
 
 // Safe because it only has data and has no implicit padding.
+unsafe impl DataInit for virtio_blk_req_header {}
+
+#[derive(Copy, Clone, Debug, Default)]
+#[repr(C)]
+struct virtio_blk_req_header {
+    req_type: Le32,
+    reserved: Le32,
+    sector: Le64,
+}
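+// The header occupies the first 16 bytes of each request: a little-endian request type,
+// 4 reserved bytes, and the starting sector, per the virtio-blk specification.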
+
+// Safe because it only has data and has no implicit padding.
 unsafe impl DataInit for virtio_blk_config {}
 
 #[derive(Copy, Clone, Debug, Default)]
@@ -116,139 +127,23 @@
 // Safe because it only has data and has no implicit padding.
 unsafe impl DataInit for virtio_blk_discard_write_zeroes {}
 
-pub trait DiskFile:
-    FileSetLen + FileSync + FileReadWriteVolatile + PunchHole + Seek + WriteZeroes
-{
-}
-impl<D: FileSetLen + FileSync + PunchHole + FileReadWriteVolatile + Seek + WriteZeroes> DiskFile
-    for D
-{
-}
-
-#[derive(Copy, Clone, Debug, PartialEq)]
-enum RequestType {
-    In,
-    Out,
-    Flush,
-    Discard,
-    WriteZeroes,
-    Unsupported(u32),
-}
-
-impl Display for RequestType {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        use self::RequestType::*;
-
-        match self {
-            In => write!(f, "in"),
-            Out => write!(f, "out"),
-            Flush => write!(f, "flush"),
-            Discard => write!(f, "discard"),
-            WriteZeroes => write!(f, "write zeroes"),
-            Unsupported(n) => write!(f, "unsupported({})", n),
-        }
-    }
-}
-
-#[derive(Debug)]
-enum ParseError {
-    /// Guest gave us bad memory addresses
-    GuestMemory(GuestMemoryError),
-    /// Guest gave us offsets that would have overflowed a usize.
-    CheckedOffset(GuestAddress, u64),
-    /// Guest gave us a write only descriptor that protocol says to read from.
-    UnexpectedWriteOnlyDescriptor,
-    /// Guest gave us a read only descriptor that protocol says to write to.
-    UnexpectedReadOnlyDescriptor,
-    /// Guest gave us too few descriptors in a descriptor chain.
-    DescriptorChainTooShort,
-    /// Guest gave us a descriptor that was too short to use.
-    DescriptorLengthTooSmall,
-}
-
-impl Display for ParseError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        use self::ParseError::*;
-
-        match self {
-            GuestMemory(e) => write!(f, "bad guest memory address: {}", e),
-            CheckedOffset(addr, offset) => write!(f, "{}+{} would overflow a usize", addr, offset),
-            UnexpectedWriteOnlyDescriptor => write!(f, "unexpected write-only descriptor"),
-            UnexpectedReadOnlyDescriptor => write!(f, "unexpected read-only descriptor"),
-            DescriptorChainTooShort => write!(f, "descriptor chain too short"),
-            DescriptorLengthTooSmall => write!(f, "descriptor length too small"),
-        }
-    }
-}
-
-fn request_type(
-    mem: &GuestMemory,
-    desc_addr: GuestAddress,
-) -> result::Result<RequestType, ParseError> {
-    let type_ = mem
-        .read_obj_from_addr(desc_addr)
-        .map_err(ParseError::GuestMemory)?;
-    match type_ {
-        VIRTIO_BLK_T_IN => Ok(RequestType::In),
-        VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
-        VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
-        VIRTIO_BLK_T_DISCARD => Ok(RequestType::Discard),
-        VIRTIO_BLK_T_WRITE_ZEROES => Ok(RequestType::WriteZeroes),
-        t => Ok(RequestType::Unsupported(t)),
-    }
-}
-
-fn sector(mem: &GuestMemory, desc_addr: GuestAddress) -> result::Result<u64, ParseError> {
-    const SECTOR_OFFSET: u64 = 8;
-    let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
-        Some(v) => v,
-        None => return Err(ParseError::CheckedOffset(desc_addr, SECTOR_OFFSET)),
-    };
-
-    mem.read_obj_from_addr(addr)
-        .map_err(ParseError::GuestMemory)
-}
-
-fn discard_write_zeroes_segment(
-    mem: &GuestMemory,
-    seg_addr: GuestAddress,
-) -> result::Result<virtio_blk_discard_write_zeroes, ParseError> {
-    mem.read_obj_from_addr(seg_addr)
-        .map_err(ParseError::GuestMemory)
-}
-
 #[derive(Debug)]
 enum ExecuteError {
+    Descriptor(DescriptorError),
+    Read(io::Error),
+    WriteStatus(io::Error),
     /// Error arming the flush timer.
     Flush(io::Error),
-    ReadVolatile {
-        addr: GuestAddress,
-        length: u32,
-        sector: u64,
-        volatile_memory_error: VolatileMemoryError,
-    },
     ReadIo {
-        addr: GuestAddress,
-        length: u32,
+        length: usize,
         sector: u64,
-        io_error: io::Error,
-    },
-    Seek {
-        ioerr: io::Error,
-        sector: u64,
+        desc_error: io::Error,
     },
     TimerFd(SysError),
-    WriteVolatile {
-        addr: GuestAddress,
-        length: u32,
-        sector: u64,
-        volatile_memory_error: VolatileMemoryError,
-    },
     WriteIo {
-        addr: GuestAddress,
-        length: u32,
+        length: usize,
         sector: u64,
-        io_error: io::Error,
+        desc_error: io::Error,
     },
     DiscardWriteZeroes {
         ioerr: Option<io::Error>,
@@ -257,9 +152,10 @@
         flags: u32,
     },
     ReadOnly {
-        request_type: RequestType,
+        request_type: u32,
     },
     OutOfRange,
+    MissingStatus,
     Unsupported(u32),
 }
 
@@ -268,48 +164,28 @@
         use self::ExecuteError::*;
 
         match self {
+            Descriptor(e) => write!(f, "virtio descriptor error: {}", e),
+            Read(e) => write!(f, "failed to read message: {}", e),
+            WriteStatus(e) => write!(f, "failed to write request status: {}", e),
             Flush(e) => write!(f, "failed to flush: {}", e),
-            ReadVolatile {
-                addr,
-                length,
-                sector,
-                volatile_memory_error,
-            } => write!(
-                f,
-                "memory error reading {} bytes from sector {} to address {}: {}",
-                length, sector, addr, volatile_memory_error,
-            ),
             ReadIo {
-                addr,
                 length,
                 sector,
-                io_error,
+                desc_error,
             } => write!(
                 f,
-                "io error reading {} bytes from sector {} to address {}: {}",
-                length, sector, addr, io_error,
+                "io error reading {} bytes from sector {}: {}",
+                length, sector, desc_error,
             ),
-            Seek { ioerr, sector } => write!(f, "failed to seek to sector {}: {}", sector, ioerr),
             TimerFd(e) => write!(f, "{}", e),
-            WriteVolatile {
-                addr,
-                length,
-                sector,
-                volatile_memory_error,
-            } => write!(
-                f,
-                "memory error writing {} bytes from address {} to sector {}: {}",
-                length, addr, sector, volatile_memory_error,
-            ),
             WriteIo {
-                addr,
                 length,
                 sector,
-                io_error,
+                desc_error,
             } => write!(
                 f,
-                "io error writing {} bytes from address {} to sector {}: {}",
-                length, addr, sector, io_error,
+                "io error writing {} bytes to sector {}: {}",
+                length, sector, desc_error,
             ),
             DiscardWriteZeroes {
                 ioerr: Some(ioerr),
@@ -333,6 +209,7 @@
             ),
             ReadOnly { request_type } => write!(f, "read only; request_type={}", request_type),
             OutOfRange => write!(f, "out of range"),
+            MissingStatus => write!(f, "not enough space in descriptor chain to write status"),
             Unsupported(n) => write!(f, "unsupported ({})", n),
         }
     }
@@ -341,177 +218,355 @@
 impl ExecuteError {
     fn status(&self) -> u8 {
         match self {
+            ExecuteError::Descriptor(_) => VIRTIO_BLK_S_IOERR,
+            ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
+            ExecuteError::WriteStatus(_) => VIRTIO_BLK_S_IOERR,
             ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
             ExecuteError::ReadIo { .. } => VIRTIO_BLK_S_IOERR,
-            ExecuteError::ReadVolatile { .. } => VIRTIO_BLK_S_IOERR,
-            ExecuteError::Seek { .. } => VIRTIO_BLK_S_IOERR,
             ExecuteError::TimerFd(_) => VIRTIO_BLK_S_IOERR,
             ExecuteError::WriteIo { .. } => VIRTIO_BLK_S_IOERR,
-            ExecuteError::WriteVolatile { .. } => VIRTIO_BLK_S_IOERR,
             ExecuteError::DiscardWriteZeroes { .. } => VIRTIO_BLK_S_IOERR,
             ExecuteError::ReadOnly { .. } => VIRTIO_BLK_S_IOERR,
             ExecuteError::OutOfRange { .. } => VIRTIO_BLK_S_IOERR,
+            ExecuteError::MissingStatus => VIRTIO_BLK_S_IOERR,
             ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
         }
     }
 }
 
-struct Request {
-    request_type: RequestType,
-    sector: u64,
-    data_addr: GuestAddress,
-    data_len: u32,
-    status_addr: GuestAddress,
-    discard_write_zeroes_seg: Option<virtio_blk_discard_write_zeroes>,
+struct Worker {
+    interrupt: Interrupt,
+    queues: Vec<Queue>,
+    mem: GuestMemory,
+    disk_image: Box<dyn DiskFile>,
+    disk_size: Arc<Mutex<u64>>,
+    read_only: bool,
+    sparse: bool,
+    control_socket: DiskControlResponseSocket,
 }
 
-impl Request {
-    fn parse(
-        avail_desc: &DescriptorChain,
-        mem: &GuestMemory,
-    ) -> result::Result<Request, ParseError> {
-        // The head contains the request type which MUST be readable.
-        if avail_desc.is_write_only() {
-            return Err(ParseError::UnexpectedWriteOnlyDescriptor);
-        }
-
-        let req_type = request_type(&mem, avail_desc.addr)?;
-        if req_type == RequestType::Flush {
-            Request::parse_flush(avail_desc, mem)
-        } else if req_type == RequestType::Discard || req_type == RequestType::WriteZeroes {
-            Request::parse_discard_write_zeroes(avail_desc, mem, req_type)
-        } else {
-            Request::parse_read_write(avail_desc, mem, req_type)
-        }
-    }
-
-    fn parse_flush(
-        avail_desc: &DescriptorChain,
-        mem: &GuestMemory,
-    ) -> result::Result<Request, ParseError> {
-        let sector = sector(&mem, avail_desc.addr)?;
-        let status_desc = avail_desc
-            .next_descriptor()
-            .ok_or(ParseError::DescriptorChainTooShort)?;
-
-        // The status MUST always be writable
-        if !status_desc.is_write_only() {
-            return Err(ParseError::UnexpectedReadOnlyDescriptor);
-        }
-
-        if status_desc.len < 1 {
-            return Err(ParseError::DescriptorLengthTooSmall);
-        }
-
-        Ok(Request {
-            request_type: RequestType::Flush,
-            sector,
-            data_addr: GuestAddress(0),
-            data_len: 0,
-            status_addr: status_desc.addr,
-            discard_write_zeroes_seg: None,
-        })
-    }
-
-    fn parse_discard_write_zeroes(
-        avail_desc: &DescriptorChain,
-        mem: &GuestMemory,
-        req_type: RequestType,
-    ) -> result::Result<Request, ParseError> {
-        let seg_desc = avail_desc
-            .next_descriptor()
-            .ok_or(ParseError::DescriptorChainTooShort)?;
-        let status_desc = seg_desc
-            .next_descriptor()
-            .ok_or(ParseError::DescriptorChainTooShort)?;
-
-        if seg_desc.is_write_only() {
-            return Err(ParseError::UnexpectedWriteOnlyDescriptor);
-        }
-
-        // For simplicity, we currently only support a single segment
-        // for discard and write zeroes commands.  This allows the
-        // request to be represented as a single Request object.
-        if seg_desc.len < size_of::<virtio_blk_discard_write_zeroes>() as u32 {
-            return Err(ParseError::DescriptorLengthTooSmall);
-        }
-
-        let seg = discard_write_zeroes_segment(&mem, seg_desc.addr)?;
-
-        // The status MUST always be writable
-        if !status_desc.is_write_only() {
-            return Err(ParseError::UnexpectedReadOnlyDescriptor);
-        }
-
-        if status_desc.len < 1 {
-            return Err(ParseError::DescriptorLengthTooSmall);
-        }
-
-        Ok(Request {
-            request_type: req_type,
-            sector: 0,
-            data_addr: GuestAddress(0),
-            data_len: 0,
-            status_addr: status_desc.addr,
-            discard_write_zeroes_seg: Some(seg),
-        })
-    }
-
-    fn parse_read_write(
-        avail_desc: &DescriptorChain,
-        mem: &GuestMemory,
-        req_type: RequestType,
-    ) -> result::Result<Request, ParseError> {
-        let sector = sector(&mem, avail_desc.addr)?;
-        let data_desc = avail_desc
-            .next_descriptor()
-            .ok_or(ParseError::DescriptorChainTooShort)?;
-        let status_desc = data_desc
-            .next_descriptor()
-            .ok_or(ParseError::DescriptorChainTooShort)?;
-
-        if data_desc.is_write_only() && req_type == RequestType::Out {
-            return Err(ParseError::UnexpectedWriteOnlyDescriptor);
-        }
-
-        if !data_desc.is_write_only() && req_type == RequestType::In {
-            return Err(ParseError::UnexpectedReadOnlyDescriptor);
-        }
-
-        // The status MUST always be writable
-        if !status_desc.is_write_only() {
-            return Err(ParseError::UnexpectedReadOnlyDescriptor);
-        }
-
-        if status_desc.len < 1 {
-            return Err(ParseError::DescriptorLengthTooSmall);
-        }
-
-        Ok(Request {
-            request_type: req_type,
-            sector,
-            data_addr: data_desc.addr,
-            data_len: data_desc.len,
-            status_addr: status_desc.addr,
-            discard_write_zeroes_seg: None,
-        })
-    }
-
-    fn execute<T: DiskFile>(
-        &self,
+impl Worker {
+    fn process_one_request(
+        avail_desc: DescriptorChain,
         read_only: bool,
-        disk: &mut T,
+        sparse: bool,
+        disk: &mut dyn DiskFile,
         disk_size: u64,
         flush_timer: &mut TimerFd,
         flush_timer_armed: &mut bool,
         mem: &GuestMemory,
-    ) -> result::Result<u32, ExecuteError> {
+    ) -> result::Result<usize, ExecuteError> {
+        let mut reader = Reader::new(mem, avail_desc.clone()).map_err(ExecuteError::Descriptor)?;
+        let mut writer = Writer::new(mem, avail_desc).map_err(ExecuteError::Descriptor)?;
+
+        // The last byte of the buffer is virtio_blk_req::status.
+        // Split it off into a separate Writer so that status_writer covers only that final
+        // byte and the original writer is left with just the actual block I/O data.
+        let available_bytes = writer.available_bytes();
+        let status_offset = available_bytes
+            .checked_sub(1)
+            .ok_or(ExecuteError::MissingStatus)?;
+        let mut status_writer = writer
+            .split_at(status_offset)
+            .map_err(ExecuteError::Descriptor)?;
+
+        let status = match Block::execute_request(
+            &mut reader,
+            &mut writer,
+            read_only,
+            sparse,
+            disk,
+            disk_size,
+            flush_timer,
+            flush_timer_armed,
+        ) {
+            Ok(()) => VIRTIO_BLK_S_OK,
+            Err(e) => {
+                error!("failed executing disk request: {}", e);
+                e.status()
+            }
+        };
+
+        status_writer
+            .write_all(&[status])
+            .map_err(ExecuteError::WriteStatus)?;
+        Ok(available_bytes)
+    }
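The writable side of a virtio-blk descriptor chain ends with a single status byte, so the data region handed to execute_request is everything before that last byte. A minimal sketch, not part of the change itself, of that split-point arithmetic:

```rust
// Sketch only: the final writable byte is virtio_blk_req::status, so the data
// region is whatever precedes it. `None` models ExecuteError::MissingStatus,
// i.e. a chain whose writable side has no room for a status byte.
fn status_split_point(writable_len: usize) -> Option<usize> {
    writable_len.checked_sub(1)
}

fn main() {
    assert_eq!(status_split_point(513), Some(512)); // 512 data bytes + 1 status byte
    assert_eq!(status_split_point(0), None);        // no room for a status byte
}
```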
+
+    fn process_queue(
+        &mut self,
+        queue_index: usize,
+        flush_timer: &mut TimerFd,
+        flush_timer_armed: &mut bool,
+    ) {
+        let queue = &mut self.queues[queue_index];
+
+        let disk_size = self.disk_size.lock();
+
+        while let Some(avail_desc) = queue.pop(&self.mem) {
+            queue.set_notify(&self.mem, false);
+            let desc_index = avail_desc.index;
+
+            let len = match Worker::process_one_request(
+                avail_desc,
+                self.read_only,
+                self.sparse,
+                &mut *self.disk_image,
+                *disk_size,
+                flush_timer,
+                flush_timer_armed,
+                &self.mem,
+            ) {
+                Ok(len) => len,
+                Err(e) => {
+                    error!("block: failed to handle request: {}", e);
+                    0
+                }
+            };
+
+            queue.add_used(&self.mem, desc_index, len as u32);
+            queue.trigger_interrupt(&self.mem, &self.interrupt);
+            queue.set_notify(&self.mem, true);
+        }
+    }
+
+    fn resize(&mut self, new_size: u64) -> DiskControlResult {
+        if self.read_only {
+            error!("Attempted to resize read-only block device");
+            return DiskControlResult::Err(SysError::new(libc::EROFS));
+        }
+
+        info!("Resizing block device to {} bytes", new_size);
+
+        if let Err(e) = self.disk_image.set_len(new_size) {
+            error!("Resizing disk failed! {}", e);
+            return DiskControlResult::Err(SysError::new(libc::EIO));
+        }
+
+        if !self.sparse {
+            // Allocate new space if the disk image is not sparse.
+            if let Err(e) = self.disk_image.allocate(0, new_size) {
+                error!("Allocating disk space after resize failed! {}", e);
+                return DiskControlResult::Err(SysError::new(libc::EIO));
+            }
+        }
+
+        if let Ok(new_disk_size) = self.disk_image.get_len() {
+            let mut disk_size = self.disk_size.lock();
+            *disk_size = new_disk_size;
+        }
+        DiskControlResult::Ok
+    }
+
+    fn run(&mut self, queue_evt: EventFd, kill_evt: EventFd) {
+        #[derive(PollToken)]
+        enum Token {
+            FlushTimer,
+            QueueAvailable,
+            ControlRequest,
+            InterruptResample,
+            Kill,
+        }
+
+        let mut flush_timer = match TimerFd::new() {
+            Ok(t) => t,
+            Err(e) => {
+                error!("Failed to create the flush timer: {}", e);
+                return;
+            }
+        };
+        let mut flush_timer_armed = false;
+
+        let poll_ctx: PollContext<Token> = match PollContext::build_with(&[
+            (&flush_timer, Token::FlushTimer),
+            (&queue_evt, Token::QueueAvailable),
+            (&self.control_socket, Token::ControlRequest),
+            (self.interrupt.get_resample_evt(), Token::InterruptResample),
+            (&kill_evt, Token::Kill),
+        ]) {
+            Ok(pc) => pc,
+            Err(e) => {
+                error!("failed creating PollContext: {}", e);
+                return;
+            }
+        };
+
+        'poll: loop {
+            let events = match poll_ctx.wait() {
+                Ok(v) => v,
+                Err(e) => {
+                    error!("failed polling for events: {}", e);
+                    break;
+                }
+            };
+
+            let mut needs_config_interrupt = false;
+            for event in events.iter_readable() {
+                match event.token() {
+                    Token::FlushTimer => {
+                        if let Err(e) = self.disk_image.fsync() {
+                            error!("Failed to flush the disk: {}", e);
+                            break 'poll;
+                        }
+                        if let Err(e) = flush_timer.wait() {
+                            error!("Failed to clear flush timer: {}", e);
+                            break 'poll;
+                        }
+                    }
+                    Token::QueueAvailable => {
+                        if let Err(e) = queue_evt.read() {
+                            error!("failed reading queue EventFd: {}", e);
+                            break 'poll;
+                        }
+                        self.process_queue(0, &mut flush_timer, &mut flush_timer_armed);
+                    }
+                    Token::ControlRequest => {
+                        let req = match self.control_socket.recv() {
+                            Ok(req) => req,
+                            Err(e) => {
+                                error!("control socket failed recv: {}", e);
+                                break 'poll;
+                            }
+                        };
+
+                        let resp = match req {
+                            DiskControlCommand::Resize { new_size } => {
+                                needs_config_interrupt = true;
+                                self.resize(new_size)
+                            }
+                        };
+
+                        if let Err(e) = self.control_socket.send(&resp) {
+                            error!("control socket failed send: {}", e);
+                            break 'poll;
+                        }
+                    }
+                    Token::InterruptResample => {
+                        self.interrupt.interrupt_resample();
+                    }
+                    Token::Kill => break 'poll,
+                }
+            }
+            if needs_config_interrupt {
+                self.interrupt.signal_config_changed();
+            }
+        }
+    }
+}
+
+/// Virtio device for exposing block level read/write operations on a host file.
+pub struct Block {
+    kill_evt: Option<EventFd>,
+    worker_thread: Option<thread::JoinHandle<Worker>>,
+    disk_image: Option<Box<dyn DiskFile>>,
+    disk_size: Arc<Mutex<u64>>,
+    avail_features: u64,
+    read_only: bool,
+    sparse: bool,
+    seg_max: u32,
+    block_size: u32,
+    control_socket: Option<DiskControlResponseSocket>,
+}
+
+fn build_config_space(disk_size: u64, seg_max: u32, block_size: u32) -> virtio_blk_config {
+    virtio_blk_config {
+        // If the image is not a multiple of the sector size, the tail bits are not exposed.
+        capacity: Le64::from(disk_size >> SECTOR_SHIFT),
+        seg_max: Le32::from(seg_max),
+        blk_size: Le32::from(block_size),
+        max_discard_sectors: Le32::from(MAX_DISCARD_SECTORS),
+        discard_sector_alignment: Le32::from(DISCARD_SECTOR_ALIGNMENT),
+        max_write_zeroes_sectors: Le32::from(MAX_WRITE_ZEROES_SECTORS),
+        write_zeroes_may_unmap: 1,
+        max_discard_seg: Le32::from(MAX_DISCARD_SEG),
+        max_write_zeroes_seg: Le32::from(MAX_WRITE_ZEROES_SEG),
+        ..Default::default()
+    }
+}
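Capacity is reported to the guest in 512-byte sectors, so an image whose size is not sector-aligned silently hides its tail, which is what the read_size test below relies on. A tiny sketch of that calculation, assuming SECTOR_SHIFT is 9:

```rust
// Sketch only: virtio-blk capacity is expressed in 512-byte sectors.
const SECTOR_SHIFT: u64 = 9;

fn capacity_sectors(disk_size: u64) -> u64 {
    disk_size >> SECTOR_SHIFT
}

fn main() {
    assert_eq!(capacity_sectors(0x1000), 8);       // 4096-byte image -> 8 sectors
    assert_eq!(capacity_sectors(0x1000 + 100), 8); // the 100-byte tail is not exposed
}
```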
+
+impl Block {
+    /// Create a new virtio block device that operates on the given DiskFile.
+    pub fn new(
+        disk_image: Box<dyn DiskFile>,
+        read_only: bool,
+        sparse: bool,
+        block_size: u32,
+        control_socket: Option<DiskControlResponseSocket>,
+    ) -> SysResult<Block> {
+        if block_size % SECTOR_SIZE as u32 != 0 {
+            error!(
+                "Block size {} is not a multiple of {}.",
+                block_size, SECTOR_SIZE,
+            );
+            return Err(SysError::new(libc::EINVAL));
+        }
+        let disk_size = disk_image.get_len()?;
+        if disk_size % block_size as u64 != 0 {
+            warn!(
+                "Disk size {} is not a multiple of block size {}; \
+                 the remainder will not be visible to the guest.",
+                disk_size, block_size,
+            );
+        }
+
+        let mut avail_features: u64 = 1 << VIRTIO_BLK_F_FLUSH;
+        if read_only {
+            avail_features |= 1 << VIRTIO_BLK_F_RO;
+        } else {
+            if sparse {
+                avail_features |= 1 << VIRTIO_BLK_F_DISCARD;
+            }
+            avail_features |= 1 << VIRTIO_BLK_F_WRITE_ZEROES;
+        }
+        avail_features |= 1 << VIRTIO_F_VERSION_1;
+        avail_features |= 1 << VIRTIO_BLK_F_SEG_MAX;
+        avail_features |= 1 << VIRTIO_BLK_F_BLK_SIZE;
+
+        let seg_max = min(max(iov_max(), 1), u32::max_value() as usize) as u32;
+
+        // Since we do not currently support indirect descriptors, the maximum
+        // number of segments must be smaller than the queue size.
+        // In addition, the request header and status each consume a descriptor.
+        let seg_max = min(seg_max, u32::from(QUEUE_SIZE) - 2);
+
+        Ok(Block {
+            kill_evt: None,
+            worker_thread: None,
+            disk_image: Some(disk_image),
+            disk_size: Arc::new(Mutex::new(disk_size)),
+            avail_features,
+            read_only,
+            sparse,
+            seg_max,
+            block_size,
+            control_socket,
+        })
+    }
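The avail_features value assembled above is exactly what the read_features tests later assert as hex constants. A short sketch of how those constants decompose, assuming the standard virtio feature bit numbers (the named constants below are restated for illustration, not taken from this diff):

```rust
// Sketch only: standard virtio/virtio-blk feature bit positions.
const VIRTIO_BLK_F_SEG_MAX: u64 = 2;
const VIRTIO_BLK_F_RO: u64 = 5;
const VIRTIO_BLK_F_BLK_SIZE: u64 = 6;
const VIRTIO_BLK_F_FLUSH: u64 = 9;
const VIRTIO_BLK_F_DISCARD: u64 = 13;
const VIRTIO_BLK_F_WRITE_ZEROES: u64 = 14;
const VIRTIO_F_VERSION_1: u64 = 32;

fn main() {
    // Writable, sparse disk: flush + discard + write-zeroes + seg-max + blk-size + version 1.
    let rw_sparse = 1u64 << VIRTIO_BLK_F_FLUSH
        | 1 << VIRTIO_BLK_F_DISCARD
        | 1 << VIRTIO_BLK_F_WRITE_ZEROES
        | 1 << VIRTIO_BLK_F_SEG_MAX
        | 1 << VIRTIO_BLK_F_BLK_SIZE
        | 1 << VIRTIO_F_VERSION_1;
    assert_eq!(rw_sparse, 0x1_0000_6244);

    // Read-only disk: flush + read-only + seg-max + blk-size + version 1.
    let ro = 1u64 << VIRTIO_BLK_F_FLUSH
        | 1 << VIRTIO_BLK_F_RO
        | 1 << VIRTIO_BLK_F_SEG_MAX
        | 1 << VIRTIO_BLK_F_BLK_SIZE
        | 1 << VIRTIO_F_VERSION_1;
    assert_eq!(ro, 0x1_0000_0264);
}
```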
+
+    // Execute a single block device request.
+    // `writer` includes the data region only; the status byte is not included.
+    // It is up to the caller to convert the result of this function into a status byte
+    // and write it to the expected location in guest memory.
+    fn execute_request(
+        reader: &mut Reader,
+        writer: &mut Writer,
+        read_only: bool,
+        sparse: bool,
+        disk: &mut dyn DiskFile,
+        disk_size: u64,
+        flush_timer: &mut TimerFd,
+        flush_timer_armed: &mut bool,
+    ) -> result::Result<(), ExecuteError> {
+        let req_header: virtio_blk_req_header = reader.read_obj().map_err(ExecuteError::Read)?;
+
+        let req_type = req_header.req_type.to_native();
+        let sector = req_header.sector.to_native();
         // Delay after a write when the file is auto-flushed.
         let flush_delay = Duration::from_secs(60);
 
-        if read_only && self.request_type != RequestType::In {
+        if read_only && req_type != VIRTIO_BLK_T_IN {
             return Err(ExecuteError::ReadOnly {
-                request_type: self.request_type,
+                request_type: req_type,
             });
         }
 
@@ -532,60 +587,33 @@
             }
         }
 
-        match self.request_type {
-            RequestType::In => {
-                let offset = self
-                    .sector
+        match req_type {
+            VIRTIO_BLK_T_IN => {
+                let data_len = writer.available_bytes();
+                let offset = sector
                     .checked_shl(u32::from(SECTOR_SHIFT))
                     .ok_or(ExecuteError::OutOfRange)?;
-                check_range(offset, u64::from(self.data_len), disk_size)?;
-                disk.seek(SeekFrom::Start(offset))
-                    .map_err(|e| ExecuteError::Seek {
-                        ioerr: e,
-                        sector: self.sector,
+                check_range(offset, data_len as u64, disk_size)?;
+                writer
+                    .write_all_from_at(disk, data_len, offset)
+                    .map_err(|desc_error| ExecuteError::ReadIo {
+                        length: data_len,
+                        sector,
+                        desc_error,
                     })?;
-                let mem_slice = mem
-                    .get_slice(self.data_addr.0, self.data_len as u64)
-                    .map_err(|volatile_memory_error| ExecuteError::ReadVolatile {
-                        addr: self.data_addr,
-                        length: self.data_len,
-                        sector: self.sector,
-                        volatile_memory_error,
-                    })?;
-                disk.read_exact_volatile(mem_slice)
-                    .map_err(|io_error| ExecuteError::ReadIo {
-                        addr: self.data_addr,
-                        length: self.data_len,
-                        sector: self.sector,
-                        io_error,
-                    })?;
-                return Ok(self.data_len);
             }
-            RequestType::Out => {
-                let offset = self
-                    .sector
+            VIRTIO_BLK_T_OUT => {
+                let data_len = reader.available_bytes();
+                let offset = sector
                     .checked_shl(u32::from(SECTOR_SHIFT))
                     .ok_or(ExecuteError::OutOfRange)?;
-                check_range(offset, u64::from(self.data_len), disk_size)?;
-                disk.seek(SeekFrom::Start(offset))
-                    .map_err(|e| ExecuteError::Seek {
-                        ioerr: e,
-                        sector: self.sector,
-                    })?;
-                let mem_slice = mem
-                    .get_slice(self.data_addr.0, self.data_len as u64)
-                    .map_err(|volatile_memory_error| ExecuteError::WriteVolatile {
-                        addr: self.data_addr,
-                        length: self.data_len,
-                        sector: self.sector,
-                        volatile_memory_error,
-                    })?;
-                disk.write_all_volatile(mem_slice)
-                    .map_err(|io_error| ExecuteError::WriteIo {
-                        addr: self.data_addr,
-                        length: self.data_len,
-                        sector: self.sector,
-                        io_error,
+                check_range(offset, data_len as u64, disk_size)?;
+                reader
+                    .read_exact_to_at(disk, data_len, offset)
+                    .map_err(|desc_error| ExecuteError::WriteIo {
+                        length: data_len,
+                        sector,
+                        desc_error,
                     })?;
                 if !*flush_timer_armed {
                     flush_timer
@@ -594,13 +622,20 @@
                     *flush_timer_armed = true;
                 }
             }
-            RequestType::Discard | RequestType::WriteZeroes => {
-                if let Some(seg) = self.discard_write_zeroes_seg {
+            VIRTIO_BLK_T_DISCARD | VIRTIO_BLK_T_WRITE_ZEROES => {
+                if req_type == VIRTIO_BLK_T_DISCARD && !sparse {
+                    return Err(ExecuteError::Unsupported(req_type));
+                }
+
+                while reader.available_bytes() >= size_of::<virtio_blk_discard_write_zeroes>() {
+                    let seg: virtio_blk_discard_write_zeroes =
+                        reader.read_obj().map_err(ExecuteError::Read)?;
+
                     let sector = seg.sector.to_native();
                     let num_sectors = seg.num_sectors.to_native();
                     let flags = seg.flags.to_native();
 
-                    let valid_flags = if self.request_type == RequestType::WriteZeroes {
+                    let valid_flags = if req_type == VIRTIO_BLK_T_WRITE_ZEROES {
                         VIRTIO_BLK_DISCARD_WRITE_ZEROES_FLAG_UNMAP
                     } else {
                         0
@@ -623,323 +658,51 @@
                         .ok_or(ExecuteError::OutOfRange)?;
                     check_range(offset, length, disk_size)?;
 
-                    if self.request_type == RequestType::Discard {
+                    if req_type == VIRTIO_BLK_T_DISCARD {
                         // Since Discard is just a hint and some filesystems may not implement
                         // FALLOC_FL_PUNCH_HOLE, ignore punch_hole errors.
                         let _ = disk.punch_hole(offset, length);
                     } else {
-                        disk.seek(SeekFrom::Start(offset))
-                            .map_err(|e| ExecuteError::Seek { ioerr: e, sector })?;
-                        disk.write_zeroes(length as usize).map_err(|e| {
-                            ExecuteError::DiscardWriteZeroes {
+                        disk.write_zeroes_all_at(offset, length as usize)
+                            .map_err(|e| ExecuteError::DiscardWriteZeroes {
                                 ioerr: Some(e),
                                 sector,
                                 num_sectors,
                                 flags,
-                            }
-                        })?;
+                            })?;
                     }
                 }
             }
-            RequestType::Flush => {
+            VIRTIO_BLK_T_FLUSH => {
                 disk.fsync().map_err(ExecuteError::Flush)?;
                 flush_timer.clear().map_err(ExecuteError::TimerFd)?;
                 *flush_timer_armed = false;
             }
-            RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
+            t => return Err(ExecuteError::Unsupported(t)),
         };
-        Ok(0)
+        Ok(())
     }
 }
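Both the IN and OUT paths above convert the request's sector number into a byte offset and bounds-check the transfer against the disk size before doing any I/O. A minimal sketch of that arithmetic, assuming 512-byte sectors; it mirrors the checked_shl / check_range pair rather than reproducing the real helpers:

```rust
// Sketch only: sector -> byte offset conversion plus a range check.
// `None` stands in for ExecuteError::OutOfRange.
const SECTOR_SHIFT: u32 = 9;

fn io_offset(sector: u64, len: u64, disk_size: u64) -> Option<u64> {
    let offset = sector.checked_shl(SECTOR_SHIFT)?;
    let end = offset.checked_add(len)?;
    if end <= disk_size {
        Some(offset)
    } else {
        None
    }
}

fn main() {
    let disk_size = 0x1000; // 8 sectors, as in the tests below
    assert_eq!(io_offset(7, 512, disk_size), Some(7 * 512)); // last valid sector
    assert_eq!(io_offset(7, 1024, disk_size), None);         // runs past the end of the disk
}
```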
 
-struct Worker<T: DiskFile> {
-    queues: Vec<Queue>,
-    mem: GuestMemory,
-    disk_image: T,
-    disk_size: Arc<Mutex<u64>>,
-    read_only: bool,
-    interrupt_status: Arc<AtomicUsize>,
-    interrupt_evt: EventFd,
-    interrupt_resample_evt: EventFd,
-}
-
-impl<T: DiskFile> Worker<T> {
-    fn process_queue(
-        &mut self,
-        queue_index: usize,
-        flush_timer: &mut TimerFd,
-        flush_timer_armed: &mut bool,
-    ) -> bool {
-        let queue = &mut self.queues[queue_index];
-
-        let disk_size = self.disk_size.lock();
-
-        let mut needs_interrupt = false;
-        while let Some(avail_desc) = queue.pop(&self.mem) {
-            let len;
-            match Request::parse(&avail_desc, &self.mem) {
-                Ok(request) => {
-                    let status = match request.execute(
-                        self.read_only,
-                        &mut self.disk_image,
-                        *disk_size,
-                        flush_timer,
-                        flush_timer_armed,
-                        &self.mem,
-                    ) {
-                        Ok(l) => {
-                            len = l;
-                            VIRTIO_BLK_S_OK
-                        }
-                        Err(e) => {
-                            error!("failed executing disk request: {}", e);
-                            len = 1; // 1 byte for the status
-                            e.status()
-                        }
-                    };
-                    // We use unwrap because the request parsing process already checked that the
-                    // status_addr was valid.
-                    self.mem
-                        .write_obj_at_addr(status, request.status_addr)
-                        .unwrap();
-                }
-                Err(e) => {
-                    error!("failed processing available descriptor chain: {}", e);
-                    len = 0;
-                }
-            }
-
-            queue.add_used(&self.mem, avail_desc.index, len);
-            needs_interrupt = true;
-        }
-
-        needs_interrupt
-    }
-
-    fn resize(&mut self, new_size: u64) -> DiskControlResult {
-        if self.read_only {
-            error!("Attempted to resize read-only block device");
-            return DiskControlResult::Err(SysError::new(libc::EROFS));
-        }
-
-        info!("Resizing block device to {} bytes", new_size);
-
-        if let Err(e) = self.disk_image.set_len(new_size) {
-            error!("Resizing disk failed! {}", e);
-            return DiskControlResult::Err(SysError::new(libc::EIO));
-        }
-
-        if let Ok(new_disk_size) = self.disk_image.seek(SeekFrom::End(0)) {
-            let mut disk_size = self.disk_size.lock();
-            *disk_size = new_disk_size;
-        }
-        DiskControlResult::Ok
-    }
-
-    fn signal_used_queue(&self) {
-        self.interrupt_status
-            .fetch_or(INTERRUPT_STATUS_USED_RING as usize, Ordering::SeqCst);
-        self.interrupt_evt.write(1).unwrap();
-    }
-
-    fn signal_config_changed(&self) {
-        self.interrupt_status
-            .fetch_or(INTERRUPT_STATUS_CONFIG_CHANGED as usize, Ordering::SeqCst);
-        self.interrupt_evt.write(1).unwrap();
-    }
-
-    fn run(
-        &mut self,
-        queue_evt: EventFd,
-        kill_evt: EventFd,
-        control_socket: DiskControlResponseSocket,
-    ) {
-        #[derive(PollToken)]
-        enum Token {
-            FlushTimer,
-            QueueAvailable,
-            ControlRequest,
-            InterruptResample,
-            Kill,
-        }
-
-        let mut flush_timer = match TimerFd::new() {
-            Ok(t) => t,
-            Err(e) => {
-                error!("Failed to create the flush timer: {}", e);
-                return;
-            }
-        };
-        let mut flush_timer_armed = false;
-
-        let poll_ctx: PollContext<Token> = match PollContext::new()
-            .and_then(|pc| pc.add(&flush_timer, Token::FlushTimer).and(Ok(pc)))
-            .and_then(|pc| pc.add(&queue_evt, Token::QueueAvailable).and(Ok(pc)))
-            .and_then(|pc| pc.add(&control_socket, Token::ControlRequest).and(Ok(pc)))
-            .and_then(|pc| {
-                pc.add(&self.interrupt_resample_evt, Token::InterruptResample)
-                    .and(Ok(pc))
-            })
-            .and_then(|pc| pc.add(&kill_evt, Token::Kill).and(Ok(pc)))
-        {
-            Ok(pc) => pc,
-            Err(e) => {
-                error!("failed creating PollContext: {}", e);
-                return;
-            }
-        };
-
-        'poll: loop {
-            let events = match poll_ctx.wait() {
-                Ok(v) => v,
-                Err(e) => {
-                    error!("failed polling for events: {}", e);
-                    break;
-                }
-            };
-
-            let mut needs_interrupt = false;
-            let mut needs_config_interrupt = false;
-            for event in events.iter_readable() {
-                match event.token() {
-                    Token::FlushTimer => {
-                        if let Err(e) = self.disk_image.fsync() {
-                            error!("Failed to flush the disk: {}", e);
-                            break 'poll;
-                        }
-                        if let Err(e) = flush_timer.wait() {
-                            error!("Failed to clear flush timer: {}", e);
-                            break 'poll;
-                        }
-                    }
-                    Token::QueueAvailable => {
-                        if let Err(e) = queue_evt.read() {
-                            error!("failed reading queue EventFd: {}", e);
-                            break 'poll;
-                        }
-                        needs_interrupt |=
-                            self.process_queue(0, &mut flush_timer, &mut flush_timer_armed);
-                    }
-                    Token::ControlRequest => {
-                        let req = match control_socket.recv() {
-                            Ok(req) => req,
-                            Err(e) => {
-                                error!("control socket failed recv: {}", e);
-                                break 'poll;
-                            }
-                        };
-
-                        let resp = match req {
-                            DiskControlCommand::Resize { new_size } => {
-                                needs_config_interrupt = true;
-                                self.resize(new_size)
-                            }
-                        };
-
-                        if let Err(e) = control_socket.send(&resp) {
-                            error!("control socket failed send: {}", e);
-                            break 'poll;
-                        }
-                    }
-                    Token::InterruptResample => {
-                        let _ = self.interrupt_resample_evt.read();
-                        if self.interrupt_status.load(Ordering::SeqCst) != 0 {
-                            self.interrupt_evt.write(1).unwrap();
-                        }
-                    }
-                    Token::Kill => break 'poll,
-                }
-            }
-            if needs_interrupt {
-                self.signal_used_queue();
-            }
-            if needs_config_interrupt {
-                self.signal_config_changed();
-            }
-        }
-    }
-}
-
-/// Virtio device for exposing block level read/write operations on a host file.
-pub struct Block<T: DiskFile> {
-    kill_evt: Option<EventFd>,
-    disk_image: Option<T>,
-    disk_size: Arc<Mutex<u64>>,
-    avail_features: u64,
-    read_only: bool,
-    control_socket: Option<DiskControlResponseSocket>,
-}
-
-fn build_config_space(disk_size: u64) -> virtio_blk_config {
-    virtio_blk_config {
-        // If the image is not a multiple of the sector size, the tail bits are not exposed.
-        capacity: Le64::from(disk_size >> SECTOR_SHIFT),
-        blk_size: Le32::from(SECTOR_SIZE as u32),
-        max_discard_sectors: Le32::from(MAX_DISCARD_SECTORS),
-        discard_sector_alignment: Le32::from(DISCARD_SECTOR_ALIGNMENT),
-        max_write_zeroes_sectors: Le32::from(MAX_WRITE_ZEROES_SECTORS),
-        write_zeroes_may_unmap: 1,
-        // Limit number of segments to 1 - see parse_discard_write_zeroes()
-        max_discard_seg: Le32::from(1),
-        max_write_zeroes_seg: Le32::from(1),
-        ..Default::default()
-    }
-}
-
-impl<T: DiskFile> Block<T> {
-    /// Create a new virtio block device that operates on the given file.
-    ///
-    /// The given file must be seekable and sizable.
-    pub fn new(
-        mut disk_image: T,
-        read_only: bool,
-        control_socket: Option<DiskControlResponseSocket>,
-    ) -> SysResult<Block<T>> {
-        let disk_size = disk_image.seek(SeekFrom::End(0))? as u64;
-        if disk_size % SECTOR_SIZE != 0 {
-            warn!(
-                "Disk size {} is not a multiple of sector size {}; \
-                 the remainder will not be visible to the guest.",
-                disk_size, SECTOR_SIZE
-            );
-        }
-
-        let mut avail_features: u64 = 1 << VIRTIO_BLK_F_FLUSH;
-        if read_only {
-            avail_features |= 1 << VIRTIO_BLK_F_RO;
-        } else {
-            avail_features |= 1 << VIRTIO_BLK_F_DISCARD;
-            avail_features |= 1 << VIRTIO_BLK_F_WRITE_ZEROES;
-        }
-        avail_features |= 1 << VIRTIO_F_VERSION_1;
-        avail_features |= 1 << VIRTIO_BLK_F_BLK_SIZE;
-
-        Ok(Block {
-            kill_evt: None,
-            disk_image: Some(disk_image),
-            disk_size: Arc::new(Mutex::new(disk_size)),
-            avail_features,
-            read_only,
-            control_socket,
-        })
-    }
-}
-
-impl<T: DiskFile> Drop for Block<T> {
+impl Drop for Block {
     fn drop(&mut self) {
         if let Some(kill_evt) = self.kill_evt.take() {
             // Ignore the result because there is nothing we can do about it.
             let _ = kill_evt.write(1);
         }
+
+        if let Some(worker_thread) = self.worker_thread.take() {
+            let _ = worker_thread.join();
+        }
     }
 }
 
-impl<T: 'static + AsRawFd + DiskFile + Send> VirtioDevice for Block<T> {
+impl VirtioDevice for Block {
     fn keep_fds(&self) -> Vec<RawFd> {
         let mut keep_fds = Vec::new();
 
         if let Some(disk_image) = &self.disk_image {
-            keep_fds.push(disk_image.as_raw_fd());
+            keep_fds.extend(disk_image.as_raw_fds());
         }
 
         if let Some(control_socket) = &self.control_socket {
@@ -961,31 +724,18 @@
         QUEUE_SIZES
     }
 
-    fn read_config(&self, offset: u64, mut data: &mut [u8]) {
+    fn read_config(&self, offset: u64, data: &mut [u8]) {
         let config_space = {
             let disk_size = self.disk_size.lock();
-            build_config_space(*disk_size)
+            build_config_space(*disk_size, self.seg_max, self.block_size)
         };
-        let config_len = size_of_val(&config_space) as u64;
-        if offset >= config_len {
-            return;
-        }
-
-        if let Some(end) = offset.checked_add(data.len() as u64) {
-            let offset = offset as usize;
-            let end = cmp::min(end, config_len) as usize;
-            // This write can't fail, offset and end are checked against config_len.
-            data.write_all(&config_space.as_slice()[offset..end])
-                .unwrap();
-        }
+        copy_config(data, 0, config_space.as_slice(), offset);
     }
 
     fn activate(
         &mut self,
         mem: GuestMemory,
-        interrupt_evt: EventFd,
-        interrupt_resample_evt: EventFd,
-        status: Arc<AtomicUsize>,
+        interrupt: Interrupt,
         queues: Vec<Queue>,
         mut queue_evts: Vec<EventFd>,
     ) {
@@ -1003,6 +753,7 @@
         self.kill_evt = Some(self_kill_evt);
 
         let read_only = self.read_only;
+        let sparse = self.sparse;
         let disk_size = self.disk_size.clone();
         if let Some(disk_image) = self.disk_image.take() {
             if let Some(control_socket) = self.control_socket.take() {
@@ -1011,44 +762,77 @@
                         .name("virtio_blk".to_string())
                         .spawn(move || {
                             let mut worker = Worker {
+                                interrupt,
                                 queues,
                                 mem,
                                 disk_image,
                                 disk_size,
                                 read_only,
-                                interrupt_status: status,
-                                interrupt_evt,
-                                interrupt_resample_evt,
+                                sparse,
+                                control_socket,
                             };
-                            worker.run(queue_evts.remove(0), kill_evt, control_socket);
+                            worker.run(queue_evts.remove(0), kill_evt);
+                            worker
                         });
 
-                if let Err(e) = worker_result {
-                    error!("failed to spawn virtio_blk worker: {}", e);
-                    return;
+                match worker_result {
+                    Err(e) => {
+                        error!("failed to spawn virtio_blk worker: {}", e);
+                        return;
+                    }
+                    Ok(join_handle) => {
+                        self.worker_thread = Some(join_handle);
+                    }
                 }
             }
         }
     }
+
+    fn reset(&mut self) -> bool {
+        if let Some(kill_evt) = self.kill_evt.take() {
+            if kill_evt.write(1).is_err() {
+                error!("{}: failed to notify the kill event", self.debug_label());
+                return false;
+            }
+        }
+
+        if let Some(worker_thread) = self.worker_thread.take() {
+            match worker_thread.join() {
+                Err(_) => {
+                    error!("{}: failed to get back resources", self.debug_label());
+                    return false;
+                }
+                Ok(worker) => {
+                    self.disk_image = Some(worker.disk_image);
+                    self.control_socket = Some(worker.control_socket);
+                    return true;
+                }
+            }
+        }
+        false
+    }
 }
 
 #[cfg(test)]
 mod tests {
     use std::fs::{File, OpenOptions};
-    use std::path::PathBuf;
-    use sys_util::TempDir;
+    use std::mem::size_of_val;
+    use sys_util::GuestAddress;
+    use tempfile::TempDir;
+
+    use crate::virtio::descriptor_utils::{create_descriptor_chain, DescriptorType};
 
     use super::*;
 
     #[test]
     fn read_size() {
-        let tempdir = TempDir::new("/tmp/block_read_test").unwrap();
-        let mut path = PathBuf::from(tempdir.as_path().unwrap());
+        let tempdir = TempDir::new().unwrap();
+        let mut path = tempdir.path().to_owned();
         path.push("disk_image");
         let f = File::create(&path).unwrap();
         f.set_len(0x1000).unwrap();
 
-        let b = Block::new(f, true, None).unwrap();
+        let b = Block::new(Box::new(f), true, false, 512, None).unwrap();
         let mut num_sectors = [0u8; 4];
         b.read_config(0, &mut num_sectors);
         // size is 0x1000, so num_sectors is 8 (4096/512).
@@ -1060,34 +844,60 @@
     }
 
     #[test]
+    fn read_block_size() {
+        let tempdir = TempDir::new().unwrap();
+        let mut path = tempdir.path().to_owned();
+        path.push("disk_image");
+        let f = File::create(&path).unwrap();
+        f.set_len(0x1000).unwrap();
+
+        let b = Block::new(Box::new(f), true, false, 4096, None).unwrap();
+        let mut blk_size = [0u8; 4];
+        b.read_config(20, &mut blk_size);
+        // blk_size should be 4096 (0x1000).
+        assert_eq!([0x00, 0x10, 0x00, 0x00], blk_size);
+    }
+
+    #[test]
     fn read_features() {
-        let tempdir = TempDir::new("/tmp/block_read_test").unwrap();
-        let mut path = PathBuf::from(tempdir.as_path().unwrap());
+        let tempdir = TempDir::new().unwrap();
+        let mut path = tempdir.path().to_owned();
         path.push("disk_image");
 
         // read-write block device
         {
             let f = File::create(&path).unwrap();
-            let b = Block::new(f, false, None).unwrap();
+            let b = Block::new(Box::new(f), false, true, 512, None).unwrap();
             // writable device should set VIRTIO_BLK_F_FLUSH + VIRTIO_BLK_F_DISCARD
             // + VIRTIO_BLK_F_WRITE_ZEROES + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE
-            assert_eq!(0x100006240, b.features());
+            // + VIRTIO_BLK_F_SEG_MAX
+            assert_eq!(0x100006244, b.features());
+        }
+
+        // read-write block device, non-sparse
+        {
+            let f = File::create(&path).unwrap();
+            let b = Block::new(Box::new(f), false, false, 512, None).unwrap();
+            // writable device should set VIRTIO_BLK_F_FLUSH
+            // + VIRTIO_BLK_F_WRITE_ZEROES + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE
+            // + VIRTIO_BLK_F_SEG_MAX
+            assert_eq!(0x100004244, b.features());
         }
 
         // read-only block device
         {
             let f = File::create(&path).unwrap();
-            let b = Block::new(f, true, None).unwrap();
+            let b = Block::new(Box::new(f), true, true, 512, None).unwrap();
             // read-only device should set VIRTIO_BLK_F_FLUSH and VIRTIO_BLK_F_RO
-            // + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE
-            assert_eq!(0x100000260, b.features());
+            // + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE + VIRTIO_BLK_F_SEG_MAX
+            assert_eq!(0x100000264, b.features());
         }
     }
 
     #[test]
     fn read_last_sector() {
-        let tempdir = TempDir::new("/tmp/block_read_test").unwrap();
-        let mut path = PathBuf::from(tempdir.as_path().unwrap());
+        let tempdir = TempDir::new().unwrap();
+        let mut path = tempdir.path().to_owned();
         path.push("disk_image");
         let mut f = OpenOptions::new()
             .read(true)
@@ -1101,69 +911,108 @@
         let mem = GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
             .expect("Creating guest memory failed.");
 
-        let req = Request {
-            request_type: RequestType::In,
-            sector: 7, // Disk is 8 sectors long, so this is the last valid sector.
-            data_addr: GuestAddress(0x1000),
-            data_len: 512, // Read 1 sector of data.
-            status_addr: GuestAddress(0),
-            discard_write_zeroes_seg: None,
+        let req_hdr = virtio_blk_req_header {
+            req_type: Le32::from(VIRTIO_BLK_T_IN),
+            reserved: Le32::from(0),
+            sector: Le64::from(7), // Disk is 8 sectors long, so this is the last valid sector.
         };
+        mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
+            .expect("writing req failed");
+
+        let avail_desc = create_descriptor_chain(
+            &mem,
+            GuestAddress(0x100),  // Place descriptor chain at 0x100.
+            GuestAddress(0x1000), // Describe buffer at 0x1000.
+            vec![
+                // Request header
+                (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
+                // I/O buffer (1 sector of data)
+                (DescriptorType::Writable, 512),
+                // Request status
+                (DescriptorType::Writable, 1),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
 
         let mut flush_timer = TimerFd::new().expect("failed to create flush_timer");
         let mut flush_timer_armed = false;
 
-        assert_eq!(
-            512,
-            req.execute(
-                false,
-                &mut f,
-                disk_size,
-                &mut flush_timer,
-                &mut flush_timer_armed,
-                &mem
-            )
-            .expect("execute failed"),
-        );
-    }
-
-    #[test]
-    fn read_beyond_last_sector() {
-        let tempdir = TempDir::new("/tmp/block_read_test").unwrap();
-        let mut path = PathBuf::from(tempdir.as_path().unwrap());
-        path.push("disk_image");
-        let mut f = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .open(&path)
-            .unwrap();
-        let disk_size = 0x1000;
-        f.set_len(disk_size).unwrap();
-
-        let mem = GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
-            .expect("Creating guest memory failed.");
-
-        let req = Request {
-            request_type: RequestType::In,
-            sector: 7, // Disk is 8 sectors long, so this is the last valid sector.
-            data_addr: GuestAddress(0x1000),
-            data_len: 512 * 2, // Read 2 sectors of data (overlap the end of the disk).
-            status_addr: GuestAddress(0),
-            discard_write_zeroes_seg: None,
-        };
-
-        let mut flush_timer = TimerFd::new().expect("failed to create flush_timer");
-        let mut flush_timer_armed = false;
-
-        req.execute(
+        Worker::process_one_request(
+            avail_desc,
             false,
+            true,
             &mut f,
             disk_size,
             &mut flush_timer,
             &mut flush_timer_armed,
             &mem,
         )
-        .expect_err("execute was supposed to fail");
+        .expect("execute failed");
+
+        let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 512) as u64);
+        let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
+        assert_eq!(status, VIRTIO_BLK_S_OK);
+    }
+
+    #[test]
+    fn read_beyond_last_sector() {
+        let tempdir = TempDir::new().unwrap();
+        let mut path = tempdir.path().to_owned();
+        path.push("disk_image");
+        let mut f = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .open(&path)
+            .unwrap();
+        let disk_size = 0x1000;
+        f.set_len(disk_size).unwrap();
+
+        let mem = GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
+            .expect("Creating guest memory failed.");
+
+        let req_hdr = virtio_blk_req_header {
+            req_type: Le32::from(VIRTIO_BLK_T_IN),
+            reserved: Le32::from(0),
+            sector: Le64::from(7), // Disk is 8 sectors long, so this is the last valid sector.
+        };
+        mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
+            .expect("writing req failed");
+
+        let avail_desc = create_descriptor_chain(
+            &mem,
+            GuestAddress(0x100),  // Place descriptor chain at 0x100.
+            GuestAddress(0x1000), // Describe buffer at 0x1000.
+            vec![
+                // Request header
+                (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
+                // I/O buffer (2 sectors of data - overlap the end of the disk).
+                (DescriptorType::Writable, 512 * 2),
+                // Request status
+                (DescriptorType::Writable, 1),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+
+        let mut flush_timer = TimerFd::new().expect("failed to create flush_timer");
+        let mut flush_timer_armed = false;
+
+        Worker::process_one_request(
+            avail_desc,
+            false,
+            true,
+            &mut f,
+            disk_size,
+            &mut flush_timer,
+            &mut flush_timer_armed,
+            &mem,
+        )
+        .expect("execute failed");
+
+        let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 512 * 2) as u64);
+        let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
+        assert_eq!(status, VIRTIO_BLK_S_IOERR);
     }
 }
diff --git a/devices/src/virtio/descriptor_utils.rs b/devices/src/virtio/descriptor_utils.rs
new file mode 100644
index 0000000..2e5dfd3
--- /dev/null
+++ b/devices/src/virtio/descriptor_utils.rs
@@ -0,0 +1,1227 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::cmp;
+use std::collections::VecDeque;
+use std::fmt::{self, Display};
+use std::io::{self, Read, Write};
+use std::iter::FromIterator;
+use std::marker::PhantomData;
+use std::mem::{size_of, MaybeUninit};
+use std::ptr::copy_nonoverlapping;
+use std::result;
+
+use data_model::{DataInit, Le16, Le32, Le64, VolatileMemory, VolatileMemoryError, VolatileSlice};
+use sys_util::{
+    FileReadWriteAtVolatile, FileReadWriteVolatile, GuestAddress, GuestMemory, IntoIovec,
+};
+
+use super::DescriptorChain;
+
+#[derive(Debug)]
+pub enum Error {
+    DescriptorChainOverflow,
+    GuestMemoryError(sys_util::GuestMemoryError),
+    InvalidChain,
+    IoError(io::Error),
+    SplitOutOfBounds(usize),
+    VolatileMemoryError(VolatileMemoryError),
+}
+
+impl Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+
+        match self {
+            DescriptorChainOverflow => write!(
+                f,
+                "the combined length of all the buffers in a `DescriptorChain` would overflow"
+            ),
+            GuestMemoryError(e) => write!(f, "descriptor guest memory error: {}", e),
+            InvalidChain => write!(f, "invalid descriptor chain"),
+            IoError(e) => write!(f, "descriptor I/O error: {}", e),
+            SplitOutOfBounds(off) => write!(f, "`DescriptorChain` split is out of bounds: {}", off),
+            VolatileMemoryError(e) => write!(f, "volatile memory error: {}", e),
+        }
+    }
+}
+
+pub type Result<T> = result::Result<T, Error>;
+
+impl std::error::Error for Error {}
+
+#[derive(Clone)]
+struct DescriptorChainConsumer<'a> {
+    buffers: VecDeque<VolatileSlice<'a>>,
+    bytes_consumed: usize,
+}
+
+impl<'a> DescriptorChainConsumer<'a> {
+    fn available_bytes(&self) -> usize {
+        // This is guaranteed not to overflow because the total length of the chain
+        // is checked during all creations of `DescriptorChainConsumer` (see
+        // `Reader::new()` and `Writer::new()`).
+        self.buffers
+            .iter()
+            .fold(0usize, |count, vs| count + vs.size() as usize)
+    }
+
+    fn bytes_consumed(&self) -> usize {
+        self.bytes_consumed
+    }
+
+    /// Consumes at most `count` bytes from the `DescriptorChain`. Callers must provide a function
+    /// that takes a `&[VolatileSlice]` and returns the total number of bytes consumed. This
+    /// function guarantees that the combined length of all the slices in the `&[VolatileSlice]` is
+    /// less than or equal to `count`.
+    ///
+    /// # Errors
+    ///
+    /// If the provided function returns any error then no bytes are consumed from the buffer and
+    /// the error is returned to the caller.
+    fn consume<F>(&mut self, count: usize, f: F) -> io::Result<usize>
+    where
+        F: FnOnce(&[VolatileSlice]) -> io::Result<usize>,
+    {
+        let mut buflen = 0;
+        let mut bufs = Vec::with_capacity(self.buffers.len());
+        for &vs in &self.buffers {
+            if buflen >= count {
+                break;
+            }
+
+            let rem = count - buflen;
+            if (rem as u64) < vs.size() {
+                let buf = vs.sub_slice(0, rem as u64).map_err(|e| {
+                    io::Error::new(io::ErrorKind::InvalidData, Error::VolatileMemoryError(e))
+                })?;
+                bufs.push(buf);
+                buflen += rem;
+            } else {
+                bufs.push(vs);
+                buflen += vs.size() as usize;
+            }
+        }
+
+        if bufs.is_empty() {
+            return Ok(0);
+        }
+
+        let bytes_consumed = f(&*bufs)?;
+
+        // This can happen if a driver tricks a device into reading/writing more data than
+        // fits in a `usize`.
+        let total_bytes_consumed =
+            self.bytes_consumed
+                .checked_add(bytes_consumed)
+                .ok_or_else(|| {
+                    io::Error::new(io::ErrorKind::InvalidData, Error::DescriptorChainOverflow)
+                })?;
+
+        let mut rem = bytes_consumed;
+        while let Some(vs) = self.buffers.pop_front() {
+            if (rem as u64) < vs.size() {
+                // Split the slice and push the remainder back into the buffer list. Safe because we
+                // know that `rem` is not out of bounds due to the check and we checked the bounds
+                // on `vs` when we added it to the buffer list.
+                self.buffers.push_front(vs.offset(rem as u64).unwrap());
+                break;
+            }
+
+            // No need for checked math because we know that `vs.size() <= rem`.
+            rem -= vs.size() as usize;
+        }
+
+        self.bytes_consumed = total_bytes_consumed;
+
+        Ok(bytes_consumed)
+    }
+
+    fn split_at(&mut self, offset: usize) -> Result<DescriptorChainConsumer<'a>> {
+        let mut rem = offset;
+        let pos = self.buffers.iter().position(|vs| {
+            if (rem as u64) < vs.size() {
+                true
+            } else {
+                rem -= vs.size() as usize;
+                false
+            }
+        });
+
+        if let Some(at) = pos {
+            let mut other = self.buffers.split_off(at);
+
+            if rem > 0 {
+                // There must be at least one element in `other` because we checked
+                // its `size` value in the call to `position` above.
+                let front = other.pop_front().expect("empty VecDeque after split");
+                self.buffers.push_back(
+                    front
+                        .sub_slice(0, rem as u64)
+                        .map_err(Error::VolatileMemoryError)?,
+                );
+                other.push_front(
+                    front
+                        .offset(rem as u64)
+                        .map_err(Error::VolatileMemoryError)?,
+                );
+            }
+
+            Ok(DescriptorChainConsumer {
+                buffers: other,
+                bytes_consumed: 0,
+            })
+        } else if rem == 0 {
+            Ok(DescriptorChainConsumer {
+                buffers: VecDeque::new(),
+                bytes_consumed: 0,
+            })
+        } else {
+            Err(Error::SplitOutOfBounds(offset))
+        }
+    }
+
+    fn get_iovec(&mut self, len: usize) -> io::Result<DescriptorIovec<'a>> {
+        let mut iovec = Vec::new();
+
+        self.consume(len, |bufs| {
+            let mut total = 0;
+            for vs in bufs {
+                iovec.push(libc::iovec {
+                    iov_base: vs.as_ptr() as *mut libc::c_void,
+                    iov_len: vs.size() as usize,
+                });
+                total += vs.size() as usize;
+            }
+            Ok(total)
+        })?;
+
+        Ok(DescriptorIovec {
+            iovec,
+            mem: PhantomData,
+        })
+    }
+}
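The consumer above never copies data; it hands the caller at most `count` bytes worth of slices and then permanently drops whatever the caller reports as consumed, splitting any partially used buffer at the front of the queue. A simplified sketch of that bookkeeping over plain byte buffers instead of VolatileSlice:

```rust
// Sketch only: a DescriptorChainConsumer-like helper over Vec<u8> buffers.
use std::collections::VecDeque;

struct Consumer {
    buffers: VecDeque<Vec<u8>>,
    bytes_consumed: usize,
}

impl Consumer {
    // Hand the callback at most `count` bytes of slices, then drop exactly the
    // number of bytes it reports as consumed from the front of the queue.
    fn consume<F>(&mut self, count: usize, f: F) -> usize
    where
        F: FnOnce(&[&[u8]]) -> usize,
    {
        let mut picked = Vec::new();
        let mut len = 0;
        for buf in &self.buffers {
            if len >= count {
                break;
            }
            let take = (count - len).min(buf.len());
            picked.push(&buf[..take]);
            len += take;
        }

        let used = f(&picked);

        // Pop fully consumed buffers; split the first partially consumed one.
        let mut rem = used;
        while rem > 0 {
            let mut front = self.buffers.pop_front().expect("callback over-consumed");
            if rem < front.len() {
                self.buffers.push_front(front.split_off(rem));
                break;
            }
            rem -= front.len();
        }
        self.bytes_consumed += used;
        used
    }
}

fn main() {
    let mut c = Consumer {
        buffers: VecDeque::from(vec![vec![1, 2, 3], vec![4, 5]]),
        bytes_consumed: 0,
    };
    // Consume 4 of the 5 available bytes; only [5] should remain queued.
    let n = c.consume(4, |bufs| bufs.iter().map(|b| b.len()).sum());
    assert_eq!(n, 4);
    assert_eq!(c.bytes_consumed, 4);
    assert_eq!(c.buffers.front().map(|b| b.as_slice()), Some(&[5u8][..]));
}
```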
+
+/// Provides a high-level interface over the sequence of memory regions
+/// defined by the readable descriptors in a descriptor chain.
+///
+/// Note that the virtio spec requires the driver to place all device-writable
+/// descriptors after the device-readable descriptors (section 2.6.4.2 of Virtio Spec v1.1).
+/// `Reader` therefore stops iterating over the descriptor chain as soon as the first
+/// writable descriptor is encountered.
+#[derive(Clone)]
+pub struct Reader<'a> {
+    buffer: DescriptorChainConsumer<'a>,
+}
+
+// An iterator over `DataInit` objects on readable descriptors in the descriptor chain.
+struct ReaderIterator<'a, T: DataInit> {
+    reader: &'a mut Reader<'a>,
+    phantom: PhantomData<T>,
+}
+
+impl<'a, T: DataInit> Iterator for ReaderIterator<'a, T> {
+    type Item = io::Result<T>;
+
+    fn next(&mut self) -> Option<io::Result<T>> {
+        if self.reader.available_bytes() == 0 {
+            None
+        } else {
+            Some(self.reader.read_obj())
+        }
+    }
+}
+
+impl<'a> Reader<'a> {
+    /// Construct a new Reader wrapper over `desc_chain`.
+    pub fn new(mem: &'a GuestMemory, desc_chain: DescriptorChain<'a>) -> Result<Reader<'a>> {
+        // TODO(jstaron): Update this code to take the indirect descriptors into account.
+        let mut total_len: usize = 0;
+        let buffers = desc_chain
+            .into_iter()
+            .readable()
+            .map(|desc| {
+                // Verify that summing the descriptor sizes does not overflow.
+                // This can happen if a driver tricks a device into reading more data than
+                // fits in a `usize`.
+                total_len = total_len
+                    .checked_add(desc.len as usize)
+                    .ok_or(Error::DescriptorChainOverflow)?;
+
+                mem.get_slice(desc.addr.offset(), desc.len.into())
+                    .map_err(Error::VolatileMemoryError)
+            })
+            .collect::<Result<VecDeque<VolatileSlice<'a>>>>()?;
+        Ok(Reader {
+            buffer: DescriptorChainConsumer {
+                buffers,
+                bytes_consumed: 0,
+            },
+        })
+    }
+
+    /// Reads an object from the descriptor chain buffer.
+    pub fn read_obj<T: DataInit>(&mut self) -> io::Result<T> {
+        let mut obj = MaybeUninit::<T>::uninit();
+
+        // Safe because `MaybeUninit` guarantees that the pointer is valid for
+        // `size_of::<T>()` bytes.
+        let buf = unsafe {
+            ::std::slice::from_raw_parts_mut(obj.as_mut_ptr() as *mut u8, size_of::<T>())
+        };
+
+        self.read_exact(buf)?;
+
+        // Safe because any type that implements `DataInit` can be considered initialized
+        // even if it is filled with random data.
+        Ok(unsafe { obj.assume_init() })
+    }
+
+    /// Reads objects by consuming all the remaining data in the descriptor chain buffer and returns
+    /// them as a collection. Returns an error if the size of the remaining data is not divisible by
+    /// the size of an object of type `T`.
+    pub fn collect<C: FromIterator<io::Result<T>>, T: DataInit>(&'a mut self) -> C {
+        C::from_iter(ReaderIterator {
+            reader: self,
+            phantom: PhantomData,
+        })
+    }
+
+    /// Reads data from the descriptor chain buffer into a file descriptor.
+    /// Returns the number of bytes read from the descriptor chain buffer.
+    /// The number of bytes read can be less than `count` if there isn't
+    /// enough data in the descriptor chain buffer.
+    pub fn read_to<F: FileReadWriteVolatile>(
+        &mut self,
+        mut dst: F,
+        count: usize,
+    ) -> io::Result<usize> {
+        self.buffer
+            .consume(count, |bufs| dst.write_vectored_volatile(bufs))
+    }
+
+    /// Reads data from the descriptor chain buffer into a File at offset `off`.
+    /// Returns the number of bytes read from the descriptor chain buffer.
+    /// The number of bytes read can be less than `count` if there isn't
+    /// enough data in the descriptor chain buffer.
+    pub fn read_to_at<F: FileReadWriteAtVolatile>(
+        &mut self,
+        mut dst: F,
+        count: usize,
+        off: u64,
+    ) -> io::Result<usize> {
+        self.buffer
+            .consume(count, |bufs| dst.write_vectored_at_volatile(bufs, off))
+    }
+
+    pub fn read_exact_to<F: FileReadWriteVolatile>(
+        &mut self,
+        mut dst: F,
+        mut count: usize,
+    ) -> io::Result<()> {
+        while count > 0 {
+            match self.read_to(&mut dst, count) {
+                Ok(0) => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::UnexpectedEof,
+                        "failed to fill whole buffer",
+                    ))
+                }
+                Ok(n) => count -= n,
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn read_exact_to_at<F: FileReadWriteAtVolatile>(
+        &mut self,
+        mut dst: F,
+        mut count: usize,
+        mut off: u64,
+    ) -> io::Result<()> {
+        while count > 0 {
+            match self.read_to_at(&mut dst, count, off) {
+                Ok(0) => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::UnexpectedEof,
+                        "failed to fill whole buffer",
+                    ))
+                }
+                Ok(n) => {
+                    count -= n;
+                    off += n as u64;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Returns the number of bytes available for reading. The combined length of all the
+    /// buffers in the `DescriptorChain` is checked for `usize` overflow when the `Reader`
+    /// is constructed, so this call cannot fail.
+    pub fn available_bytes(&self) -> usize {
+        self.buffer.available_bytes()
+    }
+
+    /// Returns number of bytes already read from the descriptor chain buffer.
+    pub fn bytes_read(&self) -> usize {
+        self.buffer.bytes_consumed()
+    }
+
+    /// Splits this `Reader` into two at the given offset in the `DescriptorChain` buffer.
+    /// After the split, `self` will be able to read up to `offset` bytes while the returned
+    /// `Reader` can read up to `available_bytes() - offset` bytes.  Returns an error if
+    /// `offset > self.available_bytes()`.
+    pub fn split_at(&mut self, offset: usize) -> Result<Reader<'a>> {
+        self.buffer.split_at(offset).map(|buffer| Reader { buffer })
+    }
+
+    /// Returns a DescriptorIovec for the next `len` bytes of the descriptor chain
+    /// buffer, which can be used as an IntoIovec.
+    pub fn get_iovec(&mut self, len: usize) -> io::Result<DescriptorIovec<'a>> {
+        self.buffer.get_iovec(len)
+    }
+}
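To make the read side concrete, here is a minimal usage sketch. The request struct, its field layout, the sector arithmetic, and the surrounding function are hypothetical; it also assumes the module's existing imports plus `std::fs::File`, and that `&mut File` implements `FileReadWriteAtVolatile`, mirroring the `FileReadWriteVolatile` usage in the tests below.

```rust
// Hypothetical fixed-size request header; any #[repr(C)] plain-old-data type
// that implements `DataInit` can be parsed with `read_obj`.
#[derive(Copy, Clone)]
#[repr(C)]
struct ReqHeader {
    req_type: Le32,
    reserved: Le32,
    sector: Le64,
}
// Safe because the struct only contains plain data and has no padding.
unsafe impl DataInit for ReqHeader {}

fn handle_request<'a>(
    mem: &'a GuestMemory,
    chain: DescriptorChain<'a>,
    disk: &mut File,
) -> io::Result<()> {
    let mut reader = Reader::new(mem, chain)
        .map_err(|_| io::Error::from(io::ErrorKind::InvalidData))?;

    // Parse the header first...
    let hdr: ReqHeader = reader.read_obj()?;

    // ...then stream whatever payload remains in the readable descriptors
    // straight into the backing file, positioned at the requested sector.
    let remaining = reader.available_bytes();
    reader.read_exact_to_at(disk, remaining, u64::from(hdr.sector) * 512)
}
```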
+
+impl<'a> io::Read for Reader<'a> {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        self.buffer.consume(buf.len(), |bufs| {
+            let mut rem = buf;
+            let mut total = 0;
+            for vs in bufs {
+                // This is guaranteed by the implementation of `consume`.
+                debug_assert_eq!(vs.size(), cmp::min(rem.len() as u64, vs.size()));
+
+                // Safe because we have already verified that `vs` points to valid memory.
+                unsafe {
+                    copy_nonoverlapping(
+                        vs.as_ptr() as *const u8,
+                        rem.as_mut_ptr(),
+                        vs.size() as usize,
+                    );
+                }
+                let copied = vs.size() as usize;
+                rem = &mut rem[copied..];
+                total += copied;
+            }
+            Ok(total)
+        })
+    }
+}
+
+/// Provides a high-level interface over the sequence of memory regions
+/// defined by the writable descriptors in a descriptor chain.
+///
+/// Note that the virtio spec requires the driver to place all device-writable
+/// descriptors after the device-readable descriptors (section 2.6.4.2 of Virtio Spec v1.1).
+/// `Writer` starts iterating from the first writable descriptor and assumes that
+/// all following descriptors are also writable.
+#[derive(Clone)]
+pub struct Writer<'a> {
+    buffer: DescriptorChainConsumer<'a>,
+}
+
+impl<'a> Writer<'a> {
+    /// Construct a new Writer wrapper over `desc_chain`.
+    pub fn new(mem: &'a GuestMemory, desc_chain: DescriptorChain<'a>) -> Result<Writer<'a>> {
+        let mut total_len: usize = 0;
+        let buffers = desc_chain
+            .into_iter()
+            .writable()
+            .map(|desc| {
+                // Verify that summing the descriptor sizes does not overflow.
+                // This can happen if a driver tricks a device into writing more data than
+                // fits in a `usize`.
+                total_len = total_len
+                    .checked_add(desc.len as usize)
+                    .ok_or(Error::DescriptorChainOverflow)?;
+
+                mem.get_slice(desc.addr.offset(), desc.len.into())
+                    .map_err(Error::VolatileMemoryError)
+            })
+            .collect::<Result<VecDeque<VolatileSlice<'a>>>>()?;
+        Ok(Writer {
+            buffer: DescriptorChainConsumer {
+                buffers,
+                bytes_consumed: 0,
+            },
+        })
+    }
+
+    /// Writes an object to the descriptor chain buffer.
+    pub fn write_obj<T: DataInit>(&mut self, val: T) -> io::Result<()> {
+        self.write_all(val.as_slice())
+    }
+
+    /// Writes a collection of objects into the descriptor chain buffer.
+    pub fn consume<T: DataInit, C: IntoIterator<Item = T>>(&mut self, vals: C) -> io::Result<()> {
+        vals.into_iter().map(|v| self.write_obj(v)).collect()
+    }
+
+    /// Returns the number of bytes available for writing. The combined length of all the
+    /// buffers in the `DescriptorChain` is checked for `usize` overflow when the `Writer`
+    /// is constructed, so this call cannot fail.
+    pub fn available_bytes(&self) -> usize {
+        self.buffer.available_bytes()
+    }
+
+    /// Writes data to the descriptor chain buffer from a file descriptor.
+    /// Returns the number of bytes written to the descriptor chain buffer.
+    /// The number of bytes written can be less than `count` if
+    /// there isn't enough space left in the descriptor chain buffer.
+    pub fn write_from<F: FileReadWriteVolatile>(
+        &mut self,
+        mut src: F,
+        count: usize,
+    ) -> io::Result<usize> {
+        self.buffer
+            .consume(count, |bufs| src.read_vectored_volatile(bufs))
+    }
+
+    /// Writes data to the descriptor chain buffer from a File at offset `off`.
+    /// Returns the number of bytes written to the descriptor chain buffer.
+    /// The number of bytes written can be less than `count` if
+    /// there isn't enough space left in the descriptor chain buffer.
+    pub fn write_from_at<F: FileReadWriteAtVolatile>(
+        &mut self,
+        mut src: F,
+        count: usize,
+        off: u64,
+    ) -> io::Result<usize> {
+        self.buffer
+            .consume(count, |bufs| src.read_vectored_at_volatile(bufs, off))
+    }
+
+    pub fn write_all_from<F: FileReadWriteVolatile>(
+        &mut self,
+        mut src: F,
+        mut count: usize,
+    ) -> io::Result<()> {
+        while count > 0 {
+            match self.write_from(&mut src, count) {
+                Ok(0) => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::WriteZero,
+                        "failed to write whole buffer",
+                    ))
+                }
+                Ok(n) => count -= n,
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn write_all_from_at<F: FileReadWriteAtVolatile>(
+        &mut self,
+        mut src: F,
+        mut count: usize,
+        mut off: u64,
+    ) -> io::Result<()> {
+        while count > 0 {
+            match self.write_from_at(&mut src, count, off) {
+                Ok(0) => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::WriteZero,
+                        "failed to write whole buffer",
+                    ))
+                }
+                Ok(n) => {
+                    count -= n;
+                    off += n as u64;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+        Ok(())
+    }
+
+    /// Returns number of bytes already written to the descriptor chain buffer.
+    pub fn bytes_written(&self) -> usize {
+        self.buffer.bytes_consumed()
+    }
+
+    /// Splits this `Writer` into two at the given offset in the `DescriptorChain` buffer.
+    /// After the split, `self` will be able to write up to `offset` bytes while the returned
+    /// `Writer` can write up to `available_bytes() - offset` bytes.  Returns an error if
+    /// `offset > self.available_bytes()`.
+    pub fn split_at(&mut self, offset: usize) -> Result<Writer<'a>> {
+        self.buffer.split_at(offset).map(|buffer| Writer { buffer })
+    }
+
+    /// Returns a DescriptorIovec for the next `len` bytes of the descriptor chain
+    /// buffer, which can be used as an IntoIovec.
+    pub fn get_iovec(&mut self, len: usize) -> io::Result<DescriptorIovec<'a>> {
+        self.buffer.get_iovec(len)
+    }
+}
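The write side mirrors this. A hypothetical completion path that fills the guest's writable descriptors from a backing file and then appends a status byte might look like the following sketch; the same assumptions as the reader sketch above apply, and the status value and `u32` length accounting are purely illustrative.

```rust
fn complete_read<'a>(
    mem: &'a GuestMemory,
    chain: DescriptorChain<'a>,
    disk: &mut File,
    offset: u64,
    len: usize,
) -> io::Result<u32> {
    let mut writer = Writer::new(mem, chain)
        .map_err(|_| io::Error::from(io::ErrorKind::InvalidData))?;

    // Copy the requested range of the file into the writable descriptors...
    writer.write_all_from_at(disk, len, offset)?;
    // ...and append a single status byte (0 standing in for "OK" here).
    writer.write_obj(0u8)?;

    // Devices typically report how many bytes they wrote when returning the
    // descriptor chain to the used ring.
    Ok(writer.bytes_written() as u32)
}
```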
+
+impl<'a> io::Write for Writer<'a> {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        self.buffer.consume(buf.len(), |bufs| {
+            let mut rem = buf;
+            let mut total = 0;
+            for vs in bufs {
+                // This is guaranteed by the implementation of `consume`.
+                debug_assert_eq!(vs.size(), cmp::min(rem.len() as u64, vs.size()));
+
+                // Safe because we have already verified that `vs` points to valid memory.
+                unsafe {
+                    copy_nonoverlapping(rem.as_ptr(), vs.as_ptr(), vs.size() as usize);
+                }
+                let copied = vs.size() as usize;
+                rem = &rem[copied..];
+                total += copied;
+            }
+            Ok(total)
+        })
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        // Nothing to flush since the writes go straight into the buffer.
+        Ok(())
+    }
+}
+
+pub struct DescriptorIovec<'a> {
+    iovec: Vec<libc::iovec>,
+    mem: PhantomData<&'a GuestMemory>,
+}
+
+// Safe because the lifetime of DescriptorIovec is tied to the underlying GuestMemory.
+unsafe impl<'a> IntoIovec for DescriptorIovec<'a> {
+    fn into_iovec(&self) -> Vec<libc::iovec> {
+        self.iovec.clone()
+    }
+}
+
+const VIRTQ_DESC_F_NEXT: u16 = 0x1;
+const VIRTQ_DESC_F_WRITE: u16 = 0x2;
+
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub enum DescriptorType {
+    Readable,
+    Writable,
+}
+
+#[derive(Copy, Clone, Debug)]
+#[repr(C)]
+struct virtq_desc {
+    addr: Le64,
+    len: Le32,
+    flags: Le16,
+    next: Le16,
+}
+
+// Safe because it only has data and has no implicit padding.
+unsafe impl DataInit for virtq_desc {}
+
+/// Test utility function to create a descriptor chain in guest memory.
+pub fn create_descriptor_chain(
+    memory: &GuestMemory,
+    descriptor_array_addr: GuestAddress,
+    mut buffers_start_addr: GuestAddress,
+    descriptors: Vec<(DescriptorType, u32)>,
+    spaces_between_regions: u32,
+) -> Result<DescriptorChain> {
+    let descriptors_len = descriptors.len();
+    for (index, (type_, size)) in descriptors.into_iter().enumerate() {
+        let mut flags = 0;
+        if let DescriptorType::Writable = type_ {
+            flags |= VIRTQ_DESC_F_WRITE;
+        }
+        if index + 1 < descriptors_len {
+            flags |= VIRTQ_DESC_F_NEXT;
+        }
+
+        let index = index as u16;
+        let desc = virtq_desc {
+            addr: buffers_start_addr.offset().into(),
+            len: size.into(),
+            flags: flags.into(),
+            next: (index + 1).into(),
+        };
+
+        let offset = size + spaces_between_regions;
+        buffers_start_addr = buffers_start_addr
+            .checked_add(offset as u64)
+            .ok_or(Error::InvalidChain)?;
+
+        let _ = memory.write_obj_at_addr(
+            desc,
+            descriptor_array_addr
+                .checked_add(index as u64 * std::mem::size_of::<virtq_desc>() as u64)
+                .ok_or(Error::InvalidChain)?,
+        );
+    }
+
+    DescriptorChain::checked_new(memory, descriptor_array_addr, 0x100, 0, 0)
+        .ok_or(Error::InvalidChain)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs::{File, OpenOptions};
+    use tempfile::TempDir;
+
+    #[test]
+    fn reader_test_simple_chain() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Readable, 8),
+                (Readable, 16),
+                (Readable, 18),
+                (Readable, 64),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+        assert_eq!(reader.available_bytes(), 106);
+        assert_eq!(reader.bytes_read(), 0);
+
+        let mut buffer = [0 as u8; 64];
+        if let Err(_) = reader.read_exact(&mut buffer) {
+            panic!("read_exact should not fail here");
+        }
+
+        assert_eq!(reader.available_bytes(), 42);
+        assert_eq!(reader.bytes_read(), 64);
+
+        match reader.read(&mut buffer) {
+            Err(_) => panic!("read should not fail here"),
+            Ok(length) => assert_eq!(length, 42),
+        }
+
+        assert_eq!(reader.available_bytes(), 0);
+        assert_eq!(reader.bytes_read(), 106);
+    }
+
+    #[test]
+    fn writer_test_simple_chain() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Writable, 8),
+                (Writable, 16),
+                (Writable, 18),
+                (Writable, 64),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut writer = Writer::new(&memory, chain).expect("failed to create Writer");
+        assert_eq!(writer.available_bytes(), 106);
+        assert_eq!(writer.bytes_written(), 0);
+
+        let mut buffer = [0 as u8; 64];
+        if let Err(_) = writer.write_all(&mut buffer) {
+            panic!("write_all should not fail here");
+        }
+
+        assert_eq!(writer.available_bytes(), 42);
+        assert_eq!(writer.bytes_written(), 64);
+
+        match writer.write(&mut buffer) {
+            Err(_) => panic!("write should not fail here"),
+            Ok(length) => assert_eq!(length, 42),
+        }
+
+        assert_eq!(writer.available_bytes(), 0);
+        assert_eq!(writer.bytes_written(), 106);
+    }
+
+    #[test]
+    fn reader_test_incompatible_chain() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Writable, 8)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+        assert_eq!(reader.available_bytes(), 0);
+        assert_eq!(reader.bytes_read(), 0);
+
+        assert!(reader.read_obj::<u8>().is_err());
+
+        assert_eq!(reader.available_bytes(), 0);
+        assert_eq!(reader.bytes_read(), 0);
+    }
+
+    #[test]
+    fn writer_test_incompatible_chain() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Readable, 8)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut writer = Writer::new(&memory, chain).expect("failed to create Writer");
+        assert_eq!(writer.available_bytes(), 0);
+        assert_eq!(writer.bytes_written(), 0);
+
+        assert!(writer.write_obj(0u8).is_err());
+
+        assert_eq!(writer.available_bytes(), 0);
+        assert_eq!(writer.bytes_written(), 0);
+    }
+
+    #[test]
+    fn reader_failing_io() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Readable, 256), (Readable, 256)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        // Open a file in read-only mode so that writes to it trigger an I/O error.
+        let mut ro_file = File::open("/dev/zero").expect("failed to open /dev/zero");
+
+        reader
+            .read_exact_to(&mut ro_file, 512)
+            .expect_err("successfully read more bytes than SharedMemory size");
+
+        // The write above should have failed entirely, so we end up not writing any bytes at all.
+        assert_eq!(reader.available_bytes(), 512);
+        assert_eq!(reader.bytes_read(), 0);
+    }
+
+    #[test]
+    fn writer_failing_io() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Writable, 256), (Writable, 256)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+
+        let mut writer = Writer::new(&memory, chain).expect("failed to create Writer");
+
+        let tempdir = TempDir::new().unwrap();
+        let mut path = tempdir.path().to_owned();
+        path.push("test_file");
+
+        let mut file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create_new(true)
+            .open(&path)
+            .expect("failed to create temp file");
+
+        file.set_len(384).unwrap();
+
+        writer
+            .write_all_from(&mut file, 512)
+            .expect_err("successfully wrote more bytes than in SharedMemory");
+
+        assert_eq!(writer.available_bytes(), 128);
+        assert_eq!(writer.bytes_written(), 384);
+    }
+
+    #[test]
+    fn reader_writer_shared_chain() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Readable, 16),
+                (Readable, 16),
+                (Readable, 96),
+                (Writable, 64),
+                (Writable, 1),
+                (Writable, 3),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain.clone()).expect("failed to create Reader");
+        let mut writer = Writer::new(&memory, chain).expect("failed to create Writer");
+
+        assert_eq!(reader.bytes_read(), 0);
+        assert_eq!(writer.bytes_written(), 0);
+
+        let mut buffer = Vec::with_capacity(200);
+
+        assert_eq!(
+            reader
+                .read_to_end(&mut buffer)
+                .expect("read should not fail here"),
+            128
+        );
+
+        // The writable descriptors are only 68 bytes long.
+        writer
+            .write_all(&buffer[..68])
+            .expect("write should not fail here");
+
+        assert_eq!(reader.available_bytes(), 0);
+        assert_eq!(reader.bytes_read(), 128);
+        assert_eq!(writer.available_bytes(), 0);
+        assert_eq!(writer.bytes_written(), 68);
+    }
+
+    #[test]
+    fn reader_writer_shattered_object() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let secret: Le32 = 0x12345678.into();
+
+        // Create a descriptor chain with memory regions that are properly separated.
+        let chain_writer = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Writable, 1), (Writable, 1), (Writable, 1), (Writable, 1)],
+            123,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut writer = Writer::new(&memory, chain_writer).expect("failed to create Writer");
+        if let Err(_) = writer.write_obj(secret) {
+            panic!("write_obj should not fail here");
+        }
+
+        // Now create new descriptor chain pointing to the same memory and try to read it.
+        let chain_reader = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Readable, 1), (Readable, 1), (Readable, 1), (Readable, 1)],
+            123,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain_reader).expect("failed to create Reader");
+        match reader.read_obj::<Le32>() {
+            Err(_) => panic!("read_obj should not fail here"),
+            Ok(read_secret) => assert_eq!(read_secret, secret),
+        }
+    }
+
+    #[test]
+    fn reader_unexpected_eof() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Readable, 256), (Readable, 256)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        let mut buf = Vec::with_capacity(1024);
+        buf.resize(1024, 0);
+
+        assert_eq!(
+            reader
+                .read_exact(&mut buf[..])
+                .expect_err("read more bytes than available")
+                .kind(),
+            io::ErrorKind::UnexpectedEof
+        );
+    }
+
+    #[test]
+    fn split_border() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Readable, 16),
+                (Readable, 16),
+                (Readable, 96),
+                (Writable, 64),
+                (Writable, 1),
+                (Writable, 3),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        let other = reader.split_at(32).expect("failed to split Reader");
+        assert_eq!(reader.available_bytes(), 32);
+        assert_eq!(other.available_bytes(), 96);
+    }
+
+    #[test]
+    fn split_middle() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Readable, 16),
+                (Readable, 16),
+                (Readable, 96),
+                (Writable, 64),
+                (Writable, 1),
+                (Writable, 3),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        let other = reader.split_at(24).expect("failed to split Reader");
+        assert_eq!(reader.available_bytes(), 24);
+        assert_eq!(other.available_bytes(), 104);
+    }
+
+    #[test]
+    fn split_end() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Readable, 16),
+                (Readable, 16),
+                (Readable, 96),
+                (Writable, 64),
+                (Writable, 1),
+                (Writable, 3),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        let other = reader.split_at(128).expect("failed to split Reader");
+        assert_eq!(reader.available_bytes(), 128);
+        assert_eq!(other.available_bytes(), 0);
+    }
+
+    #[test]
+    fn split_beginning() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Readable, 16),
+                (Readable, 16),
+                (Readable, 96),
+                (Writable, 64),
+                (Writable, 1),
+                (Writable, 3),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        let other = reader.split_at(0).expect("failed to split Reader");
+        assert_eq!(reader.available_bytes(), 0);
+        assert_eq!(other.available_bytes(), 128);
+    }
+
+    #[test]
+    fn split_outofbounds() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![
+                (Readable, 16),
+                (Readable, 16),
+                (Readable, 96),
+                (Writable, 64),
+                (Writable, 1),
+                (Writable, 3),
+            ],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        if let Ok(_) = reader.split_at(256) {
+            panic!("successfully split Reader with out of bounds offset");
+        }
+    }
+
+    #[test]
+    fn read_full() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Readable, 16), (Readable, 16), (Readable, 16)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, chain).expect("failed to create Reader");
+
+        let mut buf = vec![0u8; 64];
+        assert_eq!(
+            reader.read(&mut buf[..]).expect("failed to read to buffer"),
+            48
+        );
+    }
+
+    #[test]
+    fn write_full() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+
+        let chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Writable, 16), (Writable, 16), (Writable, 16)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut writer = Writer::new(&memory, chain).expect("failed to create Writer");
+
+        let buf = vec![0xdeu8; 64];
+        assert_eq!(
+            writer.write(&buf[..]).expect("failed to write from buffer"),
+            48
+        );
+    }
+
+    #[test]
+    fn consume_collect() {
+        use DescriptorType::*;
+
+        let memory_start_addr = GuestAddress(0x0);
+        let memory = GuestMemory::new(&vec![(memory_start_addr, 0x10000)]).unwrap();
+        let vs: Vec<Le64> = vec![
+            0x0101010101010101.into(),
+            0x0202020202020202.into(),
+            0x0303030303030303.into(),
+        ];
+
+        let write_chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Writable, 24)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut writer = Writer::new(&memory, write_chain).expect("failed to create Writer");
+        writer
+            .consume(vs.clone())
+            .expect("failed to consume() a vector");
+
+        let read_chain = create_descriptor_chain(
+            &memory,
+            GuestAddress(0x0),
+            GuestAddress(0x100),
+            vec![(Readable, 24)],
+            0,
+        )
+        .expect("create_descriptor_chain failed");
+        let mut reader = Reader::new(&memory, read_chain).expect("failed to create Reader");
+        let vs_read = reader
+            .collect::<io::Result<Vec<Le64>>, _>()
+            .expect("failed to collect() values");
+        assert_eq!(vs, vs_read);
+    }
+}
diff --git a/devices/src/virtio/fs/filesystem.rs b/devices/src/virtio/fs/filesystem.rs
new file mode 100644
index 0000000..232ff99
--- /dev/null
+++ b/devices/src/virtio/fs/filesystem.rs
@@ -0,0 +1,1142 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::convert::TryInto;
+use std::ffi::CStr;
+use std::fs::File;
+use std::io;
+use std::mem;
+use std::time::Duration;
+
+use libc;
+
+use crate::virtio::fs::fuse;
+
+pub use fuse::{FsOptions, IoctlFlags, IoctlIovec, OpenOptions, SetattrValid, ROOT_ID};
+
+/// Information about a path in the filesystem.
+pub struct Entry {
+    /// An `Inode` that uniquely identifies this path. During `lookup`, setting this to `0` means a
+    /// negative entry. Returning `ENOENT` also means a negative entry but setting this to `0`
+    /// allows the kernel to cache the negative result for `entry_timeout`. The value should be
+    /// produced by converting a `FileSystem::Inode` into a `u64`.
+    pub inode: u64,
+
+    /// The generation number for this `Entry`. Typically used for network file systems. An `inode`
+    /// / `generation` pair must be unique over the lifetime of the file system (rather than just
+    /// the lifetime of the mount). In other words, if a `FileSystem` implementation re-uses an
+    /// `Inode` after it has been deleted then it must assign a new, previously unused generation
+    /// number to the `Inode` at the same time.
+    pub generation: u64,
+
+    /// Inode attributes. Even if `attr_timeout` is zero, `attr` must be correct. For example, for
+    /// `open()`, FUSE uses `attr.st_size` from `lookup()` to determine how many bytes to request.
+    /// If this value is not correct, incorrect data will be returned.
+    pub attr: libc::stat64,
+
+    /// How long the values in `attr` should be considered valid. If the attributes of the `Entry`
+    /// are only modified by the FUSE client, then this should be set to a very large value.
+    pub attr_timeout: Duration,
+
+    /// How long the name associated with this `Entry` should be considered valid. If directory
+    /// entries are only changed or deleted by the FUSE client, then this should be set to a very
+    /// large value.
+    pub entry_timeout: Duration,
+}
+
+impl From<Entry> for fuse::EntryOut {
+    fn from(entry: Entry) -> fuse::EntryOut {
+        fuse::EntryOut {
+            nodeid: entry.inode,
+            generation: entry.generation,
+            entry_valid: entry.entry_timeout.as_secs(),
+            attr_valid: entry.attr_timeout.as_secs(),
+            entry_valid_nsec: entry.entry_timeout.subsec_nanos(),
+            attr_valid_nsec: entry.attr_timeout.subsec_nanos(),
+            attr: entry.attr.into(),
+        }
+    }
+}
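For illustration, a `lookup` implementation typically fills these fields in roughly the following way. This is a sketch only: the timeouts, the generation handling, and where the `stat64` comes from are assumptions, not requirements of the trait.

```rust
fn entry_for(inode: u64, st: libc::stat64) -> Entry {
    Entry {
        inode,
        // A constant generation is fine as long as inode numbers are never
        // reused; NFS-exported or inode-recycling filesystems need more care.
        generation: 0,
        attr: st,
        // Long timeouts are only safe when nothing outside the FUSE client
        // can change the file; otherwise keep these short.
        attr_timeout: Duration::from_secs(5),
        entry_timeout: Duration::from_secs(5),
    }
}
```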
+
+/// Represents information about an entry in a directory.
+pub struct DirEntry<'a> {
+    /// The inode number for this entry. This does NOT have to be the same as the `Inode` for this
+    /// directory entry. However, it must be the same as the `attr.st_ino` field of the `Entry` that
+    /// would be returned by a `lookup` request in the parent directory for `name`.
+    pub ino: libc::ino64_t,
+
+    /// Any non-zero value that the kernel can use to identify the current point in the directory
+    /// entry stream. It does not need to be the actual physical position. A value of `0` is
+    /// reserved to mean "from the beginning" and should never be used. The `offset` value of the
+    /// first entry in a stream should point to the beginning of the second entry and so on.
+    pub offset: u64,
+
+    /// The type of this directory entry. Valid values are any of the `libc::DT_*` constants.
+    pub type_: u32,
+
+    /// The name of this directory entry. There are no requirements for the contents of this field
+    /// and any sequence of bytes is considered valid.
+    pub name: &'a [u8],
+}
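As a sketch of the `offset` convention described above (each entry's `offset` is the cookie a later `readdir` uses to resume after that entry), consider two hypothetical entries; the names and inode numbers are invented.

```rust
fn example_entries() -> [DirEntry<'static>; 2] {
    [
        // Resuming a readdir at offset 1 starts with "subdir".
        DirEntry { ino: 10, offset: 1, type_: libc::DT_REG as u32, name: b"data.bin" },
        DirEntry { ino: 11, offset: 2, type_: libc::DT_DIR as u32, name: b"subdir" },
    ]
}
```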
+
+/// A reply to a `getxattr` method call.
+pub enum GetxattrReply {
+    /// The value of the requested extended attribute. This can be arbitrary textual or binary data
+    /// and does not need to be nul-terminated.
+    Value(Vec<u8>),
+
+    /// The size of the buffer needed to hold the value of the requested extended attribute. Should
+    /// be returned when the `size` parameter is 0. Callers should note that it is still possible
+    /// for the size of the value to change in between `getxattr` calls and should not assume that a
+    /// subsequent call to `getxattr` with the returned count will always succeed.
+    Count(u32),
+}
+
+/// A reply to a `listxattr` method call.
+pub enum ListxattrReply {
+    /// A buffer containing a nul-separated list of the names of all the extended attributes
+    /// associated with this `Inode`. This list of names may be unordered and includes a namespace
+    /// prefix. There may be several disjoint namespaces associated with a single `Inode`.
+    Names(Vec<u8>),
+
+    /// The size of the buffer needed to hold the full list of extended attribute names associated
+    /// with this `Inode`. Should be returned when the `size` parameter is 0. Callers should note
+    /// that it is still possible for the set of extended attributes to change between `listxattr`
+    /// calls and so should not assume that a subsequent call to `listxattr` with the returned count
+    /// will always succeed.
+    Count(u32),
+}
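Both reply enums follow the same size-probe convention: a `size` of 0 asks only for the required buffer size. A minimal sketch of that convention for the `getxattr` case follows (the helper name is hypothetical, and a real implementation would also need to report when the value does not fit in a non-zero `size`).

```rust
fn getxattr_reply(value: &[u8], size: u32) -> GetxattrReply {
    if size == 0 {
        // The caller is only probing for the required buffer size.
        GetxattrReply::Count(value.len() as u32)
    } else {
        GetxattrReply::Value(value.to_vec())
    }
}
```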
+
+/// A reply to an `ioctl` method call.
+pub enum IoctlReply {
+    /// Indicates that the ioctl should be retried. This is only a valid reply when the `flags`
+    /// field of the ioctl request contains `IoctlFlags::UNRESTRICTED`. The kernel will read in data
+    /// and prepare output buffers as specified in the `input` and `output` fields before re-sending
+    /// the ioctl message.
+    Retry {
+        /// Data that should be read by the kernel module and sent to the server when the ioctl is
+        /// retried.
+        input: Vec<IoctlIovec>,
+
+        /// Buffer space that should be prepared so that the server can send back the response to
+        /// the ioctl.
+        output: Vec<IoctlIovec>,
+    },
+
+    /// Indicates that the ioctl was processed.
+    Done(io::Result<Vec<u8>>),
+}
+
+/// A trait for directly copying data from the fuse transport into a `File` without first storing it
+/// in an intermediate buffer.
+pub trait ZeroCopyReader {
+    /// Copies at most `count` bytes from `self` directly into `f` at offset `off` without storing
+    /// it in any intermediate buffers. If the return value is `Ok(n)` then it must be guaranteed
+    /// that `0 <= n <= count`. If `n` is `0`, then it can indicate one of 3 possibilities:
+    ///
+    /// 1. There is no more data left in `self`.
+    /// 2. There is no more space in `f`.
+    /// 3. `count` was `0`.
+    ///
+    /// # Errors
+    ///
+    /// If any error is returned then the implementation must guarantee that no bytes were copied
+    /// from `self`. If the underlying write to `f` returns `0` then the implementation must return
+    /// an error of the kind `io::ErrorKind::WriteZero`.
+    fn read_to(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<usize>;
+
+    /// Copies exactly `count` bytes of data from `self` into `f` at offset `off`. `off + count`
+    /// must be less than `u64::MAX`.
+    ///
+    /// # Errors
+    ///
+    /// If an error is returned then the number of bytes copied from `self` is unspecified but it
+    /// will never be more than `count`.
+    fn read_exact_to(&mut self, f: &mut File, mut count: usize, mut off: u64) -> io::Result<()> {
+        let c = count
+            .try_into()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
+        if off.checked_add(c).is_none() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "`off` + `count` must be less than u64::MAX",
+            ));
+        }
+
+        while count > 0 {
+            match self.read_to(f, count, off) {
+                Ok(0) => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::WriteZero,
+                        "failed to fill whole buffer",
+                    ))
+                }
+                Ok(n) => {
+                    count -= n;
+                    off += n as u64;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Copies all remaining bytes from `self` into `f` at offset `off`. Equivalent to repeatedly
+    /// calling `read_to` until it returns either `Ok(0)` or a non-`ErrorKind::Interrupted` error.
+    ///
+    /// # Errors
+    ///
+    /// If an error is returned then the number of bytes copied from `self` is unspecified.
+    fn copy_to_end(&mut self, f: &mut File, mut off: u64) -> io::Result<usize> {
+        let mut out = 0;
+        loop {
+            match self.read_to(f, ::std::usize::MAX, off) {
+                Ok(0) => return Ok(out),
+                Ok(n) => {
+                    off = off.saturating_add(n as u64);
+                    out += n;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+    }
+}
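A minimal sketch of the `read_to` contract, using a hypothetical in-memory source and `std::os::unix::fs::FileExt::write_at` for the positioned write; a real transport would hand out guest memory rather than a `Vec`.

```rust
use std::os::unix::fs::FileExt;

struct BufSource {
    data: Vec<u8>,
    pos: usize,
}

impl ZeroCopyReader for BufSource {
    fn read_to(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<usize> {
        let end = self.pos.saturating_add(count).min(self.data.len());
        if self.pos == end {
            // No data left in `self` (or `count` was 0).
            return Ok(0);
        }
        let n = f.write_at(&self.data[self.pos..end], off)?;
        if n == 0 {
            // Per the contract above, a zero-length write must surface as WriteZero.
            return Err(io::Error::new(io::ErrorKind::WriteZero, "write_at returned 0"));
        }
        self.pos += n;
        Ok(n)
    }
}
```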
+
+impl<'a, R: ZeroCopyReader> ZeroCopyReader for &'a mut R {
+    fn read_to(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<usize> {
+        (**self).read_to(f, count, off)
+    }
+    fn read_exact_to(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<()> {
+        (**self).read_exact_to(f, count, off)
+    }
+    fn copy_to_end(&mut self, f: &mut File, off: u64) -> io::Result<usize> {
+        (**self).copy_to_end(f, off)
+    }
+}
+
+/// A trait for directly copying data from a `File` into the fuse transport without first storing
+/// it in an intermediate buffer.
+pub trait ZeroCopyWriter {
+    /// Copies at most `count` bytes from `f` at offset `off` directly into `self` without storing
+    /// it in any intermediate buffers. If the return value is `Ok(n)` then it must be guaranteed
+    /// that `0 <= n <= count`. If `n` is `0`, then it can indicate one of 3 possibilities:
+    ///
+    /// 1. There is no more data left in `f`.
+    /// 2. There is no more space in `self`.
+    /// 3. `count` was `0`.
+    ///
+    /// # Errors
+    ///
+    /// If any error is returned then the implementation must guarantee that no bytes were copied
+    /// from `f`. If the underlying read from `f` returns `0` then the implementation must return an
+    /// error of the kind `io::ErrorKind::UnexpectedEof`.
+    fn write_from(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<usize>;
+
+    /// Copies exactly `count` bytes of data from `f` at offset `off` into `self`. `off + count`
+    /// must be less than `u64::MAX`.
+    ///
+    /// # Errors
+    ///
+    /// If an error is returned then the number of bytes copied from `f` is unspecified but it
+    /// will never be more than `count`.
+    fn write_all_from(&mut self, f: &mut File, mut count: usize, mut off: u64) -> io::Result<()> {
+        let c = count
+            .try_into()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
+        if off.checked_add(c).is_none() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "`off` + `count` must be less than u64::MAX",
+            ));
+        }
+
+        while count > 0 {
+            match self.write_from(f, count, off) {
+                Ok(0) => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::UnexpectedEof,
+                        "failed to write whole buffer",
+                    ))
+                }
+                Ok(n) => {
+                    // No need for checked math here because we verified that `off + count` will not
+                    // overflow and `n` must be <= `count`.
+                    count -= n;
+                    off += n as u64;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Copies all remaining bytes from `f` at offset `off` into `self`. Equivalent to repeatedly
+    /// calling `write_from` until it returns either `Ok(0)` or a non-`ErrorKind::Interrupted`
+    /// error.
+    ///
+    /// # Errors
+    ///
+    /// If an error is returned then the number of bytes copied from `f` is unspecified.
+    fn copy_to_end(&mut self, f: &mut File, mut off: u64) -> io::Result<usize> {
+        let mut out = 0;
+        loop {
+            match self.write_from(f, ::std::usize::MAX, off) {
+                Ok(0) => return Ok(out),
+                Ok(n) => {
+                    off = off.saturating_add(n as u64);
+                    out += n;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+        }
+    }
+}
+
+impl<'a, W: ZeroCopyWriter> ZeroCopyWriter for &'a mut W {
+    fn write_from(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<usize> {
+        (**self).write_from(f, count, off)
+    }
+    fn write_all_from(&mut self, f: &mut File, count: usize, off: u64) -> io::Result<()> {
+        (**self).write_all_from(f, count, off)
+    }
+    fn copy_to_end(&mut self, f: &mut File, off: u64) -> io::Result<usize> {
+        (**self).copy_to_end(f, off)
+    }
+}
+
+/// Additional context associated with requests.
+#[derive(Clone, Copy, Debug)]
+pub struct Context {
+    /// The user ID of the calling process.
+    pub uid: libc::uid_t,
+
+    /// The group ID of the calling process.
+    pub gid: libc::gid_t,
+
+    /// The thread group ID of the calling process.
+    pub pid: libc::pid_t,
+}
+
+impl From<fuse::InHeader> for Context {
+    fn from(source: fuse::InHeader) -> Self {
+        Context {
+            uid: source.uid,
+            gid: source.gid,
+            pid: source.pid as i32,
+        }
+    }
+}
+
+/// The main trait that connects a file system with a transport.
+#[allow(unused_variables)]
+pub trait FileSystem {
+    /// Represents a location in the filesystem tree and can be used to perform operations that act
+    /// on the metadata of a file/directory (e.g., `getattr` and `setattr`). Can also be used as the
+    /// starting point for looking up paths in the filesystem tree. An `Inode` may support operating
+    /// directly on the content of the path to which it points. `FileSystem` implementations
+    /// that support this should set the `FsOptions::ZERO_MESSAGE_OPEN` option in the return value
+    /// of the `init` function. On Linux-based systems, an `Inode` is equivalent to opening a file
+    /// or directory with the `libc::O_PATH` flag.
+    ///
+    /// # Lookup Count
+    ///
+    /// The `FileSystem` implementation is required to keep a "lookup count" for every `Inode`.
+    /// Every time an `Entry` is returned by a `FileSystem` trait method, this lookup count should
+    /// increase by 1. The lookup count for an `Inode` decreases when the kernel sends a `forget`
+    /// request. `Inode`s with a non-zero lookup count may receive requests from the kernel even
+    /// after calls to `unlink`, `rmdir` or (when overwriting an existing file) `rename`.
+    /// `FileSystem` implementations must handle such requests properly and it is recommended to
+    /// defer removal of the `Inode` until the lookup count reaches zero. Calls to `unlink`, `rmdir`
+    /// or `rename` will be followed closely by `forget` unless the file or directory is open, in
+    /// which case the kernel issues `forget` only after the `release` or `releasedir` calls.
+    ///
+    /// Note that if a file system will be exported over NFS the `Inode`'s lifetime must extend even
+    /// beyond `forget`. See the `generation` field in `Entry`.
+    type Inode: From<u64> + Into<u64>;
+
+    /// Represents a file or directory that is open for reading/writing.
+    type Handle: From<u64> + Into<u64>;
+
+    /// Initialize the file system.
+    ///
+    /// This method is called when a connection to the FUSE kernel module is first established. The
+    /// `capable` parameter indicates the features that are supported by the kernel module. The
+    /// implementation should return the options that it supports. Any options set in the returned
+    /// `FsOptions` that are not also set in `capable` are silently dropped.
+    fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
+        Ok(FsOptions::empty())
+    }
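As an illustration of the negotiation described above, and assuming `FsOptions` is a bitflags-style type (which the `empty()` default suggests), an implementation inside a hypothetical `impl FileSystem for MyFs` might intersect its wishes with `capable`:

```rust
fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
    // Request only features the kernel actually offers; anything else would
    // be silently dropped anyway.
    let wanted = FsOptions::WRITEBACK_CACHE | FsOptions::DONT_MASK;
    Ok(capable & wanted)
}
```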
+
+    /// Clean up the file system.
+    ///
+    /// Called when the filesystem exits. All open `Handle`s should be closed and the lookup count
+    /// for all open `Inode`s implicitly goes to zero. At this point the connection to the FUSE
+    /// kernel module may already be gone so implementations should not rely on being able to
+    /// communicate with the kernel.
+    fn destroy(&self) {}
+
+    /// Look up a directory entry by name and get its attributes.
+    ///
+    /// If this call is successful then the lookup count of the `Inode` associated with the returned
+    /// `Entry` must be increased by 1.
+    fn lookup(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<Entry> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Forget about an inode.
+    ///
+    /// Called when the kernel removes an inode from its internal caches. `count` indicates the
+    /// amount by which the lookup count for the inode should be decreased. If reducing the lookup
+    /// count by `count` causes it to go to zero, then the implementation may delete the `Inode`.
+    fn forget(&self, ctx: Context, inode: Self::Inode, count: u64) {}
+
+    /// Forget about multiple inodes.
+    ///
+    /// `requests` is a vector of `(inode, count)` pairs. See the documentation for `forget` for
+    /// more information.
+    fn batch_forget(&self, ctx: Context, requests: Vec<(Self::Inode, u64)>) {
+        for (inode, count) in requests {
+            self.forget(ctx, inode, count)
+        }
+    }
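To make the lookup-count rules concrete, here is a sketch of `forget` bookkeeping as it might appear inside an implementation of this trait, against a hypothetical `lookup_counts: Mutex<HashMap<u64, u64>>` field on the filesystem struct (none of these names come from this file).

```rust
fn forget(&self, _ctx: Context, inode: Self::Inode, count: u64) {
    let mut counts = self.lookup_counts.lock().unwrap();
    let key: u64 = inode.into();
    if let Some(refs) = counts.get_mut(&key) {
        if *refs <= count {
            // Lookup count reached zero: the inode may now be released, since
            // the kernel promises not to reference it again.
            counts.remove(&key);
        } else {
            *refs -= count;
        }
    }
}
```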
+
+    /// Get attributes for a file / directory.
+    ///
+    /// If `handle` is not `None`, then it contains the handle previously returned by the
+    /// implementation after a call to `open` or `opendir`. However, implementations should still
+    /// take care to verify the handle if they do not trust the client (e.g., virtio-fs).
+    ///
+    /// If writeback caching is enabled (`FsOptions::WRITEBACK_CACHE`), then the kernel module
+    /// likely has a better idea of the length of the file than the file system (for
+    /// example, if there was a write that extended the size of the file but has not yet been
+    /// flushed). In this case, the `st_size` field of the returned struct is ignored.
+    ///
+    /// The returned `Duration` indicates how long the returned attributes should be considered
+    /// valid by the client. If the attributes are only changed via the FUSE kernel module (i.e.,
+    /// the kernel module has exclusive access), then this should be a very large value.
+    fn getattr(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        handle: Option<Self::Handle>,
+    ) -> io::Result<(libc::stat64, Duration)> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Set attributes for a file / directory.
+    ///
+    /// If `handle` is not `None`, then it contains the handle previously returned by the
+    /// implementation after a call to `open` or `opendir`. However, implementations should still
+    /// take care to verify the handle if they do not trust the client (e.g., virtio-fs).
+    ///
+    /// The `valid` parameter indicates the fields of `attr` that may be considered valid and should
+    /// be set by the file system. The content of all other fields in `attr` is undefined.
+    ///
+    /// If the `FsOptions::HANDLE_KILLPRIV` was set during `init`, then the implementation is
+    /// expected to reset the setuid and setgid bits if the file size or owner is being changed.
+    ///
+    /// This method returns the new attributes after making the modifications requested by the
+    /// client. The returned `Duration` indicates how long the returned attributes should be
+    /// considered valid by the client. If the attributes are only changed via the FUSE kernel
+    /// module (i.e., the kernel module has exclusive access), then this should be a very large
+    /// value.
+    fn setattr(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        attr: libc::stat64,
+        handle: Option<Self::Handle>,
+        valid: SetattrValid,
+    ) -> io::Result<(libc::stat64, Duration)> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Read a symbolic link.
+    fn readlink(&self, ctx: Context, inode: Self::Inode) -> io::Result<Vec<u8>> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Create a symbolic link.
+    ///
+    /// The file system must create a symbolic link named `name` in the directory represented by
+    /// `parent`, which contains the string `linkname`. Returns an `Entry` for the newly created
+    /// symlink.
+    ///
+    /// If this call is successful then the lookup count of the `Inode` associated with the returned
+    /// `Entry` must be increased by 1.
+    fn symlink(
+        &self,
+        ctx: Context,
+        linkname: &CStr,
+        parent: Self::Inode,
+        name: &CStr,
+    ) -> io::Result<Entry> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Create a file node.
+    ///
+    /// Create a regular file, character device, block device, fifo, or socket node named `name` in
+    /// the directory represented by `inode`. Valid values for `mode` and `rdev` are the same as
+    /// those accepted by the `mknod(2)` system call. Returns an `Entry` for the newly created node.
+    ///
+    /// When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for setting
+    /// the permissions of the created node to `mode & !umask`.
+    ///
+    /// If this call is successful then the lookup count of the `Inode` associated with the returned
+    /// `Entry` must be increased by 1.
+    fn mknod(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        name: &CStr,
+        mode: u32,
+        rdev: u32,
+        umask: u32,
+    ) -> io::Result<Entry> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Create a directory.
+    ///
+    /// When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for setting
+    /// the permissions of the created directory to `mode & !umask`. Returns an `Entry` for the
+    /// newly created directory.
+    ///
+    /// If this call is successful then the lookup count of the `Inode` associated with the returned
+    /// `Entry` must be increased by 1.
+    fn mkdir(
+        &self,
+        ctx: Context,
+        parent: Self::Inode,
+        name: &CStr,
+        mode: u32,
+        umask: u32,
+    ) -> io::Result<Entry> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Remove a file.
+    ///
+    /// If the file's inode lookup count is non-zero, then the file system is expected to delay
+    /// removal of the inode until the lookup count goes to zero. See the documentation of the
+    /// `forget` function for more information.
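+    ///
+    /// A passthrough-style sketch using `unlinkat(2)` is shown below; the `fd_for_inode` helper
+    /// is hypothetical. `rmdir` is analogous but passes `libc::AT_REMOVEDIR` in the flags.
+    ///
+    /// ```ignore
+    /// fn unlink(&self, _ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<()> {
+    ///     let dirfd = self.fd_for_inode(parent)?; // hypothetical inode -> fd lookup
+    ///
+    ///     // Safe because `name` is a valid, nul-terminated C string.
+    ///     if unsafe { libc::unlinkat(dirfd, name.as_ptr(), 0) } < 0 {
+    ///         return Err(io::Error::last_os_error());
+    ///     }
+    ///     Ok(())
+    /// }
+    /// ```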
+    fn unlink(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<()> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Remove a directory.
+    ///
+    /// If the directory's inode lookup count is non-zero, then the file system is expected to delay
+    /// removal of the inode until the lookup count goes to zero. See the documentation of the
+    /// `forget` function for more information.
+    fn rmdir(&self, ctx: Context, parent: Self::Inode, name: &CStr) -> io::Result<()> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Rename a file / directory.
+    ///
+    /// If the destination exists, it should be atomically replaced. If the destination's inode
+    /// lookup count is non-zero, then the file system is expected to delay removal of the inode
+    /// until the lookup count goes to zero. See the documentation of the `forget` function for more
+    /// information.
+    ///
+    /// `flags` may be `libc::RENAME_EXCHANGE` or `libc::RENAME_NOREPLACE`. If
+    /// `libc::RENAME_NOREPLACE` is specified, the implementation must not overwrite `newname` if it
+    /// exists and must return an error instead. If `libc::RENAME_EXCHANGE` is specified, the
+    /// implementation must atomically exchange the two files, i.e., both must exist and neither may
+    /// be deleted.
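+    ///
+    /// For example, an implementation will typically begin by rejecting any flag combination it
+    /// does not handle (a hedged sketch, not required behavior):
+    ///
+    /// ```ignore
+    /// let supported = libc::RENAME_EXCHANGE as u32 | libc::RENAME_NOREPLACE as u32;
+    /// if flags & !supported != 0 {
+    ///     // Unknown or unsupported rename flags.
+    ///     return Err(io::Error::from_raw_os_error(libc::EINVAL));
+    /// }
+    /// ```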
+    fn rename(
+        &self,
+        ctx: Context,
+        olddir: Self::Inode,
+        oldname: &CStr,
+        newdir: Self::Inode,
+        newname: &CStr,
+        flags: u32,
+    ) -> io::Result<()> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Create a hard link.
+    ///
+    /// Create a hard link from `inode` to `newname` in the directory represented by `newparent`.
+    ///
+    /// If this call is successful then the lookup count of the `Inode` associated with the returned
+    /// `Entry` must be increased by 1.
+    fn link(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        newparent: Self::Inode,
+        newname: &CStr,
+    ) -> io::Result<Entry> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Open a file.
+    ///
+    /// Open the file associated with `inode` for reading / writing. All values accepted by the
+    /// `open(2)` system call are valid values for `flags` and must be handled by the file system.
+    /// However, there are some additional rules:
+    ///
+    /// * Creation flags (`libc::O_CREAT`, `libc::O_EXCL`, `libc::O_NOCTTY`) will be filtered out
+    ///   and handled by the kernel.
+    ///
+    /// * The file system should check the access modes (`libc::O_RDONLY`, `libc::O_WRONLY`,
+    ///   `libc::O_RDWR`) to determine if the operation is permitted. If the file system was mounted
+    ///   with the `-o default_permissions` mount option, then this check will also be carried out
+    ///   by the kernel before sending the open request.
+    ///
+    /// * When writeback caching is enabled (`FsOptions::WRITEBACK_CACHE`) the kernel may send read
+    ///   requests even for files opened with `libc::O_WRONLY`. The file system should be prepared
+    ///   to handle this.
+    ///
+    /// * When writeback caching is enabled, the kernel will handle the `libc::O_APPEND` flag.
+    ///   However, this will not work reliably unless the kernel has exclusive access to the file.
+    ///   In this case the file system may either ignore the `libc::O_APPEND` flag or return an
+    ///   error to indicate that reliable `libc::O_APPEND` handling is not available.
+    ///
+    /// * When writeback caching is disabled, the file system is expected to properly handle
+    ///   `libc::O_APPEND` and ensure that each write is appended to the end of the file.
+    ///
+    /// The file system may choose to return a `Handle` to refer to the newly opened file. The
+    /// kernel will then use this `Handle` for all operations on the content of the file (`read`,
+    /// `write`, `flush`, `release`, `fsync`). If the file system does not return a
+    /// `Handle` then the kernel will use the `Inode` for the file to operate on its contents. In
+    /// this case the file system may wish to enable the `FsOptions::ZERO_MESSAGE_OPEN` feature if
+    /// it is supported by the kernel (see below).
+    ///
+    /// The returned `OpenOptions` allow the file system to change the way the opened file is
+    /// handled by the kernel. See the documentation of `OpenOptions` for more information.
+    ///
+    /// If the `FsOptions::ZERO_MESSAGE_OPEN` feature is enabled by both the file system
+    /// implementation and the kernel, then the file system may return an error of `ENOSYS`. This
+    /// will be interpreted by the kernel as success and future calls to `open` and `release` will
+    /// be handled by the kernel without being passed on to the file system.
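+    ///
+    /// A minimal sketch that opens the backing file and hands a `Handle` back to the kernel is
+    /// shown below. The `open_inode` and `insert_handle` helpers are hypothetical stand-ins for
+    /// however the implementation opens the underlying file and tracks open handles.
+    ///
+    /// ```ignore
+    /// fn open(
+    ///     &self,
+    ///     _ctx: Context,
+    ///     inode: Self::Inode,
+    ///     flags: u32,
+    /// ) -> io::Result<(Option<Self::Handle>, OpenOptions)> {
+    ///     // Creation flags have already been filtered out by the kernel.
+    ///     let file = self.open_inode(inode, flags as i32)?; // hypothetical helper
+    ///     let handle = self.insert_handle(file); // hypothetical; returns a fresh Handle
+    ///     Ok((Some(handle), OpenOptions::empty()))
+    /// }
+    /// ```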
+    fn open(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        flags: u32,
+    ) -> io::Result<(Option<Self::Handle>, OpenOptions)> {
+        // Matches the behavior of libfuse.
+        Ok((None, OpenOptions::empty()))
+    }
+
+    /// Create and open a file.
+    ///
+    /// If the file does not already exist, the file system should create it with the specified
+    /// `mode`. When the `FsOptions::DONT_MASK` feature is set, the file system is responsible for
+    /// setting the permissions of the created file to `mode & !umask`.
+    ///
+    /// If the file system returns an `ENOSYS` error, then the kernel will treat this method as
+    /// unimplemented and all future calls to `create` will be handled by calling the `mknod` and
+    /// `open` methods instead.
+    ///
+    /// See the documentation for the `open` method for more information about opening the file. In
+    /// addition to the optional `Handle` and the `OpenOptions`, the file system must also return an
+    /// `Entry` for the file. This increases the lookup count for the `Inode` associated with the
+    /// file by 1.
+    fn create(
+        &self,
+        ctx: Context,
+        parent: Self::Inode,
+        name: &CStr,
+        mode: u32,
+        flags: u32,
+        umask: u32,
+    ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Read data from a file.
+    ///
+    /// Returns `size` bytes of data starting at `offset` from the file associated with `inode` or
+    /// `handle`.
+    ///
+    /// `flags` contains the flags used to open the file. Similarly, `handle` is the `Handle`
+    /// returned by the file system from the `open` method, if any. If the file system
+    /// implementation did not return a `Handle` from `open` then the contents of `handle` are
+    /// undefined.
+    ///
+    /// This method should return exactly the number of bytes requested by the kernel, except in
+    /// the case of error or EOF. If fewer bytes are returned, the kernel will substitute the rest
+    /// of the data with zeroes. An exception to this rule is if the file was opened with the
+    /// "direct I/O" option
+    /// (`libc::O_DIRECT`), in which case the kernel will forward the return code from this method
+    /// to the userspace application that made the system call.
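+    ///
+    /// The sketch below ignores the zero-copy path and simply copies through an intermediate
+    /// buffer; the `file_for_handle` helper is hypothetical and returns the `std::fs::File`
+    /// backing `handle`.
+    ///
+    /// ```ignore
+    /// fn read<W: io::Write + ZeroCopyWriter>(
+    ///     &self,
+    ///     _ctx: Context,
+    ///     _inode: Self::Inode,
+    ///     handle: Self::Handle,
+    ///     mut w: W,
+    ///     size: u32,
+    ///     offset: u64,
+    ///     _lock_owner: Option<u64>,
+    ///     _flags: u32,
+    /// ) -> io::Result<usize> {
+    ///     use std::os::unix::fs::FileExt;
+    ///
+    ///     let file = self.file_for_handle(handle)?; // hypothetical handle -> File lookup
+    ///     let mut buf = vec![0u8; size as usize];
+    ///     let n = file.read_at(&mut buf, offset)?; // a short read signals EOF to the kernel
+    ///     w.write_all(&buf[..n])?;
+    ///     Ok(n)
+    /// }
+    /// ```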
+    fn read<W: io::Write + ZeroCopyWriter>(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        handle: Self::Handle,
+        w: W,
+        size: u32,
+        offset: u64,
+        lock_owner: Option<u64>,
+        flags: u32,
+    ) -> io::Result<usize> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Write data to a file.
+    ///
+    /// Writes `size` bytes of data starting at `offset` to the file associated with `inode` or
+    /// `handle`.
+    ///
+    /// `flags` contains the flags used to open the file. Similarly, `handle` is the `Handle`
+    /// returned by the file system from the `open` method, if any. If the file system
+    /// implementation did not return a `Handle` from `open` then the contents of `handle` are
+    /// undefined.
+    ///
+    /// If the `FsOptions::HANDLE_KILLPRIV` feature is not enabled, then the file system is
+    /// expected to clear the setuid and setgid bits.
+    ///
+    /// If `delayed_write` is true, then this is a write of buffered data.
+    ///
+    /// This method should return exactly the number of bytes requested by the kernel, except in the
+    /// case of error. An exception to this rule is if the file was opened with the "direct I/O"
+    /// option (`libc::O_DIRECT`), in which case the kernel will forward the return code from this
+    /// method to the userspace application that made the system call.
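+    ///
+    /// As with `read`, the sketch below skips the zero-copy path and copies through an
+    /// intermediate buffer; `file_for_handle` is the same hypothetical helper.
+    ///
+    /// ```ignore
+    /// fn write<R: io::Read + ZeroCopyReader>(
+    ///     &self,
+    ///     _ctx: Context,
+    ///     _inode: Self::Inode,
+    ///     handle: Self::Handle,
+    ///     mut r: R,
+    ///     size: u32,
+    ///     offset: u64,
+    ///     _lock_owner: Option<u64>,
+    ///     _delayed_write: bool,
+    ///     _flags: u32,
+    /// ) -> io::Result<usize> {
+    ///     use std::os::unix::fs::FileExt;
+    ///
+    ///     let file = self.file_for_handle(handle)?; // hypothetical handle -> File lookup
+    ///     let mut buf = vec![0u8; size as usize];
+    ///     r.read_exact(&mut buf)?;
+    ///     file.write_all_at(&buf, offset)?;
+    ///     Ok(buf.len())
+    /// }
+    /// ```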
+    fn write<R: io::Read + ZeroCopyReader>(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        handle: Self::Handle,
+        r: R,
+        size: u32,
+        offset: u64,
+        lock_owner: Option<u64>,
+        delayed_write: bool,
+        flags: u32,
+    ) -> io::Result<usize> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Flush the contents of a file.
+    ///
+    /// This method is called on every `close()` of a file descriptor. Since it is possible to
+    /// duplicate file descriptors there may be many `flush` calls for one call to `open`.
+    ///
+    /// File systems should not make any assumptions about when `flush` will be
+    /// called or even if it will be called at all.
+    ///
+    /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the
+    /// file system did not return a `Handle` from `open` then the contents of `handle` are
+    /// undefined.
+    ///
+    /// Unlike `fsync`, the file system is not required to flush pending writes. One reason to flush
+    /// data is if the file system wants to return write errors during close. However, this is not
+    /// portable because POSIX does not require `close` to wait for delayed I/O to complete.
+    ///
+    /// If the `FsOptions::POSIX_LOCKS` feature is enabled, then the file system must remove all
+    /// locks belonging to `lock_owner`.
+    ///
+    /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all
+    /// subsequent calls to `flush` will be handled by the kernel without being forwarded to the
+    /// file system.
+    fn flush(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        handle: Self::Handle,
+        lock_owner: u64,
+    ) -> io::Result<()> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Synchronize file contents.
+    ///
+    /// File systems must ensure that the file contents have been flushed to disk before returning
+    /// from this method. If `datasync` is true then only the file data (but not the metadata) needs
+    /// to be flushed.
+    ///
+    /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the
+    /// file system did not return a `Handle` from `open` then the contents of
+    /// `handle` are undefined.
+    ///
+    /// If this method returns an `ENOSYS` error then the kernel will treat it as success and all
+    /// subsequent calls to `fsync` will be handled by the kernel without being forwarded to the
+    /// file system.
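+    ///
+    /// A minimal sketch using the standard library is shown below; `file_for_handle` is a
+    /// hypothetical helper returning the `std::fs::File` backing `handle`.
+    ///
+    /// ```ignore
+    /// fn fsync(
+    ///     &self,
+    ///     _ctx: Context,
+    ///     _inode: Self::Inode,
+    ///     datasync: bool,
+    ///     handle: Self::Handle,
+    /// ) -> io::Result<()> {
+    ///     let file = self.file_for_handle(handle)?; // hypothetical handle -> File lookup
+    ///     if datasync {
+    ///         file.sync_data() // fdatasync(2): flush file data but not all metadata
+    ///     } else {
+    ///         file.sync_all() // fsync(2): flush file data and metadata
+    ///     }
+    /// }
+    /// ```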
+    fn fsync(
+        &self,
+        ctx: Context,
+        inode: Self::Inode,
+        datasync: bool,
+        handle: Self::Handle,
+    ) -> io::Result<()> {
+        Err(io::Error::from_raw_os_error(libc::ENOSYS))
+    }
+
+    /// Allocate requested space for file data.
+    ///
+    /// If this function returns success, then the file system must guarantee that it is possible
+    /// to
+    /// write up to `length` bytes of data starting at `offset` without failing due to a lack of
+    /// free space on the disk.
+    ///
+    /// `handle` is the `Handle` returned by the file system from the `open` method, if any. If the
+    /// file system did not return a `Handle` from `open` then the contents of `handle` are
+    /// undefined.
+    ///
+    /// If this method returns an `ENOSYS` error then the kernel will treat that as a permanent
+    /// failure: all future calls to `fallocate` will fail with `EOPNOTSUPP` without being forwarded
+    /// to the file system.
+    fn fallocate(