crosvm: Support MSR emulation

At present, crosvm can handle RDMSR in userspace by reading the MSR of
the host directly. This is RDMSR passthrough in userspace.

This patch adds more MSR handler support, including:
- WRMSR passthrough: write the corresponding MSR of the host directly
  from userspace; the MSR control takes effect on the host;
- RDMSR emulation: crosvm lets the guest read a dummy MSR value, which
  may differ from the real MSR value of the host;
- WRMSR emulation: crosvm lets the guest write into a dummy MSR value,
  so the MSR control does not actually take effect on the host.

BUG=b:225375705
TEST=Set `--userspace-msr=0x1a2,type=r,action=pass,from=cpu0`

Change-Id: I3276dd651464ef7b695e2ddd433793d59128af9b
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/3575508
Reviewed-by: Junichi Uekawa <uekawa@chromium.org>
Tested-by: kokoro <noreply+kokoro@google.com>
Commit-Queue: Junichi Uekawa <uekawa@chromium.org>
diff --git a/aarch64/src/lib.rs b/aarch64/src/lib.rs
index 105bc9e..02452a4 100644
--- a/aarch64/src/lib.rs
+++ b/aarch64/src/lib.rs
@@ -8,7 +8,10 @@
 use std::io;
 use std::sync::Arc;
 
-use arch::{get_serial_cmdline, GetSerialCmdlineError, RunnableLinuxVm, VmComponents, VmImage};
+use arch::{
+    get_serial_cmdline, GetSerialCmdlineError, MsrExitHandler, RunnableLinuxVm, VmComponents,
+    VmImage,
+};
 use base::{Event, MemoryMappingBuilder};
 use devices::serial_device::{SerialHardware, SerialParameters};
 use devices::{
@@ -635,3 +638,8 @@
         Ok(())
     }
 }
+
+#[derive(Default)]
+pub struct MsrAArch64;
+
+impl MsrExitHandler for MsrAArch64 {}
diff --git a/arch/src/lib.rs b/arch/src/lib.rs
index 90959b5..1c63663 100644
--- a/arch/src/lib.rs
+++ b/arch/src/lib.rs
@@ -14,6 +14,8 @@
 use std::path::PathBuf;
 use std::sync::Arc;
 
+use libc::sched_getcpu;
+
 use acpi_tables::aml::Aml;
 use acpi_tables::sdt::SDT;
 use base::{syslog, AsRawDescriptor, AsRawDescriptors, Event, Tube};
@@ -836,3 +838,105 @@
 
     Ok((guest_addr, size))
 }
+
+/// Read and write permissions setting
+///
+/// Wrap read_allow and write_allow to store them in MsrHandlers level.
+#[derive(Clone, Copy, Default, PartialEq)]
+pub struct MsrRWType {
+    pub read_allow: bool,
+    pub write_allow: bool,
+}
+
+/// Handler types for userspace-msr
+#[derive(Clone, Debug, PartialEq)]
+pub enum MsrAction {
+    /// Read and write from host directly, and the control of MSR will
+    /// take effect on host.
+    MsrPassthrough,
+    /// Store the dummy value for msr (copy from host or custom values),
+    /// and the control(WRMSR) of MSR won't take effect on host.
+    MsrEmulate,
+}
+
+/// Source CPU of MSR value
+///
+/// Indicate which CPU that user get/set MSRs from/to.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum MsrValueFrom {
+    /// Read/write MSR value from/into CPU 0.
+    /// The MSR source CPU always be CPU 0.
+    RWFromCPU0,
+    /// Read/write MSR value from/into the running CPU.
+    /// If vCPU migrates to another pcpu, the MSR source CPU will also change.
+    RWFromRunningCPU,
+}
+
+impl MsrValueFrom {
+    /// Returns the physical (host) CPU id selected by this MsrValueFrom.
+    pub fn get_cpu_id(&self) -> usize {
+        match self {
+            MsrValueFrom::RWFromCPU0 => 0,
+            MsrValueFrom::RWFromRunningCPU => {
+                // Safe: takes no arguments. NOTE(review): a -1 error wraps to usize::MAX.
+                (unsafe { sched_getcpu() }) as usize
+            }
+        }
+    }
+}
+
+/// If the user doesn't specify CPU0, the default source CPU is the running CPU.
+impl Default for MsrValueFrom {
+    fn default() -> Self {
+        MsrValueFrom::RWFromRunningCPU
+    }
+}
+
+/// Config option for userspace-msr handling
+///
+/// MsrConfig will be collected with its corresponding MSR's index.
+/// eg, (msr_index, msr_config)
+#[derive(Clone, Default, PartialEq)]
+pub struct MsrConfig {
+    /// Whether RDMSR and/or WRMSR of this MSR are allowed to be handled.
+    pub rw_type: MsrRWType,
+    /// Handler type that should be used to handle the MSR.
+    /// User must set this field.
+    pub action: Option<MsrAction>,
+    /// MSR source CPU.
+    pub from: MsrValueFrom,
+}
+
+impl MsrConfig {
+    pub fn new() -> Self {
+        Default::default()
+    }
+}
+
+#[sorted]
+#[derive(Error, Debug)]
+pub enum MsrExitHandlerError {
+    #[error("Fail to create MSR handler")]
+    HandlerCreateFailed,
+    #[error("Error parameter")]
+    InvalidParam,
+}
+
+pub trait MsrExitHandler {
+    fn read(&self, _index: u32) -> Option<u64> {
+        None
+    }
+
+    fn write(&self, _index: u32, _data: u64) -> Option<()> {
+        None
+    }
+
+    fn add_handler(
+        &mut self,
+        _index: u32,
+        _msr_config: MsrConfig,
+        _cpu_id: usize,
+    ) -> std::result::Result<(), MsrExitHandlerError> {
+        Ok(())
+    }
+}
diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs
index 19d3cef..3b05276 100644
--- a/hypervisor/src/kvm/mod.rs
+++ b/hypervisor/src/kvm/mod.rs
@@ -851,6 +851,13 @@
                 }
                 Ok(())
             }
+            KVM_EXIT_X86_WRMSR => {
+                // Safe because the exit_reason (which comes from the kernel) told us which
+                // union field to use.
+                let msr = unsafe { &mut run.__bindgen_anon_1.msr };
+                msr.error = 0;
+                Ok(())
+            }
             _ => Err(Error::new(EINVAL)),
         }
     }
diff --git a/src/crosvm.rs b/src/crosvm.rs
index c79ef01..9c40a3c 100644
--- a/src/crosvm.rs
+++ b/src/crosvm.rs
@@ -13,14 +13,14 @@
 #[cfg(feature = "plugin")]
 pub mod plugin;
 
-use std::collections::{BTreeMap, BTreeSet};
+use std::collections::BTreeMap;
 use std::net;
 use std::ops::RangeInclusive;
 use std::os::unix::io::RawFd;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 
-use arch::{Pstore, VcpuAffinity};
+use arch::{MsrConfig, Pstore, VcpuAffinity};
 use devices::serial_device::{SerialHardware, SerialParameters};
 use devices::virtio::block::block::DiskOption;
 #[cfg(feature = "audio_cras")]
@@ -469,7 +469,7 @@
     pub force_s2idle: bool,
     pub strict_balloon: bool,
     pub mmio_address_ranges: Vec<RangeInclusive<u64>>,
-    pub userspace_msr: BTreeSet<u32>,
+    pub userspace_msr: BTreeMap<u32, MsrConfig>,
     #[cfg(target_os = "android")]
     pub task_profiles: Vec<String>,
 }
@@ -596,7 +596,7 @@
             force_s2idle: false,
             strict_balloon: false,
             mmio_address_ranges: Vec::new(),
-            userspace_msr: BTreeSet::new(),
+            userspace_msr: BTreeMap::new(),
             #[cfg(target_os = "android")]
             task_profiles: Vec::new(),
         }
diff --git a/src/linux/vcpu.rs b/src/linux/vcpu.rs
index 85b0243..4bda499 100644
--- a/src/linux/vcpu.rs
+++ b/src/linux/vcpu.rs
@@ -2,11 +2,9 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-use std::collections::{BTreeMap, BTreeSet};
+use std::collections::BTreeMap;
 use std::fs::{File, OpenOptions};
 use std::io::prelude::*;
-use std::os::unix::fs::FileExt;
-use std::rc::Rc;
 use std::sync::{mpsc, Arc, Barrier};
 
 use std::thread;
@@ -22,11 +20,10 @@
 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
 use vm_memory::GuestMemory;
 
-use arch::{self, LinuxArch};
-
+use arch::{self, LinuxArch, MsrConfig, MsrExitHandler};
 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
 use {
-    aarch64::AArch64 as Arch,
+    aarch64::{AArch64 as Arch, MsrAArch64 as MsrHandlers},
     devices::IrqChipAArch64 as IrqChipArch,
     hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch},
 };
@@ -34,7 +31,7 @@
 use {
     devices::IrqChipX86_64 as IrqChipArch,
     hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch},
-    x86_64::X8664arch as Arch,
+    x86_64::{msr::MsrHandlers, X8664arch as Arch},
 };
 
 use super::ExitState;
@@ -278,7 +275,7 @@
         mpsc::Sender<VcpuDebugStatusMessage>,
     >,
     #[cfg(all(target_arch = "x86_64", feature = "gdb"))] guest_mem: GuestMemory,
-    msr_handlers: MsrHandlers,
+    msr_handlers: Box<dyn MsrExitHandler>,
 ) -> ExitState
 where
     V: VcpuArch + 'static,
@@ -442,8 +439,10 @@
                         let _ = vcpu.set_data(&data.to_ne_bytes());
                     }
                 }
-                Ok(VcpuExit::WrMsr { .. }) => {
-                    // TODO(b/215297064): implement MSR write
+                Ok(VcpuExit::WrMsr { index, data }) => {
+                    if msr_handlers.write(index, data).is_some() {
+                        let _ = vcpu.set_data(&[]);
+                    }
                 }
                 Ok(VcpuExit::IoapicEoi { vector }) => {
                     if let Err(e) = irq_chip.broadcast_eoi(vector) {
@@ -524,67 +523,6 @@
     }
 }
 
-trait MsrHandling {
-    fn read(&self, index: u32) -> Result<u64>;
-    fn write(&self, index: u32, data: u64) -> Result<()>;
-}
-
-struct ReadPassthrough {
-    dev_msr: std::fs::File,
-}
-
-impl MsrHandling for ReadPassthrough {
-    fn read(&self, index: u32) -> Result<u64> {
-        let mut data = [0; 8];
-        self.dev_msr.read_exact_at(&mut data, index.into())?;
-        Ok(u64::from_ne_bytes(data))
-    }
-
-    fn write(&self, _index: u32, _data: u64) -> Result<()> {
-        // TODO(b/215297064): implement MSR write
-        unimplemented!();
-    }
-}
-
-impl ReadPassthrough {
-    fn new() -> Result<Self> {
-        // TODO(b/215297064): Support reading from other CPUs than 0, should match running CPU.
-        let filename = "/dev/cpu/0/msr";
-        let dev_msr = OpenOptions::new()
-            .read(true)
-            .open(&filename)
-            .context("Cannot open /dev/cpu/0/msr, are you root?")?;
-        Ok(ReadPassthrough { dev_msr })
-    }
-}
-
-/// MSR handler configuration. Per-cpu.
-struct MsrHandlers {
-    handler: BTreeMap<u32, Rc<Box<dyn MsrHandling>>>,
-}
-
-impl MsrHandlers {
-    fn new() -> Self {
-        MsrHandlers {
-            handler: BTreeMap::new(),
-        }
-    }
-
-    fn read(&self, index: u32) -> Option<u64> {
-        if let Some(handler) = self.handler.get(&index) {
-            match handler.read(index) {
-                Ok(data) => Some(data),
-                Err(e) => {
-                    error!("MSR host read failed {:#x} {:?}", index, e);
-                    None
-                }
-            }
-        } else {
-            None
-        }
-    }
-}
-
 pub fn run_vcpu<V>(
     cpu_id: usize,
     vcpu_id: usize,
@@ -613,7 +551,7 @@
     host_cpu_topology: bool,
     privileged_vm: bool,
     vcpu_cgroup_tasks_file: Option<File>,
-    userspace_msr: BTreeSet<u32>,
+    userspace_msr: BTreeMap<u32, MsrConfig>,
 ) -> Result<JoinHandle<()>>
 where
     V: VcpuArch + 'static,
@@ -626,21 +564,13 @@
             // anything happens before we get to writing the final event.
             let scoped_exit_evt = ScopedEvent::from(exit_evt);
 
-            let mut msr_handlers = MsrHandlers::new();
+            let mut msr_handlers: MsrHandlers = Default::default();
             if !userspace_msr.is_empty() {
-                let read_passthrough: Rc<Box<dyn MsrHandling>> = match ReadPassthrough::new() {
-                    Ok(r) => Rc::new(Box::new(r)),
-                    Err(e) => {
-                        error!(
-                            "failed to create MSR read passthrough handler for vcpu {}: {:#}",
-                            cpu_id, e
-                        );
+                userspace_msr.iter().for_each(|(index, msr_config)| {
+                    if let Err(e) = msr_handlers.add_handler(*index, msr_config.clone(), cpu_id) {
+                        error!("failed to add msr handler {}: {:#}", cpu_id, e);
                         return;
-                    }
-                };
-
-                userspace_msr.iter().for_each(|&index| {
-                    msr_handlers.handler.insert(index, read_passthrough.clone());
+                    };
                 });
             }
 
@@ -702,7 +632,7 @@
                 to_gdb_tube,
                 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
                 guest_mem,
-                msr_handlers,
+                Box::new(msr_handlers),
             );
 
             let exit_evt = scoped_exit_evt.into();
diff --git a/src/main.rs b/src/main.rs
index 583752e..50598dd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -20,7 +20,9 @@
 use std::thread::sleep;
 use std::time::Duration;
 
-use arch::{set_default_serial_parameters, Pstore, VcpuAffinity};
+use arch::{
+    set_default_serial_parameters, MsrAction, MsrConfig, MsrValueFrom, Pstore, VcpuAffinity,
+};
 use base::{debug, error, getpid, info, kill_process_group, pagesize, reap_child, syslog, warn};
 #[cfg(all(feature = "gpu", feature = "virgl_renderer_next"))]
 use crosvm::platform::GpuRenderServerParameters;
@@ -697,15 +699,9 @@
     Ok(ac97_params)
 }
 
-enum MsrAction {
-    Invalid,
-    /// Read MSR value from host CPU0 regardless of current vcpu.
-    ReadFromCPU0,
-}
+fn parse_userspace_msr_options(value: &str) -> argument::Result<(u32, MsrConfig)> {
+    let mut msr_config = MsrConfig::new();
 
-fn parse_userspace_msr_options(value: &str) -> argument::Result<u32> {
-    // TODO(b/215297064): Implement different type of operations, such
-    // as write or reading from the correct CPU.
     let mut options = argument::parse_key_value_options("userspace-msr", value, ',');
     let index: u32 = options
         .next()
@@ -713,23 +709,53 @@
             "userspace-msr: expected index",
         )))?
         .key_numeric()?;
-    let mut msr_config = MsrAction::Invalid;
+
     for opt in options {
         match opt.key() {
+            "type" => match opt.value()? {
+                "r" => msr_config.rw_type.read_allow = true,
+                "w" => msr_config.rw_type.write_allow = true,
+                "rw" | "wr" => {
+                    msr_config.rw_type.read_allow = true;
+                    msr_config.rw_type.write_allow = true;
+                }
+                _ => {
+                    return Err(opt.invalid_value_err(String::from("bad type")));
+                }
+            },
             "action" => match opt.value()? {
-                "r0" => msr_config = MsrAction::ReadFromCPU0,
+                // Compatible with the original command line format.
+                // TODO(b:225375705): Deprecate the old cmd format in the future.
+                "r0" => {
+                    msr_config.rw_type.read_allow = true;
+                    msr_config.action = Some(MsrAction::MsrPassthrough);
+                    msr_config.from = MsrValueFrom::RWFromCPU0;
+                }
+                "pass" => msr_config.action = Some(MsrAction::MsrPassthrough),
+                "emu" => msr_config.action = Some(MsrAction::MsrEmulate),
                 _ => return Err(opt.invalid_value_err(String::from("bad action"))),
             },
+            "from" => match opt.value()? {
+                "cpu0" => msr_config.from = MsrValueFrom::RWFromCPU0,
+                _ => return Err(opt.invalid_value_err(String::from("bad from"))),
+            },
             _ => return Err(opt.invalid_key_err()),
         }
     }
 
-    match msr_config {
-        MsrAction::ReadFromCPU0 => Ok(index),
-        _ => Err(argument::Error::UnknownArgument(
-            "userspace-msr action not specified".to_string(),
-        )),
+    if !msr_config.rw_type.read_allow && !msr_config.rw_type.write_allow {
+        return Err(argument::Error::ExpectedArgument(String::from(
+            "userspace-msr: type is required",
+        )));
     }
+
+    if msr_config.action.is_none() {
+        return Err(argument::Error::ExpectedArgument(String::from(
+            "userspace-msr: action is required",
+        )));
+    }
+
+    Ok((index, msr_config))
 }
 
 fn parse_serial_options(s: &str) -> argument::Result<SerialParameters> {
@@ -2200,8 +2226,16 @@
             cfg.no_legacy = true;
         }
         "userspace-msr" => {
-            let index = parse_userspace_msr_options(value.unwrap())?;
-            cfg.userspace_msr.insert(index);
+            let (index, msr_config) = parse_userspace_msr_options(value.unwrap())?;
+            // TODO(b:225375705): MSR configuration must be unique in the future.
+            if let Some(old_config) = cfg.userspace_msr.insert(index, msr_config.clone()) {
+                if old_config != msr_config {
+                    return Err(argument::Error::InvalidValue {
+                        value: value.unwrap().to_owned(),
+                        expected: String::from("Same msr must has the same configuration"),
+                    });
+                }
+            }
         }
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         "host-cpu-topology" => {
@@ -2537,6 +2571,18 @@
                 ));
             }
         }
+    } else {
+        // TODO(b/215297064): Support generic cpuaffinity if there's a need.
+        if !cfg.userspace_msr.is_empty() {
+            for (_, msr_config) in cfg.userspace_msr.iter() {
+                if msr_config.from == MsrValueFrom::RWFromRunningCPU {
+                    return Err(argument::Error::UnknownArgument(
+                        "`userspace-msr` must set `cpu0` if `host-cpu-topology` is not set"
+                            .to_owned(),
+                    ));
+                }
+            }
+        }
     }
     if !cfg.balloon && cfg.balloon_control.is_some() {
         return Err(argument::Error::ExpectedArgument(
@@ -2868,10 +2914,15 @@
           Argument::value("direct-gpe", "gpe", "Enable GPE interrupt and register access passthrough"),
           Argument::value("dmi", "DIR", "Directory with smbios_entry_point/DMI files"),
           Argument::flag("no-legacy", "Don't use legacy KBD/RTC devices emulation"),
-          Argument::value("userspace-msr", "INDEX,action=r0", "Userspace MSR handling. Takes INDEX of the MSR and how they are handled.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+          Argument::value("userspace-msr", "INDEX,type=TYPE,action=TYPE,[from=TYPE]",
+                              "Userspace MSR handling. Takes INDEX of the MSR and how they are handled.
 
-                              action=r0 - forward RDMSR to host kernel cpu0.
-"),
+                              type=(r|w|rw|wr) - read/write permission control.
+
+                              action=(pass|emu) - if the control of msr is effective on host.
+
+                              from=(cpu0) - source of msr value. if not set, the source is running CPU."),
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
           Argument::flag("host-cpu-topology", "Use mirror cpu topology of Host for Guest VM, also copy some cpu feature to Guest VM."),
           Argument::flag("privileged-vm", "Grant this Guest VM certian privileges to manage Host resources, such as power management."),
@@ -4424,9 +4475,45 @@
 
     #[test]
     fn parse_userspace_msr_options_test() {
-        let index = parse_userspace_msr_options("0x10,action=r0").unwrap();
-        assert_eq!(index, 0x10);
+        // Read-only passthrough that is explicitly bound to CPU 0.
+        let (pass_cpu0_index, pass_cpu0_cfg) =
+            parse_userspace_msr_options("0x10,type=r,action=pass,from=cpu0").unwrap();
+        assert_eq!(pass_cpu0_index, 0x10);
+        assert!(pass_cpu0_cfg.rw_type.read_allow);
+        assert!(!pass_cpu0_cfg.rw_type.write_allow);
+        assert_eq!(
+            *pass_cpu0_cfg.action.as_ref().unwrap(),
+            MsrAction::MsrPassthrough
+        );
+        assert_eq!(pass_cpu0_cfg.from, MsrValueFrom::RWFromCPU0);
+
+        // Read/write emulation; without `from` the source is the running CPU.
+        let (pass_cpus_index, pass_cpus_cfg) =
+            parse_userspace_msr_options("0x10,type=rw,action=emu").unwrap();
+        assert_eq!(pass_cpus_index, 0x10);
+        assert!(pass_cpus_cfg.rw_type.read_allow);
+        assert!(pass_cpus_cfg.rw_type.write_allow);
+        assert_eq!(
+            *pass_cpus_cfg.action.as_ref().unwrap(),
+            MsrAction::MsrEmulate
+        );
+        assert_eq!(pass_cpus_cfg.from, MsrValueFrom::RWFromRunningCPU);
+
+        // Compatible with the original command line format: `action=r0` alone
+        // selects a read-only passthrough from CPU 0.
+        // TODO(b:225375705): Deprecate the old cmd format in the future.
+        let (old_index, old_cfg) = parse_userspace_msr_options("0x10,action=r0").unwrap();
+        assert_eq!(old_index, 0x10);
+        assert!(old_cfg.rw_type.read_allow);
+        assert!(!old_cfg.rw_type.write_allow);
+        assert_eq!(old_cfg.action, Some(MsrAction::MsrPassthrough));
+        assert_eq!(old_cfg.from, MsrValueFrom::RWFromCPU0);
+
         assert!(parse_userspace_msr_options("0x10,action=none").is_err());
+        assert!(parse_userspace_msr_options("0x10,action=pass").is_err());
+        assert!(parse_userspace_msr_options("0x10,type=none").is_err());
+        assert!(parse_userspace_msr_options("0x10,type=rw").is_err());
+        assert!(parse_userspace_msr_options("0x10,type=w,action=pass,from=f").is_err());
         assert!(parse_userspace_msr_options("0x10").is_err());
         assert!(parse_userspace_msr_options("hoge").is_err());
     }
diff --git a/x86_64/Cargo.toml b/x86_64/Cargo.toml
index c75c563..6f2e6d2 100644
--- a/x86_64/Cargo.toml
+++ b/x86_64/Cargo.toml
@@ -10,6 +10,7 @@
 
 [dependencies]
 arch = { path = "../arch" }
+anyhow = "*"
 assertions = { path = "../common/assertions" }
 data_model = { path = "../common/data_model" }
 devices = { path = "../devices" }
diff --git a/x86_64/src/lib.rs b/x86_64/src/lib.rs
index 8181851..bb56b91 100644
--- a/x86_64/src/lib.rs
+++ b/x86_64/src/lib.rs
@@ -36,6 +36,8 @@
 unsafe impl data_model::DataInit for mpspec::mpc_lintsrc {}
 unsafe impl data_model::DataInit for mpspec::mpf_intel {}
 
+pub mod msr;
+
 mod acpi;
 mod bzimage;
 mod cpuid;
diff --git a/x86_64/src/msr.rs b/x86_64/src/msr.rs
new file mode 100644
index 0000000..543d140
--- /dev/null
+++ b/x86_64/src/msr.rs
@@ -0,0 +1,311 @@
+// Copyright 2022 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::cell::RefCell;
+use std::collections::BTreeMap;
+use std::fs::{File, OpenOptions};
+use std::os::unix::fs::FileExt;
+use std::rc::Rc;
+
+use anyhow::Context;
+
+use arch::{MsrAction, MsrConfig, MsrExitHandler, MsrExitHandlerError, MsrRWType, MsrValueFrom};
+use base::{debug, error};
+
+use remain::sorted;
+use thiserror::Error as ThisError;
+
+#[sorted]
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("Unable to open host msr file: {0}")]
+    HostMsrGetError(anyhow::Error),
+    #[error("Unable to get metadata of dev file for msr: {0}")]
+    HostMsrGetMetadataError(std::io::Error),
+    #[error("Unable to read host msr: {0}")]
+    HostMsrReadError(std::io::Error),
+    #[error("Unable to set permissions of dev file for msr: {0}")]
+    HostMsrSetPermsError(std::io::Error),
+    #[error("Unable to write host msr: {0}")]
+    HostMsrWriteError(std::io::Error),
+    #[error("Not set msr action parameter")]
+    InvalidAction,
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Wrap for userspace MSR file descriptor (/dev/cpu/*/msr).
+pub struct MsrDevFile {
+    dev_msr: File,
+}
+
+impl MsrDevFile {
+    /// Create a new MSR file descriptor.
+    ///
+    /// "Passthrough" handler will create file descriptor with both read and write
+    /// permissions. MsrHandlers controls read/write with MsrRWType. This avoids
+    /// the corner case that some MSRs are read-only while other MSRs need write
+    /// permission.
+    /// "Emulate" handler will create read-only file descriptor. This read-only
+    /// descriptor will only be used once to initialize MSR value and "Emulate"
+    /// handler won't store its descriptor at MsrHandlers level.
+    fn new(cpu_id: usize, read_only: bool) -> Result<Self> {
+        let filename = format!("/dev/cpu/{}/msr", cpu_id);
+        let dev_msr = OpenOptions::new()
+            .read(true)
+            .write(!read_only)
+            .open(&filename)
+            .context(format!("Cannot open {}, are you root?", filename))
+            .map_err(Error::HostMsrGetError)?;
+        Ok(MsrDevFile { dev_msr })
+    }
+
+    fn read(&self, index: u32) -> Result<u64> {
+        let mut data = [0; 8];
+        self.dev_msr
+            .read_exact_at(&mut data, index.into())
+            .map_err(Error::HostMsrReadError)?;
+        Ok(u64::from_ne_bytes(data))
+    }
+
+    // In fact, only "passthrough" will write into MSR file.
+    fn write(&self, index: u32, data: u64) -> Result<()> {
+        self.dev_msr
+            .write_all_at(&data.to_ne_bytes(), index.into())
+            .map_err(Error::HostMsrWriteError)?;
+        Ok(())
+    }
+}
+
+/// Wrap for general RDMSR/WRMSR handling.
+///
+/// Each specific handler needs to implement this trait.
+pub trait MsrHandling {
+    fn read(&self) -> Result<u64>;
+    // The "emulate" handler needs to update the MSR value stored in the
+    // `msr_data` field of MsrEmulateHandler, so `self` is declared mutable.
+    fn write(&mut self, data: u64) -> Result<()>;
+}
+
+/// MsrPassthroughHandler - passthrough handler that will handle RDMSR/WRMSR
+///                         by reading/writing MSR file directly.
+/// For RDMSR, this handler will give Guest the current MSR value on Host.
+/// For WRMSR, this handler will directly pass the change desired by the Guest
+/// to the host, and expect the change to take effect on the MSR of the host.
+struct MsrPassthroughHandler {
+    /// MSR index.
+    index: u32,
+    /// MSR source CPU, CPU 0 or running CPU.
+    from: MsrValueFrom,
+    /// Reference of MSR file descriptors.
+    msr_file: Rc<RefCell<BTreeMap<usize, Rc<MsrDevFile>>>>,
+}
+
+impl MsrPassthroughHandler {
+    fn new(
+        index: u32,
+        msr_config: &MsrConfig,
+        msr_file: &Rc<RefCell<BTreeMap<usize, Rc<MsrDevFile>>>>,
+    ) -> Result<Self> {
+        let pass = MsrPassthroughHandler {
+            index,
+            from: msr_config.from,
+            msr_file: Rc::clone(msr_file), // Share the map; get_msr_dev() fills it.
+        };
+        pass.get_msr_dev()?;
+        Ok(pass)
+    }
+
+    /// Returns the MSR file descriptor of the source CPU, opening it if needed.
+    fn get_msr_dev(&self) -> Result<Rc<MsrDevFile>> {
+        let cpu_id = self.from.get_cpu_id();
+        let mut msr_file = self.msr_file.borrow_mut();
+        // First, check whether the descriptor has been stored before.
+        if let Some(dev_msr) = msr_file.get(&cpu_id) {
+            Ok(Rc::clone(dev_msr))
+        } else {
+            // If the descriptor isn't found, create a new one.
+            let new_dev_msr = Rc::new(MsrDevFile::new(cpu_id, false)?);
+            // Note: For the MsrValueFrom::RWFromRunningCPU case, just store
+            // the new descriptor and don't remove the previous ones.
+            // This is for convenience, since the number of descriptors is
+            // at most the Host CPU count.
+            msr_file.insert(cpu_id, Rc::clone(&new_dev_msr));
+            Ok(new_dev_msr)
+        }
+    }
+}
+
+impl MsrHandling for MsrPassthroughHandler {
+    fn read(&self) -> Result<u64> {
+        let index = self.index;
+        self.get_msr_dev()?.read(index)
+    }
+
+    fn write(&mut self, data: u64) -> Result<()> {
+        let index = self.index;
+        self.get_msr_dev()?.write(index, data)
+    }
+}
+
+/// MsrEmulateHandler - emulate handler that will handle RDMSR/WRMSR
+///                     with a dummy MSR value rather than accessing the
+///                     real MSR.
+/// This handler will initialize a value (`msr_data`) with the corresponding
+/// Host MSR value, then handle the RDMSR/WRMSR based on this "value".
+///
+/// For RDMSR, this handler will give Guest the stored `msr_data`.
+/// For WRMSR, this handler will directly change `msr_data` without the
+/// modification on real Host MSR. The change will not take effect on the
+/// real MSR of Host.
+///
+/// The 'emulate' handler is used in the case that some driver needs to
+/// control an MSR and the user just wants WRMSR to succeed without caring
+/// whether it really takes effect. This handler keeps the Guest's control
+/// of the CPU from affecting the host.
+struct MsrEmulateHandler {
+    /// Only initialize msr_data with MSR source pCPU, and will not update
+    /// msr value changes on host cpu into msr_data.
+    msr_data: u64,
+}
+
+impl MsrEmulateHandler {
+    /// Creates a handler whose value is initialized from the host MSR `index`.
+    fn new(
+        index: u32,
+        msr_config: &MsrConfig,
+        msr_file: &Rc<RefCell<BTreeMap<usize, Rc<MsrDevFile>>>>,
+    ) -> Result<Self> {
+        let cpu_id = msr_config.from.get_cpu_id();
+        let msr_data = match msr_file.borrow().get(&cpu_id) {
+            // Reuse a descriptor that is already stored for this CPU.
+            Some(dev_msr) => dev_msr.read(index)?,
+            None => {
+                // Don't allow to write. Only read the value to initialize
+                // `msr_data` and won't store in MsrHandlers level.
+                MsrDevFile::new(cpu_id, true)?.read(index)?
+            }
+        };
+
+        Ok(MsrEmulateHandler { msr_data })
+    }
+}
+
+impl MsrHandling for MsrEmulateHandler {
+    fn read(&self) -> Result<u64> {
+        Ok(self.msr_data)
+    }
+
+    fn write(&mut self, data: u64) -> Result<()> {
+        self.msr_data = data;
+        Ok(())
+    }
+}
+
+/// MSR handler configuration. Per-cpu.
+#[derive(Default)]
+pub struct MsrHandlers {
+    /// Store read/write permissions to control read/write before calling
+    /// MsrHandling trait.
+    pub handler: BTreeMap<u32, (MsrRWType, Rc<RefCell<Box<dyn MsrHandling>>>)>,
+    /// Store file descriptors here to avoid caching duplicate descriptors
+    /// for each MSR.
+    /// Only collect descriptor of 'passthrough' handler, since 'emulate'
+    /// uses descriptor only once during initialization.
+    pub msr_file: Option<Rc<RefCell<BTreeMap<usize, Rc<MsrDevFile>>>>>,
+}
+
+impl MsrExitHandler for MsrHandlers {
+    /// Handles a guest RDMSR of `index`.
+    ///
+    /// Returns `None` if no handler is registered for `index`, if reads are
+    /// not allowed for it, or if the handler fails (the failure is logged).
+    fn read(&self, index: u32) -> Option<u64> {
+        let (rw_type, handler) = self.handler.get(&index)?;
+        // It's not error. This means user doesn't want to handle
+        // RDMSR. Just log it.
+        if !rw_type.read_allow {
+            debug!("RDMSR is not allowed for msr: {:#x}", index);
+            return None;
+        }
+
+        match handler.borrow().read() {
+            Ok(data) => Some(data),
+            Err(e) => {
+                error!("MSR host read failed {:#x} {:?}", index, e);
+                None
+            }
+        }
+    }
+
+    /// Handles a guest WRMSR of `data` to `index`.
+    ///
+    /// Returns `None` if no handler is registered for `index`, if writes are
+    /// not allowed for it, or if the handler fails (the failure is logged).
+    fn write(&self, index: u32, data: u64) -> Option<()> {
+        let (rw_type, handler) = self.handler.get(&index)?;
+        // It's not error. This means user doesn't want to handle
+        // WRMSR. Just log it.
+        if !rw_type.write_allow {
+            debug!("WRMSR is not allowed for msr: {:#x}", index);
+            return None;
+        }
+
+        match handler.borrow_mut().write(data) {
+            Ok(_) => Some(()),
+            Err(e) => {
+                error!("MSR host write failed {:#x} {:?}", index, e);
+                None
+            }
+        }
+    }
+
+    /// Registers the handler described by `msr_config` for MSR `index`.
+    ///
+    /// `cpu_id` is only used in log messages.
+    ///
+    /// Fails with `InvalidParam` if no action is configured and with
+    /// `HandlerCreateFailed` if the handler cannot be created.
+    fn add_handler(
+        &mut self,
+        index: u32,
+        msr_config: MsrConfig,
+        cpu_id: usize,
+    ) -> std::result::Result<(), MsrExitHandlerError> {
+        let action = msr_config.action.as_ref().ok_or(MsrExitHandlerError::InvalidParam)?;
+
+        // Reuse the descriptor map of earlier calls, so all handlers of this
+        // vCPU share one descriptor per host CPU instead of one per MSR.
+        let msr_file = self.msr_file.clone().unwrap_or_default();
+        let (kind, created) = match action {
+            MsrAction::MsrPassthrough => (
+                "passthrough",
+                MsrPassthroughHandler::new(index, &msr_config, &msr_file)
+                    .map(|h| Box::new(h) as Box<dyn MsrHandling>),
+            ),
+            MsrAction::MsrEmulate => (
+                "emulate",
+                MsrEmulateHandler::new(index, &msr_config, &msr_file)
+                    .map(|h| Box::new(h) as Box<dyn MsrHandling>),
+            ),
+        };
+        let msr_handler = match created {
+            Ok(h) => Rc::new(RefCell::new(h)),
+            Err(e) => {
+                error!(
+                    "failed to create MSR {} handler for vcpu {}: {:#}",
+                    kind, cpu_id, e
+                );
+                return Err(MsrExitHandlerError::HandlerCreateFailed);
+            }
+        };
+        self.handler.insert(index, (msr_config.rw_type, msr_handler));
+
+        // Empty only when no 'passthrough' handler exists.
+        if !msr_file.borrow().is_empty() {
+            self.msr_file = Some(msr_file);
+        }
+        Ok(())
+    }
+}