/*
 * nestedsvm.c: Nested Virtualization
 * Copyright (c) 2011, Advanced Micro Devices, Inc
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 */

#include <asm/hvm/support.h>
#include <asm/hvm/svm/emulate.h>
#include <asm/hvm/svm/svm.h>
#include <asm/hvm/svm/vmcb.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/svm/nestedsvm.h>
#include <asm/hvm/svm/svmdebug.h>
#include <asm/paging.h> /* paging_mode_hap */
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
#include <asm/p2m.h> /* p2m_get_pagetable, p2m_get_nestedp2m */

#define NSVM_ERROR_VVMCB        1
#define NSVM_ERROR_VMENTRY      2

static void nestedsvm_vcpu_clgi(struct vcpu *v)
{
    /* clear gif flag */
    vcpu_nestedsvm(v).ns_gif = 0;
    local_event_delivery_disable(); /* mask events for PV drivers */
}

static void nestedsvm_vcpu_stgi(struct vcpu *v)
{
    /* enable gif flag */
    vcpu_nestedsvm(v).ns_gif = 1;
    local_event_delivery_enable(); /* unmask events for PV drivers */
}

static int nestedsvm_vmcb_isvalid(struct vcpu *v, uint64_t vmcxaddr)
{
    /* Address must be 4k aligned */
    if ( (vmcxaddr & ~PAGE_MASK) != 0 )
        return 0;

    /* Maximum valid physical address.
     * See AMD BKDG for HSAVE_PA MSR.
     */
    if ( vmcxaddr > 0xfd00000000ULL )
        return 0;

    return 1;
}

int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);

    if (nv->nv_vvmcx != NULL && nv->nv_vvmcxaddr != vmcbaddr) {
        ASSERT(nv->nv_vvmcxaddr != VMCX_EADDR);
        hvm_unmap_guest_frame(nv->nv_vvmcx, 1);
        nv->nv_vvmcx = NULL;
        nv->nv_vvmcxaddr = VMCX_EADDR;
    }

    if (nv->nv_vvmcx == NULL) {
        nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT, 1);
        if (nv->nv_vvmcx == NULL)
            return 0;
        nv->nv_vvmcxaddr = vmcbaddr;
    }

    return 1;
}

/* Interface methods */
int nsvm_vcpu_initialise(struct vcpu *v)
{
    void *msrpm;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
    svm->ns_cached_msrpm = msrpm;
    if (msrpm == NULL)
        goto err;
    memset(msrpm, 0x0, MSRPM_SIZE);

    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
    svm->ns_merged_msrpm = msrpm;
    if (msrpm == NULL)
        goto err;
    memset(msrpm, 0x0, MSRPM_SIZE);

    nv->nv_n2vmcx = alloc_vmcb();
    if (nv->nv_n2vmcx == NULL)
        goto err;
    nv->nv_n2vmcx_pa = virt_to_maddr(nv->nv_n2vmcx);

    return 0;

err:
    nsvm_vcpu_destroy(v);
    return -ENOMEM;
}

void nsvm_vcpu_destroy(struct vcpu *v)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    /*
     * When destroying the vcpu, it may be running on behalf of l2 guest.
     * Therefore we need to switch the VMCB pointer back to the l1 vmcb,
     * in order to avoid double free of l2 vmcb and the possible memory leak
     * of l1 vmcb page.
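     * (The l2 vmcb itself, nv_n2vmcx, is freed separately further down.)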
     */
    if (nv->nv_n1vmcx)
        v->arch.hvm_svm.vmcb = nv->nv_n1vmcx;

    if (svm->ns_cached_msrpm) {
        free_xenheap_pages(svm->ns_cached_msrpm,
                           get_order_from_bytes(MSRPM_SIZE));
        svm->ns_cached_msrpm = NULL;
    }
    if (svm->ns_merged_msrpm) {
        free_xenheap_pages(svm->ns_merged_msrpm,
                           get_order_from_bytes(MSRPM_SIZE));
        svm->ns_merged_msrpm = NULL;
    }

    hvm_unmap_guest_frame(nv->nv_vvmcx, 1);
    nv->nv_vvmcx = NULL;

    if (nv->nv_n2vmcx) {
        free_vmcb(nv->nv_n2vmcx);
        nv->nv_n2vmcx = NULL;
        nv->nv_n2vmcx_pa = VMCX_EADDR;
    }
    if (svm->ns_iomap)
        svm->ns_iomap = NULL;
}

int nsvm_vcpu_reset(struct vcpu *v)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    svm->ns_msr_hsavepa = VMCX_EADDR;
    svm->ns_ovvmcb_pa = VMCX_EADDR;

    svm->ns_tscratio = DEFAULT_TSC_RATIO;

    svm->ns_cr_intercepts = 0;
    svm->ns_dr_intercepts = 0;
    svm->ns_exception_intercepts = 0;
    svm->ns_general1_intercepts = 0;
    svm->ns_general2_intercepts = 0;
    svm->ns_lbr_control.bytes = 0;

    svm->ns_hap_enabled = 0;
    svm->ns_vmcb_guestcr3 = 0;
    svm->ns_vmcb_hostcr3 = 0;
    svm->ns_guest_asid = 0;
    svm->ns_hostflags.bytes = 0;
    svm->ns_vmexit.exitinfo1 = 0;
    svm->ns_vmexit.exitinfo2 = 0;

    if (svm->ns_iomap)
        svm->ns_iomap = NULL;

    nestedsvm_vcpu_stgi(v);
    return 0;
}

static uint64_t nestedsvm_fpu_vmentry(uint64_t n1cr0,
    struct vmcb_struct *vvmcb,
    struct vmcb_struct *n1vmcb, struct vmcb_struct *n2vmcb)
{
    uint64_t vcr0;

    vcr0 = vvmcb->_cr0;
    if ( !(n1cr0 & X86_CR0_TS) && (n1vmcb->_cr0 & X86_CR0_TS) ) {
        /* svm_fpu_leave() run while l1 guest was running.
         * Sync FPU state with l2 guest.
         */
        vcr0 |= X86_CR0_TS;
        n2vmcb->_exception_intercepts |= (1U << TRAP_no_device);
    } else if ( !(vcr0 & X86_CR0_TS) && (n2vmcb->_cr0 & X86_CR0_TS) ) {
        /* svm_fpu_enter() run while l1 guest was running.
         * Sync FPU state with l2 guest.
         */
        vcr0 &= ~X86_CR0_TS;
        n2vmcb->_exception_intercepts &= ~(1U << TRAP_no_device);
    }

    return vcr0;
}

static void nestedsvm_fpu_vmexit(struct vmcb_struct *n1vmcb,
    struct vmcb_struct *n2vmcb, uint64_t n1cr0, uint64_t guest_cr0)
{
    if ( !(guest_cr0 & X86_CR0_TS) && (n2vmcb->_cr0 & X86_CR0_TS) ) {
        /* svm_fpu_leave() run while l2 guest was running.
         * Sync FPU state with l1 guest.
         */
        n1vmcb->_cr0 |= X86_CR0_TS;
        n1vmcb->_exception_intercepts |= (1U << TRAP_no_device);
    } else if ( !(n1cr0 & X86_CR0_TS) && (n1vmcb->_cr0 & X86_CR0_TS) ) {
        /* svm_fpu_enter() run while l2 guest was running.
         * Sync FPU state with l1 guest.
         */
        n1vmcb->_cr0 &= ~X86_CR0_TS;
        n1vmcb->_exception_intercepts &= ~(1U << TRAP_no_device);
    }
}

static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct vmcb_struct *n1vmcb;

    n1vmcb = nv->nv_n1vmcx;
    ASSERT(n1vmcb != NULL);

    n1vmcb->rip += inst_len;

    /* Save shadowed values. This ensures that the l1 guest
     * cannot override them to break out.
     */
    n1vmcb->_efer = v->arch.hvm_vcpu.guest_efer;
    n1vmcb->_cr0 = v->arch.hvm_vcpu.guest_cr[0];
    n1vmcb->_cr2 = v->arch.hvm_vcpu.guest_cr[2];
    n1vmcb->_cr4 = v->arch.hvm_vcpu.guest_cr[4];

    /* Remember the host interrupt flag */
    svm->ns_hostflags.fields.rflagsif =
        (n1vmcb->rflags & X86_EFLAGS_IF) ? 1 : 0;

    return 0;
}

int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *n1vmcb, *n2vmcb;
    int rc;

    n1vmcb = nv->nv_n1vmcx;
    n2vmcb = nv->nv_n2vmcx;
    ASSERT(n1vmcb != NULL);
    ASSERT(n2vmcb != NULL);

    /* nsvm_vmcb_prepare4vmexit() already saved register values
     * handled by VMSAVE/VMLOAD into n1vmcb directly.
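     * (Those are fs, gs, tr, ldtr, kerngsbase, star, lstar, cstar, sfmask
     * and the sysenter MSRs, so they need no explicit copying here.)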
     */

    /* switch vmcb to l1 guest's vmcb */
    v->arch.hvm_svm.vmcb = n1vmcb;
    v->arch.hvm_svm.vmcb_pa = nv->nv_n1vmcx_pa;

    /* EFER */
    v->arch.hvm_vcpu.guest_efer = n1vmcb->_efer;
    rc = hvm_set_efer(n1vmcb->_efer);
    if (rc != X86EMUL_OKAY)
        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);

    /* CR4 */
    v->arch.hvm_vcpu.guest_cr[4] = n1vmcb->_cr4;
    rc = hvm_set_cr4(n1vmcb->_cr4);
    if (rc != X86EMUL_OKAY)
        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);

    /* CR0 */
    nestedsvm_fpu_vmexit(n1vmcb, n2vmcb,
        svm->ns_cr0, v->arch.hvm_vcpu.guest_cr[0]);
    v->arch.hvm_vcpu.guest_cr[0] = n1vmcb->_cr0 | X86_CR0_PE;
    n1vmcb->rflags &= ~X86_EFLAGS_VM;
    rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE);
    if (rc != X86EMUL_OKAY)
        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
    svm->ns_cr0 = v->arch.hvm_vcpu.guest_cr[0];

    /* CR2 */
    v->arch.hvm_vcpu.guest_cr[2] = n1vmcb->_cr2;
    hvm_update_guest_cr(v, 2);

    /* CR3 */
    /* Nested paging mode */
    if (nestedhvm_paging_mode_hap(v)) {
        /* host nested paging + guest nested paging. */
        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
    } else if (paging_mode_hap(v->domain)) {
        /* host nested paging + guest shadow paging. */
        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
    } else {
        /* host shadow paging + guest shadow paging. */

        /* Reset MMU context -- XXX (hostrestore) not yet working */
        if (!pagetable_is_null(v->arch.guest_table))
            put_page(pagetable_get_page(v->arch.guest_table));
        v->arch.guest_table = pagetable_null();
        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
    }
    rc = hvm_set_cr3(n1vmcb->_cr3);
    if (rc != X86EMUL_OKAY)
        gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);

    regs->eax = n1vmcb->rax;
    regs->esp = n1vmcb->rsp;
    regs->eip = n1vmcb->rip;
    regs->eflags = n1vmcb->rflags;
    n1vmcb->_dr7 = 0; /* disable all breakpoints */
    n1vmcb->_cpl = 0;

    /* Clear exitintinfo to prevent a fault loop of re-injecting
     * exceptions forever.
     */
    n1vmcb->exitintinfo.bytes = 0;

    /* Cleanbits */
    n1vmcb->cleanbits.bytes = 0;

    return 0;
}

static int nsvm_vmrun_permissionmap(struct vcpu *v, bool_t viopm)
{
    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
    struct vmcb_struct *host_vmcb = arch_svm->vmcb;
    unsigned long *ns_msrpm_ptr;
    unsigned int i;
    enum hvm_copy_result ret;
    unsigned long *ns_viomap;
    bool_t ioport_80, ioport_ed;

    ns_msrpm_ptr = (unsigned long *)svm->ns_cached_msrpm;

    ret = hvm_copy_from_guest_phys(svm->ns_cached_msrpm,
                                   ns_vmcb->_msrpm_base_pa, MSRPM_SIZE);
    if (ret != HVMCOPY_okay) {
        gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", ret);
        return 1;
    }

    /* Check the l1 guest io permission map and get a shadow one based on
     * whether the l1 guest intercepts io ports 0x80 and/or 0xED.
     */
    svm->ns_oiomap_pa = svm->ns_iomap_pa;
    svm->ns_iomap_pa = ns_vmcb->_iopm_base_pa;

    ns_viomap = hvm_map_guest_frame_ro(svm->ns_iomap_pa >> PAGE_SHIFT, 0);
    ASSERT(ns_viomap != NULL);
    ioport_80 = test_bit(0x80, ns_viomap);
    ioport_ed = test_bit(0xed, ns_viomap);
    hvm_unmap_guest_frame(ns_viomap, 0);

    svm->ns_iomap = nestedhvm_vcpu_iomap_get(ioport_80, ioport_ed);

    nv->nv_ioport80 = ioport_80;
    nv->nv_ioportED = ioport_ed;

    /* v->arch.hvm_svm.msrpm has type unsigned long, thus
     * BYTES_PER_LONG.
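     * The l1 guest's bitmap is OR-ed word by word with Xen's own bitmap
     * below, so an MSR access is intercepted whenever Xen or the l1 guest
     * wants it intercepted.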
     */
    for (i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++)
        svm->ns_merged_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i];

    host_vmcb->_iopm_base_pa =
        (uint64_t)virt_to_maddr(svm->ns_iomap);
    host_vmcb->_msrpm_base_pa =
        (uint64_t)virt_to_maddr(svm->ns_merged_msrpm);

    return 0;
}

static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v,
    struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb)
{
    struct p2m_domain *p2m;

    ASSERT(v != NULL);
    ASSERT(vvmcb != NULL);
    ASSERT(n2vmcb != NULL);
    p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3);
    n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m));
}

static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb, *n1vmcb, *n2vmcb;
    bool_t vcleanbits_valid;
    int rc;
    uint64_t cr0;

    ns_vmcb = nv->nv_vvmcx;
    n1vmcb = nv->nv_n1vmcx;
    n2vmcb = nv->nv_n2vmcx;
    ASSERT(ns_vmcb != NULL);
    ASSERT(n1vmcb != NULL);
    ASSERT(n2vmcb != NULL);

    /* Check if virtual VMCB cleanbits are valid */
    vcleanbits_valid = 1;
    if (svm->ns_ovvmcb_pa == VMCX_EADDR)
        vcleanbits_valid = 0;
    if (svm->ns_ovvmcb_pa != nv->nv_vvmcxaddr)
        vcleanbits_valid = 0;

#define vcleanbit_set(_name) \
    (vcleanbits_valid && ns_vmcb->cleanbits.fields._name)

    /* Enable l2 guest intercepts */
    if (!vcleanbit_set(intercepts)) {
        svm->ns_cr_intercepts = ns_vmcb->_cr_intercepts;
        svm->ns_dr_intercepts = ns_vmcb->_dr_intercepts;
        svm->ns_exception_intercepts = ns_vmcb->_exception_intercepts;
        svm->ns_general1_intercepts = ns_vmcb->_general1_intercepts;
        svm->ns_general2_intercepts = ns_vmcb->_general2_intercepts;
    }

    /* We could track the cleanbits of the n1vmcb from
     * last emulated #VMEXIT to this emulated VMRUN to save the merges
     * below. Those cleanbits would be tracked in an integer field
     * in struct nestedsvm.
     * But this effort is not worth doing because:
     * - Only the intercepts bit of the n1vmcb can effectively be used here
     * - The CPU runs more instructions for the tracking than can be
     *   saved here.
     * The overhead comes from (ordered from highest to lowest):
     * - svm_ctxt_switch_to (CPU context switching)
     * - svm_fpu_enter, svm_fpu_leave (lazy FPU switching)
     * - emulated CLGI (clears VINTR intercept)
     * - host clears VINTR intercept
     * Test results show that the overhead is high enough that the
     * tracked intercepts bit of the n1vmcb is practically *always* cleared.
     */
    n2vmcb->_cr_intercepts =
        n1vmcb->_cr_intercepts | ns_vmcb->_cr_intercepts;
    n2vmcb->_dr_intercepts =
        n1vmcb->_dr_intercepts | ns_vmcb->_dr_intercepts;
    n2vmcb->_exception_intercepts =
        n1vmcb->_exception_intercepts | ns_vmcb->_exception_intercepts;
    n2vmcb->_general1_intercepts =
        n1vmcb->_general1_intercepts | ns_vmcb->_general1_intercepts;
    n2vmcb->_general2_intercepts =
        n1vmcb->_general2_intercepts | ns_vmcb->_general2_intercepts;

    /* Nested Pause Filter */
    if (ns_vmcb->_general1_intercepts & GENERAL1_INTERCEPT_PAUSE)
        n2vmcb->_pause_filter_count =
            min(n1vmcb->_pause_filter_count, ns_vmcb->_pause_filter_count);
    else
        n2vmcb->_pause_filter_count = n1vmcb->_pause_filter_count;

    /* TSC offset */
    n2vmcb->_tsc_offset = n1vmcb->_tsc_offset + ns_vmcb->_tsc_offset;

    /* Nested IO permission bitmaps */
    rc = nsvm_vmrun_permissionmap(v, vcleanbit_set(iopm));
    if (rc)
        return rc;

    /* ASID - Emulation handled in hvm_asid_handle_vmenter() */

    /* TLB control */
    n2vmcb->tlb_control = ns_vmcb->tlb_control;

    /* Virtual Interrupts */
    if (!vcleanbit_set(tpr)) {
        n2vmcb->_vintr = ns_vmcb->_vintr;
        n2vmcb->_vintr.fields.intr_masking = 1;
    }

    /* Shadow Mode */
    n2vmcb->interrupt_shadow = ns_vmcb->interrupt_shadow;

    /* Exit codes */
    n2vmcb->exitcode = ns_vmcb->exitcode;
    n2vmcb->exitinfo1 = ns_vmcb->exitinfo1;
    n2vmcb->exitinfo2 = ns_vmcb->exitinfo2;
    n2vmcb->exitintinfo = ns_vmcb->exitintinfo;

    /* Pending Interrupts */
    n2vmcb->eventinj = ns_vmcb->eventinj;

    /* LBR virtualization */
    if (!vcleanbit_set(lbr)) {
        svm->ns_lbr_control = ns_vmcb->lbr_control;
    }
    n2vmcb->lbr_control.bytes =
        n1vmcb->lbr_control.bytes | ns_vmcb->lbr_control.bytes;

    /* NextRIP - only evaluated on #VMEXIT. */

    /*
     * VMCB Save State Area
     */

    /* Segments */
    if (!vcleanbit_set(seg)) {
        n2vmcb->es = ns_vmcb->es;
        n2vmcb->cs = ns_vmcb->cs;
        n2vmcb->ss = ns_vmcb->ss;
        n2vmcb->ds = ns_vmcb->ds;
        /* CPL */
        n2vmcb->_cpl = ns_vmcb->_cpl;
    }
    if (!vcleanbit_set(dt)) {
        n2vmcb->gdtr = ns_vmcb->gdtr;
        n2vmcb->idtr = ns_vmcb->idtr;
    }

    /* EFER */
    v->arch.hvm_vcpu.guest_efer = ns_vmcb->_efer;
    rc = hvm_set_efer(ns_vmcb->_efer);
    if (rc != X86EMUL_OKAY)
        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);

    /* CR4 */
    v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->_cr4;
    rc = hvm_set_cr4(ns_vmcb->_cr4);
    if (rc != X86EMUL_OKAY)
        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);

    /* CR0 */
    svm->ns_cr0 = v->arch.hvm_vcpu.guest_cr[0];
    cr0 = nestedsvm_fpu_vmentry(svm->ns_cr0, ns_vmcb, n1vmcb, n2vmcb);
    v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->_cr0;
    rc = hvm_set_cr0(cr0);
    if (rc != X86EMUL_OKAY)
        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);

    /* CR2 */
    v->arch.hvm_vcpu.guest_cr[2] = ns_vmcb->_cr2;
    hvm_update_guest_cr(v, 2);

    /* Nested paging mode */
    if (nestedhvm_paging_mode_hap(v)) {
        /* host nested paging + guest nested paging. */
        n2vmcb->_np_enable = 1;

        nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);

        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
        rc = hvm_set_cr3(ns_vmcb->_cr3);
        if (rc != X86EMUL_OKAY)
            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
    } else if (paging_mode_hap(v->domain)) {
        /* host nested paging + guest shadow paging. */
        n2vmcb->_np_enable = 1;
        /* Keep h_cr3 as it is. */
        n2vmcb->_h_cr3 = n1vmcb->_h_cr3;
        /* When l1 guest does shadow paging
         * we assume it intercepts page faults.
         */
        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
        rc = hvm_set_cr3(ns_vmcb->_cr3);
        if (rc != X86EMUL_OKAY)
            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
    } else {
        /* host shadow paging + guest shadow paging.
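         * (Not fully handled yet; see the TODO below and the corresponding
         * XXX note in nsvm_vcpu_hostrestore().)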
         */
        n2vmcb->_np_enable = 0;
        n2vmcb->_h_cr3 = 0x0;

        /* TODO: Once shadow-shadow paging is in place come back to here
         * and set host_vmcb->_cr3 to the shadowed shadow table.
         */
    }

    /* DRn */
    if (!vcleanbit_set(dr)) {
        n2vmcb->_dr7 = ns_vmcb->_dr7;
        n2vmcb->_dr6 = ns_vmcb->_dr6;
    }

    /* RFLAGS */
    n2vmcb->rflags = ns_vmcb->rflags;

    /* RIP */
    n2vmcb->rip = ns_vmcb->rip;

    /* RSP */
    n2vmcb->rsp = ns_vmcb->rsp;

    /* RAX */
    n2vmcb->rax = ns_vmcb->rax;

    /* Keep the host values of the fs, gs, ldtr, tr, kerngsbase,
     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
     */

    /* Page tables */
    n2vmcb->pdpe0 = ns_vmcb->pdpe0;
    n2vmcb->pdpe1 = ns_vmcb->pdpe1;
    n2vmcb->pdpe2 = ns_vmcb->pdpe2;
    n2vmcb->pdpe3 = ns_vmcb->pdpe3;

    /* PAT */
    if (!vcleanbit_set(np)) {
        n2vmcb->_g_pat = ns_vmcb->_g_pat;
    }

    if (!vcleanbit_set(lbr)) {
        /* Debug Control MSR */
        n2vmcb->_debugctlmsr = ns_vmcb->_debugctlmsr;

        /* LBR MSRs */
        n2vmcb->_lastbranchfromip = ns_vmcb->_lastbranchfromip;
        n2vmcb->_lastbranchtoip = ns_vmcb->_lastbranchtoip;
        n2vmcb->_lastintfromip = ns_vmcb->_lastintfromip;
        n2vmcb->_lastinttoip = ns_vmcb->_lastinttoip;
    }

    /* Cleanbits */
    n2vmcb->cleanbits.bytes = 0;

    rc = svm_vmcb_isvalid(__func__, ns_vmcb, 1);
    if (rc) {
        gdprintk(XENLOG_ERR, "virtual vmcb invalid\n");
        return NSVM_ERROR_VVMCB;
    }

    rc = svm_vmcb_isvalid(__func__, n2vmcb, 1);
    if (rc) {
        gdprintk(XENLOG_ERR, "n2vmcb invalid\n");
        return NSVM_ERROR_VMENTRY;
    }

    /* Switch guest registers to l2 guest */
    regs->eax = ns_vmcb->rax;
    regs->eip = ns_vmcb->rip;
    regs->esp = ns_vmcb->rsp;
    regs->eflags = ns_vmcb->rflags;

#undef vcleanbit_set
    return 0;
}

static int nsvm_vcpu_vmentry(struct vcpu *v, struct cpu_user_regs *regs,
    unsigned int inst_len)
{
    int ret;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb;

    ns_vmcb = nv->nv_vvmcx;
    ASSERT(ns_vmcb != NULL);
    ASSERT(nv->nv_n2vmcx != NULL);
    ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR);

    /* Save values for later use. Needed for Nested-on-Nested and
     * Shadow-on-Shadow paging.
     */
    svm->ns_vmcb_guestcr3 = ns_vmcb->_cr3;
    svm->ns_vmcb_hostcr3 = ns_vmcb->_h_cr3;

    /* Convert explicitly to boolean. Deals with l1 guests
     * that use flush-by-asid w/o checking the cpuid bits */
    nv->nv_flushp2m = !!ns_vmcb->tlb_control;
    if ( svm->ns_guest_asid != ns_vmcb->_guest_asid ) {
        nv->nv_flushp2m = 1;
        hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid);
        svm->ns_guest_asid = ns_vmcb->_guest_asid;
    }

    /* nested paging for the guest */
    svm->ns_hap_enabled = (ns_vmcb->_np_enable) ? 1 : 0;

    /* Remember the V_INTR_MASK in hostflags */
    svm->ns_hostflags.fields.vintrmask =
        (ns_vmcb->_vintr.fields.intr_masking) ?
            1 : 0;

    /* Save l1 guest state (= host state) */
    ret = nsvm_vcpu_hostsave(v, inst_len);
    if (ret) {
        gdprintk(XENLOG_ERR, "hostsave failed, ret = %i\n", ret);
        return ret;
    }

    /* switch vmcb to shadow vmcb */
    v->arch.hvm_svm.vmcb = nv->nv_n2vmcx;
    v->arch.hvm_svm.vmcb_pa = nv->nv_n2vmcx_pa;

    ret = nsvm_vmcb_prepare4vmrun(v, regs);
    if (ret) {
        gdprintk(XENLOG_ERR, "prepare4vmrun failed, ret = %i\n", ret);
        return ret;
    }

    nestedsvm_vcpu_stgi(v);
    return 0;
}

int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs)
{
    int ret;
    unsigned int inst_len;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    inst_len = __get_instruction_length(v, INSTR_VMRUN);
    if (inst_len == 0) {
        svm->ns_vmexit.exitcode = VMEXIT_SHUTDOWN;
        return -1;
    }

    nv->nv_vmswitch_in_progress = 1;
    ASSERT(nv->nv_vvmcx != NULL);

    /* save host state */
    ret = nsvm_vcpu_vmentry(v, regs, inst_len);

    /* Switch vcpu to guest mode. In the error case
     * this ensures the host mode is restored correctly
     * and the l1 guest stays alive. */
    nestedhvm_vcpu_enter_guestmode(v);

    switch (ret) {
    case 0:
        break;
    case NSVM_ERROR_VVMCB:
        gdprintk(XENLOG_ERR, "inject VMEXIT(INVALID)\n");
        svm->ns_vmexit.exitcode = VMEXIT_INVALID;
        return -1;
    case NSVM_ERROR_VMENTRY:
    default:
        gdprintk(XENLOG_ERR,
            "nsvm_vcpu_vmentry failed, injecting #UD\n");
        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
        /* Must happen after hvm_inject_hw_exception or it doesn't work right. */
        nv->nv_vmswitch_in_progress = 0;
        return 1;
    }

    /* If l1 guest uses shadow paging, update the paging mode. */
    if (!nestedhvm_paging_mode_hap(v))
        paging_update_paging_modes(v);

    nv->nv_vmswitch_in_progress = 0;
    return 0;
}

int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs,
    uint64_t exitcode)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb;

    ASSERT(svm->ns_gif == 0);
    ns_vmcb = nv->nv_vvmcx;

    if (nv->nv_vmexit_pending) {
        switch (exitcode) {
        case VMEXIT_INTR:
            if ( unlikely(ns_vmcb->eventinj.fields.v)
                && nv->nv_vmentry_pending
                && hvm_event_needs_reinjection(ns_vmcb->eventinj.fields.type,
                    ns_vmcb->eventinj.fields.vector) )
            {
                ns_vmcb->exitintinfo.bytes = ns_vmcb->eventinj.bytes;
            }
            break;
        case VMEXIT_EXCEPTION_PF:
            ns_vmcb->_cr2 = ns_vmcb->exitinfo2;
            /* fall through */
        case VMEXIT_NPF:
            /* PF error code */
            ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1;
            /* fault address */
            ns_vmcb->exitinfo2 = svm->ns_vmexit.exitinfo2;
            break;
        case VMEXIT_EXCEPTION_NP:
        case VMEXIT_EXCEPTION_SS:
        case VMEXIT_EXCEPTION_GP:
        case VMEXIT_EXCEPTION_15:
        case VMEXIT_EXCEPTION_MF:
        case VMEXIT_EXCEPTION_AC:
            ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1;
            break;
        default:
            break;
        }
    }

    ns_vmcb->exitcode = exitcode;
    ns_vmcb->eventinj.bytes = 0;
    return 0;
}

int nsvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap)
{
    ASSERT(vcpu_nestedhvm(v).nv_vvmcx != NULL);

    nestedsvm_vmexit_defer(v, VMEXIT_EXCEPTION_DE + trap->vector,
                           trap->error_code, trap->cr2);
    return NESTEDHVM_VMEXIT_DONE;
}

uint64_t nsvm_vcpu_guestcr3(struct vcpu *v)
{
    return vcpu_nestedsvm(v).ns_vmcb_guestcr3;
}

uint64_t nsvm_vcpu_hostcr3(struct vcpu *v)
{
    return vcpu_nestedsvm(v).ns_vmcb_hostcr3;
}

uint32_t nsvm_vcpu_asid(struct vcpu *v)
{
    return vcpu_nestedsvm(v).ns_guest_asid;
}

static int nsvm_vmcb_guest_intercepts_msr(unsigned long *msr_bitmap,
    uint32_t msr, bool_t write)
{
    bool_t enabled;
    unsigned long *msr_bit;

    msr_bit = svm_msrbit(msr_bitmap, msr);
    if (msr_bit == NULL)
        /* MSR not in the permission map: Let the guest handle it.
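         * Each MSR is covered by two bits in the map: bit 2*msr for reads
         * and bit 2*msr+1 for writes (see the test_bit() calls below).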
         */
        return NESTEDHVM_VMEXIT_INJECT;

    BUG_ON(msr_bit == NULL);

    msr &= 0x1fff;

    if (write)
        /* write access */
        enabled = test_bit(msr * 2 + 1, msr_bit);
    else
        /* read access */
        enabled = test_bit(msr * 2, msr_bit);

    if (!enabled)
        return NESTEDHVM_VMEXIT_HOST;

    return NESTEDHVM_VMEXIT_INJECT;
}

static int nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa,
    uint64_t exitinfo1)
{
    unsigned long iopm_gfn = iopm_pa >> PAGE_SHIFT;
    unsigned long *io_bitmap = NULL;
    ioio_info_t ioinfo;
    uint16_t port;
    bool_t enabled;
    unsigned long gfn = 0; /* gcc ... */

    ioinfo.bytes = exitinfo1;
    port = ioinfo.fields.port;

    switch (port) {
    case 0 ... 32767: /* first 4KB page */
        gfn = iopm_gfn;
        break;
    case 32768 ... 65535: /* second 4KB page */
        port -= 32768;
        gfn = iopm_gfn + 1;
        break;
    default:
        BUG();
        break;
    }

    io_bitmap = hvm_map_guest_frame_ro(gfn, 0);
    if (io_bitmap == NULL) {
        gdprintk(XENLOG_ERR,
            "IOIO intercept: mapping of permission map failed\n");
        return NESTEDHVM_VMEXIT_ERROR;
    }

    enabled = test_bit(port, io_bitmap);
    hvm_unmap_guest_frame(io_bitmap, 0);

    if (!enabled)
        return NESTEDHVM_VMEXIT_HOST;

    return NESTEDHVM_VMEXIT_INJECT;
}

int nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v,
    struct cpu_user_regs *regs, uint64_t exitcode)
{
    uint64_t exit_bits;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
    enum nestedhvm_vmexits vmexits;

    switch (exitcode) {
    case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
    case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
        exit_bits = 1ULL << (exitcode - VMEXIT_CR0_READ);
        if (svm->ns_cr_intercepts & exit_bits)
            break;
        return 0;

    case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
    case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
        exit_bits = 1ULL << (exitcode - VMEXIT_DR0_READ);
        if (svm->ns_dr_intercepts & exit_bits)
            break;
        return 0;

    case VMEXIT_EXCEPTION_DE ... VMEXIT_EXCEPTION_XF:
        exit_bits = 1ULL << (exitcode - VMEXIT_EXCEPTION_DE);
        if (svm->ns_exception_intercepts & exit_bits)
            break;
        return 0;

    case VMEXIT_INTR ... VMEXIT_SHUTDOWN:
        exit_bits = 1ULL << (exitcode - VMEXIT_INTR);
        if (svm->ns_general1_intercepts & exit_bits)
            break;
        return 0;

    case VMEXIT_VMRUN ...
         VMEXIT_XSETBV:
        exit_bits = 1ULL << (exitcode - VMEXIT_VMRUN);
        if (svm->ns_general2_intercepts & exit_bits)
            break;
        return 0;

    case VMEXIT_NPF:
        if (nestedhvm_paging_mode_hap(v))
            break;
        return 0;
    case VMEXIT_INVALID:
        /* Always intercepted */
        break;

    default:
        gdprintk(XENLOG_ERR, "Illegal exitcode %#"PRIx64"\n", exitcode);
        BUG();
        break;
    }

    /* Special cases: Do more detailed checks */
    switch (exitcode) {
    case VMEXIT_MSR:
        ASSERT(regs != NULL);
        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
        ASSERT(nv->nv_vvmcx != NULL);
        ns_vmcb = nv->nv_vvmcx;
        vmexits = nsvm_vmcb_guest_intercepts_msr(svm->ns_cached_msrpm,
            regs->ecx, ns_vmcb->exitinfo1 != 0);
        if (vmexits == NESTEDHVM_VMEXIT_HOST)
            return 0;
        break;
    case VMEXIT_IOIO:
        nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr);
        ASSERT(nv->nv_vvmcx != NULL);
        ns_vmcb = nv->nv_vvmcx;
        vmexits = nsvm_vmcb_guest_intercepts_ioio(ns_vmcb->_iopm_base_pa,
            ns_vmcb->exitinfo1);
        if (vmexits == NESTEDHVM_VMEXIT_HOST)
            return 0;
        break;
    }

    return 1;
}

int nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr,
                                    int errcode)
{
    return nsvm_vmcb_guest_intercepts_exitcode(v,
        guest_cpu_user_regs(), VMEXIT_EXCEPTION_DE + trapnr);
}

static int nsvm_vmcb_prepare4vmexit(struct vcpu *v, struct cpu_user_regs *regs)
{
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
    struct vmcb_struct *n2vmcb = nv->nv_n2vmcx;

    svm_vmsave(nv->nv_n1vmcx);

    /* Cache guest physical address of virtual vmcb
     * for VMCB Cleanbit emulation.
     */
    svm->ns_ovvmcb_pa = nv->nv_vvmcxaddr;

    /* Intercepts - keep them as they are */

    /* Pausefilter - keep it as is */

    /* Nested IO permission bitmap */
    /* Just keep the iopm_base_pa and msrpm_base_pa values.
     * The guest must not see the virtualized values.
     */

    /* TSC offset */
    /* Keep it. It's maintained by the l1 guest. */

    /* ASID */
    /* ns_vmcb->_guest_asid = n2vmcb->_guest_asid; */

    /* TLB control */
    ns_vmcb->tlb_control = 0;

    /* Virtual Interrupts */
    ns_vmcb->_vintr = n2vmcb->_vintr;
    if (!(svm->ns_hostflags.fields.vintrmask))
        ns_vmcb->_vintr.fields.intr_masking = 0;

    /* Shadow mode */
    ns_vmcb->interrupt_shadow = n2vmcb->interrupt_shadow;

    /* Exit codes */
    ns_vmcb->exitcode = n2vmcb->exitcode;
    ns_vmcb->exitinfo1 = n2vmcb->exitinfo1;
    ns_vmcb->exitinfo2 = n2vmcb->exitinfo2;
    ns_vmcb->exitintinfo = n2vmcb->exitintinfo;

    /* Interrupts */
    /* If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have
     * to make sure that we do not lose injected events. So check eventinj
     * here and copy it to exitintinfo if it is valid.
     * exitintinfo and eventinj can't be both valid because the case below
     * only happens on a VMRUN instruction intercept which has no valid
     * exitintinfo set.
     */
    if ( unlikely(n2vmcb->eventinj.fields.v)
        && hvm_event_needs_reinjection(n2vmcb->eventinj.fields.type,
            n2vmcb->eventinj.fields.vector) )
    {
        ns_vmcb->exitintinfo = n2vmcb->eventinj;
    }

    ns_vmcb->eventinj.bytes = 0;

    /* Nested paging mode */
    if (nestedhvm_paging_mode_hap(v)) {
        /* host nested paging + guest nested paging. */
        ns_vmcb->_np_enable = n2vmcb->_np_enable;
        ns_vmcb->_cr3 = n2vmcb->_cr3;
        /* The vmcb->h_cr3 is the shadowed h_cr3. The original
         * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3,
         * hence we keep the ns_vmcb->h_cr3 value. */
    } else if (paging_mode_hap(v->domain)) {
        /* host nested paging + guest shadow paging. */
        ns_vmcb->_np_enable = 0;
        /* Throw h_cr3 away. The guest is not allowed to set it,
         * otherwise it could break out (security hole!)
         */
        ns_vmcb->_h_cr3 = 0x0;
        /* Stop intercepting #PF (already done above
         * by restoring cached intercepts). */
        ns_vmcb->_cr3 = n2vmcb->_cr3;
    } else {
        /* host shadow paging + guest shadow paging. */
        ns_vmcb->_np_enable = 0;
        ns_vmcb->_h_cr3 = 0x0;
        /* The vmcb->_cr3 is the shadowed cr3. The original
         * unshadowed guest cr3 is kept in ns_vmcb->_cr3,
         * hence we keep the ns_vmcb->_cr3 value. */
    }

    /* LBR virtualization - keep lbr control as is */

    /* NextRIP */
    ns_vmcb->nextrip = n2vmcb->nextrip;

    /* Decode Assist */
    ns_vmcb->guest_ins_len = n2vmcb->guest_ins_len;
    memcpy(ns_vmcb->guest_ins, n2vmcb->guest_ins,
           sizeof(ns_vmcb->guest_ins));

    /*
     * VMCB Save State Area
     */

    /* Segments */
    ns_vmcb->es = n2vmcb->es;
    ns_vmcb->cs = n2vmcb->cs;
    ns_vmcb->ss = n2vmcb->ss;
    ns_vmcb->ds = n2vmcb->ds;
    ns_vmcb->gdtr = n2vmcb->gdtr;
    ns_vmcb->idtr = n2vmcb->idtr;

    /* CPL */
    ns_vmcb->_cpl = n2vmcb->_cpl;

    /* EFER */
    ns_vmcb->_efer = n2vmcb->_efer;

    /* CRn */
    ns_vmcb->_cr4 = n2vmcb->_cr4;
    ns_vmcb->_cr0 = n2vmcb->_cr0;

    /* DRn */
    ns_vmcb->_dr7 = n2vmcb->_dr7;
    ns_vmcb->_dr6 = n2vmcb->_dr6;

    /* Restore registers from regs as those values
     * can be newer than in n2vmcb (e.g. due to an
     * instruction emulation right before).
     */

    /* RFLAGS */
    ns_vmcb->rflags = n2vmcb->rflags = regs->rflags;

    /* RIP */
    ns_vmcb->rip = n2vmcb->rip = regs->rip;

    /* RSP */
    ns_vmcb->rsp = n2vmcb->rsp = regs->rsp;

    /* RAX */
    ns_vmcb->rax = n2vmcb->rax = regs->rax;

    /* Keep the l2 guest values of the fs, gs, ldtr, tr, kerngsbase,
     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
     */

    /* CR2 */
    ns_vmcb->_cr2 = n2vmcb->_cr2;

    /* Page tables */
    ns_vmcb->pdpe0 = n2vmcb->pdpe0;
    ns_vmcb->pdpe1 = n2vmcb->pdpe1;
    ns_vmcb->pdpe2 = n2vmcb->pdpe2;
    ns_vmcb->pdpe3 = n2vmcb->pdpe3;

    /* PAT */
    ns_vmcb->_g_pat = n2vmcb->_g_pat;

    /* Debug Control MSR */
    ns_vmcb->_debugctlmsr = n2vmcb->_debugctlmsr;

    /* LBR MSRs */
    ns_vmcb->_lastbranchfromip = n2vmcb->_lastbranchfromip;
    ns_vmcb->_lastbranchtoip = n2vmcb->_lastbranchtoip;
    ns_vmcb->_lastintfromip = n2vmcb->_lastintfromip;
    ns_vmcb->_lastinttoip = n2vmcb->_lastinttoip;

    return 0;
}

bool_t nsvm_vmcb_hap_enabled(struct vcpu *v)
{
    return vcpu_nestedsvm(v).ns_hap_enabled;
}

/* This function uses L2_gpa to walk the P2M page table in L1. If the
 * walk is successful, the translated value is returned in
 * L1_gpa. The result value tells what to do next.
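 * (NESTEDHVM_PAGEFAULT_DONE on success; NESTEDHVM_PAGEFAULT_INJECT when the
 * walk fails and a fault has to be injected into the l1 guest.)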
 */
int nsvm_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa,
                         unsigned int *page_order, uint8_t *p2m_acc,
                         bool_t access_r, bool_t access_w, bool_t access_x)
{
    uint32_t pfec;
    unsigned long nested_cr3, gfn;

    nested_cr3 = nhvm_vcpu_p2m_base(v);

    pfec = PFEC_user_mode | PFEC_page_present;
    if ( access_w )
        pfec |= PFEC_write_access;
    if ( access_x )
        pfec |= PFEC_insn_fetch;

    /* Walk the guest-supplied NPT table, just as if it were a pagetable */
    gfn = paging_ga_to_gfn_cr3(v, nested_cr3, L2_gpa, &pfec, page_order);

    if ( gfn == INVALID_GFN )
        return NESTEDHVM_PAGEFAULT_INJECT;

    *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
    return NESTEDHVM_PAGEFAULT_DONE;
}

enum hvm_intblk nsvm_intr_blocked(struct vcpu *v)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);

    ASSERT(nestedhvm_enabled(v->domain));

    if ( !nestedsvm_gif_isset(v) )
        return hvm_intblk_svm_gif;

    if ( nestedhvm_vcpu_in_guestmode(v) ) {
        struct vmcb_struct *n2vmcb = nv->nv_n2vmcx;

        if ( svm->ns_hostflags.fields.vintrmask )
            if ( !svm->ns_hostflags.fields.rflagsif )
                return hvm_intblk_rflags_ie;

        /* When the l1 guest passes its devices through to the l2 guest
         * and the l2 guest does an MMIO access then we may want to
         * inject a VMEXIT(#INTR) exitcode into the l1 guest.
         * Delay the injection because this would result in delivering
         * an interrupt *within* the execution of an instruction.
         */
        if ( v->arch.hvm_vcpu.hvm_io.io_state != HVMIO_none )
            return hvm_intblk_shadow;

        if ( !nv->nv_vmexit_pending && n2vmcb->exitintinfo.bytes != 0 ) {
            /* Give the l2 guest a chance to finish the delivery of
             * the last injected interrupt or exception before we
             * emulate a VMEXIT (e.g. VMEXIT(INTR) ).
             */
            return hvm_intblk_shadow;
        }
    }

    if ( nv->nv_vmexit_pending ) {
        /* hvm_inject_hw_exception() must have run before.
         * exceptions have higher priority than interrupts.
         */
        return hvm_intblk_rflags_ie;
    }

    return hvm_intblk_none;
}

/* MSR handling */
int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);
    int ret = 1;

    *msr_content = 0;

    switch (msr) {
    case MSR_K8_VM_CR:
        break;
    case MSR_K8_VM_HSAVE_PA:
        *msr_content = svm->ns_msr_hsavepa;
        break;
    case MSR_AMD64_TSC_RATIO:
        *msr_content = svm->ns_tscratio;
        break;
    default:
        ret = 0;
        break;
    }

    return ret;
}

int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content)
{
    int ret = 1;
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    switch (msr) {
    case MSR_K8_VM_CR:
        /* ignore write. handle all bits as read-only.
         */
        break;
    case MSR_K8_VM_HSAVE_PA:
        if (!nestedsvm_vmcb_isvalid(v, msr_content)) {
            gdprintk(XENLOG_ERR,
                "MSR_K8_VM_HSAVE_PA value invalid %#"PRIx64"\n", msr_content);
            ret = -1; /* inject #GP */
            break;
        }
        svm->ns_msr_hsavepa = msr_content;
        break;
    case MSR_AMD64_TSC_RATIO:
        if ((msr_content & ~TSC_RATIO_RSVD_BITS) != msr_content) {
            gdprintk(XENLOG_ERR,
                "reserved bits set in MSR_AMD64_TSC_RATIO %#"PRIx64"\n",
                msr_content);
            ret = -1; /* inject #GP */
            break;
        }
        svm->ns_tscratio = msr_content;
        break;
    default:
        ret = 0;
        break;
    }

    return ret;
}

/* VMEXIT emulation */
void
nestedsvm_vmexit_defer(struct vcpu *v,
    uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    nestedsvm_vcpu_clgi(v);
    svm->ns_vmexit.exitcode = exitcode;
    svm->ns_vmexit.exitinfo1 = exitinfo1;
    svm->ns_vmexit.exitinfo2 = exitinfo2;
    vcpu_nestedhvm(v).nv_vmexit_pending = 1;
}

enum nestedhvm_vmexits
nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs,
    uint64_t exitcode)
{
    bool_t is_intercepted;

    ASSERT(vcpu_nestedhvm(v).nv_vmexit_pending == 0);
    is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode);

    switch (exitcode) {
    case VMEXIT_INVALID:
        if (is_intercepted)
            return NESTEDHVM_VMEXIT_INJECT;
        return NESTEDHVM_VMEXIT_HOST;

    case VMEXIT_INTR:
    case VMEXIT_NMI:
        return NESTEDHVM_VMEXIT_HOST;
    case VMEXIT_EXCEPTION_NM:
        /* Host must handle lazy fpu context switching first.
         * Then inject the VMEXIT if L1 guest intercepts this.
         */
        return NESTEDHVM_VMEXIT_HOST;
    case VMEXIT_NPF:
        if (nestedhvm_paging_mode_hap(v)) {
            if (!is_intercepted)
                return NESTEDHVM_VMEXIT_FATALERROR;
            /* host nested paging + guest nested paging */
            return NESTEDHVM_VMEXIT_HOST;
        }
        if (paging_mode_hap(v->domain)) {
            if (is_intercepted)
                return NESTEDHVM_VMEXIT_FATALERROR;
            /* host nested paging + guest shadow paging */
            return NESTEDHVM_VMEXIT_HOST;
        }
        /* host shadow paging + guest shadow paging */
        /* Can this happen? */
        BUG();
        return NESTEDHVM_VMEXIT_FATALERROR;
    case VMEXIT_EXCEPTION_PF:
        if (nestedhvm_paging_mode_hap(v)) {
            /* host nested paging + guest nested paging */
            if (!is_intercepted)
                /* l1 guest intercepts #PF unnecessarily */
                return NESTEDHVM_VMEXIT_HOST;
            /* l2 guest intercepts #PF unnecessarily */
            return NESTEDHVM_VMEXIT_INJECT;
        }
        if (!paging_mode_hap(v->domain)) {
            /* host shadow paging + guest shadow paging */
            return NESTEDHVM_VMEXIT_HOST;
        }
        /* host nested paging + guest shadow paging */
        return NESTEDHVM_VMEXIT_INJECT;
    case VMEXIT_VMMCALL:
        /* Always let the guest handle VMMCALL/VMCALL */
        return NESTEDHVM_VMEXIT_INJECT;
    default:
        break;
    }

    if (is_intercepted)
        return NESTEDHVM_VMEXIT_INJECT;
    return NESTEDHVM_VMEXIT_HOST;
}

enum nestedhvm_vmexits
nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs)
{
    int rc;
    enum nestedhvm_vmexits ret = NESTEDHVM_VMEXIT_DONE;

    ASSERT(vcpu_nestedhvm(v).nv_vmswitch_in_progress);
    ASSERT(nestedhvm_vcpu_in_guestmode(v));

    rc = nsvm_vmcb_prepare4vmexit(v, regs);
    if (rc)
        ret = NESTEDHVM_VMEXIT_ERROR;

    rc = nhvm_vcpu_hostrestore(v, regs);
    if (rc)
        ret = NESTEDHVM_VMEXIT_FATALERROR;

    nestedhvm_vcpu_exit_guestmode(v);
    return ret;
}

/* The exitcode is in native SVM/VMX format. The forced exitcode
 * is in generic format.
 */
static enum nestedhvm_vmexits
nestedsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs,
    uint64_t exitcode)
{
    int rc;
    struct nestedvcpu *nv = &vcpu_nestedhvm(v);

    nv->nv_vmswitch_in_progress = 1;

    ASSERT(nv->nv_vvmcx != NULL);

    /* On special intercepts that the host has to handle,
     * the vcpu is still in guest mode here.
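     * In that case nestedsvm_vmexit_n2n1() below first syncs the l2 state
     * back into the virtual vmcb and restores the l1 host state.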
     */
    if (nestedhvm_vcpu_in_guestmode(v)) {
        enum nestedhvm_vmexits ret;

        ret = nestedsvm_vmexit_n2n1(v, regs);
        switch (ret) {
        case NESTEDHVM_VMEXIT_FATALERROR:
            gdprintk(XENLOG_ERR, "VMEXIT: fatal error\n");
            return ret;
        case NESTEDHVM_VMEXIT_HOST:
            BUG();
            return ret;
        case NESTEDHVM_VMEXIT_ERROR:
            exitcode = VMEXIT_INVALID;
            break;
        default:
            ASSERT(!nestedhvm_vcpu_in_guestmode(v));
            break;
        }

        /* host state has been restored */
    }

    ASSERT(!nestedhvm_vcpu_in_guestmode(v));

    /* Prepare for running the l1 guest. Make the actual
     * modifications to the virtual VMCB/VMCS.
     */
    rc = nhvm_vcpu_vmexit(v, regs, exitcode);

    /* If l1 guest uses shadow paging, update the paging mode. */
    if (!nestedhvm_paging_mode_hap(v))
        paging_update_paging_modes(v);

    nv->nv_vmswitch_in_progress = 0;

    if (rc)
        return NESTEDHVM_VMEXIT_FATALERROR;

    return NESTEDHVM_VMEXIT_DONE;
}

/* VCPU switch */
void nsvm_vcpu_switch(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    struct nestedvcpu *nv;
    struct nestedsvm *svm;

    if (!nestedhvm_enabled(v->domain))
        return;

    nv = &vcpu_nestedhvm(v);
    svm = &vcpu_nestedsvm(v);
    ASSERT(v->arch.hvm_svm.vmcb != NULL);
    ASSERT(nv->nv_n1vmcx != NULL);
    ASSERT(nv->nv_n2vmcx != NULL);
    ASSERT(nv->nv_n1vmcx_pa != VMCX_EADDR);
    ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR);

    if (nv->nv_vmexit_pending) {
 vmexit:
        nestedsvm_vcpu_vmexit(v, regs, svm->ns_vmexit.exitcode);
        nv->nv_vmexit_pending = 0;
        nv->nv_vmentry_pending = 0;
        return;
    }

    if (nv->nv_vmentry_pending) {
        int ret;
        ASSERT(!nv->nv_vmexit_pending);
        ret = nsvm_vcpu_vmrun(v, regs);
        if (ret)
            goto vmexit;
        ASSERT(nestedhvm_vcpu_in_guestmode(v));
        nv->nv_vmentry_pending = 0;
    }

    if (nestedhvm_vcpu_in_guestmode(v)
       && nestedhvm_paging_mode_hap(v))
    {
        /* In case we left the l2 guest due to a physical interrupt (e.g. IPI)
         * that is not for the l1 guest, we continue running the l2 guest,
         * but check whether the nestedp2m is still valid.
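         * (If nv_p2m has been cleared, nestedsvm_vmcb_set_nestedp2m() below
         * assigns a nestedp2m and updates h_cr3 in the shadow vmcb.)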
         */
        if (nv->nv_p2m == NULL)
            nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx);
    }
}

/* Interrupts, Virtual GIF */
int nestedsvm_vcpu_interrupt(struct vcpu *v, const struct hvm_intack intack)
{
    int ret;
    enum hvm_intblk intr;
    uint64_t exitcode = VMEXIT_INTR;
    uint64_t exitinfo2 = 0;
    ASSERT(nestedhvm_vcpu_in_guestmode(v));

    intr = nhvm_interrupt_blocked(v);
    if ( intr != hvm_intblk_none )
        return NSVM_INTR_MASKED;

    switch (intack.source) {
    case hvm_intsrc_pic:
    case hvm_intsrc_lapic:
    case hvm_intsrc_vector:
        exitcode = VMEXIT_INTR;
        exitinfo2 = intack.vector;
        break;
    case hvm_intsrc_nmi:
        exitcode = VMEXIT_NMI;
        exitinfo2 = intack.vector;
        break;
    case hvm_intsrc_mce:
        exitcode = VMEXIT_EXCEPTION_MC;
        exitinfo2 = intack.vector;
        break;
    case hvm_intsrc_none:
        return NSVM_INTR_NOTHANDLED;
    default:
        BUG();
    }

    ret = nsvm_vmcb_guest_intercepts_exitcode(v,
        guest_cpu_user_regs(), exitcode);
    if (ret) {
        nestedsvm_vmexit_defer(v, exitcode, intack.source, exitinfo2);
        return NSVM_INTR_FORCEVMEXIT;
    }

    return NSVM_INTR_NOTINTERCEPTED;
}

bool_t nestedsvm_gif_isset(struct vcpu *v)
{
    struct nestedsvm *svm = &vcpu_nestedsvm(v);

    return (!!svm->ns_gif);
}

void svm_vmexit_do_stgi(struct cpu_user_regs *regs, struct vcpu *v)
{
    unsigned int inst_len;

    if ( !nestedhvm_enabled(v->domain) ) {
        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
        return;
    }

    if ( (inst_len = __get_instruction_length(v, INSTR_STGI)) == 0 )
        return;

    nestedsvm_vcpu_stgi(v);

    __update_guest_eip(regs, inst_len);
}

void svm_vmexit_do_clgi(struct cpu_user_regs *regs, struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    unsigned int inst_len;
    uint32_t general1_intercepts = vmcb_get_general1_intercepts(vmcb);
    vintr_t intr;

    if ( !nestedhvm_enabled(v->domain) ) {
        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
        return;
    }

    if ( (inst_len = __get_instruction_length(v, INSTR_CLGI)) == 0 )
        return;

    nestedsvm_vcpu_clgi(v);

    /* After a CLGI no interrupts should come */
    intr = vmcb_get_vintr(vmcb);
    intr.fields.irq = 0;
    general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
    vmcb_set_vintr(vmcb, intr);
    vmcb_set_general1_intercepts(vmcb, general1_intercepts);

    __update_guest_eip(regs, inst_len);
}