/*
* hvm.c: Common hardware virtual machine abstractions.
*
* Copyright (c) 2004, Intel Corporation.
* Copyright (c) 2005, International Business Machines Corporation.
* Copyright (c) 2008, Citrix Systems, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*/
#include <xen/config.h>
#include <xen/ctype.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/trace.h>
#include <xen/sched.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain.h>
#include <xen/domain_page.h>
#include <xen/hypercall.h>
#include <xen/guest_access.h>
#include <xen/event.h>
#include <xen/paging.h>
#include <asm/shadow.h>
#include <asm/current.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/types.h>
#include <asm/msr.h>
#include <asm/mc146818rtc.h>
#include <asm/spinlock.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/vpt.h>
#include <asm/hvm/support.h>
#include <asm/hvm/cacheattr.h>
#include <asm/hvm/trace.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <public/version.h>
#include <public/memory.h>
int hvm_enabled __read_mostly;
unsigned int opt_hvm_debug_level __read_mostly;
integer_param("hvm_debug", opt_hvm_debug_level);
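/*
 * When the "softtsc" command-line option is set, hvm_get_guest_tsc() below
 * bases the guest's TSC on hvm_get_guest_time() rather than a raw RDTSC of
 * the host TSC, so guest TSC progress tracks guest time instead of the
 * underlying hardware counter.
 */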
int opt_softtsc;
boolean_param("softtsc", opt_softtsc);
struct hvm_function_table hvm_funcs __read_mostly;
/* I/O permission bitmap is globally shared by all HVM guests. */
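/*
 * One bit per port across the full 64k I/O-port space; a set bit forces the
 * guest access to trap to Xen. The bitmap is sized at three pages to match
 * the larger SVM I/O permission map layout (VMX consumes only the first two
 * pages, bitmaps A and B).
 */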
unsigned long __attribute__ ((__section__ (".bss.page_aligned")))
hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG];
void hvm_enable(struct hvm_function_table *fns)
{
BUG_ON(hvm_enabled);
printk("HVM: %s enabled\n", fns->name);
/*
* Allow direct access to the PC debug port (it is often used for I/O
* delays, but the vmexits simply slow things down).
*/
memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
__clear_bit(0x80, hvm_io_bitmap);
hvm_funcs = *fns;
hvm_enabled = 1;
if ( hvm_funcs.hap_supported )
printk("HVM: Hardware Assisted Paging detected.\n");
}
/*
* Need to re-inject a given event? We avoid re-injecting software exceptions
* and interrupts because the faulting/trapping instruction can simply be
* re-executed (neither VMX nor SVM update RIP when they VMEXIT during
* INT3/INTO/INTn).
*/
int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
{
switch ( type )
{
case X86_EVENTTYPE_EXT_INTR:
case X86_EVENTTYPE_NMI:
return 1;
case X86_EVENTTYPE_HW_EXCEPTION:
/*
* SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
* check for these vectors, as they are really SW Exceptions. SVM has
* not updated RIP to point after the trapping instruction (INT3/INTO).
*/
return (vector != 3) && (vector != 4);
default:
/* Software exceptions/interrupts can be re-executed (e.g., INT n). */
break;
}
return 0;
}
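/*
 * Example for hvm_event_needs_reinjection(): a #PF or #GP whose delivery was
 * interrupted by a vmexit must be re-injected on the next VM entry, whereas a
 * #BP from INT3 is not: RIP still points at the INT3 itself, so simply
 * resuming the guest re-executes the instruction and regenerates the trap.
 */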
/*
* Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
* This means we can assume that @vec2 is contributory or a page fault.
*/
uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
{
/* Exception during double-fault delivery always causes a triple fault. */
if ( vec1 == TRAP_double_fault )
{
hvm_triple_fault();
return TRAP_double_fault; /* dummy return */
}
/* Exception during page-fault delivery always causes a double fault. */
if ( vec1 == TRAP_page_fault )
return TRAP_double_fault;
/* Discard the first exception if it's benign or if we now have a #PF. */
if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) )
return vec2;
/* Cannot combine the exceptions: double fault. */
return TRAP_double_fault;
}
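/*
 * Examples of the combination rules above: a #GP raised while delivering a
 * #NP combines to #DF (both are contributory); a #PF raised while delivering
 * a #GP is delivered as the #PF with the #GP discarded; and any exception
 * raised while delivering a #DF escalates to a triple fault.
 */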
void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
{
u64 host_tsc;
rdtscll(host_tsc);
v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
}
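/*
 * The cached offset records (guest_tsc - host_tsc) at the time the guest TSC
 * was last set, so the invariant maintained by these two helpers is:
 *
 *     guest_tsc = host_tsc + cache_tsc_offset
 *
 * hvm_get_guest_tsc() applies the same offset to whichever time base is in
 * use (raw host TSC, or guest time when opt_softtsc is set).
 */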
u64 hvm_get_guest_tsc(struct vcpu *v)
{
u64 host_tsc;
if ( opt_softtsc )
host_tsc = hvm_get_guest_time(v);
else
rdtscll(host_tsc);
return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
}
void hvm_migrate_timers(struct vcpu *v)
{
rtc_migrate_timers(v);
pt_migrate(v);
}
void hvm_do_resume(struct vcpu *v)
{
ioreq_t *p;
pt_restore_timer(v);
/* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */
p = &get_ioreq(v)->vp_ioreq;
while ( p->state != STATE_IOREQ_NONE )
{
switch ( p->state )
{
case STATE_IORESP_READY: /* IORESP_READY -> NONE */
hvm_io_assist();
break;
case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
case STATE_IOREQ_INPROCESS:
wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port,
(p->state != STATE_IOREQ_READY) &&
(p->state != STATE_IOREQ_INPROCESS));
break;
default:
gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state);
domain_crash(v->domain);
return; /* bail */
}
}
}
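/*
 * The ioreq state machine polled above: a request moves through
 *
 *     IOREQ_NONE -> IOREQ_READY -> IOREQ_INPROCESS
 *                -> IORESP_READY -> IOREQ_NONE
 *
 * Broadly: Xen marks a new request IOREQ_READY, the device model moves it to
 * INPROCESS while handling it and then to IORESP_READY, and hvm_io_assist()
 * consumes the response, returning the slot to IOREQ_NONE. While a request
 * is outstanding the vcpu blocks on its Xen event channel (the wait above).
 */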
static void hvm_init_ioreq_page(
struct domain *d, struct hvm_ioreq_page *iorp)
{
memset(iorp, 0, sizeof(*iorp));
spin_lock_init(&iorp->lock);
domain_pause(d);
}
static void hvm_destroy_ioreq_page(
struct domain *d, struct hvm_ioreq_page *iorp)
{
spin_lock(&iorp->lock);
ASSERT(d->is_dying);
if ( iorp->va != NULL )
{
unmap_domain_page_global(iorp->va);
put_page_and_type(iorp->page);
iorp->va = NULL;
}
spin_unlock(&iorp->lock);
}
static int hvm_set_ioreq_page(
struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn)
{
struct page_info *page;
p2m_type_t p2mt;
unsigned long mfn;
void *va;
mfn = mfn_x(gfn_to_mfn(d, gmfn, &p2mt));
if ( !p2m_is_ram(p2mt) )
return -EINVAL;
ASSERT(mfn_valid(mfn));
page = mfn_to_page(mfn);
if ( !get_page_and_type(page, d, PGT_writable_page) )
return -EINVAL;
va = map_domain_page_global(mfn);
if ( va == NULL )
{
put_page_and_type(page);
return -ENOMEM;
}
spin_lock(&iorp->lock);
if ( (iorp->va != NULL) || d->is_dying )
{
spin_unlock(&iorp->lock);
unmap_domain_page_global(va);
put_page_and_type(mfn_to_page(mfn));
return -EINVAL;
}
iorp->va = va;
iorp->page = page;
spin_unlock(&iorp->lock);
domain_unpause(d);
return 0;
}
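/*
 * Note the pause/unpause pairing: hvm_init_ioreq_page() pauses the domain at
 * creation time, and it is only unpaused here once a valid ioreq page has
 * been registered, so no vcpu can run before its I/O request infrastructure
 * is in place.
 */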
static int hvm_print_line(
int dir, uint32_t port, uint32_t bytes, uint32_t *val)
{
struct vcpu *curr = current;
struct hvm_domain *hd = &curr->domain->arch.hvm_domain;
char c = *val;
BUG_ON(bytes != 1);
/* Accept only printable characters, newline, and horizontal tab. */
if ( !isprint(c) && (c != '\n') && (c != '\t') )
return X86EMUL_OKAY;
spin_lock(&hd->pbuf_lock);
hd->pbuf[hd->pbuf_idx++] = c;
if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
{
if ( c != '\n' )
hd->pbuf[hd->pbuf_idx++] = '\n';
hd->pbuf[hd->pbuf_idx] = '\0';
printk(XENLOG_G_DEBUG "HVM%u: %s", curr->domain->domain_id, hd->pbuf);
hd->pbuf_idx = 0;
}
spin_unlock(&hd->pbuf_lock);
return X86EMUL_OKAY;
}
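/*
 * hvm_print_line() backs the debug I/O port registered in
 * hvm_domain_initialise() below (port 0xe9, byte-wide accesses only):
 * characters written there by the guest are buffered per-domain and flushed
 * to the Xen log on '\n' or when the buffer fills. A hypothetical guest-side
 * sketch:
 *
 *     static inline void xen_debug_putc(char c)
 *     {
 *         asm volatile ( "outb %0, $0xe9" : : "a" (c) );
 *     }
 */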
int hvm_domain_initialise(struct domain *d)
{
int rc;
if ( !hvm_enabled )
{
gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest "
"on a non-VT/AMDV platform.\n");
return -EINVAL;
}
spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
spin_lock_init(&d->arch.hvm_domain.irq_lock);
spin_lock_init(&d->arch.hvm_domain.uc_lock);
INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
hvm_init_guest_time(d);
d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
hvm_init_cacheattr_region_list(d);
rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
if ( rc != 0 )
goto fail1;
vpic_init(d);
rc = vioapic_init(d);
if ( rc != 0 )
goto fail1;
stdvga_init(d);
rtc_init(d);
hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq);
hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
register_portio_handler(d, 0xe9, 1, hvm_print_line);
rc = hvm_funcs.domain_initialise(d);
if ( rc != 0 )
goto fail2;
return 0;
fail2:
rtc_deinit(d);
stdvga_deinit(d);
vioapic_deinit(d);
fail1:
hvm_destroy_cacheattr_region_list(d);
return rc;
}
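/*
 * The failure paths above unwind in reverse order of initialisation. The
 * resources that survive a successful return are torn down in two stages:
 * hvm_domain_relinquish_resources() (ioreq pages and timers) and then
 * hvm_domain_destroy(), both below.
 */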
extern void msixtbl_pt_cleanup(struct domain *d);
void hvm_domain_relinquish_resources(struct domain *d)
{
hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
msixtbl_pt_cleanup(d);
/* Stop all asynchronous timer actions. */
rtc_deinit(d);
if ( d->vcpu[0] != NULL )
{
pit_deinit(d);
pmtimer_deinit(d);
hpet_deinit(d);
}
}
void hvm_domain_destroy(struct domain *d)
{
hvm_funcs.domain_destroy(d);
rtc_deinit(d);
stdvga_deinit(d);
vioapic_deinit(d);
hvm_destroy_cacheattr_region_list(d);
}
static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
struct vcpu *v;
struct hvm_hw_cpu ctxt;
struct segment_register seg;
struct vcpu_guest_context *vc;
for_each_vcpu ( d, v )
{
/* We don't need to save state for a vcpu that is down; the restore
* code will leave it down if there is nothing saved. */
if ( test_bit(_VPF_down, &v->pause_flags) )
continue;
/* Architecture-specific vmcs/vmcb bits */
hvm_funcs.save_cpu_ctxt(v, &ctxt);
hvm_get_segment_register(v, x86_seg_idtr, &seg);
ctxt.idtr_limit = seg.limit;
ctxt.idtr_base = seg.base;
hvm_get_segment_register(v, x86_seg_gdtr, &seg);
ctxt.gdtr_limit = seg.limit;
ctxt.gdtr_base = seg.base;
hvm_get_segment_register(v, x86_seg_cs, &seg);
ctxt.cs_sel = seg.sel;
ctxt.cs_limit = seg.limit;
ctxt.cs_base = seg.base;
ctxt.cs_arbytes = seg.attr.bytes;
hvm_get_segment_register(v, x86_seg_ds, &seg);
ctxt.ds_sel = seg.sel;
ctxt.ds_limit = seg.limit;
ctxt.ds_base = seg.base;
ctxt.ds_arbytes = seg.attr.bytes;
hvm_get_segment_register(v, x86_seg_es, &seg);
ctxt.es_sel = seg.sel;
ctxt.es_limit = seg.limit;
ctxt.es_base = seg.base;
ctxt.es_arbytes = seg.attr.bytes;
hvm_get_segment_register(v, x86_seg_ss, &seg);
ctxt.ss_sel = seg.sel;
ctxt.ss_limit = seg.limit;
ctxt.ss_base = seg.base;
ctxt.ss_arbytes = seg.attr.bytes;
hvm_get_segment_register(v, x86_seg_fs, &seg);
ctxt.fs_sel = seg.sel;
ctxt.fs_limit = seg.limit;
ctxt.fs_base = seg.base;
ctxt.fs_arbytes = seg.attr.bytes;
hvm_get_segment_register(v, x86_seg_gs, &seg);
ctxt.gs_sel = seg.sel;
ctxt.gs_limit = seg.limit;
ctxt.gs_base = seg.base;
ctxt.gs_arbytes = seg.attr.bytes;
hvm_get_segment_register(v, x86_seg_tr, &seg);
ctxt.tr_sel = seg.sel;
ctxt.tr_limit = seg.limit;
ctxt.tr_base = seg.base;
ctxt.tr_arbytes = seg.attr.bytes;
hvm_get_segment_register(v, x86_seg_ldtr, &seg);
ctxt.ldtr_sel = seg.sel;
ctxt.ldtr_limit = seg.limit;
ctxt.ldtr_base = seg.base;
ctxt.ldtr_arbytes = seg.attr.bytes;
vc = &v->arch.guest_context;
if ( v->fpu_initialised )
memcpy(ctxt.fpu_regs, &vc->fpu_ctxt, sizeof(ctxt.fpu_regs));
else
memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs));
ctxt.rax = vc->user_regs.eax;
ctxt.rbx = vc->user_regs.ebx;
ctxt.rcx = vc->user_regs.ecx;
ctxt.rdx = vc->user_regs.edx;
ctxt.rbp = vc->user_regs.ebp;
ctxt.rsi = vc->user_regs.esi;
ctxt.rdi = vc->user_regs.edi;
ctxt.rsp = vc->user_regs.esp;
ctxt.rip = vc->user_regs.eip;
ctxt.rflags = vc->user_regs.eflags;
#ifdef __x86_64__
ctxt.r8 = vc->user_regs.r8;
ctxt.r9 = vc->user_regs.r9;
ctxt.r10 = vc->user_regs.r10;
ctxt.r11 = vc->user_regs.r11;
ctxt.r12 = vc->user_regs.r12;
ctxt.r13 = vc->user_regs.r13;
ctxt.r14 = vc->user_regs.r14;
ctxt.r15 = vc->user_regs.r15;
#endif
ctxt.dr0 = vc->debugreg[0];
ctxt.dr1 = vc->debugreg[1];
ctxt.dr2 = vc->debugreg[2];
ctxt.dr3 = vc->debugreg[3];
ctxt.dr6 = vc->debugreg[6];
ctxt.dr7 = vc->debugreg[7];
if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
return 1;
}
return 0;
}
static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
int vcpuid, rc;
struct vcpu *v;
struct hvm_hw_cpu ctxt;
struct segment_register seg;
struct vcpu_guest_context *vc;
/* Which vcpu is this? */
vcpuid = hvm_load_instance(h);
    if ( vcpuid >= MAX_VIRT_CPUS || (v = d->vcpu[vcpuid]) == NULL )
{
gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
return -EINVAL;
}
vc = &v->arch.guest_context;
/* Need to init this vcpu before loading its contents */
    rc = 0;
    domain_lock(d);
    if ( !v->is_initialised )
        rc = boot_vcpu(d, vcpuid, vc);
    domain_unlock(d);
    if ( rc != 0 )
        return rc;
if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
return -EINVAL;
/* Sanity check some control registers. */
if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) ||
!(ctxt.cr0 & X86_CR0_ET) ||
((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) )
{
gdprintk(XENLOG_ERR, "HVM restore: bad CR0 0x%"PRIx64"\n",
ctxt.cr0);
return -EINVAL;
}
if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS )
{
gdprintk(XENLOG_ERR, "HVM restore: bad CR4 0x%"PRIx64"\n",
ctxt.cr4);
return -EINVAL;
}
if ( (ctxt.msr_efer & ~(EFER_FFXSE | EFER_LME | EFER_LMA |
EFER_NX | EFER_SCE)) ||
((sizeof(long) != 8) && (ctxt.msr_efer & EFER_LME)) ||
(!cpu_has_nx && (ctxt.msr_efer & EFER_NX)) ||
(!cpu_has_syscall && (ctxt.msr_efer & EFER_SCE)) ||
(!cpu_has_ffxsr && (ctxt.msr_efer & EFER_FFXSE)) ||
((ctxt.msr_efer & (EFER_LME|EFER_LMA)) == EFER_LMA) )
{
gdprintk(XENLOG_ERR, "HVM restore: bad EFER 0x%"PRIx64"\n",
ctxt.msr_efer);
return -EINVAL;
}
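    /*
     * The last clause of the EFER check above enforces the architectural
     * consistency rule that EFER.LMA cannot be set without EFER.LME; the
     * (sizeof(long) != 8) test additionally rejects long-mode images on a
     * 32-bit hypervisor build.
     */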
/* Older Xen versions used to save the segment arbytes directly
* from the VMCS on Intel hosts. Detect this and rearrange them
* into the struct segment_register format. */
#define UNFOLD_ARBYTES(_r) \
if ( (_r & 0xf000) && !(_r & 0x0f00) ) \
_r = ((_r & 0xff) | ((_r >> 4) & 0xf00))
UNFOLD_ARBYTES(ctxt.cs_arbytes);
UNFOLD_ARBYTES(ctxt.ds_arbytes);
UNFOLD_ARBYTES(ctxt.es_arbytes);
UNFOLD_ARBYTES(ctxt.fs_arbytes);
UNFOLD_ARBYTES(ctxt.gs_arbytes);
UNFOLD_ARBYTES(ctxt.ss_arbytes);
UNFOLD_ARBYTES(ctxt.tr_arbytes);
UNFOLD_ARBYTES(ctxt.ldtr_arbytes);
#undef UNFOLD_ARBYTES
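    /*
     * Worked example of the unfolding above (value is illustrative): a
     * VMX-style access-rights word 0xc09b (G=1, D/B=1, P=1, DPL=0, S=1,
     * type=0xb) has its upper nibble at bits 12-15 with bits 8-11 clear, and
     * is rewritten to the packed form 0xc9b used by struct segment_register.
     */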
/* Architecture-specific vmcs/vmcb bits */
if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
return -EINVAL;
seg.limit = ctxt.idtr_limit;
seg.base = ctxt.idtr_base;
hvm_set_segment_register(v, x86_seg_idtr, &seg);
seg.limit = ctxt.gdtr_limit;
seg.base = ctxt.gdtr_base;
hvm_set_segment_register(v, x86_seg_gdtr, &seg);
seg.sel = ctxt.cs_sel;
seg.limit = ctxt.cs_limit;
seg.base = ctxt.cs_base;
seg.attr.bytes = ctxt.cs_arbytes;
hvm_set_segment_register(v, x86_seg_cs, &seg);
seg.sel = ctxt.ds_sel;
seg.limit = ctxt.ds_limit;
seg.base = ctxt.ds_base;
seg.attr.bytes = ctxt.ds_arbytes;
hvm_set_segment_register(v, x86_seg_ds, &seg);
seg.sel = ctxt.es_sel;
seg.limit = ctxt.es_limit;
seg.base = ctxt.es_base;
seg.attr.bytes = ctxt.es_arbytes;
hvm_set_segment_register(v, x86_seg_es, &seg);
seg.sel = ctxt.ss_sel;
seg.limit = ctxt.ss_limit;
seg.base = ctxt.ss_base;
seg.attr.bytes = ctxt.ss_arbytes;
hvm_set_segment_register(v, x86_seg_ss, &seg);
seg.sel = ctxt.fs_sel;
seg.limit = ctxt.fs_limit;
seg.base = ctxt.fs_base;
seg.attr.bytes = ctxt.fs_arbytes;
hvm_set_segment_register(v, x86_seg_fs, &seg);
seg.sel = ctxt.gs_sel;
seg.limit = ctxt.gs_limit;
seg.base = ctxt.gs_base;
seg.attr.bytes = ctxt.gs_arbytes;
hvm_set_segment_register(v, x86_seg_gs, &seg);
seg.sel = ctxt.tr_sel;
seg.limit = ctxt.tr_limit;
seg.base = ctxt.tr_base;
seg.attr.bytes = ctxt.tr_arbytes;
hvm_set_segment_register(v, x86_seg_tr, &seg);
seg.sel = ctxt.ldtr_sel;
seg.limit = ctxt.ldtr_limit;
seg.base = ctxt.ldtr_base;
seg.attr.bytes = ctxt.ldtr_arbytes;
hvm_set_segment_register(v, x86_seg_ldtr, &seg);
memcpy(&vc->fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
vc->user_regs.eax = ctxt.rax;
vc->user_regs.ebx = ctxt.rbx;
vc->user_regs.ecx = ctxt.rcx;
vc->user_regs.edx = ctxt.rdx;
vc->user_regs.ebp = ctxt.rbp;
vc->user_regs.esi = ctxt.rsi;
vc->user_regs.edi = ctxt.rdi;
vc->user_regs.esp = ctxt.rsp;
vc->user_regs.eip = ctxt.rip;
vc->user_regs.eflags = ctxt.rflags | 2;
#ifdef __x86_64__
vc->user_regs.r8 = ctxt.r8;
vc->user_regs.r9 = ctxt.r9;
vc->user_regs.r10 = ctxt.r10;
vc->user_regs.r11 = ctxt.r11;
vc->user_regs.r12 = ctxt.r12;
vc->user_regs.r13 = ctxt.r13;
vc->user_regs.r14 = ctxt.r14;
vc->user_regs.r15 = ctxt.r15;
#endif
vc->debugreg[0] = ctxt.dr0;
vc->debugreg[1] = ctxt.dr1;
vc->debugreg[2] = ctxt.dr2;
vc->debugreg[3] = ctxt.dr3;
vc->debugreg[6] = ctxt.dr6;
vc->debugreg[7] = ctxt.dr7;