x86/AMD: Add support for AMD's OSVW feature in guests.

In some cases guests should not provide workarounds for errata even when the physical processor is affected. For example, because of erratum 400 on family 10h processors a Linux guest will read an MSR (resulting in VMEXIT) before going to idle in order to avoid getting stuck in a non-C0 state. This is not necessary: HLT and IO instructions are intercepted and therefore there is no reason for erratum 400 workaround in the guest. This patch allows us to present a guest with certain errata as fixed, regardless of the state of actual hardware. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@amd.com> Acked-by: Christoph Egger <Christoph.Egger@amd.com> Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Keir Fraser <keir@xen.org> Committed-by: Jan Beulich <jbeulich@suse.com>
author: Boris Ostrovsky <boris.ostrovsky@amd.com> 2012-02-07 15:05:19 +0100
committer: Boris Ostrovsky <boris.ostrovsky@amd.com> 2012-02-07 15:05:19 +0100
commit: 8cbb5278e034e5d7878f7a6a7d6987e5a7acf986 (patch)
tree: ca2b3b44237ca4631b489603e45ad6c148a3164b
parent: 62698cdaf331db0ba7f621040b5c63eaacf6c4a9 (diff)
download: xen-8cbb5278e034e5d7878f7a6a7d6987e5a7acf986.tar.gz
xen-8cbb5278e034e5d7878f7a6a7d6987e5a7acf986.tar.bz2
xen-8cbb5278e034e5d7878f7a6a7d6987e5a7acf986.zip
10 files changed, 207 insertions, 7 deletions
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index db14b84dfc..d8a910a6e4 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy(
                     bitmaskof(X86_FEATURE_SSE4A) |
                     bitmaskof(X86_FEATURE_MISALIGNSSE) |
                     bitmaskof(X86_FEATURE_3DNOWPREFETCH) |
+                    bitmaskof(X86_FEATURE_OSVW) |
                     bitmaskof(X86_FEATURE_XOP) |
                     bitmaskof(X86_FEATURE_FMA4) |
                     bitmaskof(X86_FEATURE_TBM) |
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index b5a5933f46..d89b3a2d6d 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -83,6 +83,10 @@ static DEFINE_PER_CPU_READ_MOSTLY(void *, root_vmcb);
 
 static bool_t amd_erratum383_found __read_mostly;
 
+/* OSVW bits */
+static uint64_t osvw_length, osvw_status;
+static DEFINE_SPINLOCK(osvw_lock);
+
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
 {
     struct vcpu *curr = current;
@@ -902,6 +906,69 @@ static void svm_do_resume(struct vcpu *v)
     reset_stack_and_jump(svm_asm_do_resume);
 }
 
+static void svm_guest_osvw_init(struct vcpu *vcpu)
+{
+    if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+        return;
+
+    /*
+     * Guests should see errata 400 and 415 as fixed (assuming that
+     * HLT and IO instructions are intercepted).
+     */
+    vcpu->arch.hvm_svm.osvw.length = (osvw_length >= 3) ? osvw_length : 3;
+    vcpu->arch.hvm_svm.osvw.status = osvw_status & ~(6ULL);
+
+    /*
+     * By increasing VCPU's osvw.length to 3 we are telling the guest that
+     * all osvw.status bits inside that length, including bit 0 (which is
+     * reserved for erratum 298), are valid. However, if host processor's
+     * osvw_len is 0 then osvw_status[0] carries no information. We need to
+     * be conservative here and therefore we tell the guest that erratum 298
+     * is present (because we really don't know).
+     */
+    if ( osvw_length == 0 && boot_cpu_data.x86 == 0x10 )
+        vcpu->arch.hvm_svm.osvw.status |= 1;
+}
+
+void svm_host_osvw_reset()
+{
+    spin_lock(&osvw_lock);
+
+    osvw_length = 64; /* One register (MSRC001_0141) worth of errata */
+    osvw_status = 0;
+
+    spin_unlock(&osvw_lock);
+}
+
+void svm_host_osvw_init()
+{
+    spin_lock(&osvw_lock);
+
+    /*
+     * Get OSVW bits. If bits are not the same on different processors then
+     * choose the worst case (i.e. if erratum is present on one processor and
+     * not on another assume that the erratum is present everywhere).
+     */
+    if ( test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability) )
+    {
+        uint64_t len, status;
+
+        if ( rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) ||
+             rdmsr_safe(MSR_AMD_OSVW_STATUS, status) )
+            len = status = 0;
+
+        if (len < osvw_length)
+            osvw_length = len;
+
+        osvw_status |= status;
+        osvw_status &= (1ULL << osvw_length) - 1;
+    }
+    else
+        osvw_length = osvw_status = 0;
+
+    spin_unlock(&osvw_lock);
+}
+
 static int svm_domain_initialise(struct domain *d)
 {
     return 0;
@@ -930,6 +997,9 @@ static int svm_vcpu_initialise(struct vcpu *v)
     }
 
     vpmu_initialise(v);
+
+    svm_guest_osvw_init(v);
+
     return 0;
 }
 
@@ -1044,6 +1114,27 @@ static void svm_init_erratum_383(struct cpuinfo_x86 *c)
     }
 }
 
+static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, bool_t read)
+{
+    uint eax, ebx, ecx, edx;
+
+    /* Guest OSVW support */
+    hvm_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+    if ( !test_bit((X86_FEATURE_OSVW & 31), &ecx) )
+        return -1;
+
+    if ( read )
+    {
+        if (msr == MSR_AMD_OSVW_ID_LENGTH)
+            *val = v->arch.hvm_svm.osvw.length;
+        else
+            *val = v->arch.hvm_svm.osvw.status;
+    }
+    /* Writes are ignored */
+
+    return 0;
+}
+
 static int svm_cpu_up(void)
 {
     uint64_t msr_content;
@@ -1094,6 +1185,9 @@ static int svm_cpu_up(void)
     }
 #endif
 
+    /* Initialize OSVW bits to be used by guests */
+    svm_host_osvw_init();
+
     return 0;
 }
 
@@ -1104,6 +1198,8 @@ struct hvm_function_table * __init start_svm(void)
     if ( !test_bit(X86_FEATURE_SVM, &boot_cpu_data.x86_capability) )
         return NULL;
 
+    svm_host_osvw_reset();
+
     if ( svm_cpu_up() )
     {
         printk("SVM: failed to initialise.\n");
@@ -1388,6 +1484,13 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
         vpmu_do_rdmsr(msr, msr_content);
         break;
 
+    case MSR_AMD_OSVW_ID_LENGTH:
+    case MSR_AMD_OSVW_STATUS:
+        ret = svm_handle_osvw(v, msr, msr_content, 1);
+        if ( ret < 0 )
+            goto gpf;
+        break;
+
     default:
         ret = nsvm_rdmsr(v, msr, msr_content);
         if ( ret < 0 )
@@ -1512,6 +1615,13 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
          */
         break;
 
+    case MSR_AMD_OSVW_ID_LENGTH:
+    case MSR_AMD_OSVW_STATUS:
+        ret = svm_handle_osvw(v, msr, &msr_content, 0);
+        if ( ret < 0 )
+            goto gpf;
+        break;
+
     default:
         ret = nsvm_wrmsr(v, msr, msr_content);
         if ( ret < 0 )
diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
index c3a8472e3b..bdda3f5f4d 100644
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -218,6 +218,16 @@ int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len)
     info->error = 0;
     info->cpu = cpumask_first(&cpu_online_map);
 
+    if ( microcode_ops->start_update )
+    {
+        ret = microcode_ops->start_update();
+        if ( ret != 0 )
+        {
+            xfree(info);
+            return ret;
+        }
+    }
+
     return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
 }
 
@@ -240,6 +250,12 @@ static int __init microcode_init(void)
     if ( !data )
         return -ENOMEM;
 
+    if ( microcode_ops->start_update && microcode_ops->start_update() != 0 )
+    {
+        ucode_mod_map(NULL);
+        return 0;
+    }
+
     softirq_tasklet_init(&tasklet, _do_microcode_update, (unsigned long)data);
 
     for_each_online_cpu ( cpu )
diff --git a/xen/arch/x86/microcode_amd.c b/xen/arch/x86/microcode_amd.c
index 97846775cd..ef6f134e23 100644
--- a/xen/arch/x86/microcode_amd.c
+++ b/xen/arch/x86/microcode_amd.c
@@ -25,6 +25,7 @@
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/microcode.h>
+#include <asm/hvm/svm/svm.h>
 
 struct equiv_cpu_entry {
     uint32_t installed_cpu;
@@ -71,6 +72,7 @@ struct mpbhdr {
 /* serialize access to the physical write */
 static DEFINE_SPINLOCK(microcode_update_lock);
 
+/* See comment in start_update() for cases when this routine fails */
 static int collect_cpu_info(int cpu, struct cpu_signature *csig)
 {
     struct cpuinfo_x86 *c = &cpu_data[cpu];
@@ -287,7 +289,8 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
     {
         printk(KERN_ERR "microcode: error! Wrong "
                "microcode patch file magic\n");
-        return -EINVAL;
+        error = -EINVAL;
+        goto out;
     }
 
     mc_amd = xmalloc(struct microcode_amd);
@@ -295,7 +298,8 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
     {
         printk(KERN_ERR "microcode: error! "
                "Can not allocate memory for microcode patch\n");
-        return -ENOMEM;
+        error = -ENOMEM;
+        goto out;
     }
 
     error = install_equiv_cpu_table(mc_amd, buf, &offset);
@@ -303,7 +307,8 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
     {
         xfree(mc_amd);
         printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
-        return -EINVAL;
+        error = -EINVAL;
+        goto out;
     }
 
     mc_old = uci->mc.mc_amd;
@@ -337,13 +342,19 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize)
     /* On success keep the microcode patch for
      * re-apply on resume.
      */
-    if (error == 1)
+    if ( error == 1 )
     {
         xfree(mc_old);
-        return 0;
+        error = 0;
+    }
+    else
+    {
+        xfree(mc_amd);
+        uci->mc.mc_amd = mc_old;
     }
-    xfree(mc_amd);
-    uci->mc.mc_amd = mc_old;
+
+  out:
+    svm_host_osvw_init();
 
     return error;
 }
@@ -395,11 +406,28 @@ err1:
     return -ENOMEM;
 }
 
+static int start_update(void)
+{
+    /*
+     * We assume here that svm_host_osvw_init() will be called on each cpu (from
+     * cpu_request_microcode()).
+     *
+     * Note that if collect_cpu_info() returns an error then
+     * cpu_request_microcode() will not invoked thus leaving OSVW bits not
+     * updated. Currently though collect_cpu_info() will not fail on processors
+     * supporting OSVW so we will not deal with this possibility.
+     */
+    svm_host_osvw_reset();
+
+    return 0;
+}
+
 static const struct microcode_ops microcode_amd_ops = {
     .microcode_resume_match           = microcode_resume_match,
     .cpu_request_microcode            = cpu_request_microcode,
     .collect_cpu_info                 = collect_cpu_info,
     .apply_microcode                  = apply_microcode,
+    .start_update                     = start_update,
 };
 
 static __init int microcode_init_amd(void)
diff --git a/xen/arch/x86/platform_hypercall.c b/xen/arch/x86/platform_hypercall.c
index f9a836a8bd..d5f978ac31 100644
--- a/xen/arch/x86/platform_hypercall.c
+++ b/xen/arch/x86/platform_hypercall.c
@@ -166,7 +166,23 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
             break;
 
         guest_from_compat_handle(data, op->u.microcode.data);
+
+        /*
+         * alloc_vcpu() will access data which is modified during
+         * microcode update
+         */
+        while ( !spin_trylock(&vcpu_alloc_lock) )
+        {
+            if ( hypercall_preempt_check() )
+            {
+                ret = hypercall_create_continuation(
+                    __HYPERVISOR_platform_op, "h", u_xenpf_op);
+                goto out;
+            }
+        }
+
         ret = microcode_update(data, op->u.microcode.length);
+        spin_unlock(&vcpu_alloc_lock);
     }
     break;
 
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 904fb45e8f..9f1a9ada8c 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -29,6 +29,7 @@
 #include <xsm/xsm.h>
 
 static DEFINE_SPINLOCK(domctl_lock);
+DEFINE_SPINLOCK(vcpu_alloc_lock);
 
 int cpumask_to_xenctl_cpumap(
     struct xenctl_cpumap *xenctl_cpumap, const cpumask_t *cpumask)
@@ -506,6 +507,20 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
         /* Needed, for example, to ensure writable p.t. state is synced. */
         domain_pause(d);
 
+        /*
+         * Certain operations (e.g. CPU microcode updates) modify data which is
+         * used during VCPU allocation/initialization
+         */
+        while ( !spin_trylock(&vcpu_alloc_lock) )
+        {
+            if ( hypercall_preempt_check() )
+            {
+                ret =  hypercall_create_continuation(
+                    __HYPERVISOR_domctl, "h", u_domctl);
+                goto maxvcpu_out_novcpulock;
+            }
+        }
+
         /* We cannot reduce maximum VCPUs. */
         ret = -EINVAL;
         if ( (max < d->max_vcpus) && (d->vcpu[max] != NULL) )
@@ -555,6 +570,9 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
         ret = 0;
 
     maxvcpu_out:
+        spin_unlock(&vcpu_alloc_lock);
+
+    maxvcpu_out_novcpulock:
         domain_unpause(d);
         rcu_unlock_domain(d);
     }
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index 52b05e1ad3..8a9e66014a 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -98,4 +98,7 @@ extern u32 svm_feature_flags;
                                   ~TSC_RATIO_RSVD_BITS )
 #define vcpu_tsc_ratio(v)       TSC_RATIO((v)->domain->arch.tsc_khz, cpu_khz)
 
+extern void svm_host_osvw_reset(void);
+extern void svm_host_osvw_init(void);
+
 #endif /* __ASM_X86_HVM_SVM_H__ */
diff --git a/xen/include/asm-x86/hvm/svm/vmcb.h b/xen/include/asm-x86/hvm/svm/vmcb.h
index 7e6ddad437..d5cc0e2571 100644
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -516,6 +516,12 @@ struct arch_svm_struct {
     
     /* AMD lightweight profiling MSR */
     uint64_t guest_lwp_cfg;
+
+    /* OSVW MSRs */
+    struct {
+        u64 length;
+        u64 status;
+    } osvw;
 };
 
 struct vmcb_struct *alloc_vmcb(void);
diff --git a/xen/include/asm-x86/microcode.h b/xen/include/asm-x86/microcode.h
index cb71661187..00a672a28e 100644
--- a/xen/include/asm-x86/microcode.h
+++ b/xen/include/asm-x86/microcode.h
@@ -11,6 +11,7 @@ struct microcode_ops {
     int (*cpu_request_microcode)(int cpu, const void *buf, size_t size);
     int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
     int (*apply_microcode)(int cpu);
+    int (*start_update)(void);
 };
 
 struct cpu_signature {
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index eed73f49a9..d4ac50ff0f 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -69,6 +69,7 @@ void arch_dump_domain_info(struct domain *d);
 
 void arch_vcpu_reset(struct vcpu *v);
 
+extern spinlock_t vcpu_alloc_lock;
 bool_t domctl_lock_acquire(void);
 void domctl_lock_release(void);
author	Boris Ostrovsky <boris.ostrovsky@amd.com>	2012-02-07 15:05:19 +0100
committer	Boris Ostrovsky <boris.ostrovsky@amd.com>	2012-02-07 15:05:19 +0100
commit	8cbb5278e034e5d7878f7a6a7d6987e5a7acf986 (patch)
tree	ca2b3b44237ca4631b489603e45ad6c148a3164b
parent	62698cdaf331db0ba7f621040b5c63eaacf6c4a9 (diff)
download	xen-8cbb5278e034e5d7878f7a6a7d6987e5a7acf986.tar.gz xen-8cbb5278e034e5d7878f7a6a7d6987e5a7acf986.tar.bz2 xen-8cbb5278e034e5d7878f7a6a7d6987e5a7acf986.zip