diff options
-rw-r--r-- | tools/examples/xmexample.hvm | 11 | ||||
-rw-r--r-- | tools/libxc/xc_domain.c | 49 | ||||
-rw-r--r-- | tools/libxc/xc_domain_restore.c | 12 | ||||
-rw-r--r-- | tools/libxc/xc_domain_save.c | 31 | ||||
-rw-r--r-- | tools/libxc/xenctrl.h | 14 | ||||
-rw-r--r-- | tools/python/xen/lowlevel/xc/xc.c | 17 | ||||
-rw-r--r-- | tools/python/xen/xend/XendConfig.py | 6 | ||||
-rw-r--r-- | tools/python/xen/xend/XendDomainInfo.py | 6 | ||||
-rw-r--r-- | tools/python/xen/xm/create.py | 10 | ||||
-rw-r--r-- | tools/python/xen/xm/xenapi_create.py | 2 | ||||
-rw-r--r-- | xen/arch/x86/domain.c | 2 | ||||
-rw-r--r-- | xen/arch/x86/domctl.c | 34 | ||||
-rw-r--r-- | xen/arch/x86/hvm/hvm.c | 2 | ||||
-rw-r--r-- | xen/arch/x86/time.c | 275 | ||||
-rw-r--r-- | xen/arch/x86/traps.c | 15 | ||||
-rw-r--r-- | xen/include/asm-x86/domain.h | 18 | ||||
-rw-r--r-- | xen/include/asm-x86/msr.h | 2 | ||||
-rw-r--r-- | xen/include/asm-x86/processor.h | 4 | ||||
-rw-r--r-- | xen/include/asm-x86/time.h | 28 | ||||
-rw-r--r-- | xen/include/public/domctl.h | 23 |
20 files changed, 484 insertions, 77 deletions
diff --git a/tools/examples/xmexample.hvm b/tools/examples/xmexample.hvm index e5ae97f966..09edda6aac 100644 --- a/tools/examples/xmexample.hvm +++ b/tools/examples/xmexample.hvm @@ -178,11 +178,16 @@ stdvga=0 serial='pty' #---------------------------------------------------------------------------- -# tsc_native : TSC mode (0=emulate TSC, 1=native TSC) +# tsc_mode : TSC mode (0=default, 1=native TSC, 2=never emulate, 3=pvrdtscp) # emulate TSC provides synced TSC for all vcpus, but lose perfomrance. # native TSC leverages hardware's TSC(no perf loss), but vcpu's TSC may lose -# sync due to hardware's unreliable/unsynced TSC between CPUs. -tsc_native=1 +# sync due to hardware's unreliable/unsynced TSC between CPUs. +# default intelligently uses native TSC on machines where it is safe, but +# switches to emulated if necessary after save/restore/migration +# pvrdtscp is for intelligent apps that use special Xen-only paravirtualized +# cpuid instructions to obtain offset/scaling/migration info and maximize +# performance within pools of machines that support the rdtscp instruction +tsc_mode=0 #----------------------------------------------------------------------------- # Qemu Monitor, default is disable diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c index f872583926..54a5914e9b 100644 --- a/tools/libxc/xc_domain.c +++ b/tools/libxc/xc_domain.c @@ -466,24 +466,61 @@ int xc_domain_set_time_offset(int xc_handle, return do_domctl(xc_handle, &domctl); } -int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native) +int xc_domain_disable_migrate(int xc_handle, uint32_t domid) { DECLARE_DOMCTL; - domctl.cmd = XEN_DOMCTL_set_tsc_native; + domctl.cmd = XEN_DOMCTL_disable_migrate; domctl.domain = (domid_t)domid; - domctl.u.set_tsc_native.is_native = is_native; + domctl.u.disable_migrate.disable = 1; return do_domctl(xc_handle, &domctl); } -int xc_domain_disable_migrate(int xc_handle, uint32_t domid) +int xc_domain_set_tsc_info(int xc_handle, + 
uint32_t domid, + uint32_t tsc_mode, + uint64_t elapsed_nsec, + uint32_t gtsc_khz, + uint32_t incarnation) { DECLARE_DOMCTL; - domctl.cmd = XEN_DOMCTL_disable_migrate; + domctl.cmd = XEN_DOMCTL_settscinfo; domctl.domain = (domid_t)domid; - domctl.u.disable_migrate.disable = 1; + domctl.u.tsc_info.info.tsc_mode = tsc_mode; + domctl.u.tsc_info.info.elapsed_nsec = elapsed_nsec; + domctl.u.tsc_info.info.gtsc_khz = gtsc_khz; + domctl.u.tsc_info.info.incarnation = incarnation; return do_domctl(xc_handle, &domctl); } +int xc_domain_get_tsc_info(int xc_handle, + uint32_t domid, + uint32_t *tsc_mode, + uint64_t *elapsed_nsec, + uint32_t *gtsc_khz, + uint32_t *incarnation) +{ + int rc; + DECLARE_DOMCTL; + xen_guest_tsc_info_t info = { 0 }; + + domctl.cmd = XEN_DOMCTL_gettscinfo; + domctl.domain = (domid_t)domid; + set_xen_guest_handle(domctl.u.tsc_info.out_info, &info); + if ( (rc = lock_pages(&info, sizeof(info))) != 0 ) + return rc; + rc = do_domctl(xc_handle, &domctl); + if ( rc == 0 ) + { + *tsc_mode = info.tsc_mode; + *elapsed_nsec = info.elapsed_nsec; + *gtsc_khz = info.gtsc_khz; + *incarnation = info.incarnation; + } + unlock_pages(&info,sizeof(info)); + return rc; +} + + int xc_domain_memory_increase_reservation(int xc_handle, uint32_t domid, unsigned long nr_extents, diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c index 01d7924f07..cf6a63c25a 100644 --- a/tools/libxc/xc_domain_restore.c +++ b/tools/libxc/xc_domain_restore.c @@ -1084,6 +1084,18 @@ static int pagebuf_get_one(pagebuf_t* buf, int fd, int xch, uint32_t dom) return -1; } return pagebuf_get_one(buf, fd, xch, dom); + } else if ( count == -7 ) { + uint32_t tsc_mode, khz, incarn; + uint64_t nsec; + if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) || + read_exact(fd, &nsec, sizeof(uint64_t)) || + read_exact(fd, &khz, sizeof(uint32_t)) || + read_exact(fd, &incarn, sizeof(uint32_t)) || + xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) { + ERROR("error 
reading/restoring tsc info"); + return -1; + } + return pagebuf_get_one(buf, fd, xch, dom); } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) { ERROR("Max batch size exceeded (%d). Giving up.", count); return -1; diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c index 30c1b6d3a4..9d706a92d3 100644 --- a/tools/libxc/xc_domain_save.c +++ b/tools/libxc/xc_domain_save.c @@ -841,6 +841,24 @@ static xen_pfn_t *map_and_save_p2m_table(int xc_handle, return success ? p2m : NULL; } +/* must be done AFTER suspend_and_state() */ +static int save_tsc_info(int xc_handle, uint32_t dom, int io_fd) +{ + int marker = -7; + uint32_t tsc_mode, khz, incarn; + uint64_t nsec; + + if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode, + &nsec, &khz, &incarn) < 0 || + write_exact(io_fd, &marker, sizeof(marker)) || + write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) || + write_exact(io_fd, &nsec, sizeof(nsec)) || + write_exact(io_fd, &khz, sizeof(khz)) || + write_exact(io_fd, &incarn, sizeof(incarn)) ) + return -1; + return 0; +} + int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, uint32_t max_factor, uint32_t flags, struct save_callbacks* callbacks, @@ -1100,6 +1118,12 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, goto out; } + if ( !live && save_tsc_info(xc_handle, dom, io_fd) < 0 ) + { + ERROR("Error when writing to state file (tsc)"); + goto out; + } + copypages: #define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), (len)) #ifdef ratewrite @@ -1458,6 +1482,13 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, goto out; } + if ( save_tsc_info(xc_handle, dom, io_fd) < 0 ) + { + ERROR("Error when writing to state file (tsc)"); + goto out; + } + + } if ( xc_shadow_control(xc_handle, dom, diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index d6ecaf399c..9fc05bb30b 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -628,7 
+628,19 @@ int xc_domain_set_time_offset(int xc_handle, uint32_t domid, int32_t time_offset_seconds); -int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native); +int xc_domain_set_tsc_info(int xc_handle, + uint32_t domid, + uint32_t tsc_mode, + uint64_t elapsed_nsec, + uint32_t gtsc_khz, + uint32_t incarnation); + +int xc_domain_get_tsc_info(int xc_handle, + uint32_t domid, + uint32_t *tsc_mode, + uint64_t *elapsed_nsec, + uint32_t *gtsc_khz, + uint32_t *incarnation); int xc_domain_disable_migrate(int xc_handle, uint32_t domid); diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c index 7eaf63b94e..aa780aa303 100644 --- a/tools/python/xen/lowlevel/xc/xc.c +++ b/tools/python/xen/lowlevel/xc/xc.c @@ -1486,14 +1486,14 @@ static PyObject *pyxc_domain_set_time_offset(XcObject *self, PyObject *args) return zero; } -static PyObject *pyxc_domain_set_tsc_native(XcObject *self, PyObject *args) +static PyObject *pyxc_domain_set_tsc_info(XcObject *self, PyObject *args) { - uint32_t dom, is_native; + uint32_t dom, tsc_mode; - if (!PyArg_ParseTuple(args, "ii", &dom, &is_native)) + if (!PyArg_ParseTuple(args, "ii", &dom, &tsc_mode)) return NULL; - if (xc_domain_set_tsc_native(self->xc_handle, dom, is_native) != 0) + if (xc_domain_set_tsc_info(self->xc_handle, dom, tsc_mode, 0, 0, 0) != 0) return pyxc_error_to_exception(); Py_INCREF(zero); @@ -2036,12 +2036,13 @@ static PyMethodDef pyxc_methods[] = { " offset [int]: Time offset from UTC in seconds.\n" "Returns: [int] 0 on success; -1 on error.\n" }, - { "domain_set_tsc_native", - (PyCFunction)pyxc_domain_set_tsc_native, + { "domain_set_tsc_info", + (PyCFunction)pyxc_domain_set_tsc_info, METH_VARARGS, "\n" - "Set a domain's TSC mode (emulate vs native)\n" + "Set a domain's TSC mode\n" " dom [int]: Domain whose TSC mode is being set.\n" - " is_native [int]: 1=native, 0=emulate.\n" + " tsc_mode [int]: 0=default (monotonic, but native where possible)\n" + " 1=always emulate 2=never 
emulate 3=pvrdtscp\n" "Returns: [int] 0 on success; -1 on error.\n" }, { "domain_disable_migrate", diff --git a/tools/python/xen/xend/XendConfig.py b/tools/python/xen/xend/XendConfig.py index 0eadf343d3..3227cd4def 100644 --- a/tools/python/xen/xend/XendConfig.py +++ b/tools/python/xen/xend/XendConfig.py @@ -163,7 +163,7 @@ XENAPI_PLATFORM_CFG_TYPES = { 'vncdisplay': int, 'vnclisten': str, 'timer_mode': int, - 'tsc_native': int, + 'tsc_mode': int, 'vpt_align': int, 'viridian': int, 'vncpasswd': str, @@ -477,8 +477,8 @@ class XendConfig(dict): if not os.path.exists(self['platform']['device_model']): raise VmError("device model '%s' not found" % str(self['platform']['device_model'])) - if 'tsc_native' not in self['platform']: - self['platform']['tsc_native'] = 0 + if 'tsc_mode' not in self['platform']: + self['platform']['tsc_mode'] = 0 if 'nomigrate' not in self['platform']: self['platform']['nomigrate'] = 0 diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py index 592ba6fad8..8198228b9f 100644 --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -2468,9 +2468,9 @@ class XendDomainInfo: self._recreateDom() # Set TSC mode of domain - tsc_native = self.info["platform"].get("tsc_native") - if arch.type == "x86" and tsc_native is not None: - xc.domain_set_tsc_native(self.domid, int(tsc_native)) + tsc_mode = self.info["platform"].get("tsc_mode") + if arch.type == "x86" and tsc_mode is not None: + xc.domain_set_tsc_info(self.domid, int(tsc_mode)) # Set timer configuration of domain timer_mode = self.info["platform"].get("timer_mode") diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py index d6485a59a1..32b19235e1 100644 --- a/tools/python/xen/xm/create.py +++ b/tools/python/xen/xm/create.py @@ -221,9 +221,9 @@ gopts.var('timer_mode', val='TIMER_MODE', use="""Timer mode (0=delay virtual time when ticks are missed; 1=virtual time is always wallclock time.""") 
-gopts.var('tsc_native', val='TSC_NATIVE', +gopts.var('tsc_mode', val='TSC_MODE', fn=set_int, default=0, - use="""TSC mode (0=emulate TSC, 1=native TSC).""") + use="""TSC mode (0=default, 1=always emulate, 2=never emulate, 3=pvrdtscp).""") gopts.var('nomigrate', val='NOMIGRATE', fn=set_int, default=0, @@ -738,8 +738,8 @@ def configure_image(vals): if vals.suppress_spurious_page_faults: config_image.append(['suppress_spurious_page_faults', vals.suppress_spurious_page_faults]) - if vals.tsc_native is not None: - config_image.append(['tsc_native', vals.tsc_native]) + if vals.tsc_mode is not None: + config_image.append(['tsc_mode', vals.tsc_mode]) if vals.nomigrate is not None: config_image.append(['nomigrate', vals.nomigrate]) @@ -1036,7 +1036,7 @@ def make_config(vals): config.append([n, v]) map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory', - 'restart', 'on_poweroff', 'tsc_native', 'nomigrate', + 'restart', 'on_poweroff', 'tsc_mode', 'nomigrate', 'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features', 'on_xend_start', 'on_xend_stop', 'target', 'cpuid', 'cpuid_check', 'machine_address_size', 'suppress_spurious_page_faults']) diff --git a/tools/python/xen/xm/xenapi_create.py b/tools/python/xen/xm/xenapi_create.py index 9cfdb87d7e..4c0177b4aa 100644 --- a/tools/python/xen/xm/xenapi_create.py +++ b/tools/python/xen/xm/xenapi_create.py @@ -1108,7 +1108,7 @@ class sxp2xml: 'pci_msitranslate', 'pci_power_mgmt', 'xen_platform_pci', - 'tsc_native' + 'tsc_mode', 'description', 'nomigrate' ] diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index c851209853..84493aba43 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -520,6 +520,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED; } + /* initialize default tsc behavior in case tools don't */ + tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0); spin_lock_init(&d->arch.vtsc_lock); return 0; diff --git a/xen/arch/x86/domctl.c 
b/xen/arch/x86/domctl.c index 52f3945f06..4b0011a52c 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -1101,9 +1101,10 @@ long arch_do_domctl( } break; - case XEN_DOMCTL_set_tsc_native: + case XEN_DOMCTL_gettscinfo: { struct domain *d; + xen_guest_tsc_info_t info; ret = -ESRCH; d = rcu_lock_domain_by_id(domctl->domain); @@ -1111,9 +1112,34 @@ long arch_do_domctl( break; domain_pause(d); - d->arch.vtsc = !domctl->u.set_tsc_native.is_native; - if ( is_hvm_domain(d) ) - hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d)); + tsc_get_info(d, &info.tsc_mode, + &info.elapsed_nsec, + &info.gtsc_khz, + &info.incarnation); + if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) ) + ret = -EFAULT; + else + ret = 0; + domain_unpause(d); + + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_settscinfo: + { + struct domain *d; + + ret = -ESRCH; + d = rcu_lock_domain_by_id(domctl->domain); + if ( d == NULL ) + break; + + domain_pause(d); + tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode, + domctl->u.tsc_info.info.elapsed_nsec, + domctl->u.tsc_info.info.gtsc_khz, + domctl->u.tsc_info.info.incarnation); domain_unpause(d); rcu_unlock_domain(d); diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index f4d342a52f..ca567638ae 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -1831,7 +1831,7 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) ) return; - if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) ) + if ( cpuid_hypervisor_leaves(input, count, eax, ebx, ecx, edx) ) return; domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx); diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index 982ce7ca52..e3652a9d45 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -34,6 +34,7 @@ #include <asm/hpet.h> #include <io_ports.h> #include <asm/setup.h> /* for early_time_init */ +#include <public/arch-x86/cpuid.h> /* 
opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */ static char __initdata opt_clocksource[10]; @@ -45,10 +46,12 @@ unsigned long pit0_ticks; static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */ static DEFINE_SPINLOCK(wc_lock); +/* moved to <asm/domain.h> struct time_scale { int shift; u32 mul_frac; }; +*/ struct cpu_time { u64 local_tsc_stamp; @@ -150,13 +153,32 @@ static inline u64 scale_delta(u64 delta, struct time_scale *scale) return product; } +#define _TS_SHIFT_IDENTITY 1 +#define _TS_MUL_FRAC_IDENTITY 0x80000000UL +#define _TS_IDENTITY { _TS_SHIFT_IDENTITY, _TS_MUL_FRAC_IDENTITY } +static inline int time_scale_is_identity(struct time_scale *ts) +{ + if ( ts->shift != _TS_SHIFT_IDENTITY ) + return 0; + else if ( ts->mul_frac != _TS_MUL_FRAC_IDENTITY ) + return 0; + return 1; +} + +static inline void set_time_scale_identity(struct time_scale *ts) +{ + ts->shift = _TS_SHIFT_IDENTITY; + ts->mul_frac = _TS_MUL_FRAC_IDENTITY; +} + /* Compute the reciprocal of the given time_scale. 
*/ static inline struct time_scale scale_reciprocal(struct time_scale scale) { struct time_scale reciprocal; u32 dividend; - dividend = 0x80000000u; + ASSERT(scale.mul_frac != 0); + dividend = _TS_MUL_FRAC_IDENTITY; reciprocal.shift = 1 - scale.shift; while ( unlikely(dividend >= scale.mul_frac) ) { @@ -818,6 +840,8 @@ static void __update_vcpu_system_time(struct vcpu *v, int force) struct cpu_time *t; struct vcpu_time_info *u, _u; XEN_GUEST_HANDLE(vcpu_time_info_t) user_u; + struct domain *d = v->domain; + s_time_t tsc_stamp = 0; if ( v->vcpu_info == NULL ) return; @@ -825,20 +849,31 @@ static void __update_vcpu_system_time(struct vcpu *v, int force) t = &this_cpu(cpu_time); u = &vcpu_info(v, time); + if ( d->arch.vtsc ) + { + tsc_stamp = t->stime_local_stamp - d->arch.vtsc_offset; + if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) ) + tsc_stamp = scale_delta(tsc_stamp, &d->arch.ns_to_vtsc); + } + else + tsc_stamp = t->local_tsc_stamp; + + if ( d->arch.tsc_mode == TSC_MODE_PVRDTSCP && + boot_cpu_has(X86_FEATURE_RDTSCP) ) + write_rdtscp_aux(d->arch.incarnation); + /* Don't bother unless timestamps have changed or we are forced. */ - if ( !force && (u->tsc_timestamp == (v->domain->arch.vtsc - ? 
t->stime_local_stamp - : t->local_tsc_stamp)) ) + if ( !force && (u->tsc_timestamp == tsc_stamp) ) return; memset(&_u, 0, sizeof(_u)); - if ( v->domain->arch.vtsc ) + if ( d->arch.vtsc ) { - _u.tsc_timestamp = t->stime_local_stamp; + _u.tsc_timestamp = tsc_stamp; _u.system_time = t->stime_local_stamp; - _u.tsc_to_system_mul = 0x80000000u; - _u.tsc_shift = 1; + _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac; + _u.tsc_shift = d->arch.vtsc_to_ns.shift; } else { @@ -1556,7 +1591,7 @@ static void tsc_check_slave(void *unused) local_irq_enable(); } -static void tsc_check_reliability(void) +void tsc_check_reliability(void) { unsigned int cpu = smp_processor_id(); static DEFINE_SPINLOCK(lock); @@ -1583,57 +1618,245 @@ static void tsc_check_reliability(void) void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs) { s_time_t now = get_s_time(); + struct domain *d = v->domain; - spin_lock(&v->domain->arch.vtsc_lock); + spin_lock(&d->arch.vtsc_lock); if ( guest_kernel_mode(v, regs) ) - v->domain->arch.vtsc_kerncount++; + d->arch.vtsc_kerncount++; else - v->domain->arch.vtsc_usercount++; + d->arch.vtsc_usercount++; - if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 ) - v->domain->arch.vtsc_last = now; + if ( (int64_t)(now - d->arch.vtsc_last) > 0 ) + d->arch.vtsc_last = now; else - now = ++v->domain->arch.vtsc_last; + now = ++d->arch.vtsc_last; - spin_unlock(&v->domain->arch.vtsc_lock); + spin_unlock(&d->arch.vtsc_lock); + + now = now - d->arch.vtsc_offset; + if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) ) + now = scale_delta(now, &d->arch.ns_to_vtsc); regs->eax = (uint32_t)now; regs->edx = (uint32_t)(now >> 32); } +static int host_tsc_is_safe(void) +{ + extern unsigned int max_cstate; + + if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) + return 1; + if ( num_online_cpus() == 1 ) + return 1; + if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && max_cstate <= 2 ) + { + if ( !tsc_check_count ) + tsc_check_reliability(); + if ( tsc_max_warp == 0 ) + return 1; + } + 
return 0; +} + +void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + struct domain *d = current->domain; + struct cpu_time *t; + + t = &this_cpu(cpu_time); + switch ( sub_idx ) + { + case 0: /* features */ + *eax = ( ( (!!d->arch.vtsc) << 0 ) | + ( (!!host_tsc_is_safe()) << 1 ) | + ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) | + 0 ); + *ebx = d->arch.tsc_mode; + *ecx = d->arch.tsc_khz; + *edx = d->arch.incarnation; + break; + case 1: /* pvclock group1 */ /* FIXME are these right? */ + *eax = (uint32_t)t->local_tsc_stamp; + *ebx = (uint32_t)(t->local_tsc_stamp >> 32); + *ecx = t->tsc_scale.mul_frac; + *edx = d->arch.incarnation; + break; + case 2: /* pvclock scaling values */ /* FIXME are these right? */ + *eax = (uint32_t)t->stime_local_stamp; + *ebx = (uint32_t)(t->stime_local_stamp >> 32); + *ecx = t->tsc_scale.shift; + *edx = d->arch.incarnation; + break; + case 3: /* physical cpu_khz */ + *eax = cpu_khz; + *ebx = *ecx = 0; + *edx = d->arch.incarnation; + break; + } +} + +/* + * called to collect tsc-related data only for save file or live + * migrate; called after last rdtsc is done on this incarnation + */ +void tsc_get_info(struct domain *d, uint32_t *tsc_mode, + uint64_t *elapsed_nsec, uint32_t *gtsc_khz, + uint32_t *incarnation) +{ + *incarnation = d->arch.incarnation; + switch ( *tsc_mode = d->arch.tsc_mode ) + { + case TSC_MODE_NEVER_EMULATE: + *elapsed_nsec = *gtsc_khz = 0; + break; + case TSC_MODE_ALWAYS_EMULATE: + *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; + *gtsc_khz = 1000000UL; + break; + case TSC_MODE_DEFAULT: + if ( d->arch.vtsc ) + { + *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; + *gtsc_khz = d->arch.tsc_khz; + } else { + uint64_t tsc = 0; + rdtscll(tsc); + *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns); + *gtsc_khz = cpu_khz; + } + break; + case TSC_MODE_PVRDTSCP: + *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? 
*/ + *gtsc_khz = d->arch.tsc_khz; + break; + } +} + +/* + * This may be called as many as three times for a domain, once when the + * hypervisor creates the domain, once when the toolstack creates the + * domain and, if restoring/migrating, once when saved/migrated values + * are restored. Care must be taken that, if multiple calls occur, + * only the last "sticks" and all are completed before the guest executes + * an rdtsc instruction + */ +void tsc_set_info(struct domain *d, + uint32_t tsc_mode, uint64_t elapsed_nsec, + uint32_t gtsc_khz, uint32_t incarnation) +{ + if ( d->domain_id == 0 || d->domain_id == DOMID_INVALID ) + { + d->arch.vtsc = 0; + return; + } + switch ( d->arch.tsc_mode = tsc_mode ) + { + case TSC_MODE_NEVER_EMULATE: + gdprintk(XENLOG_G_INFO, "%s: never emulating TSC\n",__func__); + d->arch.vtsc = 0; + break; + case TSC_MODE_ALWAYS_EMULATE: + gdprintk(XENLOG_G_INFO, "%s: always emulating TSC\n",__func__); + d->arch.vtsc = 1; + d->arch.vtsc_offset = get_s_time() - elapsed_nsec; + set_time_scale_identity(&d->arch.vtsc_to_ns); + break; + case TSC_MODE_DEFAULT: + d->arch.vtsc_offset = get_s_time() - elapsed_nsec; + if ( (host_tsc_is_safe() && incarnation == 0) || !d->domain_id ) + { + gdprintk(XENLOG_G_INFO, "%s: using safe native TSC\n",__func__); + /* use native TSC if initial host supports it */ + d->arch.vtsc = 0; + d->arch.tsc_khz = gtsc_khz ? 
gtsc_khz : cpu_khz; + set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 ); + set_time_scale_identity(&d->arch.ns_to_vtsc); + } else if ( gtsc_khz != 0 && gtsc_khz != 1000000UL ) { + gdprintk(XENLOG_G_INFO, "%s: safe native TSC on initial host," + "but now using emulation\n",__func__); + /* was native on initial host, now emulated at initial tsc hz*/ + d->arch.vtsc = 1; + d->arch.tsc_khz = gtsc_khz; + set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 ); + d->arch.ns_to_vtsc = + scale_reciprocal(d->arch.vtsc_to_ns); + } else { + gdprintk(XENLOG_G_INFO, "%s: unsafe TSC on initial host," + "using emulation\n",__func__); + d->arch.vtsc = 1; + set_time_scale_identity(&d->arch.vtsc_to_ns); + set_time_scale_identity(&d->arch.ns_to_vtsc); + } + break; + case TSC_MODE_PVRDTSCP: + gdprintk(XENLOG_G_INFO, "%s: using PVRDTSCP\n",__func__); + if ( boot_cpu_has(X86_FEATURE_RDTSCP) && gtsc_khz != 0 ) { + d->arch.vtsc = 0; + set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 ); + } else { + d->arch.vtsc = 1; + d->arch.vtsc_offset = get_s_time() - elapsed_nsec; + set_time_scale_identity(&d->arch.vtsc_to_ns); + } + break; + } + d->arch.incarnation = incarnation + 1; + if ( is_hvm_domain(d) ) + hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d)); +} + /* vtsc may incur measurable performance degradation, diagnose with this */ static void dump_softtsc(unsigned char key) { struct domain *d; int domcnt = 0; + extern unsigned int max_cstate; tsc_check_reliability(); if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) printk("TSC marked as reliable, " "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count); else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) ) - printk("TSC marked as constant but not reliable, " - "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count); - else + { + printk("TSC has constant rate, "); + if (max_cstate <= 2 && tsc_max_warp == 0) + printk("no deep Cstates, passed warp test, deemed reliable, "); + else + printk("deep Cstates possible, so 
not reliable, "); + printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count); + } else printk("TSC not marked as either constant or reliable, " - "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count); + "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count); for_each_domain ( d ) { + if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT ) + continue; + printk("dom%u%s: mode=%d",d->domain_id, + is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode); + if ( d->arch.vtsc_offset ) + printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset); + if ( d->arch.tsc_khz ) + printk(",khz=%"PRIu32"",d->arch.tsc_khz); + if ( d->arch.incarnation ) + printk(",inc=%"PRIu32"",d->arch.incarnation); if ( !d->arch.vtsc ) + { + printk("\n"); continue; + } if ( is_hvm_domain(d) ) - printk("dom%u (hvm) vtsc count: %"PRIu64" total\n", - d->domain_id, d->arch.vtsc_kerncount); + printk(",vtsc count: %"PRIu64" total\n", + d->arch.vtsc_kerncount); else - printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n", - d->domain_id, d->arch.vtsc_kerncount, - d->arch.vtsc_usercount); + printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n", + d->arch.vtsc_kerncount, d->arch.vtsc_usercount); domcnt++; } if ( !domcnt ) - printk("All domains have native TSC\n"); + printk("No domains have emulated TSC\n"); } static struct keyhandler dump_softtsc_keyhandler = { diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index e42420c8a1..174dc25af4 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -679,8 +679,8 @@ int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val) return 1; } -int cpuid_hypervisor_leaves( - uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { struct domain *d = current->domain; /* Optionally shift out of the way of Viridian architectural leaves. 
*/ @@ -693,7 +693,7 @@ int cpuid_hypervisor_leaves( switch ( idx ) { case 0: - *eax = base + 2; /* Largest leaf */ + *eax = base + 3; /* Largest leaf */ *ebx = XEN_CPUID_SIGNATURE_EBX; *ecx = XEN_CPUID_SIGNATURE_ECX; *edx = XEN_CPUID_SIGNATURE_EDX; @@ -717,6 +717,11 @@ int cpuid_hypervisor_leaves( *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD; break; + case 3: + *eax = *ebx = *ecx = *edx = 0; + cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx ); + break; + default: BUG(); } @@ -735,7 +740,7 @@ static void pv_cpuid(struct cpu_user_regs *regs) if ( current->domain->domain_id != 0 ) { - if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) ) + if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) ) domain_cpuid(current->domain, a, c, &a, &b, &c, &d); goto out; } @@ -815,7 +820,7 @@ static void pv_cpuid(struct cpu_user_regs *regs) a = b = c = d = 0; break; default: - (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d); + (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d); break; } diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 3c122b5d44..a80a3ede5c 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -230,6 +230,11 @@ struct domain_mca_msrs spinlock_t lock; }; +struct time_scale { + int shift; + u32 mul_frac; +}; + struct arch_domain { #ifdef CONFIG_X86_64 @@ -298,10 +303,17 @@ struct arch_domain /* For Guest vMCA handling */ struct domain_mca_msrs vmca_msrs; - /* SoftTSC emulation */ - bool_t vtsc; - s_time_t vtsc_last; + /* TSC management (emulation, pv, scaling, stats) */ + int tsc_mode; /* see include/asm-x86/time.h */ + bool_t vtsc; /* tsc is emulated (may change after migrate) */ + s_time_t vtsc_last; /* previous TSC value (guarantee monotonicity) */ spinlock_t vtsc_lock; + uint64_t vtsc_offset; /* adjustment for save/restore/migrate */ + uint32_t tsc_khz; /* cached khz for certain emulated cases */ + struct time_scale vtsc_to_ns; /* scaling for certain emulated cases */ + struct time_scale 
ns_to_vtsc; /* scaling for certain emulated cases */ + uint32_t incarnation; /* incremented every restore or live migrate + (possibly other cases in the future */ uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */ uint64_t vtsc_usercount; /* not used for hvm */ } __cacheline_aligned; diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h index 56bb080f52..a65f080569 100644 --- a/xen/include/asm-x86/msr.h +++ b/xen/include/asm-x86/msr.h @@ -84,6 +84,8 @@ static inline void wrmsrl(unsigned int msr, __u64 val) #define write_tsc(val) wrmsrl(MSR_IA32_TSC, val) +#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) + #define rdpmc(counter,low,high) \ __asm__ __volatile__("rdpmc" \ : "=a" (low), "=d" (high) \ diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index 7b09adecd6..628965ae3e 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -550,8 +550,8 @@ asmlinkage void do_machine_check(struct cpu_user_regs *regs); void cpu_mcheck_distribute_cmci(void); void cpu_mcheck_disable(void); -int cpuid_hypervisor_leaves( - uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); +int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val); int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val); diff --git a/xen/include/asm-x86/time.h b/xen/include/asm-x86/time.h index c72cb8e506..6dd071d726 100644 --- a/xen/include/asm-x86/time.h +++ b/xen/include/asm-x86/time.h @@ -4,6 +4,24 @@ #include <asm/msr.h> +/* + * PV TSC emulation modes: + * 0 = guest rdtsc/p executed natively when monotonicity can be guaranteed + * and emulated otherwise (with frequency scaled if necessary) + * 1 = guest rdtsc/p always emulated at 1GHz (kernel and user) + * 2 = guest rdtsc always executed natively (no monotonicity/frequency + * guarantees); guest rdtscp emulated at native frequency if + 
* unsupported by h/w, else executed natively + * 3 = same as 2, except xen manages TSC_AUX register so guest can + * determine when a restore/migration has occurred and assumes + * guest obtains/uses pvclock-like mechanism to adjust for + * monotonicity and frequency changes + */ +#define TSC_MODE_DEFAULT 0 +#define TSC_MODE_ALWAYS_EMULATE 1 +#define TSC_MODE_NEVER_EMULATE 2 +#define TSC_MODE_PVRDTSCP 3 + void calibrate_tsc_bp(void); void calibrate_tsc_ap(void); @@ -43,6 +61,16 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns); void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs); +void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec, + uint32_t gtsc_khz, uint32_t incarnation); + +void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec, + uint32_t *gtsc_khz, uint32_t *incarnation); + + void force_update_vcpu_system_time(struct vcpu *v); +void cpuid_time_leaf(uint32_t sub_idx, unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + #endif /* __X86_TIME_H__ */ diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h index cac3477671..88b19a4ffe 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -401,11 +401,6 @@ struct xen_domctl_settimeoffset { typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t); -#define XEN_DOMCTL_set_tsc_native 57 -typedef struct xen_domctl_set_tsc_native { - uint32_t is_native; /* IN: 0: TSC is emulated; 1: TSC is host TSC */ -} xen_domctl_set_tsc_native_t; - #define XEN_DOMCTL_gethvmcontext 33 #define XEN_DOMCTL_sethvmcontext 34 typedef struct xen_domctl_hvmcontext { @@ -656,6 +651,22 @@ typedef struct xen_domctl_disable_migrate { } xen_domctl_disable_migrate_t; +#define XEN_DOMCTL_gettscinfo 59 +#define XEN_DOMCTL_settscinfo 60 +struct xen_guest_tsc_info { + uint32_t tsc_mode; + uint32_t gtsc_khz; + uint32_t incarnation; + uint32_t pad; + uint64_t elapsed_nsec; 
+}; +typedef struct xen_guest_tsc_info xen_guest_tsc_info_t; +DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t); +typedef struct xen_domctl_tsc_info { + XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */ + xen_guest_tsc_info_t info; /* IN */ +} xen_domctl_tsc_info_t; + #define XEN_DOMCTL_gdbsx_guestmemio 1000 /* guest mem io */ struct xen_domctl_gdbsx_memio { uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */ @@ -705,8 +716,8 @@ struct xen_domctl { struct xen_domctl_hypercall_init hypercall_init; struct xen_domctl_arch_setup arch_setup; struct xen_domctl_settimeoffset settimeoffset; - struct xen_domctl_set_tsc_native set_tsc_native; struct xen_domctl_disable_migrate disable_migrate; + struct xen_domctl_tsc_info tsc_info; struct xen_domctl_real_mode_area real_mode_area; struct xen_domctl_hvmcontext hvmcontext; struct xen_domctl_hvmcontext_partial hvmcontext_partial; |