aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKeir Fraser <keir.fraser@citrix.com>2009-11-25 14:05:28 +0000
committerKeir Fraser <keir.fraser@citrix.com>2009-11-25 14:05:28 +0000
commit08a0b4ab0d193b8fbb9270b7ab26f527535ba69c (patch)
treecdef74bb4aeb86dfa80bcac7637500b547780731
parent83371c34bc5a2eed015a5d034cd82ec243659d85 (diff)
downloadxen-08a0b4ab0d193b8fbb9270b7ab26f527535ba69c.tar.gz
xen-08a0b4ab0d193b8fbb9270b7ab26f527535ba69c.tar.bz2
xen-08a0b4ab0d193b8fbb9270b7ab26f527535ba69c.zip
Replace tsc_native config option with tsc_mode config option
(NOTE: pvrdtscp mode not finished yet, but all other modes have been tested so sooner seemed better than later to submit this fairly major patch so we can get more mileage on it before next release.) New tsc_mode config option supersedes tsc_native and offers a more intelligent default and an additional option for intelligent apps running on PV domains ("pvrdtscp"). For PV domains, default mode will determine if the initial host has a "safe"** TSC (meaning it is always synchronized across all physical CPUs). If so, all domains will execute all rdtsc instructions natively; if not, all domains will emulate all rdtsc instructions but providing the TSC hertz rate of the initial machine. After being restored or live-migrated, all PV domains will emulate all rdtsc instructions. Hence, this default mode guarantees correctness while providing native performance in most conditions. For PV domains, tsc_mode==1 will always emulate rdtsc and tsc_mode==2 will never emulate rdtsc. For tsc_mode==3, rdtsc will never be emulated, but information is provided through pvcpuid instructions and rdtscp instructions so that an app can obtain "safe" pvclock-like TSC information across save/restore and live migration. (Will be completed in a follow-on patch.) For HVM domains, the default mode and "always emulate" mode do the same as tsc_native==0; the other two modes do the same as tsc_native==1. (HVM domains since 3.4 have implemented a tsc_mode=default-like functionality, but also can preserve native TSC across save/restore and live-migration IFF the initial and target machines have a common TSC cycle rate.) ** All newer AMD machines, and Nehalem and future Intel machines have "Invariant TSC"; many newer Intel machines have "Constant TSC" and do not support deep-C sleep states; these and all single-processor machines are "safe". Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
-rw-r--r--tools/examples/xmexample.hvm11
-rw-r--r--tools/libxc/xc_domain.c49
-rw-r--r--tools/libxc/xc_domain_restore.c12
-rw-r--r--tools/libxc/xc_domain_save.c31
-rw-r--r--tools/libxc/xenctrl.h14
-rw-r--r--tools/python/xen/lowlevel/xc/xc.c17
-rw-r--r--tools/python/xen/xend/XendConfig.py6
-rw-r--r--tools/python/xen/xend/XendDomainInfo.py6
-rw-r--r--tools/python/xen/xm/create.py10
-rw-r--r--tools/python/xen/xm/xenapi_create.py2
-rw-r--r--xen/arch/x86/domain.c2
-rw-r--r--xen/arch/x86/domctl.c34
-rw-r--r--xen/arch/x86/hvm/hvm.c2
-rw-r--r--xen/arch/x86/time.c275
-rw-r--r--xen/arch/x86/traps.c15
-rw-r--r--xen/include/asm-x86/domain.h18
-rw-r--r--xen/include/asm-x86/msr.h2
-rw-r--r--xen/include/asm-x86/processor.h4
-rw-r--r--xen/include/asm-x86/time.h28
-rw-r--r--xen/include/public/domctl.h23
20 files changed, 484 insertions, 77 deletions
diff --git a/tools/examples/xmexample.hvm b/tools/examples/xmexample.hvm
index e5ae97f966..09edda6aac 100644
--- a/tools/examples/xmexample.hvm
+++ b/tools/examples/xmexample.hvm
@@ -178,11 +178,16 @@ stdvga=0
serial='pty'
#----------------------------------------------------------------------------
-# tsc_native : TSC mode (0=emulate TSC, 1=native TSC)
+# tsc_mode : TSC mode (0=default, 1=native TSC, 2=never emulate, 3=pvrdtscp)
# emulate TSC provides synced TSC for all vcpus, but loses performance.
# native TSC leverages hardware's TSC(no perf loss), but vcpu's TSC may lose
-# sync due to hardware's unreliable/unsynced TSC between CPUs.
-tsc_native=1
+# sync due to hardware's unreliable/unsynced TSC between CPUs.
+# default intelligently uses native TSC on machines where it is safe, but
+# switches to emulated if necessary after save/restore/migration
+# pvrdtscp is for intelligent apps that use special Xen-only paravirtualized
+# cpuid instructions to obtain offset/scaling/migration info and maximize
+# performance within pools of machines that support the rdtscp instruction
+tsc_mode=0
#-----------------------------------------------------------------------------
# Qemu Monitor, default is disable
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index f872583926..54a5914e9b 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -466,24 +466,61 @@ int xc_domain_set_time_offset(int xc_handle,
return do_domctl(xc_handle, &domctl);
}
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native)
+int xc_domain_disable_migrate(int xc_handle, uint32_t domid)
{
DECLARE_DOMCTL;
- domctl.cmd = XEN_DOMCTL_set_tsc_native;
+ domctl.cmd = XEN_DOMCTL_disable_migrate;
domctl.domain = (domid_t)domid;
- domctl.u.set_tsc_native.is_native = is_native;
+ domctl.u.disable_migrate.disable = 1;
return do_domctl(xc_handle, &domctl);
}
-int xc_domain_disable_migrate(int xc_handle, uint32_t domid)
+int xc_domain_set_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t tsc_mode,
+ uint64_t elapsed_nsec,
+ uint32_t gtsc_khz,
+ uint32_t incarnation)
{
DECLARE_DOMCTL;
- domctl.cmd = XEN_DOMCTL_disable_migrate;
+ domctl.cmd = XEN_DOMCTL_settscinfo;
domctl.domain = (domid_t)domid;
- domctl.u.disable_migrate.disable = 1;
+ domctl.u.tsc_info.info.tsc_mode = tsc_mode;
+ domctl.u.tsc_info.info.elapsed_nsec = elapsed_nsec;
+ domctl.u.tsc_info.info.gtsc_khz = gtsc_khz;
+ domctl.u.tsc_info.info.incarnation = incarnation;
return do_domctl(xc_handle, &domctl);
}
+/*
+ * Fetch a domain's TSC parameters (mode, elapsed nsec, guest TSC kHz,
+ * incarnation) via the XEN_DOMCTL_gettscinfo hypercall.  The hypervisor
+ * writes a xen_guest_tsc_info_t through the guest handle, so the local
+ * buffer must be pinned with lock_pages() for the duration of the call.
+ * Returns 0 on success with all four out-parameters filled in; otherwise
+ * the nonzero error from lock_pages()/do_domctl() (outputs untouched).
+ */
+int xc_domain_get_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t *tsc_mode,
+ uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz,
+ uint32_t *incarnation)
+{
+ int rc;
+ DECLARE_DOMCTL;
+ xen_guest_tsc_info_t info = { 0 };
+
+ domctl.cmd = XEN_DOMCTL_gettscinfo;
+ domctl.domain = (domid_t)domid;
+ set_xen_guest_handle(domctl.u.tsc_info.out_info, &info);
+ if ( (rc = lock_pages(&info, sizeof(info))) != 0 )
+ return rc;
+ rc = do_domctl(xc_handle, &domctl);
+ if ( rc == 0 )
+ {
+ *tsc_mode = info.tsc_mode;
+ *elapsed_nsec = info.elapsed_nsec;
+ *gtsc_khz = info.gtsc_khz;
+ *incarnation = info.incarnation;
+ }
+ /* always unpin, even on hypercall failure */
+ unlock_pages(&info,sizeof(info));
+ return rc;
+}
+
+
+
+
int xc_domain_memory_increase_reservation(int xc_handle,
uint32_t domid,
unsigned long nr_extents,
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
index 01d7924f07..cf6a63c25a 100644
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -1084,6 +1084,18 @@ static int pagebuf_get_one(pagebuf_t* buf, int fd, int xch, uint32_t dom)
return -1;
}
return pagebuf_get_one(buf, fd, xch, dom);
+ } else if ( count == -7 ) {
+ uint32_t tsc_mode, khz, incarn;
+ uint64_t nsec;
+ if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) ||
+ read_exact(fd, &nsec, sizeof(uint64_t)) ||
+ read_exact(fd, &khz, sizeof(uint32_t)) ||
+ read_exact(fd, &incarn, sizeof(uint32_t)) ||
+ xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
+ ERROR("error reading/restoring tsc info");
+ return -1;
+ }
+ return pagebuf_get_one(buf, fd, xch, dom);
} else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
ERROR("Max batch size exceeded (%d). Giving up.", count);
return -1;
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
index 30c1b6d3a4..9d706a92d3 100644
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -841,6 +841,24 @@ static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
return success ? p2m : NULL;
}
+/* must be done AFTER suspend_and_state() */
+/*
+ * Append the domain's TSC info to the save-image stream: a -7 chunk
+ * marker (consumed by the restore side's pagebuf_get_one()) followed by
+ * tsc_mode, elapsed nsec, guest TSC kHz and incarnation, in that order.
+ * Returns 0 on success, -1 if the query or any write fails.
+ */
+static int save_tsc_info(int xc_handle, uint32_t dom, int io_fd)
+{
+ int marker = -7;
+ uint32_t tsc_mode, khz, incarn;
+ uint64_t nsec;
+
+ if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
+ &nsec, &khz, &incarn) < 0 ||
+ write_exact(io_fd, &marker, sizeof(marker)) ||
+ write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
+ write_exact(io_fd, &nsec, sizeof(nsec)) ||
+ write_exact(io_fd, &khz, sizeof(khz)) ||
+ write_exact(io_fd, &incarn, sizeof(incarn)) )
+ return -1;
+ return 0;
+}
+
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
uint32_t max_factor, uint32_t flags,
struct save_callbacks* callbacks,
@@ -1100,6 +1118,12 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
goto out;
}
+ if ( !live && save_tsc_info(xc_handle, dom, io_fd) < 0 )
+ {
+ ERROR("Error when writing to state file (tsc)");
+ goto out;
+ }
+
copypages:
#define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), (len))
#ifdef ratewrite
@@ -1458,6 +1482,13 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
goto out;
}
+ if ( save_tsc_info(xc_handle, dom, io_fd) < 0 )
+ {
+ ERROR("Error when writing to state file (tsc)");
+ goto out;
+ }
+
+
}
if ( xc_shadow_control(xc_handle, dom,
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index d6ecaf399c..9fc05bb30b 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -628,7 +628,19 @@ int xc_domain_set_time_offset(int xc_handle,
uint32_t domid,
int32_t time_offset_seconds);
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native);
+int xc_domain_set_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t tsc_mode,
+ uint64_t elapsed_nsec,
+ uint32_t gtsc_khz,
+ uint32_t incarnation);
+
+int xc_domain_get_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t *tsc_mode,
+ uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz,
+ uint32_t *incarnation);
int xc_domain_disable_migrate(int xc_handle, uint32_t domid);
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 7eaf63b94e..aa780aa303 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -1486,14 +1486,14 @@ static PyObject *pyxc_domain_set_time_offset(XcObject *self, PyObject *args)
return zero;
}
-static PyObject *pyxc_domain_set_tsc_native(XcObject *self, PyObject *args)
+static PyObject *pyxc_domain_set_tsc_info(XcObject *self, PyObject *args)
{
- uint32_t dom, is_native;
+ uint32_t dom, tsc_mode;
- if (!PyArg_ParseTuple(args, "ii", &dom, &is_native))
+ if (!PyArg_ParseTuple(args, "ii", &dom, &tsc_mode))
return NULL;
- if (xc_domain_set_tsc_native(self->xc_handle, dom, is_native) != 0)
+ if (xc_domain_set_tsc_info(self->xc_handle, dom, tsc_mode, 0, 0, 0) != 0)
return pyxc_error_to_exception();
Py_INCREF(zero);
@@ -2036,12 +2036,13 @@ static PyMethodDef pyxc_methods[] = {
" offset [int]: Time offset from UTC in seconds.\n"
"Returns: [int] 0 on success; -1 on error.\n" },
- { "domain_set_tsc_native",
- (PyCFunction)pyxc_domain_set_tsc_native,
+ { "domain_set_tsc_info",
+ (PyCFunction)pyxc_domain_set_tsc_info,
METH_VARARGS, "\n"
- "Set a domain's TSC mode (emulate vs native)\n"
+ "Set a domain's TSC mode\n"
" dom [int]: Domain whose TSC mode is being set.\n"
- " is_native [int]: 1=native, 0=emulate.\n"
+ " tsc_mode [int]: 0=default (monotonic, but native where possible)\n"
+ " 1=always emulate 2=never emulate 3=pvrdtscp\n"
"Returns: [int] 0 on success; -1 on error.\n" },
{ "domain_disable_migrate",
diff --git a/tools/python/xen/xend/XendConfig.py b/tools/python/xen/xend/XendConfig.py
index 0eadf343d3..3227cd4def 100644
--- a/tools/python/xen/xend/XendConfig.py
+++ b/tools/python/xen/xend/XendConfig.py
@@ -163,7 +163,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
'vncdisplay': int,
'vnclisten': str,
'timer_mode': int,
- 'tsc_native': int,
+ 'tsc_mode': int,
'vpt_align': int,
'viridian': int,
'vncpasswd': str,
@@ -477,8 +477,8 @@ class XendConfig(dict):
if not os.path.exists(self['platform']['device_model']):
raise VmError("device model '%s' not found" % str(self['platform']['device_model']))
- if 'tsc_native' not in self['platform']:
- self['platform']['tsc_native'] = 0
+ if 'tsc_mode' not in self['platform']:
+ self['platform']['tsc_mode'] = 0
if 'nomigrate' not in self['platform']:
self['platform']['nomigrate'] = 0
diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
index 592ba6fad8..8198228b9f 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -2468,9 +2468,9 @@ class XendDomainInfo:
self._recreateDom()
# Set TSC mode of domain
- tsc_native = self.info["platform"].get("tsc_native")
- if arch.type == "x86" and tsc_native is not None:
- xc.domain_set_tsc_native(self.domid, int(tsc_native))
+ tsc_mode = self.info["platform"].get("tsc_mode")
+ if arch.type == "x86" and tsc_mode is not None:
+ xc.domain_set_tsc_info(self.domid, int(tsc_mode))
# Set timer configuration of domain
timer_mode = self.info["platform"].get("timer_mode")
diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py
index d6485a59a1..32b19235e1 100644
--- a/tools/python/xen/xm/create.py
+++ b/tools/python/xen/xm/create.py
@@ -221,9 +221,9 @@ gopts.var('timer_mode', val='TIMER_MODE',
use="""Timer mode (0=delay virtual time when ticks are missed;
1=virtual time is always wallclock time.""")
-gopts.var('tsc_native', val='TSC_NATIVE',
+gopts.var('tsc_mode', val='TSC_MODE',
fn=set_int, default=0,
- use="""TSC mode (0=emulate TSC, 1=native TSC).""")
+ use="""TSC mode (0=default, 1=always emulate, 2=never emulate, 3=pvrdtscp).""")
gopts.var('nomigrate', val='NOMIGRATE',
fn=set_int, default=0,
@@ -738,8 +738,8 @@ def configure_image(vals):
if vals.suppress_spurious_page_faults:
config_image.append(['suppress_spurious_page_faults', vals.suppress_spurious_page_faults])
- if vals.tsc_native is not None:
- config_image.append(['tsc_native', vals.tsc_native])
+ if vals.tsc_mode is not None:
+ config_image.append(['tsc_mode', vals.tsc_mode])
if vals.nomigrate is not None:
config_image.append(['nomigrate', vals.nomigrate])
@@ -1036,7 +1036,7 @@ def make_config(vals):
config.append([n, v])
map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
- 'restart', 'on_poweroff', 'tsc_native', 'nomigrate',
+ 'restart', 'on_poweroff', 'tsc_mode', 'nomigrate',
'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features',
'on_xend_start', 'on_xend_stop', 'target', 'cpuid',
'cpuid_check', 'machine_address_size', 'suppress_spurious_page_faults'])
diff --git a/tools/python/xen/xm/xenapi_create.py b/tools/python/xen/xm/xenapi_create.py
index 9cfdb87d7e..4c0177b4aa 100644
--- a/tools/python/xen/xm/xenapi_create.py
+++ b/tools/python/xen/xm/xenapi_create.py
@@ -1108,7 +1108,7 @@ class sxp2xml:
'pci_msitranslate',
'pci_power_mgmt',
'xen_platform_pci',
- 'tsc_native'
+ 'tsc_mode'
'description',
'nomigrate'
]
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index c851209853..84493aba43 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -520,6 +520,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
}
+ /* initialize default tsc behavior in case tools don't */
+ tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
spin_lock_init(&d->arch.vtsc_lock);
return 0;
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 52f3945f06..4b0011a52c 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1101,9 +1101,10 @@ long arch_do_domctl(
}
break;
- case XEN_DOMCTL_set_tsc_native:
+ case XEN_DOMCTL_gettscinfo:
{
struct domain *d;
+ xen_guest_tsc_info_t info;
ret = -ESRCH;
d = rcu_lock_domain_by_id(domctl->domain);
@@ -1111,9 +1112,34 @@ long arch_do_domctl(
break;
domain_pause(d);
- d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
- if ( is_hvm_domain(d) )
- hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+ tsc_get_info(d, &info.tsc_mode,
+ &info.elapsed_nsec,
+ &info.gtsc_khz,
+ &info.incarnation);
+ if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) )
+ ret = -EFAULT;
+ else
+ ret = 0;
+ domain_unpause(d);
+
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_DOMCTL_settscinfo:
+ {
+ struct domain *d;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ domain_pause(d);
+ tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode,
+ domctl->u.tsc_info.info.elapsed_nsec,
+ domctl->u.tsc_info.info.gtsc_khz,
+ domctl->u.tsc_info.info.incarnation);
domain_unpause(d);
rcu_unlock_domain(d);
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index f4d342a52f..ca567638ae 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1831,7 +1831,7 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) )
return;
- if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
+ if ( cpuid_hypervisor_leaves(input, count, eax, ebx, ecx, edx) )
return;
domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 982ce7ca52..e3652a9d45 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -34,6 +34,7 @@
#include <asm/hpet.h>
#include <io_ports.h>
#include <asm/setup.h> /* for early_time_init */
+#include <public/arch-x86/cpuid.h>
/* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
static char __initdata opt_clocksource[10];
@@ -45,10 +46,12 @@ unsigned long pit0_ticks;
static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
static DEFINE_SPINLOCK(wc_lock);
+/* moved to <asm/domain.h>
struct time_scale {
int shift;
u32 mul_frac;
};
+*/
struct cpu_time {
u64 local_tsc_stamp;
@@ -150,13 +153,32 @@ static inline u64 scale_delta(u64 delta, struct time_scale *scale)
return product;
}
+#define _TS_SHIFT_IDENTITY 1
+#define _TS_MUL_FRAC_IDENTITY 0x80000000UL
+#define _TS_IDENTITY { _TS_SHIFT_IDENTITY, _TS_MUL_FRAC_IDENTITY }
+static inline int time_scale_is_identity(struct time_scale *ts)
+{
+ if ( ts->shift != _TS_SHIFT_IDENTITY )
+ return 0;
+ else if ( ts->mul_frac != _TS_MUL_FRAC_IDENTITY )
+ return 0;
+ return 1;
+}
+
+static inline void set_time_scale_identity(struct time_scale *ts)
+{
+ ts->shift = _TS_SHIFT_IDENTITY;
+ ts->mul_frac = _TS_MUL_FRAC_IDENTITY;
+}
+
/* Compute the reciprocal of the given time_scale. */
static inline struct time_scale scale_reciprocal(struct time_scale scale)
{
struct time_scale reciprocal;
u32 dividend;
- dividend = 0x80000000u;
+ ASSERT(scale.mul_frac != 0);
+ dividend = _TS_MUL_FRAC_IDENTITY;
reciprocal.shift = 1 - scale.shift;
while ( unlikely(dividend >= scale.mul_frac) )
{
@@ -818,6 +840,8 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
struct cpu_time *t;
struct vcpu_time_info *u, _u;
XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
+ struct domain *d = v->domain;
+ s_time_t tsc_stamp = 0;
if ( v->vcpu_info == NULL )
return;
@@ -825,20 +849,31 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
t = &this_cpu(cpu_time);
u = &vcpu_info(v, time);
+ if ( d->arch.vtsc )
+ {
+ tsc_stamp = t->stime_local_stamp - d->arch.vtsc_offset;
+ if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+ tsc_stamp = scale_delta(tsc_stamp, &d->arch.ns_to_vtsc);
+ }
+ else
+ tsc_stamp = t->local_tsc_stamp;
+
+ if ( d->arch.tsc_mode == TSC_MODE_PVRDTSCP &&
+ boot_cpu_has(X86_FEATURE_RDTSCP) )
+ write_rdtscp_aux(d->arch.incarnation);
+
/* Don't bother unless timestamps have changed or we are forced. */
- if ( !force && (u->tsc_timestamp == (v->domain->arch.vtsc
- ? t->stime_local_stamp
- : t->local_tsc_stamp)) )
+ if ( !force && (u->tsc_timestamp == tsc_stamp) )
return;
memset(&_u, 0, sizeof(_u));
- if ( v->domain->arch.vtsc )
+ if ( d->arch.vtsc )
{
- _u.tsc_timestamp = t->stime_local_stamp;
+ _u.tsc_timestamp = tsc_stamp;
_u.system_time = t->stime_local_stamp;
- _u.tsc_to_system_mul = 0x80000000u;
- _u.tsc_shift = 1;
+ _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
+ _u.tsc_shift = d->arch.vtsc_to_ns.shift;
}
else
{
@@ -1556,7 +1591,7 @@ static void tsc_check_slave(void *unused)
local_irq_enable();
}
-static void tsc_check_reliability(void)
+void tsc_check_reliability(void)
{
unsigned int cpu = smp_processor_id();
static DEFINE_SPINLOCK(lock);
@@ -1583,57 +1618,245 @@ static void tsc_check_reliability(void)
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs)
{
s_time_t now = get_s_time();
+ struct domain *d = v->domain;
- spin_lock(&v->domain->arch.vtsc_lock);
+ spin_lock(&d->arch.vtsc_lock);
if ( guest_kernel_mode(v, regs) )
- v->domain->arch.vtsc_kerncount++;
+ d->arch.vtsc_kerncount++;
else
- v->domain->arch.vtsc_usercount++;
+ d->arch.vtsc_usercount++;
- if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 )
- v->domain->arch.vtsc_last = now;
+ if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
+ d->arch.vtsc_last = now;
else
- now = ++v->domain->arch.vtsc_last;
+ now = ++d->arch.vtsc_last;
- spin_unlock(&v->domain->arch.vtsc_lock);
+ spin_unlock(&d->arch.vtsc_lock);
+
+ now = now - d->arch.vtsc_offset;
+ if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+ now = scale_delta(now, &d->arch.ns_to_vtsc);
regs->eax = (uint32_t)now;
regs->edx = (uint32_t)(now >> 32);
}
+/*
+ * Return 1 if the host TSC may safely be exposed natively to guests:
+ * the hardware reports a reliable TSC, or only one CPU is online, or
+ * the TSC is constant-rate with shallow C-states (max_cstate <= 2) and
+ * passes the cross-CPU warp test (run on demand if not yet run).
+ * Otherwise return 0, meaning rdtsc should be emulated.
+ */
+static int host_tsc_is_safe(void)
+{
+ extern unsigned int max_cstate;
+
+ if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
+ return 1;
+ if ( num_online_cpus() == 1 )
+ return 1;
+ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && max_cstate <= 2 )
+ {
+ if ( !tsc_check_count )
+ tsc_check_reliability();
+ if ( tsc_max_warp == 0 )
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Handler for the Xen hypervisor cpuid time leaf (leaf base+3, see
+ * cpuid_hypervisor_leaves()); sub_idx selects which group of values is
+ * returned.  The caller zeroes all four output registers first, so an
+ * unknown sub_idx yields all-zeroes.
+ */
+void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx)
+{
+ struct domain *d = current->domain;
+ struct cpu_time *t;
+
+ t = &this_cpu(cpu_time);
+ switch ( sub_idx )
+ {
+ case 0: /* features */
+ *eax = ( ( (!!d->arch.vtsc) << 0 ) |
+ ( (!!host_tsc_is_safe()) << 1 ) |
+ ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) |
+ 0 );
+ *ebx = d->arch.tsc_mode;
+ *ecx = d->arch.tsc_khz;
+ *edx = d->arch.incarnation;
+ break;
+ case 1: /* pvclock group1 */ /* FIXME are these right? */
+ *eax = (uint32_t)t->local_tsc_stamp;
+ *ebx = (uint32_t)(t->local_tsc_stamp >> 32);
+ *ecx = t->tsc_scale.mul_frac;
+ *edx = d->arch.incarnation;
+ break;
+ case 2: /* pvclock scaling values */ /* FIXME are these right? */
+ *eax = (uint32_t)t->stime_local_stamp;
+ *ebx = (uint32_t)(t->stime_local_stamp >> 32);
+ *ecx = t->tsc_scale.shift;
+ *edx = d->arch.incarnation;
+ break; /* was missing: case 2 fell through and case 3 clobbered outputs */
+ case 3: /* physical cpu_khz */
+ *eax = cpu_khz;
+ *ebx = *ecx = 0;
+ *edx = d->arch.incarnation;
+ break;
+ default: /* unknown sub-leaf: leave caller's zeroes in place */
+ break;
+ }
+}
+
+/*
+ * called to collect tsc-related data only for save file or live
+ * migrate; called after last rdtsc is done on this incarnation
+ *
+ * Outputs, per mode:
+ *  NEVER_EMULATE:  elapsed_nsec and gtsc_khz are zero (nothing to carry).
+ *  ALWAYS_EMULATE: elapsed ns since vtsc_offset; rate fixed at 1 GHz.
+ *  DEFAULT:        if currently emulated, elapsed ns and the cached guest
+ *                  kHz; if native, the raw TSC scaled to ns and cpu_khz.
+ *  PVRDTSCP:       elapsed ns since vtsc_offset and the cached guest kHz.
+ * incarnation is always the domain's current incarnation count.
+ */
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
+ uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
+ uint32_t *incarnation)
+{
+ *incarnation = d->arch.incarnation;
+ switch ( *tsc_mode = d->arch.tsc_mode )
+ {
+ case TSC_MODE_NEVER_EMULATE:
+ *elapsed_nsec = *gtsc_khz = 0;
+ break;
+ case TSC_MODE_ALWAYS_EMULATE:
+ *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+ *gtsc_khz = 1000000UL;
+ break;
+ case TSC_MODE_DEFAULT:
+ if ( d->arch.vtsc )
+ {
+ *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+ *gtsc_khz = d->arch.tsc_khz;
+ } else {
+ uint64_t tsc = 0;
+ rdtscll(tsc);
+ *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns);
+ *gtsc_khz = cpu_khz;
+ }
+ break;
+ case TSC_MODE_PVRDTSCP:
+ *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+ *gtsc_khz = d->arch.tsc_khz;
+ break;
+ }
+}
+
+/*
+ * This may be called as many as three times for a domain, once when the
+ * hypervisor creates the domain, once when the toolstack creates the
+ * domain and, if restoring/migrating, once when saved/migrated values
+ * are restored. Care must be taken that, if multiple calls occur,
+ * only the last "sticks" and all are completed before the guest executes
+ * an rdtsc instruction
+ */
+void tsc_set_info(struct domain *d,
+ uint32_t tsc_mode, uint64_t elapsed_nsec,
+ uint32_t gtsc_khz, uint32_t incarnation)
+{
+ /* dom0 and the idle/invalid domain always run with the native TSC */
+ if ( d->domain_id == 0 || d->domain_id == DOMID_INVALID )
+ {
+ d->arch.vtsc = 0;
+ return;
+ }
+ switch ( d->arch.tsc_mode = tsc_mode )
+ {
+ case TSC_MODE_NEVER_EMULATE:
+ gdprintk(XENLOG_G_INFO, "%s: never emulating TSC\n",__func__);
+ d->arch.vtsc = 0;
+ break;
+ case TSC_MODE_ALWAYS_EMULATE:
+ gdprintk(XENLOG_G_INFO, "%s: always emulating TSC\n",__func__);
+ d->arch.vtsc = 1;
+ d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+ set_time_scale_identity(&d->arch.vtsc_to_ns);
+ break;
+ case TSC_MODE_DEFAULT:
+ d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+ if ( (host_tsc_is_safe() && incarnation == 0) || !d->domain_id )
+ {
+ gdprintk(XENLOG_G_INFO, "%s: using safe native TSC\n",__func__);
+ /* use native TSC if initial host supports it */
+ d->arch.vtsc = 0;
+ d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
+ set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
+ set_time_scale_identity(&d->arch.ns_to_vtsc);
+ } else if ( gtsc_khz != 0 && gtsc_khz != 1000000UL ) {
+ gdprintk(XENLOG_G_INFO, "%s: safe native TSC on initial host, "
+ "but now using emulation\n",__func__);
+ /* was native on initial host, now emulated at initial tsc hz*/
+ d->arch.vtsc = 1;
+ d->arch.tsc_khz = gtsc_khz;
+ set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+ d->arch.ns_to_vtsc =
+ scale_reciprocal(d->arch.vtsc_to_ns);
+ } else {
+ gdprintk(XENLOG_G_INFO, "%s: unsafe TSC on initial host, "
+ "using emulation\n",__func__);
+ d->arch.vtsc = 1;
+ set_time_scale_identity(&d->arch.vtsc_to_ns);
+ set_time_scale_identity(&d->arch.ns_to_vtsc);
+ }
+ break;
+ case TSC_MODE_PVRDTSCP:
+ gdprintk(XENLOG_G_INFO, "%s: using PVRDTSCP\n",__func__);
+ if ( boot_cpu_has(X86_FEATURE_RDTSCP) && gtsc_khz != 0 ) {
+ d->arch.vtsc = 0;
+ set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+ } else {
+ d->arch.vtsc = 1;
+ d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+ set_time_scale_identity(&d->arch.vtsc_to_ns);
+ }
+ break;
+ }
+ /* bump incarnation so a pvrdtscp guest can detect restore/migrate */
+ d->arch.incarnation = incarnation + 1;
+ if ( is_hvm_domain(d) )
+ hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+}
+
/* vtsc may incur measurable performance degradation, diagnose with this */
static void dump_softtsc(unsigned char key)
{
struct domain *d;
int domcnt = 0;
+ extern unsigned int max_cstate;
tsc_check_reliability();
if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
printk("TSC marked as reliable, "
"warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
- printk("TSC marked as constant but not reliable, "
- "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
- else
+ {
+ printk("TSC has constant rate, ");
+ if (max_cstate <= 2 && tsc_max_warp == 0)
+ printk("no deep Cstates, passed warp test, deemed reliable, ");
+ else
+ printk("deep Cstates possible, so not reliable, ");
+ printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+ } else
printk("TSC not marked as either constant or reliable, "
- "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+ "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
for_each_domain ( d )
{
+ if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
+ continue;
+ printk("dom%u%s: mode=%d",d->domain_id,
+ is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
+ if ( d->arch.vtsc_offset )
+ printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
+ if ( d->arch.tsc_khz )
+ printk(",khz=%"PRIu32"",d->arch.tsc_khz);
+ if ( d->arch.incarnation )
+ printk(",inc=%"PRIu32"",d->arch.incarnation);
if ( !d->arch.vtsc )
+ {
+ printk("\n");
continue;
+ }
if ( is_hvm_domain(d) )
- printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
- d->domain_id, d->arch.vtsc_kerncount);
+ printk(",vtsc count: %"PRIu64" total\n",
+ d->arch.vtsc_kerncount);
else
- printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
- d->domain_id, d->arch.vtsc_kerncount,
- d->arch.vtsc_usercount);
+ printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+ d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
domcnt++;
}
if ( !domcnt )
- printk("All domains have native TSC\n");
+ printk("No domains have emulated TSC\n");
}
static struct keyhandler dump_softtsc_keyhandler = {
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index e42420c8a1..174dc25af4 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -679,8 +679,8 @@ int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
return 1;
}
-int cpuid_hypervisor_leaves(
- uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
struct domain *d = current->domain;
/* Optionally shift out of the way of Viridian architectural leaves. */
@@ -693,7 +693,7 @@ int cpuid_hypervisor_leaves(
switch ( idx )
{
case 0:
- *eax = base + 2; /* Largest leaf */
+ *eax = base + 3; /* Largest leaf */
*ebx = XEN_CPUID_SIGNATURE_EBX;
*ecx = XEN_CPUID_SIGNATURE_ECX;
*edx = XEN_CPUID_SIGNATURE_EDX;
@@ -717,6 +717,11 @@ int cpuid_hypervisor_leaves(
*ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
break;
+ case 3:
+ *eax = *ebx = *ecx = *edx = 0;
+ cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
+ break;
+
default:
BUG();
}
@@ -735,7 +740,7 @@ static void pv_cpuid(struct cpu_user_regs *regs)
if ( current->domain->domain_id != 0 )
{
- if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
+ if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
goto out;
}
@@ -815,7 +820,7 @@ static void pv_cpuid(struct cpu_user_regs *regs)
a = b = c = d = 0;
break;
default:
- (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
+ (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
break;
}
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 3c122b5d44..a80a3ede5c 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -230,6 +230,11 @@ struct domain_mca_msrs
spinlock_t lock;
};
+struct time_scale {
+ int shift;
+ u32 mul_frac;
+};
+
struct arch_domain
{
#ifdef CONFIG_X86_64
@@ -298,10 +303,17 @@ struct arch_domain
/* For Guest vMCA handling */
struct domain_mca_msrs vmca_msrs;
- /* SoftTSC emulation */
- bool_t vtsc;
- s_time_t vtsc_last;
+ /* TSC management (emulation, pv, scaling, stats) */
+ int tsc_mode; /* see include/asm-x86/time.h */
+ bool_t vtsc; /* tsc is emulated (may change after migrate) */
+ s_time_t vtsc_last; /* previous TSC value (guarantee monotonicity) */
spinlock_t vtsc_lock;
+ uint64_t vtsc_offset; /* adjustment for save/restore/migrate */
+ uint32_t tsc_khz; /* cached khz for certain emulated cases */
+ struct time_scale vtsc_to_ns; /* scaling for certain emulated cases */
+ struct time_scale ns_to_vtsc; /* scaling for certain emulated cases */
+ uint32_t incarnation; /* incremented every restore or live migrate
+ (possibly other cases in the future) */
uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
uint64_t vtsc_usercount; /* not used for hvm */
} __cacheline_aligned;
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index 56bb080f52..a65f080569 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -84,6 +84,8 @@ static inline void wrmsrl(unsigned int msr, __u64 val)
#define write_tsc(val) wrmsrl(MSR_IA32_TSC, val)
+#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+
#define rdpmc(counter,low,high) \
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 7b09adecd6..628965ae3e 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -550,8 +550,8 @@ asmlinkage void do_machine_check(struct cpu_user_regs *regs);
void cpu_mcheck_distribute_cmci(void);
void cpu_mcheck_disable(void);
-int cpuid_hypervisor_leaves(
- uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val);
int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val);
diff --git a/xen/include/asm-x86/time.h b/xen/include/asm-x86/time.h
index c72cb8e506..6dd071d726 100644
--- a/xen/include/asm-x86/time.h
+++ b/xen/include/asm-x86/time.h
@@ -4,6 +4,24 @@
#include <asm/msr.h>
+/*
+ * PV TSC emulation modes:
+ * 0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
+ * and emulated otherwise (with frequency scaled if necessary)
+ * 1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
+ * 2 = guest rdtsc always executed natively (no monotonicity/frequency
+ * guarantees); guest rdtscp emulated at native frequency if
+ * unsupported by h/w, else executed natively
+ * 3 = same as 2, except xen manages TSC_AUX register so guest can
+ * determine when a restore/migration has occurred and assumes
+ * guest obtains/uses pvclock-like mechanism to adjust for
+ * monotonicity and frequency changes
+ */
+#define TSC_MODE_DEFAULT 0
+#define TSC_MODE_ALWAYS_EMULATE 1
+#define TSC_MODE_NEVER_EMULATE 2
+#define TSC_MODE_PVRDTSCP 3
+
void calibrate_tsc_bp(void);
void calibrate_tsc_ap(void);
@@ -43,6 +61,16 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns);
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+ uint32_t gtsc_khz, uint32_t incarnation);
+
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz, uint32_t *incarnation);
+
+
void force_update_vcpu_system_time(struct vcpu *v);
+void cpuid_time_leaf(uint32_t sub_idx, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
#endif /* __X86_TIME_H__ */
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index cac3477671..88b19a4ffe 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -401,11 +401,6 @@ struct xen_domctl_settimeoffset {
typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
-#define XEN_DOMCTL_set_tsc_native 57
-typedef struct xen_domctl_set_tsc_native {
- uint32_t is_native; /* IN: 0: TSC is emulated; 1: TSC is host TSC */
-} xen_domctl_set_tsc_native_t;
-
#define XEN_DOMCTL_gethvmcontext 33
#define XEN_DOMCTL_sethvmcontext 34
typedef struct xen_domctl_hvmcontext {
@@ -656,6 +651,22 @@ typedef struct xen_domctl_disable_migrate {
} xen_domctl_disable_migrate_t;
+#define XEN_DOMCTL_gettscinfo 59
+#define XEN_DOMCTL_settscinfo 60
+struct xen_guest_tsc_info {
+ uint32_t tsc_mode;
+ uint32_t gtsc_khz;
+ uint32_t incarnation;
+ uint32_t pad;
+ uint64_t elapsed_nsec;
+};
+typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
+typedef struct xen_domctl_tsc_info {
+ XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
+ xen_guest_tsc_info_t info; /* IN */
+} xen_domctl_tsc_info_t;
+
#define XEN_DOMCTL_gdbsx_guestmemio 1000 /* guest mem io */
struct xen_domctl_gdbsx_memio {
uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
@@ -705,8 +716,8 @@ struct xen_domctl {
struct xen_domctl_hypercall_init hypercall_init;
struct xen_domctl_arch_setup arch_setup;
struct xen_domctl_settimeoffset settimeoffset;
- struct xen_domctl_set_tsc_native set_tsc_native;
struct xen_domctl_disable_migrate disable_migrate;
+ struct xen_domctl_tsc_info tsc_info;
struct xen_domctl_real_mode_area real_mode_area;
struct xen_domctl_hvmcontext hvmcontext;
struct xen_domctl_hvmcontext_partial hvmcontext_partial;