aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tools/examples/xmexample.hvm11
-rw-r--r--tools/libxc/xc_domain.c49
-rw-r--r--tools/libxc/xc_domain_restore.c12
-rw-r--r--tools/libxc/xc_domain_save.c31
-rw-r--r--tools/libxc/xenctrl.h14
-rw-r--r--tools/python/xen/lowlevel/xc/xc.c17
-rw-r--r--tools/python/xen/xend/XendConfig.py6
-rw-r--r--tools/python/xen/xend/XendDomainInfo.py6
-rw-r--r--tools/python/xen/xm/create.py10
-rw-r--r--tools/python/xen/xm/xenapi_create.py2
-rw-r--r--xen/arch/x86/domain.c2
-rw-r--r--xen/arch/x86/domctl.c34
-rw-r--r--xen/arch/x86/hvm/hvm.c2
-rw-r--r--xen/arch/x86/time.c275
-rw-r--r--xen/arch/x86/traps.c15
-rw-r--r--xen/include/asm-x86/domain.h18
-rw-r--r--xen/include/asm-x86/msr.h2
-rw-r--r--xen/include/asm-x86/processor.h4
-rw-r--r--xen/include/asm-x86/time.h28
-rw-r--r--xen/include/public/domctl.h23
20 files changed, 484 insertions, 77 deletions
diff --git a/tools/examples/xmexample.hvm b/tools/examples/xmexample.hvm
index e5ae97f966..09edda6aac 100644
--- a/tools/examples/xmexample.hvm
+++ b/tools/examples/xmexample.hvm
@@ -178,11 +178,16 @@ stdvga=0
serial='pty'
#----------------------------------------------------------------------------
-# tsc_native : TSC mode (0=emulate TSC, 1=native TSC)
+# tsc_mode : TSC mode (0=default, 1=always emulate, 2=never emulate, 3=pvrdtscp)
# emulate TSC provides synced TSC for all vcpus, but lose perfomrance.
# native TSC leverages hardware's TSC(no perf loss), but vcpu's TSC may lose
-# sync due to hardware's unreliable/unsynced TSC between CPUs.
-tsc_native=1
+# sync due to hardware's unreliable/unsynced TSC between CPUs.
+# default intelligently uses native TSC on machines where it is safe, but
+# switches to emulated if necessary after save/restore/migration
+# pvrdtscp is for intelligent apps that use special Xen-only paravirtualized
+# cpuid instructions to obtain offset/scaling/migration info and maximize
+# performance within pools of machines that support the rdtscp instruction
+tsc_mode=0
#-----------------------------------------------------------------------------
# Qemu Monitor, default is disable
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index f872583926..54a5914e9b 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -466,24 +466,61 @@ int xc_domain_set_time_offset(int xc_handle,
return do_domctl(xc_handle, &domctl);
}
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native)
+int xc_domain_disable_migrate(int xc_handle, uint32_t domid)
{
DECLARE_DOMCTL;
- domctl.cmd = XEN_DOMCTL_set_tsc_native;
+ domctl.cmd = XEN_DOMCTL_disable_migrate;
domctl.domain = (domid_t)domid;
- domctl.u.set_tsc_native.is_native = is_native;
+ domctl.u.disable_migrate.disable = 1;
return do_domctl(xc_handle, &domctl);
}
-int xc_domain_disable_migrate(int xc_handle, uint32_t domid)
+int xc_domain_set_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t tsc_mode,
+ uint64_t elapsed_nsec,
+ uint32_t gtsc_khz,
+ uint32_t incarnation)
{
DECLARE_DOMCTL;
- domctl.cmd = XEN_DOMCTL_disable_migrate;
+ domctl.cmd = XEN_DOMCTL_settscinfo;
domctl.domain = (domid_t)domid;
- domctl.u.disable_migrate.disable = 1;
+ domctl.u.tsc_info.info.tsc_mode = tsc_mode;
+ domctl.u.tsc_info.info.elapsed_nsec = elapsed_nsec;
+ domctl.u.tsc_info.info.gtsc_khz = gtsc_khz;
+ domctl.u.tsc_info.info.incarnation = incarnation;
return do_domctl(xc_handle, &domctl);
}
+int xc_domain_get_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t *tsc_mode,
+ uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz,
+ uint32_t *incarnation)
+{ /* Issues XEN_DOMCTL_gettscinfo for domid; the four outputs are written only when the domctl returns 0. */
+ int rc;
+ DECLARE_DOMCTL;
+ xen_guest_tsc_info_t info = { 0 }; /* local buffer handed to the hypervisor through out_info */
+
+ domctl.cmd = XEN_DOMCTL_gettscinfo;
+ domctl.domain = (domid_t)domid;
+ set_xen_guest_handle(domctl.u.tsc_info.out_info, &info);
+ if ( (rc = lock_pages(&info, sizeof(info))) != 0 ) /* NOTE(review): presumably pins the buffer for the hypercall - confirm */
+ return rc; /* nothing was locked, so no unlock on this path */
+ rc = do_domctl(xc_handle, &domctl);
+ if ( rc == 0 )
+ {
+ *tsc_mode = info.tsc_mode;
+ *elapsed_nsec = info.elapsed_nsec;
+ *gtsc_khz = info.gtsc_khz;
+ *incarnation = info.incarnation;
+ }
+ unlock_pages(&info,sizeof(info)); /* runs whether or not do_domctl succeeded */
+ return rc; /* 0 on success, otherwise the value returned by do_domctl */
+}
+
+
int xc_domain_memory_increase_reservation(int xc_handle,
uint32_t domid,
unsigned long nr_extents,
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
index 01d7924f07..cf6a63c25a 100644
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -1084,6 +1084,18 @@ static int pagebuf_get_one(pagebuf_t* buf, int fd, int xch, uint32_t dom)
return -1;
}
return pagebuf_get_one(buf, fd, xch, dom);
+ } else if ( count == -7 ) {
+ uint32_t tsc_mode, khz, incarn;
+ uint64_t nsec;
+ if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) ||
+ read_exact(fd, &nsec, sizeof(uint64_t)) ||
+ read_exact(fd, &khz, sizeof(uint32_t)) ||
+ read_exact(fd, &incarn, sizeof(uint32_t)) ||
+ xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
+ ERROR("error reading/restoring tsc info");
+ return -1;
+ }
+ return pagebuf_get_one(buf, fd, xch, dom);
} else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
ERROR("Max batch size exceeded (%d). Giving up.", count);
return -1;
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
index 30c1b6d3a4..9d706a92d3 100644
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -841,6 +841,24 @@ static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
return success ? p2m : NULL;
}
+/*
+ * Write the domain's TSC record to io_fd: an int marker of -7 followed by
+ * tsc_mode (uint32_t), elapsed nsec (uint64_t), khz (uint32_t) and
+ * incarnation (uint32_t), in that order. Returns 0 on success and -1 if
+ * the query or any of the writes fails.
+ *
+ * must be done AFTER suspend_and_state()
+ */
+static int save_tsc_info(int xc_handle, uint32_t dom, int io_fd)
+{
+ int marker = -7; /* record tag; the restore side tests count == -7 */
+ uint32_t tsc_mode, khz, incarn;
+ uint64_t nsec;
+
+ if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
+ &nsec, &khz, &incarn) < 0 ||
+ write_exact(io_fd, &marker, sizeof(marker)) ||
+ write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
+ write_exact(io_fd, &nsec, sizeof(nsec)) ||
+ write_exact(io_fd, &khz, sizeof(khz)) ||
+ write_exact(io_fd, &incarn, sizeof(incarn)) ) /* short-circuit: stops at the first failing step */
+ return -1;
+ return 0;
+}
+
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
uint32_t max_factor, uint32_t flags,
struct save_callbacks* callbacks,
@@ -1100,6 +1118,12 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
goto out;
}
+ if ( !live && save_tsc_info(xc_handle, dom, io_fd) < 0 )
+ {
+ ERROR("Error when writing to state file (tsc)");
+ goto out;
+ }
+
copypages:
#define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), (len))
#ifdef ratewrite
@@ -1458,6 +1482,13 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
goto out;
}
+ if ( save_tsc_info(xc_handle, dom, io_fd) < 0 )
+ {
+ ERROR("Error when writing to state file (tsc)");
+ goto out;
+ }
+
+
}
if ( xc_shadow_control(xc_handle, dom,
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index d6ecaf399c..9fc05bb30b 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -628,7 +628,19 @@ int xc_domain_set_time_offset(int xc_handle,
uint32_t domid,
int32_t time_offset_seconds);
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native);
+int xc_domain_set_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t tsc_mode,
+ uint64_t elapsed_nsec,
+ uint32_t gtsc_khz,
+ uint32_t incarnation);
+
+int xc_domain_get_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t *tsc_mode,
+ uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz,
+ uint32_t *incarnation);
int xc_domain_disable_migrate(int xc_handle, uint32_t domid);
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 7eaf63b94e..aa780aa303 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -1486,14 +1486,14 @@ static PyObject *pyxc_domain_set_time_offset(XcObject *self, PyObject *args)
return zero;
}
-static PyObject *pyxc_domain_set_tsc_native(XcObject *self, PyObject *args)
+static PyObject *pyxc_domain_set_tsc_info(XcObject *self, PyObject *args)
{
- uint32_t dom, is_native;
+ uint32_t dom, tsc_mode;
- if (!PyArg_ParseTuple(args, "ii", &dom, &is_native))
+ if (!PyArg_ParseTuple(args, "ii", &dom, &tsc_mode))
return NULL;
- if (xc_domain_set_tsc_native(self->xc_handle, dom, is_native) != 0)
+ if (xc_domain_set_tsc_info(self->xc_handle, dom, tsc_mode, 0, 0, 0) != 0)
return pyxc_error_to_exception();
Py_INCREF(zero);
@@ -2036,12 +2036,13 @@ static PyMethodDef pyxc_methods[] = {
" offset [int]: Time offset from UTC in seconds.\n"
"Returns: [int] 0 on success; -1 on error.\n" },
- { "domain_set_tsc_native",
- (PyCFunction)pyxc_domain_set_tsc_native,
+ { "domain_set_tsc_info",
+ (PyCFunction)pyxc_domain_set_tsc_info,
METH_VARARGS, "\n"
- "Set a domain's TSC mode (emulate vs native)\n"
+ "Set a domain's TSC mode\n"
" dom [int]: Domain whose TSC mode is being set.\n"
- " is_native [int]: 1=native, 0=emulate.\n"
+ " tsc_mode [int]: 0=default (monotonic, but native where possible)\n"
+ " 1=always emulate 2=never emulate 3=pvrdtscp\n"
"Returns: [int] 0 on success; -1 on error.\n" },
{ "domain_disable_migrate",
diff --git a/tools/python/xen/xend/XendConfig.py b/tools/python/xen/xend/XendConfig.py
index 0eadf343d3..3227cd4def 100644
--- a/tools/python/xen/xend/XendConfig.py
+++ b/tools/python/xen/xend/XendConfig.py
@@ -163,7 +163,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
'vncdisplay': int,
'vnclisten': str,
'timer_mode': int,
- 'tsc_native': int,
+ 'tsc_mode': int,
'vpt_align': int,
'viridian': int,
'vncpasswd': str,
@@ -477,8 +477,8 @@ class XendConfig(dict):
if not os.path.exists(self['platform']['device_model']):
raise VmError("device model '%s' not found" % str(self['platform']['device_model']))
- if 'tsc_native' not in self['platform']:
- self['platform']['tsc_native'] = 0
+ if 'tsc_mode' not in self['platform']:
+ self['platform']['tsc_mode'] = 0
if 'nomigrate' not in self['platform']:
self['platform']['nomigrate'] = 0
diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
index 592ba6fad8..8198228b9f 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -2468,9 +2468,9 @@ class XendDomainInfo:
self._recreateDom()
# Set TSC mode of domain
- tsc_native = self.info["platform"].get("tsc_native")
- if arch.type == "x86" and tsc_native is not None:
- xc.domain_set_tsc_native(self.domid, int(tsc_native))
+ tsc_mode = self.info["platform"].get("tsc_mode")
+ if arch.type == "x86" and tsc_mode is not None:
+ xc.domain_set_tsc_info(self.domid, int(tsc_mode))
# Set timer configuration of domain
timer_mode = self.info["platform"].get("timer_mode")
diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py
index d6485a59a1..32b19235e1 100644
--- a/tools/python/xen/xm/create.py
+++ b/tools/python/xen/xm/create.py
@@ -221,9 +221,9 @@ gopts.var('timer_mode', val='TIMER_MODE',
use="""Timer mode (0=delay virtual time when ticks are missed;
1=virtual time is always wallclock time.""")
-gopts.var('tsc_native', val='TSC_NATIVE',
+gopts.var('tsc_mode', val='TSC_MODE',
fn=set_int, default=0,
- use="""TSC mode (0=emulate TSC, 1=native TSC).""")
+ use="""TSC mode (0=default, 1=always emulate, 2=never emulate, 3=pvrdtscp).""")
gopts.var('nomigrate', val='NOMIGRATE',
fn=set_int, default=0,
@@ -738,8 +738,8 @@ def configure_image(vals):
if vals.suppress_spurious_page_faults:
config_image.append(['suppress_spurious_page_faults', vals.suppress_spurious_page_faults])
- if vals.tsc_native is not None:
- config_image.append(['tsc_native', vals.tsc_native])
+ if vals.tsc_mode is not None:
+ config_image.append(['tsc_mode', vals.tsc_mode])
if vals.nomigrate is not None:
config_image.append(['nomigrate', vals.nomigrate])
@@ -1036,7 +1036,7 @@ def make_config(vals):
config.append([n, v])
map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
- 'restart', 'on_poweroff', 'tsc_native', 'nomigrate',
+ 'restart', 'on_poweroff', 'tsc_mode', 'nomigrate',
'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features',
'on_xend_start', 'on_xend_stop', 'target', 'cpuid',
'cpuid_check', 'machine_address_size', 'suppress_spurious_page_faults'])
diff --git a/tools/python/xen/xm/xenapi_create.py b/tools/python/xen/xm/xenapi_create.py
index 9cfdb87d7e..4c0177b4aa 100644
--- a/tools/python/xen/xm/xenapi_create.py
+++ b/tools/python/xen/xm/xenapi_create.py
@@ -1108,7 +1108,7 @@ class sxp2xml:
'pci_msitranslate',
'pci_power_mgmt',
'xen_platform_pci',
- 'tsc_native'
+ 'tsc_mode',
'description',
'nomigrate'
]
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index c851209853..84493aba43 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -520,6 +520,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
}
+ /* initialize default tsc behavior in case tools don't */
+ tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
spin_lock_init(&d->arch.vtsc_lock);
return 0;
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 52f3945f06..4b0011a52c 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1101,9 +1101,10 @@ long arch_do_domctl(
}
break;
- case XEN_DOMCTL_set_tsc_native:
+ case XEN_DOMCTL_gettscinfo:
{
struct domain *d;
+ xen_guest_tsc_info_t info = { 0 }; /* zeroed so 'pad' and any field tsc_get_info() leaves unset never reach copy_to_guest() uninitialised */
ret = -ESRCH;
d = rcu_lock_domain_by_id(domctl->domain);
@@ -1111,9 +1112,34 @@ long arch_do_domctl(
break;
domain_pause(d);
- d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
- if ( is_hvm_domain(d) )
- hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+ tsc_get_info(d, &info.tsc_mode,
+ &info.elapsed_nsec,
+ &info.gtsc_khz,
+ &info.incarnation);
+ if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) )
+ ret = -EFAULT;
+ else
+ ret = 0;
+ domain_unpause(d);
+
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_DOMCTL_settscinfo:
+ {
+ struct domain *d;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ domain_pause(d);
+ tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode,
+ domctl->u.tsc_info.info.elapsed_nsec,
+ domctl->u.tsc_info.info.gtsc_khz,
+ domctl->u.tsc_info.info.incarnation);
domain_unpause(d);
rcu_unlock_domain(d);
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index f4d342a52f..ca567638ae 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1831,7 +1831,7 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) )
return;
- if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
+ if ( cpuid_hypervisor_leaves(input, count, eax, ebx, ecx, edx) )
return;
domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 982ce7ca52..e3652a9d45 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -34,6 +34,7 @@
#include <asm/hpet.h>
#include <io_ports.h>
#include <asm/setup.h> /* for early_time_init */
+#include <public/arch-x86/cpuid.h>
/* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
static char __initdata opt_clocksource[10];
@@ -45,10 +46,12 @@ unsigned long pit0_ticks;
static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
static DEFINE_SPINLOCK(wc_lock);
+/* moved to <asm/domain.h>
struct time_scale {
int shift;
u32 mul_frac;
};
+*/
struct cpu_time {
u64 local_tsc_stamp;
@@ -150,13 +153,32 @@ static inline u64 scale_delta(u64 delta, struct time_scale *scale)
return product;
}
+#define _TS_SHIFT_IDENTITY 1
+#define _TS_MUL_FRAC_IDENTITY 0x80000000UL
+#define _TS_IDENTITY { _TS_SHIFT_IDENTITY, _TS_MUL_FRAC_IDENTITY }
+/*
+ * Return 1 when *ts holds exactly the identity pair
+ * (_TS_SHIFT_IDENTITY, _TS_MUL_FRAC_IDENTITY), and 0 otherwise.
+ */
+static inline int time_scale_is_identity(struct time_scale *ts)
+{
+    return (ts->shift == _TS_SHIFT_IDENTITY) &&
+           (ts->mul_frac == _TS_MUL_FRAC_IDENTITY);
+}
+
+/* Overwrite *ts with the identity shift/multiplier pair. */
+static inline void set_time_scale_identity(struct time_scale *ts)
+{
+    /* _TS_IDENTITY expands to { shift, mul_frac } in field order. */
+    *ts = (struct time_scale)_TS_IDENTITY;
+}
+
/* Compute the reciprocal of the given time_scale. */
static inline struct time_scale scale_reciprocal(struct time_scale scale)
{
struct time_scale reciprocal;
u32 dividend;
- dividend = 0x80000000u;
+ ASSERT(scale.mul_frac != 0);
+ dividend = _TS_MUL_FRAC_IDENTITY;
reciprocal.shift = 1 - scale.shift;
while ( unlikely(dividend >= scale.mul_frac) )
{
@@ -818,6 +840,8 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
struct cpu_time *t;
struct vcpu_time_info *u, _u;
XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
+ struct domain *d = v->domain;
+ s_time_t tsc_stamp = 0;
if ( v->vcpu_info == NULL )
return;
@@ -825,20 +849,31 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
t = &this_cpu(cpu_time);
u = &vcpu_info(v, time);
+ if ( d->arch.vtsc )
+ {
+ tsc_stamp = t->stime_local_stamp - d->arch.vtsc_offset;
+ if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+ tsc_stamp = scale_delta(tsc_stamp, &d->arch.ns_to_vtsc);
+ }
+ else
+ tsc_stamp = t->local_tsc_stamp;
+
+ if ( d->arch.tsc_mode == TSC_MODE_PVRDTSCP &&
+ boot_cpu_has(X86_FEATURE_RDTSCP) )
+ write_rdtscp_aux(d->arch.incarnation);
+
/* Don't bother unless timestamps have changed or we are forced. */
- if ( !force && (u->tsc_timestamp == (v->domain->arch.vtsc
- ? t->stime_local_stamp
- : t->local_tsc_stamp)) )
+ if ( !force && (u->tsc_timestamp == tsc_stamp) )
return;
memset(&_u, 0, sizeof(_u));
- if ( v->domain->arch.vtsc )
+ if ( d->arch.vtsc )
{
- _u.tsc_timestamp = t->stime_local_stamp;
+ _u.tsc_timestamp = tsc_stamp;
_u.system_time = t->stime_local_stamp;
- _u.tsc_to_system_mul = 0x80000000u;
- _u.tsc_shift = 1;
+ _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
+ _u.tsc_shift = d->arch.vtsc_to_ns.shift;
}
else
{
@@ -1556,7 +1591,7 @@ static void tsc_check_slave(void *unused)
local_irq_enable();
}
-static void tsc_check_reliability(void)
+void tsc_check_reliability(void)
{
unsigned int cpu = smp_processor_id();
static DEFINE_SPINLOCK(lock);
@@ -1583,57 +1618,245 @@ static void tsc_check_reliability(void)
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs)
{
s_time_t now = get_s_time();
+ struct domain *d = v->domain;
- spin_lock(&v->domain->arch.vtsc_lock);
+ spin_lock(&d->arch.vtsc_lock);
if ( guest_kernel_mode(v, regs) )
- v->domain->arch.vtsc_kerncount++;
+ d->arch.vtsc_kerncount++;
else
- v->domain->arch.vtsc_usercount++;
+ d->arch.vtsc_usercount++;
- if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 )
- v->domain->arch.vtsc_last = now;
+ if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
+ d->arch.vtsc_last = now;
else
- now = ++v->domain->arch.vtsc_last;
+ now = ++d->arch.vtsc_last;
- spin_unlock(&v->domain->arch.vtsc_lock);
+ spin_unlock(&d->arch.vtsc_lock);
+
+ now = now - d->arch.vtsc_offset;
+ if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+ now = scale_delta(now, &d->arch.ns_to_vtsc);
regs->eax = (uint32_t)now;
regs->edx = (uint32_t)(now >> 32);
}
+/*
+ * Decide whether the host TSC may be exposed natively to a guest.
+ *
+ * Returns 1 if the CPU advertises X86_FEATURE_TSC_RELIABLE, if only one
+ * CPU is online, or if the TSC is constant-rate, max_cstate is at most 2
+ * and the recorded maximum warp is zero. Returns 0 in every other case.
+ */
+static int host_tsc_is_safe(void)
+{
+    extern unsigned int max_cstate;
+
+    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) || num_online_cpus() == 1 )
+        return 1;
+
+    if ( !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) || max_cstate > 2 )
+        return 0;
+
+    /* Run the warp check once if it has never been run. */
+    if ( !tsc_check_count )
+        tsc_check_reliability();
+
+    return tsc_max_warp == 0;
+}
+
+/*
+ * Fill the output registers of the Xen time CPUID leaf for the current
+ * domain, selected by sub_idx:
+ *   0 - eax: bit0 = TSC emulated (vtsc), bit1 = host TSC deemed safe,
+ *            bit2 = host has RDTSCP; ebx: tsc_mode; ecx: tsc_khz
+ *   1 - eax/ebx: low/high halves of this CPU's local_tsc_stamp;
+ *       ecx: tsc_scale.mul_frac
+ *   2 - eax/ebx: low/high halves of this CPU's stime_local_stamp;
+ *       ecx: tsc_scale.shift
+ *   3 - eax: physical cpu_khz; ebx = ecx = 0
+ * edx always receives the domain's incarnation count. For any other
+ * sub_idx the outputs are left untouched.
+ */
+void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
+                     uint32_t *ecx, uint32_t *edx)
+{
+    struct domain *d = current->domain;
+    struct cpu_time *t;
+
+    t = &this_cpu(cpu_time);
+    switch ( sub_idx )
+    {
+    case 0: /* features */
+        *eax = ( ( (!!d->arch.vtsc) << 0 ) |
+                 ( (!!host_tsc_is_safe()) << 1 ) |
+                 ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) |
+                 0 );
+        *ebx = d->arch.tsc_mode;
+        *ecx = d->arch.tsc_khz;
+        *edx = d->arch.incarnation;
+        break;
+    case 1: /* pvclock group1 */ /* FIXME are these right? */
+        *eax = (uint32_t)t->local_tsc_stamp;
+        *ebx = (uint32_t)(t->local_tsc_stamp >> 32);
+        *ecx = t->tsc_scale.mul_frac;
+        *edx = d->arch.incarnation;
+        break;
+    case 2: /* pvclock scaling values */ /* FIXME are these right? */
+        *eax = (uint32_t)t->stime_local_stamp;
+        *ebx = (uint32_t)(t->stime_local_stamp >> 32);
+        *ecx = t->tsc_scale.shift;
+        *edx = d->arch.incarnation;
+        /* Must not fall through: case 3 would overwrite eax/ebx/ecx. */
+        break;
+    case 3: /* physical cpu_khz */
+        *eax = cpu_khz;
+        *ebx = *ecx = 0;
+        *edx = d->arch.incarnation;
+        break;
+    }
+}
+
+/*
+ * called to collect tsc-related data only for save file or live
+ * migrate; called after last rdtsc is done on this incarnation
+ *
+ * All four outputs are always written:
+ *  *tsc_mode     - d->arch.tsc_mode
+ *  *incarnation  - d->arch.incarnation
+ *  *elapsed_nsec - elapsed guest time in ns for the emulated/default/
+ *                  pvrdtscp modes, 0 for never-emulate or an unknown mode
+ *  *gtsc_khz     - guest TSC rate in kHz (1000000 for always-emulate),
+ *                  0 for never-emulate or an unknown mode
+ */
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
+                  uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
+                  uint32_t *incarnation)
+{
+    *incarnation = d->arch.incarnation;
+    switch ( *tsc_mode = d->arch.tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        *elapsed_nsec = *gtsc_khz = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+        *gtsc_khz = 1000000UL;
+        break;
+    case TSC_MODE_DEFAULT:
+        if ( d->arch.vtsc )
+        {
+            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+            *gtsc_khz = d->arch.tsc_khz;
+        } else {
+            /* Native TSC: convert the raw counter to ns with vtsc_to_ns. */
+            uint64_t tsc = 0;
+            rdtscll(tsc);
+            *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns);
+            *gtsc_khz = cpu_khz;
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz = d->arch.tsc_khz;
+        break;
+    default:
+        /*
+         * tsc_set_info() stores whatever mode it is given, so an
+         * unrecognised value can reach here; never hand back whatever
+         * the caller's buffer happened to contain.
+         */
+        *elapsed_nsec = *gtsc_khz = 0;
+        break;
+    }
+}
+
+/*
+ * Configure TSC handling for domain d.
+ *
+ * This may be called as many as three times for a domain, once when the
+ * hypervisor creates the domain, once when the toolstack creates the
+ * domain and, if restoring/migrating, once when saved/migrated values
+ * are restored. Care must be taken that, if multiple calls occur,
+ * only the last "sticks" and all are completed before the guest executes
+ * an rdtsc instruction
+ *
+ *  tsc_mode     - one of TSC_MODE_*; stored in d->arch.tsc_mode as given
+ *  elapsed_nsec - subtracted from get_s_time() to form vtsc_offset on the
+ *                 paths that set it
+ *  gtsc_khz     - guest TSC rate in kHz; 0 selects cpu_khz on the native
+ *                 default-mode path
+ *  incarnation  - d->arch.incarnation becomes this value plus one
+ *
+ * Domain 0 and DOMID_INVALID get a non-emulated TSC and nothing else is
+ * touched for them.
+ */
+void tsc_set_info(struct domain *d,
+                  uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation)
+{
+    if ( d->domain_id == 0 || d->domain_id == DOMID_INVALID )
+    {
+        d->arch.vtsc = 0;
+        return;
+    }
+    switch ( d->arch.tsc_mode = tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        gdprintk(XENLOG_G_INFO, "%s: never emulating TSC\n",__func__);
+        d->arch.vtsc = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        gdprintk(XENLOG_G_INFO, "%s: always emulating TSC\n",__func__);
+        d->arch.vtsc = 1;
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        set_time_scale_identity(&d->arch.vtsc_to_ns);
+        /*
+         * NOTE(review): ns_to_vtsc is left as-is on this path although the
+         * emulated-rdtsc code consults it; this appears to rely on an
+         * earlier default-mode call having set it - confirm.
+         */
+        break;
+    case TSC_MODE_DEFAULT:
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        if ( (host_tsc_is_safe() && incarnation == 0) || !d->domain_id )
+        {
+            gdprintk(XENLOG_G_INFO, "%s: using safe native TSC\n",__func__);
+            /* use native TSC if initial host supports it */
+            d->arch.vtsc = 0;
+            d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
+            set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
+            set_time_scale_identity(&d->arch.ns_to_vtsc);
+        } else if ( gtsc_khz != 0 && gtsc_khz != 1000000UL ) {
+            gdprintk(XENLOG_G_INFO, "%s: safe native TSC on initial host,"
+                     "but now using emulation\n",__func__);
+            /* was native on initial host, now emulated at initial tsc hz*/
+            d->arch.vtsc = 1;
+            d->arch.tsc_khz = gtsc_khz;
+            set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+            d->arch.ns_to_vtsc =
+                scale_reciprocal(d->arch.vtsc_to_ns);
+        } else {
+            gdprintk(XENLOG_G_INFO, "%s: unsafe TSC on initial host,"
+                     "using emulation\n",__func__);
+            d->arch.vtsc = 1;
+            set_time_scale_identity(&d->arch.vtsc_to_ns);
+            set_time_scale_identity(&d->arch.ns_to_vtsc);
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        gdprintk(XENLOG_G_INFO, "%s: using PVRDTSCP\n",__func__);
+        if ( boot_cpu_has(X86_FEATURE_RDTSCP) && gtsc_khz != 0 ) {
+            d->arch.vtsc = 0;
+            set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+        } else {
+            /* No usable rdtscp or no saved rate: emulate instead. */
+            d->arch.vtsc = 1;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+            set_time_scale_identity(&d->arch.vtsc_to_ns);
+        }
+        break;
+    }
+    d->arch.incarnation = incarnation + 1;
+    /* HVM guests must trap rdtsc whenever it is emulated or rescaled. */
+    if ( is_hvm_domain(d) )
+        hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+}
+
/* vtsc may incur measurable performance degradation, diagnose with this */
static void dump_softtsc(unsigned char key)
{
struct domain *d;
int domcnt = 0;
+ extern unsigned int max_cstate;
tsc_check_reliability();
if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
printk("TSC marked as reliable, "
"warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
- printk("TSC marked as constant but not reliable, "
- "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
- else
+ {
+ printk("TSC has constant rate, ");
+ if (max_cstate <= 2 && tsc_max_warp == 0)
+ printk("no deep Cstates, passed warp test, deemed reliable, ");
+ else
+ printk("deep Cstates possible, so not reliable, ");
+ printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+ } else
printk("TSC not marked as either constant or reliable, "
- "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+ "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
for_each_domain ( d )
{
+ if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
+ continue;
+ printk("dom%u%s: mode=%d",d->domain_id,
+ is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
+ if ( d->arch.vtsc_offset )
+ printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
+ if ( d->arch.tsc_khz )
+ printk(",khz=%"PRIu32"",d->arch.tsc_khz);
+ if ( d->arch.incarnation )
+ printk(",inc=%"PRIu32"",d->arch.incarnation);
if ( !d->arch.vtsc )
+ {
+ printk("\n");
continue;
+ }
if ( is_hvm_domain(d) )
- printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
- d->domain_id, d->arch.vtsc_kerncount);
+ printk(",vtsc count: %"PRIu64" total\n",
+ d->arch.vtsc_kerncount);
else
- printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
- d->domain_id, d->arch.vtsc_kerncount,
- d->arch.vtsc_usercount);
+ printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+ d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
domcnt++;
}
if ( !domcnt )
- printk("All domains have native TSC\n");
+ printk("No domains have emulated TSC\n");
}
static struct keyhandler dump_softtsc_keyhandler = {
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index e42420c8a1..174dc25af4 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -679,8 +679,8 @@ int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
return 1;
}
-int cpuid_hypervisor_leaves(
- uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
struct domain *d = current->domain;
/* Optionally shift out of the way of Viridian architectural leaves. */
@@ -693,7 +693,7 @@ int cpuid_hypervisor_leaves(
switch ( idx )
{
case 0:
- *eax = base + 2; /* Largest leaf */
+ *eax = base + 3; /* Largest leaf */
*ebx = XEN_CPUID_SIGNATURE_EBX;
*ecx = XEN_CPUID_SIGNATURE_ECX;
*edx = XEN_CPUID_SIGNATURE_EDX;
@@ -717,6 +717,11 @@ int cpuid_hypervisor_leaves(
*ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
break;
+ case 3:
+ *eax = *ebx = *ecx = *edx = 0;
+ cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
+ break;
+
default:
BUG();
}
@@ -735,7 +740,7 @@ static void pv_cpuid(struct cpu_user_regs *regs)
if ( current->domain->domain_id != 0 )
{
- if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
+ if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
goto out;
}
@@ -815,7 +820,7 @@ static void pv_cpuid(struct cpu_user_regs *regs)
a = b = c = d = 0;
break;
default:
- (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
+ (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
break;
}
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 3c122b5d44..a80a3ede5c 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -230,6 +230,11 @@ struct domain_mca_msrs
spinlock_t lock;
};
+struct time_scale {
+ int shift;
+ u32 mul_frac;
+};
+
struct arch_domain
{
#ifdef CONFIG_X86_64
@@ -298,10 +303,17 @@ struct arch_domain
/* For Guest vMCA handling */
struct domain_mca_msrs vmca_msrs;
- /* SoftTSC emulation */
- bool_t vtsc;
- s_time_t vtsc_last;
+ /* TSC management (emulation, pv, scaling, stats) */
+ int tsc_mode; /* see include/asm-x86/time.h */
+ bool_t vtsc; /* tsc is emulated (may change after migrate) */
+ s_time_t vtsc_last; /* previous TSC value (guarantee monotonicity) */
spinlock_t vtsc_lock;
+ uint64_t vtsc_offset; /* adjustment for save/restore/migrate */
+ uint32_t tsc_khz; /* cached khz for certain emulated cases */
+ struct time_scale vtsc_to_ns; /* scaling for certain emulated cases */
+ struct time_scale ns_to_vtsc; /* scaling for certain emulated cases */
+ uint32_t incarnation; /* incremented every restore or live migrate
+ (possibly other cases in the future */
uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
uint64_t vtsc_usercount; /* not used for hvm */
} __cacheline_aligned;
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index 56bb080f52..a65f080569 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -84,6 +84,8 @@ static inline void wrmsrl(unsigned int msr, __u64 val)
#define write_tsc(val) wrmsrl(MSR_IA32_TSC, val)
+#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+
#define rdpmc(counter,low,high) \
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 7b09adecd6..628965ae3e 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -550,8 +550,8 @@ asmlinkage void do_machine_check(struct cpu_user_regs *regs);
void cpu_mcheck_distribute_cmci(void);
void cpu_mcheck_disable(void);
-int cpuid_hypervisor_leaves(
- uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val);
int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val);
diff --git a/xen/include/asm-x86/time.h b/xen/include/asm-x86/time.h
index c72cb8e506..6dd071d726 100644
--- a/xen/include/asm-x86/time.h
+++ b/xen/include/asm-x86/time.h
@@ -4,6 +4,24 @@
#include <asm/msr.h>
+/*
+ * PV TSC emulation modes:
+ * 0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
+ * and emulated otherwise (with frequency scaled if necessary)
+ * 1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
+ * 2 = guest rdtsc always executed natively (no monotonicity/frequency
+ * guarantees); guest rdtscp emulated at native frequency if
+ * unsupported by h/w, else executed natively
+ * 3 = same as 2, except xen manages TSC_AUX register so guest can
+ * determine when a restore/migration has occurred and assumes
+ * guest obtains/uses pvclock-like mechanism to adjust for
+ * monotonicity and frequency changes
+ */
+#define TSC_MODE_DEFAULT 0
+#define TSC_MODE_ALWAYS_EMULATE 1
+#define TSC_MODE_NEVER_EMULATE 2
+#define TSC_MODE_PVRDTSCP 3
+
void calibrate_tsc_bp(void);
void calibrate_tsc_ap(void);
@@ -43,6 +61,16 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns);
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+ uint32_t gtsc_khz, uint32_t incarnation);
+
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz, uint32_t *incarnation);
+
+
void force_update_vcpu_system_time(struct vcpu *v);
+void cpuid_time_leaf(uint32_t sub_idx, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
#endif /* __X86_TIME_H__ */
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index cac3477671..88b19a4ffe 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -401,11 +401,6 @@ struct xen_domctl_settimeoffset {
typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
-#define XEN_DOMCTL_set_tsc_native 57
-typedef struct xen_domctl_set_tsc_native {
- uint32_t is_native; /* IN: 0: TSC is emulated; 1: TSC is host TSC */
-} xen_domctl_set_tsc_native_t;
-
#define XEN_DOMCTL_gethvmcontext 33
#define XEN_DOMCTL_sethvmcontext 34
typedef struct xen_domctl_hvmcontext {
@@ -656,6 +651,22 @@ typedef struct xen_domctl_disable_migrate {
} xen_domctl_disable_migrate_t;
+#define XEN_DOMCTL_gettscinfo 59
+#define XEN_DOMCTL_settscinfo 60
+struct xen_guest_tsc_info { /* TSC state exchanged by XEN_DOMCTL_gettscinfo / XEN_DOMCTL_settscinfo */
+ uint32_t tsc_mode; /* TSC_MODE_* value (see asm-x86/time.h) */
+ uint32_t gtsc_khz; /* guest TSC rate in kHz; 0 makes the native default-mode path use cpu_khz */
+ uint32_t incarnation; /* restore/migrate count; tsc_set_info() stores this value plus one */
+ uint32_t pad; /* not referenced by the visible code; presumably alignment padding - confirm */
+ uint64_t elapsed_nsec; /* elapsed guest time in ns; tsc_set_info() subtracts it from get_s_time() to form vtsc_offset */
+};
+typedef struct xen_domctl_tsc_info {
+ XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
+ xen_guest_tsc_info_t info; /* IN */
+} xen_domctl_tsc_info_t;
+
#define XEN_DOMCTL_gdbsx_guestmemio 1000 /* guest mem io */
struct xen_domctl_gdbsx_memio {
uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
@@ -705,8 +716,8 @@ struct xen_domctl {
struct xen_domctl_hypercall_init hypercall_init;
struct xen_domctl_arch_setup arch_setup;
struct xen_domctl_settimeoffset settimeoffset;
- struct xen_domctl_set_tsc_native set_tsc_native;
struct xen_domctl_disable_migrate disable_migrate;
+ struct xen_domctl_tsc_info tsc_info;
struct xen_domctl_real_mode_area real_mode_area;
struct xen_domctl_hvmcontext hvmcontext;
struct xen_domctl_hvmcontext_partial hvmcontext_partial;