From 49f7c7364e0acbc44604e0315599782096eee522 Mon Sep 17 00:00:00 2001
From: "tdeegan@york.uk.xensource.com"
Date: Wed, 16 Aug 2006 17:02:35 +0100
Subject: Replace shadow pagetable code with shadow2.

---
 .hgtags                                  |   10 +
 tools/examples/xmexample.hvm             |    4 +
 tools/libxc/xc_domain.c                  |   13 +-
 tools/libxc/xc_hvm_build.c               |   13 +
 tools/libxc/xc_linux_build.c             |    2 +-
 tools/libxc/xc_linux_save.c              |   18 +-
 tools/libxc/xenctrl.h                    |    2 +
 tools/misc/xc_shadow.c                   |    2 +
 tools/python/xen/lowlevel/xc/xc.c        |   69 +
 tools/python/xen/xend/XendDomain.py      |   24 +
 tools/python/xen/xend/XendDomainInfo.py  |   47 +-
 tools/python/xen/xend/image.py           |   17 +
 tools/python/xen/xm/create.py            |    9 +-
 xen/arch/x86/Makefile                    |   16 +-
 xen/arch/x86/audit.c                     |  984 -------
 xen/arch/x86/dom0_ops.c                  |    2 +-
 xen/arch/x86/domain.c                    |   90 +-
 xen/arch/x86/domain_build.c              |   13 +-
 xen/arch/x86/hvm/hvm.c                   |   23 +-
 xen/arch/x86/hvm/platform.c              |    9 +-
 xen/arch/x86/hvm/svm/svm.c               |  259 +-
 xen/arch/x86/hvm/svm/vmcb.c              |    4 +-
 xen/arch/x86/hvm/vlapic.c                |    3 +-
 xen/arch/x86/hvm/vmx/vmcs.c              |   15 +-
 xen/arch/x86/hvm/vmx/vmx.c               |  218 +-
 xen/arch/x86/mm.c                        |  463 ++--
 xen/arch/x86/setup.c                     |    2 -
 xen/arch/x86/shadow.c                    | 4150 ---------
 xen/arch/x86/shadow2-common.c            | 3394 +++++++++++++++++++++++
 xen/arch/x86/shadow2.c                   | 4469 ++++++++++++++++++++++++++++++
 xen/arch/x86/shadow32.c                  | 3782 -------------------------
 xen/arch/x86/shadow_guest32.c            |   16 -
 xen/arch/x86/shadow_guest32pae.c         |   16 -
 xen/arch/x86/shadow_public.c             | 2143 --------------
 xen/arch/x86/smpboot.c                   |    2 +-
 xen/arch/x86/traps.c                     |   32 +-
 xen/arch/x86/x86_32/domain_page.c        |   31 +-
 xen/arch/x86/x86_32/mm.c                 |    3 +-
 xen/arch/x86/x86_64/mm.c                 |    3 +-
 xen/arch/x86/x86_64/traps.c              |   14 +-
 xen/common/acm_ops.c                     |    1 -
 xen/common/grant_table.c                 |    4 +-
 xen/common/keyhandler.c                  |   19 +-
 xen/common/memory.c                      |   11 +-
 xen/drivers/char/console.c               |   46 +-
 xen/include/asm-x86/bitops.h             |   18 +
 xen/include/asm-x86/config.h             |   20 +-
 xen/include/asm-x86/domain.h             |   93 +-
 xen/include/asm-x86/grant_table.h        |    2 +-
 xen/include/asm-x86/hvm/hvm.h            |   25 +
 xen/include/asm-x86/hvm/support.h        |   11 +-
 xen/include/asm-x86/hvm/vcpu.h           |    6 +
 xen/include/asm-x86/hvm/vmx/vmcs.h       |    1 +
 xen/include/asm-x86/hvm/vmx/vmx.h        |   49 +-
 xen/include/asm-x86/mm.h                 |  136 +-
 xen/include/asm-x86/msr.h                |    4 +
 xen/include/asm-x86/page-guest32.h       |    7 -
 xen/include/asm-x86/page.h               |   37 +-
 xen/include/asm-x86/perfc_defn.h         |   53 +
 xen/include/asm-x86/processor.h          |    1 +
 xen/include/asm-x86/shadow.h             | 1791 +-----------
 xen/include/asm-x86/shadow2-multi.h      |  116 +
 xen/include/asm-x86/shadow2-private.h    |  612 ++++
 xen/include/asm-x86/shadow2-types.h      |  705 +++++
 xen/include/asm-x86/shadow2.h            |  627 +++++
 xen/include/asm-x86/shadow_64.h          |  587 ----
 xen/include/asm-x86/shadow_ops.h         |  138 -
 xen/include/asm-x86/shadow_public.h      |   61 -
 xen/include/asm-x86/x86_32/page-2level.h |    1 +
 xen/include/asm-x86/x86_32/page-3level.h |    3 +-
 xen/include/asm-x86/x86_64/page.h        |    5 +-
 xen/include/public/dom0_ops.h            |   16 +-
 xen/include/xen/domain_page.h            |   13 +
 xen/include/xen/lib.h                    |    4 +-
 xen/include/xen/list.h                   |   10 +
 xen/include/xen/sched.h                  |    5 +-
 76 files changed, 11111 insertions(+), 14513 deletions(-)
 delete mode 100644 xen/arch/x86/audit.c
 delete mode 100644 xen/arch/x86/shadow.c
 create mode 100644 xen/arch/x86/shadow2-common.c
 create mode 100644 xen/arch/x86/shadow2.c
 delete mode 100644 xen/arch/x86/shadow32.c
 delete mode 100644 xen/arch/x86/shadow_guest32.c
 delete mode 100644 xen/arch/x86/shadow_guest32pae.c
 delete mode 100644 xen/arch/x86/shadow_public.c
 create mode 100644 xen/include/asm-x86/shadow2-multi.h
 create mode
100644 xen/include/asm-x86/shadow2-private.h create mode 100644 xen/include/asm-x86/shadow2-types.h create mode 100644 xen/include/asm-x86/shadow2.h delete mode 100644 xen/include/asm-x86/shadow_64.h delete mode 100644 xen/include/asm-x86/shadow_ops.h delete mode 100644 xen/include/asm-x86/shadow_public.h diff --git a/.hgtags b/.hgtags index b097c216b9..41fa5ab702 100644 --- a/.hgtags +++ b/.hgtags @@ -15,3 +15,13 @@ fb875591fd72e15c31879c0e9034d99b80225595 RELEASE-2.0.4 c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0 af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched +6e864d7de9db066f92bea505d256bfe286200fed last-code-review +a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline +bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review +fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest patch to unstable +8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline +2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline +0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline +88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline +5233c4b076b9aa073eff63508461b7bfa597737c mainline +fda70200da01b89d5339342df6c0db372369a16d mainline diff --git a/tools/examples/xmexample.hvm b/tools/examples/xmexample.hvm index 396274c860..dd07a3b90e 100644 --- a/tools/examples/xmexample.hvm +++ b/tools/examples/xmexample.hvm @@ -27,6 +27,10 @@ builder='hvm' # and modules. Allocating less than 32MBs is not recommended. memory = 128 +# Shadow pagetable memory for the domain, in MB. +# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu. +shadow_memory = 8 + # A name for your domain. All domains must have different names. name = "ExampleHVMDomain" diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c index 51117117f4..801e35ea08 100644 --- a/tools/libxc/xc_domain.c +++ b/tools/libxc/xc_domain.c @@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle, unsigned int sop, unsigned long *dirty_bitmap, unsigned long pages, - xc_shadow_control_stats_t *stats ) + unsigned long *mb, + uint32_t mode, + xc_shadow_control_stats_t *stats) { int rc; DECLARE_DOM0_OP; op.cmd = DOM0_SHADOW_CONTROL; op.u.shadow_control.domain = (domid_t)domid; op.u.shadow_control.op = sop; - set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap); op.u.shadow_control.pages = pages; + op.u.shadow_control.mb = mb ? *mb : 0; + op.u.shadow_control.mode = mode; + set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap); rc = do_dom0_op(xc_handle, &op); if ( stats ) memcpy(stats, &op.u.shadow_control.stats, sizeof(xc_shadow_control_stats_t)); + + if ( mb ) + *mb = op.u.shadow_control.mb; return (rc == 0) ? 
op.u.shadow_control.pages : rc; } @@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(int xc_handle, if ( err > 0 ) { - DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n", + DPRINTF("Failed allocation for dom %d: %ld pages order %d\n", domid, nr_extents, extent_order); errno = EBUSY; err = -1; diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c index d4799abc87..173c6733ee 100644 --- a/tools/libxc/xc_hvm_build.c +++ b/tools/libxc/xc_hvm_build.c @@ -396,6 +396,19 @@ static int xc_hvm_build_internal(int xc_handle, goto error_out; } + /* HVM domains must be put into shadow2 mode at the start of day */ + if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE, + NULL, 0, NULL, + DOM0_SHADOW2_CONTROL_FLAG_ENABLE + | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT + | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE + | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL, + NULL) ) + { + PERROR("Could not enable shadow paging for domain.\n"); + goto error_out; + } + memset(ctxt, 0, sizeof(*ctxt)); ctxt->flags = VGCF_HVM_GUEST; diff --git a/tools/libxc/xc_linux_build.c b/tools/libxc/xc_linux_build.c index 9d7ea54a86..116429a729 100644 --- a/tools/libxc/xc_linux_build.c +++ b/tools/libxc/xc_linux_build.c @@ -972,7 +972,7 @@ static int setup_guest(int xc_handle, /* Enable shadow translate mode */ if ( xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE, - NULL, 0, NULL) < 0 ) + NULL, 0, NULL, 0, NULL) < 0 ) { PERROR("Could not enable translation mode"); goto error_out; diff --git a/tools/libxc/xc_linux_save.c b/tools/libxc/xc_linux_save.c index 8cf21dced5..49d212995e 100644 --- a/tools/libxc/xc_linux_save.c +++ b/tools/libxc/xc_linux_save.c @@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn, int i; xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN, - arr, max_pfn, NULL); + arr, max_pfn, NULL, 0, NULL); DPRINTF("#Flush\n"); for ( i = 0; i < 40; i++ ) { usleep(50000); now = llgettimeofday(); xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK, - NULL, 0, &stats); + NULL, 0, NULL, 0, &stats); DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n", @@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL ) < 0) { + NULL, 0, NULL, 0, NULL) < 0) { ERR("Couldn't enable shadow mode"); goto out; } @@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, but this is fast enough for the moment. 
*/ if (!last_iter && xc_shadow_control( xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK, - to_skip, max_pfn, NULL) != max_pfn) { + to_skip, max_pfn, NULL, 0, NULL) != max_pfn) { ERR("Error peeking shadow bitmap"); goto out; } @@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, (unsigned long)ctxt.user_regs.edx); } - if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN, - to_send, max_pfn, &stats ) != max_pfn) { + if (xc_shadow_control(xc_handle, dom, + DOM0_SHADOW_CONTROL_OP_CLEAN, to_send, + max_pfn, NULL, 0, &stats) != max_pfn) { ERR("Error flushing shadow PT"); goto out; } @@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, out: if (live) { - if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF, - NULL, 0, NULL ) < 0) { + if(xc_shadow_control(xc_handle, dom, + DOM0_SHADOW_CONTROL_OP_OFF, + NULL, 0, NULL, 0, NULL) < 0) { DPRINTF("Warning - couldn't disable shadow mode"); } } diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index 2d301b2c43..a66a11839d 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle, unsigned int sop, unsigned long *dirty_bitmap, unsigned long pages, + unsigned long *mb, + uint32_t mode, xc_shadow_control_stats_t *stats); int xc_bvtsched_global_set(int xc_handle, diff --git a/tools/misc/xc_shadow.c b/tools/misc/xc_shadow.c index 83c52ebc19..f0f60c9c5c 100644 --- a/tools/misc/xc_shadow.c +++ b/tools/misc/xc_shadow.c @@ -60,6 +60,8 @@ int main(int argc, char *argv[]) mode, NULL, 0, + NULL, + 0, NULL) < 0 ) { fprintf(stderr, "Error reseting performance counters: %d (%s)\n", diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c index 3e5a9624d8..2c55ca079f 100644 --- a/tools/python/xen/lowlevel/xc/xc.c +++ b/tools/python/xen/lowlevel/xc/xc.c @@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(XcObject *self, PyObject *args) "weight", weight); } +static PyObject *pyxc_shadow_control(PyObject *self, + PyObject *args, + PyObject *kwds) +{ + XcObject *xc = (XcObject *)self; + + uint32_t dom; + int op=0; + + static char *kwd_list[] = { "dom", "op", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, + &dom, &op) ) + return NULL; + + if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL) + < 0 ) + return PyErr_SetFromErrno(xc_error); + + Py_INCREF(zero); + return zero; +} + +static PyObject *pyxc_shadow_mem_control(PyObject *self, + PyObject *args, + PyObject *kwds) +{ + XcObject *xc = (XcObject *)self; + int op; + uint32_t dom; + int mbarg = -1; + unsigned long mb; + + static char *kwd_list[] = { "dom", "mb", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, + &dom, &mbarg) ) + return NULL; + + if ( mbarg < 0 ) + op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION; + else + { + mb = mbarg; + op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION; + } + if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 ) + return PyErr_SetFromErrno(xc_error); + + mbarg = mb; + return Py_BuildValue("i", mbarg); +} + static PyObject *pyxc_sched_credit_domain_set(XcObject *self, PyObject *args, PyObject *kwds) @@ -1119,6 +1172,22 @@ static PyMethodDef pyxc_methods[] = { "Returns [dict]: information about Xen" " [None]: on failure.\n" }, + { "shadow_control", + (PyCFunction)pyxc_shadow_control, + METH_VARARGS | METH_KEYWORDS, "\n" + "Set parameter for shadow pagetable interface\n" + " dom [int]: 
Identifier of domain.\n" + " op [int, 0]: operation\n\n" + "Returns: [int] 0 on success; -1 on error.\n" }, + + { "shadow_mem_control", + (PyCFunction)pyxc_shadow_mem_control, + METH_VARARGS | METH_KEYWORDS, "\n" + "Set or read shadow pagetable memory use\n" + " dom [int]: Identifier of domain.\n" + " mb [int, -1]: MB of shadow memory this domain should have.\n\n" + "Returns: [int] MB of shadow memory in use by this domain.\n" }, + { "domain_setmaxmem", (PyCFunction)pyxc_domain_setmaxmem, METH_VARARGS, "\n" diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/XendDomain.py index 52cca550d4..c253dc2777 100644 --- a/tools/python/xen/xend/XendDomain.py +++ b/tools/python/xen/xend/XendDomain.py @@ -532,6 +532,30 @@ class XendDomain: except Exception, ex: raise XendError(str(ex)) + def domain_shadow_control(self, domid, op): + """Shadow page control.""" + dominfo = self.domain_lookup(domid) + try: + return xc.shadow_control(dominfo.getDomid(), op) + except Exception, ex: + raise XendError(str(ex)) + + def domain_shadow_mem_get(self, domid): + """Get shadow pagetable memory allocation.""" + dominfo = self.domain_lookup(domid) + try: + return xc.shadow_mem_control(dominfo.getDomid()) + except Exception, ex: + raise XendError(str(ex)) + + def domain_shadow_mem_set(self, domid, mb): + """Set shadow pagetable memory allocation.""" + dominfo = self.domain_lookup(domid) + try: + return xc.shadow_mem_control(dominfo.getDomid(), mb=mb) + except Exception, ex: + raise XendError(str(ex)) + def domain_sched_credit_get(self, domid): """Get credit scheduler parameters for a domain. """ diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py index 3bc69981e8..ab0554fccd 100644 --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -30,6 +30,7 @@ import string import time import threading import os +import math import xen.lowlevel.xc from xen.util import asserts @@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [ # don't come out of xc in the same form as they are specified in the config # file, so those are handled separately. ROUNDTRIPPING_CONFIG_ENTRIES = [ - ('uuid', str), - ('vcpus', int), - ('vcpu_avail', int), - ('cpu_weight', float), - ('memory', int), - ('maxmem', int), - ('bootloader', str), + ('uuid', str), + ('vcpus', int), + ('vcpu_avail', int), + ('cpu_weight', float), + ('memory', int), + ('shadow_memory', int), + ('maxmem', int), + ('bootloader', str), ('bootloader_args', str), - ('features', str), - ('localtime', int), + ('features', str), + ('localtime', int), ] ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS @@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS # entries written to the store that cannot be reconfigured on-the-fly. 
# VM_STORE_ENTRIES = [ - ('uuid', str), - ('vcpus', int), - ('vcpu_avail', int), - ('memory', int), - ('maxmem', int), - ('start_time', float), + ('uuid', str), + ('vcpus', int), + ('vcpu_avail', int), + ('memory', int), + ('shadow_memory', int), + ('maxmem', int), + ('start_time', float), ] VM_STORE_ENTRIES += VM_CONFIG_PARAMS @@ -572,6 +575,7 @@ class XendDomainInfo: defaultInfo('vcpu_avail', lambda: (1 << self.info['vcpus']) - 1) defaultInfo('memory', lambda: 0) + defaultInfo('shadow_memory', lambda: 0) defaultInfo('maxmem', lambda: 0) defaultInfo('bootloader', lambda: None) defaultInfo('bootloader_args', lambda: None) @@ -1280,7 +1284,18 @@ class XendDomainInfo: xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024) m = self.image.getDomainMemory(self.info['memory'] * 1024) - balloon.free(m) + + # get the domain's shadow memory requirement + sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0)) + if self.info['shadow_memory'] > sm: + sm = self.info['shadow_memory'] + + # Make sure there's enough RAM available for the domain + balloon.free(m + sm * 1024) + + # Set up the shadow memory + sm = xc.shadow_mem_control(self.domid, mb=sm) + self.info['shadow_memory'] = sm init_reservation = self.info['memory'] * 1024 if os.uname()[4] in ('ia64', 'ppc64'): diff --git a/tools/python/xen/xend/image.py b/tools/python/xen/xend/image.py index 64fb810944..268462c581 100644 --- a/tools/python/xen/xend/image.py +++ b/tools/python/xen/xend/image.py @@ -153,6 +153,12 @@ class ImageHandler: mem_kb += 4*1024; return mem_kb + def getDomainShadowMemory(self, mem_kb): + """@return The minimum shadow memory required, in KiB, for a domain + with mem_kb KiB of RAM.""" + # PV domains don't need any shadow memory + return 0 + def buildDomain(self): """Build the domain. Define in subclass.""" raise NotImplementedError() @@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler): extra_pages = int( math.ceil( extra_mb*1024 / page_kb )) return mem_kb + extra_pages * page_kb + def getDomainShadowMemory(self, mem_kb): + """@return The minimum shadow memory required, in KiB, for a domain + with mem_kb KiB of RAM.""" + if os.uname()[4] in ('ia64', 'ppc64'): + # Explicit shadow memory is not a concept + return 0 + else: + # 1MB per vcpu plus 4Kib/Mib of RAM. This is higher than + # the minimum that Xen would allocate if no value were given. 
+ return 1024 * self.vm.getVCpuCount() + mem_kb / 256 + def register_shutdown_watch(self): """ add xen store watch on control/shutdown """ self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \ diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py index 549018e209..6416aaab3f 100644 --- a/tools/python/xen/xm/create.py +++ b/tools/python/xen/xm/create.py @@ -158,6 +158,10 @@ gopts.var('maxmem', val='MEMORY', fn=set_int, default=None, use="Maximum domain memory in MB.") +gopts.var('shadow_memory', val='MEMORY', + fn=set_int, default=0, + use="Domain shadow memory in MB.") + gopts.var('cpu', val='CPU', fn=set_int, default=None, use="CPU to run the VCPU0 on.") @@ -666,8 +670,9 @@ def make_config(vals): if v: config.append([n, v]) - map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff', - 'on_reboot', 'on_crash', 'vcpus', 'features']) + map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory', + 'restart', 'on_poweroff', 'on_reboot', 'on_crash', + 'vcpus', 'features']) if vals.uuid is not None: config.append(['uuid', vals.uuid]) diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index aebee65e9c..e246594245 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -8,7 +8,6 @@ subdir-$(x86_32) += x86_32 subdir-$(x86_64) += x86_64 obj-y += apic.o -obj-y += audit.o obj-y += bitops.o obj-y += compat.o obj-y += delay.o @@ -41,12 +40,21 @@ obj-y += usercopy.o obj-y += x86_emulate.o ifneq ($(pae),n) -obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o +obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o else -obj-$(x86_32) += shadow32.o +obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o endif -obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o +obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \ + shadow2_g2_on_s3.o + +guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) +shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) +shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \ + -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1)) + +shadow2_%.o: shadow2.c $(HDRS) Makefile + $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@ obj-$(crash_debug) += gdbstub.o diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c deleted file mode 100644 index bacdb9cc50..0000000000 --- a/xen/arch/x86/audit.c +++ /dev/null @@ -1,984 +0,0 @@ -/****************************************************************************** - * arch/x86/audit.c - * - * Copyright (c) 2002-2005 K A Fraser - * Copyright (c) 2004 Christian Limpach - * Copyright (c) 2005 Michael A Fetterman - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* XXX SMP bug -- these should not be statics... */ -static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; -static int l1, l2, oos_count, page_count; - -#define FILE_AND_LINE 0 - -#if FILE_AND_LINE -#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__) -#define ADJUST_EXTRA_ARGS ,const char *file, int line -#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line) -#else -#define adjust _adjust -#define ADJUST_EXTRA_ARGS -#define APRINTK(_f, _a...) printk(_f "\n", ##_a) -#endif - -int audit_adjust_pgtables(struct domain *d, int dir, int noisy) -{ - int errors = 0; - int shadow_refcounts = !!shadow_mode_refcounts(d); - int shadow_enabled = !!shadow_mode_enabled(d); - - int l2limit( unsigned long mfn ) - { - - if ( shadow_mode_external(d) ) - return L2_PAGETABLE_ENTRIES; - -#ifdef __i386__ -#ifdef CONFIG_X86_PAE - /* 32b PAE */ - if ( (( mfn_to_page(mfn)->u.inuse.type_info & PGT_va_mask ) - >> PGT_va_shift) == 3 ) - return l2_table_offset(HYPERVISOR_VIRT_START); - else - return L2_PAGETABLE_ENTRIES; -#else - /* 32b non-PAE */ - return DOMAIN_ENTRIES_PER_L2_PAGETABLE; -#endif -#else - /* 64b */ - return 0; /* XXX x86/64 XXX */ -#endif - } - - void _adjust(struct page_info *page, int adjtype ADJUST_EXTRA_ARGS) - { - int count; - - if ( adjtype ) - { - /* adjust the type count */ - int tcount = page->u.inuse.type_info & PGT_count_mask; - tcount += dir; - ttot++; - - if ( page_get_owner(page) == NULL ) - { - APRINTK("adjust(mfn=%lx, dir=%d, adjtype=%d) owner=NULL", - page_to_mfn(page), dir, adjtype); - errors++; - } - - if ( tcount < 0 ) - { - APRINTK("Audit %d: type count went below zero " - "mfn=%lx t=%" PRtype_info " ot=%x", - d->domain_id, page_to_mfn(page), - page->u.inuse.type_info, - page->tlbflush_timestamp); - errors++; - } - else if ( (tcount & ~PGT_count_mask) != 0 ) - { - APRINTK("Audit %d: type count overflowed " - "mfn=%lx t=%" PRtype_info " ot=%x", - d->domain_id, page_to_mfn(page), - page->u.inuse.type_info, - page->tlbflush_timestamp); - errors++; - } - else - page->u.inuse.type_info += dir; - } - - /* adjust the general count */ - count = (page->count_info & PGC_count_mask) + dir; - ctot++; - - if ( count < 0 ) - { - APRINTK("Audit %d: general count went below zero " - "mfn=%lx t=%" PRtype_info " ot=%x", - d->domain_id, page_to_mfn(page), - page->u.inuse.type_info, - page->tlbflush_timestamp); - errors++; - } - else if ( (count & ~PGT_count_mask) != 0 ) - { - APRINTK("Audit %d: general count overflowed " - "mfn=%lx t=%" PRtype_info " ot=%x", - d->domain_id, page_to_mfn(page), - page->u.inuse.type_info, - page->tlbflush_timestamp); - errors++; - } - else - page->count_info += dir; - } - - void adjust_l2_page(unsigned long mfn, int shadow) - { - l2_pgentry_t *pt = map_domain_page(mfn); - int i; - u32 page_type; - - for ( i = 0; i < l2limit(mfn); i++ ) - { - if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT ) - { - unsigned long l1mfn = l2e_get_pfn(pt[i]); - struct page_info *l1page = mfn_to_page(l1mfn); - - if ( noisy ) - { - if ( shadow ) - { - if ( page_get_owner(l1page) != NULL ) - { - printk("L2: Bizarre shadow L1 page mfn=%lx " - "belonging to a domain %p (id=%d)\n", - l1mfn, - page_get_owner(l1page), - page_get_owner(l1page)->domain_id); - 
errors++; - continue; - } - - page_type = l1page->u.inuse.type_info & PGT_type_mask; - if ( page_type != PGT_l1_shadow ) - { - printk("Audit %d: [Shadow L2 mfn=%lx i=%x] " - "Expected Shadow L1 t=%" PRtype_info - " mfn=%lx\n", - d->domain_id, mfn, i, - l1page->u.inuse.type_info, l1mfn); - errors++; - } - } - else - { - if ( page_get_owner(l1page) != d ) - { - printk("L2: Skip bizarre L1 page mfn=%lx " - "belonging to other dom %p (id=%d)\n", - l1mfn, - page_get_owner(l1page), - (page_get_owner(l1page) - ? page_get_owner(l1page)->domain_id - : -1)); - errors++; - continue; - } - - page_type = l1page->u.inuse.type_info & PGT_type_mask; - if ( page_type == PGT_l2_page_table ) - { - printk("Audit %d: [%x] Found %s Linear PT " - "t=%" PRtype_info " mfn=%lx\n", - d->domain_id, i, (l1mfn==mfn) ? "Self" : "Other", - l1page->u.inuse.type_info, l1mfn); - } - else if ( page_type != PGT_l1_page_table ) - { - printk("Audit %d: [L2 mfn=%lx i=%x] " - "Expected L1 t=%" PRtype_info " mfn=%lx\n", - d->domain_id, mfn, i, - l1page->u.inuse.type_info, l1mfn); - errors++; - } - } - } - - adjust(l1page, !shadow); - } - } - - if ( shadow_mode_translate(d) && !shadow_mode_external(d) ) - { - unsigned long hl2mfn = - l2e_get_pfn(pt[l2_table_offset(LINEAR_PT_VIRT_START)]); - struct page_info *hl2page = mfn_to_page(hl2mfn); - adjust(hl2page, 0); - } - - unmap_domain_page(pt); - } - - void adjust_hl2_page(unsigned long hl2mfn) - { - l2_pgentry_t *pt = map_domain_page(hl2mfn); - int i; - - for ( i = 0; i < l2limit(hl2mfn); i++ ) - { - if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT ) - { - unsigned long mfn = l2e_get_pfn(pt[i]); - struct page_info *gpage = mfn_to_page(mfn); - - if ( mfn < 0x100 ) - { - lowmem_mappings++; - continue; - } - - if ( !mfn_valid(mfn) ) - { - io_mappings++; - continue; - } - - if ( noisy ) - { - if ( page_get_owner(gpage) != d ) - { - printk("Audit %d: [hl2mfn=%lx,i=%x] Skip foreign page " - "dom=%p (id=%d) mfn=%lx c=%08x t=%" - PRtype_info "\n", - d->domain_id, hl2mfn, i, - page_get_owner(gpage), - page_get_owner(gpage)->domain_id, - mfn, - gpage->count_info, - gpage->u.inuse.type_info); - continue; - } - } - adjust(gpage, 0); - } - } - - unmap_domain_page(pt); - } - - void adjust_l1_page(unsigned long l1mfn) - { - l1_pgentry_t *pt = map_domain_page(l1mfn); - int i; - - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - { - if ( l1e_get_flags(pt[i]) & _PAGE_PRESENT ) - { - unsigned long mfn = l1e_get_pfn(pt[i]); - struct page_info *gpage = mfn_to_page(mfn); - - if ( mfn < 0x100 ) - { - lowmem_mappings++; - continue; - } - - if ( !mfn_valid(mfn) ) - { - io_mappings++; - continue; - } - - if ( noisy ) - { - if ( l1e_get_flags(pt[i]) & _PAGE_RW ) - { - // If it's not a writable page, complain. - // - if ( !((gpage->u.inuse.type_info & PGT_type_mask) == - PGT_writable_page) ) - { - printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW " - "t=%" PRtype_info " mfn=%lx\n", - d->domain_id, l1mfn, i, - gpage->u.inuse.type_info, mfn); - errors++; - } - - if ( shadow_refcounts && - page_is_page_table(gpage) && - ! 
page_out_of_sync(gpage) ) - { - printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW of " - "page table mfn=%lx\n", - d->domain_id, l1mfn, i, mfn); - errors++; - } - } - - if ( page_get_owner(gpage) != d ) - { - printk("Audit %d: [l1mfn=%lx,i=%x] Skip foreign page " - "dom=%p (id=%d) mfn=%lx c=%08x t=%" - PRtype_info "\n", - d->domain_id, l1mfn, i, - page_get_owner(gpage), - page_get_owner(gpage)->domain_id, - mfn, - gpage->count_info, - gpage->u.inuse.type_info); - continue; - } - } - - adjust(gpage, (l1e_get_flags(pt[i]) & _PAGE_RW) ? 1 : 0); - } - } - - unmap_domain_page(pt); - } - - void adjust_shadow_tables(void) - { - struct shadow_status *a; - unsigned long smfn, gmfn; - struct page_info *page; - int i; - - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask); - smfn = a->smfn; - page = mfn_to_page(smfn); - - switch ( a->gpfn_and_flags & PGT_type_mask ) { - case PGT_writable_pred: - break; - case PGT_snapshot: - adjust(mfn_to_page(gmfn), 0); - break; - case PGT_l1_shadow: - adjust(mfn_to_page(gmfn), 0); - if ( shadow_refcounts ) - adjust_l1_page(smfn); - if ( page->u.inuse.type_info & PGT_pinned ) - adjust(page, 0); - break; - case PGT_hl2_shadow: - adjust(mfn_to_page(gmfn), 0); - if ( shadow_refcounts ) - adjust_hl2_page(smfn); - if ( page->u.inuse.type_info & PGT_pinned ) - adjust(page, 0); - break; - case PGT_l2_shadow: - adjust(mfn_to_page(gmfn), 0); - adjust_l2_page(smfn, 1); - if ( page->u.inuse.type_info & PGT_pinned ) - adjust(page, 0); - break; - default: - BUG(); - break; - } - - a = a->next; - } - } - } - - void adjust_oos_list(void) - { - struct out_of_sync_entry *oos; - - if ( (oos = d->arch.out_of_sync) ) - ASSERT(shadow_enabled); - - while ( oos ) - { - adjust(mfn_to_page(oos->gmfn), 0); - - // Only use entries that have low bits clear... 
- // - if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) - adjust(mfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0); - - if ( oos->snapshot_mfn != SHADOW_SNAPSHOT_ELSEWHERE ) - adjust(mfn_to_page(oos->snapshot_mfn), 0); - - oos = oos->next; - oos_count++; - } - } - - void adjust_for_pgtbase(void) - { - struct vcpu *v; - - for_each_vcpu(d, v) - { - if ( !pagetable_is_null(v->arch.guest_table) ) - adjust(mfn_to_page(pagetable_get_pfn(v->arch.guest_table)), - !shadow_mode_refcounts(d)); - if ( !pagetable_is_null(v->arch.shadow_table) ) - adjust(mfn_to_page(pagetable_get_pfn(v->arch.shadow_table)), - 0); - if ( v->arch.monitor_shadow_ref ) - adjust(mfn_to_page(v->arch.monitor_shadow_ref), 0); - } - } - - void adjust_guest_pages(void) - { - struct list_head *list_ent = d->page_list.next; - struct page_info *page; - unsigned long mfn, snapshot_mfn; - - while ( list_ent != &d->page_list ) - { - u32 page_type; - - page = list_entry(list_ent, struct page_info, list); - snapshot_mfn = mfn = page_to_mfn(page); - page_type = page->u.inuse.type_info & PGT_type_mask; - - BUG_ON(page_get_owner(page) != d); - - page_count++; - - if ( shadow_enabled && !shadow_refcounts && - page_out_of_sync(page) ) - { - unsigned long gpfn = mfn_to_gmfn(d, mfn); - ASSERT( VALID_M2P(gpfn) ); - snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot); - ASSERT( snapshot_mfn ); - } - - switch ( page_type ) - { - case PGT_l2_page_table: - l2++; - - if ( noisy ) - { - if ( shadow_refcounts ) - { - printk("Audit %d: found an L2 guest page " - "mfn=%lx t=%" PRtype_info " c=%08x while in shadow mode\n", - d->domain_id, mfn, page->u.inuse.type_info, - page->count_info); - errors++; - } - - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) - { - if ( (page->u.inuse.type_info & PGT_validated) != - PGT_validated ) - { - printk("Audit %d: L2 mfn=%lx not validated %" - PRtype_info "\n", - d->domain_id, mfn, page->u.inuse.type_info); - errors++; - } - - } - } - - if ( page->u.inuse.type_info & PGT_pinned ) - adjust(page, 1); - - if ( page->u.inuse.type_info & PGT_validated ) - adjust_l2_page(snapshot_mfn, 0); - - break; - - case PGT_l1_page_table: - l1++; - - if ( noisy ) - { - if ( shadow_refcounts ) - { - printk("found an L1 guest page mfn=%lx t=%" - PRtype_info " c=%08x " - "while in shadow mode\n", - mfn, page->u.inuse.type_info, page->count_info); - errors++; - } - - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) - { - if ( (page->u.inuse.type_info & PGT_validated) != - PGT_validated ) - { - printk("Audit %d: L1 not validated mfn=%lx t=%" - PRtype_info "\n", - d->domain_id, mfn, page->u.inuse.type_info); - errors++; - } - } - } - - if ( page->u.inuse.type_info & PGT_pinned ) - adjust(page, 1); - - if ( page->u.inuse.type_info & PGT_validated ) - adjust_l1_page(snapshot_mfn); - - break; - - case PGT_gdt_page: - ASSERT( !page_out_of_sync(page) ); - adjust(page, 1); - break; - - case PGT_ldt_page: - ASSERT( !page_out_of_sync(page) ); - adjust(page, 1); - break; - - case PGT_writable_page: - if ( shadow_refcounts ) - { - // In shadow mode, writable pages can get pinned by - // paravirtualized guests that think they are pinning - // their L1s and/or L2s. 
- // - if ( page->u.inuse.type_info & PGT_pinned ) - adjust(page, 1); - } - } - - list_ent = page->list.next; - } - } - - adjust_for_pgtbase(); - - adjust_guest_pages(); - - if ( shadow_enabled ) - { - adjust_oos_list(); - adjust_shadow_tables(); - } - - adjust(virt_to_page(d->shared_info), 1); - - return errors; -} - - -#ifndef NDEBUG - -void audit_pagelist(struct domain *d) -{ - struct list_head *list_ent; - int xenpages, totpages; - - list_ent = d->xenpage_list.next; - for ( xenpages = 0; (list_ent != &d->xenpage_list); xenpages++ ) - { - list_ent = list_ent->next; - } - list_ent = d->page_list.next; - for ( totpages = 0; (list_ent != &d->page_list); totpages++ ) - { - list_ent = list_ent->next; - } - - if ( xenpages != d->xenheap_pages || - totpages != d->tot_pages ) - { - printk("ARGH! dom %d: xen=%d %d, pages=%d %d\n", d->domain_id, - xenpages, d->xenheap_pages, - totpages, d->tot_pages ); - } -} - -void _audit_domain(struct domain *d, int flags) -{ - int shadow_refcounts = !!shadow_mode_refcounts(d); - - void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn, - unsigned long mfn) - { - struct page_info *page = mfn_to_page(mfn); - l1_pgentry_t *pt = map_domain_page(mfn); - int i; - - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - { - if ( (l1e_get_flags(pt[i]) & _PAGE_PRESENT) && - (l1e_get_pfn(pt[i]) == xmfn) ) - printk(" found dom=%d mfn=%lx t=%" PRtype_info " c=%08x " - "pt[i=%x]=%" PRIpte "\n", - d->domain_id, mfn, page->u.inuse.type_info, - page->count_info, i, l1e_get_intpte(pt[i])); - } - - unmap_domain_page(pt); - } - - void scan_for_pfn_in_grant_table(struct domain *d, unsigned xmfn) - { - int i; - struct active_grant_entry *act = d->grant_table->active; - - spin_lock(&d->grant_table->lock); - - for ( i = 0; i < NR_GRANT_ENTRIES; i++ ) - { - if ( act[i].pin && (act[i].frame == xmfn) ) - { - printk(" found active grant table entry i=%d dom=%d pin=%d\n", - i, act[i].domid, act[i].pin); - } - } - - spin_unlock(&d->grant_table->lock); - } - - void scan_for_pfn(struct domain *d, unsigned long xmfn) - { - scan_for_pfn_in_grant_table(d, xmfn); - - if ( !shadow_mode_enabled(d) ) - { - struct list_head *list_ent = d->page_list.next; - struct page_info *page; - - while ( list_ent != &d->page_list ) - { - page = list_entry(list_ent, struct page_info, list); - - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_l1_page_table: - case PGT_l2_page_table: - scan_for_pfn_in_mfn(d, xmfn, page_to_mfn(page)); - break; - default: - break; - } - - list_ent = page->list.next; - } - } - else - { - struct shadow_status *a; - int i; - - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - switch ( a->gpfn_and_flags & PGT_type_mask ) - { - case PGT_l1_shadow: - case PGT_l2_shadow: - case PGT_hl2_shadow: - scan_for_pfn_in_mfn(d, xmfn, a->smfn); - break; - case PGT_snapshot: - case PGT_writable_pred: - break; - default: - BUG(); - break; - } - a = a->next; - } - } - } - } - - void scan_for_pfn_remote(unsigned long xmfn) - { - struct domain *e; - for_each_domain ( e ) - scan_for_pfn( e, xmfn ); - } - - unsigned long mfn; - struct list_head *list_ent; - struct page_info *page; - int errors = 0; - - if ( (d != current->domain) && shadow_mode_translate(d) ) - { - printk("skipping audit domain of translated domain %d " - "from other context\n", - d->domain_id); - return; - } - - if ( d != current->domain ) - domain_pause(d); - - // Maybe we should just be using BIGLOCK? 
- // - if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) ) - shadow_lock(d); - - spin_lock(&d->page_alloc_lock); - - audit_pagelist(d); - - /* PHASE 0 */ - - list_ent = d->page_list.next; - while ( list_ent != &d->page_list ) - { - u32 page_type; - unsigned long pfn; - - page = list_entry(list_ent, struct page_info, list); - mfn = page_to_mfn(page); - page_type = page->u.inuse.type_info & PGT_type_mask; - - BUG_ON(page_get_owner(page) != d); - - if ( (page->u.inuse.type_info & PGT_count_mask) > - (page->count_info & PGC_count_mask) ) - { - printk("taf(%" PRtype_info ") > caf(%08x) mfn=%lx\n", - page->u.inuse.type_info, page->count_info, mfn); - errors++; - } - - if ( shadow_mode_refcounts(d) && - (page_type == PGT_writable_page) && - !(page->u.inuse.type_info & PGT_validated) ) - { - printk("shadow mode writable page not validated mfn=%lx " - "t=%" PRtype_info " c=%08x\n", - mfn, page->u.inuse.type_info, page->count_info); - errors++; - } - -#if 0 /* SYSV shared memory pages plus writeable files. */ - if ( page_type == PGT_writable_page && - (page->u.inuse.type_info & PGT_count_mask) > 1 ) - { - printk("writeable page with type count >1: " - "mfn=%lx t=%" PRtype_info " c=%08x\n", - mfn, - page->u.inuse.type_info, - page->count_info ); - errors++; - scan_for_pfn_remote(mfn); - } -#endif - - if ( page_type == PGT_none && - (page->u.inuse.type_info & PGT_count_mask) > 0 ) - { - printk("normal page with type count >0: mfn=%lx t=%" PRtype_info " c=%08x\n", - mfn, - page->u.inuse.type_info, - page->count_info ); - errors++; - } - - if ( page_out_of_sync(page) ) - { - if ( !page_is_page_table(page) ) - { - printk("out of sync page mfn=%lx is not a page table\n", mfn); - errors++; - } - pfn = mfn_to_gmfn(d, mfn); - if ( !__shadow_status(d, pfn, PGT_snapshot) ) - { - printk("out of sync page mfn=%lx doesn't have a snapshot\n", - mfn); - errors++; - } - if ( shadow_refcounts - ? (page_type != PGT_writable_page) - : !(page_type && (page_type <= PGT_l4_page_table)) ) - { - printk("out of sync page mfn=%lx has strange type " - "t=%" PRtype_info " c=%08x\n", - mfn, page->u.inuse.type_info, page->count_info); - errors++; - } - } - - /* Use tlbflush_timestamp to store original type_info. 
*/ - page->tlbflush_timestamp = page->u.inuse.type_info; - - list_ent = page->list.next; - } - - /* PHASE 1 */ - io_mappings = lowmem_mappings = 0; - - errors += audit_adjust_pgtables(d, -1, 1); - - if ( !(flags & AUDIT_QUIET) && - ((io_mappings > 0) || (lowmem_mappings > 0)) ) - printk("Audit %d: Found %d lowmem mappings and %d io mappings\n", - d->domain_id, lowmem_mappings, io_mappings); - - /* PHASE 2 */ - - list_ent = d->page_list.next; - while ( list_ent != &d->page_list ) - { - page = list_entry(list_ent, struct page_info, list); - mfn = page_to_mfn(page); - - switch ( page->u.inuse.type_info & PGT_type_mask) - { - case PGT_l1_page_table: - case PGT_l2_page_table: - case PGT_l3_page_table: - case PGT_l4_page_table: - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) - { - printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n", - d->domain_id, page->u.inuse.type_info, - page->tlbflush_timestamp, - page->count_info, mfn); - errors++; - scan_for_pfn_remote(mfn); - } - break; - case PGT_none: - case PGT_writable_page: - case PGT_gdt_page: - case PGT_ldt_page: - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) - { - printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n", - d->domain_id, page->u.inuse.type_info, - page->tlbflush_timestamp, - page->count_info, mfn); - //errors++; - } - break; - default: - BUG(); // XXX fix me... - } - - if ( (page->count_info & PGC_count_mask) != 1 ) - { - printk("Audit %d: gen count!=1 (c=%x) t=%" PRtype_info " ot=%x mfn=%lx\n", - d->domain_id, - page->count_info, - page->u.inuse.type_info, - page->tlbflush_timestamp, mfn ); - //errors++; - scan_for_pfn_remote(mfn); - } - - list_ent = page->list.next; - } - - if ( shadow_mode_enabled(d) ) - { - struct shadow_status *a; - struct page_info *page; - u32 page_type; - int i; - - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - page = mfn_to_page(a->smfn); - page_type = a->gpfn_and_flags & PGT_type_mask; - - switch ( page_type ) { - case PGT_l1_shadow: - case PGT_l2_shadow: - case PGT_hl2_shadow: - case PGT_snapshot: - if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) || - (page->count_info != 0) ) - { - printk("Audit %d: shadow page counts wrong " - "mfn=%lx t=%" PRtype_info " c=%08x\n", - d->domain_id, page_to_mfn(page), - page->u.inuse.type_info, - page->count_info); - printk("a->gpfn_and_flags=%"PRIx64"\n", - (u64)a->gpfn_and_flags); - errors++; - } - break; - case PGT_writable_pred: - // XXX - nothing to check? - break; - - default: - BUG(); - break; - } - - a = a->next; - } - } - } - - /* PHASE 3 */ - ctot = ttot = page_count = l1 = l2 = oos_count = 0; - - audit_adjust_pgtables(d, 1, 0); - -#if 0 - // This covers our sins of trashing the tlbflush_timestamps... - // - local_flush_tlb(); -#endif - - spin_unlock(&d->page_alloc_lock); - - if ( !(flags & AUDIT_QUIET) ) - printk("Audit dom%d Done. 
" - "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n", - d->domain_id, page_count, oos_count, l1, l2, ctot, ttot); - - if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) ) - shadow_unlock(d); - - if ( d != current->domain ) - domain_unpause(d); - - if ( errors && !(flags & AUDIT_ERRORS_OK) ) - BUG(); -} - -void audit_domains(void) -{ - struct domain *d; - for_each_domain ( d ) - audit_domain(d); -} - -void audit_domains_key(unsigned char key) -{ - audit_domains(); -} -#endif diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c index 214b0c58f8..0038112d63 100644 --- a/xen/arch/x86/dom0_ops.c +++ b/xen/arch/x86/dom0_ops.c @@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op, XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op) d = find_domain_by_id(op->u.shadow_control.domain); if ( d != NULL ) { - ret = shadow_mode_control(d, &op->u.shadow_control); + ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op); put_domain(d); copy_to_guest(u_dom0_op, op, 1); } diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 4dd71b1859..65e4dc4b9c 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id) v->arch.perdomain_ptes = d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT); - v->arch.guest_vtable = __linear_l2_table; - v->arch.shadow_vtable = __shadow_linear_l2_table; -#if defined(__x86_64__) - v->arch.guest_vl3table = __linear_l3_table; - v->arch.guest_vl4table = __linear_l4_table; -#endif - pae_l3_cache_init(&v->arch.pae_l3_cache); return v; @@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d) { l1_pgentry_t gdt_l1e; int vcpuid, pdpt_order; -#ifdef __x86_64__ int i; -#endif pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)); d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order); @@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d) #endif /* __x86_64__ */ - shadow_lock_init(d); - INIT_LIST_HEAD(&d->arch.free_shadow_frames); + shadow2_lock_init(d); + for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ ) + INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]); + INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist); + INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse); + INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows); if ( !is_idle_domain(d) ) { @@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d) void arch_domain_destroy(struct domain *d) { + shadow2_final_teardown(d); + free_xenheap_pages( d->arch.mm_perdomain_pt, get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t))); @@ -328,14 +325,6 @@ int arch_set_info_guest( if ( !hvm_initialize_guest_resources(v) ) return -EINVAL; } - else if ( shadow_mode_refcounts(d) ) - { - if ( !get_page(mfn_to_page(cr3_pfn), d) ) - { - destroy_gdt(v); - return -EINVAL; - } - } else { if ( !get_page_and_type(mfn_to_page(cr3_pfn), d, @@ -344,9 +333,16 @@ int arch_set_info_guest( destroy_gdt(v); return -EINVAL; } - } + } - update_pagetables(v); + /* Shadow2: make sure the domain has enough shadow memory to + * boot another vcpu */ + if ( shadow2_mode_enabled(d) + && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) ) + { + destroy_gdt(v); + return -ENOMEM; + } if ( v->vcpu_id == 0 ) update_domain_wallclock_time(d); @@ -354,6 +350,11 @@ int arch_set_info_guest( /* Don't redo final setup */ set_bit(_VCPUF_initialised, &v->vcpu_flags); + if ( shadow2_mode_enabled(d) ) + shadow2_update_paging_modes(v); + + update_cr3(v); + return 0; } @@ -669,7 +670,6 @@ static void __context_switch(void) loaddebug(&n->arch.guest_context, 6); 
loaddebug(&n->arch.guest_context, 7); } - n->arch.ctxt_switch_to(n); } @@ -927,29 +927,34 @@ void domain_relinquish_resources(struct domain *d) /* Drop the in-use references to page-table bases. */ for_each_vcpu ( d, v ) { - if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 ) + /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling, + * or sh2_update_paging_modes()) */ + pfn = pagetable_get_pfn(v->arch.guest_table); + if ( pfn != 0 ) { - if ( !shadow_mode_refcounts(d) ) - put_page_type(mfn_to_page(pfn)); - put_page(mfn_to_page(pfn)); - + if ( shadow2_mode_refcounts(d) ) + put_page(mfn_to_page(pfn)); + else + put_page_and_type(mfn_to_page(pfn)); v->arch.guest_table = pagetable_null(); } - if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 ) +#ifdef __x86_64__ + /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ + pfn = pagetable_get_pfn(v->arch.guest_table_user); + if ( pfn != 0 ) { - if ( !shadow_mode_refcounts(d) ) - put_page_type(mfn_to_page(pfn)); - put_page(mfn_to_page(pfn)); - + put_page_and_type(mfn_to_page(pfn)); v->arch.guest_table_user = pagetable_null(); } +#endif } if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) ) hvm_relinquish_guest_resources(d); - shadow_mode_disable(d); + /* Tear down shadow mode stuff. */ + shadow2_teardown(d); /* * Relinquish GDT mappings. No need for explicit unmapping of the LDT as @@ -964,26 +969,23 @@ void domain_relinquish_resources(struct domain *d) /* Free page used by xen oprofile buffer */ free_xenoprof_pages(d); - } void arch_dump_domain_info(struct domain *d) { - if ( shadow_mode_enabled(d) ) + if ( shadow2_mode_enabled(d) ) { - printk(" shadow mode: "); - if ( shadow_mode_refcounts(d) ) + printk(" shadow2 mode: "); + if ( d->arch.shadow2_mode & SHM2_enable ) + printk("enabled "); + if ( shadow2_mode_refcounts(d) ) printk("refcounts "); - if ( shadow_mode_write_all(d) ) - printk("write_all "); - if ( shadow_mode_log_dirty(d) ) + if ( shadow2_mode_log_dirty(d) ) printk("log_dirty "); - if ( shadow_mode_translate(d) ) + if ( shadow2_mode_translate(d) ) printk("translate "); - if ( shadow_mode_external(d) ) + if ( shadow2_mode_external(d) ) printk("external "); - if ( shadow_mode_wr_pt_pte(d) ) - printk("wr_pt_pte "); printk("\n"); } } diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index a1d95f77c6..5d270336fc 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -683,8 +683,11 @@ int construct_dom0(struct domain *d, for ( i = 1; i < opt_dom0_max_vcpus; i++ ) (void)alloc_vcpu(d, i, i); - /* Set up monitor table */ - update_pagetables(v); + /* Set up CR3 value for write_ptbase */ + if ( shadow2_mode_enabled(v->domain) ) + shadow2_update_paging_modes(v); + else + update_cr3(v); /* Install the new page tables. 
*/ local_irq_disable(); @@ -796,10 +799,8 @@ int construct_dom0(struct domain *d, new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start); if ( opt_dom0_shadow ) - { - shadow_mode_enable(d, SHM_enable); - update_pagetables(v); - } + if ( shadow2_test_enable(d) == 0 ) + shadow2_update_paging_modes(v); if ( supervisor_mode_kernel ) { diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index fd4b69423b..6ffbf751f9 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -42,10 +43,6 @@ #include #include #include -#include -#if CONFIG_PAGING_LEVELS >= 3 -#include -#endif #include #include #include @@ -61,7 +58,7 @@ struct hvm_function_table hvm_funcs; static void hvm_zap_mmio_range( struct domain *d, unsigned long pfn, unsigned long nr_pfn) { - unsigned long i, val = INVALID_MFN; + unsigned long i; ASSERT(d == current->domain); @@ -70,7 +67,8 @@ static void hvm_zap_mmio_range( if ( pfn + i >= 0xfffff ) break; - __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val)); + if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) ) + guest_remove_page(d, pfn + i); } } @@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d) if ( !hvm_guest(v) || (v->vcpu_id != 0) ) return; +#if 0 /* SHADOW2 does not have this */ if ( shadow_direct_map_init(d) == 0 ) { printk("Can not allocate shadow direct map for HVM domain.\n"); domain_crash_synchronous(); } +#endif hvm_zap_iommu_pages(d); @@ -380,6 +380,8 @@ void hvm_hlt(unsigned long rflags) */ int hvm_copy(void *buf, unsigned long vaddr, int size, int dir) { + struct vcpu *v = current; + unsigned long gfn; unsigned long mfn; char *addr; int count; @@ -389,10 +391,9 @@ int hvm_copy(void *buf, unsigned long vaddr, int size, int dir) if (count > size) count = size; - if (hvm_paging_enabled(current)) - mfn = gva_to_mfn(vaddr); - else - mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT); + gfn = shadow2_gva_to_gfn(v, vaddr); + mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn)); + if (mfn == INVALID_MFN) return 0; @@ -545,7 +546,7 @@ void hvm_do_hypercall(struct cpu_user_regs *pregs) return; } - if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 ) + if ( current->arch.shadow2->guest_levels == 4 ) { pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi, pregs->rsi, diff --git a/xen/arch/x86/hvm/platform.c b/xen/arch/x86/hvm/platform.c index f1bfd4c479..920e7786a0 100644 --- a/xen/arch/x86/hvm/platform.c +++ b/xen/arch/x86/hvm/platform.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,9 +35,6 @@ #include #include #include -#if CONFIG_PAGING_LEVELS >= 3 -#include -#endif #define DECODE_success 1 #define DECODE_failure 0 @@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *regs, unsigned long port, if (pvalid) { if (hvm_paging_enabled(current)) - p->u.pdata = (void *) gva_to_gpa(value); + p->u.data = shadow2_gva_to_gpa(current, value); else p->u.pdata = (void *) value; /* guest VA == guest PA */ } else @@ -774,7 +771,7 @@ void send_mmio_req( if (pvalid) { if (hvm_paging_enabled(v)) - p->u.pdata = (void *) gva_to_gpa(value); + p->u.data = shadow2_gva_to_gpa(v, value); else p->u.pdata = (void *) value; /* guest VA == guest PA */ } else diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index f7ae00937e..c6b3e813d5 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -26,9 +26,10 @@ #include #include #include +#include #include #include -#include +#include #include 
#include #include @@ -43,10 +44,6 @@ #include #include #include -#include -#if CONFIG_PAGING_LEVELS >= 3 -#include -#endif #include #define SVM_EXTRA_DEBUG @@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v) return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE); } -static int svm_instruction_length(struct vcpu *v) +int svm_guest_x86_mode(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode; @@ -423,10 +420,20 @@ static int svm_instruction_length(struct vcpu *v) mode = vmcb->cs.attributes.fields.l ? 8 : 4; else mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4; - return svm_instrlen(guest_cpu_user_regs(), mode); + return mode; } -static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num) +int svm_instruction_length(struct vcpu *v) +{ + return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v)); +} + +void svm_update_host_cr3(struct vcpu *v) +{ + /* SVM doesn't have a HOST_CR3 equivalent to update. */ +} + +unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num) { switch ( num ) { @@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num) return v->arch.hvm_svm.cpu_cr2; case 3: return v->arch.hvm_svm.cpu_cr3; + case 4: + return v->arch.hvm_svm.cpu_shadow_cr4; default: BUG(); } @@ -526,8 +535,6 @@ static void svm_init_hypercall_page(struct domain *d, void *hypercall_page) } - - int svm_dbg_on = 0; static inline int svm_do_debugout(unsigned long exit_code) @@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs( svm_load_cpu_user_regs(v, regs); } +int svm_long_mode_enabled(struct vcpu *v) +{ + return SVM_LONG_GUEST(v); +} + static void arch_svm_do_launch(struct vcpu *v) @@ -726,7 +738,6 @@ static void svm_ctxt_switch_to(struct vcpu *v) static void svm_final_setup_guest(struct vcpu *v) { struct domain *d = v->domain; - struct vcpu *vc; v->arch.schedule_tail = arch_svm_do_launch; v->arch.ctxt_switch_from = svm_ctxt_switch_from; @@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct vcpu *v) if ( v != d->vcpu[0] ) return; - /* Initialize monitor page table */ - for_each_vcpu( d, vc ) - vc->arch.monitor_table = pagetable_null(); + if ( !shadow2_mode_external(d) ) + { + DPRINTK("Can't init HVM for dom %u vcpu %u: " + "not in shadow2 external mode\n", d->domain_id, v->vcpu_id); + domain_crash(d); + } /* * Required to do this once per domain @@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct vcpu *v) */ memset(&d->shared_info->evtchn_mask[0], 0xff, sizeof(d->shared_info->evtchn_mask)); - - /* - * Put the domain in shadow mode even though we're going to be using - * the shared 1:1 page table initially. 
It shouldn't hurt - */ - shadow_mode_enable(d, SHM_enable|SHM_refcounts| - SHM_translate|SHM_external|SHM_wr_pt_pte); } @@ -809,9 +816,13 @@ int start_svm(void) hvm_funcs.realmode = svm_realmode; hvm_funcs.paging_enabled = svm_paging_enabled; + hvm_funcs.long_mode_enabled = svm_long_mode_enabled; + hvm_funcs.guest_x86_mode = svm_guest_x86_mode; hvm_funcs.instruction_length = svm_instruction_length; hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg; + hvm_funcs.update_host_cr3 = svm_update_host_cr3; + hvm_funcs.stts = svm_stts; hvm_funcs.set_tsc_offset = svm_set_tsc_offset; @@ -834,7 +845,6 @@ static void svm_relinquish_guest_resources(struct domain *d) continue; destroy_vmcb(&v->arch.hvm_svm); - free_monitor_pagetable(v); kill_timer(&v->arch.hvm_vcpu.hlt_timer); if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) { @@ -851,8 +861,6 @@ static void svm_relinquish_guest_resources(struct domain *d) if ( d->arch.hvm_domain.buffered_io_va ) unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va); - - shadow_direct_map_clean(d); } @@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned long eip; - unsigned long gpa; /* FIXME: PAE */ int result; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs) va, eip, (unsigned long)regs->error_code); //#endif - if ( !svm_paging_enabled(v) ) - { - if ( shadow_direct_map_fault(va, regs) ) - return 1; - - handle_mmio(va, va); - return 1; - } - - - gpa = gva_to_gpa(va); - - /* Use 1:1 page table to identify MMIO address space */ - if (mmio_space(gpa)) - { - /* No support for APIC */ - if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) - { - int inst_len; - inst_len = svm_instruction_length(v); - if (inst_len == -1) - { - printf("%s: INST_LEN - Unable to decode properly\n", __func__); - domain_crash_synchronous(); - } - - __update_guest_eip(vmcb, inst_len); - - return 1; - } - - handle_mmio(va, gpa); - - return 1; - } - - result = shadow_fault(va, regs); + result = shadow2_fault(va, regs); if( result ) { /* Let's make sure that the Guest TLB is flushed */ @@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb, unsigned long input, clear_bit(X86_FEATURE_APIC, &edx); } -#if CONFIG_PAGING_LEVELS < 3 - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); -#else - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 ) - { - if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); - } +#if CONFIG_PAGING_LEVELS >= 3 + if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) #endif + clear_bit(X86_FEATURE_PAE, &edx); + clear_bit(X86_FEATURE_PSE36, &edx); + /* Clear out reserved bits. 
*/ ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED; edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED; @@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb, unsigned long input, clear_bit(X86_FEATURE_SYSCALL & 31, &edx); #endif -#if CONFIG_PAGING_LEVELS < 3 - clear_bit(X86_FEATURE_NX & 31, &edx); - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); -#else - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 ) - { - if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) - { - clear_bit(X86_FEATURE_NX & 31, &edx); - clear_bit(X86_FEATURE_PAE, &edx); - } - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); - } + +#if CONFIG_PAGING_LEVELS >= 3 + if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) #endif + clear_bit(X86_FEATURE_PAE, &edx); + clear_bit(X86_FEATURE_PSE36, &edx); /* Make SVM feature invisible to the guest. */ clear_bit(X86_FEATURE_SVME & 31, &ecx); @@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long value) unsigned long mfn; int paging_enabled; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + unsigned long old_base_mfn; ASSERT(vmcb); @@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long value) set_bit(SVM_CPU_STATE_LMA_ENABLED, &v->arch.hvm_svm.cpu_state); vmcb->efer |= (EFER_LMA | EFER_LME); - if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } } - else #endif /* __x86_64__ */ - { -#if CONFIG_PAGING_LEVELS >= 3 - /* seems it's a 32-bit or 32-bit PAE guest */ - if ( test_bit(SVM_CPU_STATE_PAE_ENABLED, - &v->arch.hvm_svm.cpu_state) ) - { - /* The guest enables PAE first and then it enables PG, it is - * really a PAE guest */ - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } - else - { - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - } -#endif - } /* Now arch.guest_table points to machine physical. 
*/ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); + shadow2_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table); - - /* arch->shadow_table should hold the next CR3 for shadow */ - HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n", - v->arch.hvm_svm.cpu_cr3, mfn); - - return 1; } if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled ) @@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long value) svm_inject_exception(v, TRAP_gp_fault, 1, 0); return 0; } - - clear_all_shadow_status( v->domain ); + shadow2_update_paging_modes(v); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table); } else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { /* we should take care of this kind of situation */ - clear_all_shadow_status(v->domain); + shadow2_update_paging_modes(v); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table); } return 1; @@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow_sync_all(v->domain); + shadow2_update_cr3(v); } else { @@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) /* * arch.shadow_table should now hold the next CR3 for shadow */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 ) - shadow_sync_all(v->domain); -#endif v->arch.hvm_svm.cpu_cr3 = value; - update_pagetables(v); + update_cr3(v); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); - vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table); } break; } @@ -1839,12 +1755,6 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) #if CONFIG_PAGING_LEVELS >= 3 unsigned long mfn, old_base_mfn; - if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - if ( !VALID_MFN(mfn = get_mfn_from_gpfn( v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) || !get_page(mfn_to_page(mfn), v->domain) ) @@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) domain_crash_synchronous(); /* need to take a clean path */ } - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - if ( old_base_mfn ) - put_page(mfn_to_page(old_base_mfn)); - /* * Now arch.guest_table points to machine physical. 
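The same base-switch sequence recurs in the SVM and VMX CR0/CR3 handlers throughout this patch: take a reference on the new top-level frame, install it in arch.guest_table, release the old base, then let shadow2 recompute the paging mode. A minimal sketch of that shared pattern (the helper name is hypothetical and not part of the patch; the reference on mfn is assumed to have been taken earlier with get_page()):

static void hvm_switch_guest_table(struct vcpu *v, unsigned long mfn)
{
    unsigned long old_base_mfn = pagetable_get_pfn(v->arch.guest_table);

    /* Install the new base before releasing the old one. */
    v->arch.guest_table = pagetable_from_pfn(mfn);
    if ( old_base_mfn )
        put_page(mfn_to_page(old_base_mfn));

    /* Recomputes the shadow mode and v->arch.hvm_vcpu.hw_cr3. */
    shadow2_update_paging_modes(v);
}
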
*/ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); + shadow2_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); - vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; /* * arch->shadow_table should hold the next CR3 for shadow @@ -1876,33 +1785,6 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", v->arch.hvm_svm.cpu_cr3, mfn); -#endif - } - else - { - /* The guest is a 64 bit or 32-bit PAE guest. */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( (v->domain->arch.ops != NULL) && - v->domain->arch.ops->guest_paging_levels == PAGING_L2) - { - /* Seems the guest first enables PAE without enabling PG, - * it must enable PG after that, and it is a 32-bit PAE - * guest */ - - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3)) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } - else - { - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4)) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } #endif } } @@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs) if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) { set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow_sync_all(v->domain); + shadow2_update_paging_modes(v); } break; } @@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs) /* Overkill, we may not this */ set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow_invlpg(v, g_vaddr); + shadow2_invlpg(v, g_vaddr); } @@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned long gva) struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long gpa; - gpa = gva_to_gpa( gva ); + gpa = shadow2_gva_to_gpa(current, gva); printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 ); if( !svm_paging_enabled(v) || mmio_space(gpa) ) return; @@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned long gva) __copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ], sizeof(gpte) ); printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) ); - __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], + + BUG(); // need to think about this, and convert usage of + // phys_to_machine_mapping to use pagetable format... 
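The BUG() above flags the phys_to_machine_mapping read that follows. One plausible direction for the conversion, using only interfaces that already appear elsewhere in this patch (a sketch, not part of the change): translate the guest pfn through get_mfn_from_gpfn() instead of treating the p2m as a flat array of PTEs.

static unsigned long debug_gpfn_to_mfn(unsigned long gpfn)
{
    /* Hypothetical helper for walk_shadow_and_guest_pt(): look the frame
     * up via the p2m interface rather than indexing the array directly. */
    unsigned long mfn = get_mfn_from_gpfn(gpfn);

    if ( !VALID_MFN(mfn) )
        printk("walk: no machine frame for gpfn %lx\n", gpfn);
    return mfn;
}
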
+ __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], sizeof(spte) ); + printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte)); } #endif /* SVM_WALK_GUEST_PAGES */ @@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs) if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) { - if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2))) + if (svm_paging_enabled(v) && + !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2))) { printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, " "gpa=%llx\n", intercepts_counter, @@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs) (unsigned long long) vmcb->exitinfo1, (unsigned long long) vmcb->exitinfo2, (unsigned long long) vmcb->exitintinfo.bytes, - (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) ); + (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2)); } else { @@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs) && ( ( vmcb->exitinfo2 == vmcb->rip ) || vmcb->exitintinfo.bytes) ) { - if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2))) + if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2))) walk_shadow_and_guest_pt( vmcb->exitinfo2 ); } #endif diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c index 349381e3ec..82f7195e73 100644 --- a/xen/arch/x86/hvm/svm/vmcb.c +++ b/xen/arch/x86/hvm/svm/vmcb.c @@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v) printk("%s: phys_table = %lx\n", __func__, pt); } - /* At launch we always use the phys_table */ - vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table); + /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */ + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; if (svm_dbg_on) { diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c index 7ebca89693..9cb27656c3 100644 --- a/xen/arch/x86/hvm/vlapic.c +++ b/xen/arch/x86/hvm/vlapic.c @@ -21,7 +21,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index ebd8a42f68..75de5f49ea 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -34,12 +34,8 @@ #include #include #include -#include #include - -#if CONFIG_PAGING_LEVELS >= 3 -#include -#endif +#include static int vmcs_size; static int vmcs_order; @@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu *v) static void vmx_do_launch(struct vcpu *v) { -/* Update CR3, GDT, LDT, TR */ +/* Update CR3, CR0, CR4, GDT, LDT, TR */ unsigned int error = 0; unsigned long cr0, cr4; @@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v) error |= __vmwrite(GUEST_TR_BASE, 0); error |= __vmwrite(GUEST_TR_LIMIT, 0xff); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table)); - __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table)); + shadow2_update_paging_modes(v); + printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n", + __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); + __vmwrite(HOST_CR3, v->arch.cr3); v->arch.schedule_tail = arch_vmx_do_resume; diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 658ee8ae73..0233f26595 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -26,9 +26,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -40,10 +40,7 @@ #include #include #include -#include 
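The vmx_do_launch() hunk above shows the new division of labour: shadow2_update_paging_modes() leaves the CR3 the guest should run on in v->arch.hvm_vcpu.hw_cr3 and the CR3 Xen itself runs on in v->arch.cr3, and the vendor code merely copies them into the VMCS (or, on SVM, into the VMCB). A condensed sketch of the VMX side (the helper name is illustrative, not in the patch):

static void vmx_refresh_cr3s(struct vcpu *v)
{
    /* Ask shadow2 to (re)compute the paging mode and both CR3 values... */
    shadow2_update_paging_modes(v);

    /* ...then program them: the guest runs on the shadow2-maintained
     * hw_cr3, while Xen runs on v->arch.cr3. */
    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
    __vmwrite(HOST_CR3, v->arch.cr3);
}
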
-#if CONFIG_PAGING_LEVELS >= 3 -#include -#endif +#include #include #include #include @@ -69,11 +66,16 @@ static int vmx_initialize_guest_resources(struct vcpu *v) if ( v->vcpu_id != 0 ) return 1; - for_each_vcpu ( d, vc ) + if ( !shadow2_mode_external(d) ) { - /* Initialize monitor page table */ - vc->arch.monitor_table = pagetable_null(); + DPRINTK("Can't init HVM for dom %u vcpu %u: " + "not in shadow2 external mode\n", + d->domain_id, v->vcpu_id); + domain_crash(d); + } + for_each_vcpu ( d, vc ) + { memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct)); if ( (rc = vmx_create_vmcs(vc)) != 0 ) @@ -107,6 +109,7 @@ static int vmx_initialize_guest_resources(struct vcpu *v) vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a; vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b; + } /* @@ -116,11 +119,6 @@ static int vmx_initialize_guest_resources(struct vcpu *v) memset(&d->shared_info->evtchn_mask[0], 0xff, sizeof(d->shared_info->evtchn_mask)); - /* Put the domain in shadow mode even though we're going to be using - * the shared 1:1 page table initially. It shouldn't hurt */ - shadow_mode_enable( - d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte); - return 1; } @@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resources(struct domain *d) vmx_destroy_vmcs(v); if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) continue; - free_monitor_pagetable(v); kill_timer(&v->arch.hvm_vcpu.hlt_timer); if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) { @@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resources(struct domain *d) if ( d->arch.hvm_domain.buffered_io_va ) unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va); - - shadow_direct_map_clean(d); } #ifdef __x86_64__ @@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs) vmx_vmcs_exit(v); } -static int vmx_realmode(struct vcpu *v) -{ - unsigned long rflags; - - __vmread(GUEST_RFLAGS, &rflags); - return rflags & X86_EFLAGS_VM; -} - static int vmx_instruction_length(struct vcpu *v) { unsigned long inst_len; @@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num) return v->arch.hvm_vmx.cpu_cr2; case 3: return v->arch.hvm_vmx.cpu_cr3; + case 4: + return v->arch.hvm_vmx.cpu_shadow_cr4; default: BUG(); } @@ -753,9 +742,13 @@ static void vmx_setup_hvm_funcs(void) hvm_funcs.realmode = vmx_realmode; hvm_funcs.paging_enabled = vmx_paging_enabled; + hvm_funcs.long_mode_enabled = vmx_long_mode_enabled; + hvm_funcs.guest_x86_mode = vmx_guest_x86_mode; hvm_funcs.instruction_length = vmx_instruction_length; hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg; + hvm_funcs.update_host_cr3 = vmx_update_host_cr3; + hvm_funcs.stts = vmx_stts; hvm_funcs.set_tsc_offset = vmx_set_tsc_offset; @@ -855,53 +848,25 @@ static void inline __update_guest_eip(unsigned long inst_len) __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); } - static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs) { - unsigned long gpa; /* FIXME: PAE */ int result; #if 0 /* keep for debugging */ { - unsigned long eip; + unsigned long eip, cs; + __vmread(GUEST_CS_BASE, &cs); __vmread(GUEST_RIP, &eip); HVM_DBG_LOG(DBG_LEVEL_VMMU, - "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx", - va, eip, (unsigned long)regs->error_code); + "vmx_do_page_fault = 0x%lx, cs_base=%lx, " + "eip = %lx, error_code = %lx\n", + va, cs, eip, (unsigned long)regs->error_code); } #endif - if ( !vmx_paging_enabled(current) ) - { - /* construct 1-to-1 direct mapping */ - if ( 
shadow_direct_map_fault(va, regs) ) - return 1; - - handle_mmio(va, va); - TRACE_VMEXIT (2,2); - return 1; - } - gpa = gva_to_gpa(va); - - /* Use 1:1 page table to identify MMIO address space */ - if ( mmio_space(gpa) ){ - struct vcpu *v = current; - /* No support for APIC */ - if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) { - u32 inst_len; - __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len)); - __update_guest_eip(inst_len); - return 1; - } - TRACE_VMEXIT (2,2); - /* in the case of MMIO, we are more interested in gpa than in va */ - TRACE_VMEXIT (4,gpa); - handle_mmio(va, gpa); - return 1; - } + result = shadow2_fault(va, regs); - result = shadow_fault(va, regs); TRACE_VMEXIT (2,result); #if 0 if ( !result ) @@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs) clear_bit(X86_FEATURE_APIC, &edx); } -#if CONFIG_PAGING_LEVELS < 3 - edx &= ~(bitmaskof(X86_FEATURE_PAE) | - bitmaskof(X86_FEATURE_PSE) | - bitmaskof(X86_FEATURE_PSE36)); -#else - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 ) - { - if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) - clear_bit(X86_FEATURE_PSE36, &edx); - else - { - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); - } - } +#if CONFIG_PAGING_LEVELS >= 3 + if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) #endif + clear_bit(X86_FEATURE_PAE, &edx); + clear_bit(X86_FEATURE_PSE36, &edx); ebx &= NUM_THREADS_RESET_MASK; @@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigned long va) * We do the safest things first, then try to update the shadow * copying from guest */ - shadow_invlpg(v, va); + shadow2_invlpg(v, va); } @@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c) error |= __vmwrite(CR0_READ_SHADOW, c->cr0); - if (!vmx_paging_enabled(v)) { - HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table"); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table)); + if (!vmx_paging_enabled(v)) goto skip_cr3; - } if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) { /* @@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c) domain_crash_synchronous(); return 0; } - shadow_sync_all(v->domain); } else { /* * If different, make a shadow. Check if the PDBR is valid @@ -1348,13 +1297,17 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c) * arch.shadow_table should now hold the next CR3 for shadow */ v->arch.hvm_vmx.cpu_cr3 = c->cr3; - update_pagetables(v); - HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); } skip_cr3: + shadow2_update_paging_modes(v); + if (!vmx_paging_enabled(v)) + HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table"); + else + HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); + error |= __vmread(CR4_READ_SHADOW, &old_cr4); error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK)); error |= __vmwrite(CR4_READ_SHADOW, c->cr4); @@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long value) int paging_enabled; unsigned long vm_entry_value; unsigned long old_cr0; + unsigned long old_base_mfn; /* * CR0: We don't want to lose PE and PG. 
@@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long value) v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) || !get_page(mfn_to_page(mfn), v->domain) ) { - printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3); + printk("Invalid CR3 value = %lx (mfn=%lx)\n", + v->arch.hvm_vmx.cpu_cr3, mfn); domain_crash_synchronous(); /* need to take a clean path */ } @@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long value) __vmread(VM_ENTRY_CONTROLS, &vm_entry_value); vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE; __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value); - - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } } - else -#endif /* __x86_64__ */ - { -#if CONFIG_PAGING_LEVELS >= 3 - /* seems it's a 32-bit or 32-bit PAE guest */ - - if ( test_bit(VMX_CPU_STATE_PAE_ENABLED, - &v->arch.hvm_vmx.cpu_state) ) - { - /* The guest enables PAE first and then it enables PG, it is - * really a PAE guest */ - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } - else - { - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - } #endif - } /* * Now arch.guest_table points to machine physical. */ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if (old_base_mfn) + put_page(mfn_to_page(old_base_mfn)); + shadow2_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); /* * arch->shadow_table should hold the next CR3 for shadow */ @@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long value) } } - clear_all_shadow_status(v->domain); if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) { set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state); __vmread(GUEST_RIP, &eip); @@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long value) } else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { - /* we should take care of this kind of situation */ - clear_all_shadow_status(v->domain); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); + shadow2_update_paging_modes(v); } return 1; @@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow_sync_all(v->domain); + shadow2_update_cr3(v); } else { /* * If different, make a shadow. 
Check if the PDBR is valid @@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) /* * arch.shadow_table should now hold the next CR3 for shadow */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 ) - shadow_sync_all(v->domain); -#endif - v->arch.hvm_vmx.cpu_cr3 = value; - update_pagetables(v); + update_cr3(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); } break; } @@ -1786,12 +1705,6 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) #if CONFIG_PAGING_LEVELS >= 3 unsigned long mfn, old_base_mfn; - if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - if ( !VALID_MFN(mfn = get_mfn_from_gpfn( v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) || !get_page(mfn_to_page(mfn), v->domain) ) @@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) domain_crash_synchronous(); /* need to take a clean path */ } - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - if ( old_base_mfn ) - put_page(mfn_to_page(old_base_mfn)); /* * Now arch.guest_table points to machine physical. */ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); /* * arch->shadow_table should hold the next CR3 for shadow @@ -1822,27 +1734,6 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", v->arch.hvm_vmx.cpu_cr3, mfn); -#endif - } - else - { - /* The guest is a 64 bit or 32-bit PAE guest. */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( (v->domain->arch.ops != NULL) && - v->domain->arch.ops->guest_paging_levels == PAGING_L2) - { - /* Seems the guest first enables PAE without enabling PG, - * it must enable PG after that, and it is a 32-bit PAE - * guest */ - - if ( !shadow_set_guest_paging_levels(v->domain, - PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - /* need to take a clean path */ - domain_crash_synchronous(); - } - } #endif } } @@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs) * all TLB entries except global entries. */ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) - shadow_sync_all(v->domain); - + shadow2_update_paging_modes(v); break; } default: diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 0c35c9b52d..6c0abad2e2 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -137,7 +137,7 @@ static void free_l1_table(struct page_info *page); static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long, unsigned long type); -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn); /* Used to defer flushing of memory structures. */ struct percpu_mm_info { @@ -274,9 +274,9 @@ void share_xen_page_with_privileged_guests( #else /* * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths. 
- * We cannot safely shadow the idle page table, nor shadow-mode page tables + * We cannot safely shadow the idle page table, nor shadow (v1) page tables * (detected by lack of an owning domain). As required for correctness, we - * always shadow PDPTs aboive 4GB. + * always shadow PDPTs above 4GB. */ #define l3tab_needs_shadow(mfn) \ (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \ @@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_address(void) } __initcall(cache_pae_fixmap_address); -static void __write_ptbase(unsigned long mfn) +static DEFINE_PER_CPU(u32, make_cr3_timestamp); + +void make_cr3(struct vcpu *v, unsigned long mfn) +/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if + * necessary, and sets v->arch.cr3 to the value to load in CR3. */ { l3_pgentry_t *highmem_l3tab, *lowmem_l3tab; - struct pae_l3_cache *cache = ¤t->arch.pae_l3_cache; + struct pae_l3_cache *cache = &v->arch.pae_l3_cache; unsigned int cpu = smp_processor_id(); - /* Fast path 1: does this mfn need a shadow at all? */ + /* Fast path: does this mfn need a shadow at all? */ if ( !l3tab_needs_shadow(mfn) ) { - write_cr3(mfn << PAGE_SHIFT); - /* Cache is no longer in use or valid (/after/ write to %cr3). */ + v->arch.cr3 = mfn << PAGE_SHIFT; + /* Cache is no longer in use or valid */ cache->high_mfn = 0; return; } @@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long mfn) /* Caching logic is not interrupt safe. */ ASSERT(!in_irq()); - /* Fast path 2: is this mfn already cached? */ - if ( cache->high_mfn == mfn ) - { - write_cr3(__pa(cache->table[cache->inuse_idx])); - return; - } - /* Protects against pae_flush_pgd(). */ spin_lock(&cache->lock); @@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long mfn) /* Map the guest L3 table and copy to the chosen low-memory cache. */ *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); + /* First check the previous high mapping can't be in the TLB. + * (i.e. have we loaded CR3 since we last did this?) */ + if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) ) + local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu)); highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu); lowmem_l3tab = cache->table[cache->inuse_idx]; memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0])); *(fix_pae_highmem_pl1e - cpu) = l1e_empty(); + this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time); - /* Install the low-memory L3 table in CR3. 
*/ - write_cr3(__pa(lowmem_l3tab)); + v->arch.cr3 = __pa(lowmem_l3tab); spin_unlock(&cache->lock); } #else /* !CONFIG_X86_PAE */ -static void __write_ptbase(unsigned long mfn) +void make_cr3(struct vcpu *v, unsigned long mfn) { - write_cr3(mfn << PAGE_SHIFT); + v->arch.cr3 = mfn << PAGE_SHIFT; } #endif /* !CONFIG_X86_PAE */ void write_ptbase(struct vcpu *v) { - __write_ptbase(pagetable_get_pfn(v->arch.monitor_table)); + write_cr3(v->arch.cr3); } void invalidate_shadow_ldt(struct vcpu *v) @@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off) BUG_ON(unlikely(in_irq())); - shadow_sync_va(v, gva); - TOGGLE_MODE(); __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)], sizeof(l1e)); @@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off) res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - if ( !res && unlikely(shadow_mode_refcounts(d)) ) + if ( !res && unlikely(shadow2_mode_refcounts(d)) ) { - shadow_lock(d); - shadow_remove_all_write_access(d, gmfn, mfn); + shadow2_lock(d); + shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0); res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - shadow_unlock(d); + shadow2_unlock(d); } if ( unlikely(!res) ) @@ -513,7 +512,7 @@ get_linear_pagetable( struct page_info *page; unsigned long pfn; - ASSERT( !shadow_mode_refcounts(d) ); + ASSERT( !shadow2_mode_refcounts(d) ); if ( (root_get_flags(re) & _PAGE_RW) ) { @@ -576,7 +575,8 @@ get_page_from_l1e( if ( !iomem_access_permitted(d, mfn, mfn) ) { - MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn); + MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx", + d->domain_id, mfn); return 0; } @@ -587,9 +587,14 @@ get_page_from_l1e( d = dom_io; } - okay = ((l1e_get_flags(l1e) & _PAGE_RW) ? - get_page_and_type(page, d, PGT_writable_page) : - get_page(page, d)); + /* Foreign mappings into guests in shadow2 external mode don't + * contribute to writeable mapping refcounts. (This allows the + * qemu-dm helper process in dom0 to map the domain's memory without + * messing up the count of "real" writable mappings.) */ + okay = (((l1e_get_flags(l1e) & _PAGE_RW) && + !(unlikely(shadow2_mode_external(d) && (d != current->domain)))) + ? get_page_and_type(page, d, PGT_writable_page) + : get_page(page, d)); if ( !okay ) { MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte @@ -610,8 +615,6 @@ get_page_from_l2e( { int rc; - ASSERT(!shadow_mode_refcounts(d)); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) return 1; @@ -641,8 +644,6 @@ get_page_from_l3e( { int rc; - ASSERT(!shadow_mode_refcounts(d)); - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) return 1; @@ -669,8 +670,6 @@ get_page_from_l4e( { int rc; - ASSERT( !shadow_mode_refcounts(d) ); - if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return 1; @@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) domain_crash(d); } - if ( l1e_get_flags(l1e) & _PAGE_RW ) + /* Remember we didn't take a type-count of foreign writable mappings + * to shadow2 external domains */ + if ( (l1e_get_flags(l1e) & _PAGE_RW) && + !(unlikely((e != d) && shadow2_mode_external(e))) ) { put_page_and_type(page); } @@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_info *page) l1_pgentry_t *pl1e; int i; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); pl1e = map_domain_page(pfn); @@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pgentry_t *pl3e) * 2. Cannot appear in another page table's L3: * a. alloc_l3_table() calls this function and this check will fail * b. 
mod_l3_entry() disallows updates to slot 3 in an existing table + * + * XXX -- this needs revisiting for shadow2_mode_refcount()==true... */ page = l3e_get_page(l3e3); BUG_ON(page->u.inuse.type_info & PGT_pinned); @@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) l2_pgentry_t *pl2e; int i; - /* See the code in shadow_promote() to understand why this is here. */ - if ( (PGT_base_page_table == PGT_l2_page_table) && - unlikely(shadow_mode_refcounts(d)) ) - return 1; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); pl2e = map_domain_page(pfn); @@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_info *page, unsigned long type) l3_pgentry_t *pl3e; int i; - /* See the code in shadow_promote() to understand why this is here. */ - if ( (PGT_base_page_table == PGT_l3_page_table) && - shadow_mode_refcounts(d) ) - return 1; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); #ifdef CONFIG_X86_PAE /* @@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_info *page, unsigned long type) unsigned long vaddr; int i; - /* See the code in shadow_promote() to understand why this is here. */ - if ( (PGT_base_page_table == PGT_l4_page_table) && - shadow_mode_refcounts(d) ) - return 1; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) { @@ -1183,42 +1175,55 @@ static void free_l4_table(struct page_info *page) static inline int update_l1e(l1_pgentry_t *pl1e, l1_pgentry_t ol1e, - l1_pgentry_t nl1e) + l1_pgentry_t nl1e, + unsigned long gl1mfn, + struct vcpu *v) { + int rv = 1; + if ( unlikely(shadow2_mode_enabled(v->domain)) ) + shadow2_lock(v->domain); #ifndef PTE_UPDATE_WITH_CMPXCHG - return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e)); + rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e))); #else - intpte_t o = l1e_get_intpte(ol1e); - intpte_t n = l1e_get_intpte(nl1e); - - for ( ; ; ) { - if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ) + intpte_t o = l1e_get_intpte(ol1e); + intpte_t n = l1e_get_intpte(nl1e); + + for ( ; ; ) { - MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte - ": saw %" PRIpte, - l1e_get_intpte(ol1e), - l1e_get_intpte(nl1e), - o); - return 0; - } + if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ) + { + MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte + ": saw %" PRIpte, + l1e_get_intpte(ol1e), + l1e_get_intpte(nl1e), + o); + rv = 0; + break; + } - if ( o == l1e_get_intpte(ol1e) ) - break; + if ( o == l1e_get_intpte(ol1e) ) + break; - /* Allowed to change in Accessed/Dirty flags only. */ - BUG_ON((o ^ l1e_get_intpte(ol1e)) & - ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); - ol1e = l1e_from_intpte(o); + /* Allowed to change in Accessed/Dirty flags only. */ + BUG_ON((o ^ l1e_get_intpte(ol1e)) & + ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); + ol1e = l1e_from_intpte(o); + } } - - return 1; #endif + if ( unlikely(shadow2_mode_enabled(v->domain)) ) + { + shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e); + shadow2_unlock(v->domain); + } + return rv; } /* Update the L1 entry at pl1e to new value nl1e. 
*/ -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, + unsigned long gl1mfn) { l1_pgentry_t ol1e; struct domain *d = current->domain; @@ -1226,9 +1231,6 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ) return 0; - if ( unlikely(shadow_mode_refcounts(d)) ) - return update_l1e(pl1e, ol1e, nl1e); - if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) ) @@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) } /* Fast path for identical mapping, r/w and presence. */ - if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT)) - return update_l1e(pl1e, ol1e, nl1e); + if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) ) + return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current); if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) return 0; - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) ) { put_page_from_l1e(nl1e, d); return 0; @@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) } else { - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) ) return 0; } @@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) } #ifndef PTE_UPDATE_WITH_CMPXCHG -#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; }) +#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; }) #else -#define UPDATE_ENTRY(_t,_p,_o,_n) ({ \ +#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ \ for ( ; ; ) \ { \ intpte_t __o = cmpxchg((intpte_t *)(_p), \ @@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) } \ 1; }) #endif +#define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \ + int rv; \ + if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ + shadow2_lock(current->domain); \ + rv = _UPDATE_ENTRY(_t, _p, _o, _n); \ + if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ + { \ + shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \ + shadow2_unlock(current->domain); \ + } \ + rv; \ +}) /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ static int mod_l2_entry(l2_pgentry_t *pl2e, @@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, /* Fast path for identical mapping and presence. */ if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e); + return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn); if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) || unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) ) return 0; - if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) + if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) ) { put_page_from_l2e(nl2e, pfn); return 0; } } - else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) + else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) ) { return 0; } @@ -1330,7 +1344,6 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, return 1; } - #if CONFIG_PAGING_LEVELS >= 3 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. 
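The reworked update_l1e() and the UPDATE_ENTRY() macro above establish the invariant the rest of this file relies on: when the domain is in a shadow2 mode, the guest PTE write and shadow2_validate_guest_entry() happen inside one shadow2-locked section, keyed by the MFN of the guest page table being written. Stripped of the cmpxchg variant, the pattern is as follows (a sketch; the helper name does not exist in the patch):

static int set_guest_l1e(struct vcpu *v, l1_pgentry_t *pl1e,
                         l1_pgentry_t nl1e, unsigned long gl1mfn)
{
    int rv;

    if ( unlikely(shadow2_mode_enabled(v->domain)) )
        shadow2_lock(v->domain);

    /* Write the guest's own PTE... */
    rv = !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));

    if ( unlikely(shadow2_mode_enabled(v->domain)) )
    {
        /* ...and resync the shadow of gl1mfn before dropping the lock. */
        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
        shadow2_unlock(v->domain);
    }
    return rv;
}
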
*/ @@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, */ if ( pgentry_ptr_to_slot(pl3e) >= 3 ) return 0; -#endif +#endif if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) ) return 0; @@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, /* Fast path for identical mapping and presence. */ if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e); + return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn); #if CONFIG_PAGING_LEVELS >= 4 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) || @@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, << L3_PAGETABLE_SHIFT; if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) ) return 0; -#endif +#endif - if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) ) + if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) ) { put_page_from_l3e(nl3e, pfn); return 0; } } - else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) ) + else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) ) { return 0; } @@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, /* Fast path for identical mapping and presence. */ if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e); + return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn); if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) || unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) ) return 0; - if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) ) + if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) ) { put_page_from_l4e(nl4e, pfn); return 0; } } - else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) ) + else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) ) { return 0; } @@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *page, unsigned long type) */ this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS; - if ( unlikely(shadow_mode_enabled(owner)) ) + if ( unlikely(shadow2_mode_enabled(owner) + && !shadow2_lock_is_acquired(owner)) ) { /* Raw page tables are rewritten during save/restore. */ - if ( !shadow_mode_translate(owner) ) + if ( !shadow2_mode_translate(owner) ) mark_dirty(owner, page_to_mfn(page)); - if ( shadow_mode_refcounts(owner) ) + if ( shadow2_mode_refcounts(owner) ) return; gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); ASSERT(VALID_M2P(gmfn)); - remove_shadow(owner, gmfn, type & PGT_type_mask); + shadow2_lock(owner); + shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn)); + shadow2_unlock(owner); } } @@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *page) if ( unlikely((nx & PGT_count_mask) == 0) ) { - /* Record TLB information for flush later. Races are harmless. */ - page->tlbflush_timestamp = tlbflush_current_time(); - if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && likely(nx & PGT_validated) ) { @@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *page) x &= ~PGT_validated; nx &= ~PGT_validated; } + + /* Record TLB information for flush later. */ + page->tlbflush_timestamp = tlbflush_current_time(); } else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) == (PGT_pinned|PGT_l1_page_table|1)) ) @@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page, unsigned long type) #endif /* Fixme: add code to propagate va_unknown to subtables. 
*/ if ( ((type & PGT_type_mask) >= PGT_l2_page_table) && - !shadow_mode_refcounts(page_get_owner(page)) ) + !shadow2_mode_refcounts(page_get_owner(page)) ) return 0; /* This table is possibly mapped at multiple locations. */ nx &= ~PGT_va_mask; @@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn) int okay; unsigned long old_base_mfn; - if ( shadow_mode_refcounts(d) ) + if ( hvm_guest(v) && !hvm_paging_enabled(v) ) + domain_crash_synchronous(); + + if ( shadow2_mode_refcounts(d) ) { okay = get_page_from_pagenr(mfn, d); if ( unlikely(!okay) ) @@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn) MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn); old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_null(); - update_pagetables(v); + update_cr3(v); write_cr3(__pa(idle_pg_table)); if ( old_base_mfn != 0 ) put_page_and_type(mfn_to_page(old_base_mfn)); @@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn) invalidate_shadow_ldt(v); old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); /* update shadow_table and monitor_table */ + update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */ write_ptbase(v); if ( likely(old_base_mfn != 0) ) { - if ( shadow_mode_refcounts(d) ) + if ( shadow2_mode_refcounts(d) ) put_page(mfn_to_page(old_base_mfn)); else put_page_and_type(mfn_to_page(old_base_mfn)); } - /* CR3 also holds a ref to its shadow... */ - if ( shadow_mode_enabled(d) ) - { - if ( v->arch.monitor_shadow_ref ) - put_shadow_ref(v->arch.monitor_shadow_ref); - v->arch.monitor_shadow_ref = - pagetable_get_pfn(v->arch.monitor_table); - ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref))); - get_shadow_ref(v->arch.monitor_shadow_ref); - } - return 1; } @@ -1807,8 +1816,6 @@ static void process_deferred_ops(void) if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) ) { - if ( shadow_mode_enabled(d) ) - shadow_sync_all(d); if ( deferred_ops & DOP_FLUSH_ALL_TLBS ) flush_tlb_mask(d->domain_dirty_cpumask); else @@ -1974,7 +1981,7 @@ int do_mmuext_op( type = PGT_root_page_table; pin_page: - if ( shadow_mode_refcounts(FOREIGNDOM) ) + if ( shadow2_mode_refcounts(FOREIGNDOM) ) break; okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); @@ -1996,7 +2003,7 @@ int do_mmuext_op( break; case MMUEXT_UNPIN_TABLE: - if ( shadow_mode_refcounts(d) ) + if ( shadow2_mode_refcounts(d) ) break; if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) ) @@ -2009,6 +2016,12 @@ int do_mmuext_op( { put_page_and_type(page); put_page(page); + if ( shadow2_mode_enabled(d) ) + { + shadow2_lock(d); + shadow2_remove_all_shadows(v, _mfn(mfn)); + shadow2_unlock(d); + } } else { @@ -2050,9 +2063,9 @@ int do_mmuext_op( break; case MMUEXT_INVLPG_LOCAL: - if ( shadow_mode_enabled(d) ) - shadow_invlpg(v, op.arg1.linear_addr); - local_flush_tlb_one(op.arg1.linear_addr); + if ( !shadow2_mode_enabled(d) + || shadow2_invlpg(v, op.arg1.linear_addr) != 0 ) + local_flush_tlb_one(op.arg1.linear_addr); break; case MMUEXT_TLB_FLUSH_MULTI: @@ -2098,7 +2111,7 @@ int do_mmuext_op( unsigned long ptr = op.arg1.linear_addr; unsigned long ents = op.arg2.nr_ents; - if ( shadow_mode_external(d) ) + if ( shadow2_mode_external(d) ) { MEM_LOG("ignoring SET_LDT hypercall from external " "domain %u", d->domain_id); @@ -2171,9 +2184,6 @@ int do_mmu_update( LOCK_BIGLOCK(d); - if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(v, "pre-mmu"); /* debug */ - if ( unlikely(count & 
MMU_UPDATE_PREEMPTED) ) { count &= ~MMU_UPDATE_PREEMPTED; @@ -2248,7 +2258,12 @@ int do_mmu_update( case PGT_l3_page_table: case PGT_l4_page_table: { - ASSERT(!shadow_mode_refcounts(d)); + if ( shadow2_mode_refcounts(d) ) + { + DPRINTK("mmu update on shadow-refcounted domain!"); + break; + } + if ( unlikely(!get_page_type( page, type_info & (PGT_type_mask|PGT_va_mask))) ) goto not_a_pt; @@ -2258,10 +2273,7 @@ int do_mmu_update( case PGT_l1_page_table: { l1_pgentry_t l1e = l1e_from_intpte(req.val); - okay = mod_l1_entry(va, l1e); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l1_normal_pt_update( - d, req.ptr, l1e, &sh_mapcache); + okay = mod_l1_entry(va, l1e, mfn); } break; case PGT_l2_page_table: @@ -2269,9 +2281,6 @@ int do_mmu_update( l2_pgentry_t l2e = l2e_from_intpte(req.val); okay = mod_l2_entry( (l2_pgentry_t *)va, l2e, mfn, type_info); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l2_normal_pt_update( - d, req.ptr, l2e, &sh_mapcache); } break; #if CONFIG_PAGING_LEVELS >= 3 @@ -2279,9 +2288,6 @@ int do_mmu_update( { l3_pgentry_t l3e = l3e_from_intpte(req.val); okay = mod_l3_entry(va, l3e, mfn, type_info); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l3_normal_pt_update( - d, req.ptr, l3e, &sh_mapcache); } break; #endif @@ -2290,9 +2296,6 @@ int do_mmu_update( { l4_pgentry_t l4e = l4e_from_intpte(req.val); okay = mod_l4_entry(va, l4e, mfn, type_info); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l4_normal_pt_update( - d, req.ptr, l4e, &sh_mapcache); } break; #endif @@ -2308,19 +2311,17 @@ int do_mmu_update( if ( unlikely(!get_page_type(page, PGT_writable_page)) ) break; - if ( shadow_mode_enabled(d) ) - { - shadow_lock(d); - __mark_dirty(d, mfn); - if ( page_is_page_table(page) && !page_out_of_sync(page) ) - shadow_mark_mfn_out_of_sync(v, gmfn, mfn); - } + if ( unlikely(shadow2_mode_enabled(d)) ) + shadow2_lock(d); *(intpte_t *)va = req.val; okay = 1; - if ( shadow_mode_enabled(d) ) - shadow_unlock(d); + if ( unlikely(shadow2_mode_enabled(d)) ) + { + shadow2_validate_guest_entry(v, _mfn(mfn), va); + shadow2_unlock(d); + } put_page_type(page); } @@ -2334,12 +2335,6 @@ int do_mmu_update( case MMU_MACHPHYS_UPDATE: - if ( shadow_mode_translate(FOREIGNDOM) ) - { - MEM_LOG("can't mutate m2p table of translate mode guest"); - break; - } - mfn = req.ptr >> PAGE_SHIFT; gpfn = req.val; @@ -2349,9 +2344,13 @@ int do_mmu_update( break; } - set_gpfn_from_mfn(mfn, gpfn); + if ( shadow2_mode_translate(FOREIGNDOM) ) + shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn); + else + set_gpfn_from_mfn(mfn, gpfn); okay = 1; + // Mark the new gfn dirty... 
mark_dirty(FOREIGNDOM, mfn); put_page(mfn_to_page(mfn)); @@ -2382,9 +2381,6 @@ int do_mmu_update( if ( unlikely(!guest_handle_is_null(pdone)) ) copy_to_guest(pdone, &done, 1); - if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(v, "post-mmu"); /* debug */ - UNLOCK_BIGLOCK(d); return rc; } @@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping( struct domain *d = v->domain; ASSERT(spin_is_locked(&d->big_lock)); - ASSERT(!shadow_mode_refcounts(d)); gmfn = pte_addr >> PAGE_SHIFT; mfn = gmfn_to_mfn(d, gmfn); @@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping( page = mfn_to_page(mfn); type_info = page->u.inuse.type_info; - if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) || + if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) || !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) ) { MEM_LOG("Grant map attempted to update a non-L1 page"); @@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping( } ol1e = *(l1_pgentry_t *)va; - if ( !update_l1e(va, ol1e, _nl1e) ) + if ( !update_l1e(va, ol1e, _nl1e, mfn, v) ) { put_page_type(page); rc = GNTST_general_error; goto failed; } - put_page_from_l1e(ol1e, d); - - if ( unlikely(shadow_mode_enabled(d)) ) - { - struct domain_mmap_cache sh_mapcache; - domain_mmap_cache_init(&sh_mapcache); - shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache); - domain_mmap_cache_destroy(&sh_mapcache); - } + if ( !shadow2_mode_refcounts(d) ) + put_page_from_l1e(ol1e, d); put_page_type(page); failed: unmap_domain_page(va); put_page(page); + return rc; } @@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping( u32 type_info; l1_pgentry_t ol1e; - ASSERT(!shadow_mode_refcounts(d)); - gmfn = addr >> PAGE_SHIFT; mfn = gmfn_to_mfn(d, gmfn); @@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping( } /* Delete pagetable entry. */ - if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) ) + if ( unlikely(!update_l1e( + (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, + d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) ) { MEM_LOG("Cannot delete PTE entry at %p", va); put_page_type(page); @@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping( goto failed; } - if ( unlikely(shadow_mode_enabled(d)) ) - { - struct domain_mmap_cache sh_mapcache; - domain_mmap_cache_init(&sh_mapcache); - shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache); - domain_mmap_cache_destroy(&sh_mapcache); - } - put_page_type(page); failed: @@ -2536,31 +2517,22 @@ static int create_grant_va_mapping( struct domain *d = v->domain; ASSERT(spin_is_locked(&d->big_lock)); - ASSERT(!shadow_mode_refcounts(d)); - - /* - * This is actually overkill - we don't need to sync the L1 itself, - * just everything involved in getting to this L1 (i.e. we need - * linear_pg_table[l1_linear_offset(va)] to be in sync)... 
- */ - __shadow_sync_va(v, va); pl1e = &linear_pg_table[l1_linear_offset(va)]; if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) || - !update_l1e(pl1e, ol1e, _nl1e) ) + !update_l1e(pl1e, ol1e, _nl1e, + l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) ) return GNTST_general_error; - put_page_from_l1e(ol1e, d); - - if ( unlikely(shadow_mode_enabled(d)) ) - shadow_do_update_va_mapping(va, _nl1e, v); + if ( !shadow2_mode_refcounts(d) ) + put_page_from_l1e(ol1e, d); return GNTST_okay; } static int destroy_grant_va_mapping( - unsigned long addr, unsigned long frame) + unsigned long addr, unsigned long frame, struct domain *d) { l1_pgentry_t *pl1e, ol1e; @@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping( } /* Delete pagetable entry. */ - if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) ) + if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), + l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]), + d->vcpu[0] /* Change for per-vcpu shadows */)) ) { MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); return GNTST_general_error; } - + return 0; } @@ -2597,7 +2571,7 @@ int create_grant_host_mapping( unsigned long addr, unsigned long frame, unsigned int flags) { l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS); - + if ( (flags & GNTMAP_application_map) ) l1e_add_flags(pte,_PAGE_USER); if ( !(flags & GNTMAP_readonly) ) @@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping( { if ( flags & GNTMAP_contains_pte ) return destroy_grant_pte_mapping(addr, frame, current->domain); - return destroy_grant_va_mapping(addr, frame); + return destroy_grant_va_mapping(addr, frame, current->domain); } int steal_page( @@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long va, u64 val64, perfc_incrc(calls_to_update_va); - if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) ) + if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) ) return -EINVAL; - LOCK_BIGLOCK(d); - - if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(v, "pre-va"); /* debug */ + if ( unlikely(shadow2_mode_refcounts(d)) ) + { + DPRINTK("Grant op on a shadow-refcounted domain\n"); + return -EINVAL; + } - if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], - val)) ) - rc = -EINVAL; + LOCK_BIGLOCK(d); - if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) ) + if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) ) { if ( unlikely(this_cpu(percpu_mm_info).foreign && - (shadow_mode_translate(d) || - shadow_mode_translate( + (shadow2_mode_translate(d) || + shadow2_mode_translate( this_cpu(percpu_mm_info).foreign))) ) { /* * The foreign domain's pfn's are in a different namespace. There's - * not enough information in just a gpte to figure out how to + * not enough information in just a gpte to figure out how to * (re-)shadow this entry. 
*/ domain_crash(d); } - - rc = shadow_do_update_va_mapping(va, val, v); - - check_pagetable(v, "post-va"); /* debug */ } + if ( unlikely(!mod_l1_entry( + &linear_pg_table[l1_linear_offset(va)], val, + l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) ) + rc = -EINVAL; + switch ( flags & UVMF_FLUSHTYPE_MASK ) { case UVMF_TLB_FLUSH: switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: - if ( unlikely(shadow_mode_enabled(d)) ) - shadow_sync_all(d); local_flush_tlb(); break; case UVMF_ALL: @@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long va, u64 val64, switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: - if ( unlikely(shadow_mode_enabled(d)) ) - shadow_invlpg(current, va); - local_flush_tlb_one(va); + if ( !shadow2_mode_enabled(d) + || (shadow2_invlpg(current, va) != 0) ) + local_flush_tlb_one(va); break; case UVMF_ALL: flush_tlb_one_mask(d->domain_dirty_cpumask, va); @@ -2808,8 +2780,6 @@ long set_gdt(struct vcpu *v, if ( entries > FIRST_RESERVED_GDT_ENTRY ) return -EINVAL; - shadow_sync_all(d); - /* Check the pages in the new GDT. */ for ( i = 0; i < nr_pages; i++ ) { mfn = frames[i] = gmfn_to_mfn(d, frames[i]); @@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 desc) break; } - if ( shadow_mode_enabled(dom) ) - { - shadow_lock(dom); - - __mark_dirty(dom, mfn); - - if ( page_is_page_table(page) && !page_out_of_sync(page) ) - shadow_mark_mfn_out_of_sync(current, gmfn, mfn); - } + mark_dirty(dom, mfn); /* All is good so make the update. */ gdt_pent = map_domain_page(mfn); memcpy(&gdt_pent[offset], &d, 8); unmap_domain_page(gdt_pent); - if ( shadow_mode_enabled(dom) ) - shadow_unlock(dom); - put_page_type(page); ret = 0; /* success */ @@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) default: break; } - - if ( !shadow_mode_translate(d) || (mfn == 0) ) + + if ( !shadow2_mode_translate(d) || (mfn == 0) ) { put_domain(d); return -EINVAL; @@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) guest_physmap_add_page(d, xatp.gpfn, mfn); UNLOCK_BIGLOCK(d); - + put_domain(d); break; @@ -3136,7 +3095,8 @@ static int ptwr_emulated_update( unsigned long pfn; struct page_info *page; l1_pgentry_t pte, ol1e, nl1e, *pl1e; - struct domain *d = current->domain; + struct vcpu *v = current; + struct domain *d = v->domain; /* Aligned access only, thank you. */ if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) ) @@ -3196,25 +3156,36 @@ static int ptwr_emulated_update( return X86EMUL_UNHANDLEABLE; } + /* Checked successfully: do the update (write or cmpxchg). */ pl1e = map_domain_page(page_to_mfn(page)); pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK)); if ( do_cmpxchg ) { + if ( shadow2_mode_enabled(d) ) + shadow2_lock(d); ol1e = l1e_from_intpte(old); if ( cmpxchg((intpte_t *)pl1e, old, val) != old ) { + if ( shadow2_mode_enabled(d) ) + shadow2_unlock(d); unmap_domain_page(pl1e); put_page_from_l1e(nl1e, d); return X86EMUL_CMPXCHG_FAILED; } + if ( unlikely(shadow2_mode_enabled(v->domain)) ) + { + shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e); + shadow2_unlock(v->domain); + } } else { ol1e = *pl1e; - if ( !update_l1e(pl1e, ol1e, nl1e) ) + if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) ) BUG(); } + unmap_domain_page(pl1e); /* Finally, drop the old PTE. 
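Both the MMUEXT_INVLPG_LOCAL and UVMF_INVLPG paths above now follow the same contract: shadow2_invlpg() returns nonzero when the caller still has to flush the hardware TLB entry itself, and zero when the shadow code has already made that flush unnecessary. A sketch of the shared shape (helper name illustrative only):

static void flush_guest_va(struct vcpu *v, unsigned long va)
{
    /* Give shadow2 first crack at invalidating its shadows for va; only
     * touch the hardware TLB if it reports a flush is still needed. */
    if ( !shadow2_mode_enabled(v->domain) || shadow2_invlpg(v, va) != 0 )
        local_flush_tlb_one(va);
}
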
*/ diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index ff0589082a..01782320b3 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t *mbi) if ( opt_watchdog ) watchdog_enable(); - shadow_mode_init(); - /* initialize access control security module */ acm_init(&initrdidx, mbi, initial_images_start); diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c deleted file mode 100644 index 88e2ec8417..0000000000 --- a/xen/arch/x86/shadow.c +++ /dev/null @@ -1,4150 +0,0 @@ -/****************************************************************************** - * arch/x86/shadow.c - * - * Copyright (c) 2005 Michael A Fetterman - * Based on an earlier implementation by Ian Pratt et al - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -/* - * Jun Nakajima - * Chengyuan Li - * - * Extended to support 32-bit PAE and 64-bit guests. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Use this to have the compiler remove unnecessary branches */ -#define SH_L1_HAS_NEXT_PAGE (GUEST_L1_PAGETABLE_ENTRIES - L1_PAGETABLE_ENTRIES) - -extern void free_shadow_pages(struct domain *d); - -#if 0 // this code has not been updated for 32pae & 64 bit modes -#if SHADOW_DEBUG -static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn); -#endif -#endif - -#if CONFIG_PAGING_LEVELS == 3 -static unsigned long shadow_l3_table( - struct vcpu *v, unsigned long gpfn, unsigned long gmfn); -#endif - -#if CONFIG_PAGING_LEVELS == 4 -static unsigned long shadow_l4_table( - struct vcpu *v, unsigned long gpfn, unsigned long gmfn); -#endif - -#if CONFIG_PAGING_LEVELS >= 3 -static void shadow_map_into_current(struct vcpu *v, - unsigned long va, unsigned int from, unsigned int to); -static inline void validate_bl2e_change( struct domain *d, - guest_root_pgentry_t *new_gle_p, pgentry_64_t *shadow_l3, int index); -static void update_top_level_shadow(struct vcpu *v, unsigned long smfn); -#endif - -/******** - -There's a per-domain shadow table spin lock which works fine for SMP -hosts. We don't have to worry about interrupts as no shadow operations -happen in an interrupt context. It's probably not quite ready for SMP -guest operation as we have to worry about synchonisation between gpte -and spte updates. Its possible that this might only happen in a -hypercall context, in which case we'll probably at have a per-domain -hypercall lock anyhow (at least initially). - -********/ - -static inline int -shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, - unsigned long new_type) -{ - struct page_info *page = mfn_to_page(gmfn); - int pinned = 0, okay = 1; - - if ( page_out_of_sync(page) ) - { - // Don't know how long ago this snapshot was taken. - // Can't trust it to be recent enough. 
- // - __shadow_sync_mfn(d, gmfn); - } - - if ( !shadow_mode_refcounts(d) ) - return 1; - - if ( unlikely(page_is_page_table(page)) ) - return 1; - - FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type); - - if ( !shadow_remove_all_write_access(d, gpfn, gmfn) ) - { - FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx", - __func__, gpfn, gmfn); -#if 1 || defined(LIVE_DANGEROUSLY) - set_bit(_PGC_page_table, &page->count_info); - return 1; -#endif - return 0; - } - - // To convert this page to use as a page table, the writable count - // should now be zero. Test this by grabbing the page as an page table, - // and then immediately releasing. This will also deal with any - // necessary TLB flushing issues for us. - // - // The cruft here about pinning doesn't really work right. This - // needs rethinking/rewriting... Need to gracefully deal with the - // TLB flushes required when promoting a writable page, and also deal - // with any outstanding (external) writable refs to this page (by - // refusing to promote it). The pinning headache complicates this - // code -- it would all get much simpler if we stop using - // shadow_lock() and move the shadow code to BIGLOCK(). - // - if ( unlikely(!get_page(page, d)) ) - BUG(); // XXX -- needs more thought for a graceful failure - if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) ) - { - pinned = 1; - put_page_and_type(page); - } - if ( get_page_type(page, PGT_base_page_table) ) - { - set_bit(_PGC_page_table, &page->count_info); - put_page_type(page); - } - else - { - printk("shadow_promote: get_page_type failed " - "dom%d gpfn=%lx gmfn=%lx t=%08lx\n", - d->domain_id, gpfn, gmfn, new_type); - okay = 0; - } - - // Now put the type back to writable... - if ( unlikely(!get_page_type(page, PGT_writable_page)) ) - BUG(); // XXX -- needs more thought for a graceful failure - if ( unlikely(pinned) ) - { - if ( unlikely(test_and_set_bit(_PGT_pinned, - &page->u.inuse.type_info)) ) - BUG(); // hmm... someone pinned this again? - } - else - put_page_and_type(page); - - return okay; -} - - -/* - * Things in shadow mode that collect get_page() refs to the domain's - * pages are: - * - PGC_allocated takes a gen count, just like normal. - * - A writable page can be pinned (paravirtualized guests may consider - * these pages to be L1s or L2s, and don't know the difference). - * Pinning a page takes a gen count (but, for domains in shadow mode, - * it *doesn't* take a type count) - * - CR3 grabs a ref to whatever it points at, just like normal. - * - Shadow mode grabs an initial gen count for itself, as a placehold - * for whatever references will exist. - * - Shadow PTEs that point to a page take a gen count, just like regular - * PTEs. However, they don't get a type count, as get_page_type() is - * hardwired to keep writable pages' counts at 1 for domains in shadow - * mode. - * - Whenever we shadow a page, the entry in the shadow hash grabs a - * general ref to the page. - * - Whenever a page goes out of sync, the out of sync entry grabs a - * general ref to the page. - */ -/* - * page_info fields for pages allocated as shadow pages: - * - * All 32 bits of count_info are a simple count of refs to this shadow - * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table), - * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync - * references. - * - * u.inuse._domain is left NULL, to prevent accidently allow some random - * domain from gaining permissions to map this page. 
- * - * u.inuse.type_info & PGT_type_mask remembers what kind of page is being - * shadowed. - * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed. - * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow - * is currently exists because this is a shadow of a root page, and we - * don't want to let those disappear just because no CR3 is currently pointing - * at it. - * - * tlbflush_timestamp holds a min & max index of valid page table entries - * within the shadow page. - */ -static inline void -shadow_page_info_init(struct page_info *page, - unsigned long gmfn, - u32 psh_type) -{ - ASSERT( (gmfn & ~PGT_mfn_mask) == 0 ); - page->u.inuse.type_info = psh_type | gmfn; - page->count_info = 0; - page->tlbflush_timestamp = 0; -} - -static inline unsigned long -alloc_shadow_page(struct domain *d, - unsigned long gpfn, unsigned long gmfn, - u32 psh_type) -{ - struct page_info *page; - unsigned long smfn, real_gpfn; - int pin = 0; - void *l1, *lp; - u64 index = 0; - - // Currently, we only keep pre-zero'ed pages around for use as L1's... - // This will change. Soon. - // - if ( psh_type == PGT_l1_shadow ) - { - if ( !list_empty(&d->arch.free_shadow_frames) ) - { - struct list_head *entry = d->arch.free_shadow_frames.next; - page = list_entry(entry, struct page_info, list); - list_del(entry); - perfc_decr(free_l1_pages); - } - else - { - if ( SH_L1_HAS_NEXT_PAGE && - d->arch.ops->guest_paging_levels == PAGING_L2) - { -#if CONFIG_PAGING_LEVELS >= 3 - /* - * For 32-bit HVM guest, 2 shadow L1s are required to - * simulate 1 guest L1 So need allocate 2 shadow L1 - * pages each time. - * - * --> Need to avoidalloc_domheap_pages. - */ - page = alloc_domheap_pages(NULL, SL1_ORDER, 0); - if (!page) - goto no_shadow_page; - - l1 = map_domain_page(page_to_mfn(page)); - memset(l1, 0, PAGE_SIZE); - unmap_domain_page(l1); - - l1 = map_domain_page(page_to_mfn(page + 1)); - memset(l1, 0, PAGE_SIZE); - unmap_domain_page(l1); - - /* we'd like to initialize the second continuous page here - * and leave the first page initialization later */ - - shadow_page_info_init(page+1, gmfn, psh_type); -#else - page = alloc_domheap_page(NULL); - if (!page) - goto no_shadow_page; - - l1 = map_domain_page(page_to_mfn(page)); - memset(l1, 0, PAGE_SIZE); - unmap_domain_page(l1); -#endif - } - else - { - page = alloc_domheap_page(NULL); - if (!page) - goto no_shadow_page; - - l1 = map_domain_page(page_to_mfn(page)); - memset(l1, 0, PAGE_SIZE); - unmap_domain_page(l1); - } - } - } - else { -#if CONFIG_PAGING_LEVELS == 2 - page = alloc_domheap_page(NULL); -#elif CONFIG_PAGING_LEVELS >= 3 - if ( d->arch.ops->guest_paging_levels == PAGING_L2 && - psh_type == PGT_l4_shadow ) /* allocated for PAE PDP page */ - page = alloc_domheap_pages(NULL, 0, MEMF_dma); - else if ( d->arch.ops->guest_paging_levels == PAGING_L3 && - (psh_type == PGT_l3_shadow || psh_type == PGT_l4_shadow) ) - page = alloc_domheap_pages(NULL, 0, MEMF_dma); /* allocated for PAE PDP page */ - else - page = alloc_domheap_page(NULL); -#endif - if (!page) - goto no_shadow_page; - - lp = map_domain_page(page_to_mfn(page)); - memset(lp, 0, PAGE_SIZE); - unmap_domain_page(lp); - } - - smfn = page_to_mfn(page); - - shadow_page_info_init(page, gmfn, psh_type); - - switch ( psh_type ) - { - case PGT_l1_shadow: - if ( !shadow_promote(d, gpfn, gmfn, psh_type) ) - goto fail; - perfc_incr(shadow_l1_pages); - d->arch.shadow_page_count++; - break; - - case PGT_l2_shadow: - if ( !shadow_promote(d, gpfn, gmfn, psh_type) ) - goto fail; - 
perfc_incr(shadow_l2_pages); - d->arch.shadow_page_count++; - if ( PGT_l2_page_table == PGT_root_page_table ) - pin = 1; - - break; - - case PGT_l3_shadow: - if ( !shadow_promote(d, gpfn, gmfn, psh_type) ) - goto fail; - perfc_incr(shadow_l3_pages); - d->arch.shadow_page_count++; - if ( PGT_l3_page_table == PGT_root_page_table ) - pin = 1; - break; - - case PGT_l4_shadow: - real_gpfn = gpfn & PGT_mfn_mask; - if ( !shadow_promote(d, real_gpfn, gmfn, psh_type) ) - goto fail; - perfc_incr(shadow_l4_pages); - d->arch.shadow_page_count++; - if ( PGT_l4_page_table == PGT_root_page_table ) - pin = 1; -#if CONFIG_PAGING_LEVELS == 3 & defined (GUEST_PGENTRY_32) - /* - * We use PGT_l4_shadow for 2-level paging guests on PAE - */ - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - pin = 1; -#endif - -#if CONFIG_PAGING_LEVELS == 3 & defined ( GUEST_32PAE ) - /* - * We use PGT_l4_shadow for 2-level paging guests on PAE - */ - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - pin = 1; -#endif - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - index = get_cr3_idxval(current); - break; - -#if CONFIG_PAGING_LEVELS >= 3 - case PGT_fl1_shadow: - perfc_incr(shadow_l1_pages); - d->arch.shadow_page_count++; - break; -#else - - case PGT_hl2_shadow: - // Treat an hl2 as an L1 for purposes of promotion. - // For external mode domains, treat them as an L2 for purposes of - // pinning. - // - if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) ) - goto fail; - perfc_incr(hl2_table_pages); - d->arch.hl2_page_count++; - if ( shadow_mode_external(d) && - (PGT_l2_page_table == PGT_root_page_table) ) - pin = 1; - - break; -#endif - case PGT_snapshot: - perfc_incr(snapshot_pages); - d->arch.snapshot_page_count++; - break; - - default: - printk("Alloc shadow weird page type type=%08x\n", psh_type); - BUG(); - break; - } - - // Don't add a new shadow of something that already has a snapshot. - // - ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) ); - - set_shadow_status(d, gpfn, gmfn, smfn, psh_type, index); - - if ( pin ) - shadow_pin(smfn); - - return smfn; - -fail: - FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?", - gpfn, gmfn); - if (psh_type == PGT_l1_shadow) - { - if (d->arch.ops->guest_paging_levels == PAGING_L2) - { -#if CONFIG_PAGING_LEVELS >=3 - free_domheap_pages(page, SL1_ORDER); -#else - free_domheap_page(page); -#endif - } - else - free_domheap_page(page); - } - else - free_domheap_page(page); - - return 0; - -no_shadow_page: - ASSERT(page == NULL); - printk("Couldn't alloc shadow page! dom%d count=%d\n", - d->domain_id, d->arch.shadow_page_count); - printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n", - perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages), - perfc_value(hl2_table_pages), - perfc_value(snapshot_pages)); - /* XXX FIXME: try a shadow flush to free up some memory. */ - domain_crash_synchronous(); - - return 0; -} - -#if CONFIG_PAGING_LEVELS == 2 -static unsigned long -shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, - unsigned long smfn) -{ - unsigned long hl2mfn; - l1_pgentry_t *hl2; - int limit; - - ASSERT(PGT_base_page_table == PGT_l2_page_table); - - if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) ) - { - printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n", - gpfn, gmfn); - BUG(); /* XXX Deal gracefully with failure. 
*/ - } - - SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx", - gpfn, gmfn, smfn, hl2mfn); - perfc_incrc(shadow_hl2_table_count); - - hl2 = map_domain_page(hl2mfn); - - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - - memset(hl2, 0, limit * sizeof(l1_pgentry_t)); - - if ( !shadow_mode_external(d) ) - { - memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0, - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - - // Setup easy access to the GL2, SL2, and HL2 frames. - // - hl2[l2_table_offset(LINEAR_PT_VIRT_START)] = - l1e_from_pfn(gmfn, __PAGE_HYPERVISOR); - hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - l1e_from_pfn(smfn, __PAGE_HYPERVISOR); - hl2[l2_table_offset(PERDOMAIN_VIRT_START)] = - l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); - } - - unmap_domain_page(hl2); - - return hl2mfn; -} - -/* - * This could take and use a snapshot, and validate the entire page at - * once, or it could continue to fault in entries one at a time... - * Might be worth investigating... - */ -static unsigned long shadow_l2_table( - struct vcpu *v, unsigned long gpfn, unsigned long gmfn) -{ - unsigned long smfn; - l2_pgentry_t *spl2e; - struct domain *d = v->domain; - int i; - - SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn); - - perfc_incrc(shadow_l2_table_count); - - if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) ) - { - printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n", - gpfn, gmfn); - BUG(); /* XXX Deal gracefully with failure. */ - } - - spl2e = (l2_pgentry_t *)map_domain_page(smfn); - - /* Install hypervisor and 2x linear p.t. mapings. */ - if ( (PGT_base_page_table == PGT_l2_page_table) && - !shadow_mode_external(d) ) - { - /* - * We could proactively fill in PDEs for pages that are already - * shadowed *and* where the guest PDE has _PAGE_ACCESSED set - * (restriction required for coherence of the accessed bit). However, - * we tried it and it didn't help performance. This is simpler. - */ - memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t)); - - /* Install hypervisor and 2x linear p.t. mapings. */ - memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - - spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = - l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))-> - arch.mm_perdomain_pt) + i, - __PAGE_HYPERVISOR); - - if ( shadow_mode_translate(d) ) // NB: not external - { - unsigned long hl2mfn; - - spl2e[l2_table_offset(RO_MPT_VIRT_START)] = - l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table), - __PAGE_HYPERVISOR); - - if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) - hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); - - // shadow_mode_translate (but not external) sl2 tables hold a - // ref to their hl2. 
- // - if ( !get_shadow_ref(hl2mfn) ) - BUG(); - - spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = - l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); - } - else - spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = - l2e_from_pfn(gmfn, __PAGE_HYPERVISOR); - } - else - { - memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t)); - } - - unmap_domain_page(spl2e); - - SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn); - return smfn; -} -#endif /* CONFIG_PAGING_LEVELS == 2 */ - -static void shadow_map_l1_into_current_l2(unsigned long va) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - l1_pgentry_t *spl1e, *spl1e_next = 0; - l2_pgentry_t sl2e; - guest_l1_pgentry_t *gpl1e; - guest_l2_pgentry_t gl2e = {0}; - unsigned long gl1pfn, gl1mfn, sl1mfn; - int i, init_table = 0; - - __guest_get_l2e(v, va, &gl2e); - ASSERT(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT); - gl1pfn = l2e_get_pfn(gl2e); - - if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) ) - { - /* This L1 is NOT already shadowed so we need to shadow it. */ - SH_VVLOG("4a: l1 not shadowed"); - - gl1mfn = gmfn_to_mfn(d, gl1pfn); - if ( unlikely(!VALID_MFN(gl1mfn)) ) - { - // Attempt to use an invalid pfn as an L1 page. - // XXX this needs to be more graceful! - BUG(); - } - - if ( unlikely(!(sl1mfn = - alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) ) - { - printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n", - gl1pfn, gl1mfn); - BUG(); /* XXX Need to deal gracefully with failure. */ - } - - perfc_incrc(shadow_l1_table_count); - init_table = 1; - } - else - { - /* This L1 is shadowed already, but the L2 entry is missing. */ - SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn); - } - -#ifndef NDEBUG - { - l2_pgentry_t old_sl2e; - __shadow_get_l2e(v, va, &old_sl2e); - ASSERT(!(l2e_get_flags(old_sl2e) & _PAGE_PRESENT)); - } -#endif - -#if CONFIG_PAGING_LEVELS >= 3 - if ( SH_L1_HAS_NEXT_PAGE && - d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - /* for 32-bit HVM guest on 64-bit or PAE host, - * need update two L2 entries each time - */ - if ( !get_shadow_ref(sl1mfn)) - BUG(); - l2pde_general(d, &gl2e, &sl2e, sl1mfn); - __guest_set_l2e(v, va, &gl2e); - __shadow_set_l2e(v, va & ~((1<= 3 - if ( SH_L1_HAS_NEXT_PAGE && - d->arch.ops->guest_paging_levels == PAGING_L2 ) - __shadow_get_l2e(v, va & ~((1UL << L2_PAGETABLE_SHIFT_32) - 1), &tmp_sl2e); - else -#endif - __shadow_get_l2e(v, va, &tmp_sl2e); - - spl1e = (l1_pgentry_t *) map_domain_page(l2e_get_pfn(tmp_sl2e)); - - if ( SH_L1_HAS_NEXT_PAGE ) - spl1e_next = (l1_pgentry_t *) map_domain_page( - (l2e_get_pfn(tmp_sl2e) + 1UL)); - - for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ ) - { - l1pte_propagate_from_guest(d, gpl1e[i], &sl1e); - if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) && - unlikely(!shadow_get_page_from_l1e(sl1e, d)) ) - sl1e = l1e_empty(); - if ( l1e_get_flags(sl1e) == 0 ) - { - // First copy entries from 0 until first invalid. - // Then copy entries from index until first invalid. 
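/*
 * Minimal sketch of the two-run fill described in the comment here: the
 * surrounding loop populates the new shadow L1 in two runs -- slots 0 up
 * to the first empty entry, then the faulting slot 'index' up to the next
 * empty entry -- while remembering the lowest and highest slots actually
 * written so later resyncs can stay inside that window.  Types are
 * simplified and prefetch_two_runs() is a hypothetical helper standing in
 * for l1pte_propagate_from_guest() and friends, not code from this tree.
 */
static void prefetch_two_runs(const unsigned long *guest_l1,
                              unsigned long *shadow_l1,
                              int nr_entries, int index,
                              int *min_out, int *max_out)
{
    int i, min = nr_entries - 1, max = 0;

    for ( i = 0; i < nr_entries; i++ )
    {
        if ( guest_l1[i] == 0 )
        {
            /* An empty entry ends the current run... */
            if ( i < index )
            {
                i = index - 1;   /* ...so jump ahead to the faulting slot */
                continue;
            }
            break;               /* second run finished */
        }
        shadow_l1[i] = guest_l1[i];  /* stand-in for the real propagation */
        if ( i < min ) min = i;
        if ( i > max ) max = i;
    }

    /* If nothing was written, max < min and the window is empty. */
    *min_out = min;
    *max_out = max;
}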
- // - if ( i < index ) { - i = index - 1; - continue; - } - break; - } - - if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES ) - spl1e_next[i - L1_PAGETABLE_ENTRIES] = sl1e; - else - spl1e[i] = sl1e; - - if ( unlikely(i < min) ) - min = i; - if ( likely(i > max) ) - max = i; - set_guest_back_ptr(d, sl1e, sl1mfn, i); - } - - mfn_to_page(sl1mfn)->tlbflush_timestamp = - SHADOW_ENCODE_MIN_MAX(min, max); - - unmap_domain_page(gpl1e); - unmap_domain_page(spl1e); - - if ( SH_L1_HAS_NEXT_PAGE ) - unmap_domain_page(spl1e_next); - } -} - -#if CONFIG_PAGING_LEVELS == 2 -static void -shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - l2_pgentry_t sl2e = {0}; - - __shadow_get_l2e(v, va, &sl2e); - if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) - { - /* - * Either the L1 is not shadowed, or the shadow isn't linked into - * the current shadow L2. - */ - if ( create_l1_shadow ) - { - perfc_incrc(shadow_set_l1e_force_map); - shadow_map_l1_into_current_l2(va); - } - else /* check to see if it exists; if so, link it in */ - { - l2_pgentry_t gpde = {0}; - unsigned long gl1pfn; - unsigned long sl1mfn; - - __guest_get_l2e(v, va, &gpde); - - if ( l2e_get_flags(gpde) & _PAGE_PRESENT ) - { - gl1pfn = l2e_get_pfn(gpde); - sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow); - } - else - { - // no shadow exists, so there's nothing to do. - perfc_incrc(shadow_set_l1e_fail); - return; - } - - if ( sl1mfn ) - { - perfc_incrc(shadow_set_l1e_unlinked); - if ( !get_shadow_ref(sl1mfn) ) - BUG(); - l2pde_general(d, (guest_l2_pgentry_t *)&gpde, &sl2e, sl1mfn); - __guest_set_l2e(v, va, &gpde); - __shadow_set_l2e(v, va, &sl2e); - } - else - { - // no shadow exists, so there's nothing to do. - perfc_incrc(shadow_set_l1e_fail); - return; - } - } - } - - __shadow_get_l2e(v, va, &sl2e); - - if ( shadow_mode_refcounts(d) ) - { - l1_pgentry_t old_spte; - __shadow_get_l1e(v, va, &old_spte); - - // only do the ref counting if something important changed. - // - if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) ) - { - if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(new_spte, d) ) - new_spte = l1e_empty(); - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - shadow_put_page_from_l1e(old_spte, d); - } - } - - set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va)); - __shadow_set_l1e(v, va, &new_spte); - shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va)); -} - -static void shadow_invlpg_32(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - l1_pgentry_t gpte, spte; - - ASSERT(shadow_mode_enabled(d)); - - shadow_lock(d); - - __shadow_sync_va(v, va); - - // XXX mafetter: will need to think about 4MB pages... - - // It's not strictly necessary to update the shadow here, - // but it might save a fault later. 
- // - /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT], - sizeof(gpte))) {*/ - if (unlikely(!__guest_get_l1e(v, va, &gpte))) { - perfc_incrc(shadow_invlpg_faults); - shadow_unlock(d); - return; - } - l1pte_propagate_from_guest(d, gpte, &spte); - shadow_set_l1e(va, spte, 1); - - shadow_unlock(d); -} -#endif /* CONFIG_PAGING_LEVELS == 2 */ - -#if CONFIG_PAGING_LEVELS >= 3 -static void shadow_set_l1e_64( - unsigned long va, pgentry_64_t *sl1e_p, - int create_l1_shadow) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - pgentry_64_t sle = { 0 }; - pgentry_64_t sle_up = {0}; - l1_pgentry_t old_spte; - l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p; - int i; - unsigned long orig_va = 0; - - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - /* This is for 32-bit VMX guest on 64-bit host */ - orig_va = va; - va = va & (~((1<= PAGING_L2; i-- ) - { - if ( !__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i) ) - { - sl1e = l1e_empty(); - goto out; - } - if ( !(entry_get_flags(sle) & _PAGE_PRESENT) ) - { - if ( create_l1_shadow ) - { - perfc_incrc(shadow_set_l3e_force_map); - shadow_map_into_current(v, va, i-1, i); - __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i); - } - } - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - { - if ( i < PAGING_L3 ) - shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i)); - } - else - { - if ( i < PAGING_L4 ) - shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i)); - } - - sle_up = sle; - } - - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - va = orig_va; - } - - if ( shadow_mode_refcounts(d) ) - { - __shadow_get_l1e(v, va, &old_spte); - if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) ) - { - if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(sl1e, d) ) - sl1e = l1e_empty(); - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - put_page_from_l1e(old_spte, d); - } - } - -out: - __shadow_set_l1e(v, va, &sl1e); - - shadow_update_min_max(entry_get_pfn(sle_up), guest_l1_table_offset(va)); -} -#endif /* CONFIG_PAGING_LEVELS >= 3 */ - -static struct out_of_sync_entry * -shadow_alloc_oos_entry(struct domain *d) -{ - struct out_of_sync_entry *f, *extra; - unsigned size, i; - - if ( unlikely(d->arch.out_of_sync_free == NULL) ) - { - FSH_LOG("Allocate more fullshadow tuple blocks."); - - size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f)); - extra = xmalloc_bytes(size); - - /* XXX Should be more graceful here. */ - if ( extra == NULL ) - BUG(); - - memset(extra, 0, size); - - /* Record the allocation block so it can be correctly freed later. */ - d->arch.out_of_sync_extras_count++; - *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = - d->arch.out_of_sync_extras; - d->arch.out_of_sync_extras = &extra[0]; - - /* Thread a free chain through the newly-allocated nodes. */ - for ( i = 0; i < (out_of_sync_extra_size - 1); i++ ) - extra[i].next = &extra[i+1]; - extra[i].next = NULL; - - /* Add the new nodes to the free list. */ - d->arch.out_of_sync_free = &extra[0]; - } - - /* Allocate a new node from the quicklist. 
*/ - f = d->arch.out_of_sync_free; - d->arch.out_of_sync_free = f->next; - - return f; -} - -static inline unsigned long -shadow_make_snapshot( - struct domain *d, unsigned long gpfn, unsigned long gmfn) -{ - unsigned long smfn, sl1mfn = 0; - void *original, *snapshot; - u32 min_max = 0; - int min, max, length; - - if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) ) - { - ASSERT(__shadow_status(d, gpfn, PGT_snapshot)); - return SHADOW_SNAPSHOT_ELSEWHERE; - } - - perfc_incrc(shadow_make_snapshot); - - if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) ) - { - printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n" - "Dom%d snapshot_count_count=%d\n", - gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count); - BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */ - } - - if ( !get_shadow_ref(smfn) ) - BUG(); - - if ( shadow_mode_refcounts(d) && - (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) ) - min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp; - mfn_to_page(smfn)->tlbflush_timestamp = min_max; - - min = SHADOW_MIN(min_max); - max = SHADOW_MAX(min_max); - length = max - min + 1; - perfc_incr_histo(snapshot_copies, length, PT_UPDATES); - - min *= sizeof(guest_l1_pgentry_t); - length *= sizeof(guest_l1_pgentry_t); - - original = map_domain_page(gmfn); - snapshot = map_domain_page(smfn); - memcpy(snapshot + min, original + min, length); - unmap_domain_page(original); - unmap_domain_page(snapshot); - - return smfn; -} - -static struct out_of_sync_entry * -__mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn, - unsigned long mfn) -{ - struct domain *d = v->domain; - struct page_info *page = mfn_to_page(mfn); - struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d); - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(mfn_valid(mfn)); - -#ifndef NDEBUG - { - u32 type = page->u.inuse.type_info & PGT_type_mask; - if ( shadow_mode_refcounts(d) ) - { - ASSERT(type == PGT_writable_page); - } - else - { - ASSERT(type && (type < PGT_l4_page_table)); - } - } -#endif - - FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__, - gpfn, mfn, page->count_info, page->u.inuse.type_info); - - // XXX this will require some more thought... Cross-domain sharing and - // modification of page tables? Hmm... - // - if ( d != page_get_owner(page) ) - BUG(); - - perfc_incrc(shadow_mark_mfn_out_of_sync_calls); - - entry->v = v; - entry->gpfn = gpfn; - entry->gmfn = mfn; - entry->writable_pl1e = -1; - -#if 0 // this code has not been updated for 32pae & 64 bit modes -#if SHADOW_DEBUG - mark_shadows_as_reflecting_snapshot(d, gpfn); -#endif -#endif - - // increment guest's ref count to represent the entry in the - // full shadow out-of-sync list. 
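/*
 * Sketch of the min/max window trick used by shadow_make_snapshot() above
 * and by the resync code later in this file: the shadow page's
 * tlbflush_timestamp is reused to pack the lowest and highest L1 slots
 * known to be live, so snapshot copies and resyncs only touch that window.
 * The 16/16 bit split and the SKETCH_* names below are assumptions for
 * illustration only; the real SHADOW_ENCODE_MIN_MAX layout is defined
 * elsewhere in the tree.
 */
#define SKETCH_ENCODE_MIN_MAX(_min, _max) (((_min) << 16) | ((_max) & 0xffffu))
#define SKETCH_MIN(_mm)                   ((_mm) >> 16)
#define SKETCH_MAX(_mm)                   ((_mm) & 0xffffu)

static unsigned int snapshot_copy_bytes(unsigned int min_max)
{
    unsigned int min = SKETCH_MIN(min_max);
    unsigned int max = SKETCH_MAX(min_max);

    /* Bytes worth copying: entries [min, max] inclusive.  The entry size
     * stands in for sizeof(guest_l1_pgentry_t). */
    return (max - min + 1) * sizeof(unsigned long);
}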
- // - get_page(page, d); - - return entry; -} - -static struct out_of_sync_entry * -mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn, - unsigned long mfn) -{ - struct out_of_sync_entry *entry = - __mark_mfn_out_of_sync(v, gpfn, mfn); - struct domain *d = v->domain; - - entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn); - // Add to the out-of-sync list - // - entry->next = d->arch.out_of_sync; - d->arch.out_of_sync = entry; - - return entry; - -} - -static void shadow_mark_va_out_of_sync( - struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va) -{ - struct out_of_sync_entry *entry = - __mark_mfn_out_of_sync(v, gpfn, mfn); - l2_pgentry_t sl2e; - struct domain *d = v->domain; - -#if CONFIG_PAGING_LEVELS >= 3 - { - l4_pgentry_t sl4e; - l3_pgentry_t sl3e; - - __shadow_get_l4e(v, va, &sl4e); - if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) { - shadow_map_into_current(v, va, PAGING_L3, PAGING_L4); - } - - if (!__shadow_get_l3e(v, va, &sl3e)) { - BUG(); - } - - if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) { - shadow_map_into_current(v, va, PAGING_L2, PAGING_L3); - } - } -#endif - - // We need the address of shadow PTE that maps @va. - // It might not exist yet. Make sure it's there. - // - __shadow_get_l2e(v, va, &sl2e); - if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) - { - // either this L1 isn't shadowed yet, or the shadow isn't linked into - // the current L2. - shadow_map_l1_into_current_l2(va); - __shadow_get_l2e(v, va, &sl2e); - } - ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT); - - entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn); - // NB: this is stored as a machine address. - entry->writable_pl1e = - l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va)); - ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ); - entry->va = va; - - // Increment shadow's page count to represent the reference - // inherent in entry->writable_pl1e - // - if ( !get_shadow_ref(l2e_get_pfn(sl2e)) ) - BUG(); - - // Add to the out-of-sync list - // - entry->next = d->arch.out_of_sync; - d->arch.out_of_sync = entry; - - FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)", - __func__, va, entry->writable_pl1e); -} - -/* - * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches. - * Returns 0 otherwise. - */ -static int snapshot_entry_matches( - struct domain *d, guest_l1_pgentry_t *guest_pt, - unsigned long gpfn, unsigned index) -{ - unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot); - guest_l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ... - int entries_match; - - perfc_incrc(snapshot_entry_matches_calls); - - if ( !smfn ) - return 0; - - snapshot = map_domain_page(smfn); - - if (__copy_from_user(&gpte, &guest_pt[index], - sizeof(gpte))) - { - unmap_domain_page(snapshot); - return 0; - } - - // This could probably be smarter, but this is sufficent for - // our current needs. - // - entries_match = !guest_l1e_has_changed(gpte, snapshot[index], - PAGE_FLAG_MASK); - - unmap_domain_page(snapshot); - -#ifdef PERF_COUNTERS - if ( entries_match ) - perfc_incrc(snapshot_entry_matches_true); -#endif - - return entries_match; -} - -/* - * Returns 1 if va's shadow mapping is out-of-sync. - * Returns 0 otherwise. - */ -static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */ -{ - struct domain *d = v->domain; -#if CONFIG_PAGING_LEVELS == 4 - unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)? 
- pagetable_get_pfn(v->arch.guest_table) : - pagetable_get_pfn(v->arch.guest_table_user)); -#else - unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table); -#endif - unsigned long l2pfn = mfn_to_gmfn(d, l2mfn); - guest_l2_pgentry_t l2e; - unsigned long l1pfn, l1mfn; - guest_l1_pgentry_t *guest_pt; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(VALID_M2P(l2pfn)); - - perfc_incrc(shadow_out_of_sync_calls); - -#if CONFIG_PAGING_LEVELS >= 3 - -#define unmap_and_return(x) \ - if ( guest_pt != (guest_l1_pgentry_t *) v->arch.guest_vtable ) \ - unmap_domain_page(guest_pt); \ - return (x); - - if (d->arch.ops->guest_paging_levels >= PAGING_L3) - { - pgentry_64_t le; - unsigned long gmfn; - unsigned long gpfn; - int i; - unsigned int base_idx = 0; - base_idx = get_cr3_idxval(v); - - gmfn = l2mfn; - gpfn = l2pfn; - guest_pt = (guest_l1_pgentry_t *)v->arch.guest_vtable; - - for ( i = PAGING_L4; i >= PAGING_L3; i-- ) - { - if (d->arch.ops->guest_paging_levels == PAGING_L3 - && i == PAGING_L4) - continue; /* skip the top-level for 3-level */ - - if ( page_out_of_sync(mfn_to_page(gmfn)) && - !snapshot_entry_matches( - d, guest_pt, gpfn, guest_table_offset_64(va, i, base_idx)) ) - { - unmap_and_return (1); - } - - le = entry_empty(); - __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i); - - if ( !(entry_get_flags(le) & _PAGE_PRESENT) ) - { - unmap_and_return (0); - } - gpfn = entry_get_pfn(le); - gmfn = gmfn_to_mfn(d, gpfn); - if ( !VALID_MFN(gmfn) ) - { - unmap_and_return (0); - } - if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable ) - unmap_domain_page(guest_pt); - guest_pt = (guest_l1_pgentry_t *)map_domain_page(gmfn); - } - - /* L2 */ - if ( page_out_of_sync(mfn_to_page(gmfn)) && - !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) ) - { - unmap_and_return (1); - } - - if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable ) - unmap_domain_page(guest_pt); - - } - else -#undef unmap_and_return -#endif /* CONFIG_PAGING_LEVELS >= 3 */ - { - if ( page_out_of_sync(mfn_to_page(l2mfn)) && - !snapshot_entry_matches(d, (guest_l1_pgentry_t *)v->arch.guest_vtable, - l2pfn, guest_l2_table_offset(va)) ) - return 1; - } - - __guest_get_l2e(v, va, &l2e); - if ( !(guest_l2e_get_flags(l2e) & _PAGE_PRESENT) || - (guest_l2e_get_flags(l2e) & _PAGE_PSE)) - return 0; - - l1pfn = l2e_get_pfn(l2e); - l1mfn = gmfn_to_mfn(d, l1pfn); - - // If the l1 pfn is invalid, it can't be out of sync... 
- if ( !VALID_MFN(l1mfn) ) - return 0; - - guest_pt = (guest_l1_pgentry_t *) map_domain_page(l1mfn); - - if ( page_out_of_sync(mfn_to_page(l1mfn)) && - !snapshot_entry_matches( - d, guest_pt, l1pfn, guest_l1_table_offset(va)) ) - { - unmap_domain_page(guest_pt); - return 1; - } - - unmap_domain_page(guest_pt); - return 0; -} - -static int fix_entry( - struct domain *d, - l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find) -{ - l1_pgentry_t old = *pt; - l1_pgentry_t new = old; - - l1e_remove_flags(new,_PAGE_RW); - if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) ) - BUG(); - (*found)++; - *pt = new; - if ( is_l1_shadow ) - shadow_put_page_from_l1e(old, d); - - return (*found == max_refs_to_find); -} - -static u32 remove_all_write_access_in_ptpage( - struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn, - unsigned long readonly_gpfn, unsigned long readonly_gmfn, - u32 max_refs_to_find, unsigned long prediction) -{ - l1_pgentry_t *pt = map_domain_page(pt_mfn); - l1_pgentry_t *pt_next = 0, *sl1e_p; - l1_pgentry_t match; - unsigned long flags = _PAGE_RW | _PAGE_PRESENT; - int i; - u32 found = 0; - int is_l1_shadow = - ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) == - PGT_l1_shadow); -#if CONFIG_PAGING_LEVELS >= 3 - is_l1_shadow |= - ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) == - PGT_fl1_shadow); -#endif - - if ( SH_L1_HAS_NEXT_PAGE ) - pt_next = map_domain_page(pt_mfn + 1); - - match = l1e_from_pfn(readonly_gmfn, flags); - - if ( shadow_mode_external(d) ) - { - i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask) - >> PGT_va_shift; - - if ( SH_L1_HAS_NEXT_PAGE && - i >= L1_PAGETABLE_ENTRIES ) - sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES]; - else - sl1e_p = &pt[i]; - - if ( (i >= 0 && i < GUEST_L1_PAGETABLE_ENTRIES) && - !l1e_has_changed(*sl1e_p, match, flags) && - fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) && - !prediction ) - goto out; - } - - for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ ) - { - if ( SH_L1_HAS_NEXT_PAGE && - i >= L1_PAGETABLE_ENTRIES ) - sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES]; - else - sl1e_p = &pt[i]; - - if ( unlikely(!l1e_has_changed(*sl1e_p, match, flags)) && - fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) ) - break; - } - -out: - unmap_domain_page(pt); - if ( SH_L1_HAS_NEXT_PAGE ) - unmap_domain_page(pt_next); - - return found; -} - -static int remove_all_write_access( - struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn) -{ - int i; - struct shadow_status *a; - u32 found = 0, write_refs; - unsigned long predicted_smfn; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(VALID_MFN(readonly_gmfn)); - - perfc_incrc(remove_write_access); - - // If it's not a writable page, then no writable refs can be outstanding. - // - if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) != - PGT_writable_page ) - { - perfc_incrc(remove_write_not_writable); - return 1; - } - - // How many outstanding writable PTEs for this page are there? 
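/*
 * Sketch of the search strategy used below: the writable-ref count taken
 * from type_info says how many writable PTEs must still be found; a
 * per-page "back pointer" (stashed by set_guest_back_ptr()) names the
 * shadow L1 most likely to hold one, so that page is scanned first and
 * the full shadow hash is only walked if refs remain.  Names and types
 * are simplified placeholders; scan_one() is assumed to convert matching
 * entries to read-only and return how many it fixed.
 */
static unsigned int find_writable_refs(unsigned int want,
                                       unsigned int (*scan_one)(unsigned long smfn,
                                                                unsigned int max),
                                       unsigned long predicted_smfn,
                                       const unsigned long *l1_shadows, int nr)
{
    unsigned int found = 0;
    int i;

    /* Try the predicted shadow L1 first; it usually holds the mapping. */
    if ( predicted_smfn )
        found += scan_one(predicted_smfn, want);

    /* Fall back to scanning every shadow L1 until all refs are accounted for. */
    for ( i = 0; i < nr && found < want; i++ )
        found += scan_one(l1_shadows[i], want - found);

    return found;
}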
- // - write_refs = - (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask); - if ( write_refs && MFN_PINNED(readonly_gmfn) ) - { - write_refs--; - } - - if ( write_refs == 0 ) - { - perfc_incrc(remove_write_no_work); - return 1; - } - - if ( shadow_mode_external(d) ) { - if (--write_refs == 0) - return 0; - - // Use the back pointer to locate the shadow page that can contain - // the PTE of interest - if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) { - found += remove_all_write_access_in_ptpage( - d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0); - if ( found == write_refs ) - return 0; - } - } - - // Search all the shadow L1 page tables... - // - for (i = 0; i < shadow_ht_buckets; i++) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow -#if CONFIG_PAGING_LEVELS >= 3 - || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow -#endif - ) - - { - found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask); - if ( found == write_refs ) - return 0; - } - - a = a->next; - } - } - - FSH_LOG("%s: looking for %d refs, found %d refs", - __func__, write_refs, found); - - return 0; -} - -static void resync_pae_guest_l3(struct domain *d) -{ - struct out_of_sync_entry *entry; - unsigned long i, idx; - unsigned long smfn, gmfn; - pgentry_64_t *guest, *shadow_l3, *snapshot; - struct vcpu *v = current; - int max = -1; - int unshadow = 0; - - - ASSERT( shadow_mode_external(d) ); - - gmfn = pagetable_get_pfn(v->arch.guest_table); - - for ( entry = d->arch.out_of_sync; entry; entry = entry->next ) - { - if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) - continue; - if ( entry->gmfn != gmfn ) - continue; - - idx = get_cr3_idxval(v); - - smfn = __shadow_status(d, entry->gpfn, PGT_l4_shadow); - - if ( !smfn ) - continue; - - guest = (pgentry_64_t *)map_domain_page(entry->gmfn); - snapshot = (pgentry_64_t *)map_domain_page(entry->snapshot_mfn); - shadow_l3 = (pgentry_64_t *)map_domain_page(smfn); - - for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ ) - { - int index = i + idx * PAE_L3_PAGETABLE_ENTRIES; - if ( entry_has_changed( - guest[index], snapshot[index], PAGE_FLAG_MASK) ) - { - unsigned long gpfn; - - /* - * Looks like it's no longer a page table. 
- */ - if ( unlikely(entry_get_value(guest[index]) & PAE_PDPT_RESERVED) ) - { - if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(shadow_l3[i])); - - shadow_l3[i] = entry_empty(); - continue; - } - - gpfn = entry_get_pfn(guest[index]); - - if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) ) - { - if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(shadow_l3[i])); - - shadow_l3[i] = entry_empty(); - continue; - } - - validate_entry_change(d, &guest[index], - &shadow_l3[i], PAGING_L3); - } - - if ( entry_get_value(guest[index]) != 0 ) - max = i; - - if ( !(entry_get_flags(guest[index]) & _PAGE_PRESENT) && - unlikely(entry_get_value(guest[index]) != 0) && - !unshadow && - (frame_table[smfn].u.inuse.type_info & PGT_pinned) ) - unshadow = 1; - - } - if ( max == -1 ) - unshadow = 1; - - unmap_domain_page(guest); - unmap_domain_page(snapshot); - unmap_domain_page(shadow_l3); - - if ( unlikely(unshadow) ) - shadow_unpin(smfn); - break; - } -} - -static int resync_all(struct domain *d, u32 stype) -{ - struct out_of_sync_entry *entry; - unsigned i; - unsigned long smfn; - void *guest, *shadow, *snapshot; - int need_flush = 0, external = shadow_mode_external(d); - int unshadow; - int changed; - u32 min_max_shadow, min_max_snapshot; - int min_shadow, max_shadow, min_snapshot, max_snapshot; - struct vcpu *v; - - ASSERT(shadow_lock_is_acquired(d)); - - for ( entry = d->arch.out_of_sync; entry; entry = entry->next) - { - int max = -1; - - if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) - continue; - - smfn = __shadow_status(d, entry->gpfn, stype); - - if ( !smfn ) - { - // For heavy weight shadows: no need to update refcounts if - // there's no shadow page. - // - if ( shadow_mode_refcounts(d) ) - continue; - - // For light weight shadows: only need up resync the refcounts to - // the new contents of the guest page iff this it has the right - // page type. - // - if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) ) - continue; - } - - FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx", - stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn); - - // Compare guest's new contents to its snapshot, validating - // and updating its shadow as appropriate. 
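/*
 * Sketch of the comparison performed for each out-of-sync frame below:
 * only slots inside the shadow's live window are visited, and a slot is
 * revalidated either when it lies outside the snapshot's window (so no
 * old value is known) or when the guest entry differs from the snapshot.
 * validate() stands in for validate_pte_change()/validate_entry_change();
 * the equality test and types are simplified for illustration.
 */
static int resync_window(const unsigned long *guest, const unsigned long *snapshot,
                         unsigned long *shadow,
                         int min_shadow, int max_shadow,
                         int min_snapshot, int max_snapshot,
                         int (*validate)(unsigned long gentry, unsigned long *sentry))
{
    int i, need_flush = 0;

    for ( i = min_shadow; i <= max_shadow; i++ )
    {
        if ( (i >= min_snapshot) && (i <= max_snapshot) &&
             (guest[i] == snapshot[i]) )
            continue;                       /* unchanged since the snapshot */
        need_flush |= validate(guest[i], &shadow[i]);
    }

    return need_flush;
}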
- // - guest = map_domain_page(entry->gmfn); - snapshot = map_domain_page(entry->snapshot_mfn); - - if ( smfn ) - shadow = map_domain_page(smfn); - else - shadow = NULL; - - unshadow = 0; - - min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp; - min_shadow = SHADOW_MIN(min_max_shadow); - max_shadow = SHADOW_MAX(min_max_shadow); - - min_max_snapshot= mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp; - min_snapshot = SHADOW_MIN(min_max_snapshot); - max_snapshot = SHADOW_MAX(min_max_snapshot); - - switch ( stype ) - { - case PGT_l1_shadow: - { - guest_l1_pgentry_t *guest1 = guest; - l1_pgentry_t *shadow1 = shadow; - l1_pgentry_t *shadow1_next = 0, *sl1e_p; - guest_l1_pgentry_t *snapshot1 = snapshot; - int unshadow_l1 = 0; - - ASSERT(shadow_mode_write_l1(d) || - shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d)); - - if ( !shadow_mode_refcounts(d) ) - revalidate_l1(d, (l1_pgentry_t *)guest1, (l1_pgentry_t *)snapshot1); - if ( !smfn ) - break; - - changed = 0; - - if ( SH_L1_HAS_NEXT_PAGE && shadow1 ) - shadow1_next = map_domain_page(smfn + 1); - - for ( i = min_shadow; i <= max_shadow; i++ ) - { - - if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES ) - sl1e_p = &shadow1_next[i - L1_PAGETABLE_ENTRIES]; - else - sl1e_p = &shadow1[i]; - - if ( (i < min_snapshot) || (i > max_snapshot) || - guest_l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) ) - { - int error; - -#if CONFIG_PAGING_LEVELS >= 3 - unsigned long gpfn; - - gpfn = guest_l1e_get_paddr(guest1[i]) >> PAGE_SHIFT; - - if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) ) - { - guest_l1_pgentry_t tmp_gl1e = guest_l1e_empty(); - validate_pte_change(d, tmp_gl1e, sl1e_p); - unshadow_l1 = 1; - continue; - } -#endif - - error = validate_pte_change(d, guest1[i], sl1e_p); - if ( error == -1 ) - unshadow_l1 = 1; - else { - need_flush |= error; - if ( l1e_get_flags(*sl1e_p) & _PAGE_PRESENT ) - set_guest_back_ptr(d, *sl1e_p, smfn, i); - } - // can't update snapshots of linear page tables -- they - // are used multiple times... - // - // snapshot[i] = new_pte; - - changed++; - } - } - - if ( shadow1_next ) - unmap_domain_page(shadow1_next); - - perfc_incrc(resync_l1); - perfc_incr_histo(wpt_updates, changed, PT_UPDATES); - perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES); - - if ( d->arch.ops->guest_paging_levels >= PAGING_L3 && - unshadow_l1 ) { - pgentry_64_t l2e = { 0 }; - - __shadow_get_l2e(entry->v, entry->va, &l2e); - - if ( entry_get_flags(l2e) & _PAGE_PRESENT ) { - put_shadow_ref(entry_get_pfn(l2e)); - l2e = entry_empty(); - __shadow_set_l2e(entry->v, entry->va, &l2e); - - if (entry->v == current) - need_flush = 1; - } - } - - break; - } -#if CONFIG_PAGING_LEVELS == 2 - case PGT_l2_shadow: - { - l2_pgentry_t *guest2 = guest; - l2_pgentry_t *shadow2 = shadow; - l2_pgentry_t *snapshot2 = snapshot; - - ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d)); - BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented - - changed = 0; - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - { - if ( !is_guest_l2_slot(0,i) && !external ) - continue; - - l2_pgentry_t new_pde = guest2[i]; - if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK)) - { - need_flush |= validate_pde_change(d, new_pde, &shadow2[i]); - - // can't update snapshots of linear page tables -- they - // are used multiple times... - // - // snapshot[i] = new_pde; - - changed++; - } - if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */ - max = i; - - // XXX - This hack works for linux guests. 
- // Need a better solution long term. - if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) && - unlikely(l2e_get_intpte(new_pde) != 0) && - !unshadow && MFN_PINNED(smfn) ) - unshadow = 1; - } - if ( max == -1 ) - unshadow = 1; - perfc_incrc(resync_l2); - perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES); - break; - } - case PGT_hl2_shadow: - { - l2_pgentry_t *guest2 = guest; - l2_pgentry_t *snapshot2 = snapshot; - l1_pgentry_t *shadow2 = shadow; - - ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d)); - BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented - - changed = 0; - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - { - if ( !is_guest_l2_slot(0, i) && !external ) - continue; - - l2_pgentry_t new_pde = guest2[i]; - if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) ) - { - need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]); - - // can't update snapshots of linear page tables -- they - // are used multiple times... - // - // snapshot[i] = new_pde; - - changed++; - } - } - perfc_incrc(resync_hl2); - perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES); - break; - } -#elif CONFIG_PAGING_LEVELS >= 3 - case PGT_l2_shadow: - case PGT_l3_shadow: - { - pgentry_64_t *guest_pt = guest; - pgentry_64_t *shadow_pt = shadow; - pgentry_64_t *snapshot_pt = snapshot; - - changed = 0; - for ( i = min_shadow; i <= max_shadow; i++ ) - { - if ( (i < min_snapshot) || (i > max_snapshot) || - entry_has_changed( - guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) ) - { - unsigned long gpfn; - - gpfn = entry_get_pfn(guest_pt[i]); - /* - * Looks like it's no longer a page table. - */ - if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) ) - { - if ( entry_get_flags(shadow_pt[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(shadow_pt[i])); - shadow_pt[i] = entry_empty(); - continue; - } - - need_flush |= validate_entry_change( - d, &guest_pt[i], &shadow_pt[i], - shadow_type_to_level(stype)); - changed++; - } -#if CONFIG_PAGING_LEVELS == 3 - if ( stype == PGT_l3_shadow ) - { - if ( entry_get_value(guest_pt[i]) != 0 ) - max = i; - - if ( !(entry_get_flags(guest_pt[i]) & _PAGE_PRESENT) && - unlikely(entry_get_value(guest_pt[i]) != 0) && - !unshadow && - (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) ) - unshadow = 1; - } -#endif - } - - if ( d->arch.ops->guest_paging_levels == PAGING_L3 - && max == -1 && stype == PGT_l3_shadow ) - unshadow = 1; - - perfc_incrc(resync_l3); - perfc_incr_histo(shm_l3_updates, changed, PT_UPDATES); - break; - } - case PGT_l4_shadow: - { - guest_root_pgentry_t *guest_root = guest; - guest_root_pgentry_t *snapshot_root = snapshot; - - changed = 0; - for ( i = 0; i < GUEST_ROOT_PAGETABLE_ENTRIES; i++ ) - { - guest_root_pgentry_t new_root_e = guest_root[i]; - if ( !is_guest_l4_slot(i) && !external ) - continue; - if ( root_entry_has_changed( - new_root_e, snapshot_root[i], PAGE_FLAG_MASK)) - { -#ifndef GUEST_PGENTRY_32 - l4_pgentry_t *shadow4 = shadow; - unsigned long gpfn; - - gpfn = l4e_get_pfn(new_root_e); - - /* - * Looks like it's no longer a page table. 
- */ - if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) ) - { - if ( l4e_get_flags(shadow4[i]) & _PAGE_PRESENT ) - put_shadow_ref(l4e_get_pfn(shadow4[i])); - shadow4[i] = l4e_empty(); - continue; - } - - if ( d->arch.ops->guest_paging_levels == PAGING_L4 ) - { - need_flush |= validate_entry_change( - d, (pgentry_64_t *)&new_root_e, - (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype)); - } - else -#endif - { - validate_bl2e_change(d, &new_root_e, shadow, i); - } - changed++; - ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i, - smfn, pagetable_get_paddr(current->arch.shadow_table)); - } - if ( guest_root_get_intpte(new_root_e) != 0 ) /* FIXME: check flags? */ - max = i; - - // Need a better solution in the long term. - if ( !(guest_root_get_flags(new_root_e) & _PAGE_PRESENT) && - unlikely(guest_root_get_intpte(new_root_e) != 0) && - !unshadow && - (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) ) - unshadow = 1; - } - if ( max == -1 ) - unshadow = 1; - perfc_incrc(resync_l4); - perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES); - break; - } - -#endif /* CONFIG_PAGING_LEVELS >= 3 */ - default: - BUG(); - } - - if ( smfn ) - unmap_domain_page(shadow); - unmap_domain_page(snapshot); - unmap_domain_page(guest); - - if ( unlikely(unshadow && stype == PGT_root_page_table) ) - { - for_each_vcpu(d, v) - if(smfn == pagetable_get_pfn(v->arch.shadow_table)) - return need_flush; - perfc_incrc(unshadow_l2_count); - shadow_unpin(smfn); -#if CONFIG_PAGING_LEVELS == 2 - if ( unlikely(shadow_mode_external(d)) ) - { - unsigned long hl2mfn; - - if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) && - MFN_PINNED(hl2mfn) ) - shadow_unpin(hl2mfn); - } -#endif - } - } - - return need_flush; -} - -#if CONFIG_PAGING_LEVELS == 2 -static int resync_all_levels_guest_page(struct domain *d) -{ - int need_flush = 0; - - need_flush |= resync_all(d, PGT_l1_shadow); - if ( d->arch.ops->guest_paging_levels == PAGING_L2 && - shadow_mode_translate(d) ) - { - need_flush |= resync_all(d, PGT_hl2_shadow); - } - return need_flush; -} -#elif CONFIG_PAGING_LEVELS == 3 -static int resync_all_levels_guest_page(struct domain *d) -{ - int need_flush = 0; - - need_flush |= resync_all(d, PGT_l1_shadow); - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - need_flush |= resync_all(d, PGT_l4_shadow); - else - { - need_flush |= resync_all(d, PGT_l2_shadow); - if ( shadow_mode_log_dirty(d) ) - { - need_flush |= resync_all(d, PGT_l3_shadow); - need_flush |= resync_all(d, PGT_l4_shadow); - } - else - resync_pae_guest_l3(d); - } - - return need_flush; -} -#elif CONFIG_PAGING_LEVELS == 4 -static int resync_all_levels_guest_page(struct domain *d) -{ - int need_flush = 0; - - need_flush |= resync_all(d, PGT_l1_shadow); - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - need_flush |= resync_all(d, PGT_l4_shadow); - else - { - need_flush |= resync_all(d, PGT_l2_shadow); - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - resync_pae_guest_l3(d); - else - { - need_flush |= resync_all(d, PGT_l3_shadow); - need_flush |= resync_all(d, PGT_l4_shadow); - } - } - return need_flush; -} -#endif - -static void sync_all(struct domain *d) -{ - struct out_of_sync_entry *entry; - int need_flush = 0; - l1_pgentry_t *ppte, opte, npte; - cpumask_t other_vcpus_mask; - - perfc_incrc(shadow_sync_all); - - ASSERT(shadow_lock_is_acquired(d)); - - // First, remove all write permissions to the page tables - // - for ( entry = d->arch.out_of_sync; entry; entry = entry->next) - { - // Skip entries that have low bits set... 
Those aren't - // real PTEs. - // - if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) ) - continue; - - ppte = (l1_pgentry_t *)( - (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) + - (entry->writable_pl1e & ~PAGE_MASK)); - opte = npte = *ppte; - l1e_remove_flags(npte, _PAGE_RW); - - if ( (l1e_get_flags(npte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(npte, d) ) - BUG(); - *ppte = npte; - set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT, - (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t)); - shadow_put_page_from_l1e(opte, d); - - unmap_domain_page(ppte); - } - - /* Other VCPUs mustn't use the revoked writable mappings. */ - other_vcpus_mask = d->domain_dirty_cpumask; - cpu_clear(smp_processor_id(), other_vcpus_mask); - flush_tlb_mask(other_vcpus_mask); - - /* Flush ourself later. */ - need_flush = 1; - - need_flush |= resync_all_levels_guest_page(d); - - if ( need_flush && !unlikely(shadow_mode_external(d)) ) - local_flush_tlb(); - - free_out_of_sync_state(d); -} - -static inline int l1pte_write_fault( - struct vcpu *v, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p, - unsigned long va) -{ - struct domain *d = v->domain; - guest_l1_pgentry_t gpte = *gpte_p; - l1_pgentry_t spte; - unsigned long gpfn = l1e_get_pfn(gpte); - unsigned long gmfn = gmfn_to_mfn(d, gpfn); - - //printk("l1pte_write_fault gmfn=%lx\n", gmfn); - - if ( unlikely(!VALID_MFN(gmfn)) ) - { - SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn); - *spte_p = l1e_empty(); - return 0; - } - - ASSERT(guest_l1e_get_flags(gpte) & _PAGE_RW); - guest_l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED); - spte = l1e_from_pfn(gmfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL); - - SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte, - l1e_get_intpte(spte), l1e_get_intpte(gpte)); - - __mark_dirty(d, gmfn); - - if ( mfn_is_page_table(gmfn) ) - shadow_mark_va_out_of_sync(v, gpfn, gmfn, va); - - *gpte_p = gpte; - *spte_p = spte; - - return 1; -} - -static inline int l1pte_read_fault( - struct domain *d, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p) -{ - guest_l1_pgentry_t gpte = *gpte_p; - l1_pgentry_t spte = *spte_p; - unsigned long pfn = l1e_get_pfn(gpte); - unsigned long mfn = gmfn_to_mfn(d, pfn); - - if ( unlikely(!VALID_MFN(mfn)) ) - { - SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn); - *spte_p = l1e_empty(); - return 0; - } - - guest_l1e_add_flags(gpte, _PAGE_ACCESSED); - spte = l1e_from_pfn(mfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL); - - if ( shadow_mode_log_dirty(d) || !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) || - mfn_is_page_table(mfn) ) - { - l1e_remove_flags(spte, _PAGE_RW); - } - - SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte, - l1e_get_intpte(spte), l1e_get_intpte(gpte)); - *gpte_p = gpte; - *spte_p = spte; - - return 1; -} -#if CONFIG_PAGING_LEVELS == 2 -static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs) -{ - l1_pgentry_t gpte, spte, orig_gpte; - struct vcpu *v = current; - struct domain *d = v->domain; - l2_pgentry_t gpde; - - spte = l1e_empty(); - - SH_VVLOG("shadow_fault( va=%lx, code=%lu )", - va, (unsigned long)regs->error_code); - perfc_incrc(shadow_fault_calls); - - check_pagetable(v, "pre-sf"); - - /* - * Don't let someone else take the guest's table pages out-of-sync. - */ - shadow_lock(d); - - /* XXX - FIX THIS COMMENT!!! - * STEP 1. Check to see if this fault might have been caused by an - * out-of-sync table page entry, or if we should pass this - * fault onto the guest. 
- */ - __shadow_sync_va(v, va); - - /* - * STEP 2. Check the guest PTE. - */ - __guest_get_l2e(v, va, &gpde); - if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) ) - { - SH_VVLOG("shadow_fault - EXIT: L1 not present"); - perfc_incrc(shadow_fault_bail_pde_not_present); - goto fail; - } - - // This can't fault because we hold the shadow lock and we've ensured that - // the mapping is in-sync, so the check of the PDE's present bit, above, - // covers this access. - // - //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)]; - __guest_get_l1e(v, va, &gpte); - orig_gpte = gpte; - - if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) ) - { - SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")", - l1e_get_intpte(gpte)); - perfc_incrc(shadow_fault_bail_pte_not_present); - goto fail; - } - - /* Write fault? */ - if ( regs->error_code & 2 ) - { - int allow_writes = 0; - - if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) ) - { - if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) ) - { - allow_writes = 1; - l1e_add_flags(gpte, _PAGE_RW); - } - else - { - /* Write fault on a read-only mapping. */ - SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")", - l1e_get_intpte(gpte)); - perfc_incrc(shadow_fault_bail_ro_mapping); - goto fail; - } - } - else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) ) - { - SH_LOG("l1pte_write_fault: no write access to page table page"); - domain_crash_synchronous(); - } - - if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) ) - { - SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed"); - perfc_incrc(write_fault_bail); - shadow_unlock(d); - return 0; - } - - if ( allow_writes ) - l1e_remove_flags(gpte, _PAGE_RW); - } - else - { - if ( !l1pte_read_fault(d, &gpte, &spte) ) - { - SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed"); - perfc_incrc(read_fault_bail); - shadow_unlock(d); - return 0; - } - } - - /* - * STEP 3. Write the modified shadow PTE and guest PTE back to the tables. - */ - if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) ) - { - /* XXX Watch out for read-only L2 entries! (not used in Linux). */ - /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)], - &gpte, sizeof(gpte))) )*/ - if ( unlikely(!__guest_set_l1e(v, va, &gpte))) - { - printk("%s() failed, crashing domain %d " - "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n", - __func__,d->domain_id, l2e_get_intpte(gpde), va); - domain_crash_synchronous(); - } - - __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde))); - } - - shadow_set_l1e(va, spte, 1); - - perfc_incrc(shadow_fault_fixed); - d->arch.shadow_fault_count++; - - shadow_unlock(d); - - check_pagetable(v, "post-sf"); - return EXCRET_fault_fixed; - -fail: - shadow_unlock(d); - return 0; -} -#endif /* CONFIG_PAGING_LEVELS == 2 */ - -static inline unsigned long va_to_l1mfn(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - guest_l2_pgentry_t gl2e = {0}; - - __guest_get_l2e(v, va, &gl2e); - - if ( unlikely(!(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT)) ) - return INVALID_MFN; - - return gmfn_to_mfn(d, l2e_get_pfn(gl2e)); -} - -static int do_update_va_mapping(unsigned long va, - l1_pgentry_t val, - struct vcpu *v) -{ - struct domain *d = v->domain; - l1_pgentry_t spte; - int rc = 0; - - shadow_lock(d); - - // This is actually overkill - we don't need to sync the L1 itself, - // just everything involved in getting to this L1 (i.e. we need - // linear_pg_table[l1_linear_offset(va)] to be in sync)... 
- // - __shadow_sync_va(v, va); - - l1pte_propagate_from_guest(d, *(guest_l1_pgentry_t *)&val, &spte); -#if CONFIG_PAGING_LEVELS == 2 - shadow_set_l1e(va, spte, 0); -#elif CONFIG_PAGING_LEVELS >= 3 - shadow_set_l1e_64(va, (pgentry_64_t *) &spte, 0); -#endif - /* - * If we're in log-dirty mode then we need to note that we've updated - * the PTE in the PT-holding page. We need the machine frame number - * for this. - */ - __mark_dirty(d, va_to_l1mfn(v, va)); - - shadow_unlock(d); - - return rc; -} - - -/* - * What lives where in the 32-bit address space in the various shadow modes, - * and what it uses to get/maintain that mapping. - * - * SHADOW MODE: none enable translate external - * - * 4KB things: - * guest_vtable lin_l2 mapped per gl2 lin_l2 via hl2 mapped per gl2 - * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gl2 - * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gl2 - * monitor_vtable n/a n/a n/a mapped once - * - * 4MB things: - * guest_linear lin via gl2 lin via gl2 lin via hl2 lin via hl2 - * shadow_linear n/a sh_lin via sl2 sh_lin via sl2 sh_lin via sl2 - * monitor_linear n/a n/a n/a ??? - * perdomain perdomain perdomain perdomain perdomain - * R/O M2P R/O M2P R/O M2P n/a n/a - * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P - * P2M n/a n/a R/O M2P R/O M2P - * - * NB: - * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(), - * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable() - * all play a part in maintaining these mappings. - */ -static void shadow_update_pagetables(struct vcpu *v) -{ - struct domain *d = v->domain; -#if CONFIG_PAGING_LEVELS == 4 - unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)? - pagetable_get_pfn(v->arch.guest_table) : - pagetable_get_pfn(v->arch.guest_table_user)); -#else - unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table); -#endif - - unsigned long gpfn = mfn_to_gmfn(d, gmfn); - unsigned long smfn, old_smfn; - -#if CONFIG_PAGING_LEVELS == 2 - unsigned long hl2mfn; -#endif - int need_sync = 0; - - int max_mode = ( shadow_mode_external(d) ? SHM_external - : shadow_mode_translate(d) ? SHM_translate - : shadow_mode_enabled(d) ? SHM_enable - : 0 ); - - ASSERT( ! 
IS_INVALID_M2P_ENTRY(gpfn) ); - ASSERT( max_mode ); - - /* - * arch.guest_vtable - */ - if ( max_mode & (SHM_enable | SHM_external) ) - { - if ( likely(v->arch.guest_vtable != NULL) ) - unmap_domain_page_global(v->arch.guest_vtable); - v->arch.guest_vtable = map_domain_page_global(gmfn); - } - - /* - * arch.shadow_table - */ -#if CONFIG_PAGING_LEVELS == 3 & defined (GUEST_PGENTRY_32) - /* - * We use PGT_l4_shadow for 2-level paging guests on PAE - */ - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) ) - smfn = shadow_l3_table(v, gpfn, gmfn); - } - else -#endif - -#if CONFIG_PAGING_LEVELS == 3 & defined ( GUEST_32PAE ) - /* - * We use PGT_l4_shadow for 2-level paging guests on PAE - */ - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - { - if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) ) - smfn = shadow_l3_table(v, gpfn, gmfn); - else - { - update_top_level_shadow(v, smfn); - need_sync = 1; - } - } - else -#endif - if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) ) - { -#if CONFIG_PAGING_LEVELS == 2 - smfn = shadow_l2_table(v, gpfn, gmfn); -#elif CONFIG_PAGING_LEVELS == 3 - smfn = shadow_l3_table(v, gpfn, gmfn); -#elif CONFIG_PAGING_LEVELS == 4 - smfn = shadow_l4_table(v, gpfn, gmfn); -#endif - } - else - { -#if CONFIG_PAGING_LEVELS >= 3 - if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 ) - update_top_level_shadow(v, smfn); -#endif - /* - * move sync later in order to avoid this smfn been - * unshadowed occasionally - */ - need_sync = 1; - } - - - if ( !get_shadow_ref(smfn) ) - BUG(); - old_smfn = pagetable_get_pfn(v->arch.shadow_table); - v->arch.shadow_table = pagetable_from_pfn(smfn); - if ( old_smfn ) - put_shadow_ref(old_smfn); - - SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn); - - /* - * arch.shadow_vtable - */ - if ( max_mode == SHM_external -#if CONFIG_PAGING_LEVELS >=3 - || max_mode & SHM_enable -#endif - ) - { - if ( v->arch.shadow_vtable ) - unmap_domain_page_global(v->arch.shadow_vtable); - v->arch.shadow_vtable = map_domain_page_global(smfn); - } - -#if CONFIG_PAGING_LEVELS == 2 - /* - * arch.hl2_vtable - */ - - // if max_mode == SHM_translate, then the hl2 is already installed - // correctly in its smfn, and there's nothing to do. - // - if ( max_mode == SHM_external ) - { - if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) - hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); - if ( v->arch.hl2_vtable ) - unmap_domain_page_global(v->arch.hl2_vtable); - v->arch.hl2_vtable = map_domain_page_global(hl2mfn); - } - - /* - * fixup pointers in monitor table, as necessary - */ - if ( max_mode == SHM_external ) - { - l2_pgentry_t *mpl2e = v->arch.monitor_vtable; - l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; - l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; - - ASSERT( shadow_mode_translate(d) ); - - if ( !get_shadow_ref(hl2mfn) ) - BUG(); - mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = - l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); - if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT ) - put_shadow_ref(l2e_get_pfn(old_hl2e)); - - if ( !get_shadow_ref(smfn) ) - BUG(); - mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) - put_shadow_ref(l2e_get_pfn(old_sl2e)); - - // XXX - maybe this can be optimized somewhat?? 
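/*
 * Sketch of the reference-swap pattern used above when repointing
 * v->arch.shadow_table and the monitor-table slots: the reference on the
 * new shadow is taken *before* the old one is dropped, so the new frame
 * can never be freed in the window between the two operations.
 * get_ref()/put_ref() are simplified stand-ins for
 * get_shadow_ref()/put_shadow_ref(); the real code BUG()s if the get fails.
 */
static void swap_shadow_ref(unsigned long *slot, unsigned long new_smfn,
                            int (*get_ref)(unsigned long),
                            void (*put_ref)(unsigned long))
{
    unsigned long old_smfn = *slot;

    if ( !get_ref(new_smfn) )   /* pin the new shadow first */
        return;

    *slot = new_smfn;

    if ( old_smfn )
        put_ref(old_smfn);      /* now safe to release the old one */
}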
- local_flush_tlb(); - } -#endif /* CONFIG_PAGING_LEVELS == 2 */ - -#if CONFIG_PAGING_LEVELS == 3 - /* - * fixup pointers in monitor table, as necessary - */ - if ( max_mode == SHM_external ) - { - l3_pgentry_t *mpl3e = (l3_pgentry_t *) v->arch.monitor_vtable; - l2_pgentry_t *spl2e; - unsigned long s2mfn; - int i; - - ASSERT( shadow_mode_translate(d) ); - s2mfn = l3e_get_pfn(mpl3e[L3_PAGETABLE_ENTRIES - 1]); - - ASSERT( s2mfn); - spl2e = map_domain_page(s2mfn); - - for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ ) - spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] = - (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ? - l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) : - l2e_empty(); - - unmap_domain_page(spl2e); - local_flush_tlb(); - } -#endif - - if(likely(need_sync)) - shadow_sync_all(d); -} - - -/************************************************************************/ -/************************************************************************/ -/************************************************************************/ - -#if 0 // this code has not been updated for 32pae & 64 bit modes -#if SHADOW_DEBUG - -// The following is entirely for _check_pagetable()'s benefit. -// _check_pagetable() wants to know whether a given entry in a -// shadow page table is supposed to be the shadow of the guest's -// current entry, or the shadow of the entry held in the snapshot -// taken above. -// -// Here, we mark all currently existing entries as reflecting -// the snapshot, above. All other places in xen that update -// the shadow will keep the shadow in sync with the guest's -// entries (via l1pte_propagate_from_guest and friends), which clear -// the SHADOW_REFLECTS_SNAPSHOT bit. -// -static void -mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn) -{ - unsigned long smfn; - l1_pgentry_t *l1e; - l2_pgentry_t *l2e; - unsigned i; - - if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) ) - { - l1e = map_domain_page(smfn); - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - if ( is_guest_l1_slot(i) && - (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) ) - l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT); - unmap_domain_page(l1e); - } - - if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) ) - { - l2e = map_domain_page(smfn); - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - if ( is_guest_l2_slot(0, i) && - (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) ) - l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT); - unmap_domain_page(l2e); - } -} - -// BUG: these are not SMP safe... -static int sh_l2_present; -static int sh_l1_present; -static char *sh_check_name; -// int shadow_status_noswap; // declared in shadow32.c - -#define v2m(_v, _adr) ({ \ - unsigned long _a = (unsigned long)(_adr); \ - l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \ - unsigned long _pa = -1; \ - if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \ - { \ - l1_pgentry_t _pte; \ - _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \ - if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \ - _pa = l1e_get_paddr(_pte); \ - } \ - _pa | (_a & ~PAGE_MASK); \ -}) - -#define FAIL(_f, _a...) 
\ - do { \ - printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \ - sh_check_name, level, l2_idx, l1_idx, ## _a, \ - __FILE__, __LINE__); \ - printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \ - " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \ - " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \ - " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \ - l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \ - l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \ - p_guest_pte, p_shadow_pte, p_snapshot_pte, \ - (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \ - (void *)v2m(v, p_snapshot_pte), \ - (l2_idx << L2_PAGETABLE_SHIFT) | \ - (l1_idx << L1_PAGETABLE_SHIFT)); \ - errors++; \ - } while ( 0 ) - -static int check_pte( - struct vcpu *v, - l1_pgentry_t *p_guest_pte, - l1_pgentry_t *p_shadow_pte, - l1_pgentry_t *p_snapshot_pte, - int level, int l2_idx, int l1_idx) -{ - struct domain *d = v->domain; - l1_pgentry_t guest_pte = *p_guest_pte; - l1_pgentry_t shadow_pte = *p_shadow_pte; - l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty(); - l1_pgentry_t eff_guest_pte; - unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn; - int errors = 0, guest_writable; - int page_table_page; - - if ( (l1e_get_intpte(shadow_pte) == 0) || - (l1e_get_intpte(shadow_pte) == 0xdeadface) || - (l1e_get_intpte(shadow_pte) == 0x00000E00) ) - return errors; /* always safe */ - - if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) ) - FAIL("Non zero not present shadow_pte"); - - if ( level == 2 ) sh_l2_present++; - if ( level == 1 ) sh_l1_present++; - - if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte ) - eff_guest_pte = snapshot_pte; - else - eff_guest_pte = guest_pte; - - if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) ) - FAIL("Guest not present yet shadow is"); - - mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK); - - if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) ) - FAIL("Corrupt?"); - - if ( (level == 1) && - (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) && - !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) ) - FAIL("Dirty coherence"); - - if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) && - !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) ) - FAIL("Accessed coherence"); - - if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL ) - FAIL("global bit set in shadow"); - - eff_guest_pfn = l1e_get_pfn(eff_guest_pte); - eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn); - shadow_mfn = l1e_get_pfn(shadow_pte); - - if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) ) - FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n", - __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte)); - - page_table_page = mfn_is_page_table(eff_guest_mfn); - - guest_writable = - (l1e_get_flags(eff_guest_pte) & _PAGE_RW) || - (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn)); - - if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable ) - { - printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n", - eff_guest_pfn, eff_guest_mfn, shadow_mfn, - mfn_to_page(eff_guest_mfn)->u.inuse.type_info, - page_table_page); - FAIL("RW coherence"); - } - - if ( (level == 1) && - (l1e_get_flags(shadow_pte) & _PAGE_RW ) && - !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) ) - { - printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n", - eff_guest_pfn, eff_guest_mfn, shadow_mfn, - 
mfn_to_page(eff_guest_mfn)->u.inuse.type_info, - page_table_page); - FAIL("RW2 coherence"); - } - - if ( eff_guest_mfn == shadow_mfn ) - { - if ( level > 1 ) - FAIL("Linear map ???"); /* XXX this will fail on BSD */ - } - else - { - if ( level < 2 ) - FAIL("Shadow in L1 entry?"); - - if ( level == 2 ) - { - if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn ) - FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn, - __shadow_status(d, eff_guest_pfn, PGT_l1_shadow)); - } - else - BUG(); // XXX -- not handled yet. - } - - return errors; -} -#undef FAIL -#undef v2m - -static int check_l1_table( - struct vcpu *v, unsigned long gpfn, - unsigned long gmfn, unsigned long smfn, unsigned l2_idx) -{ - struct domain *d = v->domain; - int i; - unsigned long snapshot_mfn; - l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL; - int errors = 0; - - if ( page_out_of_sync(mfn_to_page(gmfn)) ) - { - snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot); - ASSERT(snapshot_mfn); - p_snapshot = map_domain_page(snapshot_mfn); - } - - p_guest = map_domain_page(gmfn); - p_shadow = map_domain_page(smfn); - - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - errors += check_pte(v, p_guest+i, p_shadow+i, - p_snapshot ? p_snapshot+i : NULL, - 1, l2_idx, i); - - unmap_domain_page(p_shadow); - unmap_domain_page(p_guest); - if ( p_snapshot ) - unmap_domain_page(p_snapshot); - - return errors; -} - -#define FAILPT(_f, _a...) \ - do { \ - printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \ - errors++; \ - } while ( 0 ) - -static int check_l2_table( - struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes) -{ - struct domain *d = v->domain; - l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn); - l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn); - l2_pgentry_t match; - int i; - int errors = 0; - int limit; - - if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) ) - FAILPT("domain doesn't own page"); - if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) ) - FAILPT("bogus owner for snapshot page"); - if ( page_get_owner(mfn_to_page(smfn)) != NULL ) - FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d", - smfn, page_get_owner(mfn_to_page(smfn))->domain_id); - -#if 0 - if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - - DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) ) - { - for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT); - i++ ) - printk("+++ (%d) %lx %lx\n",i, - l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i])); - FAILPT("hypervisor entries inconsistent"); - } - - if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != - l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) ) - FAILPT("hypervisor linear map inconsistent"); -#endif - - match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - if ( !shadow_mode_external(d) && - l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT], - match, PAGE_FLAG_MASK)) - { - FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte, - l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >> - L2_PAGETABLE_SHIFT]), - l2e_get_intpte(match)); - } - - match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR); - if ( !shadow_mode_external(d) && - l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT], - match, PAGE_FLAG_MASK)) - { - FAILPT("hypervisor per-domain map 
inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte, - l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]), - d->arch.mm_perdomain_pt, - l2e_get_intpte(match)); - } - -#if CONFIG_PAGING_LEVELS == 2 - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; -#else - limit = 0; /* XXX x86/64 XXX */ -#endif - - /* Check the whole L2. */ - for ( i = 0; i < limit; i++ ) - errors += check_pte(v, - (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */ - (l1_pgentry_t*)(&spl2e[i]), - NULL, - 2, i, 0); - - unmap_domain_page(spl2e); - unmap_domain_page(gpl2e); - -#if 1 - if ( errors ) - printk("check_l2_table returning %d errors\n", errors); -#endif - - return errors; -} -#undef FAILPT - -int _check_pagetable(struct vcpu *v, char *s) -{ - struct domain *d = v->domain; -#if CONFIG_PAGING_LEVELS == 4 - pagetable_t pt = ((v->arch.flags & TF_kernel_mode)? - v->arch.guest_table : v->arch.guest_table_user); -#else - pagetable_t pt = v->arch.guest_table; -#endif - unsigned long gptbase = pagetable_get_paddr(pt); - unsigned long ptbase_pfn, smfn; - unsigned long i; - l2_pgentry_t *gpl2e, *spl2e; - unsigned long ptbase_mfn = 0; - int errors = 0, limit, oos_pdes = 0; - - //_audit_domain(d, AUDIT_QUIET); - shadow_lock(d); - - sh_check_name = s; - //SH_VVLOG("%s-PT Audit", s); - sh_l2_present = sh_l1_present = 0; - perfc_incrc(check_pagetable); - - ptbase_mfn = gptbase >> PAGE_SHIFT; - ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn); - - if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) ) - { - printk("%s-PT %lx not shadowed\n", s, gptbase); - goto out; - } - if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) ) - { - ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot); - oos_pdes = 1; - ASSERT(ptbase_mfn); - } - - errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes); - - gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn); - spl2e = (l2_pgentry_t *) map_domain_page(smfn); - - /* Go back and recurse. */ -#if CONFIG_PAGING_LEVELS == 2 - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; -#else - limit = 0; /* XXX x86/64 XXX */ -#endif - - for ( i = 0; i < limit; i++ ) - { - unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]); - unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn); - unsigned long sl1mfn = l2e_get_pfn(spl2e[i]); - - if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? 
*/ - { - errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i); - } - } - - unmap_domain_page(spl2e); - unmap_domain_page(gpl2e); - -#if 0 - SH_VVLOG("PT verified : l2_present = %d, l1_present = %d", - sh_l2_present, sh_l1_present); -#endif - - out: - if ( errors ) - BUG(); - - shadow_unlock(d); - - return errors; -} - -int _check_all_pagetables(struct vcpu *v, char *s) -{ - struct domain *d = v->domain; - int i; - struct shadow_status *a; - unsigned long gmfn; - int errors = 0; - - shadow_status_noswap = 1; - - sh_check_name = s; - SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id); - sh_l2_present = sh_l1_present = 0; - perfc_incrc(check_all_pagetables); - - for (i = 0; i < shadow_ht_buckets; i++) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask); - - switch ( a->gpfn_and_flags & PGT_type_mask ) - { - case PGT_l1_shadow: - errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask, - gmfn, a->smfn, 0); - break; - case PGT_l2_shadow: - errors += check_l2_table(v, gmfn, a->smfn, - page_out_of_sync(mfn_to_page(gmfn))); - break; - case PGT_l3_shadow: - case PGT_l4_shadow: - case PGT_hl2_shadow: - BUG(); // XXX - ought to fix this... - break; - case PGT_snapshot: - case PGT_writable_pred: - break; - default: - errors++; - printk("unexpected shadow type %lx, gpfn=%lx, " - "gmfn=%lx smfn=%lx\n", - a->gpfn_and_flags & PGT_type_mask, - a->gpfn_and_flags & PGT_mfn_mask, - gmfn, a->smfn); - BUG(); - } - a = a->next; - } - } - - shadow_status_noswap = 0; - - if ( errors ) - BUG(); - - return errors; -} - -#endif // SHADOW_DEBUG -#endif // this code has not been updated for 32pae & 64 bit modes - -#if CONFIG_PAGING_LEVELS >= 3 -/****************************************************************************/ -/* 64-bit shadow-mode code testing */ -/****************************************************************************/ -/* - * init_bl2() is for 32-bit VMX guest on 64-bit host - * Using 1 shadow L4(l3) and 4 shadow L2s to simulate guest L2 - */ -static inline unsigned long init_bl2( - struct domain *d, unsigned long gpfn, unsigned long gmfn) -{ - unsigned int count; - unsigned long sl2mfn; - unsigned long smfn; - struct page_info *page; - l4_pgentry_t *spl4e; - void *l2; - - if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) ) - { - printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn); - /* XXX Deal gracefully with failure. */ - domain_crash_synchronous(); - } - - spl4e = (l4_pgentry_t *)map_domain_page(smfn); - - /* Map the self entry, L4&L3 share the same page */ - spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR); - - /* Allocate 4 shadow L2s */ - page = alloc_domheap_pages(NULL, SL2_ORDER, 0); - if ( !page ) - domain_crash_synchronous(); - - for ( count = 0; count < PAE_L3_PAGETABLE_ENTRIES; count++ ) - { - sl2mfn = page_to_mfn(page+count); - l2 = map_domain_page(sl2mfn); - memset(l2, 0, PAGE_SIZE); - unmap_domain_page(l2); - spl4e[count] = l4e_from_pfn(sl2mfn, _PAGE_PRESENT); - } - - unmap_domain_page(spl4e); - - return smfn; -} - -static inline unsigned long init_l3( - struct vcpu *v, unsigned long gpfn, unsigned long gmfn) -{ - unsigned long smfn; - l4_pgentry_t *spl4e; - unsigned long index; - - if ( unlikely(!(smfn = alloc_shadow_page(v->domain, gpfn, gmfn, PGT_l4_shadow))) ) - { - printk("Couldn't alloc an L4 shadow for pfn= %lx mfn= %lx\n", gpfn, gmfn); - BUG(); /* XXX Deal gracefully wiht failure. 
*/ - } - - /* Map the self entry, L4&L3 share the same page */ - spl4e = (l4_pgentry_t *)map_domain_page(smfn); - - /* - * Shadow L4's pfn_info->tlbflush_timestamp - * should also save it's own index. - */ - - index = get_cr3_idxval(v); - frame_table[smfn].tlbflush_timestamp = index; - - memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t)); - spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR); - unmap_domain_page(spl4e); - return smfn; -} -#endif - -#if CONFIG_PAGING_LEVELS == 3 -static unsigned long shadow_l3_table( - struct vcpu *v, unsigned long gpfn, unsigned long gmfn) -{ - unsigned long smfn; - l3_pgentry_t *spl3e; - struct domain *d = v->domain; - - perfc_incrc(shadow_l3_table_count); - - SH_VVLOG("shadow_l3_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn); - - if ( SH_L1_HAS_NEXT_PAGE && - d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - return init_bl2(d, gpfn, gmfn); - } - - if ( SH_GUEST_32PAE && - d->arch.ops->guest_paging_levels == PAGING_L3 ) - { - return init_l3(v, gpfn, gmfn); - } - - if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l3_shadow))) ) - { - printk("Couldn't alloc an L3 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn); - BUG(); /* XXX Deal gracefully with failure. */ - } - - spl3e = (l3_pgentry_t *)map_domain_page(smfn); - - /* Make the self entry */ - spl3e[PAE_SHADOW_SELF_ENTRY] = l3e_from_pfn(smfn, __PAGE_HYPERVISOR); - - if ( (PGT_base_page_table == PGT_l3_page_table) && - !shadow_mode_external(d) ) { - int i; - unsigned long g2mfn, s2mfn; - l2_pgentry_t *spl2e; - l3_pgentry_t *gpl3e; - - /* Get the top entry */ - gpl3e = (l3_pgentry_t *)map_domain_page(gmfn); - - if ( !(l3e_get_flags(gpl3e[L3_PAGETABLE_ENTRIES - 1]) & _PAGE_PRESENT) ) - { - BUG(); - } - - g2mfn = l3e_get_pfn(gpl3e[L3_PAGETABLE_ENTRIES - 1]); - - /* NB. g2mfn should be same as g2pfn */ - if (!(s2mfn = __shadow_status(d, g2mfn, PGT_l2_shadow))) { - if ( unlikely(!(s2mfn = - alloc_shadow_page(d, g2mfn, g2mfn, PGT_l2_shadow))) ) { - printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n", - g2mfn, g2mfn); - BUG(); /* XXX Deal gracefully with failure. */ - } - } - - if (!get_shadow_ref(s2mfn)) - BUG(); - - /* Map shadow L2 into shadow L3 */ - spl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(s2mfn, _PAGE_PRESENT); - shadow_update_min_max(smfn, L3_PAGETABLE_ENTRIES -1); - - /* - * Xen private mappings. Do the similar things as - * create_pae_xen_mappings(). - */ - spl2e = (l2_pgentry_t *)map_domain_page(s2mfn); - - /* - * When we free L2 pages, we need to tell if the page contains - * Xen private mappings. Use the va_mask part. - */ - mfn_to_page(s2mfn)->u.inuse.type_info |= - (unsigned long) 3 << PGT_score_shift; - - memset(spl2e, 0, - (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)) * sizeof(l2_pgentry_t)); - - memcpy(&spl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)], - &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT], - L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); - - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = - l2e_from_page( - virt_to_page(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_pt) + i, - __PAGE_HYPERVISOR); - for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ ) - spl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] = - (l3e_get_flags(gpl3e[i]) & _PAGE_PRESENT) ? 
- l2e_from_pfn(l3e_get_pfn(gpl3e[i]), __PAGE_HYPERVISOR) : - l2e_empty(); - - unmap_domain_page(spl2e); - unmap_domain_page(gpl3e); - } - unmap_domain_page(spl3e); - - return smfn; -} -#endif /* CONFIG_PAGING_LEVELS == 3 */ - -#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE)) -static unsigned long gva_to_gpa_pae(unsigned long gva) -{ - BUG(); - return 43; -} -#endif - -#if CONFIG_PAGING_LEVELS == 4 -static unsigned long shadow_l4_table( - struct vcpu *v, unsigned long gpfn, unsigned long gmfn) -{ - unsigned long smfn; - l4_pgentry_t *spl4e; - struct domain *d = v->domain; - - SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn); - - perfc_incrc(shadow_l4_table_count); - - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - return init_bl2(d, gpfn, gmfn); - } - - if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 ) - { - return init_l3(v, gpfn, gmfn); - } - - if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) ) - { - printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn); - BUG(); /* XXX Deal gracefully with failure. */ - } - - spl4e = (l4_pgentry_t *)map_domain_page(smfn); - - /* Install hypervisor and 4x linear p.t. mapings. */ - if ( (PGT_base_page_table == PGT_l4_page_table) && - !shadow_mode_external(d) ) - { - /* - * We could proactively fill in PDEs for pages that are already - * shadowed *and* where the guest PDE has _PAGE_ACCESSED set - * (restriction required for coherence of the accessed bit). However, - * we tried it and it didn't help performance. This is simpler. - */ - memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t)); - - /* Install hypervisor and 2x linear p.t. mapings. */ - memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], - &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], - ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); - - spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] = - l4e_from_paddr(__pa(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_l3), - __PAGE_HYPERVISOR); - - if ( shadow_mode_translate(d) ) // NB: not external - { - spl4e[l4_table_offset(RO_MPT_VIRT_START)] = - l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table), - __PAGE_HYPERVISOR); - } - else - spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] = - l4e_from_pfn(gmfn, __PAGE_HYPERVISOR); - - } else - memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t)); - - unmap_domain_page(spl4e); - - ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn); - return smfn; -} -#endif /* CONFIG_PAGING_LEVELS == 4 */ - -#if CONFIG_PAGING_LEVELS >= 3 -static void -update_top_level_shadow(struct vcpu *v, unsigned long smfn) -{ - unsigned long index = get_cr3_idxval(v); - pgentry_64_t *sple = (pgentry_64_t *)map_domain_page(smfn); - pgentry_64_t *gple = (pgentry_64_t *)&v->arch.guest_vtable; - int i; - - for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ ) - { - unsigned long gpfn; - - /* - * Looks like it's no longer a page table. 
- */ - if ( unlikely(entry_get_value(gple[index*4+i]) & PAE_PDPT_RESERVED) ) - { - if ( entry_get_flags(sple[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(sple[i])); - - sple[i] = entry_empty(); - continue; - } - - gpfn = entry_get_pfn(gple[index*4+i]); - - if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) ) - { - if ( entry_get_flags(sple[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(sple[i])); - - sple[i] = entry_empty(); - continue; - } - - validate_entry_change( - v->domain, &gple[index*4+i], &sple[i], PAGING_L3); - } - - unmap_domain_page(sple); -} - -/* - * validate_bl2e_change() - * The code is for 32-bit HVM guest on 64-bit host. - * To sync guest L2. - */ - -static inline void -validate_bl2e_change( - struct domain *d, - guest_root_pgentry_t *new_gle_p, - pgentry_64_t *shadow_l3, - int index) -{ - int sl3_idx, sl2_idx; - unsigned long sl2mfn, sl1mfn; - pgentry_64_t *sl2_p; - - /* Using guest l2 pte index to get shadow l3&l2 index - * index: 0 ~ 1023, PAGETABLE_ENTRIES: 512 - */ - sl3_idx = index / (PAGETABLE_ENTRIES / 2); - sl2_idx = (index % (PAGETABLE_ENTRIES / 2)) * 2; - - sl2mfn = entry_get_pfn(shadow_l3[sl3_idx]); - sl2_p = (pgentry_64_t *)map_domain_page(sl2mfn); - - validate_pde_change( - d, *(guest_l2_pgentry_t *)new_gle_p, (l2_pgentry_t *)&sl2_p[sl2_idx]); - - /* Mapping the second l1 shadow page */ - if (entry_get_flags(sl2_p[sl2_idx]) & _PAGE_PRESENT) { - sl1mfn = entry_get_pfn(sl2_p[sl2_idx]); - sl2_p[sl2_idx + 1] = - entry_from_pfn(sl1mfn + 1, entry_get_flags(sl2_p[sl2_idx])); - } - else - sl2_p[sl2_idx + 1] = (pgentry_64_t){0}; - unmap_domain_page(sl2_p); - -} - -/* - * This shadow_mark_va_out_of_sync() is for 2M page shadow - */ -static void shadow_mark_va_out_of_sync_2mp( - struct vcpu *v, unsigned long gpfn, unsigned long mfn, paddr_t writable_pl1e) -{ - struct out_of_sync_entry *entry = - shadow_mark_mfn_out_of_sync(v, gpfn, mfn); - - entry->writable_pl1e = writable_pl1e; - ESH_LOG(" gpfn = %lx\n", gpfn); - if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) ) - BUG(); -} - -static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag) -{ - unsigned long gmfn; - if ( !(*spmfn = __shadow_status(d, gpfn, flag)) ) - { - /* This is NOT already shadowed so we need to shadow it. */ - SH_VVLOG(": not shadowed"); - - gmfn = gmfn_to_mfn(d, gpfn); - if ( unlikely(!VALID_MFN(gmfn)) ) - { - // Attempt to use an invalid pfn as an shadow page. - // XXX this needs to be more graceful! - BUG(); - } - - if ( unlikely(!(*spmfn = - alloc_shadow_page(d, gpfn, gmfn, flag))) ) - { - printk("Couldn't alloc an shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn); - BUG(); /* XXX Need to deal gracefully with failure. */ - } - switch(flag) { - case PGT_l1_shadow: - perfc_incrc(shadow_l1_table_count); - break; - case PGT_l2_shadow: - perfc_incrc(shadow_l2_table_count); - break; - case PGT_l3_shadow: - perfc_incrc(shadow_l3_table_count); - break; - case PGT_hl2_shadow: - perfc_incrc(shadow_hl2_table_count); - break; - } - - return 1; - } else { - /* This L1 is shadowed already, but the L2 entry is missing. 
*/ - SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn); - return 0; - } -} - -static void shadow_map_into_current(struct vcpu *v, - unsigned long va, unsigned int from, unsigned int to) -{ - pgentry_64_t gle = {0}, sle; - unsigned long gpfn, smfn; - - if (from == PAGING_L1 && to == PAGING_L2) { - shadow_map_l1_into_current_l2(va); - return; - } - - __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to); - ASSERT(entry_get_flags(gle) & _PAGE_PRESENT); - gpfn = entry_get_pfn(gle); - - get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from)); - - if ( !get_shadow_ref(smfn) ) - BUG(); - entry_general(v->domain, &gle, &sle, smfn, to); - __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to); - __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to); -} - -/* - * shadow_set_lxe should be put in shadow.h - */ -static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e, - int create_l2_shadow, int put_ref_check) -{ - struct vcpu *v = current; - l4_pgentry_t sl4e; - l3_pgentry_t sl3e; - - __shadow_get_l4e(v, va, &sl4e); - if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) { - if (create_l2_shadow) { - perfc_incrc(shadow_set_l3e_force_map); - shadow_map_into_current(v, va, PAGING_L3, PAGING_L4); - __shadow_get_l4e(v, va, &sl4e); - } else { - printk("For non HVM shadow, create_l1_shadow:%d\n", create_l2_shadow); - } - } - - __shadow_get_l3e(v, va, &sl3e); - if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) { - if (create_l2_shadow) { - perfc_incrc(shadow_set_l2e_force_map); - shadow_map_into_current(v, va, PAGING_L2, PAGING_L3); - __shadow_get_l3e(v, va, &sl3e); - } else { - printk("For non HVM shadow, create_l1_shadow:%d\n", create_l2_shadow); - } - - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L4 ) - shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va)); - } - - if ( put_ref_check ) { - l2_pgentry_t tmp_sl2e; - if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) { - if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT ) - if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) { - put_shadow_ref(l2e_get_pfn(sl2e)); - } - } - - } - - if (! 
__shadow_set_l2e(v, va, &sl2e)) - BUG(); - shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va)); -} - - -/* As 32-bit guest don't support 4M page yet, - * we don't concern double compile for this function - */ -static inline int l2e_rw_fault( - struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw) -{ - struct domain *d = v->domain; - l2_pgentry_t gl2e = *gl2e_p; - l2_pgentry_t tmp_l2e = gl2e; - unsigned long start_gpfn = l2e_get_pfn(gl2e); - unsigned long gpfn, mfn; - unsigned long l1_mfn, gmfn; - l1_pgentry_t *l1_p; - l1_pgentry_t sl1e; - l1_pgentry_t old_sl1e; - l2_pgentry_t sl2e; -#ifdef __x86_64__ - u64 nx = 0; -#endif - int put_ref_check = 0; - /* Check if gpfn is 2M aligned */ - - /* Update guest l2e */ - if (rw) { - ASSERT(l2e_get_flags(gl2e) & _PAGE_RW); - l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED); - } else { - l2e_add_flags(gl2e, _PAGE_ACCESSED); - } - - l2e_remove_flags(tmp_l2e, _PAGE_PSE); - if (l2e_get_flags(gl2e) & _PAGE_NX) { - l2e_remove_flags(tmp_l2e, _PAGE_NX); -#ifdef __x86_64__ - nx = PGT_high_mfn_nx; -#endif - } - - - /* Get the shadow l2 first */ - if ( !__shadow_get_l2e(v, va, &sl2e) ) - sl2e = l2e_empty(); - -#ifdef __x86_64__ - l1_mfn = __shadow_status(d, start_gpfn | nx, PGT_fl1_shadow); -#else - l1_mfn = __shadow_status(d, start_gpfn, PGT_fl1_shadow); -#endif - - /* Check the corresponding l2e */ - if (l1_mfn) { - /* Why it is PRESENT?*/ - if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) && - l2e_get_pfn(sl2e) == l1_mfn) { - ESH_LOG("sl2e PRSENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn); - } else { - put_ref_check = 1; - if (!get_shadow_ref(l1_mfn)) - BUG(); - } - l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn); - sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e)); - } else { - /* Allocate a new page as shadow page table if need */ - gmfn = gmfn_to_mfn(d, start_gpfn); -#ifdef __x86_64__ - l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow); -#else - l1_mfn = alloc_shadow_page(d, start_gpfn, gmfn, PGT_fl1_shadow); -#endif - if (unlikely(!l1_mfn)) { - BUG(); - } - - if (!get_shadow_ref(l1_mfn)) - BUG(); - l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn ); - sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e)); - memset(l1_p, 0, PAGE_SIZE); - ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn); - } - - ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e)); - /* Map the page to l2*/ - shadow_set_l2e_64(va, sl2e, 1, put_ref_check); - - if (l2e_get_flags(gl2e) & _PAGE_NX) - l2e_add_flags(tmp_l2e, _PAGE_NX); - - /* Propagate the shadow page table, i.e. 
setting sl1e */ - for (gpfn = start_gpfn; - gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) { - - mfn = gmfn_to_mfn(d, gpfn); - - if ( unlikely(!VALID_MFN(mfn)) ) - { - continue; - } - - sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e)); - - if (!rw) { - if ( shadow_mode_log_dirty(d) || - !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) ) - { - l1e_remove_flags(sl1e, _PAGE_RW); - } - } else { - /* __mark_dirty(d, gmfn); */ - } - // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e)); - /* The shadow entrys need setup before shadow_mark_va_out_of_sync()*/ - old_sl1e = l1_p[gpfn - start_gpfn]; - - if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) ) - { - if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(sl1e, d) ) { - ESH_LOG("%lx, mfn: %lx why make me empty, start_pfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e),mfn, start_gpfn, gpfn); - sl1e = l1e_empty(); - } - if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) - put_page_from_l1e(old_sl1e, d); - } - - if (rw) { - /* shadow_mark_va_out_of_sync() need modificatin for 2M pages*/ - if ( mfn_is_page_table(mfn) ) - shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn, - l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn))); - } - - l1_p[gpfn - start_gpfn] = sl1e; - } - - unmap_domain_page(l1_p); - *gl2e_p = gl2e; - return 1; -} - -/* - * Check P, R/W, U/S bits in the guest page table. - * If the fault belongs to guest return 1, - * else return 0. - */ -#if defined( GUEST_PGENTRY_32 ) -static inline int guest_page_fault( - struct vcpu *v, - unsigned long va, unsigned int error_code, - guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e) -{ - /* The following check for 32-bit guest on 64-bit host */ - - __guest_get_l2e(v, va, gpl2e); - - /* Check the guest L2 page-table entry first*/ - if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_PRESENT)) ) - return 1; - - if ( error_code & ERROR_W ) - { - if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_RW)) ) - return 1; - } - - if ( error_code & ERROR_U ) - { - if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_USER)) ) - return 1; - } - - if ( guest_l2e_get_flags(*gpl2e) & _PAGE_PSE ) - { - printk("None-PAE HVM guests can NOT use PSE, " - "because we don't support 4MBytes PSE pages.\n"); - printk("remove pae=1 from your config file.\n"); - domain_crash_synchronous(); - return 0; - } - - __guest_get_l1e(v, va, gpl1e); - - /* Then check the guest L1 page-table entry */ - if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_PRESENT)) ) - return 1; - - if ( error_code & ERROR_W ) - { - if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_RW)) ) - return 1; - } - - if ( error_code & ERROR_U ) - { - if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_USER)) ) - return 1; - } - - return 0; -} -#else -static inline int guest_page_fault( - struct vcpu *v, - unsigned long va, unsigned int error_code, - guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e) -{ - struct domain *d = v->domain; - pgentry_64_t gle = { 0 }; - unsigned long gpfn = 0, mfn; - int i; - unsigned int base_idx = 0; - base_idx = get_cr3_idxval(v); - - ASSERT( d->arch.ops->guest_paging_levels >= PAGING_L3 ); - -#if CONFIG_PAGING_LEVELS >= 3 - if ( (error_code & (ERROR_I | ERROR_P)) == (ERROR_I | ERROR_P) ) - return 1; -#endif - -#if CONFIG_PAGING_LEVELS == 4 - if ( d->arch.ops->guest_paging_levels == PAGING_L4 ) - { - __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | PAGING_L4); - if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) ) - return 1; - 
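/*
 * [Editorial sketch -- not part of the patch.]  guest_page_fault() above is
 * deciding whether a fault is the guest's own problem: any level of the
 * guest's walk that is not present, or that forbids the attempted write or
 * user-mode access, means the fault is reflected back to the guest rather
 * than fixed up in the shadow.  A standalone model of that per-entry test,
 * with invented toy_* names and flag values:
 */
#include <stdint.h>

#define TOY_PRESENT   0x1u
#define TOY_RW        0x2u
#define TOY_USER      0x4u

#define TOY_ERR_WRITE 0x2u   /* fault was caused by a write     */
#define TOY_ERR_USER  0x4u   /* fault was raised from user mode */

/* Return 1 if this guest entry alone makes the fault belong to the guest. */
static int toy_entry_rejects(uint32_t entry_flags, uint32_t error_code)
{
    if (!(entry_flags & TOY_PRESENT))
        return 1;
    if ((error_code & TOY_ERR_WRITE) && !(entry_flags & TOY_RW))
        return 1;
    if ((error_code & TOY_ERR_USER) && !(entry_flags & TOY_USER))
        return 1;
    return 0;
}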
- if ( error_code & ERROR_W ) - { - if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) ) - return 1; - } - - if ( error_code & ERROR_U ) - { - if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) ) - return 1; - } - gpfn = entry_get_pfn(gle); - } -#endif - -#if CONFIG_PAGING_LEVELS >= 3 - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - { - if ( SH_GUEST_32PAE ) - gpfn = (hvm_get_guest_ctrl_reg(v, 3)) >> PAGE_SHIFT; - else - gpfn = pagetable_get_pfn(v->arch.guest_table); - } -#endif - - for ( i = PAGING_L3; i >= PAGING_L1; i-- ) - { - pgentry_64_t *lva; - /* - * If it's not external mode, then mfn should be machine physical. - */ - mfn = gmfn_to_mfn(d, gpfn); - - lva = (pgentry_64_t *) map_domain_page(mfn); - gle = lva[guest_table_offset_64(va, i, base_idx)]; - - unmap_domain_page(lva); - - gpfn = entry_get_pfn(gle); - - if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) ) - return 1; - - if ( i < PAGING_L3 || - d->arch.ops->guest_paging_levels == PAGING_L4 ) - { - if ( error_code & ERROR_W ) - { - if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) ) - { - if ( i == PAGING_L1 ) - if ( gpl1e ) - gpl1e->l1 = gle.lo; - return 1; - } - } - if ( error_code & ERROR_U ) - { - if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) ) - return 1; - } - } - - if ( i == PAGING_L2 ) - { - if ( gpl2e ) - gpl2e->l2 = gle.lo; - if ( likely(entry_get_flags(gle) & _PAGE_PSE) ) - return 0; - } - - if ( i == PAGING_L1 ) - if ( gpl1e ) - gpl1e->l1 = gle.lo; - } - - return 0; - -} -#endif - -static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - guest_l2_pgentry_t gl2e; - guest_l1_pgentry_t gl1e, orig_gl1e; - l1_pgentry_t sl1e; - - gl1e = guest_l1e_empty(); gl2e = guest_l2e_empty(); - - sl1e = l1e_empty(); - - perfc_incrc(shadow_fault_calls); - - ESH_LOG(" va=%lx, rip = %lx, error code = %x\n", - va, regs->eip, regs->error_code); - - /* - * Don't let someone else take the guest's table pages out-of-sync. - */ - shadow_lock(d); - - /* - * STEP 1. Check to see if this fault might have been caused by an - * out-of-sync table page entry, or if we should pass this - * fault onto the guest. - */ - __shadow_sync_va(v, va); - - /* - * STEP 2. Check if the fault belongs to guest - */ - if ( guest_page_fault(v, va, regs->error_code, &gl2e, &gl1e) ) - { - if ( unlikely(shadow_mode_log_dirty(d)) && l1e_get_intpte(gl1e) != 0 ) - goto check_writeable; - - goto fail; - } - - if ( unlikely((guest_l2e_get_flags(gl2e) & _PAGE_PSE)) ) - goto pse; - - /* - * Handle 4K pages here - */ -check_writeable: - orig_gl1e = gl1e; - - /* Write fault? */ - if ( regs->error_code & 2 ) - { - int allow_writes = 0; - - if ( unlikely(!(guest_l1e_get_flags(gl1e) & _PAGE_RW)) ) - { - if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gl1e)) ) - { - allow_writes = 1; - l1e_add_flags(gl1e, _PAGE_RW); - } - else - { - /* Write fault on a read-only mapping. */ - SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")", - l1e_get_intpte(gl1e)); - perfc_incrc(shadow_fault_bail_ro_mapping); - goto fail; - } - } - - if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) ) - { - SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed"); - perfc_incrc(write_fault_bail); - shadow_unlock(d); - return 0; - } - - if (allow_writes) - l1e_remove_flags(gl1e, _PAGE_RW); - } - else - { - if ( !l1pte_read_fault(d, &gl1e, &sl1e) ) - { - SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed"); - perfc_incrc(read_fault_bail); - shadow_unlock(d); - return 0; - } - } - - /* - * STEP 3. 
Write the modified shadow PTE and guest PTE back to the tables - */ - if ( l1e_has_changed(orig_gl1e, gl1e, PAGE_FLAG_MASK) ) - { - if (unlikely(!__guest_set_l1e(v, va, &gl1e))) - domain_crash_synchronous(); - - __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gl2e))); - } - - shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1); - - perfc_incrc(shadow_fault_fixed); - d->arch.shadow_fault_count++; - - shadow_unlock(d); - - return EXCRET_fault_fixed; - -pse: - /* - * Handle 2M pages here - */ - if ( unlikely(!shadow_mode_external(d)) ) - BUG(); - - /* Write fault? */ - if ( regs->error_code & 2 ) - { - if ( !l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, WRITE_FAULT) ) - { - goto fail; - } - } - else - { - l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, READ_FAULT); - } - - /* - * STEP 3. Write guest/shadow l2e back - */ - - if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) ) - { - domain_crash_synchronous(); - } - - /* - * Todo: if necessary, record the page table page as dirty - */ - - perfc_incrc(shadow_fault_fixed); - d->arch.shadow_fault_count++; - - shadow_unlock(d); - - return EXCRET_fault_fixed; -fail: - shadow_unlock(d); - ESH_LOG("Guest fault~~~\n"); - return 0; -} - -static void shadow_invlpg_64(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - l1_pgentry_t sl1e, old_sl1e; - - shadow_lock(d); - - __shadow_sync_va(v, va); - - if ( shadow_mode_external(d) && __shadow_get_l1e(v, va, &old_sl1e) ) - if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) - put_page_from_l1e(old_sl1e, d); - - sl1e = l1e_empty(); - __shadow_set_l1e(v, va, &sl1e); - - shadow_unlock(d); -} - -static unsigned long gva_to_gpa_64(unsigned long gva) -{ - struct vcpu *v = current; - guest_l1_pgentry_t gl1e = {0}; - guest_l2_pgentry_t gl2e = {0}; - unsigned long gpa; - - if (guest_page_fault(v, gva, 0, &gl2e, &gl1e)) - return 0; - - if (guest_l2e_get_flags(gl2e) & _PAGE_PSE) - gpa = guest_l2e_get_paddr(gl2e) + (gva & ((1 << GUEST_L2_PAGETABLE_SHIFT) - 1)); - else - gpa = guest_l1e_get_paddr(gl1e) + (gva & ~PAGE_MASK); - - return gpa; -} - -/* - * The naming convention of the shadow_ops: - * MODE___HANDLER - */ -#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE)) -struct shadow_ops MODE_64_3_HANDLER = { - .guest_paging_levels = 3, - .invlpg = shadow_invlpg_64, - .fault = shadow_fault_64, - .update_pagetables = shadow_update_pagetables, - .sync_all = sync_all, - .remove_all_write_access = remove_all_write_access, - .do_update_va_mapping = do_update_va_mapping, - .mark_mfn_out_of_sync = mark_mfn_out_of_sync, - .is_out_of_sync = is_out_of_sync, - .gva_to_gpa = gva_to_gpa_pae, -}; - -struct shadow_ops MODE_64_4_HANDLER = { - .guest_paging_levels = 4, - .invlpg = shadow_invlpg_64, - .fault = shadow_fault_64, - .update_pagetables = shadow_update_pagetables, - .sync_all = sync_all, - .remove_all_write_access = remove_all_write_access, - .do_update_va_mapping = do_update_va_mapping, - .mark_mfn_out_of_sync = mark_mfn_out_of_sync, - .is_out_of_sync = is_out_of_sync, - .gva_to_gpa = gva_to_gpa_64, -}; -#endif /* GUEST_PGENTRY_32 */ -#endif /* CONFIG_PAGING_LEVELS >= 3 */ - - -#if CONFIG_PAGING_LEVELS == 2 -struct shadow_ops MODE_32_2_HANDLER = { - .guest_paging_levels = 2, - .invlpg = shadow_invlpg_32, - .fault = shadow_fault_32, - .update_pagetables = shadow_update_pagetables, - .sync_all = sync_all, - .remove_all_write_access = remove_all_write_access, - .do_update_va_mapping = do_update_va_mapping, - .mark_mfn_out_of_sync = mark_mfn_out_of_sync, - .is_out_of_sync = is_out_of_sync, - .gva_to_gpa = gva_to_gpa_64, -}; -#endif - 
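/*
 * [Editorial sketch -- not part of the patch.]  The MODE_*_HANDLER
 * structures above are per-paging-mode operation tables: the rest of the
 * shadow code calls through d->arch.ops instead of testing the guest's
 * paging level at every call site.  A stripped-down standalone model of
 * that dispatch pattern, with invented toy_* names:
 */
struct toy_shadow_ops {
    int  guest_paging_levels;
    void (*invlpg)(unsigned long va);
    int  (*fault)(unsigned long va);
};

static void toy_invlpg_pae(unsigned long va) { (void)va; /* flush one VA */ }
static int  toy_fault_pae(unsigned long va)  { (void)va; return 0; }

static const struct toy_shadow_ops toy_mode_pae_ops = {
    .guest_paging_levels = 3,
    .invlpg              = toy_invlpg_pae,
    .fault               = toy_fault_pae,
};

/* Common code only ever sees the table, never the mode-specific names. */
static int toy_handle_fault(const struct toy_shadow_ops *ops, unsigned long va)
{
    return ops->fault(va);
}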
-#if ( CONFIG_PAGING_LEVELS == 3 && !defined (GUEST_PGENTRY_32) && !defined (GUEST_32PAE) ) || \ - ( CONFIG_PAGING_LEVELS == 4 && defined (GUEST_PGENTRY_32) ) - - -/* - * Use GUEST_PGENTRY_32 to force PAE_SHADOW_SELF_ENTRY for L4. - * - * Very simple shadow code to handle 1:1 direct mapping for guest - * non-paging code, which actually is running in PAE/vm86 mode with - * paging-enabled. - * - * We expect that the top level (L3) page has been allocated and initialized. - */ -int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - l3_pgentry_t sl3e, *sl3e_p; - l2_pgentry_t sl2e, *sl2e_p; - l1_pgentry_t sl1e; - unsigned long mfn, smfn; - struct page_info *page; - - /* - * If the faulting address is within the MMIO range, we continue - * on handling the #PF as such. - */ - if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN ) - return 0; - - shadow_lock(d); - - __direct_get_l3e(v, vpa, &sl3e); - - if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - goto nomem; - - smfn = page_to_mfn(page); - sl3e = l3e_from_pfn(smfn, _PAGE_PRESENT); - - sl3e_p = (l3_pgentry_t *)map_domain_page(smfn); - memset(sl3e_p, 0, PAGE_SIZE); - unmap_domain_page(sl3e_p); - - __direct_set_l3e(v, vpa, &sl3e); - } - - __direct_get_l2e(v, vpa, &sl2e); - - if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - goto nomem; - - smfn = page_to_mfn(page); - sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER); - sl2e_p = (l2_pgentry_t *)map_domain_page(smfn); - memset(sl2e_p, 0, PAGE_SIZE); - unmap_domain_page(sl2e_p); - - __direct_set_l2e(v, vpa, &sl2e); - } - - __direct_get_l1e(v, vpa, &sl1e); - - if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) ) - { - sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER); - __direct_set_l1e(v, vpa, &sl1e); - } - - shadow_unlock(d); - return EXCRET_fault_fixed; - -nomem: - shadow_direct_map_clean(d); - domain_crash_synchronous(); -} -#endif - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/arch/x86/shadow2-common.c b/xen/arch/x86/shadow2-common.c new file mode 100644 index 0000000000..eab6361c3d --- /dev/null +++ b/xen/arch/x86/shadow2-common.c @@ -0,0 +1,3394 @@ +/****************************************************************************** + * arch/x86/shadow2-common.c + * + * Shadow2 code that does not need to be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define SHADOW2 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if SHADOW2_AUDIT +int shadow2_audit_enable = 0; +#endif + +static void sh2_free_log_dirty_bitmap(struct domain *d); + +int _shadow2_mode_refcounts(struct domain *d) +{ + return shadow2_mode_refcounts(d); +} + + +/**************************************************************************/ +/* x86 emulator support for the shadow2 code + */ + +static int +sh2_x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; + if ( hvm_guest(v) ) + { + *val = 0; + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that is only a user vs supervisor access check. + // + if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) ) + { +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, + addr, *val, bytes); +#endif + return X86EMUL_CONTINUE; + } + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating. */ + SHADOW2_PRINTK("read failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that includes user vs supervisor, and + // write access. + // + if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) ) + return X86EMUL_CONTINUE; + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating, + * which should be handled by sh2_x86_emulate_write_emulated. 
*/ + SHADOW2_PRINTK("write failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_write_emulated(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt); + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_cmpxchg_emulated(unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, old, new, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new, + bytes, ctxt); + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n", + v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo, + new_hi, new_lo, ctxt); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi, + new_lo, new_hi, ctxt); + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + + +struct x86_emulate_ops shadow2_emulator_ops = { + .read_std = sh2_x86_emulate_read_std, + .write_std = sh2_x86_emulate_write_std, + .read_emulated = sh2_x86_emulate_read_std, + .write_emulated = sh2_x86_emulate_write_emulated, + .cmpxchg_emulated = sh2_x86_emulate_cmpxchg_emulated, + .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated, +}; + + +/**************************************************************************/ +/* Code for "promoting" a guest page to the point where the shadow code is + * willing to let it be treated as a guest page table. This generally + * involves making sure there are no writable mappings available to the guest + * for this page. + */ +void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + unsigned long type_info; + + ASSERT(valid_mfn(gmfn)); + + /* We should never try to promote a gmfn that has writeable mappings */ + ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0); + + // Is the page already shadowed? + if ( !test_and_set_bit(_PGC_page_table, &page->count_info) ) + { + // No prior shadow exists... + + // Grab a type-ref. We don't really care if we are racing with another + // vcpu or not, or even what kind of type we get; we just want the type + // count to be > 0. + // + do { + type_info = + page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask); + } while ( !get_page_type(page, type_info) ); + + // Now that the type ref is non-zero, we can safely use the + // shadow2_flags. 
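/*
 * [Editorial sketch -- not part of the patch.]  shadow2_promote() and
 * shadow2_demote() here keep exactly one extra type reference on a guest
 * page for as long as *any* kind of shadow of it exists, with one
 * shadow2_flags bit per shadow type recording which kinds those are.  A
 * standalone model of that bookkeeping, with invented toy_* names:
 */
#include <assert.h>

struct toy_guest_page {
    unsigned int shadow_kinds;  /* one bit per kind of shadow present   */
    int          type_refs;     /* extra ref held while any bit is set  */
};

static void toy_promote(struct toy_guest_page *p, unsigned int kind_bit)
{
    if (p->shadow_kinds == 0)
        p->type_refs++;             /* first shadow of any kind: take ref */
    p->shadow_kinds |= kind_bit;
}

static void toy_demote(struct toy_guest_page *p, unsigned int kind_bit)
{
    assert(p->shadow_kinds & kind_bit);
    p->shadow_kinds &= ~kind_bit;
    if (p->shadow_kinds == 0)
        p->type_refs--;             /* last shadow gone: drop the ref */
}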
+ // + page->shadow2_flags = 0; + } + + ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags)); + set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags); +} + +void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + + ASSERT(test_bit(_PGC_page_table, &page->count_info)); + ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags)); + + clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags); + + if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 ) + { + // release the extra type ref + put_page_type(page); + + // clear the is-a-page-table bit. + clear_bit(_PGC_page_table, &page->count_info); + } +} + +/**************************************************************************/ +/* Validate a pagetable change from the guest and update the shadows. + * Returns a bitmask of SHADOW2_SET_* flags. */ + +static int +__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +{ + int result = 0; + struct page_info *page = mfn_to_page(gmfn); + + sh2_mark_dirty(v->domain, gmfn); + + // Determine which types of shadows are affected, and update each. + // + // Always validate L1s before L2s to prevent another cpu with a linear + // mapping of this gmfn from seeing a walk that results from + // using the new L2 value and the old L1 value. (It is OK for such a + // guest to see a walk that uses the old L2 value with the new L1 value, + // as hardware could behave this way if one level of the pagewalk occurs + // before the store, and the next level of the pagewalk occurs after the + // store. + // + // Ditto for L2s before L3s, etc. + // + + if ( !(page->count_info & PGC_page_table) ) + return 0; /* Not shadowed at all */ + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow2_flags & SH2F_L1_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow2_flags & SH2F_L1_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow2_flags & SH2F_L2_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow2_flags & SH2F_L2_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + if ( page->shadow2_flags & SH2F_L1_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L2_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L2H_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L3_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3) + (v, gmfn, entry, size); +#else /* 32-bit non-PAE hypervisor does not support PAE guests */ + ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0); +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + if ( page->shadow2_flags & SH2F_L1_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L2_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L3_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4) + (v, gmfn, entry, size); + if 
( page->shadow2_flags & SH2F_L4_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4) + (v, gmfn, entry, size); +#else /* 32-bit/PAE hypervisor does not support 64-bit guests */ + ASSERT((page->shadow2_flags + & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0); +#endif + + return result; +} + + +int +shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry) +/* This is the entry point from hypercalls. It returns a bitmask of all the + * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */ +{ + int rc; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t)); + shadow2_audit_tables(v); + return rc; +} + +void +shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +/* This is the entry point for emulated writes to pagetables in HVM guests */ +{ + struct domain *d = v->domain; + int rc; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + rc = __shadow2_validate_guest_entry(v, gmfn, entry, size); + if ( rc & SHADOW2_SET_FLUSH ) + { + // Flush everyone except the local processor, which will flush when it + // re-enters the HVM guest. + // + cpumask_t mask = d->domain_dirty_cpumask; + cpu_clear(v->processor, mask); + flush_tlb_mask(mask); + } + if ( rc & SHADOW2_SET_ERROR ) + { + /* This page is probably not a pagetable any more: tear it out of the + * shadows, along with any tables that reference it */ + shadow2_remove_all_shadows_and_parents(v, gmfn); + } + /* We ignore the other bits: since we are about to change CR3 on + * VMENTER we don't need to do any extra TLB flushes. */ +} + + +/**************************************************************************/ +/* Memory management for shadow pages. */ + +/* Meaning of the count_info field in shadow pages + * ---------------------------------------------- + * + * A count of all references to this page from other shadow pages and + * guest CR3s (a.k.a. v->arch.shadow_table). + * + * The top bits hold the shadow type and the pinned bit. Top-level + * shadows are pinned so that they don't disappear when not in a CR3 + * somewhere. + * + * We don't need to use get|put_page for this as the updates are all + * protected by the shadow lock. We can't use get|put_page for this + * as the size of the count on shadow pages is different from that on + * normal guest pages. + */ + +/* Meaning of the type_info field in shadow pages + * ---------------------------------------------- + * + * type_info use depends on the shadow type (from count_info) + * + * PGC_SH2_none : This page is in the shadow2 free pool. type_info holds + * the chunk order for our freelist allocator. + * + * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info + * holds the mfn of the guest page being shadowed, + * + * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage. + * type_info holds the gfn being shattered. + * + * PGC_SH2_monitor_table : This page is part of a monitor table. + * type_info is not used. + */ + +/* Meaning of the _domain field in shadow pages + * -------------------------------------------- + * + * In shadow pages, this field will always have its least significant bit + * set. This ensures that all attempts to get_page() will fail (as all + * valid pickled domain pointers have a zero for their least significant bit). + * Instead, the remaining upper bits are used to record the shadow generation + * counter when the shadow was created. 
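 * [Editorial illustration -- not part of the patch; the exact bit layout
 * shown is an assumption.]  For instance, a shadow created in generation 7
 * could carry something like
 *     _domain = (7 << 1) | 1;
 * With bit 0 set, the owner comparison inside any get_page() attempt can
 * never match, because genuine pickled domain pointers always have a zero
 * least significant bit.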
+ */ + +/* Meaning of the shadow2_flags field + * ---------------------------------- + * + * In guest pages that are shadowed, one bit for each kind of shadow they have. + * + * In shadow pages, will be used for holding a representation of the populated + * entries in this shadow (either a min/max, or a bitmap, or ...) + * + * In monitor-table pages, holds the level of the particular page (to save + * spilling the shadow types into an extra bit by having three types of monitor + * page). + */ + +/* Meaning of the list_head struct in shadow pages + * ----------------------------------------------- + * + * In free shadow pages, this is used to hold the free-lists of chunks. + * + * In top-level shadow tables, this holds a linked-list of all top-level + * shadows (used for recovering memory and destroying shadows). + * + * In lower-level shadows, this holds the physical address of a higher-level + * shadow entry that holds a reference to this shadow (or zero). + */ + +/* Allocating shadow pages + * ----------------------- + * + * Most shadow pages are allocated singly, but there are two cases where we + * need to allocate multiple pages together. + * + * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows. + * A 32-bit guest l1 table covers 4MB of virtuial address space, + * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB + * of virtual address space each). Similarly, a 32-bit guest l2 table + * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va + * each). These multi-page shadows are contiguous and aligned; + * functions for handling offsets into them are defined in shadow2.c + * (shadow_l1_index() etc.) + * + * 2: Shadowing PAE top-level pages. Each guest page that contains + * any PAE top-level pages requires two shadow pages to shadow it. + * They contain alternating l3 tables and pae_l3_bookkeeping structs. + * + * This table shows the allocation behaviour of the different modes: + * + * Xen paging 32b pae pae 64b 64b 64b + * Guest paging 32b 32b pae 32b pae 64b + * PV or HVM * HVM * HVM HVM * + * Shadow paging 32b pae pae pae pae 64b + * + * sl1 size 4k 8k 4k 8k 4k 4k + * sl2 size 4k 16k 4k 16k 4k 4k + * sl3 size - - 8k - 8k 4k + * sl4 size - - - - - 4k + * + * We allocate memory from xen in four-page units and break them down + * with a simple buddy allocator. Can't use the xen allocator to handle + * this as it only works for contiguous zones, and a domain's shadow + * pool is made of fragments. + * + * In HVM guests, the p2m table is built out of shadow pages, and we provide + * a function for the p2m management to steal pages, in max-order chunks, from + * the free pool. We don't provide for giving them back, yet. + */ + +/* Figure out the least acceptable quantity of shadow memory. + * The minimum memory requirement for always being able to free up a + * chunk of memory is very small -- only three max-order chunks per + * vcpu to hold the top level shadows and pages with Xen mappings in them. + * + * But for a guest to be guaranteed to successfully execute a single + * instruction, we must be able to map a large number (about thirty) VAs + * at the same time, which means that to guarantee progress, we must + * allow for more than ninety allocated pages per vcpu. We round that + * up to 128 pages, or half a megabyte per vcpu. 
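+ *
+ * (Worked example: a 4-vcpu domain therefore has a floor of 4 * 128 = 512
+ * shadow pages, i.e. 2MB, returned by shadow2_min_acceptable_pages() below.)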
*/ +unsigned int shadow2_min_acceptable_pages(struct domain *d) +{ + u32 vcpu_count = 0; + struct vcpu *v; + + for_each_vcpu(d, v) + vcpu_count++; + + return (vcpu_count * 128); +} + +/* Using the type_info field to store freelist order */ +#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info) +#define SH2_SET_PFN_ORDER(_p, _o) \ + do { (_p)->u.inuse.type_info = (_o); } while (0) + + +/* Figure out the order of allocation needed for a given shadow type */ +static inline u32 +shadow_order(u32 shadow_type) +{ +#if CONFIG_PAGING_LEVELS > 2 + static const u32 type_to_order[16] = { + 0, /* PGC_SH2_none */ + 1, /* PGC_SH2_l1_32_shadow */ + 1, /* PGC_SH2_fl1_32_shadow */ + 2, /* PGC_SH2_l2_32_shadow */ + 0, /* PGC_SH2_l1_pae_shadow */ + 0, /* PGC_SH2_fl1_pae_shadow */ + 0, /* PGC_SH2_l2_pae_shadow */ + 0, /* PGC_SH2_l2h_pae_shadow */ + 1, /* PGC_SH2_l3_pae_shadow */ + 0, /* PGC_SH2_l1_64_shadow */ + 0, /* PGC_SH2_fl1_64_shadow */ + 0, /* PGC_SH2_l2_64_shadow */ + 0, /* PGC_SH2_l3_64_shadow */ + 0, /* PGC_SH2_l4_64_shadow */ + 2, /* PGC_SH2_p2m_table */ + 0 /* PGC_SH2_monitor_table */ + }; + u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift; + return type_to_order[type]; +#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */ + return 0; +#endif +} + + +/* Do we have a free chunk of at least this order? */ +static inline int chunk_is_available(struct domain *d, int order) +{ + int i; + + for ( i = order; i <= SHADOW2_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow2_freelists[i]) ) + return 1; + return 0; +} + +/* Dispatcher function: call the per-mode function that will unhook the + * non-Xen mappings in this top-level shadow mfn */ +void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift ) + { + case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn); +#else + SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn); + break; +#endif +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn); + break; +#endif + default: + SHADOW2_PRINTK("top-level shadow has bad type %08lx\n", + (unsigned long)((pg->count_info & PGC_SH2_type_mask) + >> PGC_SH2_type_shift)); + BUG(); + } +} + + +/* Make sure there is at least one chunk of the required order available + * in the shadow page pool. This must be called before any calls to + * shadow2_alloc(). Since this will free existing shadows to make room, + * it must be called early enough to avoid freeing shadows that the + * caller is currently working on. 
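+ *
+ * (Typical usage, as set_sh2_allocation() later in this file does:
+ *     shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+ *     pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next,
+ *                     struct page_info, list);
+ * i.e. guarantee that a max-order chunk exists before taking it off the
+ * freelist.)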
*/ +void shadow2_prealloc(struct domain *d, unsigned int order) +{ + /* Need a vpcu for calling unpins; for now, since we don't have + * per-vcpu shadows, any will do */ + struct vcpu *v = d->vcpu[0]; + struct list_head *l, *t; + struct page_info *pg; + mfn_t smfn; + + if ( chunk_is_available(d, order) ) return; + + /* Stage one: walk the list of top-level pages, unpinning them */ + perfc_incrc(shadow2_prealloc_1); + list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + +#if CONFIG_PAGING_LEVELS >= 3 + if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow ) + { + /* For PAE, we need to unpin each subshadow on this shadow */ + SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); + } + else +#endif /* 32-bit code always takes this branch */ + { + /* Unpin this top-level shadow */ + sh2_unpin(v, smfn); + } + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Stage two: all shadow pages are in use in hierarchies that are + * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen + * mappings. */ + perfc_incrc(shadow2_prealloc_2); + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + /* Walk the list from the tail: recently used toplevels have been pulled + * to the head */ + list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + shadow2_unhook_mappings(v, smfn); + + /* Need to flush TLB if we've altered our own tables */ + if ( !shadow2_mode_external(d) + && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) ) + local_flush_tlb(); + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Nothing more we can do: all remaining shadows are of pages that + * hold Xen mappings for some vcpu. This can never happen. */ + SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n" + " shadow pages total = %u, free = %u, p2m=%u\n", + 1 << order, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + BUG(); +} + + +/* Allocate another shadow's worth of (contiguous, aligned) pages, + * and fill in the type and backpointer fields of their page_infos. + * Never fails to allocate. */ +mfn_t shadow2_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer) +{ + struct page_info *pg = NULL; + unsigned int order = shadow_order(shadow_type); + cpumask_t mask; + void *p; + int i; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(order <= SHADOW2_MAX_ORDER); + ASSERT(shadow_type != PGC_SH2_none); + perfc_incrc(shadow2_alloc); + + /* Find smallest order which can satisfy the request. */ + for ( i = order; i <= SHADOW2_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow2_freelists[i]) ) + { + pg = list_entry(d->arch.shadow2_freelists[i].next, + struct page_info, list); + list_del(&pg->list); + + /* We may have to halve the chunk a number of times. 
*/ + while ( i != order ) + { + i--; + SH2_SET_PFN_ORDER(pg, i); + list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]); + pg += 1 << i; + } + d->arch.shadow2_free_pages -= 1 << order; + + /* Init page info fields and clear the pages */ + for ( i = 0; i < 1<domain_dirty_cpumask; + tlbflush_filter(mask, pg[i].tlbflush_timestamp); + if ( unlikely(!cpus_empty(mask)) ) + { + perfc_incrc(shadow2_alloc_tlbflush); + flush_tlb_mask(mask); + } + /* Now safe to clear the page for reuse */ + p = sh2_map_domain_page(page_to_mfn(pg+i)); + ASSERT(p != NULL); + clear_page(p); + sh2_unmap_domain_page(p); + perfc_incr(shadow2_alloc_count); + } + return page_to_mfn(pg); + } + + /* If we get here, we failed to allocate. This should never happen. + * It means that we didn't call shadow2_prealloc() correctly before + * we allocated. We can't recover by calling prealloc here, because + * we might free up higher-level pages that the caller is working on. */ + SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order); + BUG(); +} + + +/* Return some shadow pages to the pool. */ +void shadow2_free(struct domain *d, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 shadow_type; + unsigned long order; + unsigned long mask; + int i; + + ASSERT(shadow2_lock_is_acquired(d)); + perfc_incrc(shadow2_free); + + shadow_type = pg->count_info & PGC_SH2_type_mask; + ASSERT(shadow_type != PGC_SH2_none); + ASSERT(shadow_type != PGC_SH2_p2m_table); + order = shadow_order(shadow_type); + + d->arch.shadow2_free_pages += 1 << order; + + for ( i = 0; i < 1<count_info & PGC_SH2_type_mask) != PGT_none) + || (SH2_PFN_ORDER(pg-mask) != order) ) + break; + list_del(&(pg-mask)->list); + pg -= mask; + } else { + /* Merge with successor block? */ + if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGT_none) + || (SH2_PFN_ORDER(pg+mask) != order) ) + break; + list_del(&(pg+mask)->list); + } + order++; + } + + SH2_SET_PFN_ORDER(pg, order); + list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]); +} + +/* Divert some memory from the pool to be used by the p2m mapping. + * This action is irreversible: the p2m mapping only ever grows. + * That's OK because the p2m table only exists for external domains, + * and those domains can't ever turn off shadow mode. + * Also, we only ever allocate a max-order chunk, so as to preserve + * the invariant that shadow2_prealloc() always works. + * Returns 0 iff it can't get a chunk (the caller should then + * free up some pages in domheap and call set_sh2_allocation); + * returns non-zero on success. + */ +static int +shadow2_alloc_p2m_pages(struct domain *d) +{ + struct page_info *pg; + u32 i; + ASSERT(shadow2_lock_is_acquired(d)); + + if ( d->arch.shadow2_total_pages + < (shadow2_min_acceptable_pages(d) + (1<arch.shadow2_p2m_pages += (1<arch.shadow2_total_pages -= (1<arch.shadow2_p2m_freelist); + } + return 1; +} + +// Returns 0 if no memory is available... 
+mfn_t +shadow2_alloc_p2m_page(struct domain *d) +{ + struct list_head *entry; + mfn_t mfn; + void *p; + + if ( list_empty(&d->arch.shadow2_p2m_freelist) && + !shadow2_alloc_p2m_pages(d) ) + return _mfn(0); + entry = d->arch.shadow2_p2m_freelist.next; + list_del(entry); + list_add_tail(entry, &d->arch.shadow2_p2m_inuse); + mfn = page_to_mfn(list_entry(entry, struct page_info, list)); + sh2_get_ref(mfn, 0); + p = sh2_map_domain_page(mfn); + clear_page(p); + sh2_unmap_domain_page(p); + + return mfn; +} + +#if CONFIG_PAGING_LEVELS == 3 +static void p2m_install_entry_in_monitors(struct domain *d, + l3_pgentry_t *l3e) +/* Special case, only used for external-mode domains on PAE hosts: + * update the mapping of the p2m table. Once again, this is trivial in + * other paging modes (one top-level entry points to the top-level p2m, + * no maintenance needed), but PAE makes life difficult by needing a + * copy the eight l3es of the p2m table in eight l2h slots in the + * monitor table. This function makes fresh copies when a p2m l3e + * changes. */ +{ + l2_pgentry_t *ml2e; + struct vcpu *v; + unsigned int index; + + index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t); + ASSERT(index < MACHPHYS_MBYTES>>1); + + for_each_vcpu(d, v) + { + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + continue; + ASSERT(shadow2_mode_external(v->domain)); + + SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n", + d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e)); + + if ( v == current ) /* OK to use linear map of monitor_table */ + ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START); + else + { + l3_pgentry_t *ml3e; + ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT); + ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3]))); + ml2e += l2_table_offset(RO_MPT_VIRT_START); + sh2_unmap_domain_page(ml3e); + } + ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR); + if ( v != current ) + sh2_unmap_domain_page(ml2e); + } +} +#endif + +// Find the next level's P2M entry, checking for out-of-range gfn's... +// Returns NULL on error. +// +static l1_pgentry_t * +p2m_find_entry(void *table, unsigned long *gfn_remainder, + unsigned long gfn, u32 shift, u32 max) +{ + u32 index; + + index = *gfn_remainder >> shift; + if ( index >= max ) + { + SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range " + "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n", + gfn, *gfn_remainder, shift, index, max); + return NULL; + } + *gfn_remainder &= (1 << shift) - 1; + return (l1_pgentry_t *)table + index; +} + +// Walk one level of the P2M table, allocating a new table if required. +// Returns 0 on error. 
+// +static int +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, + unsigned long *gfn_remainder, unsigned long gfn, u32 shift, + u32 max, unsigned long type) +{ + l1_pgentry_t *p2m_entry; + void *next; + + if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, + shift, max)) ) + return 0; + + if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) + { + mfn_t mfn = shadow2_alloc_p2m_page(d); + if ( mfn_x(mfn) == 0 ) + return 0; + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated; + mfn_to_page(mfn)->count_info = 1; +#if CONFIG_PAGING_LEVELS == 3 + if (type == PGT_l2_page_table) + { + /* We have written to the p2m l3: need to sync the per-vcpu + * copies of it in the monitor tables */ + p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry); + } +#endif + /* The P2M can be shadowed: keep the shadows synced */ + if ( d->vcpu[0] ) + (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn, + p2m_entry, sizeof *p2m_entry); + } + *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); + next = sh2_map_domain_page(*table_mfn); + sh2_unmap_domain_page(*table); + *table = next; + + return 1; +} + +// Returns 0 on error (out of memory) +int +shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) +{ + // XXX -- this might be able to be faster iff current->domain == d + mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); + void *table = sh2_map_domain_page(table_mfn); + unsigned long gfn_remainder = gfn; + l1_pgentry_t *p2m_entry; + +#if CONFIG_PAGING_LEVELS >= 4 + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L4_PAGETABLE_SHIFT - PAGE_SHIFT, + L4_PAGETABLE_ENTRIES, PGT_l3_page_table) ) + return 0; +#endif +#if CONFIG_PAGING_LEVELS >= 3 + // When using PAE Xen, we only allow 33 bits of pseudo-physical + // address in translated guests (i.e. 8 GBytes). This restriction + // comes from wanting to map the P2M table into the 16MB RO_MPT hole + // in Xen's address space for translated PV guests. + // + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + (CONFIG_PAGING_LEVELS == 3 + ? 8 + : L3_PAGETABLE_ENTRIES), + PGT_l2_page_table) ) + return 0; +#endif + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) + return 0; + + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + if ( valid_mfn(mfn) ) + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + else + *p2m_entry = l1e_empty(); + + /* The P2M can be shadowed: keep the shadows synced */ + (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn, + p2m_entry, sizeof *p2m_entry); + + sh2_unmap_domain_page(table); + + return 1; +} + +// Allocate a new p2m table for a domain. +// +// The structure of the p2m table is that of a pagetable for xen (i.e. it is +// controlled by CONFIG_PAGING_LEVELS). 
+// +// Returns 0 if p2m table could not be initialized +// +static int +shadow2_alloc_p2m_table(struct domain *d) +{ + mfn_t p2m_top; + struct list_head *entry; + unsigned int page_count = 0; + + SHADOW2_PRINTK("allocating p2m table\n"); + ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0); + + p2m_top = shadow2_alloc_p2m_page(d); + mfn_to_page(p2m_top)->count_info = 1; + mfn_to_page(p2m_top)->u.inuse.type_info = +#if CONFIG_PAGING_LEVELS == 4 + PGT_l4_page_table +#elif CONFIG_PAGING_LEVELS == 3 + PGT_l3_page_table +#elif CONFIG_PAGING_LEVELS == 2 + PGT_l2_page_table +#endif + | 1 | PGT_validated; + + if ( mfn_x(p2m_top) == 0 ) + return 0; + + d->arch.phys_table = pagetable_from_mfn(p2m_top); + + SHADOW2_PRINTK("populating p2m table\n"); + + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + struct page_info *page = list_entry(entry, struct page_info, list); + mfn_t mfn = page_to_mfn(page); + unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn)); + page_count++; + if ( +#ifdef __x86_64__ + (gfn != 0x5555555555555555L) +#else + (gfn != 0x55555555L) +#endif + && gfn != INVALID_M2P_ENTRY + && !shadow2_set_p2m_entry(d, gfn, mfn) ) + { + SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n", + gfn, mfn_x(mfn)); + return 0; + } + } + + SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count); + return 1; +} + +mfn_t +sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +/* Read another domain's p2m entries */ +{ + mfn_t mfn; + unsigned long addr = gpfn << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(shadow2_mode_translate(d)); + mfn = pagetable_get_mfn(d->arch.phys_table); + + +#if CONFIG_PAGING_LEVELS > 2 + if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return _mfn(INVALID_MFN); +#endif + + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = sh2_map_domain_page(mfn); + l4e += l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + sh2_unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = sh2_map_domain_page(mfn); + l3e += l3_table_offset(addr); + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l3e_get_pfn(*l3e)); + sh2_unmap_domain_page(l3e); + } +#endif + + l2e = sh2_map_domain_page(mfn); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); + sh2_unmap_domain_page(l2e); + + l1e = sh2_map_domain_page(mfn); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + sh2_unmap_domain_page(l1e); + + return mfn; +} + +unsigned long +shadow2_gfn_to_mfn_foreign(unsigned long gpfn) +{ + return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn)); +} + + +static void shadow2_p2m_teardown(struct domain *d) +/* Return all the p2m pages to Xen. 
+ * We know we don't have any extra mappings to these pages */ +{ + struct list_head *entry, *n; + struct page_info *pg; + + d->arch.phys_table = pagetable_null(); + + list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse) + { + pg = list_entry(entry, struct page_info, list); + list_del(entry); + /* Should have just the one ref we gave it in alloc_p2m_page() */ + if ( (pg->count_info & PGC_SH2_count_mask) != 1 ) + { + SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n", + pg->count_info, pg->u.inuse.type_info); + } + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation, since + * these pages were allocated without an owner. */ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow2_p2m_pages--; + perfc_decr(shadow2_alloc_count); + } + list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist) + { + list_del(entry); + pg = list_entry(entry, struct page_info, list); + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation. */ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow2_p2m_pages--; + perfc_decr(shadow2_alloc_count); + } + ASSERT(d->arch.shadow2_p2m_pages == 0); +} + +/* Set the pool of shadow pages to the required number of pages. + * Input will be rounded up to at least shadow2_min_acceptable_pages(), + * plus space for the p2m table. + * Returns 0 for success, non-zero for failure. */ +static unsigned int set_sh2_allocation(struct domain *d, + unsigned int pages, + int *preempted) +{ + struct page_info *pg; + unsigned int lower_bound; + int j; + + ASSERT(shadow2_lock_is_acquired(d)); + + /* Don't allocate less than the minimum acceptable, plus one page per + * megabyte of RAM (for the p2m table) */ + lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256); + if ( pages > 0 && pages < lower_bound ) + pages = lower_bound; + /* Round up to largest block size */ + pages = (pages + ((1<arch.shadow2_total_pages, pages); + + while ( d->arch.shadow2_total_pages != pages ) + { + if ( d->arch.shadow2_total_pages < pages ) + { + /* Need to allocate more memory from domheap */ + pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0); + if ( pg == NULL ) + { + SHADOW2_PRINTK("failed to allocate shadow pages.\n"); + return -ENOMEM; + } + d->arch.shadow2_free_pages += 1<arch.shadow2_total_pages += 1<list, + &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]); + } + else if ( d->arch.shadow2_total_pages > pages ) + { + /* Need to return memory to domheap */ + shadow2_prealloc(d, SHADOW2_MAX_ORDER); + ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER])); + pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next, + struct page_info, list); + list_del(&pg->list); + d->arch.shadow2_free_pages -= 1<arch.shadow2_total_pages -= 1<domain_id, + d->arch.shadow2_total_pages, + shadow2_get_allocation(d)); + shadow2_unlock(d); + return rv; +} + +/**************************************************************************/ +/* Hash table for storing the guest->shadow mappings */ + +/* Hash function that takes a gfn or mfn, plus another byte of type info */ +typedef u32 key_t; +static inline key_t sh2_hash(unsigned long n, u8 t) +{ + unsigned char *p = (unsigned char *)&n; + key_t k = t; + int i; + for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k; + return k; +} + +#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL) + +/* Before we get to the mechanism, define a pair of audit functions + * that sanity-check the contents of 
the hash table. */ +static void sh2_hash_audit_bucket(struct domain *d, int bucket) +/* Audit one bucket of the hash table */ +{ + struct shadow2_hash_entry *e, *x; + struct page_info *pg; + + if ( !(SHADOW2_AUDIT_ENABLE) ) + return; + + e = &d->arch.shadow2_hash_table[bucket]; + if ( e->t == 0 ) return; /* Bucket is empty */ + while ( e ) + { + /* Empty link? */ + BUG_ON( e->t == 0 ); + /* Bogus type? */ + BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) ); + /* Wrong bucket? */ + BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket ); + /* Duplicate entry? */ + for ( x = e->next; x; x = x->next ) + BUG_ON( x->n == e->n && x->t == e->t ); + /* Bogus MFN? */ + BUG_ON( !valid_mfn(e->smfn) ); + pg = mfn_to_page(e->smfn); + /* Not a shadow? */ + BUG_ON( page_get_owner(pg) != 0 ); + /* Wrong kind of shadow? */ + BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift + != e->t ); + /* Bad backlink? */ + BUG_ON( pg->u.inuse.type_info != e->n ); + if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) + && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) + && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) ) + { + /* Bad shadow flags on guest page? */ + BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<t)) ); + } + /* That entry was OK; on we go */ + e = e->next; + } +} + +#else +#define sh2_hash_audit_bucket(_d, _b) +#endif /* Hashtable bucket audit */ + + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL + +static void sh2_hash_audit(struct domain *d) +/* Full audit: audit every bucket in the table */ +{ + int i; + + if ( !(SHADOW2_AUDIT_ENABLE) ) + return; + + for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) + { + sh2_hash_audit_bucket(d, i); + } +} + +#else +#define sh2_hash_audit(_d) +#endif /* Hashtable bucket audit */ + +/* Memory management interface for bucket allocation. + * These ought to come out of shadow memory, but at least on 32-bit + * machines we are forced to allocate them from xenheap so that we can + * address them. */ +static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d) +{ + struct shadow2_hash_entry *extra, *x; + int i; + + /* We need to allocate a new node. Ensure the free list is not empty. + * Allocate new entries in units the same size as the original table. */ + if ( unlikely(d->arch.shadow2_hash_freelist == NULL) ) + { + size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x)); + extra = xmalloc_bytes(sz); + + if ( extra == NULL ) + { + /* No memory left! */ + SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n"); + domain_crash_synchronous(); + } + memset(extra, 0, sz); + + /* Record the allocation block so it can be correctly freed later. */ + *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) = + d->arch.shadow2_hash_allocations; + d->arch.shadow2_hash_allocations = &extra[0]; + + /* Thread a free chain through the newly-allocated nodes. */ + for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ ) + extra[i].next = &extra[i+1]; + extra[i].next = NULL; + + /* Add the new nodes to the free list. */ + d->arch.shadow2_hash_freelist = &extra[0]; + } + + /* Allocate a new node from the free list. */ + x = d->arch.shadow2_hash_freelist; + d->arch.shadow2_hash_freelist = x->next; + return x; +} + +static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e) +{ + /* Mark the bucket as empty and return it to the free list */ + e->t = 0; + e->next = d->arch.shadow2_hash_freelist; + d->arch.shadow2_hash_freelist = e; +} + + +/* Allocate and initialise the table itself. 
+ * Returns 0 for success, 1 for error. */ +static int shadow2_hash_alloc(struct domain *d) +{ + struct shadow2_hash_entry *table; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(!d->arch.shadow2_hash_table); + + table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS); + if ( !table ) return 1; + memset(table, 0, + SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry)); + d->arch.shadow2_hash_table = table; + return 0; +} + +/* Tear down the hash table and return all memory to Xen. + * This function does not care whether the table is populated. */ +static void shadow2_hash_teardown(struct domain *d) +{ + struct shadow2_hash_entry *a, *n; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + + /* Return the table itself */ + xfree(d->arch.shadow2_hash_table); + d->arch.shadow2_hash_table = NULL; + + /* Return any extra allocations */ + a = d->arch.shadow2_hash_allocations; + while ( a ) + { + /* We stored a linked-list pointer at the end of each allocation */ + n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS])); + xfree(a); + a = n; + } + d->arch.shadow2_hash_allocations = NULL; + d->arch.shadow2_hash_freelist = NULL; +} + + +mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t) +/* Find an entry in the hash table. Returns the MFN of the shadow, + * or INVALID_MFN if it doesn't exist */ +{ + struct domain *d = v->domain; + struct shadow2_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + ASSERT(t); + + sh2_hash_audit(d); + + perfc_incrc(shadow2_hash_lookups); + key = sh2_hash(n, t); + + x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS]; + p = NULL; + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); + + do + { + ASSERT(x->t || ((x == head) && (x->next == NULL))); + + if ( x->n == n && x->t == t ) + { + /* Pull-to-front if 'x' isn't already the head item */ + if ( unlikely(x != head) ) + { + if ( unlikely(d->arch.shadow2_hash_walking != 0) ) + /* Can't reorder: someone is walking the hash chains */ + return x->smfn; + else + { + /* Delete 'x' from list and reinsert after head. */ + p->next = x->next; + x->next = head->next; + head->next = x; + + /* Swap 'x' contents with head contents. */ + SWAP(head->n, x->n); + SWAP(head->t, x->t); + SWAP(head->smfn, x->smfn); + } + } + else + { + perfc_incrc(shadow2_hash_lookup_head); + } + return head->smfn; + } + + p = x; + x = x->next; + } + while ( x != NULL ); + + perfc_incrc(shadow2_hash_lookup_miss); + return _mfn(INVALID_MFN); +} + +void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Put a mapping (n,t)->smfn into the hash table */ +{ + struct domain *d = v->domain; + struct shadow2_hash_entry *x, *head; + key_t key; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + ASSERT(t); + + sh2_hash_audit(d); + + perfc_incrc(shadow2_hash_inserts); + key = sh2_hash(n, t); + + head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS]; + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); + + /* If the bucket is empty then insert the new page as the head item. */ + if ( head->t == 0 ) + { + head->n = n; + head->t = t; + head->smfn = smfn; + ASSERT(head->next == NULL); + } + else + { + /* Insert a new entry directly after the head item. 
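+ * (The head entry lives in the hash table array itself and is never
+ * allocated or freed; new nodes are chained just behind it, and
+ * shadow2_hash_lookup() above pulls frequently-used entries back to the
+ * head position.)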
*/ + x = sh2_alloc_hash_entry(d); + x->n = n; + x->t = t; + x->smfn = smfn; + x->next = head->next; + head->next = x; + } + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); +} + +void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Excise the mapping (n,t)->smfn from the hash table */ +{ + struct domain *d = v->domain; + struct shadow2_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + ASSERT(t); + + sh2_hash_audit(d); + + perfc_incrc(shadow2_hash_deletes); + key = sh2_hash(n, t); + + head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS]; + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); + + /* Match on head item? */ + if ( head->n == n && head->t == t ) + { + if ( (x = head->next) != NULL ) + { + /* Overwrite head with contents of following node. */ + head->n = x->n; + head->t = x->t; + head->smfn = x->smfn; + + /* Delete following node. */ + head->next = x->next; + sh2_free_hash_entry(d, x); + } + else + { + /* This bucket is now empty. Initialise the head node. */ + head->t = 0; + } + } + else + { + /* Not at the head; need to walk the chain */ + p = head; + x = head->next; + + while(1) + { + ASSERT(x); /* We can't have hit the end, since our target is + * still in the chain somehwere... */ + if ( x->n == n && x->t == t ) + { + /* Delete matching node. */ + p->next = x->next; + sh2_free_hash_entry(d, x); + break; + } + p = x; + x = x->next; + } + } + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); +} + +typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn); + +static void hash_foreach(struct vcpu *v, + unsigned int callback_mask, + hash_callback_t callbacks[], + mfn_t callback_mfn) +/* Walk the hash table looking at the types of the entries and + * calling the appropriate callback function for each entry. + * The mask determines which shadow types we call back for, and the array + * of callbacks tells us which function to call. + * Any callback may return non-zero to let us skip the rest of the scan. + * + * WARNING: Callbacks MUST NOT add or remove hash entries unless they + * then return non-zero to terminate the scan. */ +{ + int i, done = 0; + struct domain *d = v->domain; + struct shadow2_hash_entry *x; + + /* Say we're here, to stop hash-lookups reordering the chains */ + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_walking == 0); + d->arch.shadow2_hash_walking = 1; + + callback_mask &= ~1; /* Never attempt to call back on empty buckets */ + for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) + { + /* WARNING: This is not safe against changes to the hash table. + * The callback *must* return non-zero if it has inserted or + * deleted anything from the hash (lookups are OK, though). */ + for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next ) + { + if ( callback_mask & (1 << x->t) ) + { + ASSERT(x->t <= 15); + ASSERT(callbacks[x->t] != NULL); + if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 ) + break; + } + } + if ( done ) break; + } + d->arch.shadow2_hash_walking = 0; +} + + +/**************************************************************************/ +/* Destroy a shadow page: simple dispatcher to call the per-type destructor + * which will decrement refcounts appropriately and return memory to the + * free pool. 
*/ + +void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 t = pg->count_info & PGC_SH2_type_mask; + + + SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn)); + + /* Double-check, if we can, that the shadowed page belongs to this + * domain, (by following the back-pointer). */ + ASSERT(t == PGC_SH2_fl1_32_shadow || + t == PGC_SH2_fl1_pae_shadow || + t == PGC_SH2_fl1_64_shadow || + t == PGC_SH2_monitor_table || + (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) + == v->domain)); + + /* The down-shifts here are so that the switch statement is on nice + * small numbers that the compiler will enjoy */ + switch ( t >> PGC_SH2_type_shift ) + { +#if CONFIG_PAGING_LEVELS == 2 + case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn); + break; + case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn); + break; +#else /* PAE or 64bit */ + case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn); + break; + case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn); + break; + case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift: + case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn); + break; + case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn); + break; + case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn); + break; + case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn); + break; + case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn); + break; +#endif + default: + SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n", + (unsigned long)t); + BUG(); + } +} + +/**************************************************************************/ +/* Remove all writeable mappings of a guest frame from the shadow tables + * Returns non-zero if we need to flush TLBs. 
+ * level and fault_addr desribe how we found this to be a pagetable; + * level==0 means we have some other reason for revoking write access.*/ + +int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn, + unsigned int level, + unsigned long fault_addr) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) + ; + struct page_info *pg = mfn_to_page(gmfn); + + ASSERT(shadow2_lock_is_acquired(v->domain)); + + /* Only remove writable mappings if we are doing shadow refcounts. + * In guest refcounting, we trust Xen to already be restricting + * all the writes to the guest page tables, so we do not need to + * do more. */ + if ( !shadow2_mode_refcounts(v->domain) ) + return 0; + + /* Early exit if it's already a pagetable, or otherwise not writeable */ + if ( sh2_mfn_is_a_page_table(gmfn) + || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) + return 0; + + perfc_incrc(shadow2_writeable); + + /* If this isn't a "normal" writeable page, the domain is trying to + * put pagetables in special memory of some kind. We can't allow that. 
*/ + if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) + { + SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %" + PRtype_info "\n", + mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info); + domain_crash(v->domain); + } + +#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC + if ( v == current && level != 0 ) + { + unsigned long gfn; + /* Heuristic: there is likely to be only one writeable mapping, + * and that mapping is likely to be in the current pagetable, + * either in the guest's linear map (linux, windows) or in a + * magic slot used to map high memory regions (linux HIGHTPTE) */ + +#define GUESS(_a, _h) do { \ + if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) ) \ + perfc_incrc(shadow2_writeable_h_ ## _h); \ + if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ + return 1; \ + } while (0) + + + /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */ + if ( v == current + && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 ) + GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4); + + if ( v->arch.shadow2->guest_levels == 2 ) + { + if ( level == 1 ) + /* 32bit non-PAE w2k3: linear map at 0xC0000000 */ + GUESS(0xC0000000UL + (fault_addr >> 10), 1); + } +#if CONFIG_PAGING_LEVELS >= 3 + else if ( v->arch.shadow2->guest_levels == 3 ) + { + /* 32bit PAE w2k3: linear map at 0xC0000000 */ + switch ( level ) + { + case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break; + case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break; + } + } +#if CONFIG_PAGING_LEVELS >= 4 + else if ( v->arch.shadow2->guest_levels == 4 ) + { + /* 64bit w2k3: linear map at 0x0000070000000000 */ + switch ( level ) + { + case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break; + case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break; + case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break; + } + } +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS >= 3 */ + +#undef GUESS + + } +#endif + + /* Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow2_writeable_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) + { + SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: " + "%lu left\n", mfn_x(gmfn), + (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); + domain_crash(v->domain); + } + + /* We killed at least one writeable mapping, so must flush TLBs. */ + return 1; +} + + + +/**************************************************************************/ +/* Remove all mappings of a guest frame from the shadow tables. + * Returns non-zero if we need to flush TLBs. 
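+ * (After the hash walk an ordinary guest frame should be left with only its
+ * PGC_allocated reference; in external/HVM mode one extra untyped mapping
+ * held by the qemu helper process is tolerated, as the check at the end of
+ * this function allows.)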
*/ + +int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn) +{ + struct page_info *page = mfn_to_page(gmfn); + int expected_count; + + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) + ; + + perfc_incrc(shadow2_mappings); + if ( (page->count_info & PGC_count_mask) == 0 ) + return 0; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + + /* XXX TODO: + * Heuristics for finding the (probably) single mapping of this gmfn */ + + /* Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow2_mappings_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + expected_count = (page->count_info & PGC_allocated) ? 1 : 0; + if ( (page->count_info & PGC_count_mask) != expected_count ) + { + /* Don't complain if we're in HVM and there's one extra mapping: + * The qemu helper process has an untyped mapping of this dom's RAM */ + if ( !(shadow2_mode_external(v->domain) + && (page->count_info & PGC_count_mask) <= 2 + && (page->u.inuse.type_info & PGT_count_mask) == 0) ) + { + SHADOW2_ERROR("can't find all mappings of mfn %lx: " + "c=%08x t=%08lx\n", mfn_x(gmfn), + page->count_info, page->u.inuse.type_info); + } + } + + /* We killed at least one mapping, so must flush TLBs. */ + return 1; +} + + +/**************************************************************************/ +/* Remove all shadows of a guest frame from the shadow tables */ + +static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn) +/* Follow this shadow's up-pointer, if it has one, and remove the reference + * found there. 
Returns 1 if that was the only reference to this shadow */ +{ + struct page_info *pg = mfn_to_page(smfn); + mfn_t pmfn; + void *vaddr; + int rc; + + ASSERT((pg->count_info & PGC_SH2_type_mask) > 0); + ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow); + ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow); + ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow); + ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow); + + if (pg->up == 0) return 0; + pmfn = _mfn(pg->up >> PAGE_SHIFT); + ASSERT(valid_mfn(pmfn)); + vaddr = sh2_map_domain_page(pmfn); + ASSERT(vaddr); + vaddr += pg->up & (PAGE_SIZE-1); + ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn)); + + /* Is this the only reference to this shadow? */ + rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0; + + /* Blank the offending entry */ + switch ((pg->count_info & PGC_SH2_type_mask)) + { + case PGC_SH2_l1_32_shadow: + case PGC_SH2_l2_32_shadow: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn); +#else + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >=3 + case PGC_SH2_l1_pae_shadow: + case PGC_SH2_l2_pae_shadow: + case PGC_SH2_l2h_pae_shadow: + case PGC_SH2_l3_pae_shadow: + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn); + break; +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH2_l1_64_shadow: + case PGC_SH2_l2_64_shadow: + case PGC_SH2_l3_64_shadow: + case PGC_SH2_l4_64_shadow: + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn); + break; +#endif +#endif + default: BUG(); /* Some wierd unknown shadow type */ + } + + sh2_unmap_domain_page(vaddr); + if ( rc ) + perfc_incrc(shadow2_up_pointer); + else + perfc_incrc(shadow2_unshadow_bf); + + return rc; +} + +void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all) +/* Remove the shadows of this guest page. + * If all != 0, find all shadows, if necessary by walking the tables. + * Otherwise, just try the (much faster) heuristics, which will remove + * at most one reference to each shadow of the page. */ +{ + struct page_info *pg; + mfn_t smfn; + u32 sh_flags; + unsigned char t; + + /* Dispatch table for getting per-type functions: each level must + * be called with the function to remove a lower-level shadow. 
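+ * (So, for instance, removing the shadows of a guest l1 means scanning the
+ * l2 shadows that may hold references to them; the masks[] table below
+ * records, for each shadow type, which higher-level shadow types to scan.)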
*/ + static hash_callback_t callbacks[16] = { + NULL, /* none */ + NULL, /* l1_32 */ + NULL, /* fl1_32 */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32 */ +#endif + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae */ +#else + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#endif + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64 */ +#else + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ +#endif + NULL, /* p2m */ + NULL /* unused */ + }; + + /* Another lookup table, for choosing which mask to use */ + static unsigned int masks[16] = { + 0, /* none */ + 1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32 */ + 0, /* fl1_32 */ + 0, /* l2_32 */ + ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift)) + | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae */ + 0, /* fl1_pae */ + 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae */ + 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae */ + 0, /* l3_pae */ + 1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64 */ + 0, /* fl1_64 */ + 1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64 */ + 1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64 */ + 0, /* l4_64 */ + 0, /* p2m */ + 0 /* unused */ + }; + + SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); + + ASSERT(shadow2_lock_is_acquired(v->domain)); + + pg = mfn_to_page(gmfn); + + /* Bale out now if the page is not shadowed */ + if ( (pg->count_info & PGC_page_table) == 0 ) + return; + + /* Search for this shadow in all appropriate shadows */ + perfc_incrc(shadow2_unshadow); + sh_flags = pg->shadow2_flags; + + /* Lower-level shadows need to be excised from upper-level shadows. + * This call to hash_foreach() looks dangerous but is in fact OK: each + * call will remove at most one shadow, and terminate immediately when + * it does remove it, so we never walk the hash after doing a deletion. 
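+ * (DO_UNSHADOW below tries the cheap path first, following the shadow's
+ * up-pointer with sh2_remove_shadow_via_pointer(), and only falls back to a
+ * full hash_foreach() walk when the caller asked for 'all' and the
+ * up-pointer did not account for every reference.)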
*/ +#define DO_UNSHADOW(_type) do { \ + t = (_type) >> PGC_SH2_type_shift; \ + smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \ + if ( !sh2_remove_shadow_via_pointer(v, smfn) && all ) \ + hash_foreach(v, masks[t], callbacks, smfn); \ +} while (0) + + /* Top-level shadows need to be unpinned */ +#define DO_UNPIN(_type) do { \ + t = (_type) >> PGC_SH2_type_shift; \ + smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \ + if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned ) \ + sh2_unpin(v, smfn); \ + if ( (_type) == PGC_SH2_l3_pae_shadow ) \ + SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \ +} while (0) + + if ( sh_flags & SH2F_L1_32 ) DO_UNSHADOW(PGC_SH2_l1_32_shadow); + if ( sh_flags & SH2F_L2_32 ) DO_UNPIN(PGC_SH2_l2_32_shadow); +#if CONFIG_PAGING_LEVELS >= 3 + if ( sh_flags & SH2F_L1_PAE ) DO_UNSHADOW(PGC_SH2_l1_pae_shadow); + if ( sh_flags & SH2F_L2_PAE ) DO_UNSHADOW(PGC_SH2_l2_pae_shadow); + if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow); + if ( sh_flags & SH2F_L3_PAE ) DO_UNPIN(PGC_SH2_l3_pae_shadow); +#if CONFIG_PAGING_LEVELS >= 4 + if ( sh_flags & SH2F_L1_64 ) DO_UNSHADOW(PGC_SH2_l1_64_shadow); + if ( sh_flags & SH2F_L2_64 ) DO_UNSHADOW(PGC_SH2_l2_64_shadow); + if ( sh_flags & SH2F_L3_64 ) DO_UNSHADOW(PGC_SH2_l3_64_shadow); + if ( sh_flags & SH2F_L4_64 ) DO_UNPIN(PGC_SH2_l4_64_shadow); +#endif +#endif + +#undef DO_UNSHADOW +#undef DO_UNPIN + + +#if CONFIG_PAGING_LEVELS > 2 + /* We may have caused some PAE l3 entries to change: need to + * fix up the copies of them in various places */ + if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) ) + sh2_pae_recopy(v->domain); +#endif + + /* If that didn't catch the shadows, something is wrong */ + if ( all && (pg->count_info & PGC_page_table) ) + { + SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n", + mfn_x(gmfn), pg->shadow2_flags); + domain_crash(v->domain); + } +} + +void +shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn) +/* Even harsher: this is a HVM page that we thing is no longer a pagetable. + * Unshadow it, and recursively unshadow pages that reference it. */ +{ + shadow2_remove_all_shadows(v, gmfn); + /* XXX TODO: + * Rework this hashtable walker to return a linked-list of all + * the shadows it modified, then do breadth-first recursion + * to find the way up to higher-level tables and unshadow them too. + * + * The current code (just tearing down each page's shadows as we + * detect that it is not a pagetable) is correct, but very slow. + * It means extra emulated writes and slows down removal of mappings. */ +} + +/**************************************************************************/ + +void sh2_update_paging_modes(struct vcpu *v) +{ + struct domain *d = v->domain; + struct shadow2_entry_points *old_entries = v->arch.shadow2; + mfn_t old_guest_table; + + ASSERT(shadow2_lock_is_acquired(d)); + + // Valid transitions handled by this function: + // - For PV guests: + // - after a shadow mode has been changed + // - For HVM guests: + // - after a shadow mode has been changed + // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE + // + + // Avoid determining the current shadow2 mode for uninitialized CPUs, as + // we can not yet determine whether it is an HVM or PV domain. + // + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + { + printk("%s: postponing determination of shadow2 mode\n", __func__); + return; + } + + // First, tear down any old shadow tables held by this vcpu. 
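+ // Then choose a new set of shadow2 entry points: PV guests get a mode
+ // fixed by CONFIG_PAGING_LEVELS (with a special case for 32-bit PV guests
+ // on 64-bit Xen), while HVM guests are re-examined on paging-control
+ // changes (CR0.PG, CR4.PAE, long mode) and may also need their monitor
+ // table rebuilt if the number of shadow levels changes.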
+ // + if ( v->arch.shadow2 ) + shadow2_detach_old_tables(v); + + if ( !hvm_guest(v) ) + { + /// + /// PV guest + /// +#if CONFIG_PAGING_LEVELS == 4 + if ( pv_32bit_guest(v) ) + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3); + else + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4); +#elif CONFIG_PAGING_LEVELS == 3 + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3); +#elif CONFIG_PAGING_LEVELS == 2 + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2); +#else +#error unexpected paging mode +#endif + } + else + { + /// + /// HVM guest + /// + ASSERT(shadow2_mode_translate(d)); + ASSERT(shadow2_mode_external(d)); + + if ( !hvm_paging_enabled(v) ) + { + // paging disabled... + clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags); + + /* Set v->arch.guest_table to use the p2m map, and choose + * the appropriate shadow mode */ + old_guest_table = pagetable_get_mfn(v->arch.guest_table); +#if CONFIG_PAGING_LEVELS == 2 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2); +#elif CONFIG_PAGING_LEVELS == 3 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3); +#else /* CONFIG_PAGING_LEVELS == 4 */ + { + l4_pgentry_t *l4e; + /* Use the start of the first l3 table as a PAE l3 */ + ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); + l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); + ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); + v->arch.guest_table = + pagetable_from_pfn(l4e_get_pfn(l4e[0])); + sh2_unmap_domain_page(l4e); + } + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3); +#endif + /* Fix up refcounts on guest_table */ + get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d); + if ( mfn_x(old_guest_table) != 0 ) + put_page(mfn_to_page(old_guest_table)); + } + else + { + set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags); + +#ifdef __x86_64__ + if ( hvm_long_mode_enabled(v) ) + { + // long mode guest... + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4); + } + else +#endif + if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE ) + { +#if CONFIG_PAGING_LEVELS >= 3 + // 32-bit PAE mode guest... + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3); +#else + SHADOW2_ERROR("PAE not supported in 32-bit Xen\n"); + domain_crash(d); + return; +#endif + } + else + { + // 32-bit 2 level guest... +#if CONFIG_PAGING_LEVELS >= 3 + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2); +#else + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2); +#endif + } + } + + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + { + mfn_t mmfn = shadow2_make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(mmfn); + v->arch.monitor_vtable = sh2_map_domain_page(mmfn); + } + + if ( v->arch.shadow2 != old_entries ) + { + SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u " + "(was g=%u s=%u)\n", + d->domain_id, v->vcpu_id, + v->arch.shadow2->guest_levels, + v->arch.shadow2->shadow_levels, + old_entries ? old_entries->guest_levels : 0, + old_entries ? 
old_entries->shadow_levels : 0); + if ( old_entries && + (v->arch.shadow2->shadow_levels != + old_entries->shadow_levels) ) + { + /* Need to make a new monitor table for the new mode */ + mfn_t new_mfn, old_mfn; + + if ( v != current ) + { + SHADOW2_ERROR("Some third party (d=%u v=%u) is changing " + "this HVM vcpu's (d=%u v=%u) paging mode!\n", + current->domain->domain_id, current->vcpu_id, + v->domain->domain_id, v->vcpu_id); + domain_crash(v->domain); + return; + } + + sh2_unmap_domain_page(v->arch.monitor_vtable); + old_mfn = pagetable_get_mfn(v->arch.monitor_table); + v->arch.monitor_table = pagetable_null(); + new_mfn = v->arch.shadow2->make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(new_mfn); + v->arch.monitor_vtable = sh2_map_domain_page(new_mfn); + SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n", + mfn_x(new_mfn)); + + /* Don't be running on the old monitor table when we + * pull it down! Switch CR3, and warn the HVM code that + * its host cr3 has changed. */ + make_cr3(v, mfn_x(new_mfn)); + write_ptbase(v); + hvm_update_host_cr3(v); + old_entries->destroy_monitor_table(v, old_mfn); + } + } + + // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE. + // These are HARD: think about the case where two CPU's have + // different values for CR4.PSE and CR4.PGE at the same time. + // This *does* happen, at least for CR4.PGE... + } + + v->arch.shadow2->update_cr3(v); +} + +/**************************************************************************/ +/* Turning on and off shadow2 features */ + +static void sh2_new_mode(struct domain *d, u32 new_mode) +/* Inform all the vcpus that the shadow mode has been changed */ +{ + struct vcpu *v; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d != current->domain); + d->arch.shadow2_mode = new_mode; + if ( new_mode & SHM2_translate ) + shadow2_audit_p2m(d); + for_each_vcpu(d, v) + sh2_update_paging_modes(v); +} + +static int shadow2_enable(struct domain *d, u32 mode) +/* Turn on "permanent" shadow features: external, translate, refcount. + * Can only be called once on a domain, and these features cannot be + * disabled. + * Returns 0 for success, -errno for failure. */ +{ + unsigned int old_pages; + int rv = 0; + + domain_pause(d); + shadow2_lock(d); + + /* Sanity check the arguments */ + if ( d == current->domain + || shadow2_mode_enabled(d) + || !(mode & SHM2_enable) + || ((mode & SHM2_external) && !(mode & SHM2_translate)) ) + { + rv = -EINVAL; + goto out; + } + + // XXX -- eventually would like to require that all memory be allocated + // *after* shadow2_enabled() is called... So here, we would test to make + // sure that d->page_list is empty. 
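
As an aside (a sketch, not patch code): the argument check above in shadow2_enable() boils down to a small invariant on the mode bits. The flag values here are stand-ins for the real SHM2_* definitions, which live in the shadow2 headers.

    #include <errno.h>
    #include <stdio.h>

    /* Stand-in flag values for the example only. */
    #define MODE_ENABLE     (1u << 0)
    #define MODE_TRANSLATE  (1u << 2)
    #define MODE_EXTERNAL   (1u << 3)

    /* Mirrors the argument check in shadow2_enable(): the caller must ask for
     * the enable bit, and "external" (HVM-style) shadows only make sense
     * together with "translate", since an external monitor table has to be
     * built from a p2m map. */
    static int check_shadow_mode(unsigned int mode)
    {
        if ( !(mode & MODE_ENABLE) )
            return -EINVAL;
        if ( (mode & MODE_EXTERNAL) && !(mode & MODE_TRANSLATE) )
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        printf("external without translate: %d\n",
               check_shadow_mode(MODE_ENABLE | MODE_EXTERNAL));
        printf("external plus translate:    %d\n",
               check_shadow_mode(MODE_ENABLE | MODE_EXTERNAL | MODE_TRANSLATE));
        return 0;
    }

As the surrounding code shows, shadow2_enable() additionally rejects calls made by the target domain on itself and calls on a domain that already has shadow mode enabled.
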
+#if 0 + spin_lock(&d->page_alloc_lock); + if ( !list_empty(&d->page_list) ) + { + spin_unlock(&d->page_alloc_lock); + rv = -EINVAL; + goto out; + } + spin_unlock(&d->page_alloc_lock); +#endif + + /* Init the shadow memory allocation if the user hasn't done so */ + old_pages = d->arch.shadow2_total_pages; + if ( old_pages == 0 ) + if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */ + { + set_sh2_allocation(d, 0, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the hash table */ + if ( shadow2_hash_alloc(d) != 0 ) + { + set_sh2_allocation(d, old_pages, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the P2M table */ + if ( mode & SHM2_translate ) + if ( !shadow2_alloc_p2m_table(d) ) + { + shadow2_hash_teardown(d); + set_sh2_allocation(d, old_pages, NULL); + shadow2_p2m_teardown(d); + rv = -ENOMEM; + goto out; + } + + /* Update the bits */ + sh2_new_mode(d, mode); + shadow2_audit_p2m(d); + out: + shadow2_unlock(d); + domain_unpause(d); + return 0; +} + +void shadow2_teardown(struct domain *d) +/* Destroy the shadow pagetables of this domain and free its shadow memory. + * Should only be called for dying domains. */ +{ + struct vcpu *v; + mfn_t mfn; + + ASSERT(test_bit(_DOMF_dying, &d->domain_flags)); + ASSERT(d != current->domain); + + if ( !shadow2_lock_is_acquired(d) ) + shadow2_lock(d); /* Keep various asserts happy */ + + if ( shadow2_mode_enabled(d) ) + { + /* Release the shadow and monitor tables held by each vcpu */ + for_each_vcpu(d, v) + { + if ( v->arch.shadow2 ) + shadow2_detach_old_tables(v); + if ( shadow2_mode_external(d) ) + { + mfn = pagetable_get_mfn(v->arch.monitor_table); + if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) ) + shadow2_destroy_monitor_table(v, mfn); + v->arch.monitor_table = pagetable_null(); + } + } + } + + if ( d->arch.shadow2_total_pages != 0 ) + { + SHADOW2_PRINTK("teardown of domain %u starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + /* Destroy all the shadows and release memory to domheap */ + set_sh2_allocation(d, 0, NULL); + /* Release the hash table back to xenheap */ + if (d->arch.shadow2_hash_table) + shadow2_hash_teardown(d); + /* Release the log-dirty bitmap of dirtied pages */ + sh2_free_log_dirty_bitmap(d); + /* Should not have any more memory held */ + SHADOW2_PRINTK("teardown done." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + ASSERT(d->arch.shadow2_total_pages == 0); + } + + /* We leave the "permanent" shadow modes enabled, but clear the + * log-dirty mode bit. We don't want any more mark_dirty() + * calls now that we've torn down the bitmap */ + d->arch.shadow2_mode &= ~SHM2_log_dirty; + + shadow2_unlock(d); +} + +void shadow2_final_teardown(struct domain *d) +/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */ +{ + + SHADOW2_PRINTK("dom %u final teardown starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + + /* Double-check that the domain didn't have any shadow memory. + * It is possible for a domain that never got domain_kill()ed + * to get here with its shadow allocation intact. */ + if ( d->arch.shadow2_total_pages != 0 ) + shadow2_teardown(d); + + /* It is now safe to pull down the p2m map. 
*/ + if ( d->arch.shadow2_p2m_pages != 0 ) + shadow2_p2m_teardown(d); + + SHADOW2_PRINTK("dom %u final teardown done." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); +} + +static int shadow2_one_bit_enable(struct domain *d, u32 mode) +/* Turn on a single shadow mode feature */ +{ + ASSERT(shadow2_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || (d->arch.shadow2_mode & mode) ) + { + return -EINVAL; + } + + if ( d->arch.shadow2_mode == 0 ) + { + /* Init the shadow memory allocation and the hash table */ + if ( set_sh2_allocation(d, 1, NULL) != 0 + || shadow2_hash_alloc(d) != 0 ) + { + set_sh2_allocation(d, 0, NULL); + return -ENOMEM; + } + } + + /* Update the bits */ + sh2_new_mode(d, d->arch.shadow2_mode | mode); + + return 0; +} + +static int shadow2_one_bit_disable(struct domain *d, u32 mode) +/* Turn off a single shadow mode feature */ +{ + struct vcpu *v; + ASSERT(shadow2_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || !(d->arch.shadow2_mode & mode) ) + { + return -EINVAL; + } + + /* Update the bits */ + sh2_new_mode(d, d->arch.shadow2_mode & ~mode); + if ( d->arch.shadow2_mode == 0 ) + { + /* Get this domain off shadows */ + SHADOW2_PRINTK("un-shadowing of domain %u starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + for_each_vcpu(d, v) + { + if ( v->arch.shadow2 ) + shadow2_detach_old_tables(v); +#if CONFIG_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user)); + else +#endif + make_cr3(v, pagetable_get_pfn(v->arch.guest_table)); + + } + + /* Pull down the memory allocation */ + if ( set_sh2_allocation(d, 0, NULL) != 0 ) + { + // XXX - How can this occur? + // Seems like a bug to return an error now that we've + // disabled the relevant shadow mode. + // + return -ENOMEM; + } + shadow2_hash_teardown(d); + SHADOW2_PRINTK("un-shadowing of domain %u done." 
+ " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + } + + return 0; +} + +/* Enable/disable ops for the "test" and "log-dirty" modes */ +int shadow2_test_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + + if ( shadow2_mode_enabled(d) ) + { + SHADOW2_ERROR("Don't support enabling test mode" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = shadow2_one_bit_enable(d, SHM2_enable); + out: + shadow2_unlock(d); + domain_unpause(d); + + return ret; +} + +int shadow2_test_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + ret = shadow2_one_bit_disable(d, SHM2_enable); + shadow2_unlock(d); + domain_unpause(d); + + return ret; +} + +static int +sh2_alloc_log_dirty_bitmap(struct domain *d) +{ + ASSERT(d->arch.shadow_dirty_bitmap == NULL); + d->arch.shadow_dirty_bitmap_size = + (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) & + ~(BITS_PER_LONG - 1); + d->arch.shadow_dirty_bitmap = + xmalloc_array(unsigned long, + d->arch.shadow_dirty_bitmap_size / BITS_PER_LONG); + if ( d->arch.shadow_dirty_bitmap == NULL ) + { + d->arch.shadow_dirty_bitmap_size = 0; + return -ENOMEM; + } + memset(d->arch.shadow_dirty_bitmap, 0, d->arch.shadow_dirty_bitmap_size/8); + + return 0; +} + +static void +sh2_free_log_dirty_bitmap(struct domain *d) +{ + d->arch.shadow_dirty_bitmap_size = 0; + if ( d->arch.shadow_dirty_bitmap ) + { + xfree(d->arch.shadow_dirty_bitmap); + d->arch.shadow_dirty_bitmap = NULL; + } +} + +static int shadow2_log_dirty_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + + if ( shadow2_mode_log_dirty(d) ) + { + ret = -EINVAL; + goto out; + } + + if ( shadow2_mode_enabled(d) ) + { + SHADOW2_ERROR("Don't (yet) support enabling log-dirty" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = sh2_alloc_log_dirty_bitmap(d); + if ( ret != 0 ) + { + sh2_free_log_dirty_bitmap(d); + goto out; + } + + ret = shadow2_one_bit_enable(d, SHM2_log_dirty); + if ( ret != 0 ) + sh2_free_log_dirty_bitmap(d); + + out: + shadow2_unlock(d); + domain_unpause(d); + return ret; +} + +static int shadow2_log_dirty_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + ret = shadow2_one_bit_disable(d, SHM2_log_dirty); + if ( !shadow2_mode_log_dirty(d) ) + sh2_free_log_dirty_bitmap(d); + shadow2_unlock(d); + domain_unpause(d); + + return ret; +} + +/**************************************************************************/ +/* P2M map manipulations */ + +static void +sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct vcpu *v; + + if ( !shadow2_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + + SHADOW2_PRINTK("removing gfn=%#lx mfn=%#lx\n", gfn, mfn); + + ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn); + //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn); + + shadow2_remove_all_shadows_and_parents(v, _mfn(mfn)); + if ( shadow2_remove_all_mappings(v, _mfn(mfn)) ) + flush_tlb_mask(d->domain_dirty_cpumask); + shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN)); + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); +} + +void +shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + shadow2_lock(d); + shadow2_audit_p2m(d); + sh2_p2m_remove_page(d, gfn, mfn); + shadow2_audit_p2m(d); + shadow2_unlock(d); +} + +void +shadow2_guest_physmap_add_page(struct domain *d, 
unsigned long gfn, + unsigned long mfn) +{ + struct vcpu *v; + unsigned long ogfn; + mfn_t omfn; + + if ( !shadow2_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + shadow2_lock(d); + shadow2_audit_p2m(d); + + SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn); + + omfn = sh2_gfn_to_mfn(d, gfn); + if ( valid_mfn(omfn) ) + { + /* Get rid of the old mapping, especially any shadows */ + shadow2_remove_all_shadows_and_parents(v, omfn); + if ( shadow2_remove_all_mappings(v, omfn) ) + flush_tlb_mask(d->domain_dirty_cpumask); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + } + + ogfn = sh2_mfn_to_gfn(d, _mfn(mfn)); + if ( +#ifdef __x86_64__ + (ogfn != 0x5555555555555555L) +#else + (ogfn != 0x55555555L) +#endif + && (ogfn != INVALID_M2P_ENTRY) + && (ogfn != gfn) ) + { + /* This machine frame is already mapped at another physical address */ + SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", + mfn, ogfn, gfn); + if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) ) + { + SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", + ogfn , mfn_x(omfn)); + if ( mfn_x(omfn) == mfn ) + sh2_p2m_remove_page(d, ogfn, mfn); + } + } + + shadow2_set_p2m_entry(d, gfn, _mfn(mfn)); + set_gpfn_from_mfn(mfn, gfn); + shadow2_audit_p2m(d); + shadow2_unlock(d); +} + +/**************************************************************************/ +/* Log-dirty mode support */ + +/* Convert a shadow to log-dirty mode. */ +void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn) +{ + BUG(); +} + + +/* Read a domain's log-dirty bitmap and stats. + * If the operation is a CLEAN, clear the bitmap and stats as well. */ +static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc) +{ + int i, rv = 0, clean = 0; + + domain_pause(d); + shadow2_lock(d); + + if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN + || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH ) + clean = 1; + else + ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK); + + SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", + (clean) ? "clean" : "peek", + d->domain_id, + d->arch.shadow_fault_count, + d->arch.shadow_dirty_count); + + sc->stats.fault_count = d->arch.shadow_fault_count; + sc->stats.dirty_count = d->arch.shadow_dirty_count; + + if ( clean ) + { + struct list_head *l, *t; + struct page_info *pg; + + /* Need to revoke write access to the domain's pages again. + * In future, we'll have a less heavy-handed approach to this, + * but for now, we just unshadow everything except Xen. */ + list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg)); + } + + d->arch.shadow_fault_count = 0; + d->arch.shadow_dirty_count = 0; + } + + if ( guest_handle_is_null(sc->dirty_bitmap) || + (d->arch.shadow_dirty_bitmap == NULL) ) + { + rv = -EINVAL; + goto out; + } + + if ( sc->pages > d->arch.shadow_dirty_bitmap_size ) + sc->pages = d->arch.shadow_dirty_bitmap_size; + +#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ + for ( i = 0; i < sc->pages; i += CHUNK ) + { + int bytes = ((((sc->pages - i) > CHUNK) + ? 
CHUNK + : (sc->pages - i)) + 7) / 8; + + if ( copy_to_guest_offset( + sc->dirty_bitmap, + i/(8*sizeof(unsigned long)), + d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), + (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) ) + { + rv = -EINVAL; + goto out; + } + + if ( clean ) + memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), + 0, bytes); + } +#undef CHUNK + + out: + shadow2_unlock(d); + domain_unpause(d); + return 0; +} + + +/* Mark a page as dirty */ +void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn) +{ + unsigned long pfn; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(shadow2_mode_log_dirty(d)); + + if ( !valid_mfn(gmfn) ) + return; + + ASSERT(d->arch.shadow_dirty_bitmap != NULL); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + /* + * Values with the MSB set denote MFNs that aren't really part of the + * domain's pseudo-physical memory map (e.g., the shared info frame). + * Nothing to do here... + */ + if ( unlikely(!VALID_M2P(pfn)) ) + return; + + /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */ + if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) ) + { + if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) ) + { + SHADOW2_DEBUG(LOGDIRTY, + "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n", + mfn_x(gmfn), pfn, d->domain_id); + d->arch.shadow_dirty_count++; + } + } + else + { + SHADOW2_PRINTK("mark_dirty OOR! " + "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n" + "owner=%d c=%08x t=%" PRtype_info "\n", + mfn_x(gmfn), + pfn, + d->arch.shadow_dirty_bitmap_size, + d->domain_id, + (page_get_owner(mfn_to_page(gmfn)) + ? page_get_owner(mfn_to_page(gmfn))->domain_id + : -1), + mfn_to_page(gmfn)->count_info, + mfn_to_page(gmfn)->u.inuse.type_info); + } +} + + +/**************************************************************************/ +/* Shadow-control DOM0_OP dispatcher */ + +int shadow2_control_op(struct domain *d, + dom0_shadow_control_t *sc, + XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op) +{ + int rc, preempted = 0; + + if ( unlikely(d == current->domain) ) + { + DPRINTK("Don't try to do a shadow op on yourself!\n"); + return -EINVAL; + } + + switch ( sc->op ) + { + case DOM0_SHADOW_CONTROL_OP_OFF: + if ( shadow2_mode_log_dirty(d) ) + if ( (rc = shadow2_log_dirty_disable(d)) != 0 ) + return rc; + if ( d->arch.shadow2_mode & SHM2_enable ) + if ( (rc = shadow2_test_disable(d)) != 0 ) + return rc; + return 0; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: + return shadow2_test_enable(d); + + case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: + return shadow2_log_dirty_enable(d); + + case DOM0_SHADOW_CONTROL_OP_FLUSH: + case DOM0_SHADOW_CONTROL_OP_CLEAN: + case DOM0_SHADOW_CONTROL_OP_PEEK: + return shadow2_log_dirty_op(d, sc); + + + + case DOM0_SHADOW2_CONTROL_OP_ENABLE: + return shadow2_enable(d, sc->mode << SHM2_shift); + + case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION: + sc->mb = shadow2_get_allocation(d); + return 0; + + case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION: + rc = shadow2_set_allocation(d, sc->mb, &preempted); + if ( preempted ) + /* Not finished. Set up to re-run the call. */ + rc = hypercall_create_continuation( + __HYPERVISOR_dom0_op, "h", u_dom0_op); + else + /* Finished. 
Return the new allocation */ + sc->mb = shadow2_get_allocation(d); + return rc; + + + default: + SHADOW2_ERROR("Bad shadow op %u\n", sc->op); + return -EINVAL; + } +} + + +/**************************************************************************/ +/* Auditing shadow tables */ + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL + +void shadow2_audit_tables(struct vcpu *v) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2), /* l2_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2), /* l2_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3), /* l1_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2h_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3), /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4), /* l1_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4), /* l2_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4), /* l3_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4), /* l4_64 */ +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS > 2 */ + NULL /* All the rest */ + }; + unsigned int mask; + + if ( !(SHADOW2_AUDIT_ENABLE) ) + return; + + if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL ) + mask = ~1; /* Audit every table in the system */ + else + { + /* Audit only the current mode's tables */ + switch (v->arch.shadow2->guest_levels) + { + case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break; + case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE + |SH2F_L2H_PAE|SH2F_L3_PAE); break; + case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64 + |SH2F_L3_64|SH2F_L4_64); break; + default: BUG(); + } + } + + hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN)); +} + +#endif /* Shadow audit */ + + +/**************************************************************************/ +/* Auditing p2m tables */ + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M + +void shadow2_audit_p2m(struct domain *d) +{ + struct list_head *entry; + struct page_info *page; + struct domain *od; + unsigned long mfn, gfn, m2pfn, lp2mfn = 0; + mfn_t p2mfn; + unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; + int test_linear; + + if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) ) + return; + + //SHADOW2_PRINTK("p2m audit starts\n"); + + test_linear = ( (d == current->domain) && current->arch.monitor_vtable ); + if ( test_linear ) + local_flush_tlb(); + + /* Audit part one: walk the domain's page allocation list, checking + * the m2p entries. 
*/ + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + page = list_entry(entry, struct page_info, list); + mfn = mfn_x(page_to_mfn(page)); + + // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn); + + od = page_get_owner(page); + + if ( od != d ) + { + SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n", + mfn, od, (od?od->domain_id:-1), d, d->domain_id); + continue; + } + + gfn = get_gpfn_from_mfn(mfn); + if ( gfn == INVALID_M2P_ENTRY ) + { + orphans_i++; + //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n", + // mfn); + continue; + } + + if ( gfn == 0x55555555 ) + { + orphans_d++; + //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", + // mfn); + continue; + } + + p2mfn = sh2_gfn_to_mfn_foreign(d, gfn); + if ( mfn_x(p2mfn) != mfn ) + { + mpbad++; + SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx" + " (-> gfn %#lx)\n", + mfn, gfn, mfn_x(p2mfn), + (mfn_valid(p2mfn) + ? get_gpfn_from_mfn(mfn_x(p2mfn)) + : -1u)); + /* This m2p entry is stale: the domain has another frame in + * this physical slot. No great disaster, but for neatness, + * blow away the m2p entry. */ + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); + } + + if ( test_linear ) + { + lp2mfn = get_mfn_from_gpfn(gfn); + if ( lp2mfn != mfn_x(p2mfn) ) + { + SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx " + "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn); + } + } + + // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", + // mfn, gfn, p2mfn, lp2mfn); + } + + /* Audit part two: walk the domain's p2m table, checking the entries. */ + if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) + { + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + int i1, i2; + +#if CONFIG_PAGING_LEVELS == 4 + l4_pgentry_t *l4e; + l3_pgentry_t *l3e; + int i3, i4; + l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#elif CONFIG_PAGING_LEVELS == 3 + l3_pgentry_t *l3e; + int i3; + l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#else /* CONFIG_PAGING_LEVELS == 2 */ + l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#endif + + gfn = 0; +#if CONFIG_PAGING_LEVELS >= 3 +#if CONFIG_PAGING_LEVELS >= 4 + for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) + { + if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4]))); +#endif /* now at levels 3 or 4... */ + for ( i3 = 0; + i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); + i3++ ) + { + if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3]))); +#endif /* all levels... 
*/ + for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) + { + if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2]))); + + for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) + { + if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) ) + continue; + mfn = l1e_get_pfn(l1e[i1]); + ASSERT(valid_mfn(_mfn(mfn))); + m2pfn = get_gpfn_from_mfn(mfn); + if ( m2pfn != gfn ) + { + pmbad++; + SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx" + " -> gfn %#lx\n", gfn, mfn, m2pfn); + BUG(); + } + } + sh2_unmap_domain_page(l1e); + } +#if CONFIG_PAGING_LEVELS >= 3 + sh2_unmap_domain_page(l2e); + } +#if CONFIG_PAGING_LEVELS >= 4 + sh2_unmap_domain_page(l3e); + } +#endif +#endif + +#if CONFIG_PAGING_LEVELS == 4 + sh2_unmap_domain_page(l4e); +#elif CONFIG_PAGING_LEVELS == 3 + sh2_unmap_domain_page(l3e); +#else /* CONFIG_PAGING_LEVELS == 2 */ + sh2_unmap_domain_page(l2e); +#endif + + } + + //SHADOW2_PRINTK("p2m audit complete\n"); + //if ( orphans_i | orphans_d | mpbad | pmbad ) + // SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n", + // orphans_i + orphans_d, orphans_i, orphans_d, + if ( mpbad | pmbad ) + SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n", + pmbad, mpbad); +} + +#endif /* p2m audit */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/shadow2.c b/xen/arch/x86/shadow2.c new file mode 100644 index 0000000000..9d845cb797 --- /dev/null +++ b/xen/arch/x86/shadow2.c @@ -0,0 +1,4469 @@ +/****************************************************************************** + * arch/x86/shadow2.c + * + * Simple, mostly-synchronous shadow page tables. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +// DESIGN QUESTIONS: +// Why use subshadows for PAE guests? +// - reduces pressure in the hash table +// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3) +// - would need to find space in the page_info to store 7 more bits of +// backpointer +// - independent shadows of 32 byte chunks makes it non-obvious how to quickly +// figure out when to demote the guest page from l3 status +// +// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space. +// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address +// space for both PV and HVM guests. 
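
Before moving further into the new shadow2.c, a note on what shadow2_audit_p2m() above is actually checking: wherever both tables have valid entries, the m2p and p2m maps must be inverses of each other. A toy, self-contained illustration follows (plain arrays stand in for the real tables; all names are invented for the example).

    #include <stdio.h>

    #define NR 8
    #define INVALID (~0u)

    /* Toy machine-to-phys and phys-to-machine maps (consistent by construction). */
    static unsigned int m2p[NR] = { 3, 1, INVALID, 0, 2, INVALID, 6, 5 };
    static unsigned int p2m[NR] = { 3, 1, 4, 0, INVALID, 7, 6, INVALID };

    int main(void)
    {
        int bad = 0;

        /* Audit part one: every mfn with a valid m2p entry must be mapped
         * back to itself by the p2m. */
        for ( unsigned int mfn = 0; mfn < NR; mfn++ )
            if ( m2p[mfn] != INVALID && p2m[m2p[mfn]] != mfn )
            {
                bad++;
                printf("m2p mismatch: mfn %u -> gfn %u -> mfn %u\n",
                       mfn, m2p[mfn], p2m[m2p[mfn]]);
            }

        /* Audit part two: every present p2m entry must be mapped back to
         * itself by the m2p. */
        for ( unsigned int gfn = 0; gfn < NR; gfn++ )
            if ( p2m[gfn] != INVALID && m2p[p2m[gfn]] != gfn )
            {
                bad++;
                printf("p2m mismatch: gfn %u -> mfn %u -> gfn %u\n",
                       gfn, p2m[gfn], m2p[p2m[gfn]]);
            }

        printf("%d bad entries\n", bad);
        return 0;
    }
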
+// + +#define SHADOW2 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The first cut: an absolutely synchronous, trap-and-emulate version, + * supporting only HVM guests (and so only "external" shadow mode). + * + * THINGS TO DO LATER: + * + * FIX GVA_TO_GPA + * The current interface returns an unsigned long, which is not big enough + * to hold a physical address in PAE. Should return a gfn instead. + * + * TEARDOWN HEURISTICS + * Also: have a heuristic for when to destroy a previous paging-mode's + * shadows. When a guest is done with its start-of-day 32-bit tables + * and reuses the memory we want to drop those shadows. Start with + * shadows in a page in two modes as a hint, but beware of clever tricks + * like reusing a pagetable for both PAE and 64-bit during boot... + * + * PAE LINEAR MAPS + * Rework shadow_get_l*e() to have the option of using map_domain_page() + * instead of linear maps. Add appropriate unmap_l*e calls in the users. + * Then we can test the speed difference made by linear maps. If the + * map_domain_page() version is OK on PAE, we could maybe allow a lightweight + * l3-and-l2h-only shadow mode for PAE PV guests that would allow them + * to share l2h pages again. + * + * PAE L3 COPYING + * In this code, we copy all 32 bytes of a PAE L3 every time we change an + * entry in it, and every time we change CR3. We copy it for the linear + * mappings (ugh! PAE linear mappings) and we copy it to the low-memory + * buffer so it fits in CR3. Maybe we can avoid some of this recopying + * by using the shadow directly in some places. + * Also, for SMP, need to actually respond to seeing shadow2_pae_flip_pending. + * + * GUEST_WALK_TABLES TLB FLUSH COALESCE + * guest_walk_tables can do up to three remote TLB flushes as it walks to + * the first l1 of a new pagetable. Should coalesce the flushes to the end, + * and if we do flush, re-do the walk. If anything has changed, then + * pause all the other vcpus and do the walk *again*. + * + * WP DISABLED + * Consider how to implement having the WP bit of CR0 set to 0. + * Since we need to be able to cause write faults to pagetables, this might + * end up looking like not having the (guest) pagetables present at all in + * HVM guests... + * + * PSE disabled / PSE36 + * We don't support any modes other than PSE enabled, PSE36 disabled. + * Neither of those would be hard to change, but we'd need to be able to + * deal with shadows made in one mode and used in another. + */ + +#define FETCH_TYPE_PREFETCH 1 +#define FETCH_TYPE_DEMAND 2 +#define FETCH_TYPE_WRITE 4 +typedef enum { + ft_prefetch = FETCH_TYPE_PREFETCH, + ft_demand_read = FETCH_TYPE_DEMAND, + ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE, +} fetch_type_t; + +#ifndef NDEBUG +static char *fetch_type_names[] = { + [ft_prefetch] "prefetch", + [ft_demand_read] "demand read", + [ft_demand_write] "demand write", +}; +#endif + +/* XXX forward declarations */ +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res); +#endif +static inline void sh2_update_linear_entries(struct vcpu *v); + +/**************************************************************************/ +/* Hash table mapping from guest pagetables to shadows + * + * Normal case: maps the mfn of a guest page to the mfn of its shadow page. 
+ * FL1's: maps the *gfn* of the start of a superpage to the mfn of a + * shadow L1 which maps its "splinters". + * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the + * PAE L3 info page for that CR3 value. + */ + +static inline mfn_t +get_fl1_shadow_status(struct vcpu *v, gfn_t gfn) +/* Look for FL1 shadows in the hash table */ +{ + mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn), + PGC_SH2_fl1_shadow >> PGC_SH2_type_shift); + + if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) + { + struct page_info *page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH2_log_dirty) ) + shadow2_convert_to_log_dirty(v, smfn); + } + + return smfn; +} + +static inline mfn_t +get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type) +/* Look for shadows in the hash table */ +{ + mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn), + shadow_type >> PGC_SH2_type_shift); + perfc_incrc(shadow2_get_shadow_status); + + if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) + { + struct page_info *page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH2_log_dirty) ) + shadow2_convert_to_log_dirty(v, smfn); + } + + return smfn; +} + +static inline void +set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) +/* Put an FL1 shadow into the hash table */ +{ + SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n", + gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn)); + + if ( unlikely(shadow2_mode_log_dirty(v->domain)) ) + // mark this shadow as a log dirty shadow... + set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); + else + clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); + + shadow2_hash_insert(v, gfn_x(gfn), + PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn); +} + +static inline void +set_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) +/* Put a shadow into the hash table */ +{ + struct domain *d = v->domain; + int res; + + SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", + d->domain_id, v->vcpu_id, mfn_x(gmfn), + shadow_type, mfn_x(smfn)); + + if ( unlikely(shadow2_mode_log_dirty(d)) ) + // mark this shadow as a log dirty shadow... + set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); + else + clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info); + + res = get_page(mfn_to_page(gmfn), d); + ASSERT(res == 1); + + shadow2_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH2_type_shift, + smfn); +} + +static inline void +delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) +/* Remove a shadow from the hash table */ +{ + SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n", + gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn)); + + shadow2_hash_delete(v, gfn_x(gfn), + PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn); +} + +static inline void +delete_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) +/* Remove a shadow from the hash table */ +{ + SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, + mfn_x(gmfn), shadow_type, mfn_x(smfn)); + shadow2_hash_delete(v, mfn_x(gmfn), + shadow_type >> PGC_SH2_type_shift, smfn); + put_page(mfn_to_page(gmfn)); +} + + +/**************************************************************************/ +/* Functions for walking the guest page tables */ + + +/* Walk the guest pagetables, filling the walk_t with what we see. + * Takes an uninitialised walk_t. 
The caller must call unmap_walk() + * on the walk_t before discarding it or calling guest_walk_tables again. + * If "guest_op" is non-zero, we are serving a genuine guest memory access, + * and must (a) be under the shadow2 lock, and (b) remove write access + * from any gueat PT pages we see, as we will be using their contents to + * perform shadow updates. + * Returns 0 for success or non-zero if the guest pagetables are malformed. + * N.B. Finding a not-present entry does not cause a non-zero return code. */ +static inline int +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op) +{ + ASSERT(!guest_op || shadow2_lock_is_acquired(v->domain)); + + perfc_incrc(shadow2_guest_walk); + memset(gw, 0, sizeof(*gw)); + gw->va = va; + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + /* Get l4e from the top level table */ + gw->l4mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va); + /* Walk down to the l3e */ + if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0; + gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e)); + if ( !valid_mfn(gw->l3mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. */ + if ( guest_op && shadow2_remove_write_access(v, gw->l3mfn, 3, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l3e = ((guest_l3e_t *)sh2_map_domain_page(gw->l3mfn)) + + guest_l3_table_offset(va); +#else /* PAE only... */ + /* Get l3e from the top level table */ + gw->l3mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va); +#endif /* PAE or 64... */ + /* Walk down to the l2e */ + if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0; + gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e)); + if ( !valid_mfn(gw->l2mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. */ + if ( guest_op && shadow2_remove_write_access(v, gw->l2mfn, 2, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l2e = ((guest_l2e_t *)sh2_map_domain_page(gw->l2mfn)) + + guest_l2_table_offset(va); +#else /* 32-bit only... */ + /* Get l2e from the top level table */ + gw->l2mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va); +#endif /* All levels... */ + + if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0; + if ( guest_supports_superpages(v) && + (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) ) + { + /* Special case: this guest VA is in a PSE superpage, so there's + * no guest l1e. We make one up so that the propagation code + * can generate a shadow l1 table. Start with the gfn of the + * first 4k-page of the superpage. */ + gfn_t start = guest_l2e_get_gfn(*gw->l2e); + /* Grant full access in the l1e, since all the guest entry's + * access controls are enforced in the shadow l2e. This lets + * us reflect l2 changes later without touching the l1s. */ + int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| + _PAGE_ACCESSED|_PAGE_DIRTY); + /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 + * of the level 1 */ + if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) ) + flags |= _PAGE_PAT; + /* Increment the pfn by the right number of 4k pages. + * The ~0x1 is to mask out the PAT bit mentioned above. 
*/ + start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); + gw->eff_l1e = guest_l1e_from_gfn(start, flags); + gw->l1e = NULL; + gw->l1mfn = _mfn(INVALID_MFN); + } + else + { + /* Not a superpage: carry on and find the l1e. */ + gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e)); + if ( !valid_mfn(gw->l1mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. */ + if ( guest_op + && shadow2_remove_write_access(v, gw->l1mfn, 1, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l1e = ((guest_l1e_t *)sh2_map_domain_page(gw->l1mfn)) + + guest_l1_table_offset(va); + gw->eff_l1e = *gw->l1e; + } + + return 0; +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding frame number. */ +static inline gfn_t +guest_walk_to_gfn(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) + return _gfn(INVALID_GFN); + return guest_l1e_get_gfn(gw->eff_l1e); +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding physical address. */ +static inline paddr_t +guest_walk_to_gpa(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) + return 0; + return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK); +} + + +/* Unmap (and reinitialise) a guest walk. + * Call this to dispose of any walk filled in by guest_walk_tables() */ +static void unmap_walk(struct vcpu *v, walk_t *gw) +{ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + if ( gw->l3e != NULL ) sh2_unmap_domain_page(gw->l3e); +#endif + if ( gw->l2e != NULL ) sh2_unmap_domain_page(gw->l2e); +#endif + if ( gw->l1e != NULL ) sh2_unmap_domain_page(gw->l1e); +#ifdef DEBUG + memset(gw, 0, sizeof(*gw)); +#endif +} + + +/* Pretty-print the contents of a guest-walk */ +static inline void print_gw(walk_t *gw) +{ + SHADOW2_PRINTK("GUEST WALK TO %#lx:\n", gw->va); +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + SHADOW2_PRINTK(" l4mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l4mfn)); + SHADOW2_PRINTK(" l4e=%p\n", gw->l4e); + if ( gw->l4e ) + SHADOW2_PRINTK(" *l4e=%" SH2_PRI_gpte "\n", gw->l4e->l4); +#endif /* PAE or 64... */ + SHADOW2_PRINTK(" l3mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l3mfn)); + SHADOW2_PRINTK(" l3e=%p\n", gw->l3e); + if ( gw->l3e ) + SHADOW2_PRINTK(" *l3e=%" SH2_PRI_gpte "\n", gw->l3e->l3); +#endif /* All levels... */ + SHADOW2_PRINTK(" l2mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l2mfn)); + SHADOW2_PRINTK(" l2e=%p\n", gw->l2e); + if ( gw->l2e ) + SHADOW2_PRINTK(" *l2e=%" SH2_PRI_gpte "\n", gw->l2e->l2); + SHADOW2_PRINTK(" l1mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l1mfn)); + SHADOW2_PRINTK(" l1e=%p\n", gw->l1e); + if ( gw->l1e ) + SHADOW2_PRINTK(" *l1e=%" SH2_PRI_gpte "\n", gw->l1e->l1); + SHADOW2_PRINTK(" eff_l1e=%" SH2_PRI_gpte "\n", gw->eff_l1e.l1); +} + + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES +/* Lightweight audit: pass all the shadows associated with this guest walk + * through the audit mechanisms */ +static void sh2_audit_gw(struct vcpu *v, walk_t *gw) +{ + mfn_t smfn; + + if ( !(SHADOW2_AUDIT_ENABLE) ) + return; + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + if ( valid_mfn(gw->l4mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn, + PGC_SH2_l4_shadow))) ) + (void) sh2_audit_l4_table(v, smfn, _mfn(INVALID_MFN)); +#endif /* PAE or 64... 
*/ + if ( valid_mfn(gw->l3mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn, + PGC_SH2_l3_shadow))) ) + (void) sh2_audit_l3_table(v, smfn, _mfn(INVALID_MFN)); +#endif /* All levels... */ + if ( valid_mfn(gw->l2mfn) ) + { + if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, + PGC_SH2_l2_shadow))) ) + (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); +#if GUEST_PAGING_LEVELS == 3 + if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, + PGC_SH2_l2h_shadow))) ) + (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); +#endif + } + if ( valid_mfn(gw->l1mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn, + PGC_SH2_l1_shadow))) ) + (void) sh2_audit_l1_table(v, smfn, _mfn(INVALID_MFN)); + else if ( gw->l2e + && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) + && valid_mfn( + (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) ) + (void) sh2_audit_fl1_table(v, smfn, _mfn(INVALID_MFN)); +} + +#else +#define sh2_audit_gw(_v, _gw) do {} while(0) +#endif /* audit code */ + + + +/**************************************************************************/ +/* Function to write to the guest tables, for propagating accessed and + * dirty bits from the shadow to the guest. + * Takes a guest mfn, a pointer to the guest entry, the level of pagetable, + * and an operation type. The guest entry is always passed as an l1e: + * since we only ever write flags, that's OK. + * Returns the new flag bits of the guest entry. */ + +static u32 guest_set_ad_bits(struct vcpu *v, + mfn_t gmfn, + guest_l1e_t *ep, + unsigned int level, + fetch_type_t ft) +{ + u32 flags, shflags, bit; + struct page_info *pg; + int res = 0; + + ASSERT(valid_mfn(gmfn) + && (sh2_mfn_is_a_page_table(gmfn) + || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) + == 0))); + ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1))); + ASSERT(level <= GUEST_PAGING_LEVELS); + ASSERT(ft == ft_demand_read || ft == ft_demand_write); + ASSERT(shadow2_lock_is_acquired(v->domain)); + + flags = guest_l1e_get_flags(*ep); + + /* PAE l3s do not have A and D bits */ + if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) ) + return flags; + + /* Need the D bit as well for writes, in l1es and PSE l2es. */ + if ( ft == ft_demand_write + && (level == 1 || (level == 2 && (flags & _PAGE_PSE))) ) + { + if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) + == (_PAGE_DIRTY | _PAGE_ACCESSED) ) + return flags; /* Guest already has A and D bits set */ + flags |= _PAGE_DIRTY | _PAGE_ACCESSED; + perfc_incrc(shadow2_ad_update); + } + else + { + if ( flags & _PAGE_ACCESSED ) + return flags; /* Guest already has A bit set */ + flags |= _PAGE_ACCESSED; + perfc_incrc(shadow2_a_update); + } + + /* Set the bit(s) */ + sh2_mark_dirty(v->domain, gmfn); + SHADOW2_DEBUG(A_AND_D, "gfn = %"SH2_PRI_gfn", " + "old flags = %#x, new flags = %#x\n", + guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags); + *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags); + + /* May need to propagate this change forward to other kinds of shadow */ + pg = mfn_to_page(gmfn); + if ( !sh2_mfn_is_a_page_table(gmfn) ) + { + /* This guest pagetable is not yet shadowed at all. */ + // MAF: I think this assert is busted... If this gmfn has not yet + // been promoted, then it seems perfectly reasonable for there to be + // outstanding type refs to it... + /* TJD: No. If the gmfn has not been promoted, we must at least + * have recognised that it is a pagetable, and pulled write access. + * The type count should only be non-zero if it is actually a page + * table. 
The test above was incorrect, though, so I've fixed it. */ + ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0); + return flags; + } + + shflags = pg->shadow2_flags & SH2F_page_type_mask; + while ( shflags ) + { + bit = find_first_set_bit(shflags); + ASSERT(shflags & (1u << bit)); + shflags &= ~(1u << bit); + if ( !(pg->shadow2_flags & (1u << bit)) ) + continue; + switch ( bit ) + { + case PGC_SH2_type_to_index(PGC_SH2_l1_shadow): + if (level != 1) + res |= sh2_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep)); + break; + case PGC_SH2_type_to_index(PGC_SH2_l2_shadow): + if (level != 2) + res |= sh2_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep)); + break; +#if GUEST_PAGING_LEVELS == 3 /* PAE only */ + case PGC_SH2_type_to_index(PGC_SH2_l2h_shadow): + if (level != 2) + res |= sh2_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep)); + break; +#endif +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ + case PGC_SH2_type_to_index(PGC_SH2_l3_shadow): + if (level != 3) + res |= sh2_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep)); + break; +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + case PGC_SH2_type_to_index(PGC_SH2_l4_shadow): + if (level != 4) + res |= sh2_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep)); + break; +#endif +#endif + default: + SHADOW2_ERROR("mfn %"SH2_PRI_mfn" is shadowed in multiple " + "modes: A&D bits may be out of sync (flags=%#x).\n", + mfn_x(gmfn), pg->shadow2_flags); + /* XXX Shadows in other modes will not be updated, so will + * have their A and D bits out of sync. */ + } + } + + /* We should never need to flush the TLB or recopy PAE entries */ + ASSERT( res == 0 || res == SHADOW2_SET_CHANGED ); + return flags; +} + +/**************************************************************************/ +/* Functions to compute the correct index into a shadow page, given an + * index into the guest page (as returned by guest_get_index()). + * This is trivial when the shadow and guest use the same sized PTEs, but + * gets more interesting when those sizes are mismatched (e.g. 32-bit guest, + * PAE- or 64-bit shadows). + * + * These functions also increment the shadow mfn, when necessary. When PTE + * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1 + * page. In this case, we allocate 2 contiguous pages for the shadow L1, and + * use simple pointer arithmetic on a pointer to the guest L1e to figure out + * which shadow page we really want. Similarly, when PTE sizes are + * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest + * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address + * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address + * space.) + * + * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes + * of shadow (to store both the shadow, and the info that would normally be + * stored in page_info fields). This arrangement allows the shadow and the + * "page_info" fields to always be stored in the same page (in fact, in + * the same cache line), avoiding an extra call to map_domain_page(). 
+ */ + +static inline u32 +guest_index(void *ptr) +{ + return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t); +} + +static inline u32 +shadow_l1_index(mfn_t *smfn, u32 guest_index) +{ +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / SHADOW_L1_PAGETABLE_ENTRIES)); + return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES); +#else + return guest_index; +#endif +} + +static inline u32 +shadow_l2_index(mfn_t *smfn, u32 guest_index) +{ +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) + // Because we use 2 shadow l2 entries for each guest entry, the number of + // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2 + // + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); + + // We multiple by two to get the index of the first of the two entries + // used to shadow the specified guest entry. + return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2; +#else + return guest_index; +#endif +} + +#if GUEST_PAGING_LEVELS >= 3 + +static inline u32 +shadow_l3_index(mfn_t *smfn, u32 guest_index) +{ +#if GUEST_PAGING_LEVELS == 3 + u32 group_id; + + // Because we use twice the space in L3 shadows as was consumed in guest + // L3s, the number of guest entries per shadow page is + // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not* + // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...) + // + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); + + // We store PAE L3 shadows in groups of 4, alternating shadows and + // pae_l3_bookkeeping structs. So the effective shadow index is + // the the group_id * 8 + the offset within the group. + // + guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2); + group_id = guest_index / 4; + return (group_id * 8) + (guest_index % 4); +#else + return guest_index; +#endif +} + +#endif // GUEST_PAGING_LEVELS >= 3 + +#if GUEST_PAGING_LEVELS >= 4 + +static inline u32 +shadow_l4_index(mfn_t *smfn, u32 guest_index) +{ + return guest_index; +} + +#endif // GUEST_PAGING_LEVELS >= 4 + + +/**************************************************************************/ +/* Functions which compute shadow entries from their corresponding guest + * entries. + * + * These are the "heart" of the shadow code. + * + * There are two sets of these: those that are called on demand faults (read + * faults and write faults), and those that are essentially called to + * "prefetch" (or propagate) entries from the guest into the shadow. The read + * fault and write fault are handled as two separate cases for L1 entries (due + * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together + * into the respective demand_fault functions. + */ + +#define CHECK(_cond) \ +do { \ + if (unlikely(!(_cond))) \ + { \ + printk("%s %s %d ASSERTION (%s) FAILED\n", \ + __func__, __FILE__, __LINE__, #_cond); \ + return -1; \ + } \ +} while (0); + +// The function below tries to capture all of the flag manipulation for the +// demand and propagate functions into one place. +// +static always_inline u32 +sh2_propagate_flags(struct vcpu *v, mfn_t target_mfn, + u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, + int mmio, int level, fetch_type_t ft) +{ + struct domain *d = v->domain; + u32 pass_thru_flags; + u32 sflags; + int lowest_level_guest_mapping; + + // XXX -- might want to think about PAT support for HVM guests... 
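
An aside on the index arithmetic described in the comment above (illustrative only, not patch code): a 32-bit guest L2 page holds 1024 4-byte entries, a PAE shadow L2 page holds 512 8-byte entries, and each guest entry is shadowed by a pair of entries, so one guest L2 page is backed by four contiguous shadow pages. A standalone version of the shadow_l2_index() split, with an invented function name:

    #include <stdio.h>

    /* A 32-bit guest L2 page: 1024 4-byte entries; a PAE shadow L2 page:
     * 512 8-byte entries.  Each guest entry gets a pair of shadow entries. */
    #define GUEST_L2_ENTRIES   1024
    #define SHADOW_L2_ENTRIES  512

    /* Standalone version of the shadow_l2_index() arithmetic: which of the
     * four contiguous shadow pages holds the pair for a given guest index,
     * and at what slot the pair starts. */
    static void locate_shadow_l2e(unsigned int guest_index,
                                  unsigned int *page, unsigned int *slot)
    {
        unsigned int guest_per_page = SHADOW_L2_ENTRIES / 2;  /* 256 */
        *page = guest_index / guest_per_page;
        *slot = (guest_index % guest_per_page) * 2;
    }

    int main(void)
    {
        unsigned int page, slot;
        locate_shadow_l2e(700, &page, &slot);
        printf("guest l2e 700 -> shadow page %u, slots %u and %u\n",
               page, slot, slot + 1);
        return 0;
    }

For example, guest L2 entry 700 lands in the third shadow page at slots 376 and 377, which is what the program prints.
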
+ +#ifndef NDEBUG + // MMIO can only occur from L1e's + // + if ( mmio ) + CHECK(level == 1); + + // We should always have a pointer to the guest entry if it's a non-PSE + // non-MMIO demand access. + if ( ft & FETCH_TYPE_DEMAND ) + CHECK(guest_entry_ptr || level == 1); +#endif + + // A not-present guest entry has a special signature in the shadow table, + // so that we do not have to consult the guest tables multiple times... + // + if ( unlikely(!(gflags & _PAGE_PRESENT)) ) + return _PAGE_SHADOW_GUEST_NOT_PRESENT; + + // Must have a valid target_mfn, unless this is mmio, or unless this is a + // prefetch. In the case of a prefetch, an invalid mfn means that we can + // not usefully shadow anything, and so we return early. + // + if ( !valid_mfn(target_mfn) ) + { + CHECK((ft == ft_prefetch) || mmio); + if ( !mmio ) + return 0; + } + + // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's... + // + if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) ) + pass_thru_flags = _PAGE_PRESENT; + else + { + pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER | + _PAGE_RW | _PAGE_PRESENT); + if ( guest_supports_nx(v) ) + pass_thru_flags |= _PAGE_NX_BIT; + } + + // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their + // L3e's; they are all implied. So we emulate them here. + // + if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) ) + gflags = pass_thru_flags; + + // Propagate bits from the guest to the shadow. + // Some of these may be overwritten, below. + // Since we know the guest's PRESENT bit is set, we also set the shadow's + // SHADOW_PRESENT bit. + // + sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT; + + // Copy the guest's RW bit into the SHADOW_RW bit. + // + if ( gflags & _PAGE_RW ) + sflags |= _PAGE_SHADOW_RW; + + // Set the A&D bits for higher level shadows. + // Higher level entries do not, strictly speaking, have dirty bits, but + // since we use shadow linear tables, each of these entries may, at some + // point in time, also serve as a shadow L1 entry. + // By setting both the A&D bits in each of these, we eliminate the burden + // on the hardware to update these bits on initial accesses. + // + if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) ) + sflags |= _PAGE_ACCESSED | _PAGE_DIRTY; + + lowest_level_guest_mapping = + ((level == 1) || + ((level == 2) && guest_supports_superpages(v) && + (gflags & _PAGE_PSE))); + + // Set the A and D bits in the guest entry, if we need to. + if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) ) + gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft); + + // If the A or D bit has not yet been set in the guest, then we must + // prevent the corresponding kind of access. + // + if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) && + !(gflags & _PAGE_ACCESSED)) ) + sflags &= ~_PAGE_PRESENT; + + if ( unlikely(lowest_level_guest_mapping && + !(gflags & _PAGE_DIRTY)) ) + sflags &= ~_PAGE_RW; + + // MMIO caching + // + // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit + // to cache the fact that this entry is in MMIO space. + // + if ( (level == 1) && mmio ) + { + sflags &= ~(_PAGE_PRESENT); + sflags |= _PAGE_SHADOW_MMIO; + } + else + { + // shadow2_mode_log_dirty support + // + // Only allow the guest write access to a page a) on a demand fault, + // or b) if the page is already marked as dirty. 
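
A small aside on the log-dirty clause described just below (a sketch with stand-in helpers, not patch code): its whole job is to withhold _PAGE_RW from level-1 shadow entries until the frame has been marked dirty, so the first write faults and reaches sh2_do_mark_dirty().

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins for shadow2_mode_log_dirty() and sh2_mfn_is_dirty(). */
    static bool log_dirty_mode = true;
    static bool page_marked_dirty(unsigned long mfn) { (void)mfn; return false; }

    /* Demand writes keep _PAGE_RW (the fault itself is the dirtying event and
     * the caller marks the bitmap); prefetches and read faults of a not-yet-
     * dirty frame are installed read-only so the next write will fault. */
    static unsigned int apply_log_dirty(unsigned int sflags, unsigned long mfn,
                                        int level, bool demand_write)
    {
        if ( level == 1 && log_dirty_mode && !demand_write
             && !page_marked_dirty(mfn) )
            sflags &= ~0x002u;          /* _PAGE_RW on x86 */
        return sflags;
    }

    int main(void)
    {
        /* 0x067 = PRESENT|RW|USER|ACCESSED|DIRTY; a read fault drops RW. */
        printf("sflags %#x -> %#x\n", 0x067u,
               apply_log_dirty(0x067u, 0x1234, 1, false));
        return 0;
    }
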
+ // + if ( unlikely((level == 1) && + !(ft & FETCH_TYPE_WRITE) && + shadow2_mode_log_dirty(d) && + !sh2_mfn_is_dirty(d, target_mfn)) ) + { + sflags &= ~_PAGE_RW; + } + + // protect guest page tables + // + if ( unlikely((level == 1) && + sh2_mfn_is_a_page_table(target_mfn)) ) + { + if ( shadow2_mode_trap_reads(d) ) + { + // if we are trapping both reads & writes, then mark this page + // as not present... + // + sflags &= ~_PAGE_PRESENT; + } + else + { + // otherwise, just prevent any writes... + // + sflags &= ~_PAGE_RW; + } + } + } + + return sflags; +} + +#undef CHECK + +#if GUEST_PAGING_LEVELS >= 4 +static void +l4e_propagate_from_guest(struct vcpu *v, + guest_l4e_t *gl4e, + mfn_t gl4mfn, + mfn_t sl3mfn, + shadow_l4e_t *sl4p, + fetch_type_t ft) +{ + u32 gflags = guest_l4e_get_flags(*gl4e); + u32 sflags = sh2_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e, + gl4mfn, 0, 4, ft); + + *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags); + + SHADOW2_DEBUG(PROPAGATE, + "%s gl4e=%" SH2_PRI_gpte " sl4e=%" SH2_PRI_pte "\n", + fetch_type_names[ft], gl4e->l4, sl4p->l4); + ASSERT(sflags != -1); +} +#endif // GUEST_PAGING_LEVELS >= 4 + +#if GUEST_PAGING_LEVELS >= 3 +static void +l3e_propagate_from_guest(struct vcpu *v, + guest_l3e_t *gl3e, + mfn_t gl3mfn, + mfn_t sl2mfn, + shadow_l3e_t *sl3p, + fetch_type_t ft) +{ + u32 gflags = guest_l3e_get_flags(*gl3e); + u32 sflags = sh2_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e, + gl3mfn, 0, 3, ft); + + *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags); + + SHADOW2_DEBUG(PROPAGATE, + "%s gl3e=%" SH2_PRI_gpte " sl3e=%" SH2_PRI_pte "\n", + fetch_type_names[ft], gl3e->l3, sl3p->l3); + ASSERT(sflags != -1); +} +#endif // GUEST_PAGING_LEVELS >= 3 + +static void +l2e_propagate_from_guest(struct vcpu *v, + guest_l2e_t *gl2e, + mfn_t gl2mfn, + mfn_t sl1mfn, + shadow_l2e_t *sl2p, + fetch_type_t ft) +{ + u32 gflags = guest_l2e_get_flags(*gl2e); + u32 sflags = sh2_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, + gl2mfn, 0, 2, ft); + + *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags); + + SHADOW2_DEBUG(PROPAGATE, + "%s gl2e=%" SH2_PRI_gpte " sl2e=%" SH2_PRI_pte "\n", + fetch_type_names[ft], gl2e->l2, sl2p->l2); + ASSERT(sflags != -1); +} + +static inline int +l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, + int mmio) +/* returns 1 if emulation is required, and 0 otherwise */ +{ + struct domain *d = v->domain; + u32 gflags = guest_l1e_get_flags(gw->eff_l1e); + u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn, + mmio, 1, ft_demand_read); + + if ( shadow2_mode_trap_reads(d) && !mmio && sh2_mfn_is_a_page_table(gmfn) ) + { + // emulation required! + *sl1p = shadow_l1e_empty(); + return 1; + } + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW2_DEBUG(PROPAGATE, + "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n", + (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); + + ASSERT(sflags != -1); + return 0; +} + +static inline int +l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, + int mmio) +/* returns 1 if emulation is required, and 0 otherwise */ +{ + struct domain *d = v->domain; + u32 gflags = guest_l1e_get_flags(gw->eff_l1e); + u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn, + mmio, 1, ft_demand_write); + + sh2_mark_dirty(d, gmfn); + + if ( !mmio && sh2_mfn_is_a_page_table(gmfn) ) + { + // emulation required! 
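
One more illustrative aside (invented helper names, not patch code): the "emulation required" paths in l1e_read_fault() and l1e_write_fault() reduce to a small predicate. A fault on a frame that is itself a shadowed pagetable is not given a (writeable) mapping; the instruction is emulated instead, so the shadow code can observe the new guest PTE value.

    #include <stdbool.h>
    #include <stdio.h>

    /* Restatement of the "emulation required" tests: MMIO never needs it,
     * ordinary frames never need it; writes to a shadowed pagetable always
     * do, and reads only when the domain is in trap-reads mode. */
    static bool must_emulate(bool is_write, bool target_is_pagetable,
                             bool trap_reads_mode, bool is_mmio)
    {
        if ( is_mmio || !target_is_pagetable )
            return false;
        return is_write || trap_reads_mode;
    }

    int main(void)
    {
        printf("write to a guest pagetable -> emulate? %d\n",
               must_emulate(true, true, false, false));
        printf("read of a guest pagetable, no trap-reads -> emulate? %d\n",
               must_emulate(false, true, false, false));
        return 0;
    }
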
+ *sl1p = shadow_l1e_empty(); + return 1; + } + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW2_DEBUG(PROPAGATE, + "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n", + (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); + + ASSERT(sflags != -1); + return 0; +} + +static inline void +l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p, + int mmio) +{ + gfn_t gfn = guest_l1e_get_gfn(gl1e); + mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn); + u32 gflags = guest_l1e_get_flags(gl1e); + u32 sflags = sh2_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), + mmio, 1, ft_prefetch); + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW2_DEBUG(PROPAGATE, + "gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n", + gl1e.l1, sl1p->l1); + + ASSERT(sflags != -1); +} + + +/**************************************************************************/ +/* These functions update shadow entries (and do bookkeeping on the shadow + * tables they are in). It is intended that they are the only + * functions which ever write (non-zero) data onto a shadow page. + * + * They return a set of flags: + * SHADOW2_SET_CHANGED -- we actually wrote a new value to the shadow. + * SHADOW2_SET_FLUSH -- the caller must cause a TLB flush. + * SHADOW2_SET_ERROR -- the input is not a valid entry (for example, if + * shadow2_get_page_from_l1e() fails). + * SHADOW2_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local + * copies of their PAE L3 entries re-copied. + */ + +static inline void safe_write_entry(void *dst, void *src) +/* Copy one PTE safely when processors might be running on the + * destination pagetable. This does *not* give safety against + * concurrent writes (that's what the shadow lock is for), just + * stops the hardware picking up partially written entries. */ +{ + volatile unsigned long *d = dst; + unsigned long *s = src; + ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1))); +#if CONFIG_PAGING_LEVELS == 3 + /* In PAE mode, pagetable entries are larger + * than machine words, so won't get written atomically. We need to make + * sure any other cpu running on these shadows doesn't see a + * half-written entry. Do this by marking the entry not-present first, + * then writing the high word before the low word. */ + BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long)); + d[0] = 0; + d[1] = s[1]; + d[0] = s[0]; +#else + /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word, + * which will be an atomic write, since the entry is aligned. */ + BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long)); + *d = *s; +#endif +} + + +static inline void +shadow_write_entries(void *d, void *s, int entries, mfn_t mfn) +/* This function does the actual writes to shadow pages. + * It must not be called directly, since it doesn't do the bookkeeping + * that shadow_set_l*e() functions do. */ +{ + shadow_l1e_t *dst = d; + shadow_l1e_t *src = s; + void *map = NULL; + int i; + + /* Because we mirror access rights at all levels in the shadow, an + * l2 (or higher) entry with the RW bit cleared will leave us with + * no write access through the linear map. + * We detect that by writing to the shadow with copy_to_user() and + * using map_domain_page() to get a writeable mapping if we need to. 
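+ * (Note that the __copy_to_user() below copies the destination over
+ * itself: it is purely a probe for writeability of the linear mapping
+ * and never changes the entry.)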
*/ + if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) + { + perfc_incrc(shadow2_linear_map_failed); + map = sh2_map_domain_page(mfn); + ASSERT(map != NULL); + dst = map + ((unsigned long)dst & (PAGE_SIZE - 1)); + } + + + for ( i = 0; i < entries; i++ ) + safe_write_entry(dst++, src++); + + if ( map != NULL ) sh2_unmap_domain_page(map); + + /* XXX TODO: + * Update min/max field in page_info struct of this mfn */ +} + +static inline int +perms_strictly_increased(u32 old_flags, u32 new_flags) +/* Given the flags of two entries, are the new flags a strict + * increase in rights over the old ones? */ +{ + u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); + u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); + /* Flip the NX bit, since it's the only one that decreases rights; + * we calculate as if it were an "X" bit. */ + of ^= _PAGE_NX_BIT; + nf ^= _PAGE_NX_BIT; + /* If the changed bits are all set in the new flags, then rights strictly + * increased between old and new. */ + return ((of | (of ^ nf)) == nf); +} + +static int inline +shadow2_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) +{ + int res; + mfn_t mfn; + struct domain *owner; + shadow_l1e_t sanitized_sl1e = + shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT); + + //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT); + //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0); + + if ( !shadow2_mode_refcounts(d) ) + return 1; + + res = get_page_from_l1e(sanitized_sl1e, d); + + // If a privileged domain is attempting to install a map of a page it does + // not own, we let it succeed anyway. + // + if ( unlikely(!res) && + IS_PRIV(d) && + !shadow2_mode_translate(d) && + valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) && + (owner = page_get_owner(mfn_to_page(mfn))) && + (d != owner) ) + { + res = get_page_from_l1e(sanitized_sl1e, owner); + SHADOW2_PRINTK("privileged domain %d installs map of mfn %05lx " + "which is owned by domain %d: %s\n", + d->domain_id, mfn_x(mfn), owner->domain_id, + res ? "success" : "failed"); + } + + if ( unlikely(!res) ) + { + perfc_incrc(shadow2_get_page_fail); + SHADOW2_PRINTK("failed: l1e=" SH2_PRI_pte "\n"); + } + + return res; +} + +static void inline +shadow2_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) +{ + if ( !shadow2_mode_refcounts(d) ) + return; + + put_page_from_l1e(sl1e, d); +} + +#if GUEST_PAGING_LEVELS >= 4 +static int shadow_set_l4e(struct vcpu *v, + shadow_l4e_t *sl4e, + shadow_l4e_t new_sl4e, + mfn_t sl4mfn) +{ + int flags = 0; + shadow_l4e_t old_sl4e; + paddr_t paddr; + ASSERT(sl4e != NULL); + old_sl4e = *sl4e; + + if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */ + + paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) + | (((unsigned long)sl4e) & ~PAGE_MASK)); + + if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) + { + /* About to install a new reference */ + sh2_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr); + } + + /* Write the new entry */ + shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn); + flags |= SHADOW2_SET_CHANGED; + + if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. 
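+         * If the new entry points at a different shadow, or the rights did
+         * not strictly increase (e.g. old = PRESENT|RW, new = PRESENT only),
+         * stale TLB entries could still grant access through the old
+         * mapping, so report SHADOW2_SET_FLUSH to the caller.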
*/
+        mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
+        if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
+             || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
+                                          shadow_l4e_get_flags(new_sl4e)) )
+        {
+            flags |= SHADOW2_SET_FLUSH;
+        }
+        sh2_put_ref(v, osl3mfn, paddr);
+    }
+    return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if GUEST_PAGING_LEVELS >= 3
+static int shadow_set_l3e(struct vcpu *v,
+                          shadow_l3e_t *sl3e,
+                          shadow_l3e_t new_sl3e,
+                          mfn_t sl3mfn)
+{
+    int flags = 0;
+    shadow_l3e_t old_sl3e;
+    paddr_t paddr;
+    ASSERT(sl3e != NULL);
+    old_sl3e = *sl3e;
+
+    if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
+
+    paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+             | (((unsigned long)sl3e) & ~PAGE_MASK));
+
+    if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        sh2_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
+    }
+
+    /* Write the new entry */
+    shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+    flags |= SHADOW2_SET_CHANGED;
+
+#if GUEST_PAGING_LEVELS == 3
+    /* We wrote a guest l3e in a PAE pagetable.  This table is copied in
+     * the linear pagetable entries of its l2s, and may also be copied
+     * to a low memory location to make it fit in CR3.  Report that we
+     * need to resync those copies (we can't wait for the guest to flush
+     * the TLB because it might be an increase in rights). */
+    {
+        struct vcpu *vcpu;
+
+        struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
+        for_each_vcpu(v->domain, vcpu)
+        {
+            if (info->vcpus & (1 << vcpu->vcpu_id))
+            {
+                // Remember that this flip/update needs to occur.
+                vcpu->arch.shadow2_pae_flip_pending = 1;
+                flags |= SHADOW2_SET_L3PAE_RECOPY;
+            }
+        }
+    }
+#endif
+
+    if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
+        if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
+             !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
+                                       shadow_l3e_get_flags(new_sl3e)) )
+        {
+            flags |= SHADOW2_SET_FLUSH;
+        }
+        sh2_put_ref(v, osl2mfn, paddr);
+    }
+    return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static int shadow_set_l2e(struct vcpu *v,
+                          shadow_l2e_t *sl2e,
+                          shadow_l2e_t new_sl2e,
+                          mfn_t sl2mfn)
+{
+    int flags = 0;
+    shadow_l2e_t old_sl2e;
+    paddr_t paddr;
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    /* In 2-on-3 we work with pairs of l2es pointing at two-page
+     * shadows.  Reference counting and up-pointers track from the first
+     * page of the shadow to the first l2e, so make sure that we're
+     * working with those:
+     * Align the pointer down so it's pointing at the first of the pair */
+    sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
+    /* Align the mfn of the shadow entry too */
+    new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
+#endif
+
+    ASSERT(sl2e != NULL);
+    old_sl2e = *sl2e;
+
+    if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
+
+    paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+             | (((unsigned long)sl2e) & ~PAGE_MASK));
+
+    if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        sh2_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
+    }
+
+    /* Write the new entry */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    {
+        shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
+        /* The l1 shadow is two pages long and needs to be pointed to by
+         * two adjacent l2es.  The pair have the same flags, but point
+         * at odd and even MFNs */
+        ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
+        pair[1].l2 |= (1<<PAGE_SHIFT);
+        shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+    }
+#else /* normal case */
+    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+#endif
+    flags |= SHADOW2_SET_CHANGED;
+
+    if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
+        if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
+             !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
+                                       shadow_l2e_get_flags(new_sl2e)) )
+        {
+            flags |= SHADOW2_SET_FLUSH;
+        }
+        sh2_put_ref(v, osl1mfn, paddr);
+    }
+    return flags;
+}
+
+static int shadow_set_l1e(struct vcpu *v,
+                          shadow_l1e_t *sl1e,
+                          shadow_l1e_t new_sl1e,
+                          mfn_t sl1mfn)
+{
+    int flags = 0;
+    struct domain *d = v->domain;
+    shadow_l1e_t old_sl1e;
+    ASSERT(sl1e != NULL);
+
+    old_sl1e = *sl1e;
+
+    if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
+
+    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT )
+    {
+        /* About to install a new reference */
+        if ( shadow2_mode_refcounts(d) ) {
+            if ( shadow2_get_page_from_l1e(new_sl1e, d) == 0 )
+            {
+                /* Doesn't look like a pagetable.
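+                 * shadow2_get_page_from_l1e() refused to take a reference
+                 * (bad mfn or disallowed flags), so install an empty entry
+                 * instead and report SHADOW2_SET_ERROR to the caller.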
*/ + flags |= SHADOW2_SET_ERROR; + new_sl1e = shadow_l1e_empty(); + } + } + } + + /* Write the new entry */ + shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn); + flags |= SHADOW2_SET_CHANGED; + + if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. */ + /* N.B. Unlike higher-level sets, never need an extra flush + * when writing an l1e. Because it points to the same guest frame + * as the guest l1e did, it's the guest's responsibility to + * trigger a flush later. */ + if ( shadow2_mode_refcounts(d) ) + { + shadow2_put_page_from_l1e(old_sl1e, d); + } + } + return flags; +} + + +/**************************************************************************/ +/* These functions take a vcpu and a virtual address, and return a pointer + * to the appropriate level N entry from the shadow tables. + * If the necessary tables are not present in the shadow, they return NULL. */ + +/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has + * more levels than the guest, the upper levels are always fixed and do not + * reflect any information from the guest, so we do not use these functions + * to access them. */ + +#if GUEST_PAGING_LEVELS >= 4 +static shadow_l4e_t * +shadow_get_l4e(struct vcpu *v, unsigned long va) +{ + /* Reading the top level table is always valid. */ + return sh2_linear_l4_table(v) + shadow_l4_linear_offset(va); +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#if GUEST_PAGING_LEVELS >= 3 +static shadow_l3e_t * +shadow_get_l3e(struct vcpu *v, unsigned long va) +{ +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ + /* Get the l4 */ + shadow_l4e_t *sl4e = shadow_get_l4e(v, va); + ASSERT(sl4e != NULL); + if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e))); + /* l4 was present; OK to get the l3 */ + return sh2_linear_l3_table(v) + shadow_l3_linear_offset(va); +#else /* PAE... */ + /* Top level is always mapped */ + ASSERT(v->arch.shadow_vtable); + return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va); +#endif +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + + +static shadow_l2e_t * +shadow_get_l2e(struct vcpu *v, unsigned long va) +{ +#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... */ + /* Get the l3 */ + shadow_l3e_t *sl3e = shadow_get_l3e(v, va); + if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e))); + /* l3 was present; OK to get the l2 */ +#endif + return sh2_linear_l2_table(v) + shadow_l2_linear_offset(va); +} + + +#if 0 // avoid the compiler warning for now... + +static shadow_l1e_t * +shadow_get_l1e(struct vcpu *v, unsigned long va) +{ + /* Get the l2 */ + shadow_l2e_t *sl2e = shadow_get_l2e(v, va); + if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e))); + /* l2 was present; OK to get the l1 */ + return sh2_linear_l1_table(v) + shadow_l1_linear_offset(va); +} + +#endif + + +/**************************************************************************/ +/* Macros to walk pagetables. These take the shadow of a pagetable and + * walk every "interesting" entry. That is, they don't touch Xen mappings, + * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every + * second entry (since pairs of entries are managed together). For multi-page + * shadows they walk all pages. 
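+ * (The destructors below, e.g. sh2_destroy_l2_shadow(), are typical users:
+ * they visit every present entry to drop its reference before the shadow
+ * page is freed.)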
+ * + * Arguments are an MFN, the variable to point to each entry, a variable + * to indicate that we are done (we will shortcut to the end of the scan + * when _done != 0), a variable to indicate that we should avoid Xen mappings, + * and the code. + * + * WARNING: These macros have side-effects. They change the values of both + * the pointer and the MFN. */ + +static inline void increment_ptr_to_guest_entry(void *ptr) +{ + if ( ptr ) + { + guest_l1e_t **entry = ptr; + (*entry)++; + } +} + +/* All kinds of l1: touch all entries */ +#define _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ +do { \ + int _i; \ + shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \ + ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l1_shadow \ + || (mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_fl1_shadow); \ + for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl1e) = _sp + _i; \ + if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl1p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */ +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 +#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ +do { \ + int __done = 0; \ + _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ + ({ (__done = _done); }), _code); \ + _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \ + if ( !__done ) \ + _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ + ({ (__done = _done); }), _code); \ +} while (0) +#else /* Everything else; l1 shadows are only one page */ +#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ + _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) +#endif + + +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 + +/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */ +#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i, _j, __done = 0; \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l2_32_shadow); \ + for ( _j = 0; _j < 4 && !__done; _j++ ) \ + { \ + shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \ + if ( (!(_xen)) \ + || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \ + < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( (__done = (_done)) ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ + _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \ + } \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 2 + +/* 32-bit on 32-bit: avoid Xen entries */ +#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l2_32_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + if ( (!(_xen)) \ + || \ + (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 3 + +/* PAE: if it's an l2h, don't touch Xen mappings */ +#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, 
_code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l2_pae_shadow \ + || (mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l2h_pae_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + if ( (!(_xen)) \ + || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ + != PGC_SH2_l2h_pae_shadow) \ + || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \ + < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#else + +/* 64-bit l2: touch all entries */ +#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l2_64_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#endif /* different kinds of l2 */ + +#if GUEST_PAGING_LEVELS == 3 + +/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */ +#define SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \ +do { \ + int _i; \ + for ( _i = 0; _i < 4; _i++ ) \ + { \ + if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + _sl3e++; \ + increment_ptr_to_guest_entry(_gl3p); \ + } \ +} while (0) + +/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */ +#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ +do { \ + int _i, _j, _k, __done = 0; \ + ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l3_pae_shadow); \ + /* The subshadows are split, 64 on each page of the shadow */ \ + for ( _j = 0; _j < 2 && !__done; _j++ ) \ + { \ + void *_sp = sh2_map_domain_page(_sl3mfn); \ + for ( _i = 0; _i < 64; _i++ ) \ + { \ + /* Every second 32-byte region is a bookkeeping entry */ \ + _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \ + if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \ + SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, \ + ({ __done = (_done); __done; }), \ + _code); \ + else \ + for ( _k = 0 ; _k < 4 ; _k++ ) \ + increment_ptr_to_guest_entry(_gl3p); \ + if ( __done ) break; \ + } \ + sh2_unmap_domain_page(_sp); \ + _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \ + } \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 4 + +/* 64-bit l3: touch all entries */ +#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ +do { \ + int _i; \ + shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \ + ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \ + == PGC_SH2_l3_64_shadow); \ + for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl3e) = _sp + _i; \ + if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl3p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +/* 64-bit l4: avoid Xen mappings */ +#define SHADOW2_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \ + ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH2_type_mask) \ + == 
PGC_SH2_l4_64_shadow); \ + for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \ + { \ + if ( (!(_xen)) || is_guest_l4_slot(_i) ) \ + { \ + (_sl4e) = _sp + _i; \ + if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + } \ + increment_ptr_to_guest_entry(_gl4p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#endif + + + +/**************************************************************************/ +/* Functions to install Xen mappings and linear mappings in shadow pages */ + +static mfn_t sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type); + +// XXX -- this function should probably be moved to shadow2-common.c, but that +// probably wants to wait until the shadow types have been moved from +// shadow2-types.h to shadow2-private.h +// +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 +void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn) +{ + struct domain *d = v->domain; + shadow_l4e_t *sl4e; + + sl4e = sh2_map_domain_page(sl4mfn); + ASSERT(sl4e != NULL); + ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], + &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], + ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] = + shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)), + __PAGE_HYPERVISOR); + + /* Linear mapping */ + sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); + sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); + + if ( shadow2_mode_translate(v->domain) ) + { + /* install domain-specific P2M table */ + sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = + shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table), + __PAGE_HYPERVISOR); + } + + sh2_unmap_domain_page(sl4e); +} +#endif + +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 +// For 3-on-3 PV guests, we need to make sure the xen mappings are in +// place, which means that we need to populate the l2h entry in the l3 +// table. + +void sh2_install_xen_entries_in_l2h(struct vcpu *v, + mfn_t sl2hmfn) +{ + struct domain *d = v->domain; + shadow_l2e_t *sl2e; + int i; + + sl2e = sh2_map_domain_page(sl2hmfn); + ASSERT(sl2e != NULL); + ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)], + &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT], + L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = + shadow_l2e_from_mfn( + page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), + __PAGE_HYPERVISOR); + + /* We don't set up a linear mapping here because we can't until this + * l2h is installed in an l3e. sh2_update_linear_entries() handles + * the linear mappings when the l3 is loaded. 
*/ + + if ( shadow2_mode_translate(d) ) + { + /* Install the domain-specific p2m table */ + l3_pgentry_t *p2m; + ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); + p2m = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); + for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ ) + { + sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] = + shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])), + __PAGE_HYPERVISOR); + } + sh2_unmap_domain_page(p2m); + } + + sh2_unmap_domain_page(sl2e); +} + +void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn) +{ + shadow_l3e_t *sl3e; + guest_l3e_t *gl3e = v->arch.guest_vtable; + shadow_l3e_t new_sl3e; + gfn_t l2gfn; + mfn_t l2gmfn, l2smfn; + int r; + + ASSERT(!shadow2_mode_external(v->domain)); + ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT); + l2gfn = guest_l3e_get_gfn(gl3e[3]); + l2gmfn = sh2_gfn_to_mfn(v->domain, gfn_x(l2gfn)); + l2smfn = get_shadow_status(v, l2gmfn, PGC_SH2_l2h_shadow); + if ( !valid_mfn(l2smfn) ) + { + l2smfn = sh2_make_shadow(v, l2gmfn, PGC_SH2_l2h_shadow); + } + l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e, + ft_prefetch); + sl3e = sh2_map_domain_page(sl3mfn); + r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn); + sh2_unmap_domain_page(sl3e); +} +#endif + + +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 +void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn) +{ + struct domain *d = v->domain; + shadow_l2e_t *sl2e; + int i; + + sl2e = sh2_map_domain_page(sl2mfn); + ASSERT(sl2e != NULL); + ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT], + &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT], + L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = + shadow_l2e_from_mfn( + page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), + __PAGE_HYPERVISOR); + + /* Linear mapping */ + sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR); + sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR); + + if ( shadow2_mode_translate(d) ) + { + /* install domain-specific P2M table */ + sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] = + shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table), + __PAGE_HYPERVISOR); + } + + sh2_unmap_domain_page(sl2e); +} +#endif + + + + + +/**************************************************************************/ +/* Create a shadow of a given guest page. + */ +static mfn_t +sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type) +{ + mfn_t smfn = shadow2_alloc(v->domain, shadow_type, mfn_x(gmfn)); + SHADOW2_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n", + mfn_x(gmfn), shadow_type, mfn_x(smfn)); + + if ( shadow_type != PGC_SH2_guest_root_type ) + /* Lower-level shadow, not yet linked form a higher level */ + mfn_to_page(smfn)->up = 0; + + // Create the Xen mappings... 
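+    // (External-mode guests never see Xen's mappings through their shadows;
+    // those live in the monitor table instead, so this step is skipped
+    // for them.)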
+ if ( !shadow2_mode_external(v->domain) ) + { + switch (shadow_type) + { +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 + case PGC_SH2_l4_shadow: + sh2_install_xen_entries_in_l4(v, gmfn, smfn); break; +#endif +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 + case PGC_SH2_l3_shadow: + sh2_install_xen_entries_in_l3(v, gmfn, smfn); break; + case PGC_SH2_l2h_shadow: + sh2_install_xen_entries_in_l2h(v, smfn); break; +#endif +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 + case PGC_SH2_l2_shadow: + sh2_install_xen_entries_in_l2(v, gmfn, smfn); break; +#endif + default: /* Do nothing */ break; + } + } + + shadow2_promote(v, gmfn, shadow_type); + set_shadow2_status(v, gmfn, shadow_type, smfn); + + return smfn; +} + +/* Make a splintered superpage shadow */ +static mfn_t +make_fl1_shadow(struct vcpu *v, gfn_t gfn) +{ + mfn_t smfn = shadow2_alloc(v->domain, PGC_SH2_fl1_shadow, + (unsigned long) gfn_x(gfn)); + + SHADOW2_DEBUG(MAKE_SHADOW, "(%" SH2_PRI_gfn ")=>%" SH2_PRI_mfn "\n", + gfn_x(gfn), mfn_x(smfn)); + + set_fl1_shadow_status(v, gfn, smfn); + return smfn; +} + + +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS +mfn_t +sh2_make_monitor_table(struct vcpu *v) +{ + + ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0); + +#if CONFIG_PAGING_LEVELS == 4 + { + struct domain *d = v->domain; + mfn_t m4mfn; + m4mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); + sh2_install_xen_entries_in_l4(v, m4mfn, m4mfn); + /* Remember the level of this table */ + mfn_to_page(m4mfn)->shadow2_flags = 4; +#if SHADOW_PAGING_LEVELS < 4 + // Install a monitor l3 table in slot 0 of the l4 table. + // This is used for shadow linear maps. + { + mfn_t m3mfn; + l4_pgentry_t *l4e; + m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); + mfn_to_page(m3mfn)->shadow2_flags = 3; + l4e = sh2_map_domain_page(m4mfn); + l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR); + sh2_unmap_domain_page(l4e); + } +#endif /* SHADOW_PAGING_LEVELS < 4 */ + return m4mfn; + } + +#elif CONFIG_PAGING_LEVELS == 3 + + { + struct domain *d = v->domain; + mfn_t m3mfn, m2mfn; + l3_pgentry_t *l3e; + l2_pgentry_t *l2e; + int i; + + m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); + /* Remember the level of this table */ + mfn_to_page(m3mfn)->shadow2_flags = 3; + + // Install a monitor l2 table in slot 3 of the l3 table. + // This is used for all Xen entries, including linear maps + m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); + mfn_to_page(m2mfn)->shadow2_flags = 2; + l3e = sh2_map_domain_page(m3mfn); + l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT); + sh2_install_xen_entries_in_l2h(v, m2mfn); + /* Install the monitor's own linear map */ + l2e = sh2_map_domain_page(m2mfn); + for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] = + (l3e_get_flags(l3e[i]) & _PAGE_PRESENT) + ? 
l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR) + : l2e_empty(); + sh2_unmap_domain_page(l2e); + sh2_unmap_domain_page(l3e); + + SHADOW2_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn)); + return m3mfn; + } + +#elif CONFIG_PAGING_LEVELS == 2 + + { + struct domain *d = v->domain; + mfn_t m2mfn; + m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0); + sh2_install_xen_entries_in_l2(v, m2mfn, m2mfn); + /* Remember the level of this table */ + mfn_to_page(m2mfn)->shadow2_flags = 2; + return m2mfn; + } + +#else +#error this should not happen +#endif /* CONFIG_PAGING_LEVELS */ +} +#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */ + +/**************************************************************************/ +/* These functions also take a virtual address and return the level-N + * shadow table mfn and entry, but they create the shadow pagetables if + * they are needed. The "demand" argument is non-zero when handling + * a demand fault (so we know what to do about accessed bits &c). + * If the necessary tables are not present in the guest, they return NULL. */ +#if GUEST_PAGING_LEVELS >= 4 +static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, + walk_t *gw, + mfn_t *sl4mfn) +{ + /* There is always a shadow of the top level table. Get it. */ + *sl4mfn = pagetable_get_mfn(v->arch.shadow_table); + /* Reading the top level table is always valid. */ + return sh2_linear_l4_table(v) + shadow_l4_linear_offset(gw->va); +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#if GUEST_PAGING_LEVELS >= 3 +static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, + walk_t *gw, + mfn_t *sl3mfn, + fetch_type_t ft) +{ +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ + mfn_t sl4mfn; + shadow_l4e_t *sl4e; + if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */ + /* Get the l4e */ + sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn); + ASSERT(sl4e != NULL); + if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) + { + *sl3mfn = shadow_l4e_get_mfn(*sl4e); + ASSERT(valid_mfn(*sl3mfn)); + } + else + { + int r; + shadow_l4e_t new_sl4e; + /* No l3 shadow installed: find and install it. */ + *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH2_l3_shadow); + if ( !valid_mfn(*sl3mfn) ) + { + /* No l3 shadow of this page exists at all: make one. */ + *sl3mfn = sh2_make_shadow(v, gw->l3mfn, PGC_SH2_l3_shadow); + } + /* Install the new sl3 table in the sl4e */ + l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn, + *sl3mfn, &new_sl4e, ft); + r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn); + ASSERT((r & SHADOW2_SET_FLUSH) == 0); + } + /* Now follow it down a level. Guaranteed to succeed. */ + return sh2_linear_l3_table(v) + shadow_l3_linear_offset(gw->va); +#else /* PAE... */ + /* There is always a shadow of the top level table. Get it. */ + *sl3mfn = pagetable_get_mfn(v->arch.shadow_table); + /* This next line is important: the shadow l3 table is in an 8k + * shadow and we need to return the right mfn of the pair. This call + * will set it for us as a side-effect. */ + (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e)); + ASSERT(v->arch.shadow_vtable); + return ((shadow_l3e_t *)v->arch.shadow_vtable) + + shadow_l3_table_offset(gw->va); +#endif /* GUEST_PAGING_LEVELS >= 4 */ +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + + +static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, + walk_t *gw, + mfn_t *sl2mfn, + fetch_type_t ft) +{ +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */ + mfn_t sl3mfn = _mfn(INVALID_MFN); + shadow_l3e_t *sl3e; + if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. 
*/ + /* Get the l3e */ + sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft); + ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */ + if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) + { + *sl2mfn = shadow_l3e_get_mfn(*sl3e); + ASSERT(valid_mfn(*sl2mfn)); + } + else + { + int r; + shadow_l3e_t new_sl3e; + /* No l2 shadow installed: find and install it. */ + *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH2_l2_shadow); + if ( !valid_mfn(*sl2mfn) ) + { + /* No l2 shadow of this page exists at all: make one. */ + *sl2mfn = sh2_make_shadow(v, gw->l2mfn, PGC_SH2_l2_shadow); + } + /* Install the new sl2 table in the sl3e */ + l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn, + *sl2mfn, &new_sl3e, ft); + r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn); + ASSERT((r & SHADOW2_SET_FLUSH) == 0); +#if GUEST_PAGING_LEVELS == 3 + /* Need to sync up the linear maps, as we are about to use them */ + ASSERT( r & SHADOW2_SET_L3PAE_RECOPY ); + sh2_pae_recopy(v->domain); +#endif + } + /* Now follow it down a level. Guaranteed to succeed. */ + return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); +#else /* 32bit... */ + /* There is always a shadow of the top level table. Get it. */ + *sl2mfn = pagetable_get_mfn(v->arch.shadow_table); + /* This next line is important: the guest l2 has a 16k + * shadow, we need to return the right mfn of the four. This + * call will set it for us as a side-effect. */ + (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e)); + /* Reading the top level table is always valid. */ + return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); +#endif +} + + +static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, + walk_t *gw, + mfn_t *sl1mfn, + fetch_type_t ft) +{ + mfn_t sl2mfn; + shadow_l2e_t *sl2e; + + /* Get the l2e */ + sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft); + if ( sl2e == NULL ) return NULL; + if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) + { + *sl1mfn = shadow_l2e_get_mfn(*sl2e); + ASSERT(valid_mfn(*sl1mfn)); + } + else + { + shadow_l2e_t new_sl2e; + int r, flags = guest_l2e_get_flags(*gw->l2e); + /* No l1 shadow installed: find and install it. */ + if ( !(flags & _PAGE_PRESENT) ) + return NULL; /* No guest page. */ + if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) + { + /* Splintering a superpage */ + gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e); + *sl1mfn = get_fl1_shadow_status(v, l2gfn); + if ( !valid_mfn(*sl1mfn) ) + { + /* No fl1 shadow of this superpage exists at all: make one. */ + *sl1mfn = make_fl1_shadow(v, l2gfn); + } + } + else + { + /* Shadowing an actual guest l1 table */ + if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */ + *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH2_l1_shadow); + if ( !valid_mfn(*sl1mfn) ) + { + /* No l1 shadow of this page exists at all: make one. */ + *sl1mfn = sh2_make_shadow(v, gw->l1mfn, PGC_SH2_l1_shadow); + } + } + /* Install the new sl1 table in the sl2e */ + l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn, + *sl1mfn, &new_sl2e, ft); + r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn); + ASSERT((r & SHADOW2_SET_FLUSH) == 0); + /* This next line is important: in 32-on-PAE and 32-on-64 modes, + * the guest l1 table has an 8k shadow, and we need to return + * the right mfn of the pair. This call will set it for us as a + * side-effect. (In all other cases, it's a no-op and will be + * compiled out.) */ + (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va)); + } + /* Now follow it down a level. Guaranteed to succeed. 
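+     * (the sl2e we have just written or found is present, so the shadow
+     * linear map covers this va and the l1 pointer below is safe to use)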
*/ + return sh2_linear_l1_table(v) + shadow_l1_linear_offset(gw->va); +} + + + +/**************************************************************************/ +/* Destructors for shadow tables: + * Unregister the shadow, decrement refcounts of any entries present in it, + * and release the memory. + * + * N.B. These destructors do not clear the contents of the shadows. + * This allows us to delay TLB shootdowns until the page is being reused. + * See shadow2_alloc() and shadow2_free() for how this is handled. + */ + +#if GUEST_PAGING_LEVELS >= 4 +void sh2_destroy_l4_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l4e_t *sl4e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; + mfn_t gmfn, sl4mfn; + int xen_mappings; + + SHADOW2_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH2_l4_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow2_status(v, gmfn, t, smfn); + shadow2_demote(v, gmfn, t); + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); + + /* Decrement refcounts of all the old entries */ + xen_mappings = (!shadow2_mode_external(v->domain)); + sl4mfn = smfn; + SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { + if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) + { + sh2_put_ref(v, shadow_l4e_get_mfn(*sl4e), + (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) + | ((unsigned long)sl4e & ~PAGE_MASK)); + } + }); + + /* Put the memory back in the pool */ + shadow2_free(v->domain, smfn); +} +#endif + +#if GUEST_PAGING_LEVELS >= 3 +void sh2_destroy_l3_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l3e_t *sl3e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; + mfn_t gmfn, sl3mfn; + + SHADOW2_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH2_l3_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow2_status(v, gmfn, t, smfn); + shadow2_demote(v, gmfn, t); +#if GUEST_PAGING_LEVELS == 3 + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); +#endif + + /* Decrement refcounts of all the old entries */ + sl3mfn = smfn; + SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { + if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) + sh2_put_ref(v, shadow_l3e_get_mfn(*sl3e), + (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) + | ((unsigned long)sl3e & ~PAGE_MASK)); + }); + + /* Put the memory back in the pool */ + shadow2_free(v->domain, smfn); +} +#endif + + +#if GUEST_PAGING_LEVELS == 3 +static void sh2_destroy_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e) +/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. 
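+ * (A PAE l3 shadow packs 64 of these 4-entry subshadows into each of its
+ * two pages, with every second 32-byte region used for bookkeeping -- see
+ * sh2_unpin_all_l3_subshadows() below.)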
*/ +{ + int i; + ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0); + for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ ) + if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT ) + sh2_put_ref(v, shadow_l3e_get_mfn(sl3e[i]), + mapped_domain_page_to_maddr(sl3e)); +} +#endif + +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +void sh2_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn) +/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */ +{ + int i, j; + struct pae_l3_bookkeeping *bk; + + ASSERT((mfn_to_page(smfn)->count_info & PGC_SH2_type_mask) + == PGC_SH2_l3_pae_shadow); + /* The subshadows are split, 64 on each page of the shadow */ + for ( i = 0; i < 2; i++ ) + { + void *p = sh2_map_domain_page(_mfn(mfn_x(smfn) + i)); + for ( j = 0; j < 64; j++ ) + { + /* Every second 32-byte region is a bookkeeping entry */ + bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32); + if ( bk->pinned ) + sh2_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn); + /* Check whether we've just freed the whole shadow */ + if ( (mfn_to_page(smfn)->count_info & PGC_SH2_count_mask) == 0 ) + { + sh2_unmap_domain_page(p); + return; + } + } + sh2_unmap_domain_page(p); + } +} +#endif + +void sh2_destroy_l2_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l2e_t *sl2e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; + mfn_t gmfn, sl2mfn; + int xen_mappings; + + SHADOW2_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH2_l2_shadow + || t == PGC_SH2_l2h_pae_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow2_status(v, gmfn, t, smfn); + shadow2_demote(v, gmfn, t); +#if GUEST_PAGING_LEVELS == 2 + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); +#endif + + /* Decrement refcounts of all the old entries */ + sl2mfn = smfn; + xen_mappings = (!shadow2_mode_external(v->domain) && + ((GUEST_PAGING_LEVELS == 2) || + ((GUEST_PAGING_LEVELS == 3) && + (t == PGC_SH2_l2h_pae_shadow)))); + SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { + if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) + sh2_put_ref(v, shadow_l2e_get_mfn(*sl2e), + (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) + | ((unsigned long)sl2e & ~PAGE_MASK)); + }); + + /* Put the memory back in the pool */ + shadow2_free(v->domain, smfn); +} + +void sh2_destroy_l1_shadow(struct vcpu *v, mfn_t smfn) +{ + struct domain *d = v->domain; + shadow_l1e_t *sl1e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask; + + SHADOW2_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH2_l1_shadow || t == PGC_SH2_fl1_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + if ( t == PGC_SH2_fl1_shadow ) + { + gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_fl1_shadow_status(v, gfn, smfn); + } + else + { + mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow2_status(v, gmfn, t, smfn); + shadow2_demote(v, gmfn, t); + } + + if ( shadow2_mode_refcounts(d) ) + { + /* Decrement refcounts of all the old entries */ + mfn_t sl1mfn = smfn; + SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0, { + if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) + shadow2_put_page_from_l1e(*sl1e, d); + }); + } + + /* Put the memory back in the pool */ + shadow2_free(v->domain, smfn); +} + +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS +void sh2_destroy_monitor_table(struct vcpu *v, 
mfn_t mmfn) +{ + struct domain *d = v->domain; + ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH2_type_mask) + == PGC_SH2_monitor_table); + +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4) + /* Need to destroy the l3 monitor page in slot 0 too */ + { + l4_pgentry_t *l4e = sh2_map_domain_page(mmfn); + ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); + shadow2_free(d, _mfn(l4e_get_pfn(l4e[0]))); + sh2_unmap_domain_page(l4e); + } +#elif CONFIG_PAGING_LEVELS == 3 + /* Need to destroy the l2 monitor page in slot 4 too */ + { + l3_pgentry_t *l3e = sh2_map_domain_page(mmfn); + ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); + shadow2_free(d, _mfn(l3e_get_pfn(l3e[3]))); + sh2_unmap_domain_page(l3e); + } +#endif + + /* Put the memory back in the pool */ + shadow2_free(d, mmfn); +} +#endif + +/**************************************************************************/ +/* Functions to destroy non-Xen mappings in a pagetable hierarchy. + * These are called from common code when we are running out of shadow + * memory, and unpinning all the top-level shadows hasn't worked. + * + * This implementation is pretty crude and slow, but we hope that it won't + * be called very often. */ + +#if GUEST_PAGING_LEVELS == 2 + +void sh2_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn) +{ + shadow_l2e_t *sl2e; + int xen_mappings = !shadow2_mode_external(v->domain); + SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { + (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + }); +} + +#elif GUEST_PAGING_LEVELS == 3 + +void sh2_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn) +/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */ +{ + shadow_l3e_t *sl3e; + SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { + if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) { + mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e); + if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) + == PGC_SH2_l2h_pae_shadow ) + { + /* High l2: need to pick particular l2es to unhook */ + shadow_l2e_t *sl2e; + SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, { + (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + }); + } + else + { + /* Normal l2: can safely unhook the whole l3e */ + (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); + } + } + }); + /* We've changed PAE L3 entries: must sync up various copies of them */ + sh2_pae_recopy(v->domain); +} + +#elif GUEST_PAGING_LEVELS == 4 + +void sh2_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn) +{ + shadow_l4e_t *sl4e; + int xen_mappings = !shadow2_mode_external(v->domain); + SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { + (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); + }); +} + +#endif + +/**************************************************************************/ +/* Internal translation functions. + * These functions require a pointer to the shadow entry that will be updated. + */ + +/* These functions take a new guest entry, translate it to shadow and write + * the shadow entry. + * + * They return the same bitmaps as the shadow_set_lXe() functions. 
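+ * (For instance, a caller that sees SHADOW2_SET_FLUSH in the result must
+ * cause a TLB flush before the guest runs on the updated shadow.)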
+ */ + +#if GUEST_PAGING_LEVELS >= 4 +static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se) +{ + shadow_l4e_t new_sl4e; + guest_l4e_t *new_gl4e = new_ge; + shadow_l4e_t *sl4p = se; + mfn_t sl3mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow2_validate_gl4e_calls); + + if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT ) + { + gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e); + mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn); + if ( valid_mfn(gl3mfn) ) + sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH2_l3_shadow); + else + result |= SHADOW2_SET_ERROR; + } + l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN), + sl3mfn, &new_sl4e, ft_prefetch); + result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn); + return result; +} +#endif // GUEST_PAGING_LEVELS >= 4 + +#if GUEST_PAGING_LEVELS >= 3 +static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se) +{ + shadow_l3e_t new_sl3e; + guest_l3e_t *new_gl3e = new_ge; + shadow_l3e_t *sl3p = se; + mfn_t sl2mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow2_validate_gl3e_calls); + + if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT ) + { + gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e); + mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn); + if ( valid_mfn(gl2mfn) ) + sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH2_l2_shadow); + else + result |= SHADOW2_SET_ERROR; + } + l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN), + sl2mfn, &new_sl3e, ft_prefetch); + result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn); + +#if GUEST_PAGING_LEVELS == 3 + /* We have changed a PAE l3 entry: need to sync up the possible copies + * of it */ + if ( result & SHADOW2_SET_L3PAE_RECOPY ) + sh2_pae_recopy(v->domain); +#endif + + return result; +} +#endif // GUEST_PAGING_LEVELS >= 3 + +static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se) +{ + shadow_l2e_t new_sl2e; + guest_l2e_t *new_gl2e = new_ge; + shadow_l2e_t *sl2p = se; + mfn_t sl1mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow2_validate_gl2e_calls); + + if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT ) + { + gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e); + if ( guest_supports_superpages(v) && + (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) ) + { + // superpage -- need to look up the shadow L1 which holds the + // splitters... + sl1mfn = get_fl1_shadow_status(v, gl1gfn); +#if 0 + // XXX - it's possible that we want to do some kind of prefetch + // for superpage fl1's here, but this is *not* on the demand path, + // so we'll hold off trying that for now... + // + if ( !valid_mfn(sl1mfn) ) + sl1mfn = make_fl1_shadow(v, gl1gfn); +#endif + } + else + { + mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn); + if ( valid_mfn(gl1mfn) ) + sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH2_l1_shadow); + else + result |= SHADOW2_SET_ERROR; + } + } + l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN), + sl1mfn, &new_sl2e, ft_prefetch); + result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn); + + return result; +} + +static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se) +{ + shadow_l1e_t new_sl1e; + guest_l1e_t *new_gl1e = new_ge; + shadow_l1e_t *sl1p = se; + gfn_t gfn; + mfn_t mfn; + int result = 0; + + perfc_incrc(shadow2_validate_gl1e_calls); + + gfn = guest_l1e_get_gfn(*new_gl1e); + mfn = vcpu_gfn_to_mfn(v, gfn); + + l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, + /* mmio? 
*/ !valid_mfn(mfn)); + + result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn); + return result; +} + + +/**************************************************************************/ +/* Functions which translate and install a the shadows of arbitrary guest + * entries that we have just seen the guest write. */ + + +static inline int +sh2_map_and_validate(struct vcpu *v, mfn_t gmfn, + void *new_gp, u32 size, u32 sh_type, + u32 (*shadow_index)(mfn_t *smfn, u32 idx), + int (*validate_ge)(struct vcpu *v, void *ge, + mfn_t smfn, void *se)) +/* Generic function for mapping and validating. */ +{ + mfn_t smfn, smfn2, map_mfn; + shadow_l1e_t *sl1p; + u32 shadow_idx, guest_idx; + int result = 0; + + /* Align address and size to guest entry boundaries */ + size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1); + new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1)); + size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1); + ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE); + + /* Map the shadow page */ + smfn = get_shadow_status(v, gmfn, sh_type); + ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */ + guest_idx = guest_index(new_gp); + map_mfn = smfn; + shadow_idx = shadow_index(&map_mfn, guest_idx); + sl1p = map_shadow_page(map_mfn); + + /* Validate one entry at a time */ + while ( size ) + { + smfn2 = smfn; + guest_idx = guest_index(new_gp); + shadow_idx = shadow_index(&smfn2, guest_idx); + if ( mfn_x(smfn2) != mfn_x(map_mfn) ) + { + /* We have moved to another page of the shadow */ + map_mfn = smfn2; + unmap_shadow_page(sl1p); + sl1p = map_shadow_page(map_mfn); + } + result |= validate_ge(v, + new_gp, + map_mfn, + &sl1p[shadow_idx]); + size -= sizeof(guest_l1e_t); + new_gp += sizeof(guest_l1e_t); + } + unmap_shadow_page(sl1p); + return result; +} + + +int +sh2_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn, + void *new_gl4p, u32 size) +{ +#if GUEST_PAGING_LEVELS >= 4 + return sh2_map_and_validate(v, gl4mfn, new_gl4p, size, + PGC_SH2_l4_shadow, + shadow_l4_index, + validate_gl4e); +#else // ! GUEST_PAGING_LEVELS >= 4 + SHADOW2_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh2_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn, + void *new_gl3p, u32 size) +{ +#if GUEST_PAGING_LEVELS >= 3 + return sh2_map_and_validate(v, gl3mfn, new_gl3p, size, + PGC_SH2_l3_shadow, + shadow_l3_index, + validate_gl3e); +#else // ! 
GUEST_PAGING_LEVELS >= 3 + SHADOW2_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh2_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn, + void *new_gl2p, u32 size) +{ + return sh2_map_and_validate(v, gl2mfn, new_gl2p, size, + PGC_SH2_l2_shadow, + shadow_l2_index, + validate_gl2e); +} + +int +sh2_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn, + void *new_gl2p, u32 size) +{ +#if GUEST_PAGING_LEVELS == 3 + return sh2_map_and_validate(v, gl2mfn, new_gl2p, size, + PGC_SH2_l2h_shadow, + shadow_l2_index, + validate_gl2e); +#else /* Non-PAE guests don't have different kinds of l2 table */ + SHADOW2_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh2_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn, + void *new_gl1p, u32 size) +{ + return sh2_map_and_validate(v, gl1mfn, new_gl1p, size, + PGC_SH2_l1_shadow, + shadow_l1_index, + validate_gl1e); +} + + +/**************************************************************************/ +/* Optimization: If we see two emulated writes of zeros to the same + * page-table without another kind of page fault in between, we guess + * that this is a batch of changes (for process destruction) and + * unshadow the page so we don't take a pagefault on every entry. This + * should also make finding writeable mappings of pagetables much + * easier. */ + +/* Look to see if this is the second emulated write in a row to this + * page, and unshadow/unhook if it is */ +static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn) +{ +#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW + if ( v->arch.last_emulated_mfn == mfn_x(gmfn) && + sh2_mfn_is_a_page_table(gmfn) ) + { + u32 flags = mfn_to_page(gmfn)->shadow2_flags; + mfn_t smfn; + if ( !(flags & (SH2F_L2_32|SH2F_L3_PAE|SH2F_L4_64)) ) + { + perfc_incrc(shadow2_early_unshadow); + sh2_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ ); + return; + } + /* SH2F_unhooked_mappings is set to make sure we only unhook + * once in a single batch of updates. It is reset when this + * top-level page is loaded into CR3 again */ + if ( !(flags & SH2F_unhooked_mappings) ) + { + perfc_incrc(shadow2_early_unshadow_top); + mfn_to_page(gmfn)->shadow2_flags |= SH2F_unhooked_mappings; + if ( flags & SH2F_L2_32 ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH2_l2_32_shadow); + shadow2_unhook_mappings(v, smfn); + } + if ( flags & SH2F_L3_PAE ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH2_l3_pae_shadow); + shadow2_unhook_mappings(v, smfn); + } + if ( flags & SH2F_L4_64 ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH2_l4_64_shadow); + shadow2_unhook_mappings(v, smfn); + } + } + } + v->arch.last_emulated_mfn = mfn_x(gmfn); +#endif +} + +/* Stop counting towards early unshadows, as we've seen a real page fault */ +static inline void reset_early_unshadow(struct vcpu *v) +{ +#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW + v->arch.last_emulated_mfn = INVALID_MFN; +#endif +} + + + +/**************************************************************************/ +/* Entry points into the shadow code */ + +/* Called from pagefault handler in Xen, and from the HVM trap handlers + * for pagefaults. Returns 1 if this fault was an artefact of the + * shadow code (and the guest should retry) or 0 if it is not (and the + * fault should be handled elsewhere or passed to the guest). 
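+ * Demand writes to guest pagetable pages are the delicate case: rather than
+ * handing out a writeable mapping, the faulting instruction is passed to
+ * x86_emulate_memop() so the shadow can be brought up to date as well.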
*/ + +static int sh2_page_fault(struct vcpu *v, + unsigned long va, + struct cpu_user_regs *regs) +{ + struct domain *d = v->domain; + walk_t gw; + u32 accumulated_gflags; + gfn_t gfn; + mfn_t gmfn, sl1mfn=_mfn(0); + shadow_l1e_t sl1e, *ptr_sl1e; + paddr_t gpa; + struct cpu_user_regs emul_regs; + struct x86_emulate_ctxt emul_ctxt; + int r, mmio; + fetch_type_t ft = 0; + + // + // XXX: Need to think about eventually mapping superpages directly in the + // shadow (when possible), as opposed to splintering them into a + // bunch of 4K maps. + // + + SHADOW2_PRINTK("d:v=%u:%u va=%#lx err=%u\n", + v->domain->domain_id, v->vcpu_id, va, regs->error_code); + + shadow2_lock(d); + + shadow2_audit_tables(v); + + if ( guest_walk_tables(v, va, &gw, 1) != 0 ) + { + SHADOW2_PRINTK("malformed guest pagetable!"); + print_gw(&gw); + } + + sh2_audit_gw(v, &gw); + + // We do not look at the gw->l1e, as that will not exist for superpages. + // Instead, we use the gw->eff_l1e... + // + // We need not check all the levels of the guest page table entries for + // present vs not-present, as the eff_l1e will always be not present if + // one of the higher level entries is not present. + // + if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) ) + { + if ( hvm_guest(v) && !shadow2_vcpu_mode_translate(v) ) + { + /* Not present in p2m map, means this is mmio */ + gpa = va; + goto mmio; + } + + perfc_incrc(shadow2_fault_bail_not_present); + goto not_a_shadow_fault; + } + + // All levels of the guest page table are now known to be present. + accumulated_gflags = accumulate_guest_flags(&gw); + + // Check for attempts to access supervisor-only pages from user mode, + // i.e. ring 3. Such errors are not caused or dealt with by the shadow + // code. + // + if ( (regs->error_code & X86_PFEC_SUPERVISOR_FAULT) && + !(accumulated_gflags & _PAGE_USER) ) + { + /* illegal user-mode access to supervisor-only page */ + perfc_incrc(shadow2_fault_bail_user_supervisor); + goto not_a_shadow_fault; + } + + // Was it a write fault? + // + if ( regs->error_code & X86_PFEC_WRITE_FAULT ) + { + if ( unlikely(!(accumulated_gflags & _PAGE_RW)) ) + { + perfc_incrc(shadow2_fault_bail_ro_mapping); + goto not_a_shadow_fault; + } + } + else // must have been either an insn fetch or read fault + { + // Check for NX bit violations: attempts to execute code that is + // marked "do not execute". Such errors are not caused or dealt with + // by the shadow code. + // + if ( regs->error_code & X86_PFEC_INSN_FETCH_FAULT ) + { + if ( accumulated_gflags & _PAGE_NX_BIT ) + { + /* NX prevented this code fetch */ + perfc_incrc(shadow2_fault_bail_nx); + goto not_a_shadow_fault; + } + } + } + + /* Is this an MMIO access? */ + gfn = guest_l1e_get_gfn(gw.eff_l1e); + mmio = ( hvm_guest(v) + && shadow2_vcpu_mode_translate(v) + && mmio_space(gfn_to_paddr(gfn)) ); + + /* For MMIO, the shadow holds the *gfn*; for normal accesses, if holds + * the equivalent mfn. */ + if ( mmio ) + gmfn = _mfn(gfn_x(gfn)); + else + { + gmfn = vcpu_gfn_to_mfn(v, gfn); + if ( !valid_mfn(gmfn) ) + { + perfc_incrc(shadow2_fault_bail_bad_gfn); + SHADOW2_PRINTK("BAD gfn=%"SH2_PRI_gfn" gmfn=%"SH2_PRI_mfn"\n", + gfn_x(gfn), mfn_x(gmfn)); + goto not_a_shadow_fault; + } + } + + /* Make sure there is enough free shadow memory to build a chain of + * shadow tables: one SHADOW2_MAX_ORDER chunk will always be enough + * to allocate all we need. 
(We never allocate a top-level shadow + * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */ + shadow2_prealloc(d, SHADOW2_MAX_ORDER); + + /* Acquire the shadow. This must happen before we figure out the rights + * for the shadow entry, since we might promote a page here. */ + // XXX -- this code will need to change somewhat if/when the shadow code + // can directly map superpages... + ft = ((regs->error_code & X86_PFEC_WRITE_FAULT) + ? ft_demand_write : ft_demand_read); + ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft); + ASSERT(ptr_sl1e); + + /* Calculate the shadow entry */ + if ( ft == ft_demand_write ) + { + if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) ) + { + perfc_incrc(shadow2_fault_emulate_write); + goto emulate; + } + } + else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) ) + { + perfc_incrc(shadow2_fault_emulate_read); + goto emulate; + } + + /* Quick sanity check: we never make an MMIO entry that's got the + * _PAGE_PRESENT flag set in it. */ + ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT)); + + r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn); + + if ( mmio ) + { + gpa = guest_walk_to_gpa(&gw); + goto mmio; + } + +#if 0 + if ( !(r & SHADOW2_SET_CHANGED) ) + debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH2_PRI_pte + ") did not change anything\n", + __func__, gw.va, l1e_get_intpte(sl1e)); +#endif + + perfc_incrc(shadow2_fault_fixed); + d->arch.shadow_fault_count++; + reset_early_unshadow(v); + + done: + sh2_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW2_PRINTK("fixed\n"); + shadow2_audit_tables(v); + shadow2_unlock(d); + return EXCRET_fault_fixed; + + emulate: + + /* Take the register set we were called with */ + emul_regs = *regs; + if ( hvm_guest(v) ) + { + /* Add the guest's segment selectors, rip, rsp. rflags */ + hvm_store_cpu_guest_regs(v, &emul_regs, NULL); + } + emul_ctxt.regs = &emul_regs; + emul_ctxt.cr2 = va; + emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST; + + SHADOW2_PRINTK("emulate: eip=%#lx\n", emul_regs.eip); + + v->arch.shadow2_propagate_fault = 0; + if ( x86_emulate_memop(&emul_ctxt, &shadow2_emulator_ops) ) + { + SHADOW2_PRINTK("emulator failure, unshadowing mfn %#lx\n", + mfn_x(gmfn)); + perfc_incrc(shadow2_fault_emulate_failed); + /* If this is actually a page table, then we have a bug, and need + * to support more operations in the emulator. More likely, + * though, this is a hint that this page should not be shadowed. */ + shadow2_remove_all_shadows(v, gmfn); + /* This means that actual missing operations will cause the + * guest to loop on the same page fault. */ + goto done; + } + if ( v->arch.shadow2_propagate_fault ) + { + /* Emulation triggered another page fault */ + goto not_a_shadow_fault; + } + + /* Emulator has changed the user registers: write back */ + if ( hvm_guest(v) ) + { + /* Write back the guest's segment selectors, rip, rsp. rflags */ + hvm_load_cpu_guest_regs(v, &emul_regs); + /* And don't overwrite those in the caller's regs. */ + emul_regs.eip = regs->eip; + emul_regs.cs = regs->cs; + emul_regs.eflags = regs->eflags; + emul_regs.esp = regs->esp; + emul_regs.ss = regs->ss; + emul_regs.es = regs->es; + emul_regs.ds = regs->ds; + emul_regs.fs = regs->fs; + emul_regs.gs = regs->gs; + } + *regs = emul_regs; + + goto done; + + mmio: + perfc_incrc(shadow2_fault_mmio); + if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) ) + { + /* Need to deal with these disabled-APIC accesses, as + * handle_mmio() apparently does not currently do that. */ + /* TJD: What about it, then? 
For now, I'm turning this BUG() + * into a domain_crash() since we don't want to kill Xen. */ + SHADOW2_ERROR("disabled-APIC access: not supported.\n"); + domain_crash(d); + } + sh2_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW2_PRINTK("mmio\n"); + shadow2_audit_tables(v); + reset_early_unshadow(v); + shadow2_unlock(d); + sh2_log_mmio(v, gpa); + handle_mmio(va, gpa); + return EXCRET_fault_fixed; + + not_a_shadow_fault: + sh2_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW2_PRINTK("not a shadow fault\n"); + shadow2_audit_tables(v); + reset_early_unshadow(v); + shadow2_unlock(d); + return 0; +} + + +static int +sh2_invlpg(struct vcpu *v, unsigned long va) +/* Called when the guest requests an invlpg. Returns 1 if the invlpg + * instruction should be issued on the hardware, or 0 if it's safe not + * to do so. */ +{ + shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va); + + // XXX -- might be a good thing to prefetch the va into the shadow + + // no need to flush anything if there's no SL2... + // + if ( !ptr_sl2e ) + return 0; + + // If there's nothing shadowed for this particular sl2e, then + // there is no need to do an invlpg, either... + // + if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) ) + return 0; + + // Check to see if the SL2 is a splintered superpage... + // If so, then we'll need to flush the entire TLB (because that's + // easier than invalidating all of the individual 4K pages). + // + if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info & + PGC_SH2_type_mask) == PGC_SH2_fl1_shadow ) + { + local_flush_tlb(); + return 0; + } + + return 1; +} + +static unsigned long +sh2_gva_to_gfn(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + walk_t gw; + gfn_t gfn; + + guest_walk_tables(v, va, &gw, 0); + gfn = guest_walk_to_gfn(&gw); + unmap_walk(v, &gw); + + return gfn_x(gfn); +} + + +static unsigned long +sh2_gva_to_gpa(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + unsigned long gfn = sh2_gva_to_gfn(v, va); + if ( gfn == INVALID_GFN ) + return 0; + else + return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK); +} + + +// XXX -- should this be in this file? +// Or should it be moved to shadow2-common.c? +// +/* returns a lowmem machine address of the copied HVM L3 root table + * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy, + * otherwise blank out any entries with reserved bits in them. */ +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +static unsigned long +hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res) +{ + int i, f; + int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY); + l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab; + memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t)); + for ( i = 0; i < 4; i++ ) + { + f = l3e_get_flags(l3tab[i]); + if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) ) + new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res); + else + new_l3e = l3e_empty(); + safe_write_entry(&copy[i], &new_l3e); + } + return __pa(copy); +} +#endif + + +static inline void +sh2_update_linear_entries(struct vcpu *v) +/* Sync up all the linear mappings for this vcpu's pagetables */ +{ + struct domain *d = v->domain; + + /* Linear pagetables in PV guests + * ------------------------------ + * + * Guest linear pagetables, which map the guest pages, are at + * LINEAR_PT_VIRT_START. 
Shadow linear pagetables, which map the + * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these + * are set up at shadow creation time, but (of course!) the PAE case + * is subtler. Normal linear mappings are made by having an entry + * in the top-level table that points to itself (shadow linear) or + * to the guest top-level table (guest linear). For PAE, to set up + * a linear map requires us to copy the four top-level entries into + * level-2 entries. That means that every time we change a PAE l3e, + * we need to reflect the change into the copy. + * + * Linear pagetables in HVM guests + * ------------------------------- + * + * For HVM guests, the linear pagetables are installed in the monitor + * tables (since we can't put them in the shadow). Shadow linear + * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START, + * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for + * a linear pagetable of the monitor tables themselves. We have + * the same issue of having to re-copy PAE l3 entries whenever we use + * PAE shadows. + * + * Because HVM guests run on the same monitor tables regardless of the + * shadow tables in use, the linear mapping of the shadow tables has to + * be updated every time v->arch.shadow_table changes. + */ + + /* Don't try to update the monitor table if it doesn't exist */ + if ( shadow2_mode_external(d) + && pagetable_get_pfn(v->arch.monitor_table) == 0 ) + return; + +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4) + + /* For PV, one l4e points at the guest l4, one points at the shadow + * l4. No maintenance required. + * For HVM, just need to update the l4e that points to the shadow l4. */ + + if ( shadow2_mode_external(d) ) + { + /* Use the linear map if we can; otherwise make a new mapping */ + if ( v == current ) + { + __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = + l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + } + else + { + l4_pgentry_t *ml4e; + ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + sh2_unmap_domain_page(ml4e); + } + } + +#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3) + + /* This case only exists in HVM. To give ourselves a linear map of the + * shadows, we need to extend a PAE shadow to 4 levels. We do this by + * having a monitor l3 in slot 0 of the monitor l4 table, and + * copying the PAE l3 entries into it. Then, by having the monitor l4e + * for shadow pagetables also point to the monitor l4, we can use it + * to access the shadows. */ + + if ( shadow2_mode_external(d) ) + { + /* Install copies of the shadow l3es into the monitor l3 table. 
+ * The monitor l3 table is hooked into slot 0 of the monitor + * l4 table, so we use l3 linear indices 0 to 3 */ + shadow_l3e_t *sl3e; + l3_pgentry_t *ml3e; + mfn_t l3mfn; + int i; + + /* Use linear mappings if we can; otherwise make new mappings */ + if ( v == current ) + { + ml3e = __linear_l3_table; + l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0])); +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables are made up by update_cr3 */ + sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; +#else + sl3e = v->arch.shadow_vtable; +#endif + } + else + { + l4_pgentry_t *ml4e; + ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT); + l3mfn = _mfn(l4e_get_pfn(ml4e[0])); + ml3e = sh2_map_domain_page(l3mfn); + sh2_unmap_domain_page(ml4e); +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables are made up by update_cr3 */ + sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; +#else + sl3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.shadow_table)); +#endif + } + + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + ml3e[i] = + (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) + ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), + __PAGE_HYPERVISOR) + : l3e_empty(); + } + + if ( v != current ) + { + sh2_unmap_domain_page(ml3e); +#if GUEST_PAGING_LEVELS != 2 + sh2_unmap_domain_page(sl3e); +#endif + } + } + +#elif CONFIG_PAGING_LEVELS == 3 + + /* PV: need to copy the guest's l3 entries into the guest-linear-map l2 + * entries in the shadow, and the shadow's l3 entries into the + * shadow-linear-map l2 entries in the shadow. This is safe to do + * because Xen does not let guests share high-slot l2 tables between l3s, + * so we know we're not treading on anyone's toes. + * + * HVM: need to copy the shadow's l3 entries into the + * shadow-linear-map l2 entries in the monitor table. This is safe + * because we have one monitor table for each vcpu. The monitor's + * own l3es don't need to be copied because they never change. + * XXX That might change if we start stuffing things into the rest + * of the monitor's virtual address space. 
+ */ + { + l2_pgentry_t *l2e, new_l2e; + shadow_l3e_t *guest_l3e = NULL, *shadow_l3e; + int i; + +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables were built by update_cr3 */ + if ( shadow2_mode_external(d) ) + shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; + else + BUG(); /* PV 2-on-3 is not supported yet */ + +#else /* GUEST_PAGING_LEVELS == 3 */ + + /* Use local vcpu's mappings if we can; otherwise make new mappings */ + if ( v == current ) + { + shadow_l3e = v->arch.shadow_vtable; + if ( !shadow2_mode_external(d) ) + guest_l3e = v->arch.guest_vtable; + } + else + { + mfn_t smfn; + int idx; + + /* Map the shadow l3 */ + smfn = pagetable_get_mfn(v->arch.shadow_table); + idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable)); + shadow_l3e = sh2_map_domain_page(smfn); + shadow_l3e += idx; + if ( !shadow2_mode_external(d) ) + { + /* Also the guest l3 */ + mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table); + guest_l3e = sh2_map_domain_page(gmfn); + guest_l3e += guest_index(v->arch.guest_vtable); + } + } +#endif /* GUEST_PAGING_LEVELS */ + + /* Choose where to write the entries, using linear maps if possible */ + if ( v == current && shadow2_mode_external(d) ) + { + /* From the monitor tables, it's safe to use linear maps to update + * monitor l2s */ + l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES); + } + else if ( shadow2_mode_external(d) ) + { + /* Map the monitor table's high l2 */ + l3_pgentry_t *l3e; + l3e = sh2_map_domain_page( + pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); + l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[3]))); + sh2_unmap_domain_page(l3e); + } + else + { + /* Map the shadow table's high l2 */ + ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT); + l2e = sh2_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3])); + } + + + if ( !shadow2_mode_external(d) ) + { + /* Write linear mapping of guest. */ + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])), + __PAGE_HYPERVISOR) + : l2e_empty(); + safe_write_entry( + &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], + &new_l2e); + } + } + + /* Write linear mapping of shadow. */ + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])), + __PAGE_HYPERVISOR) + : l2e_empty(); + safe_write_entry( + &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i], + &new_l2e); + } + + if ( v != current || !shadow2_mode_external(d) ) + sh2_unmap_domain_page(l2e); + +#if GUEST_PAGING_LEVELS == 3 + if ( v != current) + { + sh2_unmap_domain_page(shadow_l3e); + if ( !shadow2_mode_external(d) ) + sh2_unmap_domain_page(guest_l3e); + } +#endif + } + +#elif CONFIG_PAGING_LEVELS == 2 + + /* For PV, one l2e points at the guest l2, one points at the shadow + * l2. No maintenance required. + * For HVM, just need to update the l2e that points to the shadow l2. 
*/ + + if ( shadow2_mode_external(d) ) + { + /* Use the linear map if we can; otherwise make a new mapping */ + if ( v == current ) + { + __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + } + else + { + l2_pgentry_t *ml2e; + ml2e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + sh2_unmap_domain_page(ml2e); + } + } + +#else +#error this should not happen +#endif +} + + +// XXX -- should this be in this file? +// Or should it be moved to shadow2-common.c? +// +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +void sh2_pae_recopy(struct domain *d) +/* Called whenever we write to the l3 entries of a PAE pagetable which + * is currently in use. Each vcpu that is using the table needs to + * resync its copies of the l3s in linear maps and any low-memory + * copies it might have made for fitting into 32bit CR3. + * Since linear maps are also resynced when we change CR3, we don't + * need to worry about changes to PAE l3es that are not currently in use.*/ +{ + struct vcpu *v; + cpumask_t flush_mask = CPU_MASK_NONE; + ASSERT(shadow2_lock_is_acquired(d)); + + for_each_vcpu(d, v) + { + if ( !v->arch.shadow2_pae_flip_pending ) + continue; + + cpu_set(v->processor, flush_mask); + + SHADOW2_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id); + + /* This vcpu has a copy in its linear maps */ + sh2_update_linear_entries(v); + if ( hvm_guest(v) ) + { + /* This vcpu has a copy in its HVM PAE l3 */ + v->arch.hvm_vcpu.hw_cr3 = + hvm_pae_copy_root(v, v->arch.shadow_vtable, + !shadow2_vcpu_mode_translate(v)); + } +#if CONFIG_PAGING_LEVELS == 3 + else + { + /* This vcpu might have copied the l3 to below 4GB */ + if ( v->arch.cr3 >> PAGE_SHIFT + != pagetable_get_pfn(v->arch.shadow_table) ) + { + /* Recopy to where that copy is. */ + int i; + l3_pgentry_t *dst, *src; + dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */ + src = v->arch.shadow_vtable; + for ( i = 0 ; i < 4 ; i++ ) + safe_write_entry(dst + i, src + i); + } + } +#endif + v->arch.shadow2_pae_flip_pending = 0; + } + + flush_tlb_mask(flush_mask); +} +#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */ + + +/* removes: + * vcpu->arch.guest_vtable + * vcpu->arch.shadow_table + * vcpu->arch.shadow_vtable + * Does all appropriate management/bookkeeping/refcounting/etc... + */ +static void +sh2_detach_old_tables(struct vcpu *v) +{ + mfn_t smfn; + + //// + //// vcpu->arch.guest_vtable + //// + if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && + v->arch.guest_vtable ) + { + // Q: why does this need to use (un)map_domain_page_*global* ? + sh2_unmap_domain_page_global(v->arch.guest_vtable); + v->arch.guest_vtable = NULL; + } + + //// + //// vcpu->arch.shadow_table + //// + smfn = pagetable_get_mfn(v->arch.shadow_table); + if ( mfn_x(smfn) ) + { + ASSERT(v->arch.shadow_vtable); + +#if GUEST_PAGING_LEVELS == 3 + // PAE guests do not (necessarily) use an entire page for their + // 4-entry L3s, so we have to deal with them specially. 
+ // + sh2_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn); +#else + sh2_put_ref(v, smfn, 0); +#endif + +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + { + struct pae_l3_bookkeeping *info = + sl3p_to_info(v->arch.shadow_vtable); + ASSERT(test_bit(v->vcpu_id, &info->vcpus)); + clear_bit(v->vcpu_id, &info->vcpus); + } +#endif + v->arch.shadow_table = pagetable_null(); + } + + //// + //// vcpu->arch.shadow_vtable + //// + if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && + v->arch.shadow_vtable ) + { + // Q: why does this need to use (un)map_domain_page_*global* ? + // + sh2_unmap_domain_page_global(v->arch.shadow_vtable); + v->arch.shadow_vtable = NULL; + } +} + +static void +sh2_update_cr3(struct vcpu *v) +/* Updates vcpu->arch.shadow_table after the guest has changed CR3. + * Paravirtual guests should set v->arch.guest_table (and guest_table_user, + * if appropriate). + * HVM guests should also set hvm_get_guest_cntl_reg(v, 3)... + */ +{ + struct domain *d = v->domain; + mfn_t gmfn, smfn; +#if GUEST_PAGING_LEVELS == 3 + u32 guest_idx=0; +#endif + + ASSERT(shadow2_lock_is_acquired(v->domain)); + ASSERT(v->arch.shadow2); + + //// + //// vcpu->arch.guest_table is already set + //// + +#ifndef NDEBUG + /* Double-check that the HVM code has sent us a sane guest_table */ + if ( hvm_guest(v) ) + { + gfn_t gfn; + + ASSERT(shadow2_mode_external(d)); + + // Is paging enabled on this vcpu? + if ( shadow2_vcpu_mode_translate(v) ) + { + gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3))); + gmfn = vcpu_gfn_to_mfn(v, gfn); + ASSERT(valid_mfn(gmfn)); + ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn)); + } + else + { + /* Paging disabled: guest_table points at (part of) p2m */ +#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */ + /* For everything else, they sould be the same */ + ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn); +#endif + } + } +#endif + + SHADOW2_PRINTK("d=%u v=%u guest_table=%05lx\n", + d->domain_id, v->vcpu_id, + (unsigned long)pagetable_get_pfn(v->arch.guest_table)); + +#if GUEST_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + gmfn = pagetable_get_mfn(v->arch.guest_table_user); + else +#endif + gmfn = pagetable_get_mfn(v->arch.guest_table); + + sh2_detach_old_tables(v); + + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + { + ASSERT(v->arch.cr3 == 0); + return; + } + + //// + //// vcpu->arch.guest_vtable + //// + if ( shadow2_mode_external(d) ) + { +#if GUEST_PAGING_LEVELS == 3 + if ( shadow2_vcpu_mode_translate(v) ) + /* Paging enabled: find where in the page the l3 table is */ + guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3)); + else + /* Paging disabled: l3 is at the start of a page (in the p2m) */ + guest_idx = 0; + + // Ignore the low 2 bits of guest_idx -- they are really just + // cache control. + guest_idx &= ~3; + // XXX - why does this need a global map? + v->arch.guest_vtable = + (guest_l3e_t *)sh2_map_domain_page_global(gmfn) + guest_idx; +#else + // XXX - why does this need a global map? + v->arch.guest_vtable = sh2_map_domain_page_global(gmfn); +#endif + } + else + { +#ifdef __x86_64__ + v->arch.guest_vtable = __linear_l4_table; +#elif GUEST_PAGING_LEVELS == 3 + // XXX - why does this need a global map? 
+ v->arch.guest_vtable = sh2_map_domain_page_global(gmfn); +#else + v->arch.guest_vtable = __linear_l2_table; +#endif + } + +#if 0 + printk("%s %s %d gmfn=%05lx guest_vtable=%p\n", + __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable); +#endif + + //// + //// vcpu->arch.shadow_table + //// + smfn = get_shadow_status(v, gmfn, PGC_SH2_guest_root_type); + if ( valid_mfn(smfn) ) + { + /* Pull this root shadow to the front of the list of roots. */ + list_del(&mfn_to_page(smfn)->list); + list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows); + } + else + { + /* This guest MFN is a pagetable. Must revoke write access. */ + if ( shadow2_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0) + != 0 ) + flush_tlb_mask(d->domain_dirty_cpumask); + /* Make sure there's enough free shadow memory. */ + shadow2_prealloc(d, SHADOW2_MAX_ORDER); + /* Shadow the page. */ + smfn = sh2_make_shadow(v, gmfn, PGC_SH2_guest_root_type); + list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows); + } + ASSERT(valid_mfn(smfn)); + v->arch.shadow_table = pagetable_from_mfn(smfn); + +#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW + /* Once again OK to unhook entries from this table if we see fork/exit */ + ASSERT(sh2_mfn_is_a_page_table(gmfn)); + mfn_to_page(gmfn)->shadow2_flags &= ~SH2F_unhooked_mappings; +#endif + + + //// + //// vcpu->arch.shadow_vtable + //// + if ( shadow2_mode_external(d) ) + { +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + mfn_t adjusted_smfn = smfn; + u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx); + // Q: why does this need to use (un)map_domain_page_*global* ? + v->arch.shadow_vtable = + (shadow_l3e_t *)sh2_map_domain_page_global(adjusted_smfn) + + shadow_idx; +#else + // Q: why does this need to use (un)map_domain_page_*global* ? + v->arch.shadow_vtable = sh2_map_domain_page_global(smfn); +#endif + } + else + { +#if SHADOW_PAGING_LEVELS == 4 + v->arch.shadow_vtable = __sh2_linear_l4_table; +#elif GUEST_PAGING_LEVELS == 3 + // XXX - why does this need a global map? + v->arch.shadow_vtable = sh2_map_domain_page_global(smfn); +#else + v->arch.shadow_vtable = __sh2_linear_l2_table; +#endif + } + + //// + //// Take a ref to the new shadow table, and pin it. + //// + // + // This ref is logically "held" by v->arch.shadow_table entry itself. + // Release the old ref. + // +#if GUEST_PAGING_LEVELS == 3 + // PAE guests do not (necessarily) use an entire page for their + // 4-entry L3s, so we have to deal with them specially. + // + // XXX - might want to revisit this if/when we do multiple compilation for + // HVM-vs-PV guests, as PAE PV guests could get away without doing + // subshadows. + // + sh2_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn); + sh2_pin_l3_subshadow(v->arch.shadow_vtable, smfn); +#else + sh2_get_ref(smfn, 0); + sh2_pin(smfn); +#endif + +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + // PAE 3-on-3 shadows have to keep track of which vcpu's are using + // which l3 subshadow, in order handle the SHADOW2_SET_L3PAE_RECOPY + // case from validate_gl3e(). Search for SHADOW2_SET_L3PAE_RECOPY + // in the code for more info. 
+ // + { + struct pae_l3_bookkeeping *info = + sl3p_to_info(v->arch.shadow_vtable); + ASSERT(!test_bit(v->vcpu_id, &info->vcpus)); + set_bit(v->vcpu_id, &info->vcpus); + } +#endif + + debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n", + __func__, gmfn, smfn); + + /// + /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3 + /// + if ( shadow2_mode_external(d) ) + { + ASSERT(hvm_guest(v)); + make_cr3(v, pagetable_get_pfn(v->arch.monitor_table)); + +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) +#if SHADOW_PAGING_LEVELS != 3 +#error unexpected combination of GUEST and SHADOW paging levels +#endif + /* 2-on-3: make a PAE l3 table that points at the four-page l2 */ + { + mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table); + int i; + + ASSERT(v->arch.hvm_vcpu.hw_cr3 == + virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab)); + for (i = 0; i < 4; i++) + { + v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] = + shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT); + } + } +#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) + /* 3-on-3: copy the shadow l3 to slots that are below 4GB. + * If paging is disabled, clear l3e reserved bits; otherwise + * remove entries that have reserved bits set. */ + v->arch.hvm_vcpu.hw_cr3 = + hvm_pae_copy_root(v, v->arch.shadow_vtable, + !shadow2_vcpu_mode_translate(v)); +#else + /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */ + v->arch.hvm_vcpu.hw_cr3 = + pagetable_get_paddr(v->arch.shadow_table); +#endif + } + else // not shadow2_mode_external... + { + /* We don't support PV except guest == shadow == config levels */ + BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS); + make_cr3(v, pagetable_get_pfn(v->arch.shadow_table)); + } + + /* Fix up the linear pagetable mappings */ + sh2_update_linear_entries(v); +} + + +/**************************************************************************/ +/* Functions to revoke guest rights */ + +#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC +static int sh2_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) +/* Look up this vaddr in the current shadow and see if it's a writeable + * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */ +{ + shadow_l1e_t sl1e, *sl1p; + shadow_l2e_t *sl2p; +#if GUEST_PAGING_LEVELS >= 3 + shadow_l3e_t *sl3p; +#if GUEST_PAGING_LEVELS >= 4 + shadow_l4e_t *sl4p; +#endif +#endif + mfn_t sl1mfn; + + + /* Carefully look in the shadow linear map for the l1e we expect */ + if ( v->arch.shadow_vtable == NULL ) return 0; +#if GUEST_PAGING_LEVELS >= 4 + sl4p = sh2_linear_l4_table(v) + shadow_l4_linear_offset(vaddr); + if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) + return 0; + sl3p = sh2_linear_l3_table(v) + shadow_l3_linear_offset(vaddr); + if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) + return 0; +#elif GUEST_PAGING_LEVELS == 3 + sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable) + + shadow_l3_linear_offset(vaddr); + if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) + return 0; +#endif + sl2p = sh2_linear_l2_table(v) + shadow_l2_linear_offset(vaddr); + if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) ) + return 0; + sl1p = sh2_linear_l1_table(v) + shadow_l1_linear_offset(vaddr); + sl1e = *sl1p; + if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) + != (_PAGE_PRESENT|_PAGE_RW)) + || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) ) + return 0; + + /* Found it! Need to remove its write permissions. 
*/ + sl1mfn = shadow_l2e_get_mfn(*sl2p); + sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); + shadow_set_l1e(v, sl1p, sl1e, sl1mfn); + return 1; +} +#endif + +int sh2_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn) +/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */ +{ + shadow_l1e_t *sl1e; + int done = 0; + int flags; + + SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, + { + flags = shadow_l1e_get_flags(*sl1e); + if ( (flags & _PAGE_PRESENT) + && (flags & _PAGE_RW) + && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) ) + { + shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn); + if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info + & PGT_count_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} + + +int sh2_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn) +/* Excises all mappings to guest frame from this shadow l1 table */ +{ + shadow_l1e_t *sl1e; + int done = 0; + int flags; + + SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, + { + flags = shadow_l1e_get_flags(*sl1e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) ) + { + shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn); + if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} + +/**************************************************************************/ +/* Functions to excise all pointers to shadows from higher-level shadows. */ + +void sh2_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn) +/* Blank out a single shadow entry */ +{ + switch (mfn_to_page(smfn)->count_info & PGC_SH2_type_mask) + { + case PGC_SH2_l1_shadow: + shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break; + case PGC_SH2_l2_shadow: +#if GUEST_PAGING_LEVELS == 3 + case PGC_SH2_l2h_shadow: +#endif + shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break; +#if GUEST_PAGING_LEVELS >= 3 + case PGC_SH2_l3_shadow: + shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break; +#if GUEST_PAGING_LEVELS >= 4 + case PGC_SH2_l4_shadow: + shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break; +#endif +#endif + default: BUG(); /* Called with the wrong kind of shadow. 
*/ + } +} + +int sh2_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn) +/* Remove all mappings of this l1 shadow from this l2 shadow */ +{ + shadow_l2e_t *sl2e; + int done = 0; + int flags; +#if GUEST_PAGING_LEVELS != 4 + int xen_mappings = !shadow2_mode_external(v->domain); +#endif + + SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings, + { + flags = shadow_l2e_get_flags(*sl2e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) ) + { + shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH2_type_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} + +#if GUEST_PAGING_LEVELS >= 3 +int sh2_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn) +/* Remove all mappings of this l2 shadow from this l3 shadow */ +{ + shadow_l3e_t *sl3e; + int done = 0; + int flags; + + SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, done, + { + flags = shadow_l3e_get_flags(*sl3e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) ) + { + shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); + if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} + +#if GUEST_PAGING_LEVELS >= 4 +int sh2_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn) +/* Remove all mappings of this l3 shadow from this l4 shadow */ +{ + shadow_l4e_t *sl4e; + int done = 0; + int flags, xen_mappings = !shadow2_mode_external(v->domain); + + SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings, + { + flags = shadow_l4e_get_flags(*sl4e); + if ( (flags & _PAGE_PRESENT) + && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) ) + { + shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); + if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH2_type_mask) == 0 ) + /* This breaks us cleanly out of the FOREACH macro */ + done = 1; + } + }); + return done; +} +#endif /* 64bit guest */ +#endif /* PAE guest */ + +/**************************************************************************/ +/* Handling HVM guest writes to pagetables */ + +/* Check that the user is allowed to perform this write. + * Returns a mapped pointer to write to, and the mfn it's on, + * or NULL for error. */ +static inline void * emulate_map_dest(struct vcpu *v, + unsigned long vaddr, + struct x86_emulate_ctxt *ctxt, + mfn_t *mfnp) +{ + walk_t gw; + u32 flags; + gfn_t gfn; + mfn_t mfn; + + guest_walk_tables(v, vaddr, &gw, 1); + flags = accumulate_guest_flags(&gw); + gfn = guest_l1e_get_gfn(gw.eff_l1e); + mfn = vcpu_gfn_to_mfn(v, gfn); + sh2_audit_gw(v, &gw); + unmap_walk(v, &gw); + + if ( !(flags & _PAGE_PRESENT) + || !(flags & _PAGE_RW) + || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) ) + { + /* This write would have faulted even on bare metal */ + v->arch.shadow2_propagate_fault = 1; + return NULL; + } + + if ( !valid_mfn(mfn) ) + { + /* Attempted a write to a bad gfn. This should never happen: + * after all, we're here because this write is to a page table. 
*/ + BUG(); + } + + ASSERT(sh2_mfn_is_a_page_table(mfn)); + *mfnp = mfn; + return sh2_map_domain_page(mfn) + (vaddr & ~PAGE_MASK); +} + +int +sh2_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src, + u32 bytes, struct x86_emulate_ctxt *ctxt) +{ + ASSERT(shadow2_lock_is_acquired(v->domain)); + while ( bytes > 0 ) + { + mfn_t mfn; + int bytes_on_page; + void *addr; + + bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK); + if ( bytes_on_page > bytes ) + bytes_on_page = bytes; + + if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) + return X86EMUL_PROPAGATE_FAULT; + memcpy(addr, src, bytes_on_page); + shadow2_validate_guest_pt_write(v, mfn, addr, bytes_on_page); + bytes -= bytes_on_page; + /* If we are writing zeros to this page, might want to unshadow */ + if ( *(u8 *)addr == 0 ) + check_for_early_unshadow(v, mfn); + sh2_unmap_domain_page(addr); + } + shadow2_audit_tables(v); + return X86EMUL_CONTINUE; +} + +int +sh2_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr, + unsigned long old, unsigned long new, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + mfn_t mfn; + void *addr; + unsigned long prev; + int rv = X86EMUL_CONTINUE; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + ASSERT(bytes <= sizeof (unsigned long)); + + if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) + return X86EMUL_PROPAGATE_FAULT; + + switch (bytes) + { + case 1: prev = cmpxchg(((u8 *)addr), old, new); break; + case 2: prev = cmpxchg(((u16 *)addr), old, new); break; + case 4: prev = cmpxchg(((u32 *)addr), old, new); break; + case 8: prev = cmpxchg(((u64 *)addr), old, new); break; + default: + SHADOW2_PRINTK("cmpxchg of size %i is not supported\n", bytes); + prev = ~old; + } + + if ( (prev == old) ) + shadow2_validate_guest_pt_write(v, mfn, addr, bytes); + else + rv = X86EMUL_CMPXCHG_FAILED; + + SHADOW2_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx" + " wanted %#lx now %#lx bytes %u\n", + vaddr, prev, old, new, *(unsigned long *)addr, bytes); + + /* If we are writing zeros to this page, might want to unshadow */ + if ( *(u8 *)addr == 0 ) + check_for_early_unshadow(v, mfn); + + sh2_unmap_domain_page(addr); + shadow2_audit_tables(v); + check_for_early_unshadow(v, mfn); + return rv; +} + +int +sh2_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr, + unsigned long old_lo, unsigned long old_hi, + unsigned long new_lo, unsigned long new_hi, + struct x86_emulate_ctxt *ctxt) +{ + mfn_t mfn; + void *addr; + u64 old, new, prev; + int rv = X86EMUL_CONTINUE; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + + if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL ) + return X86EMUL_PROPAGATE_FAULT; + + old = (((u64) old_hi) << 32) | (u64) old_lo; + new = (((u64) new_hi) << 32) | (u64) new_lo; + prev = cmpxchg(((u64 *)addr), old, new); + + if ( (prev == old) ) + shadow2_validate_guest_pt_write(v, mfn, addr, 8); + else + rv = X86EMUL_CMPXCHG_FAILED; + + /* If we are writing zeros to this page, might want to unshadow */ + if ( *(u8 *)addr == 0 ) + check_for_early_unshadow(v, mfn); + + sh2_unmap_domain_page(addr); + shadow2_audit_tables(v); + check_for_early_unshadow(v, mfn); + return rv; +} + + +/**************************************************************************/ +/* Audit tools */ + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES + +#define AUDIT_FAIL(_level, _fmt, _a...) 
do { \ + printk("Shadow2 %u-on-%u audit failed at level %i, index %i\n" \ + "gl" #_level "mfn = %" SH2_PRI_mfn \ + " sl" #_level "mfn = %" SH2_PRI_mfn \ + " &gl" #_level "e = %p &sl" #_level "e = %p" \ + " gl" #_level "e = %" SH2_PRI_gpte \ + " sl" #_level "e = %" SH2_PRI_pte "\nError: " _fmt "\n", \ + GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ + _level, guest_index(gl ## _level ## e), \ + mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ + gl ## _level ## e, sl ## _level ## e, \ + gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ + ##_a); \ + BUG(); \ + done = 1; \ +} while (0) + + +static char * sh2_audit_flags(struct vcpu *v, int level, + int gflags, int sflags) +/* Common code for auditing flag bits */ +{ + if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) ) + return "shadow is present but guest is not present"; + if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) ) + return "global bit set in PV shadow"; + if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE))) + && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) ) + return "dirty bit not propagated"; + if ( level == 2 && (sflags & _PAGE_PSE) ) + return "PS bit set in shadow"; +#if SHADOW_PAGING_LEVELS == 3 + if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */ +#endif + if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) + return "user/supervisor bit does not match"; + if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) + return "NX bit does not match"; + if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) + return "shadow grants write access but guest does not"; + if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) ) + return "accessed bit not propagated"; + return NULL; +} + +static inline mfn_t +audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn) +/* Convert this gfn to an mfn in the manner appropriate for the + * guest pagetable it's used in (gmfn) */ +{ + if ( !shadow2_mode_translate(v->domain) ) + return _mfn(gfn_x(gfn)); + + if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask) + != PGT_writable_page ) + return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */ + else + return sh2_gfn_to_mfn(v->domain, gfn_x(gfn)); +} + + +int sh2_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) +{ + guest_l1e_t *gl1e, *gp; + shadow_l1e_t *sl1e; + mfn_t mfn, gmfn, gl1mfn; + gfn_t gfn; + char *s; + int done = 0; + + /* Follow the backpointer */ + gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info); + gl1e = gp = sh2_map_domain_page(gl1mfn); + SHADOW2_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, { + + s = sh2_audit_flags(v, 1, guest_l1e_get_flags(*gl1e), + shadow_l1e_get_flags(*sl1e)); + if ( s ) AUDIT_FAIL(1, "%s", s); + + if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l1e_get_gfn(*gl1e); + mfn = shadow_l1e_get_mfn(*sl1e); + gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(1, "bad translation: gfn %" SH2_PRI_gfn + " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", + gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh2_unmap_domain_page(gp); + return done; +} + +int sh2_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) +{ + guest_l1e_t *gl1e, e; + shadow_l1e_t *sl1e; + mfn_t gl1mfn = _mfn(INVALID_MFN); + int f; + int done = 0; + + /* fl1 has no useful backpointer: all we can check are flags */ + e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */ + SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, { + f = shadow_l1e_get_flags(*sl1e); + f &= 
~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2); + if ( !(f == 0 + || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| + _PAGE_ACCESSED|_PAGE_DIRTY) + || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) ) + AUDIT_FAIL(1, "fl1e has bad flags"); + }); + return 0; +} + +int sh2_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x) +{ + guest_l2e_t *gl2e, *gp; + shadow_l2e_t *sl2e; + mfn_t mfn, gmfn, gl2mfn; + gfn_t gfn; + char *s; + int done = 0; +#if GUEST_PAGING_LEVELS != 4 + int xen_mappings = !shadow2_mode_external(v->domain); +#endif + + /* Follow the backpointer */ + gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info); + gl2e = gp = sh2_map_domain_page(gl2mfn); + SHADOW2_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, { + + s = sh2_audit_flags(v, 2, guest_l2e_get_flags(*gl2e), + shadow_l2e_get_flags(*sl2e)); + if ( s ) AUDIT_FAIL(2, "%s", s); + + if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l2e_get_gfn(*gl2e); + mfn = shadow_l2e_get_mfn(*sl2e); + gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) + ? get_fl1_shadow_status(v, gfn) + : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn), + PGC_SH2_l1_shadow); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(2, "bad translation: gfn %" SH2_PRI_gfn + " (--> %" SH2_PRI_mfn ")" + " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", + gfn_x(gfn), + (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0 + : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)), + mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh2_unmap_domain_page(gp); + return 0; +} + +#if GUEST_PAGING_LEVELS >= 3 +int sh2_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x) +{ + guest_l3e_t *gl3e, *gp; + shadow_l3e_t *sl3e; + mfn_t mfn, gmfn, gl3mfn; + gfn_t gfn; + char *s; + int done = 0; + + /* Follow the backpointer */ + gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info); + gl3e = gp = sh2_map_domain_page(gl3mfn); + SHADOW2_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, { + + s = sh2_audit_flags(v, 3, guest_l3e_get_flags(*gl3e), + shadow_l3e_get_flags(*sl3e)); + if ( s ) AUDIT_FAIL(3, "%s", s); + + if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l3e_get_gfn(*gl3e); + mfn = shadow_l3e_get_mfn(*sl3e); + gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn), + (GUEST_PAGING_LEVELS == 3 + && !shadow2_mode_external(v->domain) + && (guest_index(gl3e) % 4) == 3) + ? 
PGC_SH2_l2h_pae_shadow + : PGC_SH2_l2_shadow); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(3, "bad translation: gfn %" SH2_PRI_gfn + " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", + gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh2_unmap_domain_page(gp); + return 0; +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + +#if GUEST_PAGING_LEVELS >= 4 +int sh2_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x) +{ + guest_l4e_t *gl4e, *gp; + shadow_l4e_t *sl4e; + mfn_t mfn, gmfn, gl4mfn; + gfn_t gfn; + char *s; + int done = 0; + int xen_mappings = !shadow2_mode_external(v->domain); + + /* Follow the backpointer */ + gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info); + gl4e = gp = sh2_map_domain_page(gl4mfn); + SHADOW2_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings, + { + s = sh2_audit_flags(v, 4, guest_l4e_get_flags(*gl4e), + shadow_l4e_get_flags(*sl4e)); + if ( s ) AUDIT_FAIL(4, "%s", s); + + if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS ) + { + gfn = guest_l4e_get_gfn(*gl4e); + mfn = shadow_l4e_get_mfn(*sl4e); + gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn), + PGC_SH2_l3_shadow); + if ( mfn_x(gmfn) != mfn_x(mfn) ) + AUDIT_FAIL(4, "bad translation: gfn %" SH2_PRI_gfn + " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n", + gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); + } + }); + sh2_unmap_domain_page(gp); + return 0; +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#undef AUDIT_FAIL + +#endif /* Audit code */ + +/**************************************************************************/ +/* Entry points into this mode of the shadow code. + * This will all be mangled by the preprocessor to uniquify everything. */ +struct shadow2_entry_points shadow2_entry = { + .page_fault = sh2_page_fault, + .invlpg = sh2_invlpg, + .gva_to_gpa = sh2_gva_to_gpa, + .gva_to_gfn = sh2_gva_to_gfn, + .update_cr3 = sh2_update_cr3, + .map_and_validate_gl1e = sh2_map_and_validate_gl1e, + .map_and_validate_gl2e = sh2_map_and_validate_gl2e, + .map_and_validate_gl2he = sh2_map_and_validate_gl2he, + .map_and_validate_gl3e = sh2_map_and_validate_gl3e, + .map_and_validate_gl4e = sh2_map_and_validate_gl4e, + .detach_old_tables = sh2_detach_old_tables, + .x86_emulate_write = sh2_x86_emulate_write, + .x86_emulate_cmpxchg = sh2_x86_emulate_cmpxchg, + .x86_emulate_cmpxchg8b = sh2_x86_emulate_cmpxchg8b, + .make_monitor_table = sh2_make_monitor_table, + .destroy_monitor_table = sh2_destroy_monitor_table, +#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC + .guess_wrmap = sh2_guess_wrmap, +#endif + .guest_levels = GUEST_PAGING_LEVELS, + .shadow_levels = SHADOW_PAGING_LEVELS, +}; + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/shadow32.c b/xen/arch/x86/shadow32.c deleted file mode 100644 index 392669746e..0000000000 --- a/xen/arch/x86/shadow32.c +++ /dev/null @@ -1,3782 +0,0 @@ -/****************************************************************************** - * arch/x86/shadow.c - * - * Copyright (c) 2005 Michael A Fetterman - * Based on an earlier implementation by Ian Pratt et al - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned) -#define va_to_l1mfn(_ed, _va) \ - (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT])) - -static void shadow_free_snapshot(struct domain *d, - struct out_of_sync_entry *entry); -static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn); -static void free_writable_pte_predictions(struct domain *d); - -#if SHADOW_DEBUG -static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn); -#endif - -static int alloc_p2m_table(struct domain *d); -static void free_p2m_table(struct domain *d); - -/******** - -There's a per-domain shadow table spin lock which works fine for SMP -hosts. We don't have to worry about interrupts as no shadow operations -happen in an interrupt context. It's probably not quite ready for SMP -guest operation as we have to worry about synchonisation between gpte -and spte updates. Its possible that this might only happen in a -hypercall context, in which case we'll probably at have a per-domain -hypercall lock anyhow (at least initially). - -********/ - -static inline int -shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, - unsigned long new_type) -{ - struct page_info *page = mfn_to_page(gmfn); - int pinned = 0, okay = 1; - - if ( page_out_of_sync(page) ) - { - // Don't know how long ago this snapshot was taken. - // Can't trust it to be recent enough. - // - __shadow_sync_mfn(d, gmfn); - } - - if ( !shadow_mode_refcounts(d) ) - return 1; - - if ( unlikely(page_is_page_table(page)) ) - return 1; - - FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type); - - if ( !shadow_remove_all_write_access(d, gpfn, gmfn) ) - { - FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx", - __func__, gpfn, gmfn); -#if 1 || defined(LIVE_DANGEROUSLY) - set_bit(_PGC_page_table, &page->count_info); - return 1; -#endif - return 0; - - } - - // To convert this page to use as a page table, the writable count - // should now be zero. Test this by grabbing the page as an page table, - // and then immediately releasing. This will also deal with any - // necessary TLB flushing issues for us. - // - // The cruft here about pinning doesn't really work right. This - // needs rethinking/rewriting... Need to gracefully deal with the - // TLB flushes required when promoting a writable page, and also deal - // with any outstanding (external) writable refs to this page (by - // refusing to promote it). The pinning headache complicates this - // code -- it would all get much simpler if we stop using - // shadow_lock() and move the shadow code to BIGLOCK(). 
- // - if ( unlikely(!get_page(page, d)) ) - BUG(); // XXX -- needs more thought for a graceful failure - if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) ) - { - pinned = 1; - put_page_and_type(page); - } - if ( get_page_type(page, PGT_base_page_table) ) - { - set_bit(_PGC_page_table, &page->count_info); - put_page_type(page); - } - else - { - printk("shadow_promote: get_page_type failed " - "dom%d gpfn=%lx gmfn=%lx t=%08lx\n", - d->domain_id, gpfn, gmfn, new_type); - okay = 0; - } - - // Now put the type back to writable... - if ( unlikely(!get_page_type(page, PGT_writable_page)) ) - BUG(); // XXX -- needs more thought for a graceful failure - if ( unlikely(pinned) ) - { - if ( unlikely(test_and_set_bit(_PGT_pinned, - &page->u.inuse.type_info)) ) - BUG(); // hmm... someone pinned this again? - } - else - put_page_and_type(page); - - return okay; -} - -static inline void -shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn) -{ - if ( !shadow_mode_refcounts(d) ) - return; - - ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table); - - if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none ) - { - clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info); - - if ( page_out_of_sync(mfn_to_page(gmfn)) ) - { - remove_out_of_sync_entries(d, gmfn); - } - } -} - -/* - * Things in shadow mode that collect get_page() refs to the domain's - * pages are: - * - PGC_allocated takes a gen count, just like normal. - * - A writable page can be pinned (paravirtualized guests may consider - * these pages to be L1s or L2s, and don't know the difference). - * Pinning a page takes a gen count (but, for domains in shadow mode, - * it *doesn't* take a type count) - * - CR3 grabs a ref to whatever it points at, just like normal. - * - Shadow mode grabs an initial gen count for itself, as a placehold - * for whatever references will exist. - * - Shadow PTEs that point to a page take a gen count, just like regular - * PTEs. However, they don't get a type count, as get_page_type() is - * hardwired to keep writable pages' counts at 1 for domains in shadow - * mode. - * - Whenever we shadow a page, the entry in the shadow hash grabs a - * general ref to the page. - * - Whenever a page goes out of sync, the out of sync entry grabs a - * general ref to the page. - */ -/* - * page_info fields for pages allocated as shadow pages: - * - * All 32 bits of count_info are a simple count of refs to this shadow - * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table), - * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync - * references. - * - * u.inuse._domain is left NULL, to prevent accidently allow some random - * domain from gaining permissions to map this page. - * - * u.inuse.type_info & PGT_type_mask remembers what kind of page is being - * shadowed. - * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed. - * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow - * is currently exists because this is a shadow of a root page, and we - * don't want to let those disappear just because no CR3 is currently pointing - * at it. - * - * tlbflush_timestamp holds a min & max index of valid page table entries - * within the shadow page. 
- */ - -static inline unsigned long -alloc_shadow_page(struct domain *d, - unsigned long gpfn, unsigned long gmfn, - u32 psh_type) -{ - struct page_info *page; - unsigned long smfn; - int pin = 0; - void *l1; - - // Currently, we only keep pre-zero'ed pages around for use as L1's... - // This will change. Soon. - // - if ( psh_type == PGT_l1_shadow ) - { - if ( !list_empty(&d->arch.free_shadow_frames) ) - { - struct list_head *entry = d->arch.free_shadow_frames.next; - page = list_entry(entry, struct page_info, list); - list_del(entry); - perfc_decr(free_l1_pages); - } - else - { - page = alloc_domheap_page(NULL); - l1 = map_domain_page(page_to_mfn(page)); - memset(l1, 0, PAGE_SIZE); - unmap_domain_page(l1); - } - } - else - page = alloc_domheap_page(NULL); - - if ( unlikely(page == NULL) ) - { - printk("Couldn't alloc shadow page! dom%d count=%d\n", - d->domain_id, d->arch.shadow_page_count); - printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n", - perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages), - perfc_value(hl2_table_pages), - perfc_value(snapshot_pages)); - /* XXX FIXME: try a shadow flush to free up some memory. */ - domain_crash_synchronous(); - } - - smfn = page_to_mfn(page); - - ASSERT( (gmfn & ~PGT_mfn_mask) == 0 ); - page->u.inuse.type_info = psh_type | gmfn; - page->count_info = 0; - page->tlbflush_timestamp = 0; - - switch ( psh_type ) - { - case PGT_l1_shadow: - if ( !shadow_promote(d, gpfn, gmfn, psh_type) ) - goto fail; - perfc_incr(shadow_l1_pages); - d->arch.shadow_page_count++; - break; - - case PGT_l2_shadow: - if ( !shadow_promote(d, gpfn, gmfn, psh_type) ) - goto fail; - perfc_incr(shadow_l2_pages); - d->arch.shadow_page_count++; - if ( PGT_l2_page_table == PGT_root_page_table ) - pin = 1; - - break; - - case PGT_hl2_shadow: - // Treat an hl2 as an L1 for purposes of promotion. - // For external mode domains, treat them as an L2 for purposes of - // pinning. - // - if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) ) - goto fail; - perfc_incr(hl2_table_pages); - d->arch.hl2_page_count++; - if ( shadow_mode_external(d) && - (PGT_l2_page_table == PGT_root_page_table) ) - pin = 1; - - break; - - case PGT_snapshot: - perfc_incr(snapshot_pages); - d->arch.snapshot_page_count++; - break; - - default: - printk("Alloc shadow weird page type type=%08x\n", psh_type); - BUG(); - break; - } - - // Don't add a new shadow of something that already has a snapshot. - // - ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) ); - - set_shadow_status(d, gpfn, gmfn, smfn, psh_type, 0); - - if ( pin ) - shadow_pin(smfn); - - return smfn; - - fail: - FSH_LOG("promotion of pfn=%lx mfn=%lx failed! 
external gnttab refs?", - gpfn, gmfn); - free_domheap_page(page); - return 0; -} - -static void inline -free_shadow_l1_table(struct domain *d, unsigned long smfn) -{ - l1_pgentry_t *pl1e = map_domain_page(smfn); - int i; - struct page_info *spage = mfn_to_page(smfn); - u32 min_max = spage->tlbflush_timestamp; - int min = SHADOW_MIN(min_max); - int max = SHADOW_MAX(min_max); - - for ( i = min; i <= max; i++ ) - { - shadow_put_page_from_l1e(pl1e[i], d); - pl1e[i] = l1e_empty(); - } - - unmap_domain_page(pl1e); -} - -static void inline -free_shadow_hl2_table(struct domain *d, unsigned long smfn) -{ - l1_pgentry_t *hl2 = map_domain_page(smfn); - int i, limit; - - SH_VVLOG("%s: smfn=%lx freed", __func__, smfn); - - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - - for ( i = 0; i < limit; i++ ) - { - if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT ) - put_page(mfn_to_page(l1e_get_pfn(hl2[i]))); - } - - unmap_domain_page(hl2); -} - -static void inline -free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type) -{ - l2_pgentry_t *pl2e = map_domain_page(smfn); - int i, external = shadow_mode_external(d); - - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - if ( external || is_guest_l2_slot(type, i) ) - if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT ) - put_shadow_ref(l2e_get_pfn(pl2e[i])); - - if ( (PGT_base_page_table == PGT_l2_page_table) && - shadow_mode_translate(d) && !external ) - { - // free the ref to the hl2 - // - put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)])); - } - - unmap_domain_page(pl2e); -} - -void free_shadow_page(unsigned long smfn) -{ - struct page_info *page = mfn_to_page(smfn); - unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask; - struct domain *d = page_get_owner(mfn_to_page(gmfn)); - unsigned long gpfn = mfn_to_gmfn(d, gmfn); - unsigned long type = page->u.inuse.type_info & PGT_type_mask; - - SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn); - - ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); - - delete_shadow_status(d, gpfn, gmfn, type, 0); - - switch ( type ) - { - case PGT_l1_shadow: - perfc_decr(shadow_l1_pages); - shadow_demote(d, gpfn, gmfn); - free_shadow_l1_table(d, smfn); - d->arch.shadow_page_count--; - break; - - case PGT_l2_shadow: - perfc_decr(shadow_l2_pages); - shadow_demote(d, gpfn, gmfn); - free_shadow_l2_table(d, smfn, page->u.inuse.type_info); - d->arch.shadow_page_count--; - break; - - case PGT_hl2_shadow: - perfc_decr(hl2_table_pages); - shadow_demote(d, gpfn, gmfn); - free_shadow_hl2_table(d, smfn); - d->arch.hl2_page_count--; - break; - - case PGT_snapshot: - perfc_decr(snapshot_pages); - d->arch.snapshot_page_count--; - break; - - default: - printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n", - page_to_mfn(page), page->u.inuse.type_info); - break; - } - - // No TLB flushes are needed the next time this page gets allocated. 
- // - page->tlbflush_timestamp = 0; - page->u.free.cpumask = CPU_MASK_NONE; - - if ( type == PGT_l1_shadow ) - { - list_add(&page->list, &d->arch.free_shadow_frames); - perfc_incr(free_l1_pages); - } - else - free_domheap_page(page); -} - -void -remove_shadow(struct domain *d, unsigned long gpfn, u32 stype) -{ - unsigned long smfn; - - //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype); - - shadow_lock(d); - - while ( stype >= PGT_l1_shadow ) - { - smfn = __shadow_status(d, gpfn, stype); - if ( smfn && MFN_PINNED(smfn) ) - shadow_unpin(smfn); - stype -= PGT_l1_shadow; - } - - shadow_unlock(d); -} - -static void inline -release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry) -{ - struct page_info *page; - - page = mfn_to_page(entry->gmfn); - - // Decrement ref count of guest & shadow pages - // - put_page(page); - - // Only use entries that have low bits clear... - // - if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) - { - put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT); - entry->writable_pl1e = -2; - } - else - ASSERT( entry->writable_pl1e == -1 ); - - // Free the snapshot - // - shadow_free_snapshot(d, entry); -} - -static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn) -{ - struct out_of_sync_entry *entry = d->arch.out_of_sync; - struct out_of_sync_entry **prev = &d->arch.out_of_sync; - struct out_of_sync_entry *found = NULL; - - // NB: Be careful not to call something that manipulates this list - // while walking it. Collect the results into a separate list - // first, then walk that list. - // - while ( entry ) - { - if ( entry->gmfn == gmfn ) - { - // remove from out of sync list - *prev = entry->next; - - // add to found list - entry->next = found; - found = entry; - - entry = *prev; - continue; - } - prev = &entry->next; - entry = entry->next; - } - - prev = NULL; - entry = found; - while ( entry ) - { - release_out_of_sync_entry(d, entry); - - prev = &entry->next; - entry = entry->next; - } - - // Add found list to free list - if ( prev ) - { - *prev = d->arch.out_of_sync_free; - d->arch.out_of_sync_free = found; - } -} - -static void free_out_of_sync_state(struct domain *d) -{ - struct out_of_sync_entry *entry; - - // NB: Be careful not to call something that manipulates this list - // while walking it. Remove one item at a time, and always - // restart from start of list. - // - while ( (entry = d->arch.out_of_sync) ) - { - d->arch.out_of_sync = entry->next; - release_out_of_sync_entry(d, entry); - - entry->next = d->arch.out_of_sync_free; - d->arch.out_of_sync_free = entry; - } -} - -static void free_shadow_pages(struct domain *d) -{ - int i; - struct shadow_status *x; - struct vcpu *v; - struct list_head *list_ent, *tmp; - - /* - * WARNING! The shadow page table must not currently be in use! - * e.g., You are expected to have paused the domain and synchronized CR3. - */ - - if( !d->arch.shadow_ht ) return; - - shadow_audit(d, 1); - - // first, remove any outstanding refs from out_of_sync entries... - // - free_out_of_sync_state(d); - - // second, remove any outstanding refs from v->arch.shadow_table - // and CR3. 
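/*
 * Illustrative aside, not part of the patch: remove_out_of_sync_entries()
 * above walks a singly-linked list through a pointer-to-pointer so that a
 * matching node can be unlinked without special-casing the head, and parks
 * the matches on a private "found" list before doing any work that might
 * itself touch the original list.  A minimal standalone version of the idiom
 * (struct and function names are hypothetical):
 */
#include <stddef.h>

struct node { unsigned long key; struct node *next; };

/* Detach every node whose key matches and return them as a new list. */
struct node *detach_matching(struct node **head, unsigned long key)
{
    struct node **prev = head, *entry = *head, *found = NULL;

    while ( entry != NULL )
    {
        if ( entry->key == key )
        {
            *prev = entry->next;   /* unlink from the original list */
            entry->next = found;   /* push onto the found list */
            found = entry;
            entry = *prev;         /* resume at the successor */
            continue;
        }
        prev = &entry->next;
        entry = entry->next;
    }

    return found;
}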
- // - for_each_vcpu(d, v) - { - if ( pagetable_get_paddr(v->arch.shadow_table) ) - { - put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table)); - v->arch.shadow_table = pagetable_null(); - - if ( shadow_mode_external(d) ) - { - if ( v->arch.shadow_vtable ) - unmap_domain_page_global(v->arch.shadow_vtable); - v->arch.shadow_vtable = NULL; - } - } - - if ( v->arch.monitor_shadow_ref ) - { - put_shadow_ref(v->arch.monitor_shadow_ref); - v->arch.monitor_shadow_ref = 0; - } - } - - // For external shadows, remove the monitor table's refs - // - if ( shadow_mode_external(d) ) - { - for_each_vcpu(d, v) - { - l2_pgentry_t *mpl2e = v->arch.monitor_vtable; - - if ( mpl2e ) - { - l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; - l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; - - if ( l2e_get_flags(hl2e) & _PAGE_PRESENT ) - { - put_shadow_ref(l2e_get_pfn(hl2e)); - mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty(); - } - if ( l2e_get_flags(smfn) & _PAGE_PRESENT ) - { - put_shadow_ref(l2e_get_pfn(smfn)); - mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty(); - } - } - } - } - - // Now, the only refs to shadow pages that are left are from the shadow - // pages themselves. We just unpin the pinned pages, and the rest - // should automatically disappear. - // - // NB: Beware: each explicitly or implicit call to free_shadow_page - // can/will result in the hash bucket getting rewritten out from - // under us... First, collect the list of pinned pages, then - // free them. - // - // FIXME: it would be good to just free all the pages referred to in - // the hash table without going through each of them to decrement their - // reference counts. In shadow_mode_refcount(), we've gotta do the hard - // work, but only for L1 shadows. If we're not in refcount mode, then - // there's no real hard work to do at all. Need to be careful with the - // writable_pte_predictions and snapshot entries in the hash table, but - // that's about it. - // - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - u32 count; - unsigned long *mfn_list; - - /* Skip empty buckets. 
*/ - if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) - continue; - - count = 0; - - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) { - /* Skip entries that are writable_pred) */ - switch(x->gpfn_and_flags & PGT_type_mask){ - case PGT_l1_shadow: - case PGT_l2_shadow: - case PGT_l3_shadow: - case PGT_l4_shadow: - case PGT_hl2_shadow: - if ( MFN_PINNED(x->smfn) ) - count++; - break; - case PGT_snapshot: - case PGT_writable_pred: - break; - default: - BUG(); - - } - } - - if ( !count ) - continue; - - mfn_list = xmalloc_array(unsigned long, count); - count = 0; - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) { - /* Skip entries that are writable_pred) */ - switch(x->gpfn_and_flags & PGT_type_mask){ - case PGT_l1_shadow: - case PGT_l2_shadow: - case PGT_l3_shadow: - case PGT_l4_shadow: - case PGT_hl2_shadow: - if ( MFN_PINNED(x->smfn) ) - mfn_list[count++] = x->smfn; - break; - case PGT_snapshot: - case PGT_writable_pred: - break; - default: - BUG(); - - } - } - - while ( count ) - { - shadow_unpin(mfn_list[--count]); - } - xfree(mfn_list); - } - - /* Now free the pre-zero'ed pages from the domain */ - list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames) - { - struct page_info *page = list_entry(list_ent, struct page_info, list); - - list_del(list_ent); - perfc_decr(free_l1_pages); - - free_domheap_page(page); - } - - shadow_audit(d, 0); - - SH_VLOG("Free shadow table."); -} - -void shadow_mode_init(void) -{ -} - -int _shadow_mode_refcounts(struct domain *d) -{ - return shadow_mode_refcounts(d); -} - -static void alloc_monitor_pagetable(struct vcpu *v) -{ - unsigned long mmfn; - l2_pgentry_t *mpl2e; - struct page_info *mmfn_info; - struct domain *d = v->domain; - int i; - - ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0); - - mmfn_info = alloc_domheap_page(NULL); - ASSERT(mmfn_info != NULL); - - mmfn = page_to_mfn(mmfn_info); - mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn); - memset(mpl2e, 0, PAGE_SIZE); - - memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = - l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i, - __PAGE_HYPERVISOR); - - // Don't (yet) have mappings for these... - // Don't want to accidentally see the idle_pg_table's linear mapping. 
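/*
 * Illustrative aside, not part of the patch: the bucket walk in
 * free_shadow_pages() above is deliberately split into a counting pass, a
 * collection pass into a private array, and only then the unpin loop,
 * because each unpin may free a shadow and rewrite the very hash chain being
 * walked.  A compact standalone model of that pattern (names hypothetical;
 * 'unpin' stands in for any operation that may mutate the chain):
 */
#include <stdlib.h>

struct hash_entry { int pinned; unsigned long smfn; struct hash_entry *next; };

void drain_pinned(struct hash_entry *bucket, void (*unpin)(unsigned long))
{
    struct hash_entry *x;
    unsigned long *mfn_list;
    unsigned int count = 0;

    for ( x = bucket; x != NULL; x = x->next )     /* pass 1: count */
        if ( x->pinned )
            count++;

    if ( count == 0 )
        return;

    mfn_list = malloc(count * sizeof(*mfn_list));
    if ( mfn_list == NULL )
        return;

    count = 0;
    for ( x = bucket; x != NULL; x = x->next )     /* pass 2: collect */
        if ( x->pinned )
            mfn_list[count++] = x->smfn;

    while ( count )                                /* pass 3: act */
        unpin(mfn_list[--count]);

    free(mfn_list);
}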
- // - mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty(); - mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty(); - mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty(); - - v->arch.monitor_table = pagetable_from_pfn(mmfn); - v->arch.monitor_vtable = mpl2e; - - if ( v->vcpu_id == 0 ) - alloc_p2m_table(d); - else - { - unsigned long mfn; - - mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - if ( mfn ) - { - l2_pgentry_t *l2tab; - - l2tab = map_domain_page(mfn); - - mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = - l2tab[l2_table_offset(RO_MPT_VIRT_START)]; - - unmap_domain_page(l2tab); - } - } -} - -/* - * Free the pages for monitor_table and hl2_table - */ -void free_monitor_pagetable(struct vcpu *v) -{ - l2_pgentry_t *mpl2e, hl2e, sl2e; - unsigned long mfn; - - ASSERT( pagetable_get_paddr(v->arch.monitor_table) ); - - mpl2e = v->arch.monitor_vtable; - - /* - * First get the mfn for hl2_table by looking at monitor_table - */ - hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; - if ( l2e_get_flags(hl2e) & _PAGE_PRESENT ) - { - mfn = l2e_get_pfn(hl2e); - ASSERT(mfn); - put_shadow_ref(mfn); - } - - sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; - if ( l2e_get_flags(sl2e) & _PAGE_PRESENT ) - { - mfn = l2e_get_pfn(sl2e); - ASSERT(mfn); - put_shadow_ref(mfn); - } - - if ( v->vcpu_id == 0 ) - free_p2m_table(v->domain); - - /* - * Then free monitor_table. - */ - mfn = pagetable_get_pfn(v->arch.monitor_table); - unmap_domain_page_global(v->arch.monitor_vtable); - free_domheap_page(mfn_to_page(mfn)); - - v->arch.monitor_table = pagetable_null(); - v->arch.monitor_vtable = 0; -} - -static int -map_p2m_entry(l1_pgentry_t *l1tab, unsigned long gpfn, unsigned long mfn) -{ - unsigned long *l0tab = NULL; - l1_pgentry_t l1e = { 0 }; - struct page_info *page; - unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn)); - - l1e = l1tab[l1_table_offset(va)]; - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - return 0; - - l0tab = map_domain_page(page_to_mfn(page)); - memset(l0tab, 0, PAGE_SIZE); - - l1e = l1tab[l1_table_offset(va)] = - l1e_from_page(page, __PAGE_HYPERVISOR); - } - else - l0tab = map_domain_page(l1e_get_pfn(l1e)); - - l0tab[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn; - - unmap_domain_page(l0tab); - - return 1; -} - -int -set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn, - struct domain_mmap_cache *l2cache, - struct domain_mmap_cache *l1cache) -{ - unsigned long tabpfn; - l2_pgentry_t *l2, l2e; - l1_pgentry_t *l1; - struct page_info *l1page; - unsigned long va = pfn << PAGE_SHIFT; - - if ( shadow_mode_external(d) ) - tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - else - tabpfn = pagetable_get_pfn(d->arch.phys_table); - - ASSERT(tabpfn != 0); - ASSERT(shadow_lock_is_acquired(d)); - - l2 = map_domain_page_with_cache(tabpfn, l2cache); - - /* - * The following code covers (SHM_translate | SHM_external) mode. - */ - - if ( shadow_mode_external(d) ) - { - int error; - l1_pgentry_t *l1tab = NULL; - l2_pgentry_t l2e; - - l2e = l2[l2_table_offset(RO_MPT_VIRT_START)]; - - ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT ); - - l1tab = map_domain_page(l2e_get_pfn(l2e)); - if ( !(error = map_p2m_entry(l1tab, pfn, mfn)) ) - domain_crash(d); - - unmap_domain_page(l1tab); - unmap_domain_page_with_cache(l2, l2cache); - - return error; - } - - /* - * The following code covers SHM_translate mode. 
- */ - ASSERT(shadow_mode_translate(d)); - - l2e = l2[l2_table_offset(va)]; - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - { - l1page = alloc_domheap_page(NULL); - if ( !l1page ) - { - unmap_domain_page_with_cache(l2, l2cache); - return 0; - } - - l1 = map_domain_page_with_cache(page_to_mfn(l1page), l1cache); - /* Initialise entries to INVALID_MFN = ~0 */ - memset(l1, -1, PAGE_SIZE); - unmap_domain_page_with_cache(l1, l1cache); - - l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR); - l2[l2_table_offset(va)] = l2e; - } - unmap_domain_page_with_cache(l2, l2cache); - - l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache); - l1[l1_table_offset(va)] = (l1_pgentry_t) { mfn }; - unmap_domain_page_with_cache(l1, l1cache); - - return 1; -} - -static int -alloc_p2m_table(struct domain *d) -{ - struct list_head *list_ent; - - l2_pgentry_t *l2tab = NULL; - l1_pgentry_t *l1tab = NULL; - l2_pgentry_t l2e = { 0 }; - struct page_info *page; - unsigned long gpfn, mfn; - int error = 0; - - if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ) - { - l2tab = map_domain_page( - pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); - l2e = l2tab[l2_table_offset(RO_MPT_VIRT_START)]; - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - - l1tab = map_domain_page(page_to_mfn(page)); - memset(l1tab, 0, PAGE_SIZE); - l2e = l2tab[l2_table_offset(RO_MPT_VIRT_START)] = - l2e_from_page(page, __PAGE_HYPERVISOR); - } - else - l1tab = map_domain_page(l2e_get_pfn(l2e)); - - if ( l2tab ) - unmap_domain_page(l2tab); - } - else - { - page = alloc_domheap_page(NULL); - if (!page) - { - printk("Alloc p2m table fail\n"); - domain_crash(d); - } - - l1tab = map_domain_page(page_to_mfn(page)); - memset(l1tab, 0, PAGE_SIZE); - d->arch.phys_table = pagetable_from_page(page); - } - - list_ent = d->page_list.next; - - while ( list_ent != &d->page_list ) - { - page = list_entry(list_ent, struct page_info, list); - mfn = page_to_mfn(page); - - gpfn = get_gpfn_from_mfn(mfn); - - if ( !(error = map_p2m_entry(l1tab, gpfn, mfn)) ) - { - domain_crash(d); - break; - } - - list_ent = page->list.next; - } - - unmap_domain_page(l1tab); - - return error; -} - -static void -free_p2m_table(struct domain *d) -{ - unsigned long va; - l2_pgentry_t *l2tab; - l1_pgentry_t *l1tab; - l2_pgentry_t l2e; - l1_pgentry_t l1e; - - ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ); - - l2tab = map_domain_page( - pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); - - for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; ) - { - int i; - - l2e = l2tab[l2_table_offset(va)]; - if ( l2e_get_flags(l2e) & _PAGE_PRESENT ) - { - l1tab = map_domain_page(l2e_get_pfn(l2e)); - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++) - { - l1e = l1tab[l1_table_offset(va)]; - - if ( l1e_get_flags(l1e) & _PAGE_PRESENT ) - free_domheap_page(mfn_to_page(l1e_get_pfn(l1e))); - va += PAGE_SIZE; - } - unmap_domain_page(l1tab); - free_domheap_page(mfn_to_page(l2e_get_pfn(l2e))); - } - else - va += PAGE_SIZE * L1_PAGETABLE_ENTRIES; - } - unmap_domain_page(l2tab); -} - -int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - l2_pgentry_t sl2e; - l1_pgentry_t sl1e; - l1_pgentry_t *sple = NULL; - unsigned long mfn, smfn; - struct page_info *page; - - /* - * If the faulting address is within the MMIO range, we continue - * on handling the #PF as such. 
- */ - if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN ) - return 0; - - shadow_lock(d); - - __direct_get_l2e(v, vpa, &sl2e); - - if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - goto nomem; - - smfn = page_to_mfn(page); - sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER); - - sple = (l1_pgentry_t *)map_domain_page(smfn); - memset(sple, 0, PAGE_SIZE); - __direct_set_l2e(v, vpa, sl2e); - } - - if ( !sple ) - sple = (l1_pgentry_t *)map_domain_page(l2e_get_pfn(sl2e)); - - sl1e = sple[l1_table_offset(vpa)]; - - if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) ) - { - sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER); - sple[l1_table_offset(vpa)] = sl1e; - } - - if (sple) - unmap_domain_page(sple); - - shadow_unlock(d); - return EXCRET_fault_fixed; - -nomem: - shadow_direct_map_clean(d); - domain_crash_synchronous(); -} - - -int shadow_direct_map_init(struct domain *d) -{ - struct page_info *page; - l2_pgentry_t *root; - - if ( !(page = alloc_domheap_page(NULL)) ) - return 0; - - root = map_domain_page(page_to_mfn(page)); - memset(root, 0, PAGE_SIZE); - unmap_domain_page(root); - - d->arch.phys_table = pagetable_from_page(page); - - return 1; -} - -void shadow_direct_map_clean(struct domain *d) -{ - int i; - unsigned long mfn; - l2_pgentry_t *l2e; - - mfn = pagetable_get_pfn(d->arch.phys_table); - - /* - * We may fail very early before direct map is built. - */ - if ( !mfn ) - return; - - l2e = map_domain_page(mfn); - - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - { - if ( l2e_get_flags(l2e[i]) & _PAGE_PRESENT ) - free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[i]))); - } - free_domheap_page(mfn_to_page(mfn)); - - unmap_domain_page(l2e); - - d->arch.phys_table = pagetable_null(); -} - -int __shadow_mode_enable(struct domain *d, unsigned int mode) -{ - struct vcpu *v; - int new_modes = (mode & ~d->arch.shadow_mode); - - if(!new_modes) /* Nothing to do - return success */ - return 0; - - // can't take anything away by calling this function. - ASSERT(!(d->arch.shadow_mode & ~mode)); - - for_each_vcpu(d, v) - { - invalidate_shadow_ldt(v); - - // We need to set these up for __update_pagetables(). - // See the comment there. 
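/*
 * Illustrative aside, not part of the patch: shadow_direct_map_fault() above
 * builds the direct-map tables lazily -- when the L2 slot for the faulting
 * address is empty it allocates and zeroes a fresh L1 page, links it in, and
 * only then installs the missing L1 entry.  A user-space model of that
 * allocate-on-first-touch walk (array-backed; sizes and names hypothetical):
 */
#include <stdint.h>
#include <stdlib.h>

#define TABLE_ENTRIES 1024u

/* Map 'addr' to 'value', creating the second-level table on demand.
 * Returns 0 on success, -1 if the allocation failed. */
int demand_map(uint64_t **l2, uint32_t addr, uint64_t value)
{
    uint32_t l2_idx = (addr >> 22) & (TABLE_ENTRIES - 1);
    uint32_t l1_idx = (addr >> 12) & (TABLE_ENTRIES - 1);
    uint64_t *l1 = l2[l2_idx];

    if ( l1 == NULL )
    {
        /* Analogue of the alloc_domheap_page() + memset() in the fault path. */
        l1 = calloc(TABLE_ENTRIES, sizeof(*l1));
        if ( l1 == NULL )
            return -1;
        l2[l2_idx] = l1;
    }

    if ( l1[l1_idx] == 0 )       /* only fill an empty slot */
        l1[l1_idx] = value;

    return 0;
}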
- - /* - * arch.guest_vtable - */ - if ( v->arch.guest_vtable && - (v->arch.guest_vtable != __linear_l2_table) ) - { - unmap_domain_page_global(v->arch.guest_vtable); - } - if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) - v->arch.guest_vtable = __linear_l2_table; - else - v->arch.guest_vtable = NULL; - - /* - * arch.shadow_vtable - */ - if ( v->arch.shadow_vtable && - (v->arch.shadow_vtable != __shadow_linear_l2_table) ) - { - unmap_domain_page_global(v->arch.shadow_vtable); - } - if ( !(mode & SHM_external) ) - v->arch.shadow_vtable = __shadow_linear_l2_table; - else - v->arch.shadow_vtable = NULL; - - /* - * arch.hl2_vtable - */ - if ( v->arch.hl2_vtable && - (v->arch.hl2_vtable != __linear_hl2_table) ) - { - unmap_domain_page_global(v->arch.hl2_vtable); - } - if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) - v->arch.hl2_vtable = __linear_hl2_table; - else - v->arch.hl2_vtable = NULL; - - /* - * arch.monitor_table & arch.monitor_vtable - */ - if ( v->arch.monitor_vtable ) - { - free_monitor_pagetable(v); - } - if ( mode & SHM_external ) - { - alloc_monitor_pagetable(v); - } - } - - if ( new_modes & SHM_enable ) - { - ASSERT( !d->arch.shadow_ht ); - d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); - if ( d->arch.shadow_ht == NULL ) - goto nomem; - - memset(d->arch.shadow_ht, 0, - shadow_ht_buckets * sizeof(struct shadow_status)); - } - - if ( new_modes & SHM_log_dirty ) - { - ASSERT( !d->arch.shadow_dirty_bitmap ); - d->arch.shadow_dirty_bitmap_size = - (d->shared_info->arch.max_pfn + 63) & ~63; - d->arch.shadow_dirty_bitmap = - xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size / - (8 * sizeof(unsigned long))); - if ( d->arch.shadow_dirty_bitmap == NULL ) - { - d->arch.shadow_dirty_bitmap_size = 0; - goto nomem; - } - memset(d->arch.shadow_dirty_bitmap, 0, - d->arch.shadow_dirty_bitmap_size/8); - } - - if ( new_modes & SHM_translate ) - { - if ( !(new_modes & SHM_external) ) - { - ASSERT( !pagetable_get_paddr(d->arch.phys_table) ); - if ( !alloc_p2m_table(d) ) - { - printk("alloc_p2m_table failed (out-of-memory?)\n"); - goto nomem; - } - } - } - - // Get rid of any shadow pages from any previous shadow mode. - // - free_shadow_pages(d); - - d->arch.shadow_mode = mode; - - if ( shadow_mode_refcounts(d) ) - { - struct list_head *list_ent; - struct page_info *page; - - /* - * Tear down its counts by disassembling its page-table-based refcounts - * Also remove CR3's gcount/tcount. - * That leaves things like GDTs and LDTs and external refs in tact. - * - * Most pages will be writable tcount=0. - * Some will still be L1 tcount=0 or L2 tcount=0. - * Maybe some pages will be type none tcount=0. - * Pages granted external writable refs (via grant tables?) will - * still have a non-zero tcount. That's OK. - * - * gcounts will generally be 1 for PGC_allocated. - * GDTs and LDTs will have additional gcounts. - * Any grant-table based refs will still be in the gcount. - * - * We attempt to grab writable refs to each page thus setting its type - * Immediately put back those type refs. - * - * Assert that no pages are left with L1/L2/L3/L4 type. - */ - audit_adjust_pgtables(d, -1, 1); - - - for (list_ent = d->page_list.next; list_ent != &d->page_list; - list_ent = page->list.next) { - - page = list_entry(list_ent, struct page_info, list); - - if ( !get_page_type(page, PGT_writable_page) ) - BUG(); - put_page_type(page); - /* - * We use tlbflush_timestamp as back pointer to smfn, and need to - * clean up it. 
- */ - if (shadow_mode_external(d)) - page->tlbflush_timestamp = 0; - } - - audit_adjust_pgtables(d, 1, 1); - - } - - return 0; - - nomem: - if ( (new_modes & SHM_enable) ) - { - xfree(d->arch.shadow_ht); - d->arch.shadow_ht = NULL; - } - if ( (new_modes & SHM_log_dirty) ) - { - xfree(d->arch.shadow_dirty_bitmap); - d->arch.shadow_dirty_bitmap = NULL; - } - - return -ENOMEM; -} - -int shadow_mode_enable(struct domain *d, unsigned int mode) -{ - int rc; - shadow_lock(d); - rc = __shadow_mode_enable(d, mode); - shadow_unlock(d); - return rc; -} - -static void -translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn) -{ - int i; - l1_pgentry_t *l1; - - l1 = map_domain_page(l1mfn); - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) - { - if ( is_guest_l1_slot(i) && - (l1e_get_flags(l1[i]) & _PAGE_PRESENT) ) - { - unsigned long mfn = l1e_get_pfn(l1[i]); - unsigned long gpfn = mfn_to_gmfn(d, mfn); - ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); - l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i])); - } - } - unmap_domain_page(l1); -} - -// This is not general enough to handle arbitrary pagetables -// with shared L1 pages, etc., but it is sufficient for bringing -// up dom0. -// -void -translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn, - unsigned int type) -{ - int i; - l2_pgentry_t *l2; - - ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d)); - - l2 = map_domain_page(l2mfn); - for (i = 0; i < L2_PAGETABLE_ENTRIES; i++) - { - if ( is_guest_l2_slot(type, i) && - (l2e_get_flags(l2[i]) & _PAGE_PRESENT) ) - { - unsigned long mfn = l2e_get_pfn(l2[i]); - unsigned long gpfn = mfn_to_gmfn(d, mfn); - ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); - l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i])); - translate_l1pgtable(d, p2m, mfn); - } - } - unmap_domain_page(l2); -} - -static void free_shadow_ht_entries(struct domain *d) -{ - struct shadow_status *x, *n; - - SH_VLOG("freed tables count=%d l1=%d l2=%d", - d->arch.shadow_page_count, perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages)); - - n = d->arch.shadow_ht_extras; - while ( (x = n) != NULL ) - { - d->arch.shadow_extras_count--; - n = *((struct shadow_status **)(&x[shadow_ht_extra_size])); - xfree(x); - } - - d->arch.shadow_ht_extras = NULL; - d->arch.shadow_ht_free = NULL; - - ASSERT(d->arch.shadow_extras_count == 0); - SH_VLOG("freed extras, now %d", d->arch.shadow_extras_count); - - if ( d->arch.shadow_dirty_bitmap != NULL ) - { - xfree(d->arch.shadow_dirty_bitmap); - d->arch.shadow_dirty_bitmap = 0; - d->arch.shadow_dirty_bitmap_size = 0; - } - - xfree(d->arch.shadow_ht); - d->arch.shadow_ht = NULL; -} - -static void free_out_of_sync_entries(struct domain *d) -{ - struct out_of_sync_entry *x, *n; - - n = d->arch.out_of_sync_extras; - while ( (x = n) != NULL ) - { - d->arch.out_of_sync_extras_count--; - n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size])); - xfree(x); - } - - d->arch.out_of_sync_extras = NULL; - d->arch.out_of_sync_free = NULL; - d->arch.out_of_sync = NULL; - - ASSERT(d->arch.out_of_sync_extras_count == 0); - FSH_LOG("freed extra out_of_sync entries, now %d", - d->arch.out_of_sync_extras_count); -} - -void __shadow_mode_disable(struct domain *d) -{ - struct vcpu *v; -#ifndef NDEBUG - int i; -#endif - - if ( unlikely(!shadow_mode_enabled(d)) ) - return; - - free_shadow_pages(d); - free_writable_pte_predictions(d); - -#ifndef NDEBUG - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 ) - { - printk("%s: 
d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n", - __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags); - BUG(); - } - } -#endif - - d->arch.shadow_mode = 0; - - free_shadow_ht_entries(d); - free_out_of_sync_entries(d); - - for_each_vcpu(d, v) - update_pagetables(v); -} - -static int shadow_mode_table_op( - struct domain *d, dom0_shadow_control_t *sc) -{ - unsigned int op = sc->op; - int i, rc = 0; - struct vcpu *v; - - ASSERT(shadow_lock_is_acquired(d)); - - SH_VLOG("shadow mode table op %lx %lx count %d", - (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */ - (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */ - d->arch.shadow_page_count); - - shadow_audit(d, 1); - - switch ( op ) - { - case DOM0_SHADOW_CONTROL_OP_FLUSH: - free_shadow_pages(d); - - d->arch.shadow_fault_count = 0; - d->arch.shadow_dirty_count = 0; - - break; - - case DOM0_SHADOW_CONTROL_OP_CLEAN: - free_shadow_pages(d); - - sc->stats.fault_count = d->arch.shadow_fault_count; - sc->stats.dirty_count = d->arch.shadow_dirty_count; - - d->arch.shadow_fault_count = 0; - d->arch.shadow_dirty_count = 0; - - if ( guest_handle_is_null(sc->dirty_bitmap) || - (d->arch.shadow_dirty_bitmap == NULL) ) - { - rc = -EINVAL; - break; - } - - if ( sc->pages > d->arch.shadow_dirty_bitmap_size ) - sc->pages = d->arch.shadow_dirty_bitmap_size; - -#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ - for ( i = 0; i < sc->pages; i += chunk ) - { - int bytes = ((((sc->pages - i) > chunk) ? - chunk : (sc->pages - i)) + 7) / 8; - - if ( copy_to_guest_offset( - sc->dirty_bitmap, i/(8*sizeof(unsigned long)), - d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), - (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) ) - { - rc = -EINVAL; - break; - } - - memset( - d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), - 0, bytes); - } - - break; - - case DOM0_SHADOW_CONTROL_OP_PEEK: - sc->stats.fault_count = d->arch.shadow_fault_count; - sc->stats.dirty_count = d->arch.shadow_dirty_count; - - if ( guest_handle_is_null(sc->dirty_bitmap) || - (d->arch.shadow_dirty_bitmap == NULL) ) - { - rc = -EINVAL; - break; - } - - if ( sc->pages > d->arch.shadow_dirty_bitmap_size ) - sc->pages = d->arch.shadow_dirty_bitmap_size; - - if ( copy_to_guest(sc->dirty_bitmap, - d->arch.shadow_dirty_bitmap, - (((sc->pages+7)/8)+sizeof(unsigned long)-1) / - sizeof(unsigned long)) ) - { - rc = -EINVAL; - break; - } - - break; - - default: - rc = -EINVAL; - break; - } - - SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count); - shadow_audit(d, 1); - - for_each_vcpu(d,v) - __update_pagetables(v); - - return rc; -} - -int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) -{ - unsigned int op = sc->op; - int rc = 0; - struct vcpu *v; - - if ( unlikely(d == current->domain) ) - { - DPRINTK("Don't try to do a shadow op on yourself!\n"); - return -EINVAL; - } - - domain_pause(d); - - shadow_lock(d); - - switch ( op ) - { - case DOM0_SHADOW_CONTROL_OP_OFF: - if ( shadow_mode_enabled(d) ) - { - __shadow_sync_all(d); - __shadow_mode_disable(d); - } - break; - - case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: - free_shadow_pages(d); - rc = __shadow_mode_enable(d, SHM_enable); - break; - - case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: - free_shadow_pages(d); - rc = __shadow_mode_enable( - d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty); - break; - - case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE: - free_shadow_pages(d); - rc = __shadow_mode_enable( - d, 
d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate|SHM_wr_pt_pte); - break; - - default: - rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL; - break; - } - - shadow_unlock(d); - - for_each_vcpu(d,v) - update_pagetables(v); - - domain_unpause(d); - - return rc; -} - -unsigned long -get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn) -{ - unsigned long va, tabpfn; - l1_pgentry_t *l1, l1e; - l2_pgentry_t *l2, l2e; - - ASSERT(shadow_mode_translate(d)); - - perfc_incrc(get_mfn_from_gpfn_foreign); - - if ( shadow_mode_external(d) ) - { - unsigned long mfn; - unsigned long *l0; - - va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn)); - - tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - if ( !tabpfn ) - return INVALID_MFN; - - l2 = map_domain_page(tabpfn); - l2e = l2[l2_table_offset(va)]; - unmap_domain_page(l2); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - return INVALID_MFN; - - l1 = map_domain_page(l2e_get_pfn(l2e)); - l1e = l1[l1_table_offset(va)]; - unmap_domain_page(l1); - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) - return INVALID_MFN; - - l0 = map_domain_page(l1e_get_pfn(l1e)); - mfn = l0[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)]; - unmap_domain_page(l0); - return mfn; - } - else - { - va = gpfn << PAGE_SHIFT; - tabpfn = pagetable_get_pfn(d->arch.phys_table); - l2 = map_domain_page(tabpfn); - l2e = l2[l2_table_offset(va)]; - unmap_domain_page(l2); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - { -#if 0 - printk("%s(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n", - __func__, d->domain_id, gpfn, l2e_get_intpte(l2e)); -#endif - return INVALID_MFN; - } - l1 = map_domain_page(l2e_get_pfn(l2e)); - l1e = l1[l1_table_offset(va)]; - unmap_domain_page(l1); -#if 0 - printk("%s(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n", - __func__, d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e); -#endif - - return l1e_get_intpte(l1e); - } - -} - -static unsigned long -shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, - unsigned long smfn) -{ - unsigned long hl2mfn; - l1_pgentry_t *hl2; - l2_pgentry_t *gpgd; - int limit; - int x; - - ASSERT(PGT_base_page_table == PGT_l2_page_table); - - if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) ) - { - printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n", - gpfn, gmfn); - BUG(); /* XXX Deal gracefully with failure. */ - } - - SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx", - gpfn, gmfn, smfn, hl2mfn); - perfc_incrc(shadow_hl2_table_count); - - hl2 = map_domain_page(hl2mfn); - - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - - memset(hl2, 0, limit * sizeof(l1_pgentry_t)); - - if ( !shadow_mode_external(d) ) - { - memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0, - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - - // Setup easy access to the GL2, SL2, and HL2 frames. 
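/*
 * Illustrative aside, not part of the patch: the external-mode branch of
 * get_mfn_from_gpfn_foreign() above resolves gpfn -> mfn by walking the
 * monitor table -- an L2 slot, then an L1 slot, then a leaf page that is
 * simply an array of machine frame numbers -- bailing out with INVALID_MFN
 * if any intermediate entry is absent.  A standalone model of that walk
 * (types, sizes and names are hypothetical):
 */
#include <stddef.h>

#define DEMO_INVALID_MFN  (~0ul)
#define LEAF_ENTRIES      1024u

struct leaf { unsigned long mfn[LEAF_ENTRIES]; };

/* l2[] holds pointers to L1 pages; each L1 page holds pointers to leaves.
 * The caller guarantees gpfn is within the range covered by l2[]. */
unsigned long lookup_mfn(struct leaf ***l2, unsigned long gpfn)
{
    unsigned long leaf_idx = gpfn % LEAF_ENTRIES;
    unsigned long l1_idx   = (gpfn / LEAF_ENTRIES) % LEAF_ENTRIES;
    unsigned long l2_idx   = gpfn / (LEAF_ENTRIES * LEAF_ENTRIES);
    struct leaf **l1;
    struct leaf *leaf;

    if ( (l1 = l2[l2_idx]) == NULL )      /* L2 entry not present */
        return DEMO_INVALID_MFN;
    if ( (leaf = l1[l1_idx]) == NULL )    /* L1 entry not present */
        return DEMO_INVALID_MFN;

    return leaf->mfn[leaf_idx];
}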
- // - hl2[l2_table_offset(LINEAR_PT_VIRT_START)] = - l1e_from_pfn(gmfn, __PAGE_HYPERVISOR); - hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - l1e_from_pfn(smfn, __PAGE_HYPERVISOR); - hl2[l2_table_offset(PERDOMAIN_VIRT_START)] = - l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); - } - - gpgd = map_domain_page(gmfn); - for (x = 0; x < DOMAIN_ENTRIES_PER_L2_PAGETABLE; x++) - validate_hl2e_change(d, gpgd[x], &hl2[x]); - unmap_domain_page(gpgd); - - unmap_domain_page(hl2); - - return hl2mfn; -} - -/* - * This could take and use a snapshot, and validate the entire page at - * once, or it could continue to fault in entries one at a time... - * Might be worth investigating... - */ -static unsigned long shadow_l2_table( - struct domain *d, unsigned long gpfn, unsigned long gmfn) -{ - unsigned long smfn; - l2_pgentry_t *spl2e; - int i; - - SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn); - - perfc_incrc(shadow_l2_table_count); - - if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) ) - { - printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n", - gpfn, gmfn); - BUG(); /* XXX Deal gracefully with failure. */ - } - - spl2e = (l2_pgentry_t *)map_domain_page(smfn); - - /* Install hypervisor and 2x linear p.t. mapings. */ - if ( (PGT_base_page_table == PGT_l2_page_table) && - !shadow_mode_external(d) ) - { - /* - * We could proactively fill in PDEs for pages that are already - * shadowed *and* where the guest PDE has _PAGE_ACCESSED set - * (restriction required for coherence of the accessed bit). However, - * we tried it and it didn't help performance. This is simpler. - */ - memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t)); - - /* Install hypervisor and 2x linear p.t. mapings. */ - memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - - spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = - l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))-> - arch.mm_perdomain_pt) + i, - __PAGE_HYPERVISOR); - - if ( shadow_mode_translate(d) ) // NB: not external - { - unsigned long hl2mfn; - - ASSERT(pagetable_get_paddr(d->arch.phys_table)); - spl2e[l2_table_offset(RO_MPT_VIRT_START)] = - l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table), - __PAGE_HYPERVISOR); - - if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) - hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); - - // shadow_mode_translate (but not external) sl2 tables hold a - // ref to their hl2. 
- // - if ( !get_shadow_ref(hl2mfn) ) - BUG(); - - spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = - l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); - } - else - spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = - l2e_from_pfn(gmfn, __PAGE_HYPERVISOR); - } - else - { - memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t)); - } - - unmap_domain_page(spl2e); - - SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn); - return smfn; -} - -void shadow_map_l1_into_current_l2(unsigned long va) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - l1_pgentry_t *gpl1e, *spl1e; - l2_pgentry_t gl2e, sl2e; - unsigned long gl1pfn, gl1mfn, sl1mfn; - int i, init_table = 0; - - __guest_get_l2e(v, va, &gl2e); - ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT); - gl1pfn = l2e_get_pfn(gl2e); - - if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) ) - { - /* This L1 is NOT already shadowed so we need to shadow it. */ - SH_VVLOG("4a: l1 not shadowed"); - - gl1mfn = gmfn_to_mfn(d, gl1pfn); - if ( unlikely(!VALID_MFN(gl1mfn)) ) - { - // Attempt to use an invalid pfn as an L1 page. - // XXX this needs to be more graceful! - BUG(); - } - - if ( unlikely(!(sl1mfn = - alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) ) - { - printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n", - gl1pfn, gl1mfn); - BUG(); /* XXX Need to deal gracefully with failure. */ - } - - perfc_incrc(shadow_l1_table_count); - init_table = 1; - } - else - { - /* This L1 is shadowed already, but the L2 entry is missing. */ - SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn); - } - -#ifndef NDEBUG - { - l2_pgentry_t old_sl2e; - __shadow_get_l2e(v, va, &old_sl2e); - ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) ); - } -#endif - - if ( !get_shadow_ref(sl1mfn) ) - BUG(); - l2pde_general(d, &gl2e, &sl2e, sl1mfn); - __guest_set_l2e(v, va, gl2e); - __shadow_set_l2e(v, va, sl2e); - - if ( init_table ) - { - l1_pgentry_t sl1e; - int index = l1_table_offset(va); - int min = 1, max = 0; - - gpl1e = &(linear_pg_table[l1_linear_offset(va) & - ~(L1_PAGETABLE_ENTRIES-1)]); - - spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) & - ~(L1_PAGETABLE_ENTRIES-1)]); - - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - { - l1pte_propagate_from_guest(d, gpl1e[i], &sl1e); - if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) && - unlikely(!shadow_get_page_from_l1e(sl1e, d)) ) - sl1e = l1e_empty(); - if ( l1e_get_flags(sl1e) == 0 ) - { - // First copy entries from 0 until first invalid. - // Then copy entries from index until first invalid. - // - if ( i < index ) { - i = index - 1; - continue; - } - break; - } - spl1e[i] = sl1e; - if ( unlikely(i < min) ) - min = i; - if ( likely(i > max) ) - max = i; - set_guest_back_ptr(d, sl1e, sl1mfn, i); - } - - mfn_to_page(sl1mfn)->tlbflush_timestamp = - SHADOW_ENCODE_MIN_MAX(min, max); - } -} - -void shadow_invlpg(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - l1_pgentry_t gpte, spte; - - ASSERT(shadow_mode_enabled(d)); - - shadow_lock(d); - - __shadow_sync_va(v, va); - - // XXX mafetter: will need to think about 4MB pages... - - // It's not strictly necessary to update the shadow here, - // but it might save a fault later. 
- // - if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT], - sizeof(gpte))) { - perfc_incrc(shadow_invlpg_faults); - shadow_unlock(d); - return; - } - l1pte_propagate_from_guest(d, gpte, &spte); - shadow_set_l1e(va, spte, 1); - - shadow_unlock(d); -} - -struct out_of_sync_entry * -shadow_alloc_oos_entry(struct domain *d) -{ - struct out_of_sync_entry *f, *extra; - unsigned size, i; - - if ( unlikely(d->arch.out_of_sync_free == NULL) ) - { - FSH_LOG("Allocate more fullshadow tuple blocks."); - - size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f)); - extra = xmalloc_bytes(size); - - /* XXX Should be more graceful here. */ - if ( extra == NULL ) - BUG(); - - memset(extra, 0, size); - - /* Record the allocation block so it can be correctly freed later. */ - d->arch.out_of_sync_extras_count++; - *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = - d->arch.out_of_sync_extras; - d->arch.out_of_sync_extras = &extra[0]; - - /* Thread a free chain through the newly-allocated nodes. */ - for ( i = 0; i < (out_of_sync_extra_size - 1); i++ ) - extra[i].next = &extra[i+1]; - extra[i].next = NULL; - - /* Add the new nodes to the free list. */ - d->arch.out_of_sync_free = &extra[0]; - } - - /* Allocate a new node from the quicklist. */ - f = d->arch.out_of_sync_free; - d->arch.out_of_sync_free = f->next; - - return f; -} - -static inline unsigned long -shadow_make_snapshot( - struct domain *d, unsigned long gpfn, unsigned long gmfn) -{ - unsigned long smfn, sl1mfn = 0; - void *original, *snapshot; - u32 min_max = 0; - int min, max, length; - - if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) ) - { - ASSERT(__shadow_status(d, gpfn, PGT_snapshot)); - return SHADOW_SNAPSHOT_ELSEWHERE; - } - - perfc_incrc(shadow_make_snapshot); - - if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) ) - { - printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n" - "Dom%d snapshot_count_count=%d\n", - gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count); - BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */ - } - - if ( !get_shadow_ref(smfn) ) - BUG(); - - if ( shadow_mode_refcounts(d) && - (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) ) - min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp; - mfn_to_page(smfn)->tlbflush_timestamp = min_max; - - min = SHADOW_MIN(min_max); - max = SHADOW_MAX(min_max); - length = max - min + 1; - perfc_incr_histo(snapshot_copies, length, PT_UPDATES); - - min *= sizeof(l1_pgentry_t); - length *= sizeof(l1_pgentry_t); - - original = map_domain_page(gmfn); - snapshot = map_domain_page(smfn); - memcpy(snapshot + min, original + min, length); - unmap_domain_page(original); - unmap_domain_page(snapshot); - - return smfn; -} - -static void -shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry) -{ - void *snapshot; - - if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) - return; - - // Clear the out_of_sync bit. - // - clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info); - - // XXX Need to think about how to protect the domain's - // information less expensively. 
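/*
 * Illustrative aside, not part of the patch: shadow_alloc_oos_entry() above
 * grows the out-of-sync pool a block at a time -- each allocation holds N
 * entries plus one trailing pointer that chains the blocks together for
 * later teardown, and the N entries are threaded onto a free list.  A
 * compact standalone model of that allocator (names and N are hypothetical):
 */
#include <stdlib.h>
#include <string.h>

#define BLOCK_ENTRIES 8

struct oos  { struct oos *next; /* plus payload fields */ };
struct pool { struct oos *free_list; void *blocks; };

struct oos *pool_alloc(struct pool *p)
{
    struct oos *f;

    if ( p->free_list == NULL )
    {
        /* One allocation: N entries followed by a block-chain pointer. */
        size_t size = BLOCK_ENTRIES * sizeof(struct oos) + sizeof(void *);
        struct oos *extra = malloc(size);
        unsigned int i;

        if ( extra == NULL )
            return NULL;
        memset(extra, 0, size);

        *(void **)&extra[BLOCK_ENTRIES] = p->blocks;   /* record the block */
        p->blocks = extra;

        for ( i = 0; i < BLOCK_ENTRIES - 1; i++ )      /* thread free chain */
            extra[i].next = &extra[i + 1];
        extra[BLOCK_ENTRIES - 1].next = NULL;
        p->free_list = &extra[0];
    }

    f = p->free_list;             /* pop one node from the quicklist */
    p->free_list = f->next;
    return f;
}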
- // - snapshot = map_domain_page(entry->snapshot_mfn); - memset(snapshot, 0, PAGE_SIZE); - unmap_domain_page(snapshot); - - put_shadow_ref(entry->snapshot_mfn); -} - -struct out_of_sync_entry * -__shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn, - unsigned long mfn) -{ - struct domain *d = v->domain; - struct page_info *page = mfn_to_page(mfn); - struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d); - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(mfn_valid(mfn)); - -#ifndef NDEBUG - { - u32 type = page->u.inuse.type_info & PGT_type_mask; - if ( shadow_mode_refcounts(d) ) - { - ASSERT(type == PGT_writable_page); - } - else - { - ASSERT(type && (type < PGT_l4_page_table)); - } - } -#endif - - FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08lx", __func__, - gpfn, mfn, page->count_info, page->u.inuse.type_info); - - // XXX this will require some more thought... Cross-domain sharing and - // modification of page tables? Hmm... - // - if ( d != page_get_owner(page) ) - BUG(); - - perfc_incrc(shadow_mark_mfn_out_of_sync_calls); - - entry->v = v; - entry->gpfn = gpfn; - entry->gmfn = mfn; - entry->writable_pl1e = -1; - -#if SHADOW_DEBUG - mark_shadows_as_reflecting_snapshot(d, gpfn); -#endif - - // increment guest's ref count to represent the entry in the - // full shadow out-of-sync list. - // - get_page(page, d); - - return entry; -} - -struct out_of_sync_entry * -shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn, - unsigned long mfn) -{ - struct out_of_sync_entry *entry = - __shadow_mark_mfn_out_of_sync(v, gpfn, mfn); - struct domain *d = v->domain; - - entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn); - // Add to the out-of-sync list - // - entry->next = d->arch.out_of_sync; - d->arch.out_of_sync = entry; - - return entry; -} - -void shadow_mark_va_out_of_sync( - struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va) -{ - struct out_of_sync_entry *entry = - __shadow_mark_mfn_out_of_sync(v, gpfn, mfn); - l2_pgentry_t sl2e; - struct domain *d = v->domain; - - // We need the address of shadow PTE that maps @va. - // It might not exist yet. Make sure it's there. - // - __shadow_get_l2e(v, va, &sl2e); - if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) - { - // either this L1 isn't shadowed yet, or the shadow isn't linked into - // the current L2. - shadow_map_l1_into_current_l2(va); - __shadow_get_l2e(v, va, &sl2e); - } - ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT); - - entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn); - // NB: this is stored as a machine address. - entry->writable_pl1e = - l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va)); - ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ); - entry->va = va; - - // Increment shadow's page count to represent the reference - // inherent in entry->writable_pl1e - // - if ( !get_shadow_ref(l2e_get_pfn(sl2e)) ) - BUG(); - - // Add to the out-of-sync list - // - entry->next = d->arch.out_of_sync; - d->arch.out_of_sync = entry; - - FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)", - __func__, va, entry->writable_pl1e); -} - -/* - * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches. - * Returns 0 otherwise. - */ -static int snapshot_entry_matches( - struct domain *d, l1_pgentry_t *guest_pt, - unsigned long gpfn, unsigned index) -{ - unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot); - l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ... 
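/*
 * Illustrative aside, not part of the patch: the snapshot machinery here
 * (shadow_make_snapshot(), snapshot_entry_matches(), and later resync_all())
 * decides whether a guest page-table entry really changed by comparing it
 * against the snapshot copy, looking only at the frame bits and a chosen set
 * of flag bits.  A standalone analogue of that comparison (the masks below
 * are placeholders, not the real PAGE_FLAG_MASK):
 */
#include <stdint.h>

#define DEMO_FRAME_MASK  0xfffff000u   /* bits that name the frame */
#define DEMO_FLAG_MASK   0x00000fffu   /* flag bits that may be compared */

/* Returns nonzero if the entry differs from its snapshot in the frame or in
 * any of the flags selected by flag_mask. */
int entry_has_changed(uint32_t entry, uint32_t snapshot, uint32_t flag_mask)
{
    return ((entry ^ snapshot) &
            (DEMO_FRAME_MASK | (flag_mask & DEMO_FLAG_MASK))) != 0;
}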
- int entries_match; - - perfc_incrc(snapshot_entry_matches_calls); - - if ( !smfn ) - return 0; - - snapshot = map_domain_page(smfn); - - if (__copy_from_user(&gpte, &guest_pt[index], - sizeof(gpte))) { - unmap_domain_page(snapshot); - return 0; - } - - // This could probably be smarter, but this is sufficent for - // our current needs. - // - entries_match = !l1e_has_changed(gpte, snapshot[index], - PAGE_FLAG_MASK); - - unmap_domain_page(snapshot); - -#ifdef PERF_COUNTERS - if ( entries_match ) - perfc_incrc(snapshot_entry_matches_true); -#endif - - return entries_match; -} - -/* - * Returns 1 if va's shadow mapping is out-of-sync. - * Returns 0 otherwise. - */ -int __shadow_out_of_sync(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table); - unsigned long l2pfn = mfn_to_gmfn(d, l2mfn); - l2_pgentry_t l2e; - unsigned long l1pfn, l1mfn; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(VALID_M2P(l2pfn)); - - perfc_incrc(shadow_out_of_sync_calls); - - if ( page_out_of_sync(mfn_to_page(l2mfn)) && - !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable, - l2pfn, l2_table_offset(va)) ) - return 1; - - __guest_get_l2e(v, va, &l2e); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - return 0; - - l1pfn = l2e_get_pfn(l2e); - l1mfn = gmfn_to_mfn(d, l1pfn); - - // If the l1 pfn is invalid, it can't be out of sync... - if ( !VALID_MFN(l1mfn) ) - return 0; - - if ( page_out_of_sync(mfn_to_page(l1mfn)) && - !snapshot_entry_matches( - d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)], - l1pfn, l1_table_offset(va)) ) - return 1; - - return 0; -} - -#define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t))) -static inline unsigned long -predict_writable_pte_page(struct domain *d, unsigned long gpfn) -{ - return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred); -} - -static inline void -increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction) -{ - unsigned long score = prediction & PGT_score_mask; - int create = (score == 0); - - // saturating addition - score = (score + (1u << PGT_score_shift)) & PGT_score_mask; - score = score ? score : PGT_score_mask; - - prediction = (prediction & PGT_mfn_mask) | score; - - //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create); - set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0); - - if ( create ) - perfc_incr(writable_pte_predictions); -} - -static inline void -decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction) -{ - unsigned long score = prediction & PGT_score_mask; - ASSERT(score); - - // divide score by 2... We don't like bad predictions. - // - score = (score >> 1) & PGT_score_mask; - - prediction = (prediction & PGT_mfn_mask) | score; - - //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score); - - if ( score ) - set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0); - else - { - delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred, 0); - perfc_decr(writable_pte_predictions); - } -} - -static void -free_writable_pte_predictions(struct domain *d) -{ - int i; - struct shadow_status *x; - - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - u32 count; - unsigned long *gpfn_list; - - /* Skip empty buckets. 
*/ - if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) - continue; - - count = 0; - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) - count++; - - gpfn_list = xmalloc_array(unsigned long, count); - count = 0; - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) - gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask; - - while ( count ) - { - count--; - /* delete_shadow_status() may do a shadow_audit(), so we need to - * keep an accurate count of writable_pte_predictions to keep it - * happy. - */ - delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred, 0); - perfc_decr(writable_pte_predictions); - } - - xfree(gpfn_list); - } -} - -static int fix_entry( - struct domain *d, - l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find) -{ - l1_pgentry_t old = *pt; - l1_pgentry_t new = old; - - l1e_remove_flags(new,_PAGE_RW); - if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) ) - BUG(); - (*found)++; - *pt = new; - if ( is_l1_shadow ) - shadow_put_page_from_l1e(old, d); - - return (*found == max_refs_to_find); -} - -static u32 remove_all_write_access_in_ptpage( - struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn, - unsigned long readonly_gpfn, unsigned long readonly_gmfn, - u32 max_refs_to_find, unsigned long prediction) -{ - l1_pgentry_t *pt = map_domain_page(pt_mfn); - l1_pgentry_t match; - unsigned long flags = _PAGE_RW | _PAGE_PRESENT; - int i; - u32 found = 0; - int is_l1_shadow = - ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) == - PGT_l1_shadow); - - match = l1e_from_pfn(readonly_gmfn, flags); - - if ( shadow_mode_external(d) ) { - i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask) - >> PGT_va_shift; - - if ( (i >= 0 && i < L1_PAGETABLE_ENTRIES) && - !l1e_has_changed(pt[i], match, flags) && - fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) && - !prediction ) - goto out; - } - - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) - { - if ( unlikely(!l1e_has_changed(pt[i], match, flags)) && - fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) ) - break; - } - -out: - unmap_domain_page(pt); - - return found; -} - -int shadow_remove_all_write_access( - struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn) -{ - int i; - struct shadow_status *a; - u32 found = 0, write_refs; - unsigned long predicted_smfn; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(VALID_MFN(readonly_gmfn)); - - perfc_incrc(remove_write_access); - - // If it's not a writable page, then no writable refs can be outstanding. - // - if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) != - PGT_writable_page ) - { - perfc_incrc(remove_write_not_writable); - return 1; - } - - // How many outstanding writable PTEs for this page are there? 
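/*
 * Illustrative aside, not part of the patch: fix_entry() and
 * remove_all_write_access_in_ptpage() above revoke write access by scanning
 * an L1 shadow for present+writable entries that map the target frame,
 * clearing the RW bit on each, and stopping once the expected number of
 * writable references has been found.  A standalone sketch of that scan
 * (bit layout and names are hypothetical):
 */
#include <stdint.h>

#define DEMO_PRESENT   0x001u
#define DEMO_RW        0x002u
#define DEMO_FRAME(e)  ((e) >> 12)

/* Clear RW on every entry mapping 'target_frame'; stop after 'max' hits.
 * Returns how many entries were downgraded. */
unsigned int revoke_write_access(uint32_t *l1, unsigned int nr_entries,
                                 uint32_t target_frame, unsigned int max)
{
    unsigned int i, found = 0;

    for ( i = 0; i < nr_entries && found < max; i++ )
    {
        if ( (l1[i] & (DEMO_PRESENT | DEMO_RW)) != (DEMO_PRESENT | DEMO_RW) )
            continue;
        if ( DEMO_FRAME(l1[i]) != target_frame )
            continue;

        l1[i] &= ~DEMO_RW;      /* downgrade to read-only */
        found++;
    }

    return found;
}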
- // - write_refs = - (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask); - if ( write_refs && MFN_PINNED(readonly_gmfn) ) - { - write_refs--; - } - - if ( write_refs == 0 ) - { - perfc_incrc(remove_write_no_work); - return 1; - } - - if ( shadow_mode_external(d) ) { - if (--write_refs == 0) - return 0; - - // Use the back pointer to locate the shadow page that can contain - // the PTE of interest - if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) { - found += remove_all_write_access_in_ptpage( - d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0); - if ( found == write_refs ) - return 0; - } - } - - // Search all the shadow L1 page tables... - // - for (i = 0; i < shadow_ht_buckets; i++) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow ) - { - found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask); - if ( found == write_refs ) - return 0; - } - - a = a->next; - } - } - - FSH_LOG("%s: looking for %d refs, found %d refs", - __func__, write_refs, found); - - return 0; -} - -static u32 remove_all_access_in_page( - struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn) -{ - l1_pgentry_t *pl1e = map_domain_page(l1mfn); - l1_pgentry_t match, ol2e; - unsigned long flags = _PAGE_PRESENT; - int i; - u32 count = 0; - int is_l1_shadow = - ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) == - PGT_l1_shadow); - - match = l1e_from_pfn(forbidden_gmfn, flags); - - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) - { - if ( l1e_has_changed(pl1e[i], match, flags) ) - continue; - - ol2e = pl1e[i]; - pl1e[i] = l1e_empty(); - count++; - - if ( is_l1_shadow ) - shadow_put_page_from_l1e(ol2e, d); - else /* must be an hl2 page */ - put_page(mfn_to_page(forbidden_gmfn)); - } - - unmap_domain_page(pl1e); - - return count; -} - -u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn) -{ - int i; - struct shadow_status *a; - u32 count = 0; - - if ( unlikely(!shadow_mode_enabled(d)) ) - return 0; - - ASSERT(shadow_lock_is_acquired(d)); - perfc_incrc(remove_all_access); - - for (i = 0; i < shadow_ht_buckets; i++) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - switch (a->gpfn_and_flags & PGT_type_mask) - { - case PGT_l1_shadow: - case PGT_l2_shadow: - case PGT_l3_shadow: - case PGT_l4_shadow: - case PGT_hl2_shadow: - count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn); - break; - case PGT_snapshot: - case PGT_writable_pred: - // these can't hold refs to the forbidden page - break; - default: - BUG(); - } - - a = a->next; - } - } - - return count; -} - -static int resync_all(struct domain *d, u32 stype) -{ - struct out_of_sync_entry *entry; - unsigned i; - unsigned long smfn; - void *guest, *shadow, *snapshot; - int need_flush = 0, external = shadow_mode_external(d); - int unshadow; - int changed; - u32 min_max_shadow, min_max_snapshot; - int min_shadow, max_shadow, min_snapshot, max_snapshot; - struct vcpu *v; - - ASSERT(shadow_lock_is_acquired(d)); - - for ( entry = d->arch.out_of_sync; entry; entry = entry->next) - { - if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) - continue; - - smfn = __shadow_status(d, entry->gpfn, stype); - - if ( !smfn ) - { - // For heavy weight shadows: no need to update refcounts if - // there's no shadow page. 
- // - if ( shadow_mode_refcounts(d) ) - continue; - - // For light weight shadows: only need up resync the refcounts to - // the new contents of the guest page iff this it has the right - // page type. - // - if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) ) - continue; - } - - FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx", - stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn); - - // Compare guest's new contents to its snapshot, validating - // and updating its shadow as appropriate. - // - guest = map_domain_page(entry->gmfn); - snapshot = map_domain_page(entry->snapshot_mfn); - - if ( smfn ) - shadow = map_domain_page(smfn); - else - shadow = NULL; - - unshadow = 0; - - switch ( stype ) { - case PGT_l1_shadow: - { - l1_pgentry_t *guest1 = guest; - l1_pgentry_t *shadow1 = shadow; - l1_pgentry_t *snapshot1 = snapshot; - int unshadow_l1 = 0; - - ASSERT(shadow_mode_write_l1(d) || - shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d)); - - if ( !shadow_mode_refcounts(d) ) - revalidate_l1(d, guest1, snapshot1); - - if ( !smfn ) - break; - - min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp; - min_shadow = SHADOW_MIN(min_max_shadow); - max_shadow = SHADOW_MAX(min_max_shadow); - - min_max_snapshot = - mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp; - min_snapshot = SHADOW_MIN(min_max_snapshot); - max_snapshot = SHADOW_MAX(min_max_snapshot); - - changed = 0; - - for ( i = min_shadow; i <= max_shadow; i++ ) - { - if ( (i < min_snapshot) || (i > max_snapshot) || - l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) ) - { - int error; - - error = validate_pte_change(d, guest1[i], &shadow1[i]); - if ( error == -1 ) - unshadow_l1 = 1; - else { - need_flush |= error; - if ( l1e_get_flags(shadow1[i]) & _PAGE_PRESENT ) - set_guest_back_ptr(d, shadow1[i], smfn, i); - } - - // can't update snapshots of linear page tables -- they - // are used multiple times... - // - // snapshot[i] = new_pte; - changed++; - } - } - perfc_incrc(resync_l1); - perfc_incr_histo(wpt_updates, changed, PT_UPDATES); - perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES); - if (unshadow_l1) { - l2_pgentry_t l2e; - - __shadow_get_l2e(entry->v, entry->va, &l2e); - if (l2e_get_flags(l2e) & _PAGE_PRESENT) { - put_shadow_ref(l2e_get_pfn(l2e)); - l2e = l2e_empty(); - __shadow_set_l2e(entry->v, entry->va, l2e); - - if (entry->v == current) - need_flush = 1; - } - } - - break; - } - case PGT_l2_shadow: - { - int max = -1; - - l2_pgentry_t *guest2 = guest; - l2_pgentry_t *shadow2 = shadow; - l2_pgentry_t *snapshot2 = snapshot; - - ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d)); - BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented - - changed = 0; - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - { - l2_pgentry_t new_pde = guest2[i]; - - if ( !is_guest_l2_slot(0,i) && !external ) - continue; - - if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK)) - { - need_flush |= validate_pde_change(d, new_pde, &shadow2[i]); - - // can't update snapshots of linear page tables -- they - // are used multiple times... - // - // snapshot[i] = new_pde; - - changed++; - } - if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */ - max = i; - - // XXX - This hack works for linux guests. - // Need a better solution long term. 
- if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) && - unlikely(l2e_get_intpte(new_pde) != 0) && - !unshadow && MFN_PINNED(smfn) ) - unshadow = 1; - } - if ( max == -1 ) - unshadow = 1; - perfc_incrc(resync_l2); - perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES); - break; - } - case PGT_hl2_shadow: - { - l2_pgentry_t *guest2 = guest; - l2_pgentry_t *snapshot2 = snapshot; - l1_pgentry_t *shadow2 = shadow; - - ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d)); - BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented - - changed = 0; - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - { - l2_pgentry_t new_pde = guest2[i]; - - if ( !is_guest_l2_slot(0, i) && !external ) - continue; - - if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) ) - { - need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]); - - // can't update snapshots of linear page tables -- they - // are used multiple times... - // - // snapshot[i] = new_pde; - - changed++; - } - } - perfc_incrc(resync_hl2); - perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES); - break; - } - default: - BUG(); - } - - if ( smfn ) - unmap_domain_page(shadow); - unmap_domain_page(snapshot); - unmap_domain_page(guest); - - if ( unlikely(unshadow) ) - { - for_each_vcpu(d, v) - if(smfn == pagetable_get_pfn(v->arch.shadow_table)) - return need_flush; - perfc_incrc(unshadow_l2_count); - shadow_unpin(smfn); - if ( unlikely(shadow_mode_external(d)) ) - { - unsigned long hl2mfn; - - if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) && - MFN_PINNED(hl2mfn) ) - shadow_unpin(hl2mfn); - } - } - } - - return need_flush; -} - -void __shadow_sync_all(struct domain *d) -{ - struct out_of_sync_entry *entry; - int need_flush = 0; - l1_pgentry_t *ppte, opte, npte; - cpumask_t other_vcpus_mask; - - perfc_incrc(shadow_sync_all); - - ASSERT(shadow_lock_is_acquired(d)); - - // First, remove all write permissions to the page tables - // - for ( entry = d->arch.out_of_sync; entry; entry = entry->next) - { - // Skip entries that have low bits set... Those aren't - // real PTEs. - // - if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) ) - continue; - - ppte = (l1_pgentry_t *)( - (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) + - (entry->writable_pl1e & ~PAGE_MASK)); - opte = npte = *ppte; - l1e_remove_flags(npte, _PAGE_RW); - - if ( (l1e_get_flags(npte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(npte, d) ) - BUG(); - *ppte = npte; - set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT, - (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t)); - shadow_put_page_from_l1e(opte, d); - - unmap_domain_page(ppte); - } - - /* Other VCPUs mustn't use the revoked writable mappings. */ - other_vcpus_mask = d->domain_dirty_cpumask; - cpu_clear(smp_processor_id(), other_vcpus_mask); - flush_tlb_mask(other_vcpus_mask); - - /* Flush ourself later. */ - need_flush = 1; - - /* Second, resync all L1 pages, then L2 pages, etc... 
*/ - need_flush |= resync_all(d, PGT_l1_shadow); - if ( shadow_mode_translate(d) ) - need_flush |= resync_all(d, PGT_hl2_shadow); - need_flush |= resync_all(d, PGT_l2_shadow); - - if ( need_flush && !unlikely(shadow_mode_external(d)) ) - local_flush_tlb(); - - free_out_of_sync_state(d); -} - -int shadow_fault(unsigned long va, struct cpu_user_regs *regs) -{ - l1_pgentry_t gpte, spte, orig_gpte; - struct vcpu *v = current; - struct domain *d = v->domain; - l2_pgentry_t gpde; - - spte = l1e_empty(); - - SH_VVLOG("shadow_fault( va=%lx, code=%lu )", - va, (unsigned long)regs->error_code); - perfc_incrc(shadow_fault_calls); - - check_pagetable(v, "pre-sf"); - - /* - * Don't let someone else take the guest's table pages out-of-sync. - */ - shadow_lock(d); - - /* XXX - FIX THIS COMMENT!!! - * STEP 1. Check to see if this fault might have been caused by an - * out-of-sync table page entry, or if we should pass this - * fault onto the guest. - */ - __shadow_sync_va(v, va); - - /* - * STEP 2. Check the guest PTE. - */ - __guest_get_l2e(v, va, &gpde); - if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) ) - { - SH_VVLOG("shadow_fault - EXIT: L2 not present (%x)", - l2e_get_intpte(gpde)); - perfc_incrc(shadow_fault_bail_pde_not_present); - goto fail; - } - - // This can't fault because we hold the shadow lock and we've ensured that - // the mapping is in-sync, so the check of the PDE's present bit, above, - // covers this access. - // - if ( __copy_from_user(&gpte, - &linear_pg_table[l1_linear_offset(va)], - sizeof(gpte)) ) { - printk("%s() failed, crashing domain %d " - "due to a unaccessible linear page table (gpde=%" PRIpte "), va=%lx\n", - __func__, d->domain_id, l2e_get_intpte(gpde), va); - domain_crash_synchronous(); - } - orig_gpte = gpte; - - if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) ) - { - SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ") (gpde %" PRIpte ")", - l1e_get_intpte(gpte), - l2e_get_intpte(gpde)); - perfc_incrc(shadow_fault_bail_pte_not_present); - goto fail; - } - - /* Write fault? */ - if ( regs->error_code & 2 ) - { - int allow_writes = 0; - - if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) ) - { - if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) ) - { - allow_writes = 1; - l1e_add_flags(gpte, _PAGE_RW); - } - else - { - /* Write fault on a read-only mapping. */ - SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")", - l1e_get_intpte(gpte)); - perfc_incrc(shadow_fault_bail_ro_mapping); - goto fail; - } - } - else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) ) - { - SH_LOG("l1pte_write_fault: no write access to page table page"); - domain_crash_synchronous(); - } - - /* User access violation in guest? */ - if ( unlikely((regs->error_code & 4) && - !(l1e_get_flags(gpte) & _PAGE_USER))) - { - SH_VVLOG("shadow_fault - EXIT: wr fault on super page (%" PRIpte ")", - l1e_get_intpte(gpte)); - goto fail; - - } - - if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) ) - { - SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed"); - perfc_incrc(write_fault_bail); - shadow_unlock(d); - return 0; - } - - if ( allow_writes ) - l1e_remove_flags(gpte, _PAGE_RW); - } - else - { - /* Read-protection violation in guest? 
*/ - if ( unlikely((regs->error_code & 1) )) - { - SH_VVLOG("shadow_fault - EXIT: read fault on super page (%" PRIpte ")", - l1e_get_intpte(gpte)); - goto fail; - - } - - - if ( !l1pte_read_fault(d, &gpte, &spte) ) - { - SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed"); - perfc_incrc(read_fault_bail); - shadow_unlock(d); - return 0; - } - } - - /* - * STEP 3. Write the modified shadow PTE and guest PTE back to the tables. - */ - if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) ) - { - /* XXX Watch out for read-only L2 entries! (not used in Linux). */ - if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)], - &gpte, sizeof(gpte))) ) - { - printk("%s() failed, crashing domain %d " - "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n", - __func__,d->domain_id, l2e_get_intpte(gpde), va); - domain_crash_synchronous(); - } - - __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde))); - } - - shadow_set_l1e(va, spte, 1); - - perfc_incrc(shadow_fault_fixed); - d->arch.shadow_fault_count++; - - shadow_unlock(d); - - check_pagetable(v, "post-sf"); - return EXCRET_fault_fixed; - - fail: - shadow_unlock(d); - return 0; -} - -void shadow_l1_normal_pt_update( - struct domain *d, - unsigned long pa, l1_pgentry_t gpte, - struct domain_mmap_cache *cache) -{ - unsigned long sl1mfn; - l1_pgentry_t *spl1e, spte; - - shadow_lock(d); - - sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow); - if ( sl1mfn ) - { - SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte, - (void *)pa, l1e_get_intpte(gpte)); - l1pte_propagate_from_guest(current->domain, gpte, &spte); - - spl1e = map_domain_page_with_cache(sl1mfn, cache); - spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte; - unmap_domain_page_with_cache(spl1e, cache); - } - - shadow_unlock(d); -} - -void shadow_l2_normal_pt_update( - struct domain *d, - unsigned long pa, l2_pgentry_t gpde, - struct domain_mmap_cache *cache) -{ - unsigned long sl2mfn, hl2mfn; - l2_pgentry_t *spl2e; - l1_pgentry_t *hl2e; - - shadow_lock(d); - - sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow); - if ( sl2mfn ) - { - SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte, - (void *)pa, l2e_get_intpte(gpde)); - spl2e = map_domain_page_with_cache(sl2mfn, cache); - validate_pde_change(d, gpde, - &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]); - unmap_domain_page_with_cache(spl2e, cache); - } - hl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, - PGT_hl2_shadow); - if ( hl2mfn ) - { - hl2e = map_domain_page(hl2mfn); - validate_hl2e_change(d, gpde, - &hl2e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)]); - unmap_domain_page(hl2e); - } - - shadow_unlock(d); -} - -#if CONFIG_PAGING_LEVELS >= 3 -void shadow_l3_normal_pt_update( - struct domain *d, - unsigned long pa, l3_pgentry_t gpde, - struct domain_mmap_cache *cache) -{ - BUG(); // not yet implemented -} -#endif - -#if CONFIG_PAGING_LEVELS >= 4 -void shadow_l4_normal_pt_update( - struct domain *d, - unsigned long pa, l4_pgentry_t gpde, - struct domain_mmap_cache *cache) -{ - BUG(); // not yet implemented -} -#endif - -int shadow_do_update_va_mapping(unsigned long va, - l1_pgentry_t val, - struct vcpu *v) -{ - struct domain *d = v->domain; - l1_pgentry_t spte; - int rc = 0; - - shadow_lock(d); - - // This is actually overkill - we don't need to sync the L1 itself, - // just everything involved in getting to this L1 (i.e. we need - // linear_pg_table[l1_linear_offset(va)] to be in sync)... 
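
For reference, the write/user/protection tests that shadow_fault() applies to regs->error_code above correspond to the standard x86 page-fault error-code bits. A minimal decoding sketch follows; the PFEC_* names and the pf_info struct are invented here, not taken from the Xen headers.

    #include <stdbool.h>
    #include <stdint.h>

    #define PFEC_PRESENT (1u << 0)  /* 1: protection violation, 0: not-present */
    #define PFEC_WRITE   (1u << 1)  /* 1: faulting access was a write          */
    #define PFEC_USER    (1u << 2)  /* 1: fault raised while in user mode      */

    struct pf_info {
        bool protection;
        bool write;
        bool user;
    };

    /* Decode the three bits the shadow fault path above cares about. */
    static struct pf_info decode_pf_error_code(uint32_t error_code)
    {
        struct pf_info info = {
            .protection = (error_code & PFEC_PRESENT) != 0,
            .write      = (error_code & PFEC_WRITE)   != 0,
            .user       = (error_code & PFEC_USER)    != 0,
        };
        return info;
    }
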
- // - __shadow_sync_va(v, va); - - l1pte_propagate_from_guest(d, val, &spte); - shadow_set_l1e(va, spte, 0); - - /* - * If we're in log-dirty mode then we need to note that we've updated - * the PTE in the PT-holding page. We need the machine frame number - * for this. - */ - __mark_dirty(d, va_to_l1mfn(v, va)); - - shadow_unlock(d); - - return rc; -} - - -/* - * What lives where in the 32-bit address space in the various shadow modes, - * and what it uses to get/maintain that mapping. - * - * SHADOW MODE: none enable translate external - * - * 4KB things: - * guest_vtable lin_l2 mapped per gl2 lin_l2 via hl2 mapped per gl2 - * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gl2 - * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gl2 - * monitor_vtable n/a n/a n/a mapped once - * - * 4MB things: - * guest_linear lin via gl2 lin via gl2 lin via hl2 lin via hl2 - * shadow_linear n/a sh_lin via sl2 sh_lin via sl2 sh_lin via sl2 - * monitor_linear n/a n/a n/a ??? - * perdomain perdomain perdomain perdomain perdomain - * R/O M2P R/O M2P R/O M2P n/a n/a - * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P - * P2M n/a n/a R/O M2P R/O M2P - * - * NB: - * update_pagetables(), __update_pagetables(), shadow_mode_enable(), - * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable() - * all play a part in maintaining these mappings. - */ -void __update_pagetables(struct vcpu *v) -{ - struct domain *d = v->domain; - unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table); - unsigned long gpfn = mfn_to_gmfn(d, gmfn); - unsigned long smfn, hl2mfn, old_smfn; - int need_sync = 0; - - int max_mode = ( shadow_mode_external(d) ? SHM_external - : shadow_mode_translate(d) ? SHM_translate - : shadow_mode_enabled(d) ? SHM_enable - : 0 ); - - ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); - ASSERT( max_mode ); - - /* - * arch.guest_vtable - */ - if ( max_mode & (SHM_enable | SHM_external) ) - { - if ( likely(v->arch.guest_vtable != NULL) ) - unmap_domain_page_global(v->arch.guest_vtable); - v->arch.guest_vtable = map_domain_page_global(gmfn); - } - - /* - * arch.shadow_table - */ - if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) ) - smfn = shadow_l2_table(d, gpfn, gmfn); - else - { - /* - * move sync later in order to avoid this smfn been - * unshadowed occasionally - */ - need_sync = 1; - } - if ( !get_shadow_ref(smfn) ) - BUG(); - old_smfn = pagetable_get_pfn(v->arch.shadow_table); - v->arch.shadow_table = pagetable_from_pfn(smfn); - if ( old_smfn ) - put_shadow_ref(old_smfn); - - SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn); - - /* - * arch.shadow_vtable - */ - if ( max_mode == SHM_external ) - { - if ( v->arch.shadow_vtable ) - unmap_domain_page_global(v->arch.shadow_vtable); - v->arch.shadow_vtable = map_domain_page_global(smfn); - } - - /* - * arch.hl2_vtable - */ - - // if max_mode == SHM_translate, then the hl2 is already installed - // correctly in its smfn, and there's nothing to do. 
- // - if ( max_mode == SHM_external ) - { - if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) - hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); - if ( v->arch.hl2_vtable ) - unmap_domain_page_global(v->arch.hl2_vtable); - v->arch.hl2_vtable = map_domain_page_global(hl2mfn); - } - - /* - * fixup pointers in monitor table, as necessary - */ - if ( max_mode == SHM_external ) - { - l2_pgentry_t *mpl2e = v->arch.monitor_vtable; - l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; - l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; - - ASSERT( shadow_mode_translate(d) ); - - if ( !get_shadow_ref(hl2mfn) ) - BUG(); - mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = - l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); - if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT ) - put_shadow_ref(l2e_get_pfn(old_hl2e)); - - if ( !get_shadow_ref(smfn) ) - BUG(); - mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) - put_shadow_ref(l2e_get_pfn(old_sl2e)); - - // XXX - maybe this can be optimized somewhat?? - local_flush_tlb(); - } - - if(likely(need_sync)) - shadow_sync_all(d); -} - -void clear_all_shadow_status(struct domain *d) -{ - struct vcpu *v = current; - - /* - * Don't clean up while other vcpus are working. - */ - if ( v->vcpu_id ) - return; - - shadow_lock(d); - - free_shadow_pages(d); - free_shadow_ht_entries(d); - d->arch.shadow_ht = - xmalloc_array(struct shadow_status, shadow_ht_buckets); - if ( d->arch.shadow_ht == NULL ) { - printk("clear all shadow status: xmalloc failed\n"); - domain_crash_synchronous(); - } - memset(d->arch.shadow_ht, 0, - shadow_ht_buckets * sizeof(struct shadow_status)); - - free_out_of_sync_entries(d); - - shadow_unlock(d); -} - -/************************************************************************/ -/************************************************************************/ -/************************************************************************/ - -#if SHADOW_DEBUG - -// The following is entirely for _check_pagetable()'s benefit. -// _check_pagetable() wants to know whether a given entry in a -// shadow page table is supposed to be the shadow of the guest's -// current entry, or the shadow of the entry held in the snapshot -// taken above. -// -// Here, we mark all currently existing entries as reflecting -// the snapshot, above. All other places in xen that update -// the shadow will keep the shadow in sync with the guest's -// entries (via l1pte_propagate_from_guest and friends), which clear -// the SHADOW_REFLECTS_SNAPSHOT bit. -// -static void -mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn) -{ - unsigned long smfn; - l1_pgentry_t *l1e; - l2_pgentry_t *l2e; - unsigned i; - - if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) ) - { - l1e = map_domain_page(smfn); - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - if ( is_guest_l1_slot(i) && - (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) ) - l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT); - unmap_domain_page(l1e); - } - - if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) ) - { - l2e = map_domain_page(smfn); - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - if ( is_guest_l2_slot(0, i) && - (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) ) - l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT); - unmap_domain_page(l2e); - } -} - -// BUG: these are not SMP safe... 
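
The debug helper above walks each present L1/L2 shadow entry and ORs in SHADOW_REFLECTS_SNAPSHOT, a software-only marker that the normal update paths (l1pte_propagate_from_guest and friends) clear again. The pattern is simple enough to sketch in isolation; the sketch below uses a made-up PTE_SW_MARK flag placed in one of the x86 software-available bits (9-11) rather than the real SHADOW_REFLECTS_SNAPSHOT value.

    #include <stddef.h>
    #include <stdint.h>

    #define PTE_PRESENT (1ULL << 0)
    #define PTE_SW_MARK (1ULL << 9)  /* bits 9-11 are software-available on x86 */

    /* Tag every present entry in a table with a software-only marker bit. */
    static void mark_present_entries(uint64_t *table, size_t nr_entries)
    {
        size_t i;

        for ( i = 0; i < nr_entries; i++ )
            if ( table[i] & PTE_PRESENT )
                table[i] |= PTE_SW_MARK;
    }
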
-static int sh_l2_present; -static int sh_l1_present; -static char *sh_check_name; -int shadow_status_noswap; - -#define v2m(_v, _adr) ({ \ - unsigned long _a = (unsigned long)(_adr); \ - l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \ - unsigned long _pa = -1; \ - if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \ - { \ - l1_pgentry_t _pte; \ - _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \ - if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \ - _pa = l1e_get_paddr(_pte); \ - } \ - _pa | (_a & ~PAGE_MASK); \ -}) - -#define FAIL(_f, _a...) \ - do { \ - printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \ - sh_check_name, level, l2_idx, l1_idx, ## _a, \ - __FILE__, __LINE__); \ - printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \ - " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \ - " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \ - " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \ - l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \ - l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \ - p_guest_pte, p_shadow_pte, p_snapshot_pte, \ - (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \ - (void *)v2m(v, p_snapshot_pte), \ - (l2_idx << L2_PAGETABLE_SHIFT) | \ - (l1_idx << L1_PAGETABLE_SHIFT)); \ - errors++; \ - } while ( 0 ) - -static int check_pte( - struct vcpu *v, - l1_pgentry_t *p_guest_pte, - l1_pgentry_t *p_shadow_pte, - l1_pgentry_t *p_snapshot_pte, - int level, int l2_idx, int l1_idx) -{ - struct domain *d = v->domain; - l1_pgentry_t guest_pte = *p_guest_pte; - l1_pgentry_t shadow_pte = *p_shadow_pte; - l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty(); - l1_pgentry_t eff_guest_pte = l1e_empty(); - unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn; - int errors = 0, guest_writable; - int page_table_page; - - if ( (l1e_get_intpte(shadow_pte) == 0) || - (l1e_get_intpte(shadow_pte) == 0xdeadface) || - (l1e_get_intpte(shadow_pte) == 0x00000E00) ) - return errors; /* always safe */ - - if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) ) - FAIL("Non zero not present shadow_pte"); - - if ( level == 2 ) sh_l2_present++; - if ( level == 1 ) sh_l1_present++; - - if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte ) - eff_guest_pte = snapshot_pte; - else - eff_guest_pte = guest_pte; - - if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) ) - FAIL("Guest not present yet shadow is"); - - mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK); - - if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) ) - FAIL("Corrupt?"); - - if ( (level == 1) && - (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) && - !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) ) - FAIL("Dirty coherence"); - - if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) && - !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) ) - FAIL("Accessed coherence"); - - if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL ) - FAIL("global bit set in shadow"); - - eff_guest_pfn = l1e_get_pfn(eff_guest_pte); - eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn); - shadow_mfn = l1e_get_pfn(shadow_pte); - - if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) ) - FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n", - __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte)); - - page_table_page = mfn_is_page_table(eff_guest_mfn); - - guest_writable = - (l1e_get_flags(eff_guest_pte) & _PAGE_RW) || - (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn)); - - if ( 
(l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable ) - { - printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n", - eff_guest_pfn, eff_guest_mfn, shadow_mfn, - mfn_to_page(eff_guest_mfn)->u.inuse.type_info, - page_table_page); - FAIL("RW coherence"); - } - - if ( (level == 1) && - (l1e_get_flags(shadow_pte) & _PAGE_RW ) && - !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) ) - { - printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n", - eff_guest_pfn, eff_guest_mfn, shadow_mfn, - mfn_to_page(eff_guest_mfn)->u.inuse.type_info, - page_table_page); - FAIL("RW2 coherence"); - } - - if ( eff_guest_mfn == shadow_mfn ) - { - if ( level > 1 ) - FAIL("Linear map ???"); /* XXX this will fail on BSD */ - } - else - { - if ( level < 2 ) - FAIL("Shadow in L1 entry?"); - - if ( level == 2 ) - { - if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn ) - FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn, - __shadow_status(d, eff_guest_pfn, PGT_l1_shadow)); - } - else - BUG(); // XXX -- not handled yet. - } - - return errors; -} -#undef FAIL -#undef v2m - -static int check_l1_table( - struct vcpu *v, unsigned long gpfn, - unsigned long gmfn, unsigned long smfn, unsigned l2_idx) -{ - struct domain *d = v->domain; - int i; - unsigned long snapshot_mfn; - l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL; - int errors = 0; - - if ( page_out_of_sync(mfn_to_page(gmfn)) ) - { - snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot); - ASSERT(snapshot_mfn); - p_snapshot = map_domain_page(snapshot_mfn); - } - - p_guest = map_domain_page(gmfn); - p_shadow = map_domain_page(smfn); - - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - errors += check_pte(v, p_guest+i, p_shadow+i, - p_snapshot ? p_snapshot+i : NULL, - 1, l2_idx, i); - - unmap_domain_page(p_shadow); - unmap_domain_page(p_guest); - if ( p_snapshot ) - unmap_domain_page(p_snapshot); - - return errors; -} - -#define FAILPT(_f, _a...) 
\ - do { \ - printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \ - errors++; \ - } while ( 0 ) - -int check_l2_table( - struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes) -{ - struct domain *d = v->domain; - l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn); - l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn); - l2_pgentry_t match; - int i; - int errors = 0; - int limit; - - if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) ) - FAILPT("domain doesn't own page"); - if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) ) - FAILPT("bogus owner for snapshot page"); - if ( page_get_owner(mfn_to_page(smfn)) != NULL ) - FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d", - smfn, page_get_owner(mfn_to_page(smfn))->domain_id); - -#if 0 - if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - - DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) ) - { - for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT); - i++ ) - printk("+++ (%d) %lx %lx\n",i, - l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i])); - FAILPT("hypervisor entries inconsistent"); - } - - if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != - l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) ) - FAILPT("hypervisor linear map inconsistent"); -#endif - - match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - if ( !shadow_mode_external(d) && - l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT], - match, PAGE_FLAG_MASK)) - { - FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte, - l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >> - L2_PAGETABLE_SHIFT]), - l2e_get_intpte(match)); - } - - match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR); - if ( !shadow_mode_external(d) && - l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT], - match, PAGE_FLAG_MASK)) - { - FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte, - l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]), - d->arch.mm_perdomain_pt, - l2e_get_intpte(match)); - } - - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - - /* Check the whole L2. */ - for ( i = 0; i < limit; i++ ) - errors += check_pte(v, - (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... 
*/ - (l1_pgentry_t*)(&spl2e[i]), - NULL, - 2, i, 0); - - unmap_domain_page(spl2e); - unmap_domain_page(gpl2e); - -#if 1 - if ( errors ) - printk("check_l2_table returning %d errors\n", errors); -#endif - - return errors; -} -#undef FAILPT - -int _check_pagetable(struct vcpu *v, char *s) -{ - struct domain *d = v->domain; - pagetable_t pt = v->arch.guest_table; - unsigned long gptbase = pagetable_get_paddr(pt); - unsigned long ptbase_pfn, smfn; - unsigned long i; - l2_pgentry_t *gpl2e, *spl2e; - unsigned long ptbase_mfn = 0; - int errors = 0, limit, oos_pdes = 0; - - //_audit_domain(d, AUDIT_QUIET); - shadow_lock(d); - - sh_check_name = s; - //SH_VVLOG("%s-PT Audit", s); - sh_l2_present = sh_l1_present = 0; - perfc_incrc(check_pagetable); - - ptbase_mfn = gptbase >> PAGE_SHIFT; - ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn); - - if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) ) - { - printk("%s-PT %lx not shadowed\n", s, gptbase); - goto out; - } - if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) ) - { - ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot); - oos_pdes = 1; - ASSERT(ptbase_mfn); - } - - errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes); - - gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn); - spl2e = (l2_pgentry_t *) map_domain_page(smfn); - - /* Go back and recurse. */ - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - - for ( i = 0; i < limit; i++ ) - { - unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]); - unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn); - unsigned long sl1mfn = l2e_get_pfn(spl2e[i]); - - if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */ - { - errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i); - } - } - - unmap_domain_page(spl2e); - unmap_domain_page(gpl2e); - - out: - if ( errors ) - BUG(); - - shadow_unlock(d); - - return errors; -} - -int _check_all_pagetables(struct vcpu *v, char *s) -{ - struct domain *d = v->domain; - int i; - struct shadow_status *a; - unsigned long gmfn; - int errors = 0; - - shadow_status_noswap = 1; - - sh_check_name = s; - SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id); - sh_l2_present = sh_l1_present = 0; - perfc_incrc(check_all_pagetables); - - for (i = 0; i < shadow_ht_buckets; i++) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask); - - switch ( a->gpfn_and_flags & PGT_type_mask ) - { - case PGT_l1_shadow: - errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask, - gmfn, a->smfn, 0); - break; - case PGT_l2_shadow: - errors += check_l2_table(v, gmfn, a->smfn, - page_out_of_sync(mfn_to_page(gmfn))); - break; - case PGT_l3_shadow: - case PGT_l4_shadow: - case PGT_hl2_shadow: - BUG(); // XXX - ought to fix this... 
- break; - case PGT_snapshot: - case PGT_writable_pred: - break; - default: - errors++; - printk("unexpected shadow type %lx, gpfn=%lx, " - "gmfn=%lx smfn=%lx\n", - a->gpfn_and_flags & PGT_type_mask, - a->gpfn_and_flags & PGT_mfn_mask, - gmfn, a->smfn); - BUG(); - } - a = a->next; - } - } - - shadow_status_noswap = 0; - - if ( errors ) - BUG(); - - return errors; -} - -#endif // SHADOW_DEBUG - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/arch/x86/shadow_guest32.c b/xen/arch/x86/shadow_guest32.c deleted file mode 100644 index bdc58257cd..0000000000 --- a/xen/arch/x86/shadow_guest32.c +++ /dev/null @@ -1,16 +0,0 @@ -#define GUEST_PGENTRY_32 - -#include "shadow.c" -struct shadow_ops MODE_64_2_HANDLER = { - .guest_paging_levels = 2, - .invlpg = shadow_invlpg_64, - .fault = shadow_fault_64, - .update_pagetables = shadow_update_pagetables, - .sync_all = sync_all, - .remove_all_write_access = remove_all_write_access, - .do_update_va_mapping = do_update_va_mapping, - .mark_mfn_out_of_sync = mark_mfn_out_of_sync, - .is_out_of_sync = is_out_of_sync, - .gva_to_gpa = gva_to_gpa_64, -}; - diff --git a/xen/arch/x86/shadow_guest32pae.c b/xen/arch/x86/shadow_guest32pae.c deleted file mode 100644 index 432c9b9cb4..0000000000 --- a/xen/arch/x86/shadow_guest32pae.c +++ /dev/null @@ -1,16 +0,0 @@ -#define GUEST_32PAE - -#include "shadow.c" -struct shadow_ops MODE_64_PAE_HANDLER = { - .guest_paging_levels = 3, - .invlpg = shadow_invlpg_64, - .fault = shadow_fault_64, - .update_pagetables = shadow_update_pagetables, - .sync_all = sync_all, - .remove_all_write_access = remove_all_write_access, - .do_update_va_mapping = do_update_va_mapping, - .mark_mfn_out_of_sync = mark_mfn_out_of_sync, - .is_out_of_sync = is_out_of_sync, - .gva_to_gpa = gva_to_gpa_64, -}; - diff --git a/xen/arch/x86/shadow_public.c b/xen/arch/x86/shadow_public.c deleted file mode 100644 index 40aa22e4ea..0000000000 --- a/xen/arch/x86/shadow_public.c +++ /dev/null @@ -1,2143 +0,0 @@ -/****************************************************************************** - * arch/x86/shadow_public.c - * - * Copyright (c) 2005 Michael A Fetterman - * Based on an earlier implementation by Ian Pratt et al - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int alloc_p2m_table(struct domain *d); -static void free_p2m_table(struct domain *d); - -#define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16)) - - -int shadow_direct_map_init(struct domain *d) -{ - struct page_info *page; - l3_pgentry_t *root; - - if ( !(page = alloc_domheap_pages(NULL, 0, MEMF_dma)) ) - return 0; - - root = map_domain_page(page_to_mfn(page)); - memset(root, 0, PAGE_SIZE); - root[PAE_SHADOW_SELF_ENTRY] = l3e_from_page(page, __PAGE_HYPERVISOR); - - d->arch.phys_table = pagetable_from_page(page); - - unmap_domain_page(root); - return 1; -} - -void shadow_direct_map_clean(struct domain *d) -{ - unsigned long mfn; - l2_pgentry_t *l2e; - l3_pgentry_t *l3e; - int i, j; - - mfn = pagetable_get_pfn(d->arch.phys_table); - - /* - * We may fail very early before direct map is built. - */ - if ( !mfn ) - return; - - l3e = (l3_pgentry_t *)map_domain_page(mfn); - - for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ ) - { - if ( l3e_get_flags(l3e[i]) & _PAGE_PRESENT ) - { - l2e = map_domain_page(l3e_get_pfn(l3e[i])); - - for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ ) - { - if ( l2e_get_flags(l2e[j]) & _PAGE_PRESENT ) - free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[j]))); - } - unmap_domain_page(l2e); - free_domheap_page(mfn_to_page(l3e_get_pfn(l3e[i]))); - } - } - free_domheap_page(mfn_to_page(mfn)); - - unmap_domain_page(l3e); - - d->arch.phys_table = pagetable_null(); -} - -/****************************************************************************/ -/************* export interface functions ***********************************/ -/****************************************************************************/ -void free_shadow_pages(struct domain *d); - -int shadow_set_guest_paging_levels(struct domain *d, int levels) -{ - struct vcpu *v = current; - - /* - * Need to wait for VCPU0 to complete the on-going shadow ops. 
- */ - - if ( v->domain == d && v->vcpu_id ) - return 1; - - shadow_lock(d); - - switch(levels) { -#if CONFIG_PAGING_LEVELS == 4 - case 4: - if ( d->arch.ops != &MODE_64_4_HANDLER ) - d->arch.ops = &MODE_64_4_HANDLER; - shadow_unlock(d); - return 1; -#endif -#if CONFIG_PAGING_LEVELS == 3 - case 3: - if ( d->arch.ops == NULL || - shadow_mode_log_dirty(d) ) - { - if ( d->arch.ops != &MODE_64_3_HANDLER ) - d->arch.ops = &MODE_64_3_HANDLER; - } - else - { - if ( d->arch.ops == &MODE_64_2_HANDLER ) - free_shadow_pages(d); - if ( d->arch.ops != &MODE_64_PAE_HANDLER ) - d->arch.ops = &MODE_64_PAE_HANDLER; - } - shadow_unlock(d); - return 1; -#endif -#if CONFIG_PAGING_LEVELS == 4 - case 3: - if ( d->arch.ops == &MODE_64_2_HANDLER ) - free_shadow_pages(d); - if ( d->arch.ops != &MODE_64_PAE_HANDLER ) - d->arch.ops = &MODE_64_PAE_HANDLER; - shadow_unlock(d); - return 1; -#endif - case 2: -#if CONFIG_PAGING_LEVELS == 2 - if ( d->arch.ops != &MODE_32_2_HANDLER ) - d->arch.ops = &MODE_32_2_HANDLER; -#elif CONFIG_PAGING_LEVELS >= 3 - if ( d->arch.ops != &MODE_64_2_HANDLER ) - d->arch.ops = &MODE_64_2_HANDLER; -#endif - shadow_unlock(d); - return 1; - default: - shadow_unlock(d); - return 0; - } -} - -void shadow_invlpg(struct vcpu *v, unsigned long va) -{ - struct domain *d = current->domain; - d->arch.ops->invlpg(v, va); -} - -int shadow_fault(unsigned long va, struct cpu_user_regs *regs) -{ - struct domain *d = current->domain; - return d->arch.ops->fault(va, regs); -} - -void __update_pagetables(struct vcpu *v) -{ - struct domain *d = v->domain; - d->arch.ops->update_pagetables(v); -} - -void __shadow_sync_all(struct domain *d) -{ - d->arch.ops->sync_all(d); -} - -int shadow_remove_all_write_access( - struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn) -{ - return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn); -} - -int shadow_do_update_va_mapping(unsigned long va, - l1_pgentry_t val, - struct vcpu *v) -{ - struct domain *d = v->domain; - return d->arch.ops->do_update_va_mapping(va, val, v); -} - -struct out_of_sync_entry * -shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn, - unsigned long mfn) -{ - struct domain *d = v->domain; - return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn); -} - -/* - * Returns 1 if va's shadow mapping is out-of-sync. - * Returns 0 otherwise. 
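
The wrappers above (shadow_invlpg, shadow_fault, __update_pagetables, __shadow_sync_all, ...) all route through d->arch.ops, the per-mode operation table that shadow_set_guest_paging_levels() selects and that the deleted shadow_guest32*.c files populate. Below is a cut-down sketch of that dispatch shape, with invented names (paging_ops, two_level_ops, handle_fault) standing in for the real structures.

    struct paging_ops {
        int  guest_levels;
        int  (*fault)(unsigned long va);
        void (*invlpg)(unsigned long va);
    };

    /* One handler set per guest paging mode. */
    static int  fault_2level(unsigned long va)  { (void)va; return 0; }
    static void invlpg_2level(unsigned long va) { (void)va; }

    static const struct paging_ops two_level_ops = {
        .guest_levels = 2,
        .fault        = fault_2level,
        .invlpg       = invlpg_2level,
    };

    /* Entry points stay mode-agnostic: pick the table once (as the
     * paging-level switch above does), then every call forwards through it. */
    static const struct paging_ops *current_ops = &two_level_ops;

    static int handle_fault(unsigned long va)
    {
        return current_ops->fault(va);
    }
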
- */ -int __shadow_out_of_sync(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - return d->arch.ops->is_out_of_sync(v, va); -} - -unsigned long gva_to_gpa(unsigned long gva) -{ - struct domain *d = current->domain; - return d->arch.ops->gva_to_gpa(gva); -} -/****************************************************************************/ -/****************************************************************************/ -#if CONFIG_PAGING_LEVELS >= 3 - -static void inline -free_shadow_fl1_table(struct domain *d, unsigned long smfn) -{ - l1_pgentry_t *pl1e = map_domain_page(smfn); - int i; - - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) - put_page_from_l1e(pl1e[i], d); - - unmap_domain_page(pl1e); -} - -/* - * Free l2, l3, l4 shadow tables - */ - -void free_fake_shadow_l2(struct domain *d,unsigned long smfn); - -static void inline -free_shadow_tables(struct domain *d, unsigned long smfn, u32 level) -{ - pgentry_64_t *ple = map_domain_page(smfn); - int i, external = shadow_mode_external(d); - -#if CONFIG_PAGING_LEVELS >= 3 - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - struct page_info *page = mfn_to_page(smfn); - for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ ) - { - if ( entry_get_flags(ple[i]) & _PAGE_PRESENT ) - free_fake_shadow_l2(d, entry_get_pfn(ple[i])); - } - - page = mfn_to_page(entry_get_pfn(ple[0])); - free_domheap_pages(page, SL2_ORDER); - unmap_domain_page(ple); - } - else -#endif - { - /* - * No Xen mappings in external pages - */ - if ( external ) - { - for ( i = 0; i < PAGETABLE_ENTRIES; i++ ) { - if ( entry_get_flags(ple[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(ple[i])); - if (d->arch.ops->guest_paging_levels == PAGING_L3) - { -#if CONFIG_PAGING_LEVELS >= 3 - if ( i == PAE_L3_PAGETABLE_ENTRIES && level == PAGING_L4 ) -#endif - break; - } - } - } - else - { - for ( i = 0; i < PAGETABLE_ENTRIES; i++ ) - { - /* - * List the skip/break conditions to avoid freeing - * Xen private mappings. 
- */ -#if CONFIG_PAGING_LEVELS == 2 - if ( level == PAGING_L2 && !is_guest_l2_slot(0, i) ) - continue; -#endif -#if CONFIG_PAGING_LEVELS == 3 - if ( level == PAGING_L3 && i == L3_PAGETABLE_ENTRIES ) - break; - if ( level == PAGING_L2 ) - { - struct page_info *page = mfn_to_page(smfn); - if ( is_xen_l2_slot(page->u.inuse.type_info, i) ) - continue; - } -#endif -#if CONFIG_PAGING_LEVELS == 4 - if ( level == PAGING_L4 && !is_guest_l4_slot(i)) - continue; -#endif - if ( entry_get_flags(ple[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(ple[i])); - } - } - unmap_domain_page(ple); - } -} -#endif - -#if CONFIG_PAGING_LEVELS == 4 -static void alloc_monitor_pagetable(struct vcpu *v) -{ - unsigned long mmfn; - l4_pgentry_t *mpl4e; - struct page_info *mmfn_info; - struct domain *d = v->domain; - - ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */ - - mmfn_info = alloc_domheap_page(NULL); - ASSERT( mmfn_info ); - if (!mmfn_info) - { - printk("Fail to allocate monitor pagetable\n"); - domain_crash(v->domain); - } - - mmfn = page_to_mfn(mmfn_info); - mpl4e = (l4_pgentry_t *) map_domain_page_global(mmfn); - memcpy(mpl4e, idle_pg_table, PAGE_SIZE); - mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] = - l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); - - /* map the phys_to_machine map into the per domain Read-Only MPT space */ - - v->arch.monitor_table = pagetable_from_pfn(mmfn); - v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e; - mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty(); - - if ( v->vcpu_id == 0 ) - alloc_p2m_table(d); - else - { - unsigned long mfn; - - mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - if ( mfn ) - { - l4_pgentry_t *l4tab; - - l4tab = map_domain_page(mfn); - - mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = - l4tab[l4_table_offset(RO_MPT_VIRT_START)]; - - unmap_domain_page(l4tab); - } - } -} - -void free_monitor_pagetable(struct vcpu *v) -{ - unsigned long mfn; - - /* - * free monitor_table. - */ - if ( v->vcpu_id == 0 ) - free_p2m_table(v->domain); - - /* - * Then free monitor_table. 
- */ - mfn = pagetable_get_pfn(v->arch.monitor_table); - unmap_domain_page_global(v->arch.monitor_vtable); - free_domheap_page(mfn_to_page(mfn)); - - v->arch.monitor_table = pagetable_null(); - v->arch.monitor_vtable = 0; -} -#elif CONFIG_PAGING_LEVELS == 3 -static void alloc_monitor_pagetable(struct vcpu *v) -{ - unsigned long m2mfn, m3mfn; - l2_pgentry_t *mpl2e; - l3_pgentry_t *mpl3e; - struct page_info *m2mfn_info, *m3mfn_info; - struct domain *d = v->domain; - int i; - - ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */ - - m3mfn_info = alloc_domheap_pages(NULL, 0, MEMF_dma); - ASSERT( m3mfn_info ); - - m3mfn = page_to_mfn(m3mfn_info); - mpl3e = (l3_pgentry_t *) map_domain_page_global(m3mfn); - memset(mpl3e, 0, L3_PAGETABLE_ENTRIES * sizeof(l3_pgentry_t)); - - v->arch.monitor_table = pagetable_from_pfn(m3mfn); - v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e; - - m2mfn_info = alloc_domheap_page(NULL); - ASSERT( m2mfn_info ); - - m2mfn = page_to_mfn(m2mfn_info); - mpl2e = (l2_pgentry_t *) map_domain_page(m2mfn); - memset(mpl2e, 0, PAGE_SIZE); - - /* Map L2 page into L3 */ - mpl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(m2mfn, _PAGE_PRESENT); - - memcpy(&mpl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)], - &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT], - L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); - - for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) - mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = - l2e_from_page( - virt_to_page(d->arch.mm_perdomain_pt) + i, - __PAGE_HYPERVISOR); - for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ ) - mpl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] = - (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ? - l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) : - l2e_empty(); - for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ ) - mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] = l2e_empty(); - - if ( v->vcpu_id == 0 ) - { - unsigned long m1mfn; - l1_pgentry_t *mpl1e; - struct page_info *m1mfn_info; - - /* - * 2 l2 slots are allocated here, so that 4M for p2m table, - * with this we can guarantee PCI MMIO p2m entries, especially - * Cirrus VGA, can be seen by all other vcpus. - */ - for ( i = 0; i < 2; i++ ) - { - m1mfn_info = alloc_domheap_page(NULL); - ASSERT( m1mfn_info ); - - m1mfn = page_to_mfn(m1mfn_info); - mpl1e = (l1_pgentry_t *) map_domain_page(m1mfn); - memset(mpl1e, 0, PAGE_SIZE); - unmap_domain_page(mpl1e); - - /* Map L1 page into L2 */ - mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] = - l2e_from_pfn(m1mfn, __PAGE_HYPERVISOR); - } - - alloc_p2m_table(d); - } - else - { - unsigned long mfn; - - mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - if ( mfn ) - { - l3_pgentry_t *l3tab, l3e; - l2_pgentry_t *l2tab; - - l3tab = map_domain_page(mfn); - l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)]; - - /* - * NB: when CONFIG_PAGING_LEVELS == 3, - * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here. - * alloc_monitor_pagetable should guarantee this. - */ - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) - BUG(); - - l2tab = map_domain_page(l3e_get_pfn(l3e)); - - for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ ) - mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] = - l2tab[l2_table_offset(RO_MPT_VIRT_START) + i]; - - unmap_domain_page(l2tab); - unmap_domain_page(l3tab); - } - } - - unmap_domain_page(mpl2e); -} - -void free_monitor_pagetable(struct vcpu *v) -{ - unsigned long m2mfn, m3mfn; - /* - * free monitor_table. 
- */ - if ( v->vcpu_id == 0 ) - free_p2m_table(v->domain); - - m3mfn = pagetable_get_pfn(v->arch.monitor_table); - m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]); - - free_domheap_page(mfn_to_page(m2mfn)); - unmap_domain_page_global(v->arch.monitor_vtable); - free_domheap_page(mfn_to_page(m3mfn)); - - v->arch.monitor_table = pagetable_null(); - v->arch.monitor_vtable = 0; -} -#endif - -static void -shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry) -{ - void *snapshot; - - if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) - return; - - // Clear the out_of_sync bit. - // - clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info); - - // XXX Need to think about how to protect the domain's - // information less expensively. - // - snapshot = map_domain_page(entry->snapshot_mfn); - memset(snapshot, 0, PAGE_SIZE); - unmap_domain_page(snapshot); - - put_shadow_ref(entry->snapshot_mfn); -} - -void -release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry) -{ - struct page_info *page; - - page = mfn_to_page(entry->gmfn); - - // Decrement ref count of guest & shadow pages - // - put_page(page); - - // Only use entries that have low bits clear... - // - if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) - { - put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT); - entry->writable_pl1e = -2; - } - else - ASSERT( entry->writable_pl1e == -1 ); - - // Free the snapshot - // - shadow_free_snapshot(d, entry); -} - -static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn) -{ - struct out_of_sync_entry *entry = d->arch.out_of_sync; - struct out_of_sync_entry **prev = &d->arch.out_of_sync; - struct out_of_sync_entry *found = NULL; - - // NB: Be careful not to call something that manipulates this list - // while walking it. Collect the results into a separate list - // first, then walk that list. 
- // - while ( entry ) - { - if ( entry->gmfn == gmfn ) - { - // remove from out of sync list - *prev = entry->next; - - // add to found list - entry->next = found; - found = entry; - - entry = *prev; - continue; - } - prev = &entry->next; - entry = entry->next; - } - - prev = NULL; - entry = found; - while ( entry ) - { - release_out_of_sync_entry(d, entry); - - prev = &entry->next; - entry = entry->next; - } - - // Add found list to free list - if ( prev ) - { - *prev = d->arch.out_of_sync_free; - d->arch.out_of_sync_free = found; - } -} - -static inline void -shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn) -{ - if ( !shadow_mode_refcounts(d) ) - return; - - ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table); - - if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none ) - { - clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info); - - if ( page_out_of_sync(mfn_to_page(gmfn)) ) - { - remove_out_of_sync_entries(d, gmfn); - } - } -} - -static void inline -free_shadow_l1_table(struct domain *d, unsigned long smfn) -{ - l1_pgentry_t *pl1e = map_domain_page(smfn); - l1_pgentry_t *pl1e_next = 0, *sl1e_p; - int i; - struct page_info *spage = mfn_to_page(smfn); - u32 min_max = spage->tlbflush_timestamp; - int min = SHADOW_MIN(min_max); - int max; - - if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) - { - max = SHADOW_MAX_GUEST32(min_max); - pl1e_next = map_domain_page(smfn + 1); - } - else - max = SHADOW_MAX(min_max); - - for ( i = min; i <= max; i++ ) - { - if ( pl1e_next && i >= L1_PAGETABLE_ENTRIES ) - sl1e_p = &pl1e_next[i - L1_PAGETABLE_ENTRIES]; - else - sl1e_p = &pl1e[i]; - - shadow_put_page_from_l1e(*sl1e_p, d); - *sl1e_p = l1e_empty(); - } - - unmap_domain_page(pl1e); - if ( pl1e_next ) - unmap_domain_page(pl1e_next); -} - -static void inline -free_shadow_hl2_table(struct domain *d, unsigned long smfn) -{ - l1_pgentry_t *hl2 = map_domain_page(smfn); - int i, limit; - - SH_VVLOG("%s: smfn=%lx freed", __func__, smfn); - -#if CONFIG_PAGING_LEVELS == 2 - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; -#endif - - for ( i = 0; i < limit; i++ ) - { - if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT ) - put_page(mfn_to_page(l1e_get_pfn(hl2[i]))); - } - - unmap_domain_page(hl2); -} - -static void inline -free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type) -{ - l2_pgentry_t *pl2e = map_domain_page(smfn); - int i, external = shadow_mode_external(d); - - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - if ( external || is_guest_l2_slot(type, i) ) - if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT ) - put_shadow_ref(l2e_get_pfn(pl2e[i])); - - if ( (PGT_base_page_table == PGT_l2_page_table) && - shadow_mode_translate(d) && !external ) - { - // free the ref to the hl2 - // - put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)])); - } - - unmap_domain_page(pl2e); -} - -void free_fake_shadow_l2(struct domain *d, unsigned long smfn) -{ - pgentry_64_t *ple = map_domain_page(smfn); - int i; - - for ( i = 0; i < PAGETABLE_ENTRIES; i = i + 2 ) - if ( entry_get_flags(ple[i]) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(ple[i])); - - unmap_domain_page(ple); -} - -void free_shadow_page(unsigned long smfn) -{ - struct page_info *page = mfn_to_page(smfn); - unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask; - struct domain *d = page_get_owner(mfn_to_page(gmfn)); - unsigned long gpfn = mfn_to_gmfn(d, gmfn); - unsigned long type = page->u.inuse.type_info & PGT_type_mask; 
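
free_shadow_l1_table() above, like the resync path earlier, reads a packed min/max window out of tlbflush_timestamp via SHADOW_MIN/SHADOW_MAX and the SHADOW_MAX_GUEST32 macro defined near the top of this file. The packing itself is easy to model, assuming (as that definition suggests) that the max is stored inverted against the table size in the high 16 bits with the min in the low 16 bits; a zero word then decodes to the full window. The names and table size below are invented for the sketch.

    #include <assert.h>
    #include <stdint.h>

    #define NR_ENTRIES 512u   /* illustrative table size */

    /* Pack a [min,max] window into one 32-bit word: min in the low half,
     * (NR_ENTRIES - 1 - max) in the high half, so a zero word decodes to
     * the widest window [0, NR_ENTRIES-1]. */
    static uint32_t encode_min_max(uint32_t min, uint32_t max)
    {
        assert(min <= max && max < NR_ENTRIES);
        return ((NR_ENTRIES - 1 - max) << 16) | min;
    }

    static uint32_t decode_min(uint32_t enc) { return enc & 0xffffu; }
    static uint32_t decode_max(uint32_t enc) { return NR_ENTRIES - 1 - (enc >> 16); }
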
- u64 index = 0; - - SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn); - - ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); -#if CONFIG_PAGING_LEVELS >= 4 - if ( type == PGT_fl1_shadow ) - { - unsigned long mfn; - mfn = __shadow_status(d, gpfn, PGT_fl1_shadow); - if ( !mfn ) - gpfn |= PGT_high_mfn_nx; - } -#endif -#if CONFIG_PAGING_LEVELS >= 3 - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - { - if ( type == PGT_l4_shadow ) - index = page->tlbflush_timestamp; - } -#endif - - delete_shadow_status(d, gpfn, gmfn, type, index); - - switch ( type ) - { - case PGT_l1_shadow: - perfc_decr(shadow_l1_pages); - shadow_demote(d, gpfn, gmfn); - free_shadow_l1_table(d, smfn); - d->arch.shadow_page_count--; - break; -#if CONFIG_PAGING_LEVELS == 2 - case PGT_l2_shadow: - perfc_decr(shadow_l2_pages); - shadow_demote(d, gpfn, gmfn); - free_shadow_l2_table(d, smfn, page->u.inuse.type_info); - d->arch.shadow_page_count--; - break; - - case PGT_hl2_shadow: - perfc_decr(hl2_table_pages); - shadow_demote(d, gpfn, gmfn); - free_shadow_hl2_table(d, smfn); - d->arch.hl2_page_count--; - break; -#endif -#if CONFIG_PAGING_LEVELS >= 3 - case PGT_l2_shadow: - case PGT_l3_shadow: - shadow_demote(d, gpfn, gmfn); - free_shadow_tables(d, smfn, shadow_type_to_level(type)); - d->arch.shadow_page_count--; - break; - - case PGT_l4_shadow: - gpfn = gpfn & PGT_mfn_mask; - if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) - { - /* - * Since a single PDPT page can have multiple PDPs, it's possible - * that shadow_demote() has been already called for gmfn. - */ - if ( mfn_is_page_table(gmfn) ) - shadow_demote(d, gpfn, gmfn); - } else - shadow_demote(d, gpfn, gmfn); - - free_shadow_tables(d, smfn, shadow_type_to_level(type)); - d->arch.shadow_page_count--; - break; - - case PGT_fl1_shadow: - free_shadow_fl1_table(d, smfn); - d->arch.shadow_page_count--; - break; -#endif - case PGT_snapshot: - perfc_decr(snapshot_pages); - break; - - default: - printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n", - page_to_mfn(page), page->u.inuse.type_info); - break; - } - - // No TLB flushes are needed the next time this page gets allocated. - // - page->tlbflush_timestamp = 0; - page->u.free.cpumask = CPU_MASK_NONE; - - if ( type == PGT_l1_shadow ) - { - list_add(&page->list, &d->arch.free_shadow_frames); - perfc_incr(free_l1_pages); - } - else - free_domheap_page(page); -} - -static void -free_writable_pte_predictions(struct domain *d) -{ - int i; - struct shadow_status *x; - - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - u32 count; - unsigned long *gpfn_list; - - /* Skip empty buckets. 
*/ - if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) - continue; - - count = 0; - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) - count++; - - gpfn_list = xmalloc_array(unsigned long, count); - count = 0; - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) - gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask; - - while ( count ) - { - count--; - delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred, 0); - } - - xfree(gpfn_list); - } -} - -static void free_shadow_ht_entries(struct domain *d) -{ - struct shadow_status *x, *n; - - SH_VLOG("freed tables count=%d l1=%d l2=%d", - d->arch.shadow_page_count, perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages)); - - n = d->arch.shadow_ht_extras; - while ( (x = n) != NULL ) - { - d->arch.shadow_extras_count--; - n = *((struct shadow_status **)(&x[shadow_ht_extra_size])); - xfree(x); - } - - d->arch.shadow_ht_extras = NULL; - d->arch.shadow_ht_free = NULL; - - ASSERT(d->arch.shadow_extras_count == 0); - SH_LOG("freed extras, now %d", d->arch.shadow_extras_count); - - if ( d->arch.shadow_dirty_bitmap != NULL ) - { - xfree(d->arch.shadow_dirty_bitmap); - d->arch.shadow_dirty_bitmap = 0; - d->arch.shadow_dirty_bitmap_size = 0; - } - - xfree(d->arch.shadow_ht); - d->arch.shadow_ht = NULL; -} - -static void free_out_of_sync_entries(struct domain *d) -{ - struct out_of_sync_entry *x, *n; - - n = d->arch.out_of_sync_extras; - while ( (x = n) != NULL ) - { - d->arch.out_of_sync_extras_count--; - n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size])); - xfree(x); - } - - d->arch.out_of_sync_extras = NULL; - d->arch.out_of_sync_free = NULL; - d->arch.out_of_sync = NULL; - - ASSERT(d->arch.out_of_sync_extras_count == 0); - FSH_LOG("freed extra out_of_sync entries, now %d", - d->arch.out_of_sync_extras_count); -} - -void free_shadow_pages(struct domain *d) -{ - int i; - struct shadow_status *x; - struct vcpu *v; - struct list_head *list_ent, *tmp; - - /* - * WARNING! The shadow page table must not currently be in use! - * e.g., You are expected to have paused the domain and synchronized CR3. - */ - - if( !d->arch.shadow_ht ) return; - - shadow_audit(d, 1); - - // first, remove any outstanding refs from out_of_sync entries... - // - free_out_of_sync_state(d); - - // second, remove any outstanding refs from v->arch.shadow_table - // and CR3. 
- // - for_each_vcpu(d, v) - { - if ( pagetable_get_paddr(v->arch.shadow_table) ) - { - put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table)); - v->arch.shadow_table = pagetable_null(); - - if ( shadow_mode_external(d) ) - { - if ( v->arch.shadow_vtable ) - unmap_domain_page_global(v->arch.shadow_vtable); - v->arch.shadow_vtable = NULL; - } - } - - if ( v->arch.monitor_shadow_ref ) - { - put_shadow_ref(v->arch.monitor_shadow_ref); - v->arch.monitor_shadow_ref = 0; - } - } - -#if CONFIG_PAGING_LEVELS == 2 - // For external shadows, remove the monitor table's refs - // - if ( shadow_mode_external(d) ) - { - for_each_vcpu(d, v) - { - l2_pgentry_t *mpl2e = v->arch.monitor_vtable; - - if ( mpl2e ) - { - l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; - l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; - - if ( l2e_get_flags(hl2e) & _PAGE_PRESENT ) - { - put_shadow_ref(l2e_get_pfn(hl2e)); - mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty(); - } - if ( l2e_get_flags(smfn) & _PAGE_PRESENT ) - { - put_shadow_ref(l2e_get_pfn(smfn)); - mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty(); - } - } - } - } -#endif - // Now, the only refs to shadow pages that are left are from the shadow - // pages themselves. We just unpin the pinned pages, and the rest - // should automatically disappear. - // - // NB: Beware: each explicitly or implicit call to free_shadow_page - // can/will result in the hash bucket getting rewritten out from - // under us... First, collect the list of pinned pages, then - // free them. - // - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - u32 count; - unsigned long *mfn_list; - - /* Skip empty buckets. */ - if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) - continue; - - count = 0; - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( MFN_PINNED(x->smfn) ) - count++; - if ( !count ) - continue; - - mfn_list = xmalloc_array(unsigned long, count); - count = 0; - for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) - if ( MFN_PINNED(x->smfn) ) - mfn_list[count++] = x->smfn; - - while ( count ) - { - shadow_unpin(mfn_list[--count]); - } - xfree(mfn_list); - } - - /* Now free the pre-zero'ed pages from the domain. 
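
One detail of the free_shadow_pages() bucket walk above worth noting: it first copies the pinned smfns into a temporary array and only then unpins them, because (as the comment says) each free can rewrite the hash bucket under the walker. The same collect-then-act pattern in miniature, with invented types and a stubbed release() standing in for shadow_unpin():

    #include <stddef.h>
    #include <stdlib.h>

    struct node {
        unsigned long key;
        int pinned;
        struct node *next;
    };

    /* Stand-in for the real unpin/free, which may relink the chain. */
    static void release(unsigned long key) { (void)key; }

    /* Snapshot the keys of interest before acting on any of them, so the
     * walk never depends on links that release() might rewrite. */
    static void release_pinned(struct node *head)
    {
        struct node *n;
        size_t count = 0;
        unsigned long *keys;

        for ( n = head; n != NULL; n = n->next )
            if ( n->pinned )
                count++;
        if ( count == 0 )
            return;

        keys = malloc(count * sizeof(*keys));
        if ( keys == NULL )
            return;

        count = 0;
        for ( n = head; n != NULL; n = n->next )
            if ( n->pinned )
                keys[count++] = n->key;

        while ( count )
            release(keys[--count]);

        free(keys);
    }
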
*/ - list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames) - { - struct page_info *page = list_entry(list_ent, struct page_info, list); - - list_del(list_ent); - perfc_decr(free_l1_pages); - - if (d->arch.ops->guest_paging_levels == PAGING_L2) - { -#if CONFIG_PAGING_LEVELS >=3 - free_domheap_pages(page, SL1_ORDER); -#else - free_domheap_page(page); -#endif - } - else - free_domheap_page(page); - } - - shadow_audit(d, 0); - - SH_LOG("Free shadow table."); -} - -void __shadow_mode_disable(struct domain *d) -{ - struct vcpu *v; -#ifndef NDEBUG - int i; -#endif - - if ( unlikely(!shadow_mode_enabled(d)) ) - return; - - free_shadow_pages(d); - free_writable_pte_predictions(d); - -#ifndef NDEBUG - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 ) - { - printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%"PRIx64"\n", - __FILE__, i, (u64)d->arch.shadow_ht[i].gpfn_and_flags); - BUG(); - } - } -#endif - - d->arch.shadow_mode = 0; - - free_shadow_ht_entries(d); - free_out_of_sync_entries(d); - - for_each_vcpu(d, v) - update_pagetables(v); -} - - -int __shadow_mode_enable(struct domain *d, unsigned int mode) -{ - struct vcpu *v; - int new_modes = (mode & ~d->arch.shadow_mode); -#if defined(CONFIG_PAGING_LEVELS) - int initial_paging_levels = 3; -#endif - - // Gotta be adding something to call this function. - ASSERT(new_modes); - - // can't take anything away by calling this function. - ASSERT(!(d->arch.shadow_mode & ~mode)); - -#if defined(CONFIG_PAGING_LEVELS) - if ( CONFIG_PAGING_LEVELS == 2 ) - initial_paging_levels = CONFIG_PAGING_LEVELS; - if ( !shadow_set_guest_paging_levels(d, - initial_paging_levels) ) { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } -#endif - - for_each_vcpu(d, v) - { - invalidate_shadow_ldt(v); - - // We need to set these up for __update_pagetables(). - // See the comment there. 
- - /* - * arch.guest_vtable - */ - if ( v->arch.guest_vtable && - (v->arch.guest_vtable != __linear_l2_table) ) - { - unmap_domain_page_global(v->arch.guest_vtable); - } - if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) - v->arch.guest_vtable = __linear_l2_table; - else - v->arch.guest_vtable = NULL; - - /* - * arch.shadow_vtable - */ - if ( v->arch.shadow_vtable && - (v->arch.shadow_vtable != __shadow_linear_l2_table) ) - { - unmap_domain_page_global(v->arch.shadow_vtable); - } - if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2) - v->arch.shadow_vtable = __shadow_linear_l2_table; - else - v->arch.shadow_vtable = NULL; - -#if CONFIG_PAGING_LEVELS == 2 - /* - * arch.hl2_vtable - */ - if ( v->arch.hl2_vtable && - (v->arch.hl2_vtable != __linear_hl2_table) ) - { - unmap_domain_page_global(v->arch.hl2_vtable); - } - if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) - v->arch.hl2_vtable = __linear_hl2_table; - else - v->arch.hl2_vtable = NULL; -#endif - /* - * arch.monitor_table & arch.monitor_vtable - */ - if ( v->arch.monitor_vtable ) - { - free_monitor_pagetable(v); - } - if ( mode & SHM_external ) - { - alloc_monitor_pagetable(v); - } - } - - if ( new_modes & SHM_enable ) - { - ASSERT( !d->arch.shadow_ht ); - d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); - if ( d->arch.shadow_ht == NULL ) - goto nomem; - - memset(d->arch.shadow_ht, 0, - shadow_ht_buckets * sizeof(struct shadow_status)); - } - - if ( new_modes & SHM_log_dirty ) - { - ASSERT( !d->arch.shadow_dirty_bitmap ); - d->arch.shadow_dirty_bitmap_size = - (d->shared_info->arch.max_pfn + 63) & ~63; - d->arch.shadow_dirty_bitmap = - xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size / - (8 * sizeof(unsigned long))); - if ( d->arch.shadow_dirty_bitmap == NULL ) - { - d->arch.shadow_dirty_bitmap_size = 0; - goto nomem; - } - memset(d->arch.shadow_dirty_bitmap, 0, - d->arch.shadow_dirty_bitmap_size/8); - } - - if ( new_modes & SHM_translate ) - { - if ( !(new_modes & SHM_external) ) - { - ASSERT( !pagetable_get_paddr(d->arch.phys_table) ); - if ( !alloc_p2m_table(d) ) - { - printk("alloc_p2m_table failed (out-of-memory?)\n"); - goto nomem; - } - } - } - - // Get rid of any shadow pages from any previous shadow mode. - // - free_shadow_pages(d); - - d->arch.shadow_mode = mode; - - if ( shadow_mode_refcounts(d) ) - { - struct list_head *list_ent; - struct page_info *page; - - /* - * Tear down its counts by disassembling its page-table-based refcounts - * Also remove CR3's gcount/tcount. - * That leaves things like GDTs and LDTs and external refs in tact. - * - * Most pages will be writable tcount=0. - * Some will still be L1 tcount=0 or L2 tcount=0. - * Maybe some pages will be type none tcount=0. - * Pages granted external writable refs (via grant tables?) will - * still have a non-zero tcount. That's OK. - * - * gcounts will generally be 1 for PGC_allocated. - * GDTs and LDTs will have additional gcounts. - * Any grant-table based refs will still be in the gcount. - * - * We attempt to grab writable refs to each page thus setting its type - * Immediately put back those type refs. - * - * Assert that no pages are left with L1/L2/L3/L4 type. 
- */ - audit_adjust_pgtables(d, -1, 1); - - - for (list_ent = d->page_list.next; list_ent != &d->page_list; - list_ent = page->list.next) { - - page = list_entry(list_ent, struct page_info, list); - if ( !get_page_type(page, PGT_writable_page) ) - BUG(); - put_page_type(page); - /* - * We use tlbflush_timestamp as back pointer to smfn, and need to - * clean up it. - */ - if (shadow_mode_external(d)) - page->tlbflush_timestamp = 0; - } - - audit_adjust_pgtables(d, 1, 1); - - } - - return 0; - - nomem: - if ( (new_modes & SHM_enable) ) - { - xfree(d->arch.shadow_ht); - d->arch.shadow_ht = NULL; - } - if ( (new_modes & SHM_log_dirty) ) - { - xfree(d->arch.shadow_dirty_bitmap); - d->arch.shadow_dirty_bitmap = NULL; - } - - return -ENOMEM; -} - - -int shadow_mode_enable(struct domain *d, unsigned int mode) -{ - int rc; - shadow_lock(d); - rc = __shadow_mode_enable(d, mode); - shadow_unlock(d); - return rc; -} - -static int shadow_mode_table_op( - struct domain *d, dom0_shadow_control_t *sc) -{ - unsigned int op = sc->op; - int i, rc = 0; - struct vcpu *v; - - ASSERT(shadow_lock_is_acquired(d)); - - SH_VLOG("shadow mode table op %lx %lx count %d", - (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */ - (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */ - d->arch.shadow_page_count); - - shadow_audit(d, 1); - - switch ( op ) - { - case DOM0_SHADOW_CONTROL_OP_FLUSH: - free_shadow_pages(d); - - d->arch.shadow_fault_count = 0; - d->arch.shadow_dirty_count = 0; - - break; - - case DOM0_SHADOW_CONTROL_OP_CLEAN: - free_shadow_pages(d); - - sc->stats.fault_count = d->arch.shadow_fault_count; - sc->stats.dirty_count = d->arch.shadow_dirty_count; - - d->arch.shadow_fault_count = 0; - d->arch.shadow_dirty_count = 0; - - if ( guest_handle_is_null(sc->dirty_bitmap) || - (d->arch.shadow_dirty_bitmap == NULL) ) - { - rc = -EINVAL; - break; - } - - if ( sc->pages > d->arch.shadow_dirty_bitmap_size ) - sc->pages = d->arch.shadow_dirty_bitmap_size; - -#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ - for ( i = 0; i < sc->pages; i += chunk ) - { - int bytes = ((((sc->pages - i) > chunk) ? 
- chunk : (sc->pages - i)) + 7) / 8; - - if ( copy_to_guest_offset( - sc->dirty_bitmap, i/(8*sizeof(unsigned long)), - d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), - (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) ) - { - rc = -EINVAL; - break; - } - memset( - d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), - 0, bytes); - } - - break; - - case DOM0_SHADOW_CONTROL_OP_PEEK: - sc->stats.fault_count = d->arch.shadow_fault_count; - sc->stats.dirty_count = d->arch.shadow_dirty_count; - - if ( guest_handle_is_null(sc->dirty_bitmap) || - (d->arch.shadow_dirty_bitmap == NULL) ) - { - rc = -EINVAL; - break; - } - - if ( sc->pages > d->arch.shadow_dirty_bitmap_size ) - sc->pages = d->arch.shadow_dirty_bitmap_size; - - if ( copy_to_guest(sc->dirty_bitmap, - d->arch.shadow_dirty_bitmap, - (((sc->pages+7)/8)+sizeof(unsigned long)-1) / - sizeof(unsigned long)) ) - { - rc = -EINVAL; - break; - } - - break; - - default: - rc = -EINVAL; - break; - } - - SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count); - shadow_audit(d, 1); - - for_each_vcpu(d,v) - __update_pagetables(v); - - return rc; -} - -int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) -{ - unsigned int op = sc->op; - int rc = 0; - struct vcpu *v; - - if ( unlikely(d == current->domain) ) - { - DPRINTK("Don't try to do a shadow op on yourself!\n"); - return -EINVAL; - } - - domain_pause(d); - - shadow_lock(d); - - switch ( op ) - { - case DOM0_SHADOW_CONTROL_OP_OFF: - if ( shadow_mode_enabled(d) ) - { - __shadow_sync_all(d); - __shadow_mode_disable(d); - } - break; - - case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: - free_shadow_pages(d); - rc = __shadow_mode_enable(d, SHM_enable); - break; - - case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: - free_shadow_pages(d); - rc = __shadow_mode_enable( - d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty); - break; - - case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE: - free_shadow_pages(d); - rc = __shadow_mode_enable( - d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate); - break; - - default: - rc = shadow_mode_enabled(d) ? 
shadow_mode_table_op(d, sc) : -EINVAL; - break; - } - - shadow_unlock(d); - - for_each_vcpu(d,v) - update_pagetables(v); - - domain_unpause(d); - - return rc; -} - -void shadow_mode_init(void) -{ -} - -int _shadow_mode_refcounts(struct domain *d) -{ - return shadow_mode_refcounts(d); -} - -static int -map_p2m_entry(pgentry_64_t *top_tab, unsigned long gpfn, unsigned long mfn) -{ -#if CONFIG_PAGING_LEVELS >= 4 - pgentry_64_t l4e = { 0 }; - pgentry_64_t *l3tab = NULL; -#endif -#if CONFIG_PAGING_LEVELS >= 3 - pgentry_64_t l3e = { 0 }; -#endif - l2_pgentry_t *l2tab = NULL; - l1_pgentry_t *l1tab = NULL; - unsigned long *l0tab = NULL; - l2_pgentry_t l2e = { 0 }; - l1_pgentry_t l1e = { 0 }; - struct page_info *page; - unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn)); - -#if CONFIG_PAGING_LEVELS >= 4 - l4e = top_tab[l4_table_offset(va)]; - if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - goto nomem; - - l3tab = map_domain_page(page_to_mfn(page)); - memset(l3tab, 0, PAGE_SIZE); - l4e = top_tab[l4_table_offset(va)] = - entry_from_page(page, __PAGE_HYPERVISOR); - } - else - l3tab = map_domain_page(entry_get_pfn(l4e)); - - l3e = l3tab[l3_table_offset(va)]; - if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - goto nomem; - - l2tab = map_domain_page(page_to_mfn(page)); - memset(l2tab, 0, PAGE_SIZE); - l3e = l3tab[l3_table_offset(va)] = - entry_from_page(page, __PAGE_HYPERVISOR); - } - else - l2tab = map_domain_page(entry_get_pfn(l3e)); - - unmap_domain_page(l3tab); -#else - l3e = top_tab[l3_table_offset(va)]; - - /* - * NB: when CONFIG_PAGING_LEVELS == 3, - * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here. - * alloc_monitor_pagetable should guarantee this. 
- */ - if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) - BUG(); - - l2tab = map_domain_page(entry_get_pfn(l3e)); -#endif - - l2e = l2tab[l2_table_offset(va)]; - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - goto nomem; - - l1tab = map_domain_page(page_to_mfn(page)); - memset(l1tab, 0, PAGE_SIZE); - l2e = l2tab[l2_table_offset(va)] = - l2e_from_page(page, __PAGE_HYPERVISOR); - } - else - l1tab = map_domain_page(l2e_get_pfn(l2e)); - - unmap_domain_page(l2tab); - - l1e = l1tab[l1_table_offset(va)]; - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) - { - page = alloc_domheap_page(NULL); - if ( !page ) - goto nomem; - - l0tab = map_domain_page(page_to_mfn(page)); - memset(l0tab, 0, PAGE_SIZE); - l1e = l1tab[l1_table_offset(va)] = - l1e_from_page(page, __PAGE_HYPERVISOR); - } - else - l0tab = map_domain_page(l1e_get_pfn(l1e)); - - unmap_domain_page(l1tab); - - l0tab[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn; - - unmap_domain_page(l0tab); - - return 1; - -nomem: - return 0; -} - -int -set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn, - struct domain_mmap_cache *l2cache, - struct domain_mmap_cache *l1cache) -{ - unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - pgentry_64_t *top_tab; - int error; - - ASSERT(tabmfn != 0); - ASSERT(shadow_lock_is_acquired(d)); - - top_tab = map_domain_page_with_cache(tabmfn, l2cache); - - if ( !(error = map_p2m_entry(top_tab, gpfn, mfn)) ) - domain_crash(d); - - unmap_domain_page_with_cache(top_tab, l2cache); - - return error; -} - -static int -alloc_p2m_table(struct domain *d) -{ - struct list_head *list_ent; - pgentry_64_t *top_tab = NULL; - unsigned long gpfn, mfn; - int error = 0; - - ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ); - - top_tab = map_domain_page( - pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); - - list_ent = d->page_list.next; - - while ( list_ent != &d->page_list ) - { - struct page_info *page; - - page = list_entry(list_ent, struct page_info, list); - mfn = page_to_mfn(page); - - gpfn = get_gpfn_from_mfn(mfn); - - if ( !(error = map_p2m_entry(top_tab, gpfn, mfn)) ) - { - domain_crash(d); - break; - } - - list_ent = page->list.next; - } - - unmap_domain_page(top_tab); - - return error; -} - -#if CONFIG_PAGING_LEVELS >= 3 -static void -free_p2m_table(struct domain *d) -{ - unsigned long va; - l1_pgentry_t *l1tab; - l1_pgentry_t l1e; - l2_pgentry_t *l2tab; - l2_pgentry_t l2e; -#if CONFIG_PAGING_LEVELS >= 3 - l3_pgentry_t *l3tab; - l3_pgentry_t l3e; -#endif -#if CONFIG_PAGING_LEVELS == 4 - int i3; - l4_pgentry_t *l4tab; - l4_pgentry_t l4e; -#endif - - ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ); - -#if CONFIG_PAGING_LEVELS == 4 - l4tab = map_domain_page( - pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); -#endif -#if CONFIG_PAGING_LEVELS == 3 - l3tab = map_domain_page( - pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); - - l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)]; - - /* - * NB: when CONFIG_PAGING_LEVELS == 3, - * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here. - * alloc_monitor_pagetable should guarantee this. 
- */ - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) - BUG(); - - l2tab = map_domain_page(l3e_get_pfn(l3e)); -#endif - - for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; ) - { -#if CONFIG_PAGING_LEVELS == 4 - l4e = l4tab[l4_table_offset(va)]; - - if ( l4e_get_flags(l4e) & _PAGE_PRESENT ) - { - l3tab = map_domain_page(l4e_get_pfn(l4e)); - - for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ ) - { - l3e = l3tab[l3_table_offset(va)]; - - if ( l3e_get_flags(l3e) & _PAGE_PRESENT ) - { - int i2; - - l2tab = map_domain_page(l3e_get_pfn(l3e)); - - for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) - { -#endif - l2e = l2tab[l2_table_offset(va)]; - - if ( l2e_get_flags(l2e) & _PAGE_PRESENT ) - { - int i1; - - l1tab = map_domain_page(l2e_get_pfn(l2e)); - - /* - * unsigned long phys_to_machine_mapping[] - */ - for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++ ) - { - l1e = l1tab[l1_table_offset(va)]; - - if ( l1e_get_flags(l1e) & _PAGE_PRESENT ) - free_domheap_page(mfn_to_page(l1e_get_pfn(l1e))); - - va += PAGE_SIZE; - } - unmap_domain_page(l1tab); - free_domheap_page(mfn_to_page(l2e_get_pfn(l2e))); - } - else - va += PAGE_SIZE * L1_PAGETABLE_ENTRIES; - -#if CONFIG_PAGING_LEVELS == 4 - } - unmap_domain_page(l2tab); - free_domheap_page(mfn_to_page(l3e_get_pfn(l3e))); - } - else - va += PAGE_SIZE * L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES; - } - unmap_domain_page(l3tab); - free_domheap_page(mfn_to_page(l4e_get_pfn(l4e))); - } - else - va += PAGE_SIZE * - L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES; -#endif - } - -#if CONFIG_PAGING_LEVELS == 4 - unmap_domain_page(l4tab); -#endif -#if CONFIG_PAGING_LEVELS == 3 - unmap_domain_page(l3tab); -#endif -} -#endif - -void shadow_l1_normal_pt_update( - struct domain *d, - paddr_t pa, l1_pgentry_t gpte, - struct domain_mmap_cache *cache) -{ - unsigned long sl1mfn; - l1_pgentry_t *spl1e, spte; - - shadow_lock(d); - - sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow); - if ( sl1mfn ) - { - SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpde=%" PRIpte, - (void *)pa, l1e_get_intpte(gpte)); - l1pte_propagate_from_guest(current->domain, gpte, &spte); - - spl1e = map_domain_page_with_cache(sl1mfn, cache); - spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte; - unmap_domain_page_with_cache(spl1e, cache); - } - - shadow_unlock(d); -} - -void shadow_l2_normal_pt_update( - struct domain *d, - paddr_t pa, l2_pgentry_t gpde, - struct domain_mmap_cache *cache) -{ - unsigned long sl2mfn; - l2_pgentry_t *spl2e; - - shadow_lock(d); - - sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow); - if ( sl2mfn ) - { - SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte, - (void *)pa, l2e_get_intpte(gpde)); - spl2e = map_domain_page_with_cache(sl2mfn, cache); - validate_pde_change(d, gpde, - &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]); - unmap_domain_page_with_cache(spl2e, cache); - } - - shadow_unlock(d); -} - -#if CONFIG_PAGING_LEVELS >= 3 -void shadow_l3_normal_pt_update( - struct domain *d, - paddr_t pa, l3_pgentry_t l3e, - struct domain_mmap_cache *cache) -{ - unsigned long sl3mfn; - pgentry_64_t *spl3e; - - shadow_lock(d); - - sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow); - if ( sl3mfn ) - { - SH_VVLOG("shadow_l3_normal_pt_update pa=%p, l3e=%" PRIpte, - (void *)pa, l3e_get_intpte(l3e)); - spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache); - validate_entry_change(d, (pgentry_64_t *) &l3e, - &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)], - 
shadow_type_to_level(PGT_l3_shadow)); - unmap_domain_page_with_cache(spl3e, cache); - } - - shadow_unlock(d); -} -#endif - -#if CONFIG_PAGING_LEVELS >= 4 -void shadow_l4_normal_pt_update( - struct domain *d, - paddr_t pa, l4_pgentry_t l4e, - struct domain_mmap_cache *cache) -{ - unsigned long sl4mfn; - pgentry_64_t *spl4e; - - shadow_lock(d); - - sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow); - if ( sl4mfn ) - { - SH_VVLOG("shadow_l4_normal_pt_update pa=%p, l4e=%" PRIpte, - (void *)pa, l4e_get_intpte(l4e)); - spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache); - validate_entry_change(d, (pgentry_64_t *)&l4e, - &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)], - shadow_type_to_level(PGT_l4_shadow)); - unmap_domain_page_with_cache(spl4e, cache); - } - - shadow_unlock(d); -} -#endif - -static void -translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn) -{ - int i; - l1_pgentry_t *l1; - - l1 = map_domain_page(l1mfn); - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) - { - if ( is_guest_l1_slot(i) && - (l1e_get_flags(l1[i]) & _PAGE_PRESENT) ) - { - unsigned long mfn = l1e_get_pfn(l1[i]); - unsigned long gpfn = mfn_to_gmfn(d, mfn); - ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); - l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i])); - } - } - unmap_domain_page(l1); -} - -// This is not general enough to handle arbitrary pagetables -// with shared L1 pages, etc., but it is sufficient for bringing -// up dom0. -// -void -translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn, - unsigned int type) -{ - int i; - l2_pgentry_t *l2; - - ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d)); - - l2 = map_domain_page(l2mfn); - for (i = 0; i < L2_PAGETABLE_ENTRIES; i++) - { - if ( is_guest_l2_slot(type, i) && - (l2e_get_flags(l2[i]) & _PAGE_PRESENT) ) - { - unsigned long mfn = l2e_get_pfn(l2[i]); - unsigned long gpfn = mfn_to_gmfn(d, mfn); - ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); - l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i])); - translate_l1pgtable(d, p2m, mfn); - } - } - unmap_domain_page(l2); -} - -void -remove_shadow(struct domain *d, unsigned long gpfn, u32 stype) -{ - unsigned long smfn; - - shadow_lock(d); - - while ( stype >= PGT_l1_shadow ) - { - smfn = __shadow_status(d, gpfn, stype); - if ( smfn && MFN_PINNED(smfn) ) - shadow_unpin(smfn); - stype -= PGT_l1_shadow; - } - - shadow_unlock(d); -} - -unsigned long -get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn) -{ - unsigned long va, tabpfn; - l1_pgentry_t *l1, l1e; - l2_pgentry_t *l2, l2e; -#if CONFIG_PAGING_LEVELS >= 4 - pgentry_64_t *l4 = NULL; - pgentry_64_t l4e = { 0 }; -#endif - pgentry_64_t *l3 = NULL; - pgentry_64_t l3e = { 0 }; - unsigned long *l0tab = NULL; - unsigned long mfn; - - ASSERT(shadow_mode_translate(d)); - - perfc_incrc(get_mfn_from_gpfn_foreign); - - va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn)); - - tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - if ( !tabpfn ) - return INVALID_MFN; - -#if CONFIG_PAGING_LEVELS >= 4 - l4 = map_domain_page(tabpfn); - l4e = l4[l4_table_offset(va)]; - unmap_domain_page(l4); - if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) ) - return INVALID_MFN; - - l3 = map_domain_page(entry_get_pfn(l4e)); -#else - l3 = map_domain_page(tabpfn); -#endif - l3e = l3[l3_table_offset(va)]; - unmap_domain_page(l3); - if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) - return INVALID_MFN; - l2 = map_domain_page(entry_get_pfn(l3e)); - l2e = l2[l2_table_offset(va)]; - unmap_domain_page(l2); - if ( 
!(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - return INVALID_MFN; - - l1 = map_domain_page(l2e_get_pfn(l2e)); - l1e = l1[l1_table_offset(va)]; - unmap_domain_page(l1); - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) - return INVALID_MFN; - - l0tab = map_domain_page(l1e_get_pfn(l1e)); - mfn = l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1)]; - unmap_domain_page(l0tab); - return mfn; -} - -static u32 remove_all_access_in_page( - struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn) -{ - l1_pgentry_t *pl1e = map_domain_page(l1mfn); - l1_pgentry_t match, ol2e; - unsigned long flags = _PAGE_PRESENT; - int i; - u32 count = 0; - int is_l1_shadow = - ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) == - PGT_l1_shadow); - - match = l1e_from_pfn(forbidden_gmfn, flags); - - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) - { - if ( l1e_has_changed(pl1e[i], match, flags) ) - continue; - - ol2e = pl1e[i]; - pl1e[i] = l1e_empty(); - count++; - - if ( is_l1_shadow ) - shadow_put_page_from_l1e(ol2e, d); - else /* must be an hl2 page */ - put_page(mfn_to_page(forbidden_gmfn)); - } - - unmap_domain_page(pl1e); - - return count; -} - -static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn) -{ - int i; - struct shadow_status *a; - u32 count = 0; - - if ( unlikely(!shadow_mode_enabled(d)) ) - return 0; - - ASSERT(shadow_lock_is_acquired(d)); - perfc_incrc(remove_all_access); - - for (i = 0; i < shadow_ht_buckets; i++) - { - a = &d->arch.shadow_ht[i]; - while ( a && a->gpfn_and_flags ) - { - switch (a->gpfn_and_flags & PGT_type_mask) - { - case PGT_l1_shadow: - case PGT_l2_shadow: - case PGT_l3_shadow: - case PGT_l4_shadow: - case PGT_hl2_shadow: - count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn); - break; - case PGT_snapshot: - case PGT_writable_pred: - // these can't hold refs to the forbidden page - break; - default: - BUG(); - } - - a = a->next; - } - } - - return count; -} - -void shadow_drop_references( - struct domain *d, struct page_info *page) -{ - if ( likely(!shadow_mode_refcounts(d)) || - ((page->u.inuse.type_info & PGT_count_mask) == 0) ) - return; - - /* XXX This needs more thought... */ - printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n", - __func__, page_to_mfn(page)); - printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page), - page->count_info, page->u.inuse.type_info); - - shadow_lock(d); - __shadow_remove_all_access(d, page_to_mfn(page)); - shadow_unlock(d); - - printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page), - page->count_info, page->u.inuse.type_info); -} - -/* XXX Needs more thought. Neither pretty nor fast: a place holder. */ -void shadow_sync_and_drop_references( - struct domain *d, struct page_info *page) -{ - if ( likely(!shadow_mode_refcounts(d)) ) - return; - - shadow_lock(d); - - if ( page_out_of_sync(page) ) - __shadow_sync_mfn(d, page_to_mfn(page)); - - __shadow_remove_all_access(d, page_to_mfn(page)); - - shadow_unlock(d); -} - -void clear_all_shadow_status(struct domain *d) -{ - struct vcpu *v = current; - - /* - * Don't clean up while other vcpus are working. 
- */ - if ( v->vcpu_id ) - return; - - shadow_lock(d); - - free_shadow_pages(d); - free_shadow_ht_entries(d); - d->arch.shadow_ht = - xmalloc_array(struct shadow_status, shadow_ht_buckets); - if ( d->arch.shadow_ht == NULL ) { - printk("clear all shadow status:xmalloc fail\n"); - domain_crash_synchronous(); - } - memset(d->arch.shadow_ht, 0, - shadow_ht_buckets * sizeof(struct shadow_status)); - - free_out_of_sync_entries(d); - - shadow_unlock(d); -} - - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index a78ed07d26..734bd41797 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -896,7 +896,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) v = alloc_idle_vcpu(cpu); BUG_ON(v == NULL); - v->arch.monitor_table = pagetable_from_paddr(__pa(idle_pg_table)); + v->arch.cr3 = __pa(idle_pg_table); /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 87f9a4fd42..2d398712fe 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -277,6 +277,21 @@ void show_stack(struct cpu_user_regs *regs) show_trace(regs); } +void show_xen_trace() +{ + struct cpu_user_regs regs; +#ifdef __x86_64 + __asm__("movq %%rsp,%0" : "=m" (regs.rsp)); + __asm__("movq %%rbp,%0" : "=m" (regs.rbp)); + __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip)); +#else + __asm__("movl %%esp,%0" : "=m" (regs.esp)); + __asm__("movl %%ebp,%0" : "=m" (regs.ebp)); + __asm__("call 1f; 1: popl %0" : "=a" (regs.eip)); +#endif + show_trace(®s); +} + void show_stack_overflow(unsigned long esp) { #ifdef MEMORY_GUARD @@ -861,8 +876,8 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) if ( unlikely(IN_HYPERVISOR_RANGE(addr)) ) { - if ( shadow_mode_external(d) && guest_mode(regs) ) - return shadow_fault(addr, regs); + if ( shadow2_mode_external(d) && guest_mode(regs) ) + return shadow2_fault(addr, regs); if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) return handle_gdt_ldt_mapping_fault( addr - GDT_LDT_VIRT_START, regs); @@ -873,15 +888,15 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0); } - if ( unlikely(shadow_mode_enabled(d)) ) - return shadow_fault(addr, regs); - if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) && guest_kernel_mode(v, regs) && ((regs->error_code & (PGERR_write_access|PGERR_page_present)) == (PGERR_write_access|PGERR_page_present)) ) return ptwr_do_page_fault(d, addr, regs) ? 
EXCRET_fault_fixed : 0; + if ( shadow2_mode_enabled(d) ) + return shadow2_fault(addr, regs); + return 0; } @@ -906,6 +921,13 @@ asmlinkage int do_page_fault(struct cpu_user_regs *regs) perfc_incrc(page_faults); + if ( shadow2_mode_enabled(current->domain) ) + debugtrace_printk("%s %s %d dom=%d eip=%p cr2=%p code=%d cs=%x\n", + __func__, __FILE__, __LINE__, + current->domain->domain_id, + (void *)regs->eip, (void *)addr, regs->error_code, + regs->cs); + if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) ) return rc; diff --git a/xen/arch/x86/x86_32/domain_page.c b/xen/arch/x86/x86_32/domain_page.c index db3237242c..8fe7b9b344 100644 --- a/xen/arch/x86/x86_32/domain_page.c +++ b/xen/arch/x86/x86_32/domain_page.c @@ -15,6 +15,7 @@ #include #include #include +#include static inline struct vcpu *mapcache_current_vcpu(void) { @@ -58,10 +59,10 @@ void *map_domain_page(unsigned long pfn) cache = &v->domain->arch.mapcache; hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)]; - if ( hashent->pfn == pfn ) + if ( hashent->pfn == pfn && (idx = hashent->idx) != MAPHASHENT_NOTINUSE ) { - idx = hashent->idx; hashent->refcnt++; + ASSERT(idx < MAPCACHE_ENTRIES); ASSERT(hashent->refcnt != 0); ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn); goto out; @@ -178,6 +179,30 @@ void mapcache_init(struct domain *d) MAPHASHENT_NOTINUSE; } +paddr_t mapped_domain_page_to_maddr(void *va) +/* Convert a pointer in a mapped domain page to a machine address. + * Takes any pointer that's valid for use in unmap_domain_page() */ +{ + unsigned int idx; + struct vcpu *v; + struct mapcache *cache; + unsigned long pfn; + + ASSERT(!in_irq()); + + ASSERT((void *)MAPCACHE_VIRT_START <= va); + ASSERT(va < (void *)MAPCACHE_VIRT_END); + + v = mapcache_current_vcpu(); + + cache = &v->domain->arch.mapcache; + + idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT; + pfn = l1e_get_pfn(cache->l1tab[idx]); + return ((paddr_t) pfn << PAGE_SHIFT + | ((unsigned long) va & ~PAGE_MASK)); +} + #define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT)) static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)]; static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)]; @@ -233,6 +258,8 @@ void unmap_domain_page_global(void *va) l1_pgentry_t *pl1e; unsigned int idx; + ASSERT((__va >= IOREMAP_VIRT_START) && (__va <= (IOREMAP_VIRT_END - 1))); + /* /First/, we zap the PTE. */ pl2e = virt_to_xen_l2e(__va); pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va); diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c index 868140e586..dc2450201a 100644 --- a/xen/arch/x86/x86_32/mm.c +++ b/xen/arch/x86/x86_32/mm.c @@ -75,8 +75,7 @@ void __init paging_init(void) printk("PAE disabled.\n"); #endif - idle_vcpu[0]->arch.monitor_table = - pagetable_from_paddr(__pa(idle_pg_table)); + idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table); if ( cpu_has_pge ) { diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index d5db7f3b30..f173c05d83 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -81,8 +81,7 @@ void __init paging_init(void) l2_pgentry_t *l2_ro_mpt; struct page_info *pg; - idle_vcpu[0]->arch.monitor_table = - pagetable_from_paddr(__pa(idle_pg_table)); + idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table); /* Create user-accessible L2 directory to map the MPT for guests. 
*/ l3_ro_mpt = alloc_xenheap_page(); diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index cfe2a6a5a0..84c9c35952 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -84,7 +84,8 @@ void show_page_walk(unsigned long addr) l4e = l4t[l4_table_offset(addr)]; mfn = l4e_get_pfn(l4e); pfn = get_gpfn_from_mfn(mfn); - printk(" L4 = %"PRIpte" %016lx\n", l4e_get_intpte(l4e), pfn); + printk(" L4[0x%lx] = %"PRIpte" %016lx\n", + l4_table_offset(addr), l4e_get_intpte(l4e), pfn); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return; @@ -92,7 +93,8 @@ void show_page_walk(unsigned long addr) l3e = l3t[l3_table_offset(addr)]; mfn = l3e_get_pfn(l3e); pfn = get_gpfn_from_mfn(mfn); - printk(" L3 = %"PRIpte" %016lx\n", l3e_get_intpte(l3e), pfn); + printk(" L3[0x%lx] = %"PRIpte" %016lx\n", + l3_table_offset(addr), l3e_get_intpte(l3e), pfn); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) return; @@ -100,7 +102,8 @@ void show_page_walk(unsigned long addr) l2e = l2t[l2_table_offset(addr)]; mfn = l2e_get_pfn(l2e); pfn = get_gpfn_from_mfn(mfn); - printk(" L2 = %"PRIpte" %016lx %s\n", l2e_get_intpte(l2e), pfn, + printk(" L2[0x%lx] = %"PRIpte" %016lx %s\n", + l2_table_offset(addr), l2e_get_intpte(l2e), pfn, (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : ""); if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_flags(l2e) & _PAGE_PSE) ) @@ -110,7 +113,8 @@ void show_page_walk(unsigned long addr) l1e = l1t[l1_table_offset(addr)]; mfn = l1e_get_pfn(l1e); pfn = get_gpfn_from_mfn(mfn); - printk(" L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn); + printk(" L1[0x%lx] = %"PRIpte" %016lx\n", + l1_table_offset(addr), l1e_get_intpte(l1e), pfn); } asmlinkage void double_fault(void); @@ -162,7 +166,7 @@ void toggle_guest_mode(struct vcpu *v) { v->arch.flags ^= TF_kernel_mode; __asm__ __volatile__ ( "swapgs" ); - update_pagetables(v); + update_cr3(v); write_ptbase(v); } diff --git a/xen/common/acm_ops.c b/xen/common/acm_ops.c index 6c65612799..3692577873 100644 --- a/xen/common/acm_ops.c +++ b/xen/common/acm_ops.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c index ad33217711..c8ba260711 100644 --- a/xen/common/grant_table.c +++ b/xen/common/grant_table.c @@ -434,7 +434,7 @@ __gnttab_unmap_grant_ref( /* If just unmapped a writable mapping, mark as dirtied */ if ( !(flags & GNTMAP_readonly) ) - gnttab_log_dirty(rd, frame); + gnttab_mark_dirty(rd, frame); if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) && !(flags & GNTMAP_readonly) ) @@ -731,7 +731,7 @@ __release_grant_for_copy( const unsigned long r_frame = act->frame; if ( !readonly ) - gnttab_log_dirty(rd, r_frame); + gnttab_mark_dirty(rd, r_frame); spin_lock(&rd->grant_table->lock); if ( readonly ) diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index fb7118e71f..1fb50b6bd2 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -241,9 +241,6 @@ static void read_clocks(unsigned char key) } extern void dump_runq(unsigned char key); -#ifndef NDEBUG -extern void audit_domains_key(unsigned char key); -#endif #ifdef PERF_COUNTERS extern void perfc_printall(unsigned char key); @@ -261,10 +258,16 @@ static void do_debug_key(unsigned char key, struct cpu_user_regs *regs) #ifndef NDEBUG static void debugtrace_key(unsigned char key) { - debugtrace_send_to_console = !debugtrace_send_to_console; - debugtrace_dump(); - printk("debugtrace_printk now writing to %s.\n", - debugtrace_send_to_console ? 
"console" : "buffer"); + debugtrace_toggle(); +} + +static void shadow2_audit_key(unsigned char key) +{ + extern int shadow2_audit_enable; + + shadow2_audit_enable = !shadow2_audit_enable; + printk("%s shadow2_audit_enable=%d\n", + __func__, shadow2_audit_enable); } #endif @@ -288,7 +291,7 @@ void initialize_keytable(void) #ifndef NDEBUG register_keyhandler( - 'o', audit_domains_key, "audit domains >0 EXPERIMENTAL"); + 'O', shadow2_audit_key, "toggle shadow2 audits"); register_keyhandler( 'T', debugtrace_key, "toggle debugtrace to console/buffer"); #endif diff --git a/xen/common/memory.c b/xen/common/memory.c index 0a631ca83e..9962c2e89a 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -126,6 +126,11 @@ populate_physmap( for ( j = 0; j < (1 << extent_order); j++ ) guest_physmap_add_page(d, gpfn + j, mfn + j); } + else if ( unlikely(shadow2_mode_translate(d)) ) + { + for ( j = 0; j < (1 << extent_order); j++ ) + shadow2_guest_physmap_add_page(d, gpfn + j, mfn + j); + } else { for ( j = 0; j < (1 << extent_order); j++ ) @@ -153,7 +158,7 @@ guest_remove_page( if ( unlikely(!mfn_valid(mfn)) ) { DPRINTK("Domain %u page number %lx invalid\n", - d->domain_id, mfn); + d->domain_id, gmfn); return 0; } @@ -179,7 +184,7 @@ guest_remove_page( (unsigned long)page->count_info, page->u.inuse.type_info); } - guest_physmap_remove_page(d, gmfn, mfn); + shadow2_guest_physmap_remove_page(d, gmfn, mfn); put_page(page); @@ -250,7 +255,7 @@ translate_gpfn_list( if ( (d = find_domain_by_id(op.domid)) == NULL ) return -ESRCH; - if ( !shadow_mode_translate(d) ) + if ( !(shadow_mode_translate(d) || shadow2_mode_translate(d)) ) { put_domain(d); return -EINVAL; diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index 8bd1c28915..974f6e3d8e 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -569,7 +569,7 @@ int console_getc(void) #ifndef NDEBUG /* Send output direct to console, or buffer it? */ -int debugtrace_send_to_console; +static volatile int debugtrace_send_to_console; static char *debugtrace_buf; /* Debug-trace buffer */ static unsigned int debugtrace_prd; /* Producer index */ @@ -578,17 +578,11 @@ static unsigned int debugtrace_used; static DEFINE_SPINLOCK(debugtrace_lock); integer_param("debugtrace", debugtrace_kilobytes); -void debugtrace_dump(void) +static void debugtrace_dump_worker(void) { - unsigned long flags; - if ( (debugtrace_bytes == 0) || !debugtrace_used ) return; - watchdog_disable(); - - spin_lock_irqsave(&debugtrace_lock, flags); - printk("debugtrace_dump() starting\n"); /* Print oldest portion of the ring. */ @@ -602,15 +596,47 @@ void debugtrace_dump(void) memset(debugtrace_buf, '\0', debugtrace_bytes); printk("debugtrace_dump() finished\n"); +} + +void debugtrace_toggle(void) +{ + unsigned long flags; + + watchdog_disable(); + spin_lock_irqsave(&debugtrace_lock, flags); + + // dump the buffer *before* toggling, in case the act of dumping the + // buffer itself causes more printk's... + // + printk("debugtrace_printk now writing to %s.\n", + !debugtrace_send_to_console ? 
"console": "buffer"); + if ( !debugtrace_send_to_console ) + debugtrace_dump_worker(); + + debugtrace_send_to_console = !debugtrace_send_to_console; spin_unlock_irqrestore(&debugtrace_lock, flags); + watchdog_enable(); + +} + +void debugtrace_dump(void) +{ + unsigned long flags; + watchdog_disable(); + spin_lock_irqsave(&debugtrace_lock, flags); + + debugtrace_dump_worker(); + + spin_unlock_irqrestore(&debugtrace_lock, flags); watchdog_enable(); } void debugtrace_printk(const char *fmt, ...) { static char buf[1024]; + static u32 count; va_list args; char *p; @@ -625,8 +651,10 @@ void debugtrace_printk(const char *fmt, ...) ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0); + sprintf(buf, "%u ", ++count); + va_start(args, fmt); - (void)vsnprintf(buf, sizeof(buf), fmt, args); + (void)vsnprintf(buf + strlen(buf), sizeof(buf), fmt, args); va_end(args); if ( debugtrace_send_to_console ) diff --git a/xen/include/asm-x86/bitops.h b/xen/include/asm-x86/bitops.h index b2ee953361..b9fd2557d0 100644 --- a/xen/include/asm-x86/bitops.h +++ b/xen/include/asm-x86/bitops.h @@ -75,6 +75,24 @@ static __inline__ void clear_bit(int nr, volatile void * addr) :"=m" (ADDR) :"dIr" (nr)); } + +/** + * __clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * Unlike clear_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static __inline__ void __clear_bit(int nr, volatile void * addr) +{ + __asm__( + "btrl %1,%0" + :"=m" (ADDR) + :"dIr" (nr)); +} + #define smp_mb__before_clear_bit() barrier() #define smp_mb__after_clear_bit() barrier() diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index 99c74cf5ad..74a123de6f 100644 --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -79,9 +79,14 @@ #ifndef __ASSEMBLY__ extern unsigned long _end; /* standard ELF symbol */ -#endif /* __ASSEMBLY__ */ -#define FORCE_CRASH() __asm__ __volatile__ ( "ud2" ) +static inline void FORCE_CRASH(void) __attribute__((noreturn,always_inline)); +static inline void FORCE_CRASH(void) +{ + __asm__ __volatile__ ( "ud2" ); + while(1); +} +#endif /* __ASSEMBLY__ */ #if defined(__x86_64__) @@ -149,9 +154,14 @@ extern unsigned long _end; /* standard ELF symbol */ /* Slot 256: read-only guest-accessible machine-to-phys translation table. */ #define RO_MPT_VIRT_START (PML4_ADDR(256)) #define RO_MPT_VIRT_END (RO_MPT_VIRT_START + PML4_ENTRY_BYTES/2) + +// current unused? +#if 0 /* Slot 257: read-only guest-accessible linear page table. */ #define RO_LINEAR_PT_VIRT_START (PML4_ADDR(257)) #define RO_LINEAR_PT_VIRT_END (RO_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) +#endif + /* Slot 258: linear page table (guest table). 
*/ #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) @@ -175,7 +185,7 @@ extern unsigned long _end; /* standard ELF symbol */ #define DIRECTMAP_VIRT_START (PML4_ADDR(262)) #define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2) -#define PGT_base_page_table PGT_l4_page_table +#define PGT_base_page_table PGT_l4_page_table #define __HYPERVISOR_CS64 0xe010 #define __HYPERVISOR_CS32 0xe008 @@ -274,9 +284,9 @@ extern unsigned long _end; /* standard ELF symbol */ (L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1) #ifdef CONFIG_X86_PAE -# define PGT_base_page_table PGT_l3_page_table +# define PGT_base_page_table PGT_l3_page_table #else -# define PGT_base_page_table PGT_l2_page_table +# define PGT_base_page_table PGT_l2_page_table #endif #define __HYPERVISOR_CS 0xe008 diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index a0efe89f0a..2ef0775795 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -73,42 +73,42 @@ struct arch_domain /* I/O-port admin-specified access capabilities. */ struct rangeset *ioport_caps; - /* Shadow mode status and controls. */ - struct shadow_ops *ops; - unsigned int shadow_mode; /* flags to control shadow table operation */ - unsigned int shadow_nest; /* Recursive depth of shadow_lock() nesting */ - - /* shadow hashtable */ - struct shadow_status *shadow_ht; - struct shadow_status *shadow_ht_free; - struct shadow_status *shadow_ht_extras; /* extra allocation units */ - unsigned int shadow_extras_count; - - /* shadow dirty bitmap */ + /* HVM stuff */ + struct hvm_domain hvm_domain; + + /* Shadow-translated guest: Pseudophys base address of reserved area. */ + unsigned long first_reserved_pfn; + + /* Shadow2 stuff */ + u32 shadow2_mode; /* flags to control shadow operation */ + spinlock_t shadow2_lock; /* shadow2 domain lock */ + int shadow2_locker; /* processor which holds the lock */ + const char *shadow2_locker_function; /* Func that took it */ + struct list_head shadow2_freelists[SHADOW2_MAX_ORDER + 1]; + struct list_head shadow2_p2m_freelist; + struct list_head shadow2_p2m_inuse; + struct list_head shadow2_toplevel_shadows; + unsigned int shadow2_total_pages; /* number of pages allocated */ + unsigned int shadow2_free_pages; /* number of pages on freelists */ + unsigned int shadow2_p2m_pages; /* number of pages in p2m map */ + + /* Shadow2 hashtable */ + struct shadow2_hash_entry *shadow2_hash_table; + struct shadow2_hash_entry *shadow2_hash_freelist; + struct shadow2_hash_entry *shadow2_hash_allocations; + int shadow2_hash_walking; /* Some function is walking the hash table */ + + /* Shadow log-dirty bitmap */ unsigned long *shadow_dirty_bitmap; unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */ - /* shadow mode stats */ - unsigned int shadow_page_count; - unsigned int hl2_page_count; - unsigned int snapshot_page_count; - + /* Shadow log-dirty mode stats */ unsigned int shadow_fault_count; unsigned int shadow_dirty_count; - /* full shadow mode */ - struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */ - struct out_of_sync_entry *out_of_sync_free; - struct out_of_sync_entry *out_of_sync_extras; - unsigned int out_of_sync_extras_count; + /* Shadow translated domain: P2M mapping */ + pagetable_t phys_table; - struct list_head free_shadow_frames; - - pagetable_t phys_table; /* guest 1:1 pagetable */ - struct hvm_domain hvm_domain; - - /* Shadow-translated guest: Pseudophys base address of reserved 
area. */ - unsigned long first_reserved_pfn; } __cacheline_aligned; #ifdef CONFIG_X86_PAE @@ -166,25 +166,34 @@ struct arch_vcpu */ l1_pgentry_t *perdomain_ptes; - pagetable_t guest_table_user; /* x86/64: user-space pagetable. */ - pagetable_t guest_table; /* (MA) guest notion of cr3 */ - pagetable_t shadow_table; /* (MA) shadow of guest */ - pagetable_t monitor_table; /* (MA) used in hypervisor */ - - l2_pgentry_t *guest_vtable; /* virtual address of pagetable */ - l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */ - l2_pgentry_t *monitor_vtable; /* virtual address of monitor_table */ - l1_pgentry_t *hl2_vtable; /* virtual address of hl2_table */ - #ifdef CONFIG_X86_64 - l3_pgentry_t *guest_vl3table; - l4_pgentry_t *guest_vl4table; + pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */ #endif + pagetable_t guest_table; /* (MFN) guest notion of cr3 */ + /* guest_table holds a ref to the page, and also a type-count unless + * shadow refcounts are in use */ + pagetable_t shadow_table; /* (MFN) shadow of guest */ + pagetable_t monitor_table; /* (MFN) hypervisor PT (for HVM) */ + unsigned long cr3; /* (MA) value to install in HW CR3 */ - unsigned long monitor_shadow_ref; + void *guest_vtable; /* virtual address of pagetable */ + void *shadow_vtable; /* virtual address of shadow_table */ + root_pgentry_t *monitor_vtable; /* virtual address of monitor_table */ /* Current LDT details. */ unsigned long shadow_ldt_mapcnt; + + /* Shadow2 stuff */ + /* -- pointers to mode-specific entry points */ + struct shadow2_entry_points *shadow2; + unsigned long last_emulated_mfn; /* last mfn we emulated a write to */ + u8 shadow2_propagate_fault; /* emulated fault needs to be */ + /* propagated to guest */ +#if CONFIG_PAGING_LEVELS >= 3 + u8 shadow2_pae_flip_pending; /* shadow update requires this PAE cpu + * to recopy/install its L3 table. 
+ */ +#endif } __cacheline_aligned; /* shorthands to improve code legibility */ diff --git a/xen/include/asm-x86/grant_table.h b/xen/include/asm-x86/grant_table.h index 5c6600ac7e..277b93ca0c 100644 --- a/xen/include/asm-x86/grant_table.h +++ b/xen/include/asm-x86/grant_table.h @@ -31,7 +31,7 @@ int destroy_grant_host_mapping( #define gnttab_shared_gmfn(d, t, i) \ (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i))) -#define gnttab_log_dirty(d, f) mark_dirty((d), (f)) +#define gnttab_mark_dirty(d, f) mark_dirty((d), (f)) static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr) { diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h index 73f3b31275..cb573e5d9c 100644 --- a/xen/include/asm-x86/hvm/hvm.h +++ b/xen/include/asm-x86/hvm/hvm.h @@ -56,9 +56,16 @@ struct hvm_function_table { */ int (*realmode)(struct vcpu *v); int (*paging_enabled)(struct vcpu *v); + int (*long_mode_enabled)(struct vcpu *v); + int (*guest_x86_mode)(struct vcpu *v); int (*instruction_length)(struct vcpu *v); unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num); + /* + * Re-set the value of CR3 that Xen runs on when handling VM exits + */ + void (*update_host_cr3)(struct vcpu *v); + /* * Update specifics of the guest state: * 1) TS bit in guest cr0 @@ -133,12 +140,30 @@ hvm_paging_enabled(struct vcpu *v) return hvm_funcs.paging_enabled(v); } +static inline int +hvm_long_mode_enabled(struct vcpu *v) +{ + return hvm_funcs.long_mode_enabled(v); +} + +static inline int +hvm_guest_x86_mode(struct vcpu *v) +{ + return hvm_funcs.guest_x86_mode(v); +} + static inline int hvm_instruction_length(struct vcpu *v) { return hvm_funcs.instruction_length(v); } +static inline void +hvm_update_host_cr3(struct vcpu *v) +{ + hvm_funcs.update_host_cr3(v); +} + void hvm_hypercall_page_initialise(struct domain *d, void *hypercall_page); diff --git a/xen/include/asm-x86/hvm/support.h b/xen/include/asm-x86/hvm/support.h index 35a0bfe464..6ccfdee678 100644 --- a/xen/include/asm-x86/hvm/support.h +++ b/xen/include/asm-x86/hvm/support.h @@ -116,10 +116,13 @@ enum hval_bitmaps { #define DBG_LEVEL_IOAPIC (1 << 9) extern unsigned int opt_hvm_debug_level; -#define HVM_DBG_LOG(level, _f, _a...) \ - if ( (level) & opt_hvm_debug_level ) \ - printk("[HVM:%d.%d] <%s> " _f "\n", \ - current->domain->domain_id, current->vcpu_id, __func__, ## _a) +#define HVM_DBG_LOG(level, _f, _a...) \ + do { \ + if ( (level) & opt_hvm_debug_level ) \ + printk("[HVM:%d.%d] <%s> " _f "\n", \ + current->domain->domain_id, current->vcpu_id, __func__, \ + ## _a); \ + } while (0) #else #define HVM_DBG_LOG(level, _f, _a...) 
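Illustrative aside: the do { ... } while (0) wrapper that the hunk above adds to HVM_DBG_LOG is the standard way to make a multi-statement macro expand to a single C statement. A minimal sketch of the pitfall it avoids, using the DBG_LEVEL_IOAPIC level defined earlier and a hypothetical rc variable:

    /* With the old bare-if() definition, the else below would have bound to
     * the if() hidden inside the macro, silently changing the caller's
     * control flow; with do { ... } while (0) it binds to the caller's if. */
    if ( rc != 0 )
        HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "unexpected rc %d", rc);
    else
        rc = 1;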
#endif diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h index f89b6ad787..b607a4578b 100644 --- a/xen/include/asm-x86/hvm/vcpu.h +++ b/xen/include/asm-x86/hvm/vcpu.h @@ -29,6 +29,7 @@ #define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1 struct hvm_vcpu { + unsigned long hw_cr3; /* value we give to HW to use */ unsigned long ioflags; struct hvm_io_op io_op; struct vlapic *vlapic; @@ -40,6 +41,11 @@ struct hvm_vcpu { int xen_port; +#if CONFIG_PAGING_LEVELS >= 3 + l3_pgentry_t hvm_lowmem_l3tab[4] + __attribute__((__aligned__(32))); +#endif + /* Flags */ int flag_dr_dirty; diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h index 85ee7046fd..524411be34 100644 --- a/xen/include/asm-x86/hvm/vmx/vmcs.h +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h @@ -87,6 +87,7 @@ struct arch_vmx_struct { unsigned long cpu_cr0; /* copy of guest CR0 */ unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */ + unsigned long cpu_shadow_cr4; /* copy of guest read shadow CR4 */ unsigned long cpu_cr2; /* save CR2 */ unsigned long cpu_cr3; unsigned long cpu_state; diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h index 38ae0e3b0f..38e447259c 100644 --- a/xen/include/asm-x86/hvm/vmx/vmx.h +++ b/xen/include/asm-x86/hvm/vmx/vmx.h @@ -298,6 +298,9 @@ static always_inline void __vmwrite_vcpu( case GUEST_CR0: v->arch.hvm_vmx.cpu_cr0 = value; break; + case CR4_READ_SHADOW: + v->arch.hvm_vmx.cpu_shadow_cr4 = value; + break; case CPU_BASED_VM_EXEC_CONTROL: v->arch.hvm_vmx.cpu_based_exec_control = value; break; @@ -317,11 +320,14 @@ static always_inline void __vmread_vcpu( case GUEST_CR0: *value = v->arch.hvm_vmx.cpu_cr0; break; + case CR4_READ_SHADOW: + *value = v->arch.hvm_vmx.cpu_shadow_cr4; + break; case CPU_BASED_VM_EXEC_CONTROL: *value = v->arch.hvm_vmx.cpu_based_exec_control; break; default: - printk("__vmread_cpu: invalid field %lx\n", field); + printk("__vmread_vcpu: invalid field %lx\n", field); break; } } @@ -342,6 +348,7 @@ static inline int __vmwrite(unsigned long field, unsigned long value) switch ( field ) { case CR0_READ_SHADOW: case GUEST_CR0: + case CR4_READ_SHADOW: case CPU_BASED_VM_EXEC_CONTROL: __vmwrite_vcpu(v, field, value); break; @@ -404,6 +411,46 @@ static inline int vmx_paging_enabled(struct vcpu *v) return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG); } +/* Works only for vcpu == current */ +static inline int vmx_long_mode_enabled(struct vcpu *v) +{ + ASSERT(v == current); + return VMX_LONG_GUEST(current); +} + +/* Works only for vcpu == current */ +static inline int vmx_realmode(struct vcpu *v) +{ + unsigned long rflags; + ASSERT(v == current); + + __vmread(GUEST_RFLAGS, &rflags); + return rflags & X86_EFLAGS_VM; +} + +/* Works only for vcpu == current */ +static inline void vmx_update_host_cr3(struct vcpu *v) +{ + ASSERT(v == current); + __vmwrite(HOST_CR3, v->arch.cr3); +} + +static inline int vmx_guest_x86_mode(struct vcpu *v) +{ + unsigned long cs_ar_bytes; + ASSERT(v == current); + + if ( vmx_long_mode_enabled(v) ) + { + __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes); + return (cs_ar_bytes & (1u<<13)) ? 8 : 4; + } + if ( vmx_realmode(v) ) + return 2; + __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes); + return (cs_ar_bytes & (1u<<14)) ? 
4 : 2; +} + static inline int vmx_pgbit_test(struct vcpu *v) { unsigned long cr0; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 06ea598754..0b19fbe7ec 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -20,7 +20,11 @@ struct page_info { /* Each frame can be threaded onto a doubly-linked list. */ - struct list_head list; + union { + struct list_head list; + /* Shadow2 uses this field as an up-pointer in lower-level shadows */ + paddr_t up; + }; /* Reference count and various PGC_xxx flags and fields. */ u32 count_info; @@ -46,8 +50,20 @@ struct page_info } u; - /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ - u32 tlbflush_timestamp; + union { + /* Timestamp from 'TLB clock', used to reduce need for safety + * flushes. Only valid on a) free pages, and b) guest pages with a + * zero type count. */ + u32 tlbflush_timestamp; + + /* Only used on guest pages with a shadow. + * Guest pages with a shadow must have a non-zero type count, so this + * does not conflict with the tlbflush timestamp. */ + u32 shadow2_flags; + + // XXX -- we expect to add another field here, to be used for min/max + // purposes, which is only used for shadow pages. + }; }; /* The following page types are MUTUALLY EXCLUSIVE. */ @@ -60,6 +76,7 @@ struct page_info #define PGT_ldt_page (6U<<29) /* using this page in an LDT? */ #define PGT_writable_page (7U<<29) /* has writable mappings of this page? */ +#ifndef SHADOW2 #define PGT_l1_shadow PGT_l1_page_table #define PGT_l2_shadow PGT_l2_page_table #define PGT_l3_shadow PGT_l3_page_table @@ -69,14 +86,16 @@ struct page_info #define PGT_writable_pred (7U<<29) /* predicted gpfn with writable ref */ #define PGT_fl1_shadow (5U<<29) +#endif + #define PGT_type_mask (7U<<29) /* Bits 29-31. */ - /* Has this page been validated for use as its current type? */ -#define _PGT_validated 28 -#define PGT_validated (1U<<_PGT_validated) /* Owning guest has pinned this page to its current type? */ -#define _PGT_pinned 27 +#define _PGT_pinned 28 #define PGT_pinned (1U<<_PGT_pinned) + /* Has this page been validated for use as its current type? */ +#define _PGT_validated 27 +#define PGT_validated (1U<<_PGT_validated) #if defined(__i386__) /* The 11 most significant bits of virt address if this is a page table. */ #define PGT_va_shift 16 @@ -98,6 +117,7 @@ struct page_info /* 16-bit count of uses of this frame as its current type. */ #define PGT_count_mask ((1U<<16)-1) +#ifndef SHADOW2 #ifdef __x86_64__ #define PGT_high_mfn_shift 52 #define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift) @@ -112,19 +132,53 @@ struct page_info #define PGT_score_shift 23 #define PGT_score_mask (((1U<<4)-1)<u.inuse._domain)) #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d)) @@ -165,7 +227,7 @@ extern void invalidate_shadow_ldt(struct vcpu *d); extern int shadow_remove_all_write_access( struct domain *d, unsigned long gmfn, unsigned long mfn); extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn); -extern int _shadow_mode_refcounts(struct domain *d); +extern int _shadow2_mode_refcounts(struct domain *d); static inline void put_page(struct page_info *page) { @@ -197,8 +259,8 @@ static inline int get_page(struct page_info *page, unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ unlikely(d != _domain) ) /* Wrong owner? 
*/ { - if ( !_shadow_mode_refcounts(domain) ) - DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" + if ( !_shadow2_mode_refcounts(domain) ) + DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" PRtype_info "\n", page_to_mfn(page), domain, unpickle_domptr(d), x, page->u.inuse.type_info); @@ -254,6 +316,16 @@ static inline int page_is_removable(struct page_info *page) ASSERT(((_p)->count_info & PGC_count_mask) != 0); \ ASSERT(page_get_owner(_p) == (_d)) +// Quick test for whether a given page can be represented directly in CR3. +// +#if CONFIG_PAGING_LEVELS == 3 +#define MFN_FITS_IN_CR3(_MFN) !(mfn_x(_MFN) >> 20) + +/* returns a lowmem machine address of the copied L3 root table */ +unsigned long +pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab); +#endif /* CONFIG_PAGING_LEVELS == 3 */ + int check_descriptor(struct desc_struct *d); /* @@ -271,29 +343,44 @@ int check_descriptor(struct desc_struct *d); #define set_gpfn_from_mfn(mfn, pfn) (machine_to_phys_mapping[(mfn)] = (pfn)) #define get_gpfn_from_mfn(mfn) (machine_to_phys_mapping[(mfn)]) + +#define mfn_to_gmfn(_d, mfn) \ + ( (shadow2_mode_translate(_d)) \ + ? get_gpfn_from_mfn(mfn) \ + : (mfn) ) + +#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn)) + + /* * The phys_to_machine_mapping is the reversed mapping of MPT for full * virtualization. It is only used by shadow_mode_translate()==true * guests, so we steal the address space that would have normally * been used by the read-only MPT map. */ -#define phys_to_machine_mapping ((unsigned long *)RO_MPT_VIRT_START) -#define NR_P2M_TABLE_ENTRIES ((unsigned long *)RO_MPT_VIRT_END \ - - phys_to_machine_mapping) +#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START) #define INVALID_MFN (~0UL) #define VALID_MFN(_mfn) (!((_mfn) & (1U<<31))) -#define set_mfn_from_gpfn(pfn, mfn) (phys_to_machine_mapping[(pfn)] = (mfn)) static inline unsigned long get_mfn_from_gpfn(unsigned long pfn) { - unsigned long mfn; + l1_pgentry_t l1e = l1e_empty(); + int ret; + +#if CONFIG_PAGING_LEVELS > 2 + if ( pfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof (l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return INVALID_MFN; +#endif + + ret = __copy_from_user(&l1e, + &phys_to_machine_mapping[pfn], + sizeof(l1e)); - if ( unlikely(pfn >= NR_P2M_TABLE_ENTRIES) || - unlikely(__copy_from_user(&mfn, &phys_to_machine_mapping[pfn], - sizeof(mfn))) ) - mfn = INVALID_MFN; + if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) ) + return l1e_get_pfn(l1e); - return mfn; + return INVALID_MFN; } #ifdef MEMORY_GUARD @@ -333,6 +420,7 @@ void audit_domains(void); #endif int new_guest_cr3(unsigned long pfn); +void make_cr3(struct vcpu *v, unsigned long mfn); void propagate_page_fault(unsigned long addr, u16 error_code); diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h index f1c08cf500..07c09b2ae2 100644 --- a/xen/include/asm-x86/msr.h +++ b/xen/include/asm-x86/msr.h @@ -112,6 +112,10 @@ static inline void wrmsrl(unsigned int msr, __u64 val) #define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483 #define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484 #define MSR_IA32_VMX_MISC_MSR 0x485 +#define MSR_IA32_VMX_CR0_FIXED0 0x486 +#define MSR_IA32_VMX_CR0_FIXED1 0x487 +#define MSR_IA32_VMX_CR4_FIXED0 0x488 +#define MSR_IA32_VMX_CR4_FIXED1 0x489 #define IA32_FEATURE_CONTROL_MSR 0x3a #define IA32_FEATURE_CONTROL_MSR_LOCK 0x1 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON 0x4 diff --git a/xen/include/asm-x86/page-guest32.h b/xen/include/asm-x86/page-guest32.h index cf5595b078..e93206169a 100644 
--- a/xen/include/asm-x86/page-guest32.h +++ b/xen/include/asm-x86/page-guest32.h @@ -89,15 +89,8 @@ static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags) #define linear_l1_table_32 \ ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table_32 \ - ((l2_pgentry_32_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)))) #define linear_pg_table_32 linear_l1_table_32 -#define linear_l2_table_32(_ed) ((_ed)->arch.guest_vtable) - -#define va_to_l1mfn_32(_ed, _va) \ - (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT])) #endif /* __X86_PAGE_GUEST_H__ */ diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h index 6432402066..94158c7f3d 100644 --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -233,26 +233,18 @@ typedef struct { u64 pfn; } pagetable_t; + DOMAIN_ENTRIES_PER_L4_PAGETABLE) #endif -#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK) -#define linear_l1_table \ - ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table \ - ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)))) -#define __linear_l3_table \ - ((l3_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)))) -#define __linear_l4_table \ - ((l4_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2)))) - +/* Where to find each level of the linear mapping */ +#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) +#define __linear_l2_table \ + ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l3_table \ + ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l4_table \ + ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) + +#define linear_l1_table __linear_l1_table #define linear_pg_table linear_l1_table -#define linear_l2_table(v) ((v)->arch.guest_vtable) -#define linear_l3_table(v) ((v)->arch.guest_vl3table) -#define linear_l4_table(v) ((v)->arch.guest_vl4table) +#define linear_l2_table(v) ((l2_pgentry_t *)(v)->arch.guest_vtable) #ifndef __ASSEMBLY__ #if CONFIG_PAGING_LEVELS == 3 @@ -294,6 +286,7 @@ extern void paging_init(void); #define _PAGE_AVAIL1 0x400U #define _PAGE_AVAIL2 0x800U #define _PAGE_AVAIL 0xE00U +#define _PAGE_PSE_PAT 0x1000U /* * Debug option: Ensure that granted mappings are not implicitly unmapped. @@ -307,9 +300,9 @@ extern void paging_init(void); #endif /* - * Disallow unused flag bits plus PAT, PSE and GLOBAL. Also disallow GNTTAB - * if we are using it for grant-table debugging. Permit the NX bit if the - * hardware supports it. + * Disallow unused flag bits plus PAT, PSE and GLOBAL. + * Also disallow GNTTAB if we are using it for grant-table debugging. + * Permit the NX bit if the hardware supports it. 
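Illustrative aside: the rewritten __linear_lN_table macros above all follow one pattern, because indexing the linear page-table area with its own virtual address climbs one level of the paging hierarchy (this relies on the LINEAR_PT slot mapping the current page tables, which is its purpose). A rough sketch of the arithmetic and the resulting uniform lookups, with va a hypothetical virtual address whose intermediate entries are present:

    /* Each level's tables appear inside the previous level's linear area,
     * at the slot occupied by the linear area's own base address:
     *   __linear_l2_table == __linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)
     *   __linear_l3_table == __linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)
     * so the entry mapping va at each level is fetched the same way: */
    l1_pgentry_t l1e = __linear_l1_table[l1_linear_offset(va)];
    l2_pgentry_t l2e = __linear_l2_table[l2_linear_offset(va)];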
*/ #define BASE_DISALLOW_MASK ((0xFFFFF180U | _PAGE_GNTTAB) & ~_PAGE_NX) diff --git a/xen/include/asm-x86/perfc_defn.h b/xen/include/asm-x86/perfc_defn.h index 54bc01ea7c..d6e24b207d 100644 --- a/xen/include/asm-x86/perfc_defn.h +++ b/xen/include/asm-x86/perfc_defn.h @@ -144,4 +144,57 @@ PERFCOUNTER_CPU(remove_write_predicted, "remove_write predict hit&exit") PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction") PERFCOUNTER_CPU(update_hl2e_invlpg, "update_hl2e calls invlpg") +/* Shadow2 counters */ +PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc") +PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs") +PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use") +PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free") +PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows") +PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows") +PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map") +PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update") +PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update") +PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault") +PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn") +PERFCOUNTER_CPU(shadow2_fault_bail_not_present, + "shadow2_fault guest not-present") +PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault") +PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault") +PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor, + "shadow2_fault guest U/S fault") +PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read") +PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write") +PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails") +PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio") +PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault") +PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate") +PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e") +PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e") +PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e") +PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e") +PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup") +PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head") +PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses") +PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status") +PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert") +PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete") +PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access") +PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3") +PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3") +PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3") +PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low") +PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force") +PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings") +PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force") +PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit") +PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit") +PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 
unshadows a page") +PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer") +PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force") +PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed") +PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables") +PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits") +PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses") + + /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index d460544d3e..81c8757f8e 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -545,6 +545,7 @@ extern always_inline void prefetchw(const void *x) #endif void show_stack(struct cpu_user_regs *regs); +void show_xen_trace(void); void show_stack_overflow(unsigned long esp); void show_registers(struct cpu_user_regs *regs); void show_execution_state(struct cpu_user_regs *regs); diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index 7144b24d8b..efade3021c 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -1,8 +1,7 @@ /****************************************************************************** * include/asm-x86/shadow.h * - * Copyright (c) 2005 Michael A Fetterman - * Based on an earlier implementation by Ian Pratt et al + * Copyright (c) 2006 by XenSource Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,1782 +21,28 @@ #ifndef _XEN_SHADOW_H #define _XEN_SHADOW_H -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +/* This file is just a wrapper around the new Shadow2 header, + * providing names that must be defined in any shadow implementation. */ -/* Shadow PT operation mode : shadow-mode variable in arch_domain. */ +#include -#define SHM_enable (1<<0) /* we're in one of the shadow modes */ -#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of - guest tables */ -#define SHM_write_all (1<<2) /* allow write access to all guest pt pages, - regardless of pte write permissions */ -#define SHM_log_dirty (1<<3) /* enable log dirty mode */ -#define SHM_translate (1<<4) /* Xen does p2m translation, not guest */ -#define SHM_external (1<<5) /* Xen does not steal address space from the - domain for its own booking; requires VT or - similar mechanisms */ -#define SHM_wr_pt_pte (1<<6) /* guest allowed to set PAGE_RW bit in PTEs which - point to page table pages. 
*/ +/* How to make sure a page is not referred to in a shadow PT */ +/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */ +#define shadow_drop_references(_d, _p) \ + shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) +#define shadow_sync_and_drop_references(_d, _p) \ + shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) -#define shadow_mode_enabled(_d) ((_d)->arch.shadow_mode) -#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts) -#define shadow_mode_write_l1(_d) (VM_ASSIST(_d, VMASST_TYPE_writable_pagetables)) -#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all) -#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty) -#define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate) -#define shadow_mode_external(_d) ((_d)->arch.shadow_mode & SHM_external) -#define shadow_mode_wr_pt_pte(_d) ((_d)->arch.shadow_mode & SHM_wr_pt_pte) +/* Whether we are translating the domain's frame numbers for it */ +#define shadow_mode_translate(d) shadow2_mode_translate(d) -#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) -#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \ - (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))) -#define shadow_linear_l2_table(_v) ((_v)->arch.shadow_vtable) +/* ...and if so, how to add and remove entries in the mapping */ +#define guest_physmap_add_page(_d, _p, _m) \ + shadow2_guest_physmap_add_page((_d), (_p), (_m)) +#define guest_physmap_remove_page(_d, _p, _m ) \ + shadow2_guest_physmap_remove_page((_d), (_p), (_m)) -// easy access to the hl2 table (for translated but not external modes only) -#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))) - -/* - * For now we use the per-domain BIGLOCK rather than a shadow-specific lock. - * We usually have the BIGLOCK already acquired anyway, so this is unlikely - * to cause much unnecessary extra serialisation. Also it's a recursive - * lock, and there are some code paths containing nested shadow_lock(). - * The #if0'ed code below is therefore broken until such nesting is removed. 
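The new shadow.h above is now only a thin compatibility wrapper: legacy callers keep using names such as shadow_drop_references() and guest_physmap_add_page(), which forward to shadow2 entry points that take a vcpu and a typed mfn. A stand-alone sketch of that forwarding, with stand-in struct definitions (not the Xen ones):

    #include <stdio.h>

    typedef struct { unsigned long m; } mfn_t;       /* typed mfn, as in shadow2 */
    #define _mfn(x) ((mfn_t){ (x) })

    struct page_info { unsigned long idx; };
    struct vcpu      { int id; };
    struct domain    { struct vcpu *vcpu[1]; };

    #define page_to_mfn(pg) ((pg)->idx)

    static void shadow2_remove_all_mappings(struct vcpu *v, mfn_t mfn)
    {
        printf("vcpu %d: remove all shadow mappings of mfn %#lx\n", v->id, mfn.m);
    }

    /* Compatibility macro, shaped like the one added to shadow.h above. */
    #define shadow_drop_references(_d, _p) \
        shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))

    int main(void)
    {
        struct vcpu v0 = { .id = 0 };
        struct domain d = { .vcpu = { &v0 } };
        struct page_info pg = { .idx = 0x1234 };

        shadow_drop_references(&d, &pg);   /* old call site, unchanged */
        return 0;
    }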
- */ -#if 0 -#define shadow_lock_init(_d) \ - spin_lock_init(&(_d)->arch.shadow_lock) -#define shadow_lock_is_acquired(_d) \ - spin_is_locked(&(_d)->arch.shadow_lock) -#define shadow_lock(_d) \ -do { \ - ASSERT(!shadow_lock_is_acquired(_d)); \ - spin_lock(&(_d)->arch.shadow_lock); \ -} while (0) -#define shadow_unlock(_d) \ -do { \ - ASSERT(!shadow_lock_is_acquired(_d)); \ - spin_unlock(&(_d)->arch.shadow_lock); \ -} while (0) -#else -#define shadow_lock_init(_d) \ - ((_d)->arch.shadow_nest = 0) -#define shadow_lock_is_acquired(_d) \ - (spin_is_locked(&(_d)->big_lock) && ((_d)->arch.shadow_nest != 0)) -#define shadow_lock(_d) \ -do { \ - LOCK_BIGLOCK(_d); \ - (_d)->arch.shadow_nest++; \ -} while (0) -#define shadow_unlock(_d) \ -do { \ - ASSERT(shadow_lock_is_acquired(_d)); \ - (_d)->arch.shadow_nest--; \ - UNLOCK_BIGLOCK(_d); \ -} while (0) -#endif - -#if CONFIG_PAGING_LEVELS >= 3 -static inline u64 get_cr3_idxval(struct vcpu *v) -{ - u64 pae_cr3; - - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 && - !shadow_mode_log_dirty(v->domain) ) - { - pae_cr3 = hvm_get_guest_ctrl_reg(v, 3); /* get CR3 */ - return (pae_cr3 >> PAE_CR3_ALIGN) & PAE_CR3_IDX_MASK; - } - else - return 0; -} - -#define shadow_key_t u64 -#define index_to_key(x) ((x) << 32) -#else -#define get_cr3_idxval(v) (0) -#define shadow_key_t unsigned long -#define index_to_key(x) (0) -#endif - - -#define SHADOW_ENCODE_MIN_MAX(_min, _max) ((((GUEST_L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min)) -#define SHADOW_MIN(_encoded) ((_encoded) & ((1u<<16) - 1)) -#define SHADOW_MAX(_encoded) ((GUEST_L1_PAGETABLE_ENTRIES - 1) - ((_encoded) >> 16)) -extern void shadow_direct_map_clean(struct domain *d); -extern int shadow_direct_map_init(struct domain *d); -extern int shadow_direct_map_fault( - unsigned long vpa, struct cpu_user_regs *regs); -extern void shadow_mode_init(void); -extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc); -extern int shadow_fault(unsigned long va, struct cpu_user_regs *regs); -extern int shadow_mode_enable(struct domain *p, unsigned int mode); -extern void shadow_invlpg(struct vcpu *, unsigned long); -extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync( - struct vcpu *v, unsigned long gpfn, unsigned long mfn); -extern void free_monitor_pagetable(struct vcpu *v); -extern void __shadow_sync_all(struct domain *d); -extern int __shadow_out_of_sync(struct vcpu *v, unsigned long va); -extern int set_p2m_entry( - struct domain *d, unsigned long pfn, unsigned long mfn, - struct domain_mmap_cache *l2cache, - struct domain_mmap_cache *l1cache); -extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype); - -extern void free_shadow_page(unsigned long smfn); - -extern void shadow_l1_normal_pt_update(struct domain *d, - paddr_t pa, l1_pgentry_t l1e, - struct domain_mmap_cache *cache); -extern void shadow_l2_normal_pt_update(struct domain *d, - paddr_t pa, l2_pgentry_t l2e, - struct domain_mmap_cache *cache); -#if CONFIG_PAGING_LEVELS >= 3 -#include -/* - * va_mask cannot be used because it's used by the shadow hash. - * Use the score area for for now. 
- */ -#define is_xen_l2_slot(t,s) \ - ( ((((t) & PGT_score_mask) >> PGT_score_shift) == 3) && \ - ((s) >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES - 1))) ) - -extern unsigned long gva_to_gpa(unsigned long gva); -extern void shadow_l3_normal_pt_update(struct domain *d, - paddr_t pa, l3_pgentry_t l3e, - struct domain_mmap_cache *cache); -#endif -#if CONFIG_PAGING_LEVELS >= 4 -extern void shadow_l4_normal_pt_update(struct domain *d, - paddr_t pa, l4_pgentry_t l4e, - struct domain_mmap_cache *cache); -#endif -extern int shadow_do_update_va_mapping(unsigned long va, - l1_pgentry_t val, - struct vcpu *v); - - -static inline unsigned long __shadow_status( - struct domain *d, unsigned long gpfn, unsigned long stype); - -#if CONFIG_PAGING_LEVELS <= 2 -static inline void update_hl2e(struct vcpu *v, unsigned long va); -#endif - -static inline int page_is_page_table(struct page_info *page) -{ - struct domain *owner = page_get_owner(page); - u32 type_info; - - if ( owner && shadow_mode_refcounts(owner) ) - return page->count_info & PGC_page_table; - - type_info = page->u.inuse.type_info & PGT_type_mask; - return type_info && (type_info <= PGT_l4_page_table); -} - -static inline int mfn_is_page_table(unsigned long mfn) -{ - if ( !mfn_valid(mfn) ) - return 0; - - return page_is_page_table(mfn_to_page(mfn)); -} - -static inline int page_out_of_sync(struct page_info *page) -{ - return page->count_info & PGC_out_of_sync; -} - -static inline int mfn_out_of_sync(unsigned long mfn) -{ - if ( !mfn_valid(mfn) ) - return 0; - - return page_out_of_sync(mfn_to_page(mfn)); -} - - -/************************************************************************/ - -static void inline -__shadow_sync_mfn(struct domain *d, unsigned long mfn) -{ - if ( d->arch.out_of_sync ) - { - // XXX - could be smarter - // - __shadow_sync_all(d); - } -} - -static void inline -__shadow_sync_va(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - - if ( d->arch.out_of_sync && __shadow_out_of_sync(v, va) ) - { - perfc_incrc(shadow_sync_va); - - // XXX - could be smarter - // - __shadow_sync_all(v->domain); - } -#if CONFIG_PAGING_LEVELS <= 2 - // Also make sure the HL2 is up-to-date for this address. - // - if ( unlikely(shadow_mode_translate(v->domain)) ) - update_hl2e(v, va); -#endif -} - -static void inline -shadow_sync_all(struct domain *d) -{ - if ( unlikely(shadow_mode_enabled(d)) ) - { - shadow_lock(d); - - if ( d->arch.out_of_sync ) - __shadow_sync_all(d); - - ASSERT(d->arch.out_of_sync == NULL); - - shadow_unlock(d); - } -} - -// SMP BUG: This routine can't ever be used properly in an SMP context. -// It should be something like get_shadow_and_sync_va(). -// This probably shouldn't exist. -// -static void inline -shadow_sync_va(struct vcpu *v, unsigned long gva) -{ - struct domain *d = v->domain; - if ( unlikely(shadow_mode_enabled(d)) ) - { - shadow_lock(d); - __shadow_sync_va(v, gva); - shadow_unlock(d); - } -} - -extern void __shadow_mode_disable(struct domain *d); -static inline void shadow_mode_disable(struct domain *d) -{ - if ( unlikely(shadow_mode_enabled(d)) ) - { - shadow_lock(d); - __shadow_mode_disable(d); - shadow_unlock(d); - } -} - -/************************************************************************/ - -#define mfn_to_gmfn(_d, mfn) \ - ( (shadow_mode_translate(_d)) \ - ? get_gpfn_from_mfn(mfn) \ - : (mfn) ) - -#define gmfn_to_mfn(_d, gpfn) \ - ({ \ - unlikely(shadow_mode_translate(_d)) \ - ? (likely(current->domain == (_d)) \ - ? 
get_mfn_from_gpfn(gpfn) \ - : get_mfn_from_gpfn_foreign(_d, gpfn)) \ - : (gpfn); \ - }) - -extern unsigned long get_mfn_from_gpfn_foreign( - struct domain *d, unsigned long gpfn); - -/************************************************************************/ - -struct shadow_status { - struct shadow_status *next; /* Pull-to-front list per hash bucket. */ - shadow_key_t gpfn_and_flags; /* Guest pfn plus flags. */ - unsigned long smfn; /* Shadow mfn. */ -}; - -#define shadow_ht_extra_size 128 -#define shadow_ht_buckets 256 - -struct out_of_sync_entry { - struct out_of_sync_entry *next; - struct vcpu *v; - unsigned long gpfn; /* why is this here? */ - unsigned long gmfn; - unsigned long snapshot_mfn; - paddr_t writable_pl1e; /* NB: this is a machine address */ - unsigned long va; -}; - -#define out_of_sync_extra_size 127 - -#define SHADOW_SNAPSHOT_ELSEWHERE (-1L) - -/************************************************************************/ -#define SHADOW_DEBUG 0 -#define SHADOW_VERBOSE_DEBUG 0 -#define SHADOW_VVERBOSE_DEBUG 0 -#define SHADOW_VVVERBOSE_DEBUG 0 -#define SHADOW_HASH_DEBUG 0 -#define FULLSHADOW_DEBUG 0 - -#if SHADOW_DEBUG -extern int shadow_status_noswap; -#define SHADOW_REFLECTS_SNAPSHOT _PAGE_AVAIL0 -#endif - -#if SHADOW_VERBOSE_DEBUG -#define SH_LOG(_f, _a...) \ - printk("DOM%uP%u: SH_LOG(%d): " _f "\n", \ - current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a ) -#define SH_VLOG(_f, _a...) \ - printk("DOM%uP%u: SH_VLOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define SH_LOG(_f, _a...) ((void)0) -#define SH_VLOG(_f, _a...) ((void)0) -#endif - -#if SHADOW_VVERBOSE_DEBUG -#define SH_VVLOG(_f, _a...) \ - printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define SH_VVLOG(_f, _a...) ((void)0) -#endif - -#if SHADOW_VVVERBOSE_DEBUG -#define SH_VVVLOG(_f, _a...) \ - printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define SH_VVVLOG(_f, _a...) ((void)0) -#endif - -#if FULLSHADOW_DEBUG -#define FSH_LOG(_f, _a...) \ - printk("DOM%uP%u: FSH_LOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define FSH_LOG(_f, _a...) ((void)0) -#endif - - -/************************************************************************/ - -static inline int -shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d) -{ - l1_pgentry_t nl1e; - int res; - unsigned long mfn; - struct domain *owner; - - ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT); - - if ( !shadow_mode_refcounts(d) ) - return 1; - - nl1e = l1e; - l1e_remove_flags(nl1e, _PAGE_GLOBAL); - - if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) ) - return 0; - - res = get_page_from_l1e(nl1e, d); - - if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) && - !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) && - (mfn = l1e_get_pfn(nl1e)) && - mfn_valid(mfn) && - (owner = page_get_owner(mfn_to_page(mfn))) && - (d != owner) ) - { - res = get_page_from_l1e(nl1e, owner); - printk("tried to map mfn %lx from domain %d into shadow page tables " - "of domain %d; %s\n", - mfn, owner->domain_id, d->domain_id, - res ? 
"success" : "failed"); - } - - if ( unlikely(!res) ) - { - perfc_incrc(shadow_get_page_fail); - FSH_LOG("%s failed to get ref l1e=%" PRIpte "\n", - __func__, l1e_get_intpte(l1e)); - } - - return res; -} - -static inline void -shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) -{ - if ( !shadow_mode_refcounts(d) ) - return; - - put_page_from_l1e(l1e, d); -} - -static inline void -shadow_put_page_type(struct domain *d, struct page_info *page) -{ - if ( !shadow_mode_refcounts(d) ) - return; - - put_page_type(page); -} - -static inline int shadow_get_page(struct domain *d, - struct page_info *page, - struct domain *owner) -{ - if ( !shadow_mode_refcounts(d) ) - return 1; - return get_page(page, owner); -} - -static inline void shadow_put_page(struct domain *d, - struct page_info *page) -{ - if ( !shadow_mode_refcounts(d) ) - return; - put_page(page); -} - -/************************************************************************/ - -static inline void __mark_dirty(struct domain *d, unsigned long mfn) -{ - unsigned long pfn; - - ASSERT(shadow_lock_is_acquired(d)); - - if ( likely(!shadow_mode_log_dirty(d)) || !VALID_MFN(mfn) ) - return; - - ASSERT(d->arch.shadow_dirty_bitmap != NULL); - - /* We /really/ mean PFN here, even for non-translated guests. */ - pfn = get_gpfn_from_mfn(mfn); - - /* - * Values with the MSB set denote MFNs that aren't really part of the - * domain's pseudo-physical memory map (e.g., the shared info frame). - * Nothing to do here... - */ - if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) ) - return; - - /* N.B. Can use non-atomic TAS because protected by shadow_lock. */ - if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) && - !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) ) - { - d->arch.shadow_dirty_count++; - } -#ifndef NDEBUG - else if ( mfn_valid(mfn) ) - { - SH_VLOG("mark_dirty OOR! 
mfn=%lx pfn=%lx max=%x (dom %p)", - mfn, pfn, d->arch.shadow_dirty_bitmap_size, d); - SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info, - page_get_owner(mfn_to_page(mfn)), - mfn_to_page(mfn)->count_info, - mfn_to_page(mfn)->u.inuse.type_info ); - } -#endif -} - - -static inline void mark_dirty(struct domain *d, unsigned int mfn) -{ - if ( unlikely(shadow_mode_log_dirty(d)) ) - { - shadow_lock(d); - __mark_dirty(d, mfn); - shadow_unlock(d); - } -} - - -/************************************************************************/ -#if CONFIG_PAGING_LEVELS <= 2 -static inline void -__shadow_get_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e) -{ - ASSERT(shadow_mode_enabled(v->domain)); - - *psl2e = v->arch.shadow_vtable[l2_table_offset(va)]; -} - -static inline void -__shadow_set_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t value) -{ - ASSERT(shadow_mode_enabled(v->domain)); - - v->arch.shadow_vtable[l2_table_offset(va)] = value; -} - -static inline void -__guest_get_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t *pl2e) -{ - *pl2e = v->arch.guest_vtable[l2_table_offset(va)]; -} - -static inline void -__guest_set_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t value) -{ - struct domain *d = v->domain; - - v->arch.guest_vtable[l2_table_offset(va)] = value; - - if ( unlikely(shadow_mode_translate(d)) ) - update_hl2e(v, va); - - __mark_dirty(d, pagetable_get_pfn(v->arch.guest_table)); -} - -static inline void -__direct_get_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e) -{ - l2_pgentry_t *phys_vtable; - - ASSERT(shadow_mode_enabled(v->domain)); - - phys_vtable = map_domain_page( - pagetable_get_pfn(v->domain->arch.phys_table)); - - *psl2e = phys_vtable[l2_table_offset(va)]; - - unmap_domain_page(phys_vtable); -} - -static inline void -__direct_set_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t value) -{ - l2_pgentry_t *phys_vtable; - - ASSERT(shadow_mode_enabled(v->domain)); - - phys_vtable = map_domain_page( - pagetable_get_pfn(v->domain->arch.phys_table)); - - phys_vtable[l2_table_offset(va)] = value; - - unmap_domain_page(phys_vtable); -} - -static inline void -update_hl2e(struct vcpu *v, unsigned long va) -{ - int index = l2_table_offset(va); - unsigned long mfn; - l2_pgentry_t gl2e = v->arch.guest_vtable[index]; - l1_pgentry_t old_hl2e, new_hl2e; - int need_flush = 0; - - ASSERT(shadow_mode_translate(v->domain)); - - old_hl2e = v->arch.hl2_vtable[index]; - - if ( (l2e_get_flags(gl2e) & _PAGE_PRESENT) && - VALID_MFN(mfn = get_mfn_from_gpfn(l2e_get_pfn(gl2e))) ) - new_hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); - else - new_hl2e = l1e_empty(); - - // only do the ref counting if something has changed. - // - if ( (l1e_has_changed(old_hl2e, new_hl2e, PAGE_FLAG_MASK)) ) - { - if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) && - !shadow_get_page(v->domain, mfn_to_page(l1e_get_pfn(new_hl2e)), - v->domain) ) - new_hl2e = l1e_empty(); - if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT ) - { - shadow_put_page(v->domain, mfn_to_page(l1e_get_pfn(old_hl2e))); - need_flush = 1; - } - - v->arch.hl2_vtable[l2_table_offset(va)] = new_hl2e; - - if ( need_flush ) - { - perfc_incrc(update_hl2e_invlpg); - flush_tlb_one_mask(v->domain->domain_dirty_cpumask, - &linear_pg_table[l1_linear_offset(va)]); - } - } -} - -static inline void shadow_drop_references( - struct domain *d, struct page_info *page) -{ - if ( likely(!shadow_mode_refcounts(d)) || - ((page->u.inuse.type_info & PGT_count_mask) == 0) ) - return; - - /* XXX This needs more thought... 
*/ - printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n", - __func__, page_to_mfn(page)); - printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page), - page->count_info, page->u.inuse.type_info); - - shadow_lock(d); - shadow_remove_all_access(d, page_to_mfn(page)); - shadow_unlock(d); - - printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page), - page->count_info, page->u.inuse.type_info); -} - -/* XXX Needs more thought. Neither pretty nor fast: a place holder. */ -static inline void shadow_sync_and_drop_references( - struct domain *d, struct page_info *page) -{ - if ( likely(!shadow_mode_refcounts(d)) ) - return; - - if ( page_out_of_sync(page) ) - __shadow_sync_mfn(d, page_to_mfn(page)); - - shadow_remove_all_access(d, page_to_mfn(page)); -} -#endif - -/************************************************************************/ - -/* - * Add another shadow reference to smfn. - */ -static inline int -get_shadow_ref(unsigned long smfn) -{ - u32 x, nx; - - ASSERT(mfn_valid(smfn)); - - x = mfn_to_page(smfn)->count_info; - nx = x + 1; - - if ( unlikely(nx == 0) ) - { - printk("get_shadow_ref overflow, gmfn=%" PRtype_info " smfn=%lx\n", - mfn_to_page(smfn)->u.inuse.type_info & PGT_mfn_mask, - smfn); - BUG(); - } - - // Guarded by the shadow lock... - // - mfn_to_page(smfn)->count_info = nx; - - return 1; -} - -/* - * Drop a shadow reference to smfn. - */ -static inline void -put_shadow_ref(unsigned long smfn) -{ - u32 x, nx; - - ASSERT(mfn_valid(smfn)); - - x = mfn_to_page(smfn)->count_info; - nx = x - 1; - - if ( unlikely(x == 0) ) - { - printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%" - PRtype_info "\n", - smfn, - mfn_to_page(smfn)->count_info, - mfn_to_page(smfn)->u.inuse.type_info); - BUG(); - } - - // Guarded by the shadow lock... - // - mfn_to_page(smfn)->count_info = nx; - - if ( unlikely(nx == 0) ) - { - free_shadow_page(smfn); - } -} - -static inline void -shadow_pin(unsigned long smfn) -{ - ASSERT( !(mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) ); - - mfn_to_page(smfn)->u.inuse.type_info |= PGT_pinned; - if ( unlikely(!get_shadow_ref(smfn)) ) - BUG(); -} - -static inline void -shadow_unpin(unsigned long smfn) -{ - ASSERT( (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) ); - - mfn_to_page(smfn)->u.inuse.type_info &= ~PGT_pinned; - put_shadow_ref(smfn); -} - -/* - * SMP issue. The following code assumes the shadow lock is held. Re-visit - * when working on finer-gained locks for shadow. 
- */ -static inline void set_guest_back_ptr( - struct domain *d, l1_pgentry_t spte, - unsigned long smfn, unsigned int index) -{ - struct page_info *gpage; - - ASSERT(shadow_lock_is_acquired(d)); - - if ( !shadow_mode_external(d) || - ((l1e_get_flags(spte) & (_PAGE_PRESENT|_PAGE_RW)) != - (_PAGE_PRESENT|_PAGE_RW)) ) - return; - - gpage = l1e_get_page(spte); - - ASSERT(smfn != 0); - ASSERT(page_to_mfn(gpage) != 0); - - gpage->tlbflush_timestamp = smfn; - gpage->u.inuse.type_info &= ~PGT_va_mask; - gpage->u.inuse.type_info |= (unsigned long)index << PGT_va_shift; -} - -/************************************************************************/ -#if CONFIG_PAGING_LEVELS <= 2 -extern void shadow_mark_va_out_of_sync( - struct vcpu *v, unsigned long gpfn, unsigned long mfn, - unsigned long va); - -static inline int l1pte_write_fault( - struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p, - unsigned long va) -{ - struct domain *d = v->domain; - l1_pgentry_t gpte = *gpte_p; - l1_pgentry_t spte; - unsigned long gpfn = l1e_get_pfn(gpte); - unsigned long gmfn = gmfn_to_mfn(d, gpfn); - - //printk("l1pte_write_fault gmfn=%lx\n", gmfn); - - if ( unlikely(!VALID_MFN(gmfn)) ) - { - SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn); - *spte_p = l1e_empty(); - return 0; - } - - ASSERT(l1e_get_flags(gpte) & _PAGE_RW); - l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED); - spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL); - - SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte, - l1e_get_intpte(spte), l1e_get_intpte(gpte)); - - __mark_dirty(d, gmfn); - - if ( mfn_is_page_table(gmfn) ) - shadow_mark_va_out_of_sync(v, gpfn, gmfn, va); - - *gpte_p = gpte; - *spte_p = spte; - - return 1; -} - -static inline int l1pte_read_fault( - struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p) -{ - l1_pgentry_t gpte = *gpte_p; - l1_pgentry_t spte = *spte_p; - unsigned long pfn = l1e_get_pfn(gpte); - unsigned long mfn = gmfn_to_mfn(d, pfn); - - if ( unlikely(!VALID_MFN(mfn)) ) - { - SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn); - *spte_p = l1e_empty(); - return 0; - } - - l1e_add_flags(gpte, _PAGE_ACCESSED); - spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL); - - if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) || - mfn_is_page_table(mfn) ) - { - l1e_remove_flags(spte, _PAGE_RW); - } - - SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte, - l1e_get_intpte(spte), l1e_get_intpte(gpte)); - *gpte_p = gpte; - *spte_p = spte; - - return 1; -} -#endif - -static inline void l1pte_propagate_from_guest( - struct domain *d, guest_l1_pgentry_t gpte, l1_pgentry_t *spte_p) -{ - unsigned long mfn; - l1_pgentry_t spte; - - spte = l1e_empty(); - - if ( ((guest_l1e_get_flags(gpte) & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == - (_PAGE_PRESENT|_PAGE_ACCESSED)) && - VALID_MFN(mfn = gmfn_to_mfn(d, l1e_get_pfn(gpte))) ) - { - spte = l1e_from_pfn( - mfn, guest_l1e_get_flags(gpte) & ~(_PAGE_GLOBAL | _PAGE_AVAIL)); - - if ( shadow_mode_log_dirty(d) || - !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) || - mfn_is_page_table(mfn) ) - { - l1e_remove_flags(spte, _PAGE_RW); - } - } - - if ( l1e_get_intpte(spte) || l1e_get_intpte(gpte) ) - SH_VVVLOG("%s: gpte=%" PRIpte ", new spte=%" PRIpte, - __func__, l1e_get_intpte(gpte), l1e_get_intpte(spte)); - - *spte_p = spte; -} - -static inline void hl2e_propagate_from_guest( - struct domain *d, l2_pgentry_t gpde, l1_pgentry_t *hl2e_p) -{ - unsigned long pfn = l2e_get_pfn(gpde); - unsigned long mfn; - 
l1_pgentry_t hl2e; - - hl2e = l1e_empty(); - - if ( l2e_get_flags(gpde) & _PAGE_PRESENT ) - { - mfn = gmfn_to_mfn(d, pfn); - if ( VALID_MFN(mfn) && mfn_valid(mfn) ) - hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); - } - - if ( l1e_get_intpte(hl2e) || l2e_get_intpte(gpde) ) - SH_VVLOG("%s: gpde=%" PRIpte " hl2e=%" PRIpte, __func__, - l2e_get_intpte(gpde), l1e_get_intpte(hl2e)); - - *hl2e_p = hl2e; -} - -static inline void l2pde_general( - struct domain *d, - guest_l2_pgentry_t *gpde_p, - l2_pgentry_t *spde_p, - unsigned long sl1mfn) -{ - guest_l2_pgentry_t gpde = *gpde_p; - l2_pgentry_t spde; - - spde = l2e_empty(); - if ( (guest_l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) ) - { - spde = l2e_from_pfn( - sl1mfn, - (guest_l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL); - - /* N.B. PDEs do not have a dirty bit. */ - guest_l2e_add_flags(gpde, _PAGE_ACCESSED); - - *gpde_p = gpde; - } - - if ( l2e_get_intpte(spde) || l2e_get_intpte(gpde) ) - SH_VVLOG("%s: gpde=%" PRIpte ", new spde=%" PRIpte, __func__, - l2e_get_intpte(gpde), l2e_get_intpte(spde)); - - *spde_p = spde; -} - -static inline void l2pde_propagate_from_guest( - struct domain *d, guest_l2_pgentry_t *gpde_p, l2_pgentry_t *spde_p) -{ - guest_l2_pgentry_t gpde = *gpde_p; - unsigned long sl1mfn = 0; - - if ( guest_l2e_get_flags(gpde) & _PAGE_PRESENT ) - sl1mfn = __shadow_status(d, l2e_get_pfn(gpde), PGT_l1_shadow); - l2pde_general(d, gpde_p, spde_p, sl1mfn); -} - -/************************************************************************/ - -// returns true if a tlb flush is needed -// -static int inline -validate_pte_change( - struct domain *d, - guest_l1_pgentry_t new_pte, - l1_pgentry_t *shadow_pte_p) -{ - l1_pgentry_t old_spte, new_spte; - int need_flush = 0; - - perfc_incrc(validate_pte_calls); - - l1pte_propagate_from_guest(d, new_pte, &new_spte); - - if ( shadow_mode_refcounts(d) ) - { - old_spte = *shadow_pte_p; - - if ( l1e_get_intpte(old_spte) == l1e_get_intpte(new_spte) ) - { - // No accounting required... - // - perfc_incrc(validate_pte_changes1); - } - else if ( l1e_get_intpte(old_spte) == (l1e_get_intpte(new_spte)|_PAGE_RW) ) - { - // Fast path for PTEs that have merely been write-protected - // (e.g., during a Unix fork()). A strict reduction in privilege. - // - perfc_incrc(validate_pte_changes2); - if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) ) - shadow_put_page_type(d, mfn_to_page(l1e_get_pfn(new_spte))); - } - else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) & - _PAGE_PRESENT ) && - l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) ) - { - // only do the ref counting if something important changed. - // - perfc_incrc(validate_pte_changes3); - - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - { - shadow_put_page_from_l1e(old_spte, d); - need_flush = 1; - } - if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(new_spte, d) ) { - new_spte = l1e_empty(); - need_flush = -1; /* need to unshadow the page */ - } - } - else - { - perfc_incrc(validate_pte_changes4); - } - } - - *shadow_pte_p = new_spte; - - return need_flush; -} - -// returns true if a tlb flush is needed -// -static int inline -validate_hl2e_change( - struct domain *d, - l2_pgentry_t new_gpde, - l1_pgentry_t *shadow_hl2e_p) -{ - l1_pgentry_t old_hl2e, new_hl2e; - int need_flush = 0; - - perfc_incrc(validate_hl2e_calls); - - old_hl2e = *shadow_hl2e_p; - hl2e_propagate_from_guest(d, new_gpde, &new_hl2e); - - // Only do the ref counting if something important changed. 
- // - if ( ((l1e_get_flags(old_hl2e) | l1e_get_flags(new_hl2e)) & _PAGE_PRESENT) && - l1e_has_changed(old_hl2e, new_hl2e, _PAGE_PRESENT) ) - { - perfc_incrc(validate_hl2e_changes); - - if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) && - !get_page(mfn_to_page(l1e_get_pfn(new_hl2e)), d) ) - new_hl2e = l1e_empty(); - if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT ) - { - put_page(mfn_to_page(l1e_get_pfn(old_hl2e))); - need_flush = 1; - } - } - - *shadow_hl2e_p = new_hl2e; - - return need_flush; -} - -// returns true if a tlb flush is needed -// -static int inline -validate_pde_change( - struct domain *d, - guest_l2_pgentry_t new_gpde, - l2_pgentry_t *shadow_pde_p) -{ - l2_pgentry_t old_spde, new_spde; - int need_flush = 0; - - perfc_incrc(validate_pde_calls); - - old_spde = *shadow_pde_p; - l2pde_propagate_from_guest(d, &new_gpde, &new_spde); - - // Only do the ref counting if something important changed. - // - if ( ((l2e_get_intpte(old_spde) | l2e_get_intpte(new_spde)) & _PAGE_PRESENT) && - l2e_has_changed(old_spde, new_spde, _PAGE_PRESENT) ) - { - perfc_incrc(validate_pde_changes); - - if ( (l2e_get_flags(new_spde) & _PAGE_PRESENT) && - !get_shadow_ref(l2e_get_pfn(new_spde)) ) - BUG(); - if ( l2e_get_flags(old_spde) & _PAGE_PRESENT ) - { - put_shadow_ref(l2e_get_pfn(old_spde)); - need_flush = 1; - } - } - - *shadow_pde_p = new_spde; - - return need_flush; -} - -/*********************************************************************/ - -#if SHADOW_HASH_DEBUG - -static void shadow_audit(struct domain *d, int print) -{ - int live = 0, free = 0, j = 0, abs; - struct shadow_status *a; - - for ( j = 0; j < shadow_ht_buckets; j++ ) - { - a = &d->arch.shadow_ht[j]; - if ( a->gpfn_and_flags ) - { - live++; - ASSERT(a->smfn); - } - else - ASSERT(!a->next); - - a = a->next; - while ( a && (live < 9999) ) - { - live++; - if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) ) - { - printk("XXX live=%d gpfn+flags=%lx sp=%lx next=%p\n", - live, a->gpfn_and_flags, a->smfn, a->next); - BUG(); - } - ASSERT(a->smfn); - a = a->next; - } - ASSERT(live < 9999); - } - - for ( a = d->arch.shadow_ht_free; a != NULL; a = a->next ) - free++; - - if ( print ) - printk("Xlive=%d free=%d\n", live, free); - - // BUG: this only works if there's only a single domain which is - // using shadow tables. - // - abs = ( - perfc_value(shadow_l1_pages) + - perfc_value(shadow_l2_pages) + - perfc_value(hl2_table_pages) + - perfc_value(snapshot_pages) + - perfc_value(writable_pte_predictions) - ) - live; -#ifdef PERF_COUNTERS - if ( (abs < -1) || (abs > 1) ) - { - printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d writable_ptes=%d\n", - live, free, - perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages), - perfc_value(hl2_table_pages), - perfc_value(snapshot_pages), - perfc_value(writable_pte_predictions)); - BUG(); - } -#endif - - // XXX ought to add some code to audit the out-of-sync entries, too. - // -} -#else -#define shadow_audit(p, print) ((void)0) -#endif - - -static inline struct shadow_status *hash_bucket( - struct domain *d, unsigned int gpfn) -{ - return &d->arch.shadow_ht[gpfn % shadow_ht_buckets]; -} - - -/* - * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace, - * which, depending on full shadow mode, may or may not equal - * its mfn). - * It returns the shadow's mfn, or zero if it doesn't exist. 
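The removed __shadow_status() below looks up a (gpfn | type) key in a chained hash and pulls hits to the front of their bucket, so repeated lookups of hot shadows succeed on the head probe. A stand-alone sketch of that pull-to-front step, with simplified node and key types:

    #include <stdio.h>

    struct entry {
        struct entry *next;
        unsigned long key;
        unsigned long smfn;
    };

    static unsigned long lookup(struct entry *head, unsigned long key)
    {
        struct entry *prev = NULL, *x;

        for ( x = head; x != NULL; prev = x, x = x->next )
        {
            if ( x->key != key )
                continue;
            if ( x != head )
            {
                /* Unlink the hit and splice it in right after the head... */
                prev->next = x->next;
                x->next = head->next;
                head->next = x;
                /* ...then swap payloads so the hit ends up in the head node. */
                unsigned long tk = head->key, ts = head->smfn;
                head->key = x->key;  head->smfn = x->smfn;
                x->key = tk;         x->smfn = ts;
            }
            return head->smfn;
        }
        return 0;   /* no shadow exists for this key */
    }

    int main(void)
    {
        struct entry c = { NULL, 3, 0x333 }, b = { &c, 2, 0x222 }, a = { &b, 1, 0x111 };
        printf("%#lx\n", lookup(&a, 3));   /* hit in the tail: promoted to the head */
        printf("%#lx\n", lookup(&a, 3));   /* now a head hit */
        return 0;
    }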
- */ -static inline unsigned long __shadow_status( - struct domain *d, unsigned long gpfn, unsigned long stype) -{ - struct shadow_status *p, *x, *head; - shadow_key_t key; -#if CONFIG_PAGING_LEVELS >= 3 - if ( d->arch.ops->guest_paging_levels == PAGING_L3 && stype == PGT_l4_shadow ) - key = gpfn | stype | index_to_key(get_cr3_idxval(current)); - else -#endif - key = gpfn | stype; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(gpfn == (gpfn & PGT_mfn_mask)); - ASSERT(stype && !(stype & ~PGT_type_mask)); - - perfc_incrc(shadow_status_calls); - - x = head = hash_bucket(d, gpfn); - p = NULL; - - shadow_audit(d, 0); - - do - { - ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL))); - - if ( x->gpfn_and_flags == key ) - { -#if SHADOW_DEBUG - if ( unlikely(shadow_status_noswap) ) - return x->smfn; -#endif - /* Pull-to-front if 'x' isn't already the head item. */ - if ( unlikely(x != head) ) - { - /* Delete 'x' from list and reinsert immediately after head. */ - p->next = x->next; - x->next = head->next; - head->next = x; - - /* Swap 'x' contents with head contents. */ - SWAP(head->gpfn_and_flags, x->gpfn_and_flags); - SWAP(head->smfn, x->smfn); - } - else - { - perfc_incrc(shadow_status_hit_head); - } - - return head->smfn; - } - - p = x; - x = x->next; - } - while ( x != NULL ); - - perfc_incrc(shadow_status_miss); - return 0; -} - -/* - * Not clear if pull-to-front is worth while for this or not, - * as it generally needs to scan the entire bucket anyway. - * Much simpler without. - * - * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table. - */ -static inline u32 -shadow_max_pgtable_type(struct domain *d, unsigned long gpfn, - unsigned long *smfn) -{ - struct shadow_status *x; - u32 pttype = PGT_none, type; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(gpfn == (gpfn & PGT_mfn_mask)); - - perfc_incrc(shadow_max_type); - - x = hash_bucket(d, gpfn); - - while ( x && x->gpfn_and_flags ) - { - if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn ) - { - type = x->gpfn_and_flags & PGT_type_mask; - - switch ( type ) - { - case PGT_hl2_shadow: - // Treat an HL2 as if it's an L1 - // - type = PGT_l1_shadow; - break; - case PGT_snapshot: - case PGT_writable_pred: - // Ignore snapshots -- they don't in and of themselves constitute - // treating a page as a page table - // - goto next; - case PGT_base_page_table: - // Early exit if we found the max possible value - // - return type; - default: - break; - } - - if ( type > pttype ) - { - pttype = type; - if ( smfn ) - *smfn = x->smfn; - } - } - next: - x = x->next; - } - - return pttype; -} - -static inline void delete_shadow_status( - struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned int stype, u64 index) -{ - struct shadow_status *p, *x, *n, *head; - - shadow_key_t key = gpfn | stype | index_to_key(index); - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(!(gpfn & ~PGT_mfn_mask)); - ASSERT(stype && !(stype & ~PGT_type_mask)); - - head = hash_bucket(d, gpfn); - - SH_VLOG("delete gpfn=%lx t=%08x bucket=%p", gpfn, stype, head); - shadow_audit(d, 0); - - /* Match on head item? */ - if ( head->gpfn_and_flags == key ) - { - if ( (n = head->next) != NULL ) - { - /* Overwrite head with contents of following node. */ - head->gpfn_and_flags = n->gpfn_and_flags; - head->smfn = n->smfn; - - /* Delete following node. */ - head->next = n->next; - - /* Add deleted node to the free list. */ - n->gpfn_and_flags = 0; - n->smfn = 0; - n->next = d->arch.shadow_ht_free; - d->arch.shadow_ht_free = n; - } - else - { - /* This bucket is now empty. 
Initialise the head node. */ - head->gpfn_and_flags = 0; - head->smfn = 0; - } - - goto found; - } - - p = head; - x = head->next; - - do - { - if ( x->gpfn_and_flags == key ) - { - /* Delete matching node. */ - p->next = x->next; - - /* Add deleted node to the free list. */ - x->gpfn_and_flags = 0; - x->smfn = 0; - x->next = d->arch.shadow_ht_free; - d->arch.shadow_ht_free = x; - - goto found; - } - - p = x; - x = x->next; - } - while ( x != NULL ); - - /* If we got here, it wasn't in the list! */ - BUG(); - - found: - // release ref to page - if ( stype != PGT_writable_pred ) - put_page(mfn_to_page(gmfn)); - - shadow_audit(d, 0); -} - -static inline void set_shadow_status( - struct domain *d, unsigned long gpfn, unsigned long gmfn, - unsigned long smfn, unsigned long stype, u64 index) -{ - struct shadow_status *x, *head, *extra; - int i; - - shadow_key_t key = gpfn | stype | index_to_key(index); - - SH_VVLOG("set gpfn=%lx gmfn=%lx smfn=%lx t=%lx", gpfn, gmfn, smfn, stype); - - ASSERT(shadow_lock_is_acquired(d)); - - ASSERT(shadow_mode_translate(d) || gpfn); - ASSERT(!(gpfn & ~PGT_mfn_mask)); - - // XXX - need to be more graceful. - ASSERT(VALID_MFN(gmfn)); - - ASSERT(stype && !(stype & ~PGT_type_mask)); - - x = head = hash_bucket(d, gpfn); - - SH_VLOG("set gpfn=%lx smfn=%lx t=%lx bucket=%p(%p)", - gpfn, smfn, stype, x, x->next); - shadow_audit(d, 0); - - // grab a reference to the guest page to represent the entry in the shadow - // hash table - // - // XXX - Should PGT_writable_pred grab a page ref? - // - Who/how are these hash table entry refs flushed if/when a page - // is given away by the domain? - // - if ( stype != PGT_writable_pred ) - get_page(mfn_to_page(gmfn), d); - - /* - * STEP 1. If page is already in the table, update it in place. - */ - do - { - if ( unlikely(x->gpfn_and_flags == key) ) - { - if ( stype != PGT_writable_pred ) - BUG(); // we should never replace entries into the hash table - x->smfn = smfn; - if ( stype != PGT_writable_pred ) - put_page(mfn_to_page(gmfn)); // already had a ref... - goto done; - } - - x = x->next; - } - while ( x != NULL ); - - /* - * STEP 2. The page must be inserted into the table. - */ - - /* If the bucket is empty then insert the new page as the head item. */ - if ( head->gpfn_and_flags == 0 ) - { - head->gpfn_and_flags = key; - head->smfn = smfn; - ASSERT(head->next == NULL); - goto done; - } - - /* We need to allocate a new node. Ensure the quicklist is non-empty. */ - if ( unlikely(d->arch.shadow_ht_free == NULL) ) - { - SH_VLOG("Allocate more shadow hashtable blocks."); - - extra = xmalloc_bytes( - sizeof(void *) + (shadow_ht_extra_size * sizeof(*x))); - - /* XXX Should be more graceful here. */ - if ( extra == NULL ) - BUG(); - - memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x))); - - /* Record the allocation block so it can be correctly freed later. */ - d->arch.shadow_extras_count++; - *((struct shadow_status **)&extra[shadow_ht_extra_size]) = - d->arch.shadow_ht_extras; - d->arch.shadow_ht_extras = &extra[0]; - - /* Thread a free chain through the newly-allocated nodes. */ - for ( i = 0; i < (shadow_ht_extra_size - 1); i++ ) - extra[i].next = &extra[i+1]; - extra[i].next = NULL; - - /* Add the new nodes to the free list. */ - d->arch.shadow_ht_free = &extra[0]; - } - - /* Allocate a new node from the quicklist. */ - x = d->arch.shadow_ht_free; - d->arch.shadow_ht_free = x->next; - - /* Initialise the new node and insert directly after the head item. 
*/ - x->gpfn_and_flags = key; - x->smfn = smfn; - x->next = head->next; - head->next = x; - - done: - shadow_audit(d, 0); - - if ( stype <= PGT_l4_shadow ) - { - // add to front of list of pages to check when removing write - // permissions for a page... - // - } -} - -/************************************************************************/ - -static inline void guest_physmap_add_page( - struct domain *d, unsigned long gpfn, unsigned long mfn) -{ - struct domain_mmap_cache c1, c2; - - if ( likely(!shadow_mode_translate(d)) ) - return; - - domain_mmap_cache_init(&c1); - domain_mmap_cache_init(&c2); - shadow_lock(d); - shadow_sync_and_drop_references(d, mfn_to_page(mfn)); - set_p2m_entry(d, gpfn, mfn, &c1, &c2); - set_gpfn_from_mfn(mfn, gpfn); - shadow_unlock(d); - domain_mmap_cache_destroy(&c1); - domain_mmap_cache_destroy(&c2); -} - -static inline void guest_physmap_remove_page( - struct domain *d, unsigned long gpfn, unsigned long mfn) -{ - struct domain_mmap_cache c1, c2; - unsigned long type; - - if ( likely(!shadow_mode_translate(d)) ) - return; - - domain_mmap_cache_init(&c1); - domain_mmap_cache_init(&c2); - shadow_lock(d); - shadow_sync_and_drop_references(d, mfn_to_page(mfn)); - while ( (type = shadow_max_pgtable_type(d, gpfn, NULL)) != PGT_none ) - free_shadow_page(__shadow_status(d, gpfn, type)); - set_p2m_entry(d, gpfn, -1, &c1, &c2); - set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); - shadow_unlock(d); - domain_mmap_cache_destroy(&c1); - domain_mmap_cache_destroy(&c2); -} - -/************************************************************************/ - -void static inline -shadow_update_min_max(unsigned long smfn, int index) -{ - struct page_info *sl1page = mfn_to_page(smfn); - u32 min_max = sl1page->tlbflush_timestamp; - int min = SHADOW_MIN(min_max); - int max = SHADOW_MAX(min_max); - int update = 0; - - if ( index < min ) - { - min = index; - update = 1; - } - if ( index > max ) - { - max = index; - update = 1; - } - if ( update ) - sl1page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max); -} - -#if CONFIG_PAGING_LEVELS <= 2 -extern void shadow_map_l1_into_current_l2(unsigned long va); - -void static inline -shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - l2_pgentry_t sl2e = {0}; - - __shadow_get_l2e(v, va, &sl2e); - if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) - { - /* - * Either the L1 is not shadowed, or the shadow isn't linked into - * the current shadow L2. - */ - if ( create_l1_shadow ) - { - perfc_incrc(shadow_set_l1e_force_map); - shadow_map_l1_into_current_l2(va); - } - else /* check to see if it exists; if so, link it in */ - { - l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)]; - unsigned long gl1pfn = l2e_get_pfn(gpde); - unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow); - - ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT ); - - if ( sl1mfn ) - { - perfc_incrc(shadow_set_l1e_unlinked); - if ( !get_shadow_ref(sl1mfn) ) - BUG(); - l2pde_general(d, &gpde, &sl2e, sl1mfn); - __guest_set_l2e(v, va, gpde); - __shadow_set_l2e(v, va, sl2e); - } - else - { - // no shadow exists, so there's nothing to do. - perfc_incrc(shadow_set_l1e_fail); - return; - } - } - } - - __shadow_get_l2e(v, va, &sl2e); - - if ( shadow_mode_refcounts(d) ) - { - l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)]; - - // only do the ref counting if something important changed. 
- // - if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) ) - { - if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(new_spte, d) ) - new_spte = l1e_empty(); - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - shadow_put_page_from_l1e(old_spte, d); - } - - } - - set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va)); - shadow_linear_pg_table[l1_linear_offset(va)] = new_spte; - shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va)); -} -#endif -/************************************************************************/ - -static inline int -shadow_mode_page_writable(unsigned long va, struct cpu_user_regs *regs, unsigned long gpfn) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - unsigned long mfn = gmfn_to_mfn(d, gpfn); - u32 type = mfn_to_page(mfn)->u.inuse.type_info & PGT_type_mask; - - if ( shadow_mode_refcounts(d) && - (type == PGT_writable_page) ) - type = shadow_max_pgtable_type(d, gpfn, NULL); - - // Strange but true: writable page tables allow kernel-mode access - // to L1 page table pages via write-protected PTEs... Similarly, write - // access to all page table pages is granted for shadow_mode_write_all - // clients. - // - if ( ((shadow_mode_write_l1(d) && (type == PGT_l1_page_table)) || - (shadow_mode_write_all(d) && type && (type <= PGT_l4_page_table))) && - ((va < HYPERVISOR_VIRT_START) -#if defined(__x86_64__) - || (va >= HYPERVISOR_VIRT_END) -#endif - ) && - guest_kernel_mode(v, regs) ) - return 1; - - return 0; -} - -#if CONFIG_PAGING_LEVELS <= 2 -static inline l1_pgentry_t gva_to_gpte(unsigned long gva) -{ - l2_pgentry_t gpde; - l1_pgentry_t gpte; - struct vcpu *v = current; - - ASSERT( shadow_mode_translate(current->domain) ); - - __guest_get_l2e(v, gva, &gpde); - if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) ) - return l1e_empty();; - - // This is actually overkill - we only need to make sure the hl2 - // is in-sync. - // - shadow_sync_va(v, gva); - - if ( unlikely(__copy_from_user(&gpte, - &linear_pg_table[gva >> PAGE_SHIFT], - sizeof(gpte))) ) - { - FSH_LOG("gva_to_gpte got a fault on gva=%lx", gva); - return l1e_empty(); - } - - return gpte; -} - -static inline unsigned long gva_to_gpa(unsigned long gva) -{ - l1_pgentry_t gpte; - - gpte = gva_to_gpte(gva); - if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) ) - return 0; - - return l1e_get_paddr(gpte) + (gva & ~PAGE_MASK); -} -#endif - -static inline unsigned long gva_to_mfn(unsigned long gva) -{ - unsigned long gpa = gva_to_gpa(gva); - return get_mfn_from_gpfn(gpa >> PAGE_SHIFT); -} - -/************************************************************************/ - -extern void __update_pagetables(struct vcpu *v); -static inline void update_pagetables(struct vcpu *v) -{ - struct domain *d = v->domain; - int paging_enabled; - - if ( hvm_guest(v) ) - paging_enabled = hvm_paging_enabled(v); - else - // HACK ALERT: there's currently no easy way to figure out if a domU - // has set its arch.guest_table to zero, vs not yet initialized it. - // - paging_enabled = !!pagetable_get_paddr(v->arch.guest_table); - - /* - * We don't call __update_pagetables() when hvm guest paging is - * disabled as we want the linear_pg_table to be inaccessible so that - * we bail out early of shadow_fault() if the hvm guest tries illegal - * accesses while it thinks paging is turned off. 
- */ - if ( unlikely(shadow_mode_enabled(d)) && paging_enabled ) - { - shadow_lock(d); - __update_pagetables(v); - shadow_unlock(d); - } - - if ( likely(!shadow_mode_external(d)) ) - { - if ( shadow_mode_enabled(d) ) - v->arch.monitor_table = v->arch.shadow_table; - else -#if CONFIG_PAGING_LEVELS == 4 - if ( !(v->arch.flags & TF_kernel_mode) ) - v->arch.monitor_table = v->arch.guest_table_user; - else -#endif - v->arch.monitor_table = v->arch.guest_table; - } -} - -void clear_all_shadow_status(struct domain *d); - -#if SHADOW_DEBUG -extern int _check_pagetable(struct vcpu *v, char *s); -extern int _check_all_pagetables(struct vcpu *v, char *s); - -#define check_pagetable(_v, _s) _check_pagetable(_v, _s) -//#define check_pagetable(_v, _s) _check_all_pagetables(_v, _s) - -#else -#define check_pagetable(_v, _s) ((void)0) -#endif - -#endif /* XEN_SHADOW_H */ +#endif /* _XEN_SHADOW_H */ /* * Local variables: diff --git a/xen/include/asm-x86/shadow2-multi.h b/xen/include/asm-x86/shadow2-multi.h new file mode 100644 index 0000000000..3b23a2f198 --- /dev/null +++ b/xen/include/asm-x86/shadow2-multi.h @@ -0,0 +1,116 @@ +/****************************************************************************** + * arch/x86/shadow2-multi.h + * + * Shadow2 declarations which will be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +extern int +SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size); +extern int +SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); +extern int +SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); +extern int +SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size); +extern int +SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size); + +extern void +SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); +extern void +SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); +extern void +SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); +extern void +SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)( + struct vcpu *v, mfn_t smfn); + +extern void +SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows, 3, 3) + (struct vcpu *v, mfn_t smfn); + +extern void +SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl2mfn); +extern void +SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl3mfn); +extern void +SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl4mfn); + +extern int +SHADOW2_INTERNAL_NAME(sh2_remove_write_access, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn); +extern int +SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn); + +extern void +SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, void *ep, mfn_t smfn); + +extern int +SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn); +extern int +SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn); +extern int +SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn); + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES +int +SHADOW2_INTERNAL_NAME(sh2_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t x); +int +SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl1mfn, mfn_t x); +int +SHADOW2_INTERNAL_NAME(sh2_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl2mfn, mfn_t x); +int +SHADOW2_INTERNAL_NAME(sh2_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl3mfn, mfn_t x); +int +SHADOW2_INTERNAL_NAME(sh2_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t sl4mfn, mfn_t x); +#endif + +#if SHADOW_LEVELS == GUEST_LEVELS +extern mfn_t +SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct 
vcpu *v); +extern void +SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS) + (struct vcpu *v, mfn_t mmfn); +#endif + +extern struct shadow2_entry_points +SHADOW2_INTERNAL_NAME(shadow2_entry, SHADOW_LEVELS, GUEST_LEVELS); diff --git a/xen/include/asm-x86/shadow2-private.h b/xen/include/asm-x86/shadow2-private.h new file mode 100644 index 0000000000..7b2ac57572 --- /dev/null +++ b/xen/include/asm-x86/shadow2-private.h @@ -0,0 +1,612 @@ +/****************************************************************************** + * arch/x86/shadow2-private.h + * + * Shadow2 code that is private, and does not need to be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XEN_SHADOW2_PRIVATE_H +#define _XEN_SHADOW2_PRIVATE_H + +// In order to override the definition of mfn_to_page, we make sure page.h has +// been included... +#include +#include +#include +#include + + +/****************************************************************************** + * Definitions for the use of the "available" bits in the shadow PTEs. + * + * Review of the low 12 bits of a shadow page table entry: + * + * in a guest: in a shadow: + * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB + * Bit 10: _PAGE_AVAIL1 _PAGE_SHADOW_RW ("SW" below) + * Bit 9: _PAGE_AVAIL0 _PAGE_SHADOW_PRESENT ("SP" below) + * Bit 8: _PAGE_GLOBAL _PAGE_SHADOW_MMIO ("MMIO" below), + * aka _PAGE_SHADOW_GUEST_NOT_PRESENT + * Bit 7: _PAGE_PSE, aka _PAGE_PAT + * Bit 6: _PAGE_DIRTY + * Bit 5: _PAGE_ACCESSED + * Bit 4: _PAGE_PCD + * Bit 3: _PAGE_PWT + * Bit 2: _PAGE_USER + * Bit 1: _PAGE_RW ("GW" below) + * Bit 0: _PAGE_PRESENT ("GP" below) + * + * Given a guest entry, as shown below, we can expect the following in the + * corresponding shadow entry: + * + * Guest entry Shadow entry Commentary + * ----------- ---------------- --------------------------------------------- + * Maps + * GP GW IO GP SP GW SW MMIO + * -- -- ---- -- -- -- -- ---- + * - - - 0 0 0 0 0 The guest entry has not yet been shadowed. + * 0 - - 0 0 0 0 1 The guest entry is marked not-present. + * 1 1 no ? 1 ? 1 0 Writable entry in the guest. + * 1 0 no ? 1 0 0 0 Read-only entry in the guest. + * 1 1 yes 0 1 ? 1 1 Writable MMIO mapping in the guest. + * 1 0 yes 0 1 0 0 1 Read-only MMIO mapping in the guest. + * + * Normally, we would expect that GP=1 in the guest to imply GP=1 in the + * shadow, and similarly for GW=1. However, various functionality that may be + * implemented via the shadow can cause GP or GW to be cleared in such cases. + * A & D bit emulation is a prime example of such functionality. 
+ * + * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same + * entry will always be zero, too. + + * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests. It is + * currently available for random (ab)use in shadow entries. + * + * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow, + * but currently there is no benefit, as the guest's TLB is flushed on every + * transition of CR3 anyway due to the HVM exit/re-entry. + * + * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used + * as the _PAGE_SHADOW_MMIO bit. In such entries, if _PAGE_SHADOW_MMIO is + * set, then the entry contains the *gfn* directly from the corresponding + * guest entry (not an mfn!!). + * + * Bit 7 is set in a guest L2 to signify a superpage entry. The current + * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the + * resulting shadow L1 table is called an FL1. Note that there is no guest + * page that corresponds to an FL1. + * + * Bit 7 in a guest L1 is the PAT2 bit. Currently we do not support PAT in + * this shadow code. + * + * Bit 6 is the dirty bit. + * + * Bit 5 is the accessed bit. + * + * Bit 4 is the cache disable bit. If set in a guest, the hardware is + * supposed to refuse to cache anything found via this entry. It can be set + * in an L4e, L3e, L2e, or L1e. This shadow code currently does not support + * cache disable bits. They are silently ignored. + * + * Bit 4 is a guest L1 is also the PAT1 bit. Currently we do not support PAT + * in this shadow code. + * + * Bit 3 is the cache write-thru bit. If set in a guest, the hardware is + * supposed to use write-thru instead of write-back caching for anything found + * via this entry. It can be set in an L4e, L3e, L2e, or L1e. This shadow + * code currently does not support cache write-thru bits. They are silently + * ignored. + * + * Bit 3 is a guest L1 is also the PAT0 bit. Currently we do not support PAT + * in this shadow code. + * + * Bit 2 is the user bit. + * + * Bit 1 is the read-write bit. + * + * Bit 0 is the present bit. + */ + +// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by +// the appropriate shadow rules. +#define _PAGE_SHADOW_RW _PAGE_AVAIL1 + +// Copy of the _PAGE_PRESENT bit from the guest's PTE +#define _PAGE_SHADOW_PRESENT _PAGE_AVAIL0 + +// The matching guest entry maps MMIO space +#define _PAGE_SHADOW_MMIO _PAGE_GLOBAL + +// Shadow flags value used when the guest is not present +#define _PAGE_SHADOW_GUEST_NOT_PRESENT _PAGE_GLOBAL + + +/****************************************************************************** + * Debug and error-message output + */ +#define SHADOW2_PRINTK(_f, _a...) \ + debugtrace_printk("sh2: %s(): " _f, __func__, ##_a) +#define SHADOW2_ERROR(_f, _a...) \ + printk("sh2 error: %s(): " _f, __func__, ##_a) +#define SHADOW2_DEBUG(flag, _f, _a...) 
\ + do { \ + if (SHADOW2_DEBUG_ ## flag) \ + debugtrace_printk("sh2debug: %s(): " _f, __func__, ##_a); \ + } while (0) + +// The flags for use with SHADOW2_DEBUG: +#define SHADOW2_DEBUG_PROPAGATE 0 +#define SHADOW2_DEBUG_MAKE_SHADOW 0 +#define SHADOW2_DEBUG_DESTROY_SHADOW 0 +#define SHADOW2_DEBUG_P2M 0 +#define SHADOW2_DEBUG_A_AND_D 0 +#define SHADOW2_DEBUG_EMULATE 0 +#define SHADOW2_DEBUG_LOGDIRTY 1 + + +/****************************************************************************** + * Auditing routines + */ + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL +extern void shadow2_audit_tables(struct vcpu *v); +#else +#define shadow2_audit_tables(_v) do {} while(0) +#endif + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M +extern void shadow2_audit_p2m(struct domain *d); +#else +#define shadow2_audit_p2m(_d) do {} while(0) +#endif + + +/****************************************************************************** + * Mechanism for double-checking the optimized pagefault path: this + * structure contains a record of actions taken by the fault handling + * code. In paranoid mode, the fast-path code fills out one of these + * structures (but doesn't take any actual action) and then the normal + * path fills in another. When the fault handler finishes, the + * two are compared */ + +#ifdef SHADOW2_OPTIMIZATION_PARANOIA + +typedef struct shadow2_action_log sh2_log_t; +struct shadow2_action_log { + paddr_t ad[CONFIG_PAGING_LEVELS]; /* A & D bits propagated here */ + paddr_t mmio; /* Address of an mmio operation */ + int rv; /* Result of the fault handler */ +}; + +/* There are two logs, one for the fast path, one for the normal path */ +enum sh2_log_type { log_slow = 0, log_fast= 1 }; + +/* Alloc and zero the logs */ +static inline void sh2_init_log(struct vcpu *v) +{ + if ( unlikely(!v->arch.shadow2_action_log) ) + v->arch.shadow2_action_log = xmalloc_array(sh2_log_t, 2); + ASSERT(v->arch.shadow2_action_log); + memset(v->arch.shadow2_action_log, 0, 2 * sizeof (sh2_log_t)); +} + +/* Log an A&D-bit update */ +static inline void sh2_log_ad(struct vcpu *v, paddr_t e, unsigned int level) +{ + v->arch.shadow2_action_log[v->arch.shadow2_action_index].ad[level] = e; +} + +/* Log an MMIO address */ +static inline void sh2_log_mmio(struct vcpu *v, paddr_t m) +{ + v->arch.shadow2_action_log[v->arch.shadow2_action_index].mmio = m; +} + +/* Log the result */ +static inline void sh2_log_rv(struct vcpu *v, int rv) +{ + v->arch.shadow2_action_log[v->arch.shadow2_action_index].rv = rv; +} + +/* Set which mode we're in */ +static inline void sh2_set_log_mode(struct vcpu *v, enum sh2_log_type t) +{ + v->arch.shadow2_action_index = t; +} + +/* Know not to take action, because we're only checking the mechanism */ +static inline int sh2_take_no_action(struct vcpu *v) +{ + return (v->arch.shadow2_action_index == log_fast); +} + +#else /* Non-paranoid mode: these logs do not exist */ + +#define sh2_init_log(_v) do { (void)(_v); } while(0) +#define sh2_set_log_mode(_v,_t) do { (void)(_v); } while(0) +#define sh2_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0) +#define sh2_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0) +#define sh2_log_rv(_v,_r) do { (void)(_v),(void)(_r); } while (0) +#define sh2_take_no_action(_v) (((void)(_v)), 0) + +#endif /* SHADOW2_OPTIMIZATION_PARANOIA */ + + +/****************************************************************************** + * Macro for dealing with the naming of the internal names of the + * shadow code's external entry points. 
+ */ +#define SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \ + name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels +#define SHADOW2_INTERNAL_NAME(name, shadow_levels, guest_levels) \ + SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) + +#if CONFIG_PAGING_LEVELS == 2 +#define GUEST_LEVELS 2 +#define SHADOW_LEVELS 2 +#include +#undef GUEST_LEVELS +#undef SHADOW_LEVELS +#endif /* CONFIG_PAGING_LEVELS == 2 */ + +#if CONFIG_PAGING_LEVELS == 3 +#define GUEST_LEVELS 2 +#define SHADOW_LEVELS 3 +#include +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 3 +#define SHADOW_LEVELS 3 +#include +#undef GUEST_LEVELS +#undef SHADOW_LEVELS +#endif /* CONFIG_PAGING_LEVELS == 3 */ + +#if CONFIG_PAGING_LEVELS == 4 +#define GUEST_LEVELS 2 +#define SHADOW_LEVELS 3 +#include +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 3 +#define SHADOW_LEVELS 3 +#include +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 3 +#define SHADOW_LEVELS 4 +#include +#undef GUEST_LEVELS +#undef SHADOW_LEVELS + +#define GUEST_LEVELS 4 +#define SHADOW_LEVELS 4 +#include +#undef GUEST_LEVELS +#undef SHADOW_LEVELS +#endif /* CONFIG_PAGING_LEVELS == 4 */ + + +/****************************************************************************** + * Various function declarations + */ + +/* x86 emulator support */ +extern struct x86_emulate_ops shadow2_emulator_ops; + +/* Hash table functions */ +mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t); +void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn); +void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn); + +/* shadow promotion */ +void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type); +void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type); + +/* Shadow page allocation functions */ +void shadow2_prealloc(struct domain *d, unsigned int order); +mfn_t shadow2_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer); +void shadow2_free(struct domain *d, mfn_t smfn); + +/* Function to convert a shadow to log-dirty */ +void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn); + +/* Dispatcher function: call the per-mode function that will unhook the + * non-Xen mappings in this top-level shadow mfn */ +void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn); + +/* Re-sync copies of PAE shadow L3 tables if they have been changed */ +void sh2_pae_recopy(struct domain *d); + +/* Install the xen mappings in various flavours of shadow */ +void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn); +void sh2_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn); +void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn); +void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn); + + +/****************************************************************************** + * MFN/page-info handling + */ + +// Override mfn_to_page from asm/page.h, which was #include'd above, +// in order to make it work with our mfn type. +#undef mfn_to_page +#define mfn_to_page(_mfn) (frame_table + mfn_x(_mfn)) + +// Override page_to_mfn from asm/page.h, which was #include'd above, +// in order to make it work with our mfn type. +#undef page_to_mfn +#define page_to_mfn(_pg) (_mfn((_pg) - frame_table)) + +// Override mfn_valid from asm/page.h, which was #include'd above, +// in order to make it work with our mfn type. 
+#undef mfn_valid +#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page) + +// Provide mfn_t-aware versions of common xen functions +static inline void * +sh2_map_domain_page(mfn_t mfn) +{ + /* XXX Using the monitor-table as a map will happen here */ + return map_domain_page(mfn_x(mfn)); +} + +static inline void +sh2_unmap_domain_page(void *p) +{ + /* XXX Using the monitor-table as a map will happen here */ + unmap_domain_page(p); +} + +static inline void * +sh2_map_domain_page_global(mfn_t mfn) +{ + /* XXX Using the monitor-table as a map will happen here */ + return map_domain_page_global(mfn_x(mfn)); +} + +static inline void +sh2_unmap_domain_page_global(void *p) +{ + /* XXX Using the monitor-table as a map will happen here */ + unmap_domain_page_global(p); +} + +static inline int +sh2_mfn_is_dirty(struct domain *d, mfn_t gmfn) +/* Is this guest page dirty? Call only in log-dirty mode. */ +{ + unsigned long pfn; + ASSERT(shadow2_mode_log_dirty(d)); + ASSERT(d->arch.shadow_dirty_bitmap != NULL); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + if ( likely(VALID_M2P(pfn)) + && likely(pfn < d->arch.shadow_dirty_bitmap_size) + && test_bit(pfn, d->arch.shadow_dirty_bitmap) ) + return 1; + + return 0; +} + +static inline int +sh2_mfn_is_a_page_table(mfn_t gmfn) +{ + struct page_info *page = mfn_to_page(gmfn); + struct domain *owner; + unsigned long type_info; + + if ( !valid_mfn(gmfn) ) + return 0; + + owner = page_get_owner(page); + if ( owner && shadow2_mode_refcounts(owner) + && (page->count_info & PGC_page_table) ) + return 1; + + type_info = page->u.inuse.type_info & PGT_type_mask; + return type_info && (type_info <= PGT_l4_page_table); +} + + +/**************************************************************************/ +/* Shadow-page refcounting. See comment in shadow2-common.c about the + * use of struct page_info fields for shadow pages */ + +void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn); + +/* Increase the refcount of a shadow page. Arguments are the mfn to refcount, + * and the physical address of the shadow entry that holds the ref (or zero + * if the ref is held by something else) */ +static inline void sh2_get_ref(mfn_t smfn, paddr_t entry_pa) +{ + u32 x, nx; + struct page_info *page = mfn_to_page(smfn); + + ASSERT(mfn_valid(smfn)); + + x = page->count_info & PGC_SH2_count_mask; + nx = x + 1; + + if ( unlikely(nx & ~PGC_SH2_count_mask) ) + { + SHADOW2_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n", + page->u.inuse.type_info, mfn_x(smfn)); + domain_crash_synchronous(); + } + + /* Guarded by the shadow lock, so no need for atomic update */ + page->count_info &= ~PGC_SH2_count_mask; + page->count_info |= nx; + + /* We remember the first shadow entry that points to each shadow. */ + if ( entry_pa != 0 && page->up == 0 ) + page->up = entry_pa; +} + + +/* Decrease the refcount of a shadow page. As for get_ref, takes the + * physical address of the shadow entry that held this reference. 
*/ +static inline void sh2_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa) +{ + u32 x, nx; + struct page_info *page = mfn_to_page(smfn); + + ASSERT(mfn_valid(smfn)); + ASSERT(page_get_owner(page) == NULL); + + /* If this is the entry in the up-pointer, remove it */ + if ( entry_pa != 0 && page->up == entry_pa ) + page->up = 0; + + x = page->count_info & PGC_SH2_count_mask; + nx = x - 1; + + if ( unlikely(x == 0) ) + { + SHADOW2_PRINTK("shadow ref underflow, smfn=%lx oc=%08x t=%" + PRtype_info "\n", + mfn_x(smfn), + page->count_info & PGC_SH2_count_mask, + page->u.inuse.type_info); + domain_crash_synchronous(); + } + + /* Guarded by the shadow lock, so no need for atomic update */ + page->count_info &= ~PGC_SH2_count_mask; + page->count_info |= nx; + + if ( unlikely(nx == 0) ) + sh2_destroy_shadow(v, smfn); +} + + +/* Pin a shadow page: take an extra refcount and set the pin bit. */ +static inline void sh2_pin(mfn_t smfn) +{ + struct page_info *page; + + ASSERT(mfn_valid(smfn)); + page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH2_pinned) ) + { + sh2_get_ref(smfn, 0); + page->count_info |= PGC_SH2_pinned; + } +} + +/* Unpin a shadow page: unset the pin bit and release the extra ref. */ +static inline void sh2_unpin(struct vcpu *v, mfn_t smfn) +{ + struct page_info *page; + + ASSERT(mfn_valid(smfn)); + page = mfn_to_page(smfn); + if ( page->count_info & PGC_SH2_pinned ) + { + page->count_info &= ~PGC_SH2_pinned; + sh2_put_ref(v, smfn, 0); + } +} + +/**************************************************************************/ +/* CPU feature support querying */ + +static inline int +guest_supports_superpages(struct vcpu *v) +{ + return hvm_guest(v) && (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE); +} + +static inline int +guest_supports_nx(struct vcpu *v) +{ + if ( !hvm_guest(v) ) + return cpu_has_nx; + + // XXX - fix this! + return 1; +} + +/**************************************************************************/ +/* Guest physmap (p2m) support */ + +/* Read our own P2M table, checking in the linear pagetables first to be + * sure that we will succeed. Call this function if you expect it to + * fail often, as it avoids page faults. If you expect to succeed, use + * vcpu_gfn_to_mfn, which copy_from_user()s the entry */ +static inline mfn_t +vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn) +{ + unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn]; +#if CONFIG_PAGING_LEVELS >= 4 + l4_pgentry_t *l4e; + l3_pgentry_t *l3e; +#endif + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(current == v); + if ( !shadow2_vcpu_mode_translate(v) ) + return _mfn(gfn); + +#if CONFIG_PAGING_LEVELS > 2 + if ( gfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return _mfn(INVALID_MFN); +#endif + + /* Walk the linear pagetables. 
Note that this is *not* the same as + * the walk in sh2_gfn_to_mfn_foreign, which is walking the p2m map */ +#if CONFIG_PAGING_LEVELS >= 4 + l4e = __linear_l4_table + l4_linear_offset(entry_addr); + if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); + l3e = __linear_l3_table + l3_linear_offset(entry_addr); + if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); +#endif + l2e = __linear_l2_table + l2_linear_offset(entry_addr); + if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); + l1e = __linear_l1_table + l1_linear_offset(entry_addr); + if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); + + /* Safe to look at this part of the table */ + if ( l1e_get_flags(phys_to_machine_mapping[gfn]) & _PAGE_PRESENT ) + return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn])); + + return _mfn(INVALID_MFN); +} + + +#endif /* _XEN_SHADOW2_PRIVATE_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/include/asm-x86/shadow2-types.h b/xen/include/asm-x86/shadow2-types.h new file mode 100644 index 0000000000..f593c97822 --- /dev/null +++ b/xen/include/asm-x86/shadow2-types.h @@ -0,0 +1,705 @@ +/****************************************************************************** + * include/asm-x86/shadow2-types.h + * + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XEN_SHADOW2_TYPES_H +#define _XEN_SHADOW2_TYPES_H + +// Map a shadow page +static inline void * +map_shadow_page(mfn_t smfn) +{ + // XXX -- Possible optimization/measurement question for 32-bit and PAE + // hypervisors: + // How often is this smfn already available in the shadow linear + // table? Might it be worth checking that table first, + // presumably using the reverse map hint in the page_info of this + // smfn, rather than calling map_domain_page()? 
+ // + return sh2_map_domain_page(smfn); +} + +// matching unmap for map_shadow_page() +static inline void +unmap_shadow_page(void *p) +{ + sh2_unmap_domain_page(p); +} + +/* + * Define various types for handling pagetabels, based on these options: + * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables + * GUEST_PAGING_LEVELS : Number of levels of guest pagetables + */ + +#if (CONFIG_PAGING_LEVELS < SHADOW_PAGING_LEVELS) +#error Cannot have more levels of shadow pagetables than host pagetables +#endif + +#if (SHADOW_PAGING_LEVELS < GUEST_PAGING_LEVELS) +#error Cannot have more levels of guest pagetables than shadow pagetables +#endif + +#if SHADOW_PAGING_LEVELS == 2 +#define SHADOW_L1_PAGETABLE_ENTRIES 1024 +#define SHADOW_L2_PAGETABLE_ENTRIES 1024 +#define SHADOW_L1_PAGETABLE_SHIFT 12 +#define SHADOW_L2_PAGETABLE_SHIFT 22 +#endif + +#if SHADOW_PAGING_LEVELS == 3 +#define SHADOW_L1_PAGETABLE_ENTRIES 512 +#define SHADOW_L2_PAGETABLE_ENTRIES 512 +#define SHADOW_L3_PAGETABLE_ENTRIES 4 +#define SHADOW_L1_PAGETABLE_SHIFT 12 +#define SHADOW_L2_PAGETABLE_SHIFT 21 +#define SHADOW_L3_PAGETABLE_SHIFT 30 +#endif + +#if SHADOW_PAGING_LEVELS == 4 +#define SHADOW_L1_PAGETABLE_ENTRIES 512 +#define SHADOW_L2_PAGETABLE_ENTRIES 512 +#define SHADOW_L3_PAGETABLE_ENTRIES 512 +#define SHADOW_L4_PAGETABLE_ENTRIES 512 +#define SHADOW_L1_PAGETABLE_SHIFT 12 +#define SHADOW_L2_PAGETABLE_SHIFT 21 +#define SHADOW_L3_PAGETABLE_SHIFT 30 +#define SHADOW_L4_PAGETABLE_SHIFT 39 +#endif + +/* Types of the shadow page tables */ +typedef l1_pgentry_t shadow_l1e_t; +typedef l2_pgentry_t shadow_l2e_t; +#if SHADOW_PAGING_LEVELS >= 3 +typedef l3_pgentry_t shadow_l3e_t; +#if SHADOW_PAGING_LEVELS >= 4 +typedef l4_pgentry_t shadow_l4e_t; +#endif +#endif + +/* Access functions for them */ +static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e) +{ return l1e_get_paddr(sl1e); } +static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e) +{ return l2e_get_paddr(sl2e); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e) +{ return l3e_get_paddr(sl3e); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e) +{ return l4e_get_paddr(sl4e); } +#endif +#endif + +static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e) +{ return _mfn(l1e_get_pfn(sl1e)); } +static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e) +{ return _mfn(l2e_get_pfn(sl2e)); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e) +{ return _mfn(l3e_get_pfn(sl3e)); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e) +{ return _mfn(l4e_get_pfn(sl4e)); } +#endif +#endif + +static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e) +{ return l1e_get_flags(sl1e); } +static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e) +{ return l2e_get_flags(sl2e); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e) +{ return l3e_get_flags(sl3e); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e) +{ return l4e_get_flags(sl4e); } +#endif +#endif + +static inline shadow_l1e_t +shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags) +{ l1e_remove_flags(sl1e, flags); return sl1e; } + +static inline shadow_l1e_t shadow_l1e_empty(void) +{ return l1e_empty(); } +static inline shadow_l2e_t shadow_l2e_empty(void) +{ return l2e_empty(); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline shadow_l3e_t shadow_l3e_empty(void) +{ return l3e_empty(); } 
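As a quick worked example of how the SHADOW_Lx_PAGETABLE_SHIFT and _ENTRIES constants above are used, the following standalone sketch splits a virtual address into per-level table indices for the 2-level (1024-entry tables, 4kB pages) layout. The constants and names here are local to the example; the shadow code itself uses the shadow_lN_table_offset() and linear-offset macros defined just below.

#include <stdio.h>

/* 2-level layout from above, redefined locally for the example. */
#define L1_SHIFT   12
#define L2_SHIFT   22
#define L1_ENTRIES 1024
#define L2_ENTRIES 1024

int main(void)
{
    unsigned long va = 0xc0123456UL;

    unsigned int l2  = (va >> L2_SHIFT) & (L2_ENTRIES - 1); /* index into the L2 table */
    unsigned int l1  = (va >> L1_SHIFT) & (L1_ENTRIES - 1); /* index into the L1 table */
    unsigned int off = va & ((1UL << L1_SHIFT) - 1);        /* byte offset in the page */

    printf("va %#lx -> L2[%u] L1[%u] + %#x\n", va, l2, l1, off);
    return 0;
}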
+#if SHADOW_PAGING_LEVELS >= 4 +static inline shadow_l4e_t shadow_l4e_empty(void) +{ return l4e_empty(); } +#endif +#endif + +static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags) +{ return l1e_from_pfn(mfn_x(mfn), flags); } +static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags) +{ return l2e_from_pfn(mfn_x(mfn), flags); } +#if SHADOW_PAGING_LEVELS >= 3 +static inline shadow_l3e_t shadow_l3e_from_mfn(mfn_t mfn, u32 flags) +{ return l3e_from_pfn(mfn_x(mfn), flags); } +#if SHADOW_PAGING_LEVELS >= 4 +static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags) +{ return l4e_from_pfn(mfn_x(mfn), flags); } +#endif +#endif + +#define shadow_l1_table_offset(a) l1_table_offset(a) +#define shadow_l2_table_offset(a) l2_table_offset(a) +#define shadow_l3_table_offset(a) l3_table_offset(a) +#define shadow_l4_table_offset(a) l4_table_offset(a) + +/**************************************************************************/ +/* Access to the linear mapping of shadow page tables. */ + +/* Offsets into each level of the linear mapping for a virtual address. */ +#define shadow_l1_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT) +#define shadow_l2_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT) +#define shadow_l3_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT) +#define shadow_l4_linear_offset(_a) \ + (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT) + +/* Where to find each level of the linear mapping. For PV guests, we use + * the shadow linear-map self-entry as many times as we need. For HVM + * guests, the shadow doesn't have a linear-map self-entry so we must use + * the monitor-table's linear-map entry N-1 times and then the shadow-map + * entry once. */ +#define __sh2_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START)) +#define __sh2_linear_l2_table ((shadow_l2e_t *) \ + (__sh2_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START))) + +// shadow linear L3 and L4 tables only exist in 4 level paging... +#if SHADOW_PAGING_LEVELS == 4 +#define __sh2_linear_l3_table ((shadow_l3e_t *) \ + (__sh2_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START))) +#define __sh2_linear_l4_table ((shadow_l4e_t *) \ + (__sh2_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START))) +#endif + +#define sh2_linear_l1_table(v) ({ \ + ASSERT(current == (v)); \ + __sh2_linear_l1_table; \ +}) + +#define sh2_linear_l2_table(v) ({ \ + ASSERT(current == (v)); \ + ((shadow_l2e_t *) \ + (hvm_guest(v) ? __linear_l1_table : __sh2_linear_l1_table) + \ + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \ +}) + +// shadow linear L3 and L4 tables only exist in 4 level paging... +#if SHADOW_PAGING_LEVELS == 4 +#define sh2_linear_l3_table(v) ({ \ + ASSERT(current == (v)); \ + ((shadow_l3e_t *) \ + (hvm_guest(v) ? __linear_l2_table : __sh2_linear_l2_table) + \ + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \ +}) + +// we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is +// not defined for when xen_levels==4 & shadow_levels==3... +#define sh2_linear_l4_table(v) ({ \ + ASSERT(current == (v)); \ + ((l4_pgentry_t *) \ + (hvm_guest(v) ? 
__linear_l3_table : __sh2_linear_l3_table) + \ + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \ +}) +#endif + +#if GUEST_PAGING_LEVELS == 2 + +#include + +#define GUEST_L1_PAGETABLE_ENTRIES 1024 +#define GUEST_L2_PAGETABLE_ENTRIES 1024 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 22 + +/* Type of the guest's frame numbers */ +TYPE_SAFE(u32,gfn) +#define INVALID_GFN ((u32)(-1u)) +#define SH2_PRI_gfn "05x" + +/* Types of the guest's page tables */ +typedef l1_pgentry_32_t guest_l1e_t; +typedef l2_pgentry_32_t guest_l2e_t; + +/* Access functions for them */ +static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) +{ return l1e_get_paddr_32(gl1e); } +static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) +{ return l2e_get_paddr_32(gl2e); } + +static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) +{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); } +static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) +{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); } + +static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) +{ return l1e_get_flags_32(gl1e); } +static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) +{ return l2e_get_flags_32(gl2e); } + +static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) +{ l1e_add_flags_32(gl1e, flags); return gl1e; } +static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) +{ l2e_add_flags_32(gl2e, flags); return gl2e; } + +static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) +{ return l1e_from_pfn_32(gfn_x(gfn), flags); } +static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) +{ return l2e_from_pfn_32(gfn_x(gfn), flags); } + +#define guest_l1_table_offset(a) l1_table_offset_32(a) +#define guest_l2_table_offset(a) l2_table_offset_32(a) + +/* The shadow types needed for the various levels. 
*/ +#define PGC_SH2_l1_shadow PGC_SH2_l1_32_shadow +#define PGC_SH2_l2_shadow PGC_SH2_l2_32_shadow +#define PGC_SH2_fl1_shadow PGC_SH2_fl1_32_shadow + +#else /* GUEST_PAGING_LEVELS != 2 */ + +#if GUEST_PAGING_LEVELS == 3 +#define GUEST_L1_PAGETABLE_ENTRIES 512 +#define GUEST_L2_PAGETABLE_ENTRIES 512 +#define GUEST_L3_PAGETABLE_ENTRIES 4 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 21 +#define GUEST_L3_PAGETABLE_SHIFT 30 +#else /* GUEST_PAGING_LEVELS == 4 */ +#define GUEST_L1_PAGETABLE_ENTRIES 512 +#define GUEST_L2_PAGETABLE_ENTRIES 512 +#define GUEST_L3_PAGETABLE_ENTRIES 512 +#define GUEST_L4_PAGETABLE_ENTRIES 512 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 21 +#define GUEST_L3_PAGETABLE_SHIFT 30 +#define GUEST_L4_PAGETABLE_SHIFT 39 +#endif + +/* Type of the guest's frame numbers */ +TYPE_SAFE(unsigned long,gfn) +#define INVALID_GFN ((unsigned long)(-1ul)) +#define SH2_PRI_gfn "05lx" + +/* Types of the guest's page tables */ +typedef l1_pgentry_t guest_l1e_t; +typedef l2_pgentry_t guest_l2e_t; +typedef l3_pgentry_t guest_l3e_t; +#if GUEST_PAGING_LEVELS >= 4 +typedef l4_pgentry_t guest_l4e_t; +#endif + +/* Access functions for them */ +static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) +{ return l1e_get_paddr(gl1e); } +static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) +{ return l2e_get_paddr(gl2e); } +static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e) +{ return l3e_get_paddr(gl3e); } +#if GUEST_PAGING_LEVELS >= 4 +static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e) +{ return l4e_get_paddr(gl4e); } +#endif + +static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) +{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); } +static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) +{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); } +static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e) +{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); } +#if GUEST_PAGING_LEVELS >= 4 +static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e) +{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); } +#endif + +static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) +{ return l1e_get_flags(gl1e); } +static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) +{ return l2e_get_flags(gl2e); } +static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e) +{ return l3e_get_flags(gl3e); } +#if GUEST_PAGING_LEVELS >= 4 +static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e) +{ return l4e_get_flags(gl4e); } +#endif + +static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) +{ l1e_add_flags(gl1e, flags); return gl1e; } +static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) +{ l2e_add_flags(gl2e, flags); return gl2e; } +static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags) +{ l3e_add_flags(gl3e, flags); return gl3e; } +#if GUEST_PAGING_LEVELS >= 4 +static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags) +{ l4e_add_flags(gl4e, flags); return gl4e; } +#endif + +static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) +{ return l1e_from_pfn(gfn_x(gfn), flags); } +static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) +{ return l2e_from_pfn(gfn_x(gfn), flags); } +static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags) +{ return l3e_from_pfn(gfn_x(gfn), flags); } +#if GUEST_PAGING_LEVELS >= 4 +static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags) +{ return l4e_from_pfn(gfn_x(gfn), flags); } +#endif + +#define guest_l1_table_offset(a) 
l1_table_offset(a) +#define guest_l2_table_offset(a) l2_table_offset(a) +#define guest_l3_table_offset(a) l3_table_offset(a) +#define guest_l4_table_offset(a) l4_table_offset(a) + +/* The shadow types needed for the various levels. */ +#if GUEST_PAGING_LEVELS == 3 +#define PGC_SH2_l1_shadow PGC_SH2_l1_pae_shadow +#define PGC_SH2_fl1_shadow PGC_SH2_fl1_pae_shadow +#define PGC_SH2_l2_shadow PGC_SH2_l2_pae_shadow +#define PGC_SH2_l2h_shadow PGC_SH2_l2h_pae_shadow +#define PGC_SH2_l3_shadow PGC_SH2_l3_pae_shadow +#else +#define PGC_SH2_l1_shadow PGC_SH2_l1_64_shadow +#define PGC_SH2_fl1_shadow PGC_SH2_fl1_64_shadow +#define PGC_SH2_l2_shadow PGC_SH2_l2_64_shadow +#define PGC_SH2_l3_shadow PGC_SH2_l3_64_shadow +#define PGC_SH2_l4_shadow PGC_SH2_l4_64_shadow +#endif + +#endif /* GUEST_PAGING_LEVELS != 2 */ + +#define VALID_GFN(m) (m != INVALID_GFN) + +static inline int +valid_gfn(gfn_t m) +{ + return VALID_GFN(gfn_x(m)); +} + +#if GUEST_PAGING_LEVELS == 2 +#define PGC_SH2_guest_root_type PGC_SH2_l2_32_shadow +#elif GUEST_PAGING_LEVELS == 3 +#define PGC_SH2_guest_root_type PGC_SH2_l3_pae_shadow +#else +#define PGC_SH2_guest_root_type PGC_SH2_l4_64_shadow +#endif + +/* Translation between mfns and gfns */ +static inline mfn_t +vcpu_gfn_to_mfn(struct vcpu *v, gfn_t gfn) +{ + return sh2_vcpu_gfn_to_mfn(v, gfn_x(gfn)); +} + +static inline gfn_t +mfn_to_gfn(struct domain *d, mfn_t mfn) +{ + return _gfn(sh2_mfn_to_gfn(d, mfn)); +} + +static inline paddr_t +gfn_to_paddr(gfn_t gfn) +{ + return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT; +} + +/* Type used for recording a walk through guest pagetables. It is + * filled in by the pagetable walk function, and also used as a cache + * for later walks. + * Any non-null pointer in this structure represents a mapping of guest + * memory. We must always call walk_init() before using a walk_t, and + * call walk_unmap() when we're done. + * The "Effective l1e" field is used when there isn't an l1e to point to, + * but we have fabricated an l1e for propagation to the shadow (e.g., + * for splintering guest superpages into many shadow l1 entries). */ +typedef struct shadow2_walk_t walk_t; +struct shadow2_walk_t +{ + unsigned long va; /* Address we were looking for */ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + guest_l4e_t *l4e; /* Pointer to guest's level 4 entry */ +#endif + guest_l3e_t *l3e; /* Pointer to guest's level 3 entry */ +#endif + guest_l2e_t *l2e; /* Pointer to guest's level 2 entry */ + guest_l1e_t *l1e; /* Pointer to guest's level 1 entry */ + guest_l1e_t eff_l1e; /* Effective level 1 entry */ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + mfn_t l4mfn; /* MFN that the level 4 entry is in */ +#endif + mfn_t l3mfn; /* MFN that the level 3 entry is in */ +#endif + mfn_t l2mfn; /* MFN that the level 2 entry is in */ + mfn_t l1mfn; /* MFN that the level 1 entry is in */ +}; + + +/* X86 error code bits: + * These bits certainly ought to be defined somewhere other than here, + * but until that place is determined, here they sit. 
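The walk_t just defined is easiest to picture as a per-level cache of a single guest pagetable walk. The sketch below models the same idea in plain C with a toy two-level "guest" made of in-memory arrays: it is a simplified model only (no mappings, mfns, flags or superpage handling), meant to show how the per-level pointers and the effective l1e get filled in; all names in it are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define ENTRIES 1024

/* Toy PTE: (index of next table, or frame number) << 1, plus a present bit. */
typedef uint32_t toy_pte_t;

typedef struct {
    unsigned long va;    /* address we were looking for */
    toy_pte_t *l2e;      /* pointer to the L2 entry used */
    toy_pte_t *l1e;      /* pointer to the L1 entry used */
    toy_pte_t eff_l1e;   /* copy of the L1 entry actually found */
} toy_walk_t;

static int toy_walk(toy_pte_t *l2, toy_pte_t *l1_tables[], unsigned long va,
                    toy_walk_t *gw)
{
    gw->va  = va;
    gw->l2e = &l2[(va >> 22) & (ENTRIES - 1)];
    if ( !(*gw->l2e & 1) )
        return 0;                                   /* not present at L2 */
    gw->l1e = &l1_tables[*gw->l2e >> 1][(va >> 12) & (ENTRIES - 1)];
    gw->eff_l1e = *gw->l1e;                         /* cache what we found */
    return (*gw->l1e & 1);
}

int main(void)
{
    static toy_pte_t l1[ENTRIES], l2[ENTRIES];
    toy_pte_t *l1s[] = { l1 };
    toy_walk_t gw;

    l2[0x300] = (0u << 1) | 1;                      /* L2[0x300] -> l1s[0] */
    l1[0x123] = (0x5555u << 1) | 1;                 /* L1[0x123] -> frame 0x5555 */

    if ( toy_walk(l2, l1s, 0xc0123456UL, &gw) )
        printf("va %#lx maps to frame %#x\n", gw.va, (unsigned)(gw.eff_l1e >> 1));
    return 0;
}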
+ * + * "PFEC" == "Page Fault Error Code" + */ +#define X86_PFEC_PRESENT 1 /* 0 == page was not present */ +#define X86_PFEC_WRITE_FAULT 2 /* 0 == reading, 1 == writing */ +#define X86_PFEC_SUPERVISOR_FAULT 4 /* 0 == supervisor-mode, 1 == user */ +#define X86_PFEC_RESERVED_BIT_FAULT 8 /* 1 == reserved bits set in pte */ +#define X86_PFEC_INSN_FETCH_FAULT 16 /* 0 == normal, 1 == instr'n fetch */ + +/* macros for dealing with the naming of the internal function names of the + * shadow code's external entry points. + */ +#define INTERNAL_NAME(name) \ + SHADOW2_INTERNAL_NAME(name, SHADOW_PAGING_LEVELS, GUEST_PAGING_LEVELS) + +/* macros for renaming the primary entry points, so that they are more + * easily distinguished from a debugger + */ +#define sh2_page_fault INTERNAL_NAME(sh2_page_fault) +#define sh2_invlpg INTERNAL_NAME(sh2_invlpg) +#define sh2_gva_to_gpa INTERNAL_NAME(sh2_gva_to_gpa) +#define sh2_gva_to_gfn INTERNAL_NAME(sh2_gva_to_gfn) +#define sh2_update_cr3 INTERNAL_NAME(sh2_update_cr3) +#define sh2_remove_write_access INTERNAL_NAME(sh2_remove_write_access) +#define sh2_remove_all_mappings INTERNAL_NAME(sh2_remove_all_mappings) +#define sh2_remove_l1_shadow INTERNAL_NAME(sh2_remove_l1_shadow) +#define sh2_remove_l2_shadow INTERNAL_NAME(sh2_remove_l2_shadow) +#define sh2_remove_l3_shadow INTERNAL_NAME(sh2_remove_l3_shadow) +#define sh2_map_and_validate_gl4e INTERNAL_NAME(sh2_map_and_validate_gl4e) +#define sh2_map_and_validate_gl3e INTERNAL_NAME(sh2_map_and_validate_gl3e) +#define sh2_map_and_validate_gl2e INTERNAL_NAME(sh2_map_and_validate_gl2e) +#define sh2_map_and_validate_gl2he INTERNAL_NAME(sh2_map_and_validate_gl2he) +#define sh2_map_and_validate_gl1e INTERNAL_NAME(sh2_map_and_validate_gl1e) +#define sh2_destroy_l4_shadow INTERNAL_NAME(sh2_destroy_l4_shadow) +#define sh2_destroy_l3_shadow INTERNAL_NAME(sh2_destroy_l3_shadow) +#define sh2_destroy_l3_subshadow INTERNAL_NAME(sh2_destroy_l3_subshadow) +#define sh2_unpin_all_l3_subshadows INTERNAL_NAME(sh2_unpin_all_l3_subshadows) +#define sh2_destroy_l2_shadow INTERNAL_NAME(sh2_destroy_l2_shadow) +#define sh2_destroy_l1_shadow INTERNAL_NAME(sh2_destroy_l1_shadow) +#define sh2_unhook_32b_mappings INTERNAL_NAME(sh2_unhook_32b_mappings) +#define sh2_unhook_pae_mappings INTERNAL_NAME(sh2_unhook_pae_mappings) +#define sh2_unhook_64b_mappings INTERNAL_NAME(sh2_unhook_64b_mappings) +#define shadow2_entry INTERNAL_NAME(shadow2_entry) +#define sh2_detach_old_tables INTERNAL_NAME(sh2_detach_old_tables) +#define sh2_x86_emulate_write INTERNAL_NAME(sh2_x86_emulate_write) +#define sh2_x86_emulate_cmpxchg INTERNAL_NAME(sh2_x86_emulate_cmpxchg) +#define sh2_x86_emulate_cmpxchg8b INTERNAL_NAME(sh2_x86_emulate_cmpxchg8b) +#define sh2_audit_l1_table INTERNAL_NAME(sh2_audit_l1_table) +#define sh2_audit_fl1_table INTERNAL_NAME(sh2_audit_fl1_table) +#define sh2_audit_l2_table INTERNAL_NAME(sh2_audit_l2_table) +#define sh2_audit_l3_table INTERNAL_NAME(sh2_audit_l3_table) +#define sh2_audit_l4_table INTERNAL_NAME(sh2_audit_l4_table) +#define sh2_guess_wrmap INTERNAL_NAME(sh2_guess_wrmap) +#define sh2_clear_shadow_entry INTERNAL_NAME(sh2_clear_shadow_entry) + +/* sh2_make_monitor_table only depends on the number of shadow levels */ +#define sh2_make_monitor_table \ + SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, \ + SHADOW_PAGING_LEVELS, \ + SHADOW_PAGING_LEVELS) +#define sh2_destroy_monitor_table \ + SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, \ + SHADOW_PAGING_LEVELS, \ + SHADOW_PAGING_LEVELS) + + +#if GUEST_PAGING_LEVELS == 3 +/* + * Accounting 
information stored in the shadow of PAE Guest L3 pages. + * Because these "L3 pages" are only 32-bytes, it is inconvenient to keep + * various refcounts, etc., on the page_info of their page. We provide extra + * bookkeeping space in the shadow itself, and this is the structure + * definition for that bookkeeping information. + */ +struct pae_l3_bookkeeping { + u32 vcpus; /* bitmap of which vcpus are currently storing + * copies of this 32-byte page */ + u32 refcount; /* refcount for this 32-byte page */ + u8 pinned; /* is this 32-byte page pinned or not? */ +}; + +// Convert a shadow entry pointer into a pae_l3_bookkeeping pointer. +#define sl3p_to_info(_ptr) ((struct pae_l3_bookkeeping *) \ + (((unsigned long)(_ptr) & ~31) + 32)) + +static void sh2_destroy_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e); + +/* Increment a subshadow ref + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. */ +static inline void sh2_get_ref_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); + + /* First ref to the subshadow takes a ref to the full shadow */ + if ( bk->refcount == 0 ) + sh2_get_ref(smfn, 0); + if ( unlikely(++(bk->refcount) == 0) ) + { + SHADOW2_PRINTK("shadow l3 subshadow ref overflow, smfn=%" SH2_PRI_mfn " sh=%p\n", + mfn_x(smfn), sl3e); + domain_crash_synchronous(); + } +} + +/* Decrement a subshadow ref. + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. Calling this may cause the + * entire shadow to disappear, so the caller must immediately unmap + * the pointer after calling. */ +static inline void sh2_put_ref_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e, + mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk; + + bk = sl3p_to_info(sl3e); + + ASSERT(bk->refcount > 0); + if ( --(bk->refcount) == 0 ) + { + /* Need to destroy this subshadow */ + sh2_destroy_l3_subshadow(v, sl3e); + /* Last ref to the subshadow had a ref to the full shadow */ + sh2_put_ref(v, smfn, 0); + } +} + +/* Pin a subshadow + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. */ +static inline void sh2_pin_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); + +#if 0 + debugtrace_printk("%s smfn=%05lx offset=%ld\n", + __func__, mfn_x(smfn), + ((unsigned long)sl3e & ~PAGE_MASK) / 64); +#endif + + if ( !bk->pinned ) + { + bk->pinned = 1; + sh2_get_ref_l3_subshadow(sl3e, smfn); + } +} + +/* Unpin a sub-shadow. + * Called with a pointer to the subshadow, and the mfn of the + * *first* page of the overall shadow. Calling this may cause the + * entire shadow to disappear, so the caller must immediately unmap + * the pointer after calling. 
*/ +static inline void sh2_unpin_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e, + mfn_t smfn) +{ + struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e); + +#if 0 + debugtrace_printk("%s smfn=%05lx offset=%ld\n", + __func__, mfn_x(smfn), + ((unsigned long)sl3e & ~PAGE_MASK) / 64); +#endif + + if ( bk->pinned ) + { + bk->pinned = 0; + sh2_put_ref_l3_subshadow(v, sl3e, smfn); + } +} + +#endif /* GUEST_PAGING_LEVELS == 3 */ + +#if SHADOW_PAGING_LEVELS == 3 +#define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20) +#endif + +#if SHADOW_PAGING_LEVELS == 2 +#define SH2_PRI_pte "08x" +#else /* SHADOW_PAGING_LEVELS >= 3 */ +#ifndef __x86_64__ +#define SH2_PRI_pte "016llx" +#else +#define SH2_PRI_pte "016lx" +#endif +#endif /* SHADOW_PAGING_LEVELS >= 3 */ + +#if GUEST_PAGING_LEVELS == 2 +#define SH2_PRI_gpte "08x" +#else /* GUEST_PAGING_LEVELS >= 3 */ +#ifndef __x86_64__ +#define SH2_PRI_gpte "016llx" +#else +#define SH2_PRI_gpte "016lx" +#endif +#endif /* GUEST_PAGING_LEVELS >= 3 */ + +static inline u32 +accumulate_guest_flags(walk_t *gw) +{ + u32 accumulated_flags; + + // We accumulate the permission flags with bitwise ANDing. + // This works for the PRESENT bit, RW bit, and USER bit. + // For the NX bit, however, the polarity is wrong, so we accumulate the + // inverse of the NX bit. + // + accumulated_flags = guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT; + accumulated_flags &= guest_l2e_get_flags(*gw->l2e) ^ _PAGE_NX_BIT; + + // Note that PAE guests do not have USER or RW or NX bits in their L3s. + // +#if GUEST_PAGING_LEVELS == 3 + accumulated_flags &= + ~_PAGE_PRESENT | (guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT); +#elif GUEST_PAGING_LEVELS >= 4 + accumulated_flags &= guest_l3e_get_flags(*gw->l3e) ^ _PAGE_NX_BIT; + accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT; +#endif + + // Finally, revert the NX bit back to its original polarity + accumulated_flags ^= _PAGE_NX_BIT; + + return accumulated_flags; +} + +#endif /* _XEN_SHADOW2_TYPES_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/include/asm-x86/shadow2.h b/xen/include/asm-x86/shadow2.h new file mode 100644 index 0000000000..94de7781f8 --- /dev/null +++ b/xen/include/asm-x86/shadow2.h @@ -0,0 +1,627 @@ +/****************************************************************************** + * include/asm-x86/shadow2.h + * + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XEN_SHADOW2_H +#define _XEN_SHADOW2_H + +#include +#include +#include +#include + +/* Shadow PT operation mode : shadow-mode variable in arch_domain. 
*/ + +#define SHM2_shift 10 +/* We're in one of the shadow modes */ +#define SHM2_enable (DOM0_SHADOW2_CONTROL_FLAG_ENABLE << SHM2_shift) +/* Refcounts based on shadow tables instead of guest tables */ +#define SHM2_refcounts (DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT << SHM2_shift) +/* Enable log dirty mode */ +#define SHM2_log_dirty (DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY << SHM2_shift) +/* Xen does p2m translation, not guest */ +#define SHM2_translate (DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE << SHM2_shift) +/* Xen does not steal address space from the domain for its own booking; + * requires VT or similar mechanisms */ +#define SHM2_external (DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL << SHM2_shift) + +#define shadow2_mode_enabled(_d) ((_d)->arch.shadow2_mode) +#define shadow2_mode_refcounts(_d) ((_d)->arch.shadow2_mode & SHM2_refcounts) +#define shadow2_mode_log_dirty(_d) ((_d)->arch.shadow2_mode & SHM2_log_dirty) +#define shadow2_mode_translate(_d) ((_d)->arch.shadow2_mode & SHM2_translate) +#define shadow2_mode_external(_d) ((_d)->arch.shadow2_mode & SHM2_external) + +/* Xen traps & emulates all reads of all page table pages: + *not yet supported + */ +#define shadow2_mode_trap_reads(_d) ({ (void)(_d); 0; }) + +// flags used in the return value of the shadow_set_lXe() functions... +#define SHADOW2_SET_CHANGED 0x1 +#define SHADOW2_SET_FLUSH 0x2 +#define SHADOW2_SET_ERROR 0x4 +#define SHADOW2_SET_L3PAE_RECOPY 0x8 + +// How do we tell that we have a 32-bit PV guest in a 64-bit Xen? +#ifdef __x86_64__ +#define pv_32bit_guest(_v) 0 // not yet supported +#else +#define pv_32bit_guest(_v) !hvm_guest(v) +#endif + +/* The shadow2 lock. + * + * This lock is per-domain. It is intended to allow us to make atomic + * updates to the software TLB that the shadow tables provide. + * + * Specifically, it protects: + * - all changes to shadow page table pages + * - the shadow hash table + * - the shadow page allocator + * - all changes to guest page table pages; if/when the notion of + * out-of-sync pages is added to this code, then the shadow lock is + * protecting all guest page table pages which are not listed as + * currently as both guest-writable and out-of-sync... + * XXX -- need to think about this relative to writable page tables. + * - all changes to the page_info->tlbflush_timestamp + * - the page_info->count fields on shadow pages + * - the shadow dirty bit array and count + * - XXX + */ +#ifndef CONFIG_SMP +#error shadow2.h currently requires CONFIG_SMP +#endif + +#define shadow2_lock_init(_d) \ + do { \ + spin_lock_init(&(_d)->arch.shadow2_lock); \ + (_d)->arch.shadow2_locker = -1; \ + (_d)->arch.shadow2_locker_function = "nobody"; \ + } while (0) + +#define shadow2_lock_is_acquired(_d) \ + (current->processor == (_d)->arch.shadow2_locker) + +#define shadow2_lock(_d) \ + do { \ + if ( unlikely((_d)->arch.shadow2_locker == current->processor) ) \ + { \ + printk("Error: shadow2 lock held by %s\n", \ + (_d)->arch.shadow2_locker_function); \ + BUG(); \ + } \ + spin_lock(&(_d)->arch.shadow2_lock); \ + ASSERT((_d)->arch.shadow2_locker == -1); \ + (_d)->arch.shadow2_locker = current->processor; \ + (_d)->arch.shadow2_locker_function = __func__; \ + } while (0) + +#define shadow2_unlock(_d) \ + do { \ + ASSERT((_d)->arch.shadow2_locker == current->processor); \ + (_d)->arch.shadow2_locker = -1; \ + (_d)->arch.shadow2_locker_function = "nobody"; \ + spin_unlock(&(_d)->arch.shadow2_lock); \ + } while (0) + +/* + * Levels of self-test and paranoia + * XXX should go in config files somewhere? 
+ */ +#define SHADOW2_AUDIT_HASH 0x01 /* Check current hash bucket */ +#define SHADOW2_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */ +#define SHADOW2_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */ +#define SHADOW2_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */ +#define SHADOW2_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */ +#define SHADOW2_AUDIT_P2M 0x20 /* Check the p2m table */ + +#ifdef NDEBUG +#define SHADOW2_AUDIT 0 +#define SHADOW2_AUDIT_ENABLE 0 +#else +#define SHADOW2_AUDIT 0x15 /* Basic audit of all except p2m. */ +#define SHADOW2_AUDIT_ENABLE shadow2_audit_enable +extern int shadow2_audit_enable; +#endif + +/* + * Levels of optimization + * XXX should go in config files somewhere? + */ +#define SH2OPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */ +#define SH2OPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */ + +#define SHADOW2_OPTIMIZATIONS 0x03 + + +/* With shadow pagetables, the different kinds of address start + * to get get confusing. + * + * Virtual addresses are what they usually are: the addresses that are used + * to accessing memory while the guest is running. The MMU translates from + * virtual addresses to machine addresses. + * + * (Pseudo-)physical addresses are the abstraction of physical memory the + * guest uses for allocation and so forth. For the purposes of this code, + * we can largely ignore them. + * + * Guest frame numbers (gfns) are the entries that the guest puts in its + * pagetables. For normal paravirtual guests, they are actual frame numbers, + * with the translation done by the guest. + * + * Machine frame numbers (mfns) are the entries that the hypervisor puts + * in the shadow page tables. + * + * Elsewhere in the xen code base, the name "gmfn" is generally used to refer + * to a "machine frame number, from the guest's perspective", or in other + * words, pseudo-physical frame numbers. However, in the shadow code, the + * term "gmfn" means "the mfn of a guest page"; this combines naturally with + * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a + * guest L2 page), etc... + */ + +/* With this defined, we do some ugly things to force the compiler to + * give us type safety between mfns and gfns and other integers. + * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions + * that translate beween int and foo_t. + * + * It does have some performance cost because the types now have + * a different storage attribute, so may not want it on all the time. */ +#ifndef NDEBUG +#define TYPE_SAFETY 1 +#endif + +#ifdef TYPE_SAFETY +#define TYPE_SAFE(_type,_name) \ +typedef struct { _type _name; } _name##_t; \ +static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \ +static inline _type _name##_x(_name##_t n) { return n._name; } +#else +#define TYPE_SAFE(_type,_name) \ +typedef _type _name##_t; \ +static inline _name##_t _##_name(_type n) { return n; } \ +static inline _type _name##_x(_name##_t n) { return n; } +#endif + +TYPE_SAFE(unsigned long,mfn) +#define SH2_PRI_mfn "05lx" + +static inline int +valid_mfn(mfn_t m) +{ + return VALID_MFN(mfn_x(m)); +} + +static inline mfn_t +pagetable_get_mfn(pagetable_t pt) +{ + return _mfn(pagetable_get_pfn(pt)); +} + +static inline pagetable_t +pagetable_from_mfn(mfn_t mfn) +{ + return pagetable_from_pfn(mfn_x(mfn)); +} + +static inline int +shadow2_vcpu_mode_translate(struct vcpu *v) +{ + // Returns true if this VCPU needs to be using the P2M table to translate + // between GFNs and MFNs. 
+ // + // This is true of translated HVM domains on a vcpu which has paging + // enabled. (HVM vcpu's with paging disabled are using the p2m table as + // its paging table, so no translation occurs in this case.) + // + return v->vcpu_flags & VCPUF_shadow2_translate; +} + + +/**************************************************************************/ +/* Mode-specific entry points into the shadow code */ + +struct x86_emulate_ctxt; +struct shadow2_entry_points { + int (*page_fault )(struct vcpu *v, unsigned long va, + struct cpu_user_regs *regs); + int (*invlpg )(struct vcpu *v, unsigned long va); + unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va); + unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va); + void (*update_cr3 )(struct vcpu *v); + int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + void (*detach_old_tables )(struct vcpu *v); + int (*x86_emulate_write )(struct vcpu *v, unsigned long va, + void *src, u32 bytes, + struct x86_emulate_ctxt *ctxt); + int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt); + int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt); + mfn_t (*make_monitor_table )(struct vcpu *v); + void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn); +#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC + int (*guess_wrmap )(struct vcpu *v, + unsigned long vaddr, mfn_t gmfn); +#endif + /* For outsiders to tell what mode we're in */ + unsigned int shadow_levels; + unsigned int guest_levels; +}; + +static inline int shadow2_guest_paging_levels(struct vcpu *v) +{ + ASSERT(v->arch.shadow2 != NULL); + return v->arch.shadow2->guest_levels; +} + +/**************************************************************************/ +/* Entry points into the shadow code */ + +/* Turning on shadow2 test mode */ +int shadow2_test_enable(struct domain *d); + +/* Handler for shadow control ops: enabling and disabling shadow modes, + * and log-dirty bitmap ops all happen through here. 
*/ +int shadow2_control_op(struct domain *d, + dom0_shadow_control_t *sc, + XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op); + +/* Call when destroying a domain */ +void shadow2_teardown(struct domain *d); + +/* Call once all of the references to the domain have gone away */ +void shadow2_final_teardown(struct domain *d); + + +/* Mark a page as dirty in the bitmap */ +void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn); +static inline void mark_dirty(struct domain *d, unsigned long gmfn) +{ + if ( shadow2_mode_log_dirty(d) ) + { + shadow2_lock(d); + sh2_do_mark_dirty(d, _mfn(gmfn)); + shadow2_unlock(d); + } +} + +/* Internal version, for when the shadow lock is already held */ +static inline void sh2_mark_dirty(struct domain *d, mfn_t gmfn) +{ + ASSERT(shadow2_lock_is_acquired(d)); + if ( shadow2_mode_log_dirty(d) ) + sh2_do_mark_dirty(d, gmfn); +} + +static inline int +shadow2_fault(unsigned long va, struct cpu_user_regs *regs) +/* Called from pagefault handler in Xen, and from the HVM trap handlers + * for pagefaults. Returns 1 if this fault was an artefact of the + * shadow code (and the guest should retry) or 0 if it is not (and the + * fault should be handled elsewhere or passed to the guest). */ +{ + struct vcpu *v = current; + perfc_incrc(shadow2_fault); + return v->arch.shadow2->page_fault(v, va, regs); +} + +static inline int +shadow2_invlpg(struct vcpu *v, unsigned long va) +/* Called when the guest requests an invlpg. Returns 1 if the invlpg + * instruction should be issued on the hardware, or 0 if it's safe not + * to do so. */ +{ + return v->arch.shadow2->invlpg(v, va); +} + +static inline unsigned long +shadow2_gva_to_gpa(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + return v->arch.shadow2->gva_to_gpa(v, va); +} + +static inline unsigned long +shadow2_gva_to_gfn(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + return v->arch.shadow2->gva_to_gfn(v, va); +} + +static inline void +shadow2_update_cr3(struct vcpu *v) +/* Updates all the things that are derived from the guest's CR3. + * Called when the guest changes CR3. */ +{ + shadow2_lock(v->domain); + v->arch.shadow2->update_cr3(v); + shadow2_unlock(v->domain); +} + + +/* Should be called after CR3 is updated. + * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3. + * + * Also updates other state derived from CR3 (vcpu->arch.guest_vtable, + * shadow_vtable, etc). + * + * Uses values found in vcpu->arch.(guest_table and guest_table_user), and + * for HVM guests, arch.monitor_table and hvm's guest CR3. + * + * Update ref counts to shadow tables appropriately. + * For PAE, relocate L3 entries, if necessary, into low memory. + */ +static inline void update_cr3(struct vcpu *v) +{ + unsigned long cr3_mfn=0; + + if ( shadow2_mode_enabled(v->domain) ) + { + shadow2_update_cr3(v); + return; + } + +#if CONFIG_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user); + else +#endif + cr3_mfn = pagetable_get_pfn(v->arch.guest_table); + + /* Update vcpu->arch.cr3 */ + BUG_ON(cr3_mfn == 0); + make_cr3(v, cr3_mfn); +} + +extern void sh2_update_paging_modes(struct vcpu *v); + +/* Should be called to initialise paging structures if the paging mode + * has changed, and when bringing up a VCPU for the first time. 
*/ +static inline void shadow2_update_paging_modes(struct vcpu *v) +{ + ASSERT(shadow2_mode_enabled(v->domain)); + shadow2_lock(v->domain); + sh2_update_paging_modes(v); + shadow2_unlock(v->domain); +} + +static inline void +shadow2_detach_old_tables(struct vcpu *v) +{ + v->arch.shadow2->detach_old_tables(v); +} + +static inline mfn_t +shadow2_make_monitor_table(struct vcpu *v) +{ + return v->arch.shadow2->make_monitor_table(v); +} + +static inline void +shadow2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) +{ + v->arch.shadow2->destroy_monitor_table(v, mmfn); +} + +/* Validate a pagetable change from the guest and update the shadows. */ +extern int shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry); + +/* Update the shadows in response to a pagetable write from a HVM guest */ +extern void shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size); + +/* Remove all writeable mappings of a guest frame from the shadows. + * Returns non-zero if we need to flush TLBs. + * level and fault_addr desribe how we found this to be a pagetable; + * level==0 means we have some other reason for revoking write access. */ +extern int shadow2_remove_write_access(struct vcpu *v, mfn_t readonly_mfn, + unsigned int level, + unsigned long fault_addr); + +/* Remove all mappings of the guest mfn from the shadows. + * Returns non-zero if we need to flush TLBs. */ +extern int shadow2_remove_all_mappings(struct vcpu *v, mfn_t target_mfn); + +void +shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn); +/* This is a HVM page that we thing is no longer a pagetable. + * Unshadow it, and recursively unshadow pages that reference it. */ + +/* Remove all shadows of the guest mfn. */ +extern void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all); +static inline void shadow2_remove_all_shadows(struct vcpu *v, mfn_t gmfn) +{ + sh2_remove_shadows(v, gmfn, 1); +} + +/* Add a page to a domain */ +void +shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn); + +/* Remove a page from a domain */ +void +shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn); + +/* + * Definitions for the shadow2_flags field in page_info. + * These flags are stored on *guest* pages... + * Bits 1-13 are encodings for the shadow types. 
+ */ +#define PGC_SH2_type_to_index(_type) ((_type) >> PGC_SH2_type_shift) +#define SH2F_page_type_mask \ + (((1u << PGC_SH2_type_to_index(PGC_SH2_max_shadow + 1u)) - 1u) - \ + ((1u << PGC_SH2_type_to_index(PGC_SH2_min_shadow)) - 1u)) + +#define SH2F_L1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_32_shadow)) +#define SH2F_FL1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_32_shadow)) +#define SH2F_L2_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_32_shadow)) +#define SH2F_L1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l1_pae_shadow)) +#define SH2F_FL1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_pae_shadow)) +#define SH2F_L2_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2_pae_shadow)) +#define SH2F_L2H_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2h_pae_shadow)) +#define SH2F_L3_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l3_pae_shadow)) +#define SH2F_L1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_64_shadow)) +#define SH2F_FL1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_64_shadow)) +#define SH2F_L2_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_64_shadow)) +#define SH2F_L3_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l3_64_shadow)) +#define SH2F_L4_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l4_64_shadow)) + +/* Used for hysteresis when automatically unhooking mappings on fork/exit */ +#define SH2F_unhooked_mappings (1u<<31) + +/* + * Allocation of shadow pages + */ + +/* Return the minumum acceptable number of shadow pages a domain needs */ +unsigned int shadow2_min_acceptable_pages(struct domain *d); + +/* Set the pool of shadow pages to the required number of MB. + * Input will be rounded up to at least min_acceptable_shadow_pages(). + * Returns 0 for success, 1 for failure. */ +unsigned int shadow2_set_allocation(struct domain *d, + unsigned int megabytes, + int *preempted); + +/* Return the size of the shadow2 pool, rounded up to the nearest MB */ +static inline unsigned int shadow2_get_allocation(struct domain *d) +{ + unsigned int pg = d->arch.shadow2_total_pages; + return ((pg >> (20 - PAGE_SHIFT)) + + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0)); +} + +/* + * Linked list for chaining entries in the shadow hash table. + */ +struct shadow2_hash_entry { + struct shadow2_hash_entry *next; + mfn_t smfn; /* MFN of the shadow */ +#ifdef _x86_64_ /* Shorten 'n' so we don't waste a whole word on storing 't' */ + unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */ +#else + unsigned long n; /* MFN of guest PT or GFN of guest superpage */ +#endif + unsigned char t; /* shadow type bits, or 0 for empty */ +}; + +#define SHADOW2_HASH_BUCKETS 251 +/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */ + + +#if SHADOW2_OPTIMIZATIONS & SH2OPT_CACHE_WALKS +/* Optimization: cache the results of guest walks. This helps with MMIO + * and emulated writes, which tend to issue very similar walk requests + * repeatedly. We keep the results of the last few walks, and blow + * away the cache on guest cr3 write, mode change, or page fault. 
+ */
+
+#define SH2_WALK_CACHE_ENTRIES 4
+
+/* Rather than cache a guest walk, which would include mapped pointers
+ * to pages, we cache what a TLB would remember about the walk: the
+ * permissions and the l1 gfn */
+struct shadow2_walk_cache {
+    unsigned long va;           /* The virtual address (or 0 == unused) */
+    unsigned long gfn;          /* The gfn from the effective l1e */
+    u32 permissions;            /* The aggregated permission bits */
+};
+#endif
+
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Walk another domain's P2M table, mapping pages as we go */
+extern mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
+
+
+/* General conversion function from gfn to mfn */
+static inline mfn_t
+sh2_gfn_to_mfn(struct domain *d, unsigned long gfn)
+{
+    if ( !shadow2_mode_translate(d) )
+        return _mfn(gfn);
+    else if ( likely(current->domain == d) )
+        return _mfn(get_mfn_from_gpfn(gfn));
+    else
+        return sh2_gfn_to_mfn_foreign(d, gfn);
+}
+
+// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty
+// little secret that, for hvm guests with paging disabled, nearly all of the
+// shadow code actually thinks that the guest is running on *untranslated* page
+// tables (which is actually domain->phys_table).
+//
+static inline mfn_t
+sh2_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn)
+{
+    if ( !shadow2_vcpu_mode_translate(v) )
+        return _mfn(gfn);
+    if ( likely(current->domain == v->domain) )
+        return _mfn(get_mfn_from_gpfn(gfn));
+    return sh2_gfn_to_mfn_foreign(v->domain, gfn);
+}
+
+static inline unsigned long
+sh2_mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+    if ( shadow2_mode_translate(d) )
+        return get_gpfn_from_mfn(mfn_x(mfn));
+    else
+        return mfn_x(mfn);
+}
+
+
+
+#endif /* _XEN_SHADOW2_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff --git a/xen/include/asm-x86/shadow_64.h b/xen/include/asm-x86/shadow_64.h
deleted file mode 100644
index d9afbdca18..0000000000
--- a/xen/include/asm-x86/shadow_64.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/******************************************************************************
- * include/asm-x86/shadow_64.h
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-/*
- * Jun Nakajima
- * Chengyuan Li
- *
- * Extended to support 64-bit guests.
- */ -#ifndef _XEN_SHADOW_64_H -#define _XEN_SHADOW_64_H -#include -#include -#include - -/* - * The naming convention of the shadow_ops: - * MODE___HANDLER - */ -extern struct shadow_ops MODE_64_2_HANDLER; -extern struct shadow_ops MODE_64_3_HANDLER; -extern struct shadow_ops MODE_64_PAE_HANDLER; -#if CONFIG_PAGING_LEVELS == 4 -extern struct shadow_ops MODE_64_4_HANDLER; -#endif - -#if CONFIG_PAGING_LEVELS == 3 -#define L4_PAGETABLE_SHIFT 39 -#define L4_PAGETABLE_ENTRIES (1<> 29) - -#define entry_get_value(_x) ((_x).lo) -#define entry_get_pfn(_x) \ - (((_x).lo & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT) -#define entry_get_paddr(_x) (((_x).lo & (PADDR_MASK&PAGE_MASK))) -#define entry_get_flags(_x) (get_pte_flags((_x).lo)) - -#define entry_empty() ((pgentry_64_t) { 0 }) -#define entry_from_pfn(pfn, flags) \ - ((pgentry_64_t) { ((intpte_t)(pfn) << PAGE_SHIFT) | put_pte_flags(flags) }) -#define entry_from_page(page, flags) (entry_from_pfn(page_to_mfn(page),(flags))) -#define entry_add_flags(x, flags) ((x).lo |= put_pte_flags(flags)) -#define entry_remove_flags(x, flags) ((x).lo &= ~put_pte_flags(flags)) -#define entry_has_changed(x,y,flags) \ - ( !!(((x).lo ^ (y).lo) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags(flags))) ) - -/******************************************************************************/ -/* - * The macro and inlines are for 32-bit PAE guest - */ -#define PAE_PDPT_RESERVED 0x1e6 /* [8:5], [2,1] */ - -#define PAE_SHADOW_SELF_ENTRY 259 -#define PAE_L3_PAGETABLE_ENTRIES 4 - -/******************************************************************************/ -static inline int table_offset_64(unsigned long va, int level) -{ - switch(level) { - case 1: - return (((va) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1)); - case 2: - return (((va) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1)); - case 3: - return (((va) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1)); -#if CONFIG_PAGING_LEVELS == 3 - case 4: - return PAE_SHADOW_SELF_ENTRY; -#endif - -#if CONFIG_PAGING_LEVELS >= 4 -#ifndef GUEST_PGENTRY_32 -#ifndef GUEST_32PAE - case 4: - return (((va) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1)); -#else - case 4: - return PAE_SHADOW_SELF_ENTRY; -#endif -#else - case 4: - return PAE_SHADOW_SELF_ENTRY; -#endif -#endif - default: - return -1; - } -} - -/*****************************************************************************/ - -#if defined( GUEST_32PAE ) -static inline int guest_table_offset_64(unsigned long va, int level, unsigned int index) -{ - switch(level) { - case 1: - return (((va) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1)); - case 2: - return (((va) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1)); - case 3: - return (index * 4 + ((va) >> L3_PAGETABLE_SHIFT)); -#if CONFIG_PAGING_LEVELS == 3 - case 4: - return PAE_SHADOW_SELF_ENTRY; -#endif - -#if CONFIG_PAGING_LEVELS >= 4 -#ifndef GUEST_PGENTRY_32 - case 4: - return (((va) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1)); -#else - case 4: - return PAE_SHADOW_SELF_ENTRY; -#endif -#endif - default: - return -1; - } -} - -#define SH_GUEST_32PAE 1 -#else -#define guest_table_offset_64(va, level, index) \ - table_offset_64((va),(level)) -#define SH_GUEST_32PAE 0 -#endif - -/********************************************************************************/ - -static inline void free_out_of_sync_state(struct domain *d) -{ - struct out_of_sync_entry *entry; - - // NB: Be careful not to call something that manipulates this list - // while walking it. 
Remove one item at a time, and always - // restart from start of list. - // - while ( (entry = d->arch.out_of_sync) ) - { - d->arch.out_of_sync = entry->next; - release_out_of_sync_entry(d, entry); - - entry->next = d->arch.out_of_sync_free; - d->arch.out_of_sync_free = entry; - } -} - -static inline int __entry( - struct vcpu *v, unsigned long va, pgentry_64_t *e_p, u32 flag) -{ - int i; - pgentry_64_t *le_e; - pgentry_64_t *le_p = NULL; - pgentry_64_t *phys_vtable = NULL; - unsigned long mfn; - int index; - u32 level = flag & L_MASK; - struct domain *d = v->domain; - int root_level; - unsigned int base_idx; - - base_idx = get_cr3_idxval(v); - - if ( flag & SHADOW_ENTRY ) - { - root_level = ROOT_LEVEL_64; - index = table_offset_64(va, root_level); - le_e = (pgentry_64_t *)&v->arch.shadow_vtable[index]; - } - else if ( flag & GUEST_ENTRY ) - { - root_level = v->domain->arch.ops->guest_paging_levels; - if ( root_level == PAGING_L3 ) - index = guest_table_offset_64(va, PAGING_L3, base_idx); - else - index = guest_table_offset_64(va, root_level, base_idx); - le_e = (pgentry_64_t *)&v->arch.guest_vtable[index]; - } - else /* direct mode */ - { - root_level = PAE_PAGING_LEVELS; - index = table_offset_64(va, root_level); - phys_vtable = (pgentry_64_t *)map_domain_page( - pagetable_get_pfn(v->domain->arch.phys_table)); - le_e = &phys_vtable[index]; - } - - /* - * If it's not external mode, then mfn should be machine physical. - */ - for ( i = root_level - level; i > 0; i-- ) - { - if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) ) - { - if ( le_p ) - unmap_domain_page(le_p); - - if ( phys_vtable ) - unmap_domain_page(phys_vtable); - - return 0; - } - - mfn = entry_get_pfn(*le_e); - if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) ) - mfn = get_mfn_from_gpfn(mfn); - - if ( le_p ) - unmap_domain_page(le_p); - le_p = (pgentry_64_t *)map_domain_page(mfn); - - if ( flag & SHADOW_ENTRY ) - index = table_offset_64(va, (level + i - 1)); - else - index = guest_table_offset_64(va, (level + i - 1), base_idx); - le_e = &le_p[index]; - } - - if ( flag & SET_ENTRY ) - *le_e = *e_p; - else - *e_p = *le_e; - - if ( le_p ) - unmap_domain_page(le_p); - - if ( phys_vtable ) - unmap_domain_page(phys_vtable); - - return 1; -} - -static inline int __rw_entry( - struct vcpu *v, unsigned long va, void *e_p, u32 flag) -{ - pgentry_64_t *e = (pgentry_64_t *)e_p; - - if (e) { - return __entry(v, va, e, flag); - } - - return 0; -} - -#define __shadow_set_l4e(v, va, value) \ - __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L4) -#define __shadow_get_l4e(v, va, sl4e) \ - __rw_entry(v, va, sl4e, SHADOW_ENTRY | GET_ENTRY | PAGING_L4) -#define __shadow_set_l3e(v, va, value) \ - __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L3) -#define __shadow_get_l3e(v, va, sl3e) \ - __rw_entry(v, va, sl3e, SHADOW_ENTRY | GET_ENTRY | PAGING_L3) -#define __shadow_set_l2e(v, va, value) \ - __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L2) -#define __shadow_get_l2e(v, va, sl2e) \ - __rw_entry(v, va, sl2e, SHADOW_ENTRY | GET_ENTRY | PAGING_L2) -#define __shadow_set_l1e(v, va, value) \ - __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L1) -#define __shadow_get_l1e(v, va, sl1e) \ - __rw_entry(v, va, sl1e, SHADOW_ENTRY | GET_ENTRY | PAGING_L1) - -#define __guest_set_l4e(v, va, value) \ - __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L4) -#define __guest_get_l4e(v, va, gl4e) \ - __rw_entry(v, va, gl4e, GUEST_ENTRY | GET_ENTRY | PAGING_L4) -#define __guest_set_l3e(v, va, value) \ 
- __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L3) -#define __guest_get_l3e(v, va, sl3e) \ - __rw_entry(v, va, gl3e, GUEST_ENTRY | GET_ENTRY | PAGING_L3) - -#define __direct_set_l3e(v, va, value) \ - __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L3) -#define __direct_get_l3e(v, va, sl3e) \ - __rw_entry(v, va, sl3e, DIRECT_ENTRY | GET_ENTRY | PAGING_L3) -#define __direct_set_l2e(v, va, value) \ - __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L2) -#define __direct_get_l2e(v, va, sl2e) \ - __rw_entry(v, va, sl2e, DIRECT_ENTRY | GET_ENTRY | PAGING_L2) -#define __direct_set_l1e(v, va, value) \ - __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L1) -#define __direct_get_l1e(v, va, sl1e) \ - __rw_entry(v, va, sl1e, DIRECT_ENTRY | GET_ENTRY | PAGING_L1) - - -static inline int __guest_set_l2e( - struct vcpu *v, unsigned long va, void *value, int size) -{ - switch(size) { - case 4: - // 32-bit guest - { - l2_pgentry_32_t *l2va; - - l2va = (l2_pgentry_32_t *)v->arch.guest_vtable; - if (value) - l2va[l2_table_offset_32(va)] = *(l2_pgentry_32_t *)value; - return 1; - } - case 8: - return __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L2); - default: - BUG(); - return 0; - } - return 0; -} - -#define __guest_set_l2e(v, va, value) \ - __guest_set_l2e(v, (unsigned long)va, value, sizeof(*value)) - -static inline int __guest_get_l2e( - struct vcpu *v, unsigned long va, void *gl2e, int size) -{ - switch(size) { - case 4: - // 32-bit guest - { - l2_pgentry_32_t *l2va; - l2va = (l2_pgentry_32_t *)v->arch.guest_vtable; - if (gl2e) - *(l2_pgentry_32_t *)gl2e = l2va[l2_table_offset_32(va)]; - return 1; - } - case 8: - return __rw_entry(v, va, gl2e, GUEST_ENTRY | GET_ENTRY | PAGING_L2); - default: - BUG(); - return 0; - } - return 0; -} - -#define __guest_get_l2e(v, va, gl2e) \ - __guest_get_l2e(v, (unsigned long)va, gl2e, sizeof(*gl2e)) - -static inline int __guest_set_l1e( - struct vcpu *v, unsigned long va, void *value, int size) -{ - switch(size) { - case 4: - // 32-bit guest - { - l2_pgentry_32_t gl2e; - l1_pgentry_32_t *l1va; - unsigned long l1mfn; - - if (!__guest_get_l2e(v, va, &gl2e)) - return 0; - if (unlikely(!(l2e_get_flags_32(gl2e) & _PAGE_PRESENT))) - return 0; - - l1mfn = get_mfn_from_gpfn( - l2e_get_pfn(gl2e)); - - l1va = (l1_pgentry_32_t *)map_domain_page(l1mfn); - if (value) - l1va[l1_table_offset_32(va)] = *(l1_pgentry_32_t *)value; - unmap_domain_page(l1va); - - return 1; - } - - case 8: - return __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L1); - default: - BUG(); - return 0; - } - return 0; -} - -#define __guest_set_l1e(v, va, value) \ - __guest_set_l1e(v, (unsigned long)va, value, sizeof(*value)) - -static inline int __guest_get_l1e( - struct vcpu *v, unsigned long va, void *gl1e, int size) -{ - switch(size) { - case 4: - // 32-bit guest - { - l2_pgentry_32_t gl2e; - l1_pgentry_32_t *l1va; - unsigned long l1mfn; - - if (!(__guest_get_l2e(v, va, &gl2e))) - return 0; - - - if (unlikely(!(l2e_get_flags_32(gl2e) & _PAGE_PRESENT))) - return 0; - - - l1mfn = get_mfn_from_gpfn( - l2e_get_pfn(gl2e)); - l1va = (l1_pgentry_32_t *) map_domain_page(l1mfn); - if (gl1e) - *(l1_pgentry_32_t *)gl1e = l1va[l1_table_offset_32(va)]; - unmap_domain_page(l1va); - return 1; - } - case 8: - // 64-bit guest - return __rw_entry(v, va, gl1e, GUEST_ENTRY | GET_ENTRY | PAGING_L1); - default: - BUG(); - return 0; - } - return 0; -} - -#define __guest_get_l1e(v, va, gl1e) \ - __guest_get_l1e(v, (unsigned long)va, gl1e, sizeof(*gl1e)) - -static 
inline void entry_general( - struct domain *d, - pgentry_64_t *gle_p, - pgentry_64_t *sle_p, - unsigned long smfn, u32 level) - -{ - pgentry_64_t gle = *gle_p; - pgentry_64_t sle; - - sle = entry_empty(); - if ( (entry_get_flags(gle) & _PAGE_PRESENT) && (smfn != 0) ) - { - if ((entry_get_flags(gle) & _PAGE_PSE) && level == PAGING_L2) { - sle = entry_from_pfn(smfn, entry_get_flags(gle)); - entry_remove_flags(sle, _PAGE_PSE); - - if ( shadow_mode_log_dirty(d) || - !(entry_get_flags(gle) & _PAGE_DIRTY) ) - { - pgentry_64_t *l1_p; - int i; - - l1_p =(pgentry_64_t *)map_domain_page(smfn); - for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) - { - if ( mfn_is_page_table(entry_get_pfn(l1_p[i])) ) - entry_remove_flags(l1_p[i], _PAGE_RW); - } - - unmap_domain_page(l1_p); - } - } else { - if (d->arch.ops->guest_paging_levels <= PAGING_L3 - && level == PAGING_L3) { - sle = entry_from_pfn(smfn, entry_get_flags(gle)); - } else { - - sle = entry_from_pfn( - smfn, - (entry_get_flags(gle) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL); - entry_add_flags(gle, _PAGE_ACCESSED); - } - } - // XXX mafetter: Hmm... - // Shouldn't the dirty log be checked/updated here? - // Actually, it needs to be done in this function's callers. - // - *gle_p = gle; - } - - if ( entry_get_value(sle) || entry_get_value(gle) ) - SH_VVLOG("%s: gpde=%lx, new spde=%lx", __func__, - entry_get_value(gle), entry_get_value(sle)); - - *sle_p = sle; -} - -static inline void entry_propagate_from_guest( - struct domain *d, pgentry_64_t *gle_p, pgentry_64_t *sle_p, u32 level) -{ - pgentry_64_t gle = *gle_p; - unsigned long smfn = 0; - - if ( entry_get_flags(gle) & _PAGE_PRESENT ) { - if ((entry_get_flags(gle) & _PAGE_PSE) && level == PAGING_L2) { - smfn = __shadow_status(d, entry_get_pfn(gle), PGT_fl1_shadow); - } else { - smfn = __shadow_status(d, entry_get_pfn(gle), - shadow_level_to_type((level -1 ))); - } - } - entry_general(d, gle_p, sle_p, smfn, level); - -} - -static int inline -validate_entry_change( - struct domain *d, - pgentry_64_t *new_gle_p, - pgentry_64_t *shadow_le_p, - u32 level) -{ - pgentry_64_t old_sle, new_sle; - pgentry_64_t new_gle = *new_gle_p; - - old_sle = *shadow_le_p; - entry_propagate_from_guest(d, &new_gle, &new_sle, level); - - ESH_LOG("old_sle: %lx, new_gle: %lx, new_sle: %lx\n", - entry_get_value(old_sle), entry_get_value(new_gle), - entry_get_value(new_sle)); - - if ( ((entry_get_value(old_sle) | entry_get_value(new_sle)) & _PAGE_PRESENT) && - entry_has_changed(old_sle, new_sle, _PAGE_PRESENT) ) - { - perfc_incrc(validate_entry_changes); - - if ( (entry_get_flags(new_sle) & _PAGE_PRESENT) && - !get_shadow_ref(entry_get_pfn(new_sle)) ) - BUG(); - if ( entry_get_flags(old_sle) & _PAGE_PRESENT ) - put_shadow_ref(entry_get_pfn(old_sle)); - } - - *shadow_le_p = new_sle; - - return 1; -} - -#endif - - diff --git a/xen/include/asm-x86/shadow_ops.h b/xen/include/asm-x86/shadow_ops.h deleted file mode 100644 index 8765ed8b10..0000000000 --- a/xen/include/asm-x86/shadow_ops.h +++ /dev/null @@ -1,138 +0,0 @@ -/****************************************************************************** - * include/asm-x86/shadow_ops.h - * - * Copyright (c) 2005 Michael A Fetterman - * Based on an earlier implementation by Ian Pratt et al - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _XEN_SHADOW_OPS_H -#define _XEN_SHADOW_OPS_H - -#define PAGING_L4 4UL -#define PAGING_L3 3UL -#define PAGING_L2 2UL -#define PAGING_L1 1UL - -#define PAE_CR3_ALIGN 5 -#define PAE_CR3_IDX_MASK 0x7f - -#if defined( GUEST_PGENTRY_32 ) - -#define GUEST_L1_PAGETABLE_ENTRIES L1_PAGETABLE_ENTRIES_32 -#define GUEST_L2_PAGETABLE_ENTRIES L2_PAGETABLE_ENTRIES_32 -#define GUEST_ROOT_PAGETABLE_ENTRIES ROOT_PAGETABLE_ENTRIES_32 -#define GUEST_L2_PAGETABLE_SHIFT L2_PAGETABLE_SHIFT_32 - -#define guest_l1_pgentry_t l1_pgentry_32_t -#define guest_l2_pgentry_t l2_pgentry_32_t -#define guest_root_pgentry_t l2_pgentry_32_t - -#define guest_l1e_get_paddr l1e_get_paddr_32 -#define guest_l2e_get_paddr l2e_get_paddr_32 - -#define guest_get_pte_flags get_pte_flags_32 -#define guest_put_pte_flags put_pte_flags_32 - -#define guest_l1e_get_flags l1e_get_flags_32 -#define guest_l2e_get_flags l2e_get_flags_32 -#define guest_root_get_flags l2e_get_flags_32 -#define guest_root_get_intpte l2e_get_intpte - -#define guest_l1e_empty l1e_empty_32 -#define guest_l2e_empty l2e_empty_32 - -#define guest_l1e_from_pfn l1e_from_pfn_32 -#define guest_l2e_from_pfn l2e_from_pfn_32 - -#define guest_l1e_from_paddr l1e_from_paddr_32 -#define guest_l2e_from_paddr l2e_from_paddr_32 - -#define guest_l1e_from_page l1e_from_page_32 -#define guest_l2e_from_page l2e_from_page_32 - -#define guest_l1e_add_flags l1e_add_flags_32 -#define guest_l2e_add_flags l2e_add_flags_32 - -#define guest_l1e_remove_flag l1e_remove_flags_32 -#define guest_l2e_remove_flag l2e_remove_flags_32 - -#define guest_l1e_has_changed l1e_has_changed_32 -#define guest_l2e_has_changed l2e_has_changed_32 -#define root_entry_has_changed l2e_has_changed_32 - -#define guest_l1_table_offset l1_table_offset_32 -#define guest_l2_table_offset l2_table_offset_32 - -#define guest_linear_l1_table linear_pg_table_32 -#define guest_linear_l2_table linear_l2_table_32 - -#define guest_va_to_l1mfn va_to_l1mfn_32 - -#else - -#define GUEST_L1_PAGETABLE_ENTRIES L1_PAGETABLE_ENTRIES -#define GUEST_L2_PAGETABLE_ENTRIES L2_PAGETABLE_ENTRIES -#define GUEST_ROOT_PAGETABLE_ENTRIES ROOT_PAGETABLE_ENTRIES -#define GUEST_L2_PAGETABLE_SHIFT L2_PAGETABLE_SHIFT - -#define guest_l1_pgentry_t l1_pgentry_t -#define guest_l2_pgentry_t l2_pgentry_t -#define guest_root_pgentry_t l4_pgentry_t - -#define guest_l1e_get_paddr l1e_get_paddr -#define guest_l2e_get_paddr l2e_get_paddr - -#define guest_get_pte_flags get_pte_flags -#define guest_put_pte_flags put_pte_flags - -#define guest_l1e_get_flags l1e_get_flags -#define guest_l2e_get_flags l2e_get_flags -#define guest_root_get_flags l4e_get_flags -#define guest_root_get_intpte l4e_get_intpte - -#define guest_l1e_empty l1e_empty -#define guest_l2e_empty l2e_empty - -#define guest_l1e_from_pfn l1e_from_pfn -#define guest_l2e_from_pfn l2e_from_pfn - -#define guest_l1e_from_paddr l1e_from_paddr -#define guest_l2e_from_paddr l2e_from_paddr - -#define guest_l1e_from_page l1e_from_page -#define guest_l2e_from_page l2e_from_page - -#define guest_l1e_add_flags l1e_add_flags -#define 
guest_l2e_add_flags l2e_add_flags - -#define guest_l1e_remove_flag l1e_remove_flags -#define guest_l2e_remove_flag l2e_remove_flags - -#define guest_l1e_has_changed l1e_has_changed -#define guest_l2e_has_changed l2e_has_changed -#define root_entry_has_changed l4e_has_changed - -#define guest_l1_table_offset l1_table_offset -#define guest_l2_table_offset l2_table_offset - -#define guest_linear_l1_table linear_pg_table -#define guest_linear_l2_table linear_l2_table - -#define guest_va_to_l1mfn va_to_l1mfn -#endif - -#endif /* _XEN_SHADOW_OPS_H */ diff --git a/xen/include/asm-x86/shadow_public.h b/xen/include/asm-x86/shadow_public.h deleted file mode 100644 index e2b4b5fd57..0000000000 --- a/xen/include/asm-x86/shadow_public.h +++ /dev/null @@ -1,61 +0,0 @@ -/****************************************************************************** - * include/asm-x86/shadow_public.h - * - * Copyright (c) 2005 Michael A Fetterman - * Based on an earlier implementation by Ian Pratt et al - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _XEN_SHADOW_PUBLIC_H -#define _XEN_SHADOW_PUBLIC_H - -#if CONFIG_PAGING_LEVELS >= 3 -#define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned) - -extern void shadow_sync_and_drop_references( - struct domain *d, struct page_info *page); -extern void shadow_drop_references( - struct domain *d, struct page_info *page); - -extern int shadow_set_guest_paging_levels(struct domain *d, int levels); - -extern void release_out_of_sync_entry( - struct domain *d, struct out_of_sync_entry *entry); - -struct shadow_ops { - unsigned long guest_paging_levels; /* guest paging levels */ - void (*invlpg)(struct vcpu *v, unsigned long va); - int (*fault)(unsigned long va, struct cpu_user_regs *regs); - void (*update_pagetables)(struct vcpu *v); - void (*sync_all)(struct domain *d); - int (*remove_all_write_access)(struct domain *d, - unsigned long readonly_gpfn, unsigned long readonly_gmfn); - int (*do_update_va_mapping)(unsigned long va, l1_pgentry_t val, struct vcpu *v); - struct out_of_sync_entry * - (*mark_mfn_out_of_sync)(struct vcpu *v, unsigned long gpfn, - unsigned long mfn); - int (*is_out_of_sync)(struct vcpu *v, unsigned long va); - unsigned long (*gva_to_gpa)(unsigned long gva); -}; -#endif - -#if CONFIG_PAGING_LEVELS >= 4 -extern void shadow_l4_normal_pt_update(struct domain *d, - unsigned long pa, l4_pgentry_t l4e, - struct domain_mmap_cache *cache); -#endif - -#endif diff --git a/xen/include/asm-x86/x86_32/page-2level.h b/xen/include/asm-x86/x86_32/page-2level.h index 764b1c2c05..7f450c4624 100644 --- a/xen/include/asm-x86/x86_32/page-2level.h +++ b/xen/include/asm-x86/x86_32/page-2level.h @@ -46,6 +46,7 @@ typedef l2_pgentry_t root_pgentry_t; * 12-bit flags = (pte[11:0]) */ +#define _PAGE_NX_BIT 0U #define _PAGE_NX 0U /* Extract flags into 12-bit integer, or turn 12-bit flags into a 
pte mask. */ diff --git a/xen/include/asm-x86/x86_32/page-3level.h b/xen/include/asm-x86/x86_32/page-3level.h index 43e73033e3..e0187478cc 100644 --- a/xen/include/asm-x86/x86_32/page-3level.h +++ b/xen/include/asm-x86/x86_32/page-3level.h @@ -59,7 +59,8 @@ typedef l3_pgentry_t root_pgentry_t; * 32-bit flags = (pte[63:44],pte[11:0]) */ -#define _PAGE_NX (cpu_has_nx ? (1<<31) : 0) +#define _PAGE_NX_BIT (1U<<31) +#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0) /* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */ #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF)) diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h index 0afb5e719b..429cfb8c5d 100644 --- a/xen/include/asm-x86/x86_64/page.h +++ b/xen/include/asm-x86/x86_64/page.h @@ -44,6 +44,8 @@ typedef l4_pgentry_t root_pgentry_t; /* Given a virtual address, get an entry offset into a linear page table. */ #define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT) #define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT) +#define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT) +#define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT) #define is_guest_l1_slot(_s) (1) #define is_guest_l2_slot(_t, _s) (1) @@ -70,7 +72,8 @@ typedef l4_pgentry_t root_pgentry_t; #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF)) /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/ -#define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U) +#define _PAGE_NX_BIT (1U<<23) +#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0U) #define L1_DISALLOW_MASK BASE_DISALLOW_MASK #define L2_DISALLOW_MASK BASE_DISALLOW_MASK diff --git a/xen/include/public/dom0_ops.h b/xen/include/public/dom0_ops.h index d211ca1624..f12cc93108 100644 --- a/xen/include/public/dom0_ops.h +++ b/xen/include/public/dom0_ops.h @@ -262,6 +262,18 @@ DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t); #define DOM0_SHADOW_CONTROL_OP_CLEAN 11 #define DOM0_SHADOW_CONTROL_OP_PEEK 12 +/* Shadow2 operations */ +#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION 30 +#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION 31 +#define DOM0_SHADOW2_CONTROL_OP_ENABLE 32 + +/* Mode flags for Shadow2 enable op */ +#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE (1 << 0) +#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT (1 << 1) +#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2) +#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3) +#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL (1 << 4) + struct dom0_shadow_control_stats { uint32_t fault_count; uint32_t dirty_count; @@ -277,7 +289,9 @@ struct dom0_shadow_control { uint32_t op; XEN_GUEST_HANDLE(ulong) dirty_bitmap; /* IN/OUT variables. */ - uint64_t pages; /* size of buffer, updated with actual size */ + uint64_t pages; /* size of buffer, updated with actual size */ + uint32_t mb; /* Shadow2 memory allocation in MB */ + uint32_t mode; /* Shadow2 mode to enable */ /* OUT variables. 
*/ struct dom0_shadow_control_stats stats; }; diff --git a/xen/include/xen/domain_page.h b/xen/include/xen/domain_page.h index 03d7af5f0f..2a51fcbacb 100644 --- a/xen/include/xen/domain_page.h +++ b/xen/include/xen/domain_page.h @@ -26,6 +26,13 @@ extern void *map_domain_page(unsigned long pfn); */ extern void unmap_domain_page(void *va); +/* + * Convert a VA (within a page previously mapped in the context of the + * currently-executing VCPU via a call to map_domain_pages()) to a machine + * address + */ +extern paddr_t mapped_domain_page_to_maddr(void *va); + /* * Similar to the above calls, except the mapping is accessible in all * address spaces (not just within the VCPU that created the mapping). Global @@ -98,6 +105,7 @@ domain_mmap_cache_destroy(struct domain_mmap_cache *cache) #define map_domain_page(pfn) maddr_to_virt((pfn)<next, n = pos->next; pos != (head); \ pos = n, n = pos->next) +/** + * list_for_each_backwards_safe - iterate backwards over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_backwards_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; pos != (head); \ + pos = n, n = pos->prev) + /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop counter. diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index c37e60f23a..d90b27adc7 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -376,9 +376,12 @@ extern struct domain *domain_list; /* VCPU is paused by the hypervisor? */ #define _VCPUF_paused 11 #define VCPUF_paused (1UL<<_VCPUF_paused) - /* VCPU is blocked awaiting an event to be consumed by Xen. */ +/* VCPU is blocked awaiting an event to be consumed by Xen. */ #define _VCPUF_blocked_in_xen 12 #define VCPUF_blocked_in_xen (1UL<<_VCPUF_blocked_in_xen) + /* HVM vcpu thinks CR0.PG == 0 */ +#define _VCPUF_shadow2_translate 13 +#define VCPUF_shadow2_translate (1UL<<_VCPUF_shadow2_translate) /* * Per-domain flags (domain_flags). -- cgit v1.2.3
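
For reference, here is a small standalone sketch (not taken from the patch) of the pages-to-megabytes rounding that shadow2_get_allocation() performs and that shadow2_set_allocation() expects as input. It assumes the usual x86 PAGE_SHIFT of 12, i.e. 256 4kB pages per MB.

#include <stdio.h>

#define PAGE_SHIFT   12                          /* assumed: 4kB pages, as on x86 */
#define PAGES_PER_MB (1u << (20 - PAGE_SHIFT))   /* 256 pages per MB */

/* Same rounding as shadow2_get_allocation(): round a page count up to
 * the nearest whole megabyte. */
static unsigned int pages_to_mb(unsigned int pg)
{
    return (pg >> (20 - PAGE_SHIFT))
           + ((pg & (PAGES_PER_MB - 1)) ? 1 : 0);
}

int main(void)
{
    printf("%u pages -> %u MB\n", 256u,  pages_to_mb(256u));   /* exactly 1 MB */
    printf("%u pages -> %u MB\n", 257u,  pages_to_mb(257u));   /* rounds up to 2 MB */
    printf("%u pages -> %u MB\n", 1024u, pages_to_mb(1024u));  /* 4 MB */
    return 0;
}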
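
Along the same lines, a sketch of how a caller might prepare a request for the new DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION operation. The structure below is a local, simplified mirror of only the dom0_shadow_control fields visible in this hunk (the dirty-bitmap handle, the stats block and the target domain id are left out), and actually issuing the dom0 op is not shown; in the real toolstack this goes through libxc rather than a hand-rolled structure.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Op and mode-flag values as introduced in dom0_ops.h by this patch. */
#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION 30
#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION 31
#define DOM0_SHADOW2_CONTROL_OP_ENABLE         32

#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE    (1 << 0)
#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT  (1 << 1)
#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2)
#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3)
#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL  (1 << 4)

/* Simplified stand-in for struct dom0_shadow_control, keeping only the
 * fields shown in this hunk. */
struct shadow2_control_args {
    uint32_t op;      /* which shadow control operation to perform */
    uint64_t pages;   /* size of dirty-bitmap buffer / pages returned */
    uint32_t mb;      /* shadow2 memory allocation in MB */
    uint32_t mode;    /* shadow2 mode flags for the enable op */
};

/* Fill in a request to give a domain a shadow2 pool of 'megabytes' MB. */
static void prepare_set_allocation(struct shadow2_control_args *args,
                                   uint32_t megabytes)
{
    memset(args, 0, sizeof(*args));
    args->op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION;
    args->mb = megabytes;
}

int main(void)
{
    struct shadow2_control_args args;
    prepare_set_allocation(&args, 16);
    printf("op=%u mb=%u\n", (unsigned)args.op, (unsigned)args.mb);
    return 0;
}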
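
Finally, a minimal user-space demonstration of the list_for_each_backwards_safe() iterator added to xen/include/xen/list.h. The macro body is copied from the patch; the tiny list implementation around it is scaffolding so the example can run outside the hypervisor.

#include <stdio.h>

/* Minimal linux-style doubly-linked list, enough to exercise the macro. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
    entry->prev = head->prev;
    entry->next = head;
    head->prev->next = entry;
    head->prev = entry;
}

static void list_del(struct list_head *entry)
{
    entry->prev->next = entry->next;
    entry->next->prev = entry->prev;
}

/* Same definition as in the patch: iterate from tail to head, caching the
 * previous element so the current one may be unlinked or freed. */
#define list_for_each_backwards_safe(pos, n, head)              \
    for (pos = (head)->prev, n = pos->prev; pos != (head);      \
         pos = n, n = pos->prev)

struct item { struct list_head list; int val; };

int main(void)
{
    struct list_head head = LIST_HEAD_INIT(head);
    struct item items[3];
    int i;

    for (i = 0; i < 3; i++) {
        items[i].val = i;
        list_add_tail(&items[i].list, &head);
    }

    struct list_head *pos, *n;
    list_for_each_backwards_safe(pos, n, &head) {
        struct item *it = (struct item *)pos;   /* 'list' is the first member */
        printf("visiting %d\n", it->val);       /* prints 2, 1, 0 */
        list_del(pos);                          /* safe: 'n' was cached */
    }
    return 0;
}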