author     tdeegan@york.uk.xensource.com <tdeegan@york.uk.xensource.com>  2006-08-16 17:02:35 +0100
committer  tdeegan@york.uk.xensource.com <tdeegan@york.uk.xensource.com>  2006-08-16 17:02:35 +0100
commit     49f7c7364e0acbc44604e0315599782096eee522
tree       b9dcdab90c9598f12d5559edc96efdaf7afd0da3
parent     e3e351919cc62f3cdabd8cef9b3a6be9ab4f13dd
Replace shadow pagetable code with shadow2.
-rw-r--r--  .hgtags | 10
-rw-r--r--  tools/examples/xmexample.hvm | 4
-rw-r--r--  tools/libxc/xc_domain.c | 13
-rw-r--r--  tools/libxc/xc_hvm_build.c | 13
-rw-r--r--  tools/libxc/xc_linux_build.c | 2
-rw-r--r--  tools/libxc/xc_linux_save.c | 18
-rw-r--r--  tools/libxc/xenctrl.h | 2
-rw-r--r--  tools/misc/xc_shadow.c | 2
-rw-r--r--  tools/python/xen/lowlevel/xc/xc.c | 69
-rw-r--r--  tools/python/xen/xend/XendDomain.py | 24
-rw-r--r--  tools/python/xen/xend/XendDomainInfo.py | 47
-rw-r--r--  tools/python/xen/xend/image.py | 17
-rw-r--r--  tools/python/xen/xm/create.py | 9
-rw-r--r--  xen/arch/x86/Makefile | 16
-rw-r--r--  xen/arch/x86/audit.c | 984
-rw-r--r--  xen/arch/x86/dom0_ops.c | 2
-rw-r--r--  xen/arch/x86/domain.c | 90
-rw-r--r--  xen/arch/x86/domain_build.c | 13
-rw-r--r--  xen/arch/x86/hvm/hvm.c | 23
-rw-r--r--  xen/arch/x86/hvm/platform.c | 9
-rw-r--r--  xen/arch/x86/hvm/svm/svm.c | 259
-rw-r--r--  xen/arch/x86/hvm/svm/vmcb.c | 4
-rw-r--r--  xen/arch/x86/hvm/vlapic.c | 3
-rw-r--r--  xen/arch/x86/hvm/vmx/vmcs.c | 15
-rw-r--r--  xen/arch/x86/hvm/vmx/vmx.c | 218
-rw-r--r--  xen/arch/x86/mm.c | 463
-rw-r--r--  xen/arch/x86/setup.c | 2
-rw-r--r--  xen/arch/x86/shadow.c | 4150
-rw-r--r--  xen/arch/x86/shadow2-common.c | 3394
-rw-r--r--  xen/arch/x86/shadow2.c | 4469
-rw-r--r--  xen/arch/x86/shadow32.c | 3782
-rw-r--r--  xen/arch/x86/shadow_guest32.c | 16
-rw-r--r--  xen/arch/x86/shadow_guest32pae.c | 16
-rw-r--r--  xen/arch/x86/shadow_public.c | 2143
-rw-r--r--  xen/arch/x86/smpboot.c | 2
-rw-r--r--  xen/arch/x86/traps.c | 32
-rw-r--r--  xen/arch/x86/x86_32/domain_page.c | 31
-rw-r--r--  xen/arch/x86/x86_32/mm.c | 3
-rw-r--r--  xen/arch/x86/x86_64/mm.c | 3
-rw-r--r--  xen/arch/x86/x86_64/traps.c | 14
-rw-r--r--  xen/common/acm_ops.c | 1
-rw-r--r--  xen/common/grant_table.c | 4
-rw-r--r--  xen/common/keyhandler.c | 19
-rw-r--r--  xen/common/memory.c | 11
-rw-r--r--  xen/drivers/char/console.c | 46
-rw-r--r--  xen/include/asm-x86/bitops.h | 18
-rw-r--r--  xen/include/asm-x86/config.h | 20
-rw-r--r--  xen/include/asm-x86/domain.h | 93
-rw-r--r--  xen/include/asm-x86/grant_table.h | 2
-rw-r--r--  xen/include/asm-x86/hvm/hvm.h | 25
-rw-r--r--  xen/include/asm-x86/hvm/support.h | 11
-rw-r--r--  xen/include/asm-x86/hvm/vcpu.h | 6
-rw-r--r--  xen/include/asm-x86/hvm/vmx/vmcs.h | 1
-rw-r--r--  xen/include/asm-x86/hvm/vmx/vmx.h | 49
-rw-r--r--  xen/include/asm-x86/mm.h | 136
-rw-r--r--  xen/include/asm-x86/msr.h | 4
-rw-r--r--  xen/include/asm-x86/page-guest32.h | 7
-rw-r--r--  xen/include/asm-x86/page.h | 37
-rw-r--r--  xen/include/asm-x86/perfc_defn.h | 53
-rw-r--r--  xen/include/asm-x86/processor.h | 1
-rw-r--r--  xen/include/asm-x86/shadow.h | 1791
-rw-r--r--  xen/include/asm-x86/shadow2-multi.h | 116
-rw-r--r--  xen/include/asm-x86/shadow2-private.h | 612
-rw-r--r--  xen/include/asm-x86/shadow2-types.h | 705
-rw-r--r--  xen/include/asm-x86/shadow2.h | 627
-rw-r--r--  xen/include/asm-x86/shadow_64.h | 587
-rw-r--r--  xen/include/asm-x86/shadow_ops.h | 138
-rw-r--r--  xen/include/asm-x86/shadow_public.h | 61
-rw-r--r--  xen/include/asm-x86/x86_32/page-2level.h | 1
-rw-r--r--  xen/include/asm-x86/x86_32/page-3level.h | 3
-rw-r--r--  xen/include/asm-x86/x86_64/page.h | 5
-rw-r--r--  xen/include/public/dom0_ops.h | 16
-rw-r--r--  xen/include/xen/domain_page.h | 13
-rw-r--r--  xen/include/xen/lib.h | 4
-rw-r--r--  xen/include/xen/list.h | 10
-rw-r--r--  xen/include/xen/sched.h | 5
76 files changed, 11111 insertions, 14513 deletions
diff --git a/.hgtags b/.hgtags
index b097c216b9..41fa5ab702 100644
--- a/.hgtags
+++ b/.hgtags
@@ -15,3 +15,13 @@ fb875591fd72e15c31879c0e9034d99b80225595 RELEASE-2.0.4
c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0
af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc
d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched
+6e864d7de9db066f92bea505d256bfe286200fed last-code-review
+a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline
+bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review
+fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest patch to unstable
+8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline
+2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline
+0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline
+88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline
+5233c4b076b9aa073eff63508461b7bfa597737c mainline
+fda70200da01b89d5339342df6c0db372369a16d mainline
diff --git a/tools/examples/xmexample.hvm b/tools/examples/xmexample.hvm
index 396274c860..dd07a3b90e 100644
--- a/tools/examples/xmexample.hvm
+++ b/tools/examples/xmexample.hvm
@@ -27,6 +27,10 @@ builder='hvm'
# and modules. Allocating less than 32MBs is not recommended.
memory = 128
+# Shadow pagetable memory for the domain, in MB.
+# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu.
+shadow_memory = 8
+
# A name for your domain. All domains must have different names.
name = "ExampleHVMDomain"
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index 51117117f4..801e35ea08 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle,
unsigned int sop,
unsigned long *dirty_bitmap,
unsigned long pages,
- xc_shadow_control_stats_t *stats )
+ unsigned long *mb,
+ uint32_t mode,
+ xc_shadow_control_stats_t *stats)
{
int rc;
DECLARE_DOM0_OP;
op.cmd = DOM0_SHADOW_CONTROL;
op.u.shadow_control.domain = (domid_t)domid;
op.u.shadow_control.op = sop;
- set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
op.u.shadow_control.pages = pages;
+ op.u.shadow_control.mb = mb ? *mb : 0;
+ op.u.shadow_control.mode = mode;
+ set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
rc = do_dom0_op(xc_handle, &op);
if ( stats )
memcpy(stats, &op.u.shadow_control.stats,
sizeof(xc_shadow_control_stats_t));
+
+ if ( mb )
+ *mb = op.u.shadow_control.mb;
return (rc == 0) ? op.u.shadow_control.pages : rc;
}
@@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(int xc_handle,
if ( err > 0 )
{
- DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n",
+ DPRINTF("Failed allocation for dom %d: %ld pages order %d\n",
domid, nr_extents, extent_order);
errno = EBUSY;
err = -1;
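
For reference, a minimal sketch (not part of the patch) of how a caller uses the widened xc_shadow_control() interface above to read back a domain's shadow allocation; the helper name get_shadow_mb is hypothetical, and error handling is kept to the bare minimum.

#include <stdio.h>
#include "xenctrl.h"

/* Hedged example: ask Xen how many MB of shadow memory a domain is using,
 * via the new mb in/out parameter.  Returns -1 on failure. */
static long get_shadow_mb(int xc_handle, uint32_t domid)
{
    unsigned long mb = 0;

    if ( xc_shadow_control(xc_handle, domid,
                           DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION,
                           NULL /* dirty_bitmap */, 0 /* pages */,
                           &mb, 0 /* mode */, NULL /* stats */) < 0 )
        return -1;

    printf("dom%u: %lu MB of shadow memory\n", (unsigned int)domid, mb);
    return (long)mb;
}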
diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
index d4799abc87..173c6733ee 100644
--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -396,6 +396,19 @@ static int xc_hvm_build_internal(int xc_handle,
goto error_out;
}
+ /* HVM domains must be put into shadow2 mode at the start of day */
+ if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE,
+ NULL, 0, NULL,
+ DOM0_SHADOW2_CONTROL_FLAG_ENABLE
+ | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT
+ | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE
+ | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL,
+ NULL) )
+ {
+ PERROR("Could not enable shadow paging for domain.\n");
+ goto error_out;
+ }
+
memset(ctxt, 0, sizeof(*ctxt));
ctxt->flags = VGCF_HVM_GUEST;
diff --git a/tools/libxc/xc_linux_build.c b/tools/libxc/xc_linux_build.c
index 9d7ea54a86..116429a729 100644
--- a/tools/libxc/xc_linux_build.c
+++ b/tools/libxc/xc_linux_build.c
@@ -972,7 +972,7 @@ static int setup_guest(int xc_handle,
/* Enable shadow translate mode */
if ( xc_shadow_control(xc_handle, dom,
DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE,
- NULL, 0, NULL) < 0 )
+ NULL, 0, NULL, 0, NULL) < 0 )
{
PERROR("Could not enable translation mode");
goto error_out;
diff --git a/tools/libxc/xc_linux_save.c b/tools/libxc/xc_linux_save.c
index 8cf21dced5..49d212995e 100644
--- a/tools/libxc/xc_linux_save.c
+++ b/tools/libxc/xc_linux_save.c
@@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
int i;
xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
- arr, max_pfn, NULL);
+ arr, max_pfn, NULL, 0, NULL);
DPRINTF("#Flush\n");
for ( i = 0; i < 40; i++ ) {
usleep(50000);
now = llgettimeofday();
xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
- NULL, 0, &stats);
+ NULL, 0, NULL, 0, &stats);
DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32
" dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
@@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
if (xc_shadow_control(xc_handle, dom,
DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL ) < 0) {
+ NULL, 0, NULL, 0, NULL) < 0) {
ERR("Couldn't enable shadow mode");
goto out;
}
@@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
but this is fast enough for the moment. */
if (!last_iter && xc_shadow_control(
xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
- to_skip, max_pfn, NULL) != max_pfn) {
+ to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
ERR("Error peeking shadow bitmap");
goto out;
}
@@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
(unsigned long)ctxt.user_regs.edx);
}
- if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
- to_send, max_pfn, &stats ) != max_pfn) {
+ if (xc_shadow_control(xc_handle, dom,
+ DOM0_SHADOW_CONTROL_OP_CLEAN, to_send,
+ max_pfn, NULL, 0, &stats) != max_pfn) {
ERR("Error flushing shadow PT");
goto out;
}
@@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
out:
if (live) {
- if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
- NULL, 0, NULL ) < 0) {
+ if(xc_shadow_control(xc_handle, dom,
+ DOM0_SHADOW_CONTROL_OP_OFF,
+ NULL, 0, NULL, 0, NULL) < 0) {
DPRINTF("Warning - couldn't disable shadow mode");
}
}
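
The xc_linux_save.c hunks above all follow the shadow log-dirty pattern used by live migration; below is a condensed, hedged sketch of that sequence with the new argument order. The helper and its arguments are illustrative only, and return values are ignored for brevity.

#include "xenctrl.h"

/* Illustrative only: enable dirty logging, sample the bitmap with PEEK
 * (non-destructive) or CLEAN (reads and resets it), then switch it off. */
static void logdirty_sequence(int xc, uint32_t dom,
                              unsigned long *bitmap, unsigned long max_pfn)
{
    xc_shadow_control_stats_t stats;

    xc_shadow_control(xc, dom, DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
                      NULL, 0, NULL, 0, NULL);           /* start tracking */

    xc_shadow_control(xc, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
                      bitmap, max_pfn, NULL, 0, NULL);   /* read only */

    xc_shadow_control(xc, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
                      bitmap, max_pfn, NULL, 0, &stats); /* read and reset */

    xc_shadow_control(xc, dom, DOM0_SHADOW_CONTROL_OP_OFF,
                      NULL, 0, NULL, 0, NULL);           /* stop tracking */
}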
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 2d301b2c43..a66a11839d 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle,
unsigned int sop,
unsigned long *dirty_bitmap,
unsigned long pages,
+ unsigned long *mb,
+ uint32_t mode,
xc_shadow_control_stats_t *stats);
int xc_bvtsched_global_set(int xc_handle,
diff --git a/tools/misc/xc_shadow.c b/tools/misc/xc_shadow.c
index 83c52ebc19..f0f60c9c5c 100644
--- a/tools/misc/xc_shadow.c
+++ b/tools/misc/xc_shadow.c
@@ -60,6 +60,8 @@ int main(int argc, char *argv[])
mode,
NULL,
0,
+ NULL,
+ 0,
NULL) < 0 )
{
fprintf(stderr, "Error reseting performance counters: %d (%s)\n",
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 3e5a9624d8..2c55ca079f 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(XcObject *self, PyObject *args)
"weight", weight);
}
+static PyObject *pyxc_shadow_control(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+
+ uint32_t dom;
+ int op=0;
+
+ static char *kwd_list[] = { "dom", "op", NULL };
+
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list,
+ &dom, &op) )
+ return NULL;
+
+ if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL)
+ < 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ Py_INCREF(zero);
+ return zero;
+}
+
+static PyObject *pyxc_shadow_mem_control(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+ int op;
+ uint32_t dom;
+ int mbarg = -1;
+ unsigned long mb;
+
+ static char *kwd_list[] = { "dom", "mb", NULL };
+
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list,
+ &dom, &mbarg) )
+ return NULL;
+
+ if ( mbarg < 0 )
+ op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION;
+ else
+ {
+ mb = mbarg;
+ op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION;
+ }
+ if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ mbarg = mb;
+ return Py_BuildValue("i", mbarg);
+}
+
static PyObject *pyxc_sched_credit_domain_set(XcObject *self,
PyObject *args,
PyObject *kwds)
@@ -1119,6 +1172,22 @@ static PyMethodDef pyxc_methods[] = {
"Returns [dict]: information about Xen"
" [None]: on failure.\n" },
+ { "shadow_control",
+ (PyCFunction)pyxc_shadow_control,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set parameter for shadow pagetable interface\n"
+ " dom [int]: Identifier of domain.\n"
+ " op [int, 0]: operation\n\n"
+ "Returns: [int] 0 on success; -1 on error.\n" },
+
+ { "shadow_mem_control",
+ (PyCFunction)pyxc_shadow_mem_control,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set or read shadow pagetable memory use\n"
+ " dom [int]: Identifier of domain.\n"
+ " mb [int, -1]: MB of shadow memory this domain should have.\n\n"
+ "Returns: [int] MB of shadow memory in use by this domain.\n" },
+
{ "domain_setmaxmem",
(PyCFunction)pyxc_domain_setmaxmem,
METH_VARARGS, "\n"
diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/XendDomain.py
index 52cca550d4..c253dc2777 100644
--- a/tools/python/xen/xend/XendDomain.py
+++ b/tools/python/xen/xend/XendDomain.py
@@ -532,6 +532,30 @@ class XendDomain:
except Exception, ex:
raise XendError(str(ex))
+ def domain_shadow_control(self, domid, op):
+ """Shadow page control."""
+ dominfo = self.domain_lookup(domid)
+ try:
+ return xc.shadow_control(dominfo.getDomid(), op)
+ except Exception, ex:
+ raise XendError(str(ex))
+
+ def domain_shadow_mem_get(self, domid):
+ """Get shadow pagetable memory allocation."""
+ dominfo = self.domain_lookup(domid)
+ try:
+ return xc.shadow_mem_control(dominfo.getDomid())
+ except Exception, ex:
+ raise XendError(str(ex))
+
+ def domain_shadow_mem_set(self, domid, mb):
+ """Set shadow pagetable memory allocation."""
+ dominfo = self.domain_lookup(domid)
+ try:
+ return xc.shadow_mem_control(dominfo.getDomid(), mb=mb)
+ except Exception, ex:
+ raise XendError(str(ex))
+
def domain_sched_credit_get(self, domid):
"""Get credit scheduler parameters for a domain.
"""
diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
index 3bc69981e8..ab0554fccd 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -30,6 +30,7 @@ import string
import time
import threading
import os
+import math
import xen.lowlevel.xc
from xen.util import asserts
@@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [
# don't come out of xc in the same form as they are specified in the config
# file, so those are handled separately.
ROUNDTRIPPING_CONFIG_ENTRIES = [
- ('uuid', str),
- ('vcpus', int),
- ('vcpu_avail', int),
- ('cpu_weight', float),
- ('memory', int),
- ('maxmem', int),
- ('bootloader', str),
+ ('uuid', str),
+ ('vcpus', int),
+ ('vcpu_avail', int),
+ ('cpu_weight', float),
+ ('memory', int),
+ ('shadow_memory', int),
+ ('maxmem', int),
+ ('bootloader', str),
('bootloader_args', str),
- ('features', str),
- ('localtime', int),
+ ('features', str),
+ ('localtime', int),
]
ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
@@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
# entries written to the store that cannot be reconfigured on-the-fly.
#
VM_STORE_ENTRIES = [
- ('uuid', str),
- ('vcpus', int),
- ('vcpu_avail', int),
- ('memory', int),
- ('maxmem', int),
- ('start_time', float),
+ ('uuid', str),
+ ('vcpus', int),
+ ('vcpu_avail', int),
+ ('memory', int),
+ ('shadow_memory', int),
+ ('maxmem', int),
+ ('start_time', float),
]
VM_STORE_ENTRIES += VM_CONFIG_PARAMS
@@ -572,6 +575,7 @@ class XendDomainInfo:
defaultInfo('vcpu_avail', lambda: (1 << self.info['vcpus']) - 1)
defaultInfo('memory', lambda: 0)
+ defaultInfo('shadow_memory', lambda: 0)
defaultInfo('maxmem', lambda: 0)
defaultInfo('bootloader', lambda: None)
defaultInfo('bootloader_args', lambda: None)
@@ -1280,7 +1284,18 @@ class XendDomainInfo:
xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024)
m = self.image.getDomainMemory(self.info['memory'] * 1024)
- balloon.free(m)
+
+ # get the domain's shadow memory requirement
+ sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0))
+ if self.info['shadow_memory'] > sm:
+ sm = self.info['shadow_memory']
+
+ # Make sure there's enough RAM available for the domain
+ balloon.free(m + sm * 1024)
+
+ # Set up the shadow memory
+ sm = xc.shadow_mem_control(self.domid, mb=sm)
+ self.info['shadow_memory'] = sm
init_reservation = self.info['memory'] * 1024
if os.uname()[4] in ('ia64', 'ppc64'):
diff --git a/tools/python/xen/xend/image.py b/tools/python/xen/xend/image.py
index 64fb810944..268462c581 100644
--- a/tools/python/xen/xend/image.py
+++ b/tools/python/xen/xend/image.py
@@ -153,6 +153,12 @@ class ImageHandler:
mem_kb += 4*1024;
return mem_kb
+ def getDomainShadowMemory(self, mem_kb):
+ """@return The minimum shadow memory required, in KiB, for a domain
+ with mem_kb KiB of RAM."""
+ # PV domains don't need any shadow memory
+ return 0
+
def buildDomain(self):
"""Build the domain. Define in subclass."""
raise NotImplementedError()
@@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler):
extra_pages = int( math.ceil( extra_mb*1024 / page_kb ))
return mem_kb + extra_pages * page_kb
+ def getDomainShadowMemory(self, mem_kb):
+ """@return The minimum shadow memory required, in KiB, for a domain
+ with mem_kb KiB of RAM."""
+ if os.uname()[4] in ('ia64', 'ppc64'):
+ # Explicit shadow memory is not a concept
+ return 0
+ else:
+ # 1MB per vcpu plus 4Kib/Mib of RAM. This is higher than
+ # the minimum that Xen would allocate if no value were given.
+ return 1024 * self.vm.getVCpuCount() + mem_kb / 256
+
def register_shutdown_watch(self):
""" add xen store watch on control/shutdown """
self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \
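
As a worked example of the heuristic in getDomainShadowMemory() above (figures assumed for illustration, not taken from the patch): a 2-vcpu HVM domain with 512MB of RAM would be given 1024*2 + (512*1024)/256 = 2048 + 2048 = 4096 KiB, roughly 4MB of shadow memory, which sits comfortably above the "2KB per MB of domain memory" minimum described by the new shadow_memory comment in xmexample.hvm.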
diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py
index 549018e209..6416aaab3f 100644
--- a/tools/python/xen/xm/create.py
+++ b/tools/python/xen/xm/create.py
@@ -158,6 +158,10 @@ gopts.var('maxmem', val='MEMORY',
fn=set_int, default=None,
use="Maximum domain memory in MB.")
+gopts.var('shadow_memory', val='MEMORY',
+ fn=set_int, default=0,
+ use="Domain shadow memory in MB.")
+
gopts.var('cpu', val='CPU',
fn=set_int, default=None,
use="CPU to run the VCPU0 on.")
@@ -666,8 +670,9 @@ def make_config(vals):
if v:
config.append([n, v])
- map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff',
- 'on_reboot', 'on_crash', 'vcpus', 'features'])
+ map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
+ 'restart', 'on_poweroff', 'on_reboot', 'on_crash',
+ 'vcpus', 'features'])
if vals.uuid is not None:
config.append(['uuid', vals.uuid])
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index aebee65e9c..e246594245 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -8,7 +8,6 @@ subdir-$(x86_32) += x86_32
subdir-$(x86_64) += x86_64
obj-y += apic.o
-obj-y += audit.o
obj-y += bitops.o
obj-y += compat.o
obj-y += delay.o
@@ -41,12 +40,21 @@ obj-y += usercopy.o
obj-y += x86_emulate.o
ifneq ($(pae),n)
-obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
else
-obj-$(x86_32) += shadow32.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
endif
-obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
+ shadow2_g2_on_s3.o
+
+guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
+ -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
+
+shadow2_%.o: shadow2.c $(HDRS) Makefile
+ $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
obj-$(crash_debug) += gdbstub.o
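
To make the new build rule concrete: for the target shadow2_g2_on_s3.o, the guest_levels and shadow_levels functions above extract "2" and "3" from the object name, so the pattern rule expands to roughly

    $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=2 -DSHADOW_PAGING_LEVELS=3 -c shadow2.c -o shadow2_g2_on_s3.o

That is, the single shadow2.c source is compiled once per supported guest/shadow paging-level pair, with the pair selected purely by preprocessor definitions.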
diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c
deleted file mode 100644
index bacdb9cc50..0000000000
--- a/xen/arch/x86/audit.c
+++ /dev/null
@@ -1,984 +0,0 @@
-/******************************************************************************
- * arch/x86/audit.c
- *
- * Copyright (c) 2002-2005 K A Fraser
- * Copyright (c) 2004 Christian Limpach
- * Copyright (c) 2005 Michael A Fetterman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <xen/config.h>
-#include <xen/init.h>
-#include <xen/kernel.h>
-#include <xen/lib.h>
-#include <xen/mm.h>
-#include <xen/perfc.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <asm/flushtlb.h>
-
-/* XXX SMP bug -- these should not be statics... */
-static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
-static int l1, l2, oos_count, page_count;
-
-#define FILE_AND_LINE 0
-
-#if FILE_AND_LINE
-#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__)
-#define ADJUST_EXTRA_ARGS ,const char *file, int line
-#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line)
-#else
-#define adjust _adjust
-#define ADJUST_EXTRA_ARGS
-#define APRINTK(_f, _a...) printk(_f "\n", ##_a)
-#endif
-
-int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
-{
- int errors = 0;
- int shadow_refcounts = !!shadow_mode_refcounts(d);
- int shadow_enabled = !!shadow_mode_enabled(d);
-
- int l2limit( unsigned long mfn )
- {
-
- if ( shadow_mode_external(d) )
- return L2_PAGETABLE_ENTRIES;
-
-#ifdef __i386__
-#ifdef CONFIG_X86_PAE
- /* 32b PAE */
- if ( (( mfn_to_page(mfn)->u.inuse.type_info & PGT_va_mask )
- >> PGT_va_shift) == 3 )
- return l2_table_offset(HYPERVISOR_VIRT_START);
- else
- return L2_PAGETABLE_ENTRIES;
-#else
- /* 32b non-PAE */
- return DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#endif
-#else
- /* 64b */
- return 0; /* XXX x86/64 XXX */
-#endif
- }
-
- void _adjust(struct page_info *page, int adjtype ADJUST_EXTRA_ARGS)
- {
- int count;
-
- if ( adjtype )
- {
- /* adjust the type count */
- int tcount = page->u.inuse.type_info & PGT_count_mask;
- tcount += dir;
- ttot++;
-
- if ( page_get_owner(page) == NULL )
- {
- APRINTK("adjust(mfn=%lx, dir=%d, adjtype=%d) owner=NULL",
- page_to_mfn(page), dir, adjtype);
- errors++;
- }
-
- if ( tcount < 0 )
- {
- APRINTK("Audit %d: type count went below zero "
- "mfn=%lx t=%" PRtype_info " ot=%x",
- d->domain_id, page_to_mfn(page),
- page->u.inuse.type_info,
- page->tlbflush_timestamp);
- errors++;
- }
- else if ( (tcount & ~PGT_count_mask) != 0 )
- {
- APRINTK("Audit %d: type count overflowed "
- "mfn=%lx t=%" PRtype_info " ot=%x",
- d->domain_id, page_to_mfn(page),
- page->u.inuse.type_info,
- page->tlbflush_timestamp);
- errors++;
- }
- else
- page->u.inuse.type_info += dir;
- }
-
- /* adjust the general count */
- count = (page->count_info & PGC_count_mask) + dir;
- ctot++;
-
- if ( count < 0 )
- {
- APRINTK("Audit %d: general count went below zero "
- "mfn=%lx t=%" PRtype_info " ot=%x",
- d->domain_id, page_to_mfn(page),
- page->u.inuse.type_info,
- page->tlbflush_timestamp);
- errors++;
- }
- else if ( (count & ~PGT_count_mask) != 0 )
- {
- APRINTK("Audit %d: general count overflowed "
- "mfn=%lx t=%" PRtype_info " ot=%x",
- d->domain_id, page_to_mfn(page),
- page->u.inuse.type_info,
- page->tlbflush_timestamp);
- errors++;
- }
- else
- page->count_info += dir;
- }
-
- void adjust_l2_page(unsigned long mfn, int shadow)
- {
- l2_pgentry_t *pt = map_domain_page(mfn);
- int i;
- u32 page_type;
-
- for ( i = 0; i < l2limit(mfn); i++ )
- {
- if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT )
- {
- unsigned long l1mfn = l2e_get_pfn(pt[i]);
- struct page_info *l1page = mfn_to_page(l1mfn);
-
- if ( noisy )
- {
- if ( shadow )
- {
- if ( page_get_owner(l1page) != NULL )
- {
- printk("L2: Bizarre shadow L1 page mfn=%lx "
- "belonging to a domain %p (id=%d)\n",
- l1mfn,
- page_get_owner(l1page),
- page_get_owner(l1page)->domain_id);
- errors++;
- continue;
- }
-
- page_type = l1page->u.inuse.type_info & PGT_type_mask;
- if ( page_type != PGT_l1_shadow )
- {
- printk("Audit %d: [Shadow L2 mfn=%lx i=%x] "
- "Expected Shadow L1 t=%" PRtype_info
- " mfn=%lx\n",
- d->domain_id, mfn, i,
- l1page->u.inuse.type_info, l1mfn);
- errors++;
- }
- }
- else
- {
- if ( page_get_owner(l1page) != d )
- {
- printk("L2: Skip bizarre L1 page mfn=%lx "
- "belonging to other dom %p (id=%d)\n",
- l1mfn,
- page_get_owner(l1page),
- (page_get_owner(l1page)
- ? page_get_owner(l1page)->domain_id
- : -1));
- errors++;
- continue;
- }
-
- page_type = l1page->u.inuse.type_info & PGT_type_mask;
- if ( page_type == PGT_l2_page_table )
- {
- printk("Audit %d: [%x] Found %s Linear PT "
- "t=%" PRtype_info " mfn=%lx\n",
- d->domain_id, i, (l1mfn==mfn) ? "Self" : "Other",
- l1page->u.inuse.type_info, l1mfn);
- }
- else if ( page_type != PGT_l1_page_table )
- {
- printk("Audit %d: [L2 mfn=%lx i=%x] "
- "Expected L1 t=%" PRtype_info " mfn=%lx\n",
- d->domain_id, mfn, i,
- l1page->u.inuse.type_info, l1mfn);
- errors++;
- }
- }
- }
-
- adjust(l1page, !shadow);
- }
- }
-
- if ( shadow_mode_translate(d) && !shadow_mode_external(d) )
- {
- unsigned long hl2mfn =
- l2e_get_pfn(pt[l2_table_offset(LINEAR_PT_VIRT_START)]);
- struct page_info *hl2page = mfn_to_page(hl2mfn);
- adjust(hl2page, 0);
- }
-
- unmap_domain_page(pt);
- }
-
- void adjust_hl2_page(unsigned long hl2mfn)
- {
- l2_pgentry_t *pt = map_domain_page(hl2mfn);
- int i;
-
- for ( i = 0; i < l2limit(hl2mfn); i++ )
- {
- if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT )
- {
- unsigned long mfn = l2e_get_pfn(pt[i]);
- struct page_info *gpage = mfn_to_page(mfn);
-
- if ( mfn < 0x100 )
- {
- lowmem_mappings++;
- continue;
- }
-
- if ( !mfn_valid(mfn) )
- {
- io_mappings++;
- continue;
- }
-
- if ( noisy )
- {
- if ( page_get_owner(gpage) != d )
- {
- printk("Audit %d: [hl2mfn=%lx,i=%x] Skip foreign page "
- "dom=%p (id=%d) mfn=%lx c=%08x t=%"
- PRtype_info "\n",
- d->domain_id, hl2mfn, i,
- page_get_owner(gpage),
- page_get_owner(gpage)->domain_id,
- mfn,
- gpage->count_info,
- gpage->u.inuse.type_info);
- continue;
- }
- }
- adjust(gpage, 0);
- }
- }
-
- unmap_domain_page(pt);
- }
-
- void adjust_l1_page(unsigned long l1mfn)
- {
- l1_pgentry_t *pt = map_domain_page(l1mfn);
- int i;
-
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- {
- if ( l1e_get_flags(pt[i]) & _PAGE_PRESENT )
- {
- unsigned long mfn = l1e_get_pfn(pt[i]);
- struct page_info *gpage = mfn_to_page(mfn);
-
- if ( mfn < 0x100 )
- {
- lowmem_mappings++;
- continue;
- }
-
- if ( !mfn_valid(mfn) )
- {
- io_mappings++;
- continue;
- }
-
- if ( noisy )
- {
- if ( l1e_get_flags(pt[i]) & _PAGE_RW )
- {
- // If it's not a writable page, complain.
- //
- if ( !((gpage->u.inuse.type_info & PGT_type_mask) ==
- PGT_writable_page) )
- {
- printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW "
- "t=%" PRtype_info " mfn=%lx\n",
- d->domain_id, l1mfn, i,
- gpage->u.inuse.type_info, mfn);
- errors++;
- }
-
- if ( shadow_refcounts &&
- page_is_page_table(gpage) &&
- ! page_out_of_sync(gpage) )
- {
- printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW of "
- "page table mfn=%lx\n",
- d->domain_id, l1mfn, i, mfn);
- errors++;
- }
- }
-
- if ( page_get_owner(gpage) != d )
- {
- printk("Audit %d: [l1mfn=%lx,i=%x] Skip foreign page "
- "dom=%p (id=%d) mfn=%lx c=%08x t=%"
- PRtype_info "\n",
- d->domain_id, l1mfn, i,
- page_get_owner(gpage),
- page_get_owner(gpage)->domain_id,
- mfn,
- gpage->count_info,
- gpage->u.inuse.type_info);
- continue;
- }
- }
-
- adjust(gpage, (l1e_get_flags(pt[i]) & _PAGE_RW) ? 1 : 0);
- }
- }
-
- unmap_domain_page(pt);
- }
-
- void adjust_shadow_tables(void)
- {
- struct shadow_status *a;
- unsigned long smfn, gmfn;
- struct page_info *page;
- int i;
-
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
- smfn = a->smfn;
- page = mfn_to_page(smfn);
-
- switch ( a->gpfn_and_flags & PGT_type_mask ) {
- case PGT_writable_pred:
- break;
- case PGT_snapshot:
- adjust(mfn_to_page(gmfn), 0);
- break;
- case PGT_l1_shadow:
- adjust(mfn_to_page(gmfn), 0);
- if ( shadow_refcounts )
- adjust_l1_page(smfn);
- if ( page->u.inuse.type_info & PGT_pinned )
- adjust(page, 0);
- break;
- case PGT_hl2_shadow:
- adjust(mfn_to_page(gmfn), 0);
- if ( shadow_refcounts )
- adjust_hl2_page(smfn);
- if ( page->u.inuse.type_info & PGT_pinned )
- adjust(page, 0);
- break;
- case PGT_l2_shadow:
- adjust(mfn_to_page(gmfn), 0);
- adjust_l2_page(smfn, 1);
- if ( page->u.inuse.type_info & PGT_pinned )
- adjust(page, 0);
- break;
- default:
- BUG();
- break;
- }
-
- a = a->next;
- }
- }
- }
-
- void adjust_oos_list(void)
- {
- struct out_of_sync_entry *oos;
-
- if ( (oos = d->arch.out_of_sync) )
- ASSERT(shadow_enabled);
-
- while ( oos )
- {
- adjust(mfn_to_page(oos->gmfn), 0);
-
- // Only use entries that have low bits clear...
- //
- if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
- adjust(mfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0);
-
- if ( oos->snapshot_mfn != SHADOW_SNAPSHOT_ELSEWHERE )
- adjust(mfn_to_page(oos->snapshot_mfn), 0);
-
- oos = oos->next;
- oos_count++;
- }
- }
-
- void adjust_for_pgtbase(void)
- {
- struct vcpu *v;
-
- for_each_vcpu(d, v)
- {
- if ( !pagetable_is_null(v->arch.guest_table) )
- adjust(mfn_to_page(pagetable_get_pfn(v->arch.guest_table)),
- !shadow_mode_refcounts(d));
- if ( !pagetable_is_null(v->arch.shadow_table) )
- adjust(mfn_to_page(pagetable_get_pfn(v->arch.shadow_table)),
- 0);
- if ( v->arch.monitor_shadow_ref )
- adjust(mfn_to_page(v->arch.monitor_shadow_ref), 0);
- }
- }
-
- void adjust_guest_pages(void)
- {
- struct list_head *list_ent = d->page_list.next;
- struct page_info *page;
- unsigned long mfn, snapshot_mfn;
-
- while ( list_ent != &d->page_list )
- {
- u32 page_type;
-
- page = list_entry(list_ent, struct page_info, list);
- snapshot_mfn = mfn = page_to_mfn(page);
- page_type = page->u.inuse.type_info & PGT_type_mask;
-
- BUG_ON(page_get_owner(page) != d);
-
- page_count++;
-
- if ( shadow_enabled && !shadow_refcounts &&
- page_out_of_sync(page) )
- {
- unsigned long gpfn = mfn_to_gmfn(d, mfn);
- ASSERT( VALID_M2P(gpfn) );
- snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
- ASSERT( snapshot_mfn );
- }
-
- switch ( page_type )
- {
- case PGT_l2_page_table:
- l2++;
-
- if ( noisy )
- {
- if ( shadow_refcounts )
- {
- printk("Audit %d: found an L2 guest page "
- "mfn=%lx t=%" PRtype_info " c=%08x while in shadow mode\n",
- d->domain_id, mfn, page->u.inuse.type_info,
- page->count_info);
- errors++;
- }
-
- if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
- {
- if ( (page->u.inuse.type_info & PGT_validated) !=
- PGT_validated )
- {
- printk("Audit %d: L2 mfn=%lx not validated %"
- PRtype_info "\n",
- d->domain_id, mfn, page->u.inuse.type_info);
- errors++;
- }
-
- }
- }
-
- if ( page->u.inuse.type_info & PGT_pinned )
- adjust(page, 1);
-
- if ( page->u.inuse.type_info & PGT_validated )
- adjust_l2_page(snapshot_mfn, 0);
-
- break;
-
- case PGT_l1_page_table:
- l1++;
-
- if ( noisy )
- {
- if ( shadow_refcounts )
- {
- printk("found an L1 guest page mfn=%lx t=%"
- PRtype_info " c=%08x "
- "while in shadow mode\n",
- mfn, page->u.inuse.type_info, page->count_info);
- errors++;
- }
-
- if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
- {
- if ( (page->u.inuse.type_info & PGT_validated) !=
- PGT_validated )
- {
- printk("Audit %d: L1 not validated mfn=%lx t=%"
- PRtype_info "\n",
- d->domain_id, mfn, page->u.inuse.type_info);
- errors++;
- }
- }
- }
-
- if ( page->u.inuse.type_info & PGT_pinned )
- adjust(page, 1);
-
- if ( page->u.inuse.type_info & PGT_validated )
- adjust_l1_page(snapshot_mfn);
-
- break;
-
- case PGT_gdt_page:
- ASSERT( !page_out_of_sync(page) );
- adjust(page, 1);
- break;
-
- case PGT_ldt_page:
- ASSERT( !page_out_of_sync(page) );
- adjust(page, 1);
- break;
-
- case PGT_writable_page:
- if ( shadow_refcounts )
- {
- // In shadow mode, writable pages can get pinned by
- // paravirtualized guests that think they are pinning
- // their L1s and/or L2s.
- //
- if ( page->u.inuse.type_info & PGT_pinned )
- adjust(page, 1);
- }
- }
-
- list_ent = page->list.next;
- }
- }
-
- adjust_for_pgtbase();
-
- adjust_guest_pages();
-
- if ( shadow_enabled )
- {
- adjust_oos_list();
- adjust_shadow_tables();
- }
-
- adjust(virt_to_page(d->shared_info), 1);
-
- return errors;
-}
-
-
-#ifndef NDEBUG
-
-void audit_pagelist(struct domain *d)
-{
- struct list_head *list_ent;
- int xenpages, totpages;
-
- list_ent = d->xenpage_list.next;
- for ( xenpages = 0; (list_ent != &d->xenpage_list); xenpages++ )
- {
- list_ent = list_ent->next;
- }
- list_ent = d->page_list.next;
- for ( totpages = 0; (list_ent != &d->page_list); totpages++ )
- {
- list_ent = list_ent->next;
- }
-
- if ( xenpages != d->xenheap_pages ||
- totpages != d->tot_pages )
- {
- printk("ARGH! dom %d: xen=%d %d, pages=%d %d\n", d->domain_id,
- xenpages, d->xenheap_pages,
- totpages, d->tot_pages );
- }
-}
-
-void _audit_domain(struct domain *d, int flags)
-{
- int shadow_refcounts = !!shadow_mode_refcounts(d);
-
- void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn,
- unsigned long mfn)
- {
- struct page_info *page = mfn_to_page(mfn);
- l1_pgentry_t *pt = map_domain_page(mfn);
- int i;
-
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- {
- if ( (l1e_get_flags(pt[i]) & _PAGE_PRESENT) &&
- (l1e_get_pfn(pt[i]) == xmfn) )
- printk(" found dom=%d mfn=%lx t=%" PRtype_info " c=%08x "
- "pt[i=%x]=%" PRIpte "\n",
- d->domain_id, mfn, page->u.inuse.type_info,
- page->count_info, i, l1e_get_intpte(pt[i]));
- }
-
- unmap_domain_page(pt);
- }
-
- void scan_for_pfn_in_grant_table(struct domain *d, unsigned xmfn)
- {
- int i;
- struct active_grant_entry *act = d->grant_table->active;
-
- spin_lock(&d->grant_table->lock);
-
- for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
- {
- if ( act[i].pin && (act[i].frame == xmfn) )
- {
- printk(" found active grant table entry i=%d dom=%d pin=%d\n",
- i, act[i].domid, act[i].pin);
- }
- }
-
- spin_unlock(&d->grant_table->lock);
- }
-
- void scan_for_pfn(struct domain *d, unsigned long xmfn)
- {
- scan_for_pfn_in_grant_table(d, xmfn);
-
- if ( !shadow_mode_enabled(d) )
- {
- struct list_head *list_ent = d->page_list.next;
- struct page_info *page;
-
- while ( list_ent != &d->page_list )
- {
- page = list_entry(list_ent, struct page_info, list);
-
- switch ( page->u.inuse.type_info & PGT_type_mask )
- {
- case PGT_l1_page_table:
- case PGT_l2_page_table:
- scan_for_pfn_in_mfn(d, xmfn, page_to_mfn(page));
- break;
- default:
- break;
- }
-
- list_ent = page->list.next;
- }
- }
- else
- {
- struct shadow_status *a;
- int i;
-
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- switch ( a->gpfn_and_flags & PGT_type_mask )
- {
- case PGT_l1_shadow:
- case PGT_l2_shadow:
- case PGT_hl2_shadow:
- scan_for_pfn_in_mfn(d, xmfn, a->smfn);
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- break;
- default:
- BUG();
- break;
- }
- a = a->next;
- }
- }
- }
- }
-
- void scan_for_pfn_remote(unsigned long xmfn)
- {
- struct domain *e;
- for_each_domain ( e )
- scan_for_pfn( e, xmfn );
- }
-
- unsigned long mfn;
- struct list_head *list_ent;
- struct page_info *page;
- int errors = 0;
-
- if ( (d != current->domain) && shadow_mode_translate(d) )
- {
- printk("skipping audit domain of translated domain %d "
- "from other context\n",
- d->domain_id);
- return;
- }
-
- if ( d != current->domain )
- domain_pause(d);
-
- // Maybe we should just be using BIGLOCK?
- //
- if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
- shadow_lock(d);
-
- spin_lock(&d->page_alloc_lock);
-
- audit_pagelist(d);
-
- /* PHASE 0 */
-
- list_ent = d->page_list.next;
- while ( list_ent != &d->page_list )
- {
- u32 page_type;
- unsigned long pfn;
-
- page = list_entry(list_ent, struct page_info, list);
- mfn = page_to_mfn(page);
- page_type = page->u.inuse.type_info & PGT_type_mask;
-
- BUG_ON(page_get_owner(page) != d);
-
- if ( (page->u.inuse.type_info & PGT_count_mask) >
- (page->count_info & PGC_count_mask) )
- {
- printk("taf(%" PRtype_info ") > caf(%08x) mfn=%lx\n",
- page->u.inuse.type_info, page->count_info, mfn);
- errors++;
- }
-
- if ( shadow_mode_refcounts(d) &&
- (page_type == PGT_writable_page) &&
- !(page->u.inuse.type_info & PGT_validated) )
- {
- printk("shadow mode writable page not validated mfn=%lx "
- "t=%" PRtype_info " c=%08x\n",
- mfn, page->u.inuse.type_info, page->count_info);
- errors++;
- }
-
-#if 0 /* SYSV shared memory pages plus writeable files. */
- if ( page_type == PGT_writable_page &&
- (page->u.inuse.type_info & PGT_count_mask) > 1 )
- {
- printk("writeable page with type count >1: "
- "mfn=%lx t=%" PRtype_info " c=%08x\n",
- mfn,
- page->u.inuse.type_info,
- page->count_info );
- errors++;
- scan_for_pfn_remote(mfn);
- }
-#endif
-
- if ( page_type == PGT_none &&
- (page->u.inuse.type_info & PGT_count_mask) > 0 )
- {
- printk("normal page with type count >0: mfn=%lx t=%" PRtype_info " c=%08x\n",
- mfn,
- page->u.inuse.type_info,
- page->count_info );
- errors++;
- }
-
- if ( page_out_of_sync(page) )
- {
- if ( !page_is_page_table(page) )
- {
- printk("out of sync page mfn=%lx is not a page table\n", mfn);
- errors++;
- }
- pfn = mfn_to_gmfn(d, mfn);
- if ( !__shadow_status(d, pfn, PGT_snapshot) )
- {
- printk("out of sync page mfn=%lx doesn't have a snapshot\n",
- mfn);
- errors++;
- }
- if ( shadow_refcounts
- ? (page_type != PGT_writable_page)
- : !(page_type && (page_type <= PGT_l4_page_table)) )
- {
- printk("out of sync page mfn=%lx has strange type "
- "t=%" PRtype_info " c=%08x\n",
- mfn, page->u.inuse.type_info, page->count_info);
- errors++;
- }
- }
-
- /* Use tlbflush_timestamp to store original type_info. */
- page->tlbflush_timestamp = page->u.inuse.type_info;
-
- list_ent = page->list.next;
- }
-
- /* PHASE 1 */
- io_mappings = lowmem_mappings = 0;
-
- errors += audit_adjust_pgtables(d, -1, 1);
-
- if ( !(flags & AUDIT_QUIET) &&
- ((io_mappings > 0) || (lowmem_mappings > 0)) )
- printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
- d->domain_id, lowmem_mappings, io_mappings);
-
- /* PHASE 2 */
-
- list_ent = d->page_list.next;
- while ( list_ent != &d->page_list )
- {
- page = list_entry(list_ent, struct page_info, list);
- mfn = page_to_mfn(page);
-
- switch ( page->u.inuse.type_info & PGT_type_mask)
- {
- case PGT_l1_page_table:
- case PGT_l2_page_table:
- case PGT_l3_page_table:
- case PGT_l4_page_table:
- if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
- {
- printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n",
- d->domain_id, page->u.inuse.type_info,
- page->tlbflush_timestamp,
- page->count_info, mfn);
- errors++;
- scan_for_pfn_remote(mfn);
- }
- break;
- case PGT_none:
- case PGT_writable_page:
- case PGT_gdt_page:
- case PGT_ldt_page:
- if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
- {
- printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n",
- d->domain_id, page->u.inuse.type_info,
- page->tlbflush_timestamp,
- page->count_info, mfn);
- //errors++;
- }
- break;
- default:
- BUG(); // XXX fix me...
- }
-
- if ( (page->count_info & PGC_count_mask) != 1 )
- {
- printk("Audit %d: gen count!=1 (c=%x) t=%" PRtype_info " ot=%x mfn=%lx\n",
- d->domain_id,
- page->count_info,
- page->u.inuse.type_info,
- page->tlbflush_timestamp, mfn );
- //errors++;
- scan_for_pfn_remote(mfn);
- }
-
- list_ent = page->list.next;
- }
-
- if ( shadow_mode_enabled(d) )
- {
- struct shadow_status *a;
- struct page_info *page;
- u32 page_type;
- int i;
-
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- page = mfn_to_page(a->smfn);
- page_type = a->gpfn_and_flags & PGT_type_mask;
-
- switch ( page_type ) {
- case PGT_l1_shadow:
- case PGT_l2_shadow:
- case PGT_hl2_shadow:
- case PGT_snapshot:
- if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) ||
- (page->count_info != 0) )
- {
- printk("Audit %d: shadow page counts wrong "
- "mfn=%lx t=%" PRtype_info " c=%08x\n",
- d->domain_id, page_to_mfn(page),
- page->u.inuse.type_info,
- page->count_info);
- printk("a->gpfn_and_flags=%"PRIx64"\n",
- (u64)a->gpfn_and_flags);
- errors++;
- }
- break;
- case PGT_writable_pred:
- // XXX - nothing to check?
- break;
-
- default:
- BUG();
- break;
- }
-
- a = a->next;
- }
- }
- }
-
- /* PHASE 3 */
- ctot = ttot = page_count = l1 = l2 = oos_count = 0;
-
- audit_adjust_pgtables(d, 1, 0);
-
-#if 0
- // This covers our sins of trashing the tlbflush_timestamps...
- //
- local_flush_tlb();
-#endif
-
- spin_unlock(&d->page_alloc_lock);
-
- if ( !(flags & AUDIT_QUIET) )
- printk("Audit dom%d Done. "
- "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n",
- d->domain_id, page_count, oos_count, l1, l2, ctot, ttot);
-
- if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
- shadow_unlock(d);
-
- if ( d != current->domain )
- domain_unpause(d);
-
- if ( errors && !(flags & AUDIT_ERRORS_OK) )
- BUG();
-}
-
-void audit_domains(void)
-{
- struct domain *d;
- for_each_domain ( d )
- audit_domain(d);
-}
-
-void audit_domains_key(unsigned char key)
-{
- audit_domains();
-}
-#endif
diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c
index 214b0c58f8..0038112d63 100644
--- a/xen/arch/x86/dom0_ops.c
+++ b/xen/arch/x86/dom0_ops.c
@@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op, XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
d = find_domain_by_id(op->u.shadow_control.domain);
if ( d != NULL )
{
- ret = shadow_mode_control(d, &op->u.shadow_control);
+ ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op);
put_domain(d);
copy_to_guest(u_dom0_op, op, 1);
}
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 4dd71b1859..65e4dc4b9c 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
v->arch.perdomain_ptes =
d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
- v->arch.guest_vtable = __linear_l2_table;
- v->arch.shadow_vtable = __shadow_linear_l2_table;
-#if defined(__x86_64__)
- v->arch.guest_vl3table = __linear_l3_table;
- v->arch.guest_vl4table = __linear_l4_table;
-#endif
-
pae_l3_cache_init(&v->arch.pae_l3_cache);
return v;
@@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d)
{
l1_pgentry_t gdt_l1e;
int vcpuid, pdpt_order;
-#ifdef __x86_64__
int i;
-#endif
pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
@@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d)
#endif /* __x86_64__ */
- shadow_lock_init(d);
- INIT_LIST_HEAD(&d->arch.free_shadow_frames);
+ shadow2_lock_init(d);
+ for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
+ INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]);
+ INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist);
+ INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse);
+ INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows);
if ( !is_idle_domain(d) )
{
@@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d)
void arch_domain_destroy(struct domain *d)
{
+ shadow2_final_teardown(d);
+
free_xenheap_pages(
d->arch.mm_perdomain_pt,
get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
@@ -328,14 +325,6 @@ int arch_set_info_guest(
if ( !hvm_initialize_guest_resources(v) )
return -EINVAL;
}
- else if ( shadow_mode_refcounts(d) )
- {
- if ( !get_page(mfn_to_page(cr3_pfn), d) )
- {
- destroy_gdt(v);
- return -EINVAL;
- }
- }
else
{
if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
@@ -344,9 +333,16 @@ int arch_set_info_guest(
destroy_gdt(v);
return -EINVAL;
}
- }
+ }
- update_pagetables(v);
+ /* Shadow2: make sure the domain has enough shadow memory to
+ * boot another vcpu */
+ if ( shadow2_mode_enabled(d)
+ && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) )
+ {
+ destroy_gdt(v);
+ return -ENOMEM;
+ }
if ( v->vcpu_id == 0 )
update_domain_wallclock_time(d);
@@ -354,6 +350,11 @@ int arch_set_info_guest(
/* Don't redo final setup */
set_bit(_VCPUF_initialised, &v->vcpu_flags);
+ if ( shadow2_mode_enabled(d) )
+ shadow2_update_paging_modes(v);
+
+ update_cr3(v);
+
return 0;
}
@@ -669,7 +670,6 @@ static void __context_switch(void)
loaddebug(&n->arch.guest_context, 6);
loaddebug(&n->arch.guest_context, 7);
}
-
n->arch.ctxt_switch_to(n);
}
@@ -927,29 +927,34 @@ void domain_relinquish_resources(struct domain *d)
/* Drop the in-use references to page-table bases. */
for_each_vcpu ( d, v )
{
- if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
+ /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
+ * or sh2_update_paging_modes()) */
+ pfn = pagetable_get_pfn(v->arch.guest_table);
+ if ( pfn != 0 )
{
- if ( !shadow_mode_refcounts(d) )
- put_page_type(mfn_to_page(pfn));
- put_page(mfn_to_page(pfn));
-
+ if ( shadow2_mode_refcounts(d) )
+ put_page(mfn_to_page(pfn));
+ else
+ put_page_and_type(mfn_to_page(pfn));
v->arch.guest_table = pagetable_null();
}
- if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
+#ifdef __x86_64__
+ /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+ pfn = pagetable_get_pfn(v->arch.guest_table_user);
+ if ( pfn != 0 )
{
- if ( !shadow_mode_refcounts(d) )
- put_page_type(mfn_to_page(pfn));
- put_page(mfn_to_page(pfn));
-
+ put_page_and_type(mfn_to_page(pfn));
v->arch.guest_table_user = pagetable_null();
}
+#endif
}
if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) )
hvm_relinquish_guest_resources(d);
- shadow_mode_disable(d);
+ /* Tear down shadow mode stuff. */
+ shadow2_teardown(d);
/*
* Relinquish GDT mappings. No need for explicit unmapping of the LDT as
@@ -964,26 +969,23 @@ void domain_relinquish_resources(struct domain *d)
/* Free page used by xen oprofile buffer */
free_xenoprof_pages(d);
-
}
void arch_dump_domain_info(struct domain *d)
{
- if ( shadow_mode_enabled(d) )
+ if ( shadow2_mode_enabled(d) )
{
- printk(" shadow mode: ");
- if ( shadow_mode_refcounts(d) )
+ printk(" shadow2 mode: ");
+ if ( d->arch.shadow2_mode & SHM2_enable )
+ printk("enabled ");
+ if ( shadow2_mode_refcounts(d) )
printk("refcounts ");
- if ( shadow_mode_write_all(d) )
- printk("write_all ");
- if ( shadow_mode_log_dirty(d) )
+ if ( shadow2_mode_log_dirty(d) )
printk("log_dirty ");
- if ( shadow_mode_translate(d) )
+ if ( shadow2_mode_translate(d) )
printk("translate ");
- if ( shadow_mode_external(d) )
+ if ( shadow2_mode_external(d) )
printk("external ");
- if ( shadow_mode_wr_pt_pte(d) )
- printk("wr_pt_pte ");
printk("\n");
}
}
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index a1d95f77c6..5d270336fc 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -683,8 +683,11 @@ int construct_dom0(struct domain *d,
for ( i = 1; i < opt_dom0_max_vcpus; i++ )
(void)alloc_vcpu(d, i, i);
- /* Set up monitor table */
- update_pagetables(v);
+ /* Set up CR3 value for write_ptbase */
+ if ( shadow2_mode_enabled(v->domain) )
+ shadow2_update_paging_modes(v);
+ else
+ update_cr3(v);
/* Install the new page tables. */
local_irq_disable();
@@ -796,10 +799,8 @@ int construct_dom0(struct domain *d,
new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
if ( opt_dom0_shadow )
- {
- shadow_mode_enable(d, SHM_enable);
- update_pagetables(v);
- }
+ if ( shadow2_test_enable(d) == 0 )
+ shadow2_update_paging_modes(v);
if ( supervisor_mode_kernel )
{
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index fd4b69423b..6ffbf751f9 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -30,6 +30,7 @@
#include <xen/hypercall.h>
#include <xen/guest_access.h>
#include <xen/event.h>
+#include <xen/shadow.h>
#include <asm/current.h>
#include <asm/e820.h>
#include <asm/io.h>
@@ -42,10 +43,6 @@
#include <asm/spinlock.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <public/version.h>
@@ -61,7 +58,7 @@ struct hvm_function_table hvm_funcs;
static void hvm_zap_mmio_range(
struct domain *d, unsigned long pfn, unsigned long nr_pfn)
{
- unsigned long i, val = INVALID_MFN;
+ unsigned long i;
ASSERT(d == current->domain);
@@ -70,7 +67,8 @@ static void hvm_zap_mmio_range(
if ( pfn + i >= 0xfffff )
break;
- __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
+ if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) )
+ guest_remove_page(d, pfn + i);
}
}
@@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d)
if ( !hvm_guest(v) || (v->vcpu_id != 0) )
return;
+#if 0 /* SHADOW2 does not have this */
if ( shadow_direct_map_init(d) == 0 )
{
printk("Can not allocate shadow direct map for HVM domain.\n");
domain_crash_synchronous();
}
+#endif
hvm_zap_iommu_pages(d);
@@ -380,6 +380,8 @@ void hvm_hlt(unsigned long rflags)
*/
int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
{
+ struct vcpu *v = current;
+ unsigned long gfn;
unsigned long mfn;
char *addr;
int count;
@@ -389,10 +391,9 @@ int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
if (count > size)
count = size;
- if (hvm_paging_enabled(current))
- mfn = gva_to_mfn(vaddr);
- else
- mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT);
+ gfn = shadow2_gva_to_gfn(v, vaddr);
+ mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
+
if (mfn == INVALID_MFN)
return 0;
@@ -545,7 +546,7 @@ void hvm_do_hypercall(struct cpu_user_regs *pregs)
return;
}
- if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 )
+ if ( current->arch.shadow2->guest_levels == 4 )
{
pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
pregs->rsi,
diff --git a/xen/arch/x86/hvm/platform.c b/xen/arch/x86/hvm/platform.c
index f1bfd4c479..920e7786a0 100644
--- a/xen/arch/x86/hvm/platform.c
+++ b/xen/arch/x86/hvm/platform.c
@@ -21,7 +21,7 @@
#include <xen/config.h>
#include <xen/types.h>
#include <xen/mm.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
#include <xen/domain_page.h>
#include <asm/page.h>
#include <xen/event.h>
@@ -35,9 +35,6 @@
#include <xen/lib.h>
#include <xen/sched.h>
#include <asm/current.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
#define DECODE_success 1
#define DECODE_failure 0
@@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
if (pvalid) {
if (hvm_paging_enabled(current))
- p->u.pdata = (void *) gva_to_gpa(value);
+ p->u.data = shadow2_gva_to_gpa(current, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
@@ -774,7 +771,7 @@ void send_mmio_req(
if (pvalid) {
if (hvm_paging_enabled(v))
- p->u.pdata = (void *) gva_to_gpa(value);
+ p->u.data = shadow2_gva_to_gpa(v, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index f7ae00937e..c6b3e813d5 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -26,9 +26,10 @@
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/hypercall.h>
+#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/io.h>
-#include <asm/shadow.h>
+#include <asm/shadow2.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -43,10 +44,6 @@
#include <asm/hvm/svm/emulate.h>
#include <asm/hvm/svm/vmmcall.h>
#include <asm/hvm/svm/intr.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
#include <public/sched.h>
#define SVM_EXTRA_DEBUG
@@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v)
return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
}
-static int svm_instruction_length(struct vcpu *v)
+int svm_guest_x86_mode(struct vcpu *v)
{
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
@@ -423,10 +420,20 @@ static int svm_instruction_length(struct vcpu *v)
mode = vmcb->cs.attributes.fields.l ? 8 : 4;
else
mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
- return svm_instrlen(guest_cpu_user_regs(), mode);
+ return mode;
}
-static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+int svm_instruction_length(struct vcpu *v)
+{
+ return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v));
+}
+
+void svm_update_host_cr3(struct vcpu *v)
+{
+ /* SVM doesn't have a HOST_CR3 equivalent to update. */
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
{
switch ( num )
{
@@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
return v->arch.hvm_svm.cpu_cr2;
case 3:
return v->arch.hvm_svm.cpu_cr3;
+ case 4:
+ return v->arch.hvm_svm.cpu_shadow_cr4;
default:
BUG();
}
@@ -526,8 +535,6 @@ static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
}
-
-
int svm_dbg_on = 0;
static inline int svm_do_debugout(unsigned long exit_code)
@@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs(
svm_load_cpu_user_regs(v, regs);
}
+int svm_long_mode_enabled(struct vcpu *v)
+{
+ return SVM_LONG_GUEST(v);
+}
+
static void arch_svm_do_launch(struct vcpu *v)
@@ -726,7 +738,6 @@ static void svm_ctxt_switch_to(struct vcpu *v)
static void svm_final_setup_guest(struct vcpu *v)
{
struct domain *d = v->domain;
- struct vcpu *vc;
v->arch.schedule_tail = arch_svm_do_launch;
v->arch.ctxt_switch_from = svm_ctxt_switch_from;
@@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct vcpu *v)
if ( v != d->vcpu[0] )
return;
- /* Initialize monitor page table */
- for_each_vcpu( d, vc )
- vc->arch.monitor_table = pagetable_null();
+ if ( !shadow2_mode_external(d) )
+ {
+ DPRINTK("Can't init HVM for dom %u vcpu %u: "
+ "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
+ domain_crash(d);
+ }
/*
* Required to do this once per domain
@@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct vcpu *v)
*/
memset(&d->shared_info->evtchn_mask[0], 0xff,
sizeof(d->shared_info->evtchn_mask));
-
- /*
- * Put the domain in shadow mode even though we're going to be using
- * the shared 1:1 page table initially. It shouldn't hurt
- */
- shadow_mode_enable(d, SHM_enable|SHM_refcounts|
- SHM_translate|SHM_external|SHM_wr_pt_pte);
}
@@ -809,9 +816,13 @@ int start_svm(void)
hvm_funcs.realmode = svm_realmode;
hvm_funcs.paging_enabled = svm_paging_enabled;
+ hvm_funcs.long_mode_enabled = svm_long_mode_enabled;
+ hvm_funcs.guest_x86_mode = svm_guest_x86_mode;
hvm_funcs.instruction_length = svm_instruction_length;
hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
+ hvm_funcs.update_host_cr3 = svm_update_host_cr3;
+
hvm_funcs.stts = svm_stts;
hvm_funcs.set_tsc_offset = svm_set_tsc_offset;
@@ -834,7 +845,6 @@ static void svm_relinquish_guest_resources(struct domain *d)
continue;
destroy_vmcb(&v->arch.hvm_svm);
- free_monitor_pagetable(v);
kill_timer(&v->arch.hvm_vcpu.hlt_timer);
if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
{
@@ -851,8 +861,6 @@ static void svm_relinquish_guest_resources(struct domain *d)
if ( d->arch.hvm_domain.buffered_io_va )
unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
- shadow_direct_map_clean(d);
}
@@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
{
struct vcpu *v = current;
unsigned long eip;
- unsigned long gpa; /* FIXME: PAE */
int result;
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
va, eip, (unsigned long)regs->error_code);
//#endif
- if ( !svm_paging_enabled(v) )
- {
- if ( shadow_direct_map_fault(va, regs) )
- return 1;
-
- handle_mmio(va, va);
- return 1;
- }
-
-
- gpa = gva_to_gpa(va);
-
- /* Use 1:1 page table to identify MMIO address space */
- if (mmio_space(gpa))
- {
- /* No support for APIC */
- if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
- {
- int inst_len;
- inst_len = svm_instruction_length(v);
- if (inst_len == -1)
- {
- printf("%s: INST_LEN - Unable to decode properly\n", __func__);
- domain_crash_synchronous();
- }
-
- __update_guest_eip(vmcb, inst_len);
-
- return 1;
- }
-
- handle_mmio(va, gpa);
-
- return 1;
- }
-
- result = shadow_fault(va, regs);
+ result = shadow2_fault(va, regs);
if( result ) {
/* Let's make sure that the Guest TLB is flushed */
@@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb, unsigned long input,
clear_bit(X86_FEATURE_APIC, &edx);
}
-#if CONFIG_PAGING_LEVELS < 3
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
-#else
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
- }
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
+ clear_bit(X86_FEATURE_PAE, &edx);
+ clear_bit(X86_FEATURE_PSE36, &edx);
+
/* Clear out reserved bits. */
ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
@@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb, unsigned long input,
clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
#endif
-#if CONFIG_PAGING_LEVELS < 3
- clear_bit(X86_FEATURE_NX & 31, &edx);
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
-#else
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
- {
- clear_bit(X86_FEATURE_NX & 31, &edx);
- clear_bit(X86_FEATURE_PAE, &edx);
- }
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
- }
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
+ clear_bit(X86_FEATURE_PAE, &edx);
+ clear_bit(X86_FEATURE_PSE36, &edx);
/* Make SVM feature invisible to the guest. */
clear_bit(X86_FEATURE_SVME & 31, &ecx);
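
The restructured CPUID masking above hides PAE from the guest unless the domain's PAE parameter is set (and always on builds with fewer than three paging levels), while PSE36 is cleared unconditionally. A small self-contained sketch of that control flow, with made-up feature bit numbers:

#include <stdio.h>

#define FEAT_PAE   6          /* hypothetical bit positions for the sketch */
#define FEAT_PSE36 17

#define CONFIG_PAGING_LEVELS 3 /* build-time choice, as in the hypervisor */

static void clear_bit(int nr, unsigned int *word)
{
    *word &= ~(1u << nr);
}

/* Mirrors the restructured masking: PAE is hidden unless the domain's PAE
 * parameter is set (or unconditionally on <3-level builds); PSE36 is always
 * hidden from the guest. */
static unsigned int mask_features(unsigned int edx, int pae_param)
{
#if CONFIG_PAGING_LEVELS >= 3
    if ( !pae_param )
#endif
        clear_bit(FEAT_PAE, &edx);
    clear_bit(FEAT_PSE36, &edx);
    return edx;
}

int main(void)
{
    unsigned int edx = (1u << FEAT_PAE) | (1u << FEAT_PSE36);
    printf("PAE param off: %#x\n", mask_features(edx, 0));
    printf("PAE param on:  %#x\n", mask_features(edx, 1));
    return 0;
}
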
@@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long value)
unsigned long mfn;
int paging_enabled;
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+ unsigned long old_base_mfn;
ASSERT(vmcb);
@@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long value)
set_bit(SVM_CPU_STATE_LMA_ENABLED,
&v->arch.hvm_svm.cpu_state);
vmcb->efer |= (EFER_LMA | EFER_LME);
- if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
}
- else
#endif /* __x86_64__ */
- {
-#if CONFIG_PAGING_LEVELS >= 3
- /* seems it's a 32-bit or 32-bit PAE guest */
- if ( test_bit(SVM_CPU_STATE_PAE_ENABLED,
- &v->arch.hvm_svm.cpu_state) )
- {
- /* The guest enables PAE first and then it enables PG, it is
- * really a PAE guest */
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
- else
- {
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
- }
-#endif
- }
/* Now arch.guest_table points to machine physical. */
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if ( old_base_mfn )
+ put_page(mfn_to_page(old_base_mfn));
+ shadow2_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
-
- /* arch->shadow_table should hold the next CR3 for shadow */
- HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n",
- v->arch.hvm_svm.cpu_cr3, mfn);
-
- return 1;
}
if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
@@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long value)
svm_inject_exception(v, TRAP_gp_fault, 1, 0);
return 0;
}
-
- clear_all_shadow_status( v->domain );
+ shadow2_update_paging_modes(v);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
}
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
/* we should take care of this kind of situation */
- clear_all_shadow_status(v->domain);
+ shadow2_update_paging_modes(v);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
}
return 1;
@@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow_sync_all(v->domain);
+ shadow2_update_cr3(v);
}
else
{
@@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
/*
* arch.shadow_table should now hold the next CR3 for shadow
*/
-#if CONFIG_PAGING_LEVELS >= 3
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
- shadow_sync_all(v->domain);
-#endif
v->arch.hvm_svm.cpu_cr3 = value;
- update_pagetables(v);
+ update_cr3(v);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
- vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
}
break;
}
@@ -1839,12 +1755,6 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
#if CONFIG_PAGING_LEVELS >= 3
unsigned long mfn, old_base_mfn;
- if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
-
if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) ||
!get_page(mfn_to_page(mfn), v->domain) )
@@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
domain_crash_synchronous(); /* need to take a clean path */
}
- old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
- if ( old_base_mfn )
- put_page(mfn_to_page(old_base_mfn));
-
/*
* Now arch.guest_table points to machine physical.
*/
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if ( old_base_mfn )
+ put_page(mfn_to_page(old_base_mfn));
+ shadow2_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
- vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
/*
* arch->shadow_table should hold the next CR3 for shadow
@@ -1878,33 +1787,6 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
v->arch.hvm_svm.cpu_cr3, mfn);
#endif
}
- else
- {
- /* The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
- if ( (v->domain->arch.ops != NULL) &&
- v->domain->arch.ops->guest_paging_levels == PAGING_L2)
- {
- /* Seems the guest first enables PAE without enabling PG,
- * it must enable PG after that, and it is a 32-bit PAE
- * guest */
-
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3))
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
- else
- {
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4))
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
-#endif
- }
}
else if (value & X86_CR4_PAE) {
set_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
@@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
{
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow_sync_all(v->domain);
+ shadow2_update_paging_modes(v);
}
break;
}
@@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs)
/* Overkill, we may not need this */
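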
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow_invlpg(v, g_vaddr);
+ shadow2_invlpg(v, g_vaddr);
}
@@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned long gva)
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
unsigned long gpa;
- gpa = gva_to_gpa( gva );
+ gpa = shadow2_gva_to_gpa(current, gva);
printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
if( !svm_paging_enabled(v) || mmio_space(gpa) )
return;
@@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned long gva)
__copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ],
sizeof(gpte) );
printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
- __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
+
+ BUG(); // need to think about this, and convert usage of
+ // phys_to_machine_mapping to use pagetable format...
+ __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
sizeof(spte) );
+
printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
}
#endif /* SVM_WALK_GUEST_PAGES */
@@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs)
if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF)
{
- if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+ if (svm_paging_enabled(v) &&
+ !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
{
printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, "
"gpa=%llx\n", intercepts_counter,
@@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs)
(unsigned long long) vmcb->exitinfo1,
(unsigned long long) vmcb->exitinfo2,
(unsigned long long) vmcb->exitintinfo.bytes,
- (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) );
+ (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
}
else
{
@@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs)
&& ( ( vmcb->exitinfo2 == vmcb->rip )
|| vmcb->exitintinfo.bytes) )
{
- if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+ if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
walk_shadow_and_guest_pt( vmcb->exitinfo2 );
}
#endif
diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c
index 349381e3ec..82f7195e73 100644
--- a/xen/arch/x86/hvm/svm/vmcb.c
+++ b/xen/arch/x86/hvm/svm/vmcb.c
@@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v)
printk("%s: phys_table = %lx\n", __func__, pt);
}
- /* At launch we always use the phys_table */
- vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
+ /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */
+ vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
if (svm_dbg_on)
{
diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
index 7ebca89693..9cb27656c3 100644
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -21,7 +21,8 @@
#include <xen/types.h>
#include <xen/mm.h>
#include <xen/xmalloc.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
+#include <xen/domain_page.h>
#include <asm/page.h>
#include <xen/event.h>
#include <xen/trace.h>
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index ebd8a42f68..75de5f49ea 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -34,12 +34,8 @@
#include <asm/flushtlb.h>
#include <xen/event.h>
#include <xen/kernel.h>
-#include <asm/shadow.h>
#include <xen/keyhandler.h>
-
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
static int vmcs_size;
static int vmcs_order;
@@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu *v)
static void vmx_do_launch(struct vcpu *v)
{
-/* Update CR3, GDT, LDT, TR */
+/* Update CR3, CR0, CR4, GDT, LDT, TR */
unsigned int error = 0;
unsigned long cr0, cr4;
@@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v)
error |= __vmwrite(GUEST_TR_BASE, 0);
error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
- __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table));
+ shadow2_update_paging_modes(v);
+ printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
+ __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+ __vmwrite(HOST_CR3, v->arch.cr3);
v->arch.schedule_tail = arch_vmx_do_resume;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 658ee8ae73..0233f26595 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -26,9 +26,9 @@
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/hypercall.h>
+#include <xen/perfc.h>
#include <asm/current.h>
#include <asm/io.h>
-#include <asm/shadow.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -40,10 +40,7 @@
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/vmx/cpu.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <asm/hvm/vpic.h>
@@ -69,11 +66,16 @@ static int vmx_initialize_guest_resources(struct vcpu *v)
if ( v->vcpu_id != 0 )
return 1;
- for_each_vcpu ( d, vc )
+ if ( !shadow2_mode_external(d) )
{
- /* Initialize monitor page table */
- vc->arch.monitor_table = pagetable_null();
+ DPRINTK("Can't init HVM for dom %u vcpu %u: "
+ "not in shadow2 external mode\n",
+ d->domain_id, v->vcpu_id);
+ domain_crash(d);
+ }
+ for_each_vcpu ( d, vc )
+ {
memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
if ( (rc = vmx_create_vmcs(vc)) != 0 )
@@ -107,6 +109,7 @@ static int vmx_initialize_guest_resources(struct vcpu *v)
vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
+
}
/*
@@ -116,11 +119,6 @@ static int vmx_initialize_guest_resources(struct vcpu *v)
memset(&d->shared_info->evtchn_mask[0], 0xff,
sizeof(d->shared_info->evtchn_mask));
- /* Put the domain in shadow mode even though we're going to be using
- * the shared 1:1 page table initially. It shouldn't hurt */
- shadow_mode_enable(
- d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte);
-
return 1;
}
@@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resources(struct domain *d)
vmx_destroy_vmcs(v);
if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
continue;
- free_monitor_pagetable(v);
kill_timer(&v->arch.hvm_vcpu.hlt_timer);
if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
{
@@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resources(struct domain *d)
if ( d->arch.hvm_domain.buffered_io_va )
unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
- shadow_direct_map_clean(d);
}
#ifdef __x86_64__
@@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
vmx_vmcs_exit(v);
}
-static int vmx_realmode(struct vcpu *v)
-{
- unsigned long rflags;
-
- __vmread(GUEST_RFLAGS, &rflags);
- return rflags & X86_EFLAGS_VM;
-}
-
static int vmx_instruction_length(struct vcpu *v)
{
unsigned long inst_len;
@@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
return v->arch.hvm_vmx.cpu_cr2;
case 3:
return v->arch.hvm_vmx.cpu_cr3;
+ case 4:
+ return v->arch.hvm_vmx.cpu_shadow_cr4;
default:
BUG();
}
@@ -753,9 +742,13 @@ static void vmx_setup_hvm_funcs(void)
hvm_funcs.realmode = vmx_realmode;
hvm_funcs.paging_enabled = vmx_paging_enabled;
+ hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
+ hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
hvm_funcs.instruction_length = vmx_instruction_length;
hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
+ hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
+
hvm_funcs.stts = vmx_stts;
hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
@@ -855,53 +848,25 @@ static void inline __update_guest_eip(unsigned long inst_len)
__vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
}
-
static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
{
- unsigned long gpa; /* FIXME: PAE */
int result;
#if 0 /* keep for debugging */
{
- unsigned long eip;
+ unsigned long eip, cs;
+ __vmread(GUEST_CS_BASE, &cs);
__vmread(GUEST_RIP, &eip);
HVM_DBG_LOG(DBG_LEVEL_VMMU,
- "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
- va, eip, (unsigned long)regs->error_code);
+ "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
+ "eip = %lx, error_code = %lx\n",
+ va, cs, eip, (unsigned long)regs->error_code);
}
#endif
- if ( !vmx_paging_enabled(current) )
- {
- /* construct 1-to-1 direct mapping */
- if ( shadow_direct_map_fault(va, regs) )
- return 1;
-
- handle_mmio(va, va);
- TRACE_VMEXIT (2,2);
- return 1;
- }
- gpa = gva_to_gpa(va);
-
- /* Use 1:1 page table to identify MMIO address space */
- if ( mmio_space(gpa) ){
- struct vcpu *v = current;
- /* No support for APIC */
- if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) {
- u32 inst_len;
- __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
- __update_guest_eip(inst_len);
- return 1;
- }
- TRACE_VMEXIT (2,2);
- /* in the case of MMIO, we are more interested in gpa than in va */
- TRACE_VMEXIT (4,gpa);
- handle_mmio(va, gpa);
- return 1;
- }
+ result = shadow2_fault(va, regs);
- result = shadow_fault(va, regs);
TRACE_VMEXIT (2,result);
#if 0
if ( !result )
@@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
clear_bit(X86_FEATURE_APIC, &edx);
}
-#if CONFIG_PAGING_LEVELS < 3
- edx &= ~(bitmaskof(X86_FEATURE_PAE) |
- bitmaskof(X86_FEATURE_PSE) |
- bitmaskof(X86_FEATURE_PSE36));
-#else
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
- clear_bit(X86_FEATURE_PSE36, &edx);
- else
- {
- clear_bit(X86_FEATURE_PAE, &edx);
- clear_bit(X86_FEATURE_PSE, &edx);
- clear_bit(X86_FEATURE_PSE36, &edx);
- }
- }
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
#endif
+ clear_bit(X86_FEATURE_PAE, &edx);
+ clear_bit(X86_FEATURE_PSE36, &edx);
ebx &= NUM_THREADS_RESET_MASK;
@@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigned long va)
* We do the safest things first, then try to update the shadow
* copying from guest
*/
- shadow_invlpg(v, va);
+ shadow2_invlpg(v, va);
}
@@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
- if (!vmx_paging_enabled(v)) {
- HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+ if (!vmx_paging_enabled(v))
goto skip_cr3;
- }
if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
/*
@@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
domain_crash_synchronous();
return 0;
}
- shadow_sync_all(v->domain);
} else {
/*
* If different, make a shadow. Check if the PDBR is valid
@@ -1348,13 +1297,17 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
* arch.shadow_table should now hold the next CR3 for shadow
*/
v->arch.hvm_vmx.cpu_cr3 = c->cr3;
- update_pagetables(v);
- HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
}
skip_cr3:
+ shadow2_update_paging_modes(v);
+ if (!vmx_paging_enabled(v))
+ HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+ else
+ HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+
error |= __vmread(CR4_READ_SHADOW, &old_cr4);
error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
@@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long value)
int paging_enabled;
unsigned long vm_entry_value;
unsigned long old_cr0;
+ unsigned long old_base_mfn;
/*
* CR0: We don't want to lose PE and PG.
@@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long value)
v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
!get_page(mfn_to_page(mfn), v->domain) )
{
- printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
+ printk("Invalid CR3 value = %lx (mfn=%lx)\n",
+ v->arch.hvm_vmx.cpu_cr3, mfn);
domain_crash_synchronous(); /* need to take a clean path */
}
@@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long value)
__vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
__vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
}
- else
-#endif /* __x86_64__ */
- {
-#if CONFIG_PAGING_LEVELS >= 3
- /* seems it's a 32-bit or 32-bit PAE guest */
-
- if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
- &v->arch.hvm_vmx.cpu_state) )
- {
- /* The guest enables PAE first and then it enables PG, it is
- * really a PAE guest */
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous();
- }
- }
- else
- {
- if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
- }
#endif
- }
/*
* Now arch.guest_table points to machine physical.
*/
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if (old_base_mfn)
+ put_page(mfn_to_page(old_base_mfn));
+ shadow2_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
/*
* arch->shadow_table should hold the next CR3 for shadow
*/
@@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long value)
}
}
- clear_all_shadow_status(v->domain);
if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
__vmread(GUEST_RIP, &eip);
@@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long value)
}
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
- /* we should take care of this kind of situation */
- clear_all_shadow_status(v->domain);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+ shadow2_update_paging_modes(v);
}
return 1;
@@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow_sync_all(v->domain);
+ shadow2_update_cr3(v);
} else {
/*
* If different, make a shadow. Check if the PDBR is valid
@@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
/*
* arch.shadow_table should now hold the next CR3 for shadow
*/
-#if CONFIG_PAGING_LEVELS >= 3
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
- shadow_sync_all(v->domain);
-#endif
-
v->arch.hvm_vmx.cpu_cr3 = value;
- update_pagetables(v);
+ update_cr3(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
value);
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
}
break;
}
@@ -1786,12 +1705,6 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
#if CONFIG_PAGING_LEVELS >= 3
unsigned long mfn, old_base_mfn;
- if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
-
if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
!get_page(mfn_to_page(mfn), v->domain) )
@@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
domain_crash_synchronous(); /* need to take a clean path */
}
- old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
- if ( old_base_mfn )
- put_page(mfn_to_page(old_base_mfn));
/*
* Now arch.guest_table points to machine physical.
*/
+ old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v);
+ if ( old_base_mfn )
+ put_page(mfn_to_page(old_base_mfn));
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
- __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+ __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
/*
* arch->shadow_table should hold the next CR3 for shadow
@@ -1824,27 +1736,6 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
v->arch.hvm_vmx.cpu_cr3, mfn);
#endif
}
- else
- {
- /* The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
- if ( (v->domain->arch.ops != NULL) &&
- v->domain->arch.ops->guest_paging_levels == PAGING_L2)
- {
- /* Seems the guest first enables PAE without enabling PG,
- * it must enable PG after that, and it is a 32-bit PAE
- * guest */
-
- if ( !shadow_set_guest_paging_levels(v->domain,
- PAGING_L3) )
- {
- printk("Unsupported guest paging levels\n");
- /* need to take a clean path */
- domain_crash_synchronous();
- }
- }
-#endif
- }
}
else if ( value & X86_CR4_PAE )
set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
@@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
* all TLB entries except global entries.
*/
if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- shadow_sync_all(v->domain);
-
+ shadow2_update_paging_modes(v);
break;
}
default:
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 0c35c9b52d..6c0abad2e2 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -137,7 +137,7 @@ static void free_l1_table(struct page_info *page);
static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
unsigned long type);
-static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
+static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
/* Used to defer flushing of memory structures. */
struct percpu_mm_info {
@@ -274,9 +274,9 @@ void share_xen_page_with_privileged_guests(
#else
/*
* In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
- * We cannot safely shadow the idle page table, nor shadow-mode page tables
+ * We cannot safely shadow the idle page table, nor shadow (v1) page tables
* (detected by lack of an owning domain). As required for correctness, we
- * always shadow PDPTs aboive 4GB.
+ * always shadow PDPTs above 4GB.
*/
#define l3tab_needs_shadow(mfn) \
(((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
@@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_address(void)
}
__initcall(cache_pae_fixmap_address);
-static void __write_ptbase(unsigned long mfn)
+static DEFINE_PER_CPU(u32, make_cr3_timestamp);
+
+void make_cr3(struct vcpu *v, unsigned long mfn)
+/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
+ * necessary, and sets v->arch.cr3 to the value to load in CR3. */
{
l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
- struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
+ struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
unsigned int cpu = smp_processor_id();
- /* Fast path 1: does this mfn need a shadow at all? */
+ /* Fast path: does this mfn need a shadow at all? */
if ( !l3tab_needs_shadow(mfn) )
{
- write_cr3(mfn << PAGE_SHIFT);
- /* Cache is no longer in use or valid (/after/ write to %cr3). */
+ v->arch.cr3 = mfn << PAGE_SHIFT;
+ /* Cache is no longer in use or valid */
cache->high_mfn = 0;
return;
}
@@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long mfn)
/* Caching logic is not interrupt safe. */
ASSERT(!in_irq());
- /* Fast path 2: is this mfn already cached? */
- if ( cache->high_mfn == mfn )
- {
- write_cr3(__pa(cache->table[cache->inuse_idx]));
- return;
- }
-
/* Protects against pae_flush_pgd(). */
spin_lock(&cache->lock);
@@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long mfn)
/* Map the guest L3 table and copy to the chosen low-memory cache. */
*(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
+ /* First check the previous high mapping can't be in the TLB.
+ * (i.e. have we loaded CR3 since we last did this?) */
+ if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
+ local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
lowmem_l3tab = cache->table[cache->inuse_idx];
memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
*(fix_pae_highmem_pl1e - cpu) = l1e_empty();
+ this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
- /* Install the low-memory L3 table in CR3. */
- write_cr3(__pa(lowmem_l3tab));
+ v->arch.cr3 = __pa(lowmem_l3tab);
spin_unlock(&cache->lock);
}
#else /* !CONFIG_X86_PAE */
-static void __write_ptbase(unsigned long mfn)
+void make_cr3(struct vcpu *v, unsigned long mfn)
{
- write_cr3(mfn << PAGE_SHIFT);
+ v->arch.cr3 = mfn << PAGE_SHIFT;
}
#endif /* !CONFIG_X86_PAE */
void write_ptbase(struct vcpu *v)
{
- __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
+ write_cr3(v->arch.cr3);
}
void invalidate_shadow_ldt(struct vcpu *v)
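
With this change, make_cr3() only computes and records the value that should go into CR3 (substituting a low-memory PAE L3 copy when needed), and write_ptbase() simply loads whatever was recorded. A stripped-down sketch of that split, with a stubbed write_cr3():

#include <stdio.h>

struct vcpu_model {
    unsigned long cr3;
};

#define PAGE_SHIFT 12

static void write_cr3(unsigned long val)
{
    printf("CR3 <= %#lx\n", val);     /* stands in for the mov to %cr3 */
}

static void make_cr3(struct vcpu_model *v, unsigned long mfn)
{
    /* On PAE builds the real code may substitute a below-4GB copy of the
     * L3 table here; this sketch just records the obvious value. */
    v->cr3 = mfn << PAGE_SHIFT;
}

static void write_ptbase(struct vcpu_model *v)
{
    write_cr3(v->cr3);                /* load the previously recorded value */
}

int main(void)
{
    struct vcpu_model v;
    make_cr3(&v, 0x1234);
    write_ptbase(&v);
    return 0;
}
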
@@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off)
BUG_ON(unlikely(in_irq()));
- shadow_sync_va(v, gva);
-
TOGGLE_MODE();
__copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
sizeof(l1e));
@@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off)
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- if ( !res && unlikely(shadow_mode_refcounts(d)) )
+ if ( !res && unlikely(shadow2_mode_refcounts(d)) )
{
- shadow_lock(d);
- shadow_remove_all_write_access(d, gmfn, mfn);
+ shadow2_lock(d);
+ shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- shadow_unlock(d);
+ shadow2_unlock(d);
}
if ( unlikely(!res) )
@@ -513,7 +512,7 @@ get_linear_pagetable(
struct page_info *page;
unsigned long pfn;
- ASSERT( !shadow_mode_refcounts(d) );
+ ASSERT( !shadow2_mode_refcounts(d) );
if ( (root_get_flags(re) & _PAGE_RW) )
{
@@ -576,7 +575,8 @@ get_page_from_l1e(
if ( !iomem_access_permitted(d, mfn, mfn) )
{
- MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
+ MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx",
+ d->domain_id, mfn);
return 0;
}
@@ -587,9 +587,14 @@ get_page_from_l1e(
d = dom_io;
}
- okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
- get_page_and_type(page, d, PGT_writable_page) :
- get_page(page, d));
+ /* Foreign mappings into guests in shadow2 external mode don't
+ * contribute to writeable mapping refcounts. (This allows the
+ * qemu-dm helper process in dom0 to map the domain's memory without
+ * messing up the count of "real" writable mappings.) */
+ okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
+ !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
+ ? get_page_and_type(page, d, PGT_writable_page)
+ : get_page(page, d));
if ( !okay )
{
MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -610,8 +615,6 @@ get_page_from_l2e(
{
int rc;
- ASSERT(!shadow_mode_refcounts(d));
-
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
return 1;
@@ -641,8 +644,6 @@ get_page_from_l3e(
{
int rc;
- ASSERT(!shadow_mode_refcounts(d));
-
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
return 1;
@@ -669,8 +670,6 @@ get_page_from_l4e(
{
int rc;
- ASSERT( !shadow_mode_refcounts(d) );
-
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
return 1;
@@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
domain_crash(d);
}
- if ( l1e_get_flags(l1e) & _PAGE_RW )
+ /* Remember we didn't take a type-count of foreign writable mappings
+ * to shadow2 external domains */
+ if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
+ !(unlikely((e != d) && shadow2_mode_external(e))) )
{
put_page_and_type(page);
}
@@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_info *page)
l1_pgentry_t *pl1e;
int i;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
pl1e = map_domain_page(pfn);
@@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
* 2. Cannot appear in another page table's L3:
* a. alloc_l3_table() calls this function and this check will fail
* b. mod_l3_entry() disallows updates to slot 3 in an existing table
+ *
+ * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
*/
page = l3e_get_page(l3e3);
BUG_ON(page->u.inuse.type_info & PGT_pinned);
@@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
l2_pgentry_t *pl2e;
int i;
- /* See the code in shadow_promote() to understand why this is here. */
- if ( (PGT_base_page_table == PGT_l2_page_table) &&
- unlikely(shadow_mode_refcounts(d)) )
- return 1;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
pl2e = map_domain_page(pfn);
@@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_info *page, unsigned long type)
l3_pgentry_t *pl3e;
int i;
- /* See the code in shadow_promote() to understand why this is here. */
- if ( (PGT_base_page_table == PGT_l3_page_table) &&
- shadow_mode_refcounts(d) )
- return 1;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
#ifdef CONFIG_X86_PAE
/*
@@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_info *page, unsigned long type)
unsigned long vaddr;
int i;
- /* See the code in shadow_promote() to understand why this is here. */
- if ( (PGT_base_page_table == PGT_l4_page_table) &&
- shadow_mode_refcounts(d) )
- return 1;
- ASSERT(!shadow_mode_refcounts(d));
+ ASSERT(!shadow2_mode_refcounts(d));
for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
{
@@ -1183,42 +1175,55 @@ static void free_l4_table(struct page_info *page)
static inline int update_l1e(l1_pgentry_t *pl1e,
l1_pgentry_t ol1e,
- l1_pgentry_t nl1e)
+ l1_pgentry_t nl1e,
+ unsigned long gl1mfn,
+ struct vcpu *v)
{
+ int rv = 1;
+ if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ shadow2_lock(v->domain);
#ifndef PTE_UPDATE_WITH_CMPXCHG
- return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
+ rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
#else
- intpte_t o = l1e_get_intpte(ol1e);
- intpte_t n = l1e_get_intpte(nl1e);
-
- for ( ; ; )
{
- if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
+ intpte_t o = l1e_get_intpte(ol1e);
+ intpte_t n = l1e_get_intpte(nl1e);
+
+ for ( ; ; )
{
- MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
- ": saw %" PRIpte,
- l1e_get_intpte(ol1e),
- l1e_get_intpte(nl1e),
- o);
- return 0;
- }
+ if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
+ {
+ MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
+ ": saw %" PRIpte,
+ l1e_get_intpte(ol1e),
+ l1e_get_intpte(nl1e),
+ o);
+ rv = 0;
+ break;
+ }
- if ( o == l1e_get_intpte(ol1e) )
- break;
+ if ( o == l1e_get_intpte(ol1e) )
+ break;
- /* Allowed to change in Accessed/Dirty flags only. */
- BUG_ON((o ^ l1e_get_intpte(ol1e)) &
- ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
- ol1e = l1e_from_intpte(o);
+ /* Allowed to change in Accessed/Dirty flags only. */
+ BUG_ON((o ^ l1e_get_intpte(ol1e)) &
+ ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
+ ol1e = l1e_from_intpte(o);
+ }
}
-
- return 1;
#endif
+ if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ {
+ shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
+ shadow2_unlock(v->domain);
+ }
+ return rv;
}
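
update_l1e() now brackets the PTE write with the shadow lock and a validation call when shadowing is enabled. A compilable toy model of that shape, with a pthread mutex standing in for shadow2_lock() and a stub for shadow2_validate_guest_entry():

#include <pthread.h>
#include <stdio.h>

typedef unsigned long pte_t;

static pthread_mutex_t shadow_lock = PTHREAD_MUTEX_INITIALIZER;
static int shadow_enabled = 1;

static void shadow_validate_entry(pte_t *p)
{
    /* Stand-in for shadow2_validate_guest_entry(): react to the change. */
    printf("validate PTE at %p -> %#lx\n", (void *)p, *p);
}

static int update_l1e(pte_t *pl1e, pte_t nl1e)
{
    int rv = 1;

    if ( shadow_enabled )
        pthread_mutex_lock(&shadow_lock);

    *pl1e = nl1e;                       /* the guest-visible PTE write */

    if ( shadow_enabled )
    {
        shadow_validate_entry(pl1e);    /* let the shadow code catch up */
        pthread_mutex_unlock(&shadow_lock);
    }
    return rv;
}

int main(void)
{
    pte_t pte = 0;
    update_l1e(&pte, 0x1000 | 0x3);
    return 0;
}
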
/* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
+ unsigned long gl1mfn)
{
l1_pgentry_t ol1e;
struct domain *d = current->domain;
@@ -1226,9 +1231,6 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
return 0;
- if ( unlikely(shadow_mode_refcounts(d)) )
- return update_l1e(pl1e, ol1e, nl1e);
-
if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
{
if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
@@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
}
/* Fast path for identical mapping, r/w and presence. */
- if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
- return update_l1e(pl1e, ol1e, nl1e);
+ if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
+ return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
return 0;
- if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
{
put_page_from_l1e(nl1e, d);
return 0;
@@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
}
else
{
- if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
return 0;
}
@@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
}
#ifndef PTE_UPDATE_WITH_CMPXCHG
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
#else
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ \
for ( ; ; ) \
{ \
intpte_t __o = cmpxchg((intpte_t *)(_p), \
@@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
} \
1; })
#endif
+#define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \
+ int rv; \
+ if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
+ shadow2_lock(current->domain); \
+ rv = _UPDATE_ENTRY(_t, _p, _o, _n); \
+ if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
+ { \
+ shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \
+ shadow2_unlock(current->domain); \
+ } \
+ rv; \
+})
/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
static int mod_l2_entry(l2_pgentry_t *pl2e,
@@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
/* Fast path for identical mapping and presence. */
if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
- return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
+ return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
return 0;
- if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+ if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
{
put_page_from_l2e(nl2e, pfn);
return 0;
}
}
- else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+ else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
{
return 0;
}
@@ -1330,7 +1344,6 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
return 1;
}
-
#if CONFIG_PAGING_LEVELS >= 3
/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
@@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
*/
if ( pgentry_ptr_to_slot(pl3e) >= 3 )
return 0;
-#endif
+#endif
if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
return 0;
@@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
/* Fast path for identical mapping and presence. */
if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
- return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
+ return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
#if CONFIG_PAGING_LEVELS >= 4
if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
@@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
<< L3_PAGETABLE_SHIFT;
if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
return 0;
-#endif
+#endif
- if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+ if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
{
put_page_from_l3e(nl3e, pfn);
return 0;
}
}
- else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+ else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
{
return 0;
}
@@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
/* Fast path for identical mapping and presence. */
if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
- return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
+ return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
return 0;
- if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+ if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
{
put_page_from_l4e(nl4e, pfn);
return 0;
}
}
- else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+ else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
{
return 0;
}
@@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *page, unsigned long type)
*/
this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
- if ( unlikely(shadow_mode_enabled(owner)) )
+ if ( unlikely(shadow2_mode_enabled(owner)
+ && !shadow2_lock_is_acquired(owner)) )
{
/* Raw page tables are rewritten during save/restore. */
- if ( !shadow_mode_translate(owner) )
+ if ( !shadow2_mode_translate(owner) )
mark_dirty(owner, page_to_mfn(page));
- if ( shadow_mode_refcounts(owner) )
+ if ( shadow2_mode_refcounts(owner) )
return;
gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
ASSERT(VALID_M2P(gmfn));
- remove_shadow(owner, gmfn, type & PGT_type_mask);
+ shadow2_lock(owner);
+ shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+ shadow2_unlock(owner);
}
}
@@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *page)
if ( unlikely((nx & PGT_count_mask) == 0) )
{
- /* Record TLB information for flush later. Races are harmless. */
- page->tlbflush_timestamp = tlbflush_current_time();
-
if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
likely(nx & PGT_validated) )
{
@@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *page)
x &= ~PGT_validated;
nx &= ~PGT_validated;
}
+
+ /* Record TLB information for flush later. */
+ page->tlbflush_timestamp = tlbflush_current_time();
}
else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) ==
(PGT_pinned|PGT_l1_page_table|1)) )
@@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page, unsigned long type)
#endif
/* Fixme: add code to propagate va_unknown to subtables. */
if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
- !shadow_mode_refcounts(page_get_owner(page)) )
+ !shadow2_mode_refcounts(page_get_owner(page)) )
return 0;
/* This table is possibly mapped at multiple locations. */
nx &= ~PGT_va_mask;
@@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn)
int okay;
unsigned long old_base_mfn;
- if ( shadow_mode_refcounts(d) )
+ if ( hvm_guest(v) && !hvm_paging_enabled(v) )
+ domain_crash_synchronous();
+
+ if ( shadow2_mode_refcounts(d) )
{
okay = get_page_from_pagenr(mfn, d);
if ( unlikely(!okay) )
@@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn)
MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
v->arch.guest_table = pagetable_null();
- update_pagetables(v);
+ update_cr3(v);
write_cr3(__pa(idle_pg_table));
if ( old_base_mfn != 0 )
put_page_and_type(mfn_to_page(old_base_mfn));
@@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn)
invalidate_shadow_ldt(v);
old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+
v->arch.guest_table = pagetable_from_pfn(mfn);
- update_pagetables(v); /* update shadow_table and monitor_table */
+ update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
write_ptbase(v);
if ( likely(old_base_mfn != 0) )
{
- if ( shadow_mode_refcounts(d) )
+ if ( shadow2_mode_refcounts(d) )
put_page(mfn_to_page(old_base_mfn));
else
put_page_and_type(mfn_to_page(old_base_mfn));
}
- /* CR3 also holds a ref to its shadow... */
- if ( shadow_mode_enabled(d) )
- {
- if ( v->arch.monitor_shadow_ref )
- put_shadow_ref(v->arch.monitor_shadow_ref);
- v->arch.monitor_shadow_ref =
- pagetable_get_pfn(v->arch.monitor_table);
- ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
- get_shadow_ref(v->arch.monitor_shadow_ref);
- }
-
return 1;
}
@@ -1807,8 +1816,6 @@ static void process_deferred_ops(void)
if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
{
- if ( shadow_mode_enabled(d) )
- shadow_sync_all(d);
if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
flush_tlb_mask(d->domain_dirty_cpumask);
else
@@ -1974,7 +1981,7 @@ int do_mmuext_op(
type = PGT_root_page_table;
pin_page:
- if ( shadow_mode_refcounts(FOREIGNDOM) )
+ if ( shadow2_mode_refcounts(FOREIGNDOM) )
break;
okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
@@ -1996,7 +2003,7 @@ int do_mmuext_op(
break;
case MMUEXT_UNPIN_TABLE:
- if ( shadow_mode_refcounts(d) )
+ if ( shadow2_mode_refcounts(d) )
break;
if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
@@ -2009,6 +2016,12 @@ int do_mmuext_op(
{
put_page_and_type(page);
put_page(page);
+ if ( shadow2_mode_enabled(d) )
+ {
+ shadow2_lock(d);
+ shadow2_remove_all_shadows(v, _mfn(mfn));
+ shadow2_unlock(d);
+ }
}
else
{
@@ -2050,9 +2063,9 @@ int do_mmuext_op(
break;
case MMUEXT_INVLPG_LOCAL:
- if ( shadow_mode_enabled(d) )
- shadow_invlpg(v, op.arg1.linear_addr);
- local_flush_tlb_one(op.arg1.linear_addr);
+ if ( !shadow2_mode_enabled(d)
+ || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
+ local_flush_tlb_one(op.arg1.linear_addr);
break;
case MMUEXT_TLB_FLUSH_MULTI:
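
MMUEXT_INVLPG_LOCAL now lets the shadow2 invlpg handler decide whether the hardware flush is still required (a nonzero return means flush). A small sketch of that dispatch, with an arbitrary stub standing in for shadow2_invlpg():

#include <stdbool.h>
#include <stdio.h>

static bool shadow_enabled = true;

static int shadow_invlpg(unsigned long va)
{
    /* Arbitrary toy policy: pretend only low addresses are shadowed and
     * need the hardware flush; the real handler inspects the shadows. */
    return va < 0x100000 ? 1 : 0;
}

static void local_flush_tlb_one(unsigned long va)
{
    printf("invlpg %#lx\n", va);
}

static void do_invlpg_local(unsigned long va)
{
    if ( !shadow_enabled || shadow_invlpg(va) != 0 )
        local_flush_tlb_one(va);
}

int main(void)
{
    do_invlpg_local(0x2000);      /* flushed */
    do_invlpg_local(0x40000000);  /* skipped in this toy model */
    return 0;
}
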
@@ -2098,7 +2111,7 @@ int do_mmuext_op(
unsigned long ptr = op.arg1.linear_addr;
unsigned long ents = op.arg2.nr_ents;
- if ( shadow_mode_external(d) )
+ if ( shadow2_mode_external(d) )
{
MEM_LOG("ignoring SET_LDT hypercall from external "
"domain %u", d->domain_id);
@@ -2171,9 +2184,6 @@ int do_mmu_update(
LOCK_BIGLOCK(d);
- if ( unlikely(shadow_mode_enabled(d)) )
- check_pagetable(v, "pre-mmu"); /* debug */
-
if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
{
count &= ~MMU_UPDATE_PREEMPTED;
@@ -2248,7 +2258,12 @@ int do_mmu_update(
case PGT_l3_page_table:
case PGT_l4_page_table:
{
- ASSERT(!shadow_mode_refcounts(d));
+ if ( shadow2_mode_refcounts(d) )
+ {
+ DPRINTK("mmu update on shadow-refcounted domain!");
+ break;
+ }
+
if ( unlikely(!get_page_type(
page, type_info & (PGT_type_mask|PGT_va_mask))) )
goto not_a_pt;
@@ -2258,10 +2273,7 @@ int do_mmu_update(
case PGT_l1_page_table:
{
l1_pgentry_t l1e = l1e_from_intpte(req.val);
- okay = mod_l1_entry(va, l1e);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l1_normal_pt_update(
- d, req.ptr, l1e, &sh_mapcache);
+ okay = mod_l1_entry(va, l1e, mfn);
}
break;
case PGT_l2_page_table:
@@ -2269,9 +2281,6 @@ int do_mmu_update(
l2_pgentry_t l2e = l2e_from_intpte(req.val);
okay = mod_l2_entry(
(l2_pgentry_t *)va, l2e, mfn, type_info);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l2_normal_pt_update(
- d, req.ptr, l2e, &sh_mapcache);
}
break;
#if CONFIG_PAGING_LEVELS >= 3
@@ -2279,9 +2288,6 @@ int do_mmu_update(
{
l3_pgentry_t l3e = l3e_from_intpte(req.val);
okay = mod_l3_entry(va, l3e, mfn, type_info);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l3_normal_pt_update(
- d, req.ptr, l3e, &sh_mapcache);
}
break;
#endif
@@ -2290,9 +2296,6 @@ int do_mmu_update(
{
l4_pgentry_t l4e = l4e_from_intpte(req.val);
okay = mod_l4_entry(va, l4e, mfn, type_info);
- if ( okay && unlikely(shadow_mode_enabled(d)) )
- shadow_l4_normal_pt_update(
- d, req.ptr, l4e, &sh_mapcache);
}
break;
#endif
@@ -2308,19 +2311,17 @@ int do_mmu_update(
if ( unlikely(!get_page_type(page, PGT_writable_page)) )
break;
- if ( shadow_mode_enabled(d) )
- {
- shadow_lock(d);
- __mark_dirty(d, mfn);
- if ( page_is_page_table(page) && !page_out_of_sync(page) )
- shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
- }
+ if ( unlikely(shadow2_mode_enabled(d)) )
+ shadow2_lock(d);
*(intpte_t *)va = req.val;
okay = 1;
- if ( shadow_mode_enabled(d) )
- shadow_unlock(d);
+ if ( unlikely(shadow2_mode_enabled(d)) )
+ {
+ shadow2_validate_guest_entry(v, _mfn(mfn), va);
+ shadow2_unlock(d);
+ }
put_page_type(page);
}
@@ -2334,12 +2335,6 @@ int do_mmu_update(
case MMU_MACHPHYS_UPDATE:
- if ( shadow_mode_translate(FOREIGNDOM) )
- {
- MEM_LOG("can't mutate m2p table of translate mode guest");
- break;
- }
-
mfn = req.ptr >> PAGE_SHIFT;
gpfn = req.val;
@@ -2349,9 +2344,13 @@ int do_mmu_update(
break;
}
- set_gpfn_from_mfn(mfn, gpfn);
+ if ( shadow2_mode_translate(FOREIGNDOM) )
+ shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
+ else
+ set_gpfn_from_mfn(mfn, gpfn);
okay = 1;
+ // Mark the new gfn dirty...
mark_dirty(FOREIGNDOM, mfn);
put_page(mfn_to_page(mfn));
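
MMU_MACHPHYS_UPDATE now routes translated (shadow2) domains through a physmap update instead of poking the m2p table directly. A toy model of that branch; the real shadow2_guest_physmap_add_page() also maintains the p2m, which this sketch omits:

#include <stdbool.h>
#include <stdio.h>

#define MAX_MFN 16
static unsigned long m2p[MAX_MFN];

static void guest_physmap_add_page(unsigned long gpfn, unsigned long mfn)
{
    /* The real helper also updates the p2m; this sketch only shows m2p. */
    m2p[mfn] = gpfn;
    printf("physmap add: gpfn %lu <-> mfn %lu\n", gpfn, mfn);
}

static void machphys_update(bool translated, unsigned long gpfn,
                            unsigned long mfn)
{
    if ( translated )
        guest_physmap_add_page(gpfn, mfn);
    else
        m2p[mfn] = gpfn;              /* set_gpfn_from_mfn() equivalent */
}

int main(void)
{
    machphys_update(true, 5, 7);
    machphys_update(false, 9, 3);
    printf("m2p[3] = %lu, m2p[7] = %lu\n", m2p[3], m2p[7]);
    return 0;
}
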
@@ -2382,9 +2381,6 @@ int do_mmu_update(
if ( unlikely(!guest_handle_is_null(pdone)) )
copy_to_guest(pdone, &done, 1);
- if ( unlikely(shadow_mode_enabled(d)) )
- check_pagetable(v, "post-mmu"); /* debug */
-
UNLOCK_BIGLOCK(d);
return rc;
}
@@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping(
struct domain *d = v->domain;
ASSERT(spin_is_locked(&d->big_lock));
- ASSERT(!shadow_mode_refcounts(d));
gmfn = pte_addr >> PAGE_SHIFT;
mfn = gmfn_to_mfn(d, gmfn);
@@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping(
page = mfn_to_page(mfn);
type_info = page->u.inuse.type_info;
- if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
+ if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
!get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
{
MEM_LOG("Grant map attempted to update a non-L1 page");
@@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping(
}
ol1e = *(l1_pgentry_t *)va;
- if ( !update_l1e(va, ol1e, _nl1e) )
+ if ( !update_l1e(va, ol1e, _nl1e, mfn, v) )
{
put_page_type(page);
rc = GNTST_general_error;
goto failed;
}
- put_page_from_l1e(ol1e, d);
-
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- struct domain_mmap_cache sh_mapcache;
- domain_mmap_cache_init(&sh_mapcache);
- shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
- domain_mmap_cache_destroy(&sh_mapcache);
- }
+ if ( !shadow2_mode_refcounts(d) )
+ put_page_from_l1e(ol1e, d);
put_page_type(page);
failed:
unmap_domain_page(va);
put_page(page);
+
return rc;
}
@@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping(
u32 type_info;
l1_pgentry_t ol1e;
- ASSERT(!shadow_mode_refcounts(d));
-
gmfn = addr >> PAGE_SHIFT;
mfn = gmfn_to_mfn(d, gmfn);
@@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping(
}
/* Delete pagetable entry. */
- if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
+ if ( unlikely(!update_l1e(
+ (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn,
+ d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
{
MEM_LOG("Cannot delete PTE entry at %p", va);
put_page_type(page);
@@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping(
goto failed;
}
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- struct domain_mmap_cache sh_mapcache;
- domain_mmap_cache_init(&sh_mapcache);
- shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
- domain_mmap_cache_destroy(&sh_mapcache);
- }
-
put_page_type(page);
failed:
@@ -2536,31 +2517,22 @@ static int create_grant_va_mapping(
struct domain *d = v->domain;
ASSERT(spin_is_locked(&d->big_lock));
- ASSERT(!shadow_mode_refcounts(d));
-
- /*
- * This is actually overkill - we don't need to sync the L1 itself,
- * just everything involved in getting to this L1 (i.e. we need
- * linear_pg_table[l1_linear_offset(va)] to be in sync)...
- */
- __shadow_sync_va(v, va);
pl1e = &linear_pg_table[l1_linear_offset(va)];
if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
- !update_l1e(pl1e, ol1e, _nl1e) )
+ !update_l1e(pl1e, ol1e, _nl1e,
+ l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
return GNTST_general_error;
- put_page_from_l1e(ol1e, d);
-
- if ( unlikely(shadow_mode_enabled(d)) )
- shadow_do_update_va_mapping(va, _nl1e, v);
+ if ( !shadow2_mode_refcounts(d) )
+ put_page_from_l1e(ol1e, d);
return GNTST_okay;
}
static int destroy_grant_va_mapping(
- unsigned long addr, unsigned long frame)
+ unsigned long addr, unsigned long frame, struct domain *d)
{
l1_pgentry_t *pl1e, ol1e;
@@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping(
}
/* Delete pagetable entry. */
- if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
+ if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(),
+ l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
+ d->vcpu[0] /* Change for per-vcpu shadows */)) )
{
MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
return GNTST_general_error;
}
-
+
return 0;
}
@@ -2597,7 +2571,7 @@ int create_grant_host_mapping(
unsigned long addr, unsigned long frame, unsigned int flags)
{
l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
-
+
if ( (flags & GNTMAP_application_map) )
l1e_add_flags(pte,_PAGE_USER);
if ( !(flags & GNTMAP_readonly) )
@@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping(
{
if ( flags & GNTMAP_contains_pte )
return destroy_grant_pte_mapping(addr, frame, current->domain);
- return destroy_grant_va_mapping(addr, frame);
+ return destroy_grant_va_mapping(addr, frame, current->domain);
}
int steal_page(
@@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long va, u64 val64,
perfc_incrc(calls_to_update_va);
- if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
+ if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
return -EINVAL;
- LOCK_BIGLOCK(d);
-
- if ( unlikely(shadow_mode_enabled(d)) )
- check_pagetable(v, "pre-va"); /* debug */
+ if ( unlikely(shadow2_mode_refcounts(d)) )
+ {
+ DPRINTK("Grant op on a shadow-refcounted domain\n");
+ return -EINVAL;
+ }
- if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
- val)) )
- rc = -EINVAL;
+ LOCK_BIGLOCK(d);
- if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
+ if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
{
if ( unlikely(this_cpu(percpu_mm_info).foreign &&
- (shadow_mode_translate(d) ||
- shadow_mode_translate(
+ (shadow2_mode_translate(d) ||
+ shadow2_mode_translate(
this_cpu(percpu_mm_info).foreign))) )
{
/*
* The foreign domain's pfn's are in a different namespace. There's
- * not enough information in just a gpte to figure out how to
+ * not enough information in just a gpte to figure out how to
* (re-)shadow this entry.
*/
domain_crash(d);
}
-
- rc = shadow_do_update_va_mapping(va, val, v);
-
- check_pagetable(v, "post-va"); /* debug */
}
+ if ( unlikely(!mod_l1_entry(
+ &linear_pg_table[l1_linear_offset(va)], val,
+ l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
+ rc = -EINVAL;
+
switch ( flags & UVMF_FLUSHTYPE_MASK )
{
case UVMF_TLB_FLUSH:
switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
{
case UVMF_LOCAL:
- if ( unlikely(shadow_mode_enabled(d)) )
- shadow_sync_all(d);
local_flush_tlb();
break;
case UVMF_ALL:
@@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long va, u64 val64,
switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
{
case UVMF_LOCAL:
- if ( unlikely(shadow_mode_enabled(d)) )
- shadow_invlpg(current, va);
- local_flush_tlb_one(va);
+ if ( !shadow2_mode_enabled(d)
+ || (shadow2_invlpg(current, va) != 0) )
+ local_flush_tlb_one(va);
break;
case UVMF_ALL:
flush_tlb_one_mask(d->domain_dirty_cpumask, va);
@@ -2808,8 +2780,6 @@ long set_gdt(struct vcpu *v,
if ( entries > FIRST_RESERVED_GDT_ENTRY )
return -EINVAL;
- shadow_sync_all(d);
-
/* Check the pages in the new GDT. */
for ( i = 0; i < nr_pages; i++ ) {
mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
@@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 desc)
break;
}
- if ( shadow_mode_enabled(dom) )
- {
- shadow_lock(dom);
-
- __mark_dirty(dom, mfn);
-
- if ( page_is_page_table(page) && !page_out_of_sync(page) )
- shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
- }
+ mark_dirty(dom, mfn);
/* All is good so make the update. */
gdt_pent = map_domain_page(mfn);
memcpy(&gdt_pent[offset], &d, 8);
unmap_domain_page(gdt_pent);
- if ( shadow_mode_enabled(dom) )
- shadow_unlock(dom);
-
put_page_type(page);
ret = 0; /* success */
@@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
default:
break;
}
-
- if ( !shadow_mode_translate(d) || (mfn == 0) )
+
+ if ( !shadow2_mode_translate(d) || (mfn == 0) )
{
put_domain(d);
return -EINVAL;
@@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
guest_physmap_add_page(d, xatp.gpfn, mfn);
UNLOCK_BIGLOCK(d);
-
+
put_domain(d);
break;
@@ -3136,7 +3095,8 @@ static int ptwr_emulated_update(
unsigned long pfn;
struct page_info *page;
l1_pgentry_t pte, ol1e, nl1e, *pl1e;
- struct domain *d = current->domain;
+ struct vcpu *v = current;
+ struct domain *d = v->domain;
/* Aligned access only, thank you. */
if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
@@ -3196,25 +3156,36 @@ static int ptwr_emulated_update(
return X86EMUL_UNHANDLEABLE;
}
+
/* Checked successfully: do the update (write or cmpxchg). */
pl1e = map_domain_page(page_to_mfn(page));
pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
if ( do_cmpxchg )
{
+ if ( shadow2_mode_enabled(d) )
+ shadow2_lock(d);
ol1e = l1e_from_intpte(old);
if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
{
+ if ( shadow2_mode_enabled(d) )
+ shadow2_unlock(d);
unmap_domain_page(pl1e);
put_page_from_l1e(nl1e, d);
return X86EMUL_CMPXCHG_FAILED;
}
+ if ( unlikely(shadow2_mode_enabled(v->domain)) )
+ {
+ shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
+ shadow2_unlock(v->domain);
+ }
}
else
{
ol1e = *pl1e;
- if ( !update_l1e(pl1e, ol1e, nl1e) )
+ if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
BUG();
}
+
unmap_domain_page(pl1e);
/* Finally, drop the old PTE. */
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index ff0589082a..01782320b3 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t *mbi)
if ( opt_watchdog )
watchdog_enable();
- shadow_mode_init();
-
/* initialize access control security module */
acm_init(&initrdidx, mbi, initial_images_start);
diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c
deleted file mode 100644
index 88e2ec8417..0000000000
--- a/xen/arch/x86/shadow.c
+++ /dev/null
@@ -1,4150 +0,0 @@
-/******************************************************************************
- * arch/x86/shadow.c
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-/*
- * Jun Nakajima <jun.nakajima@intel.com>
- * Chengyuan Li <chengyuan.li@intel.com>
- *
- * Extended to support 32-bit PAE and 64-bit guests.
- */
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <xen/event.h>
-#include <xen/sched.h>
-#include <xen/trace.h>
-#include <asm/shadow_64.h>
-
-/* Use this to have the compiler remove unnecessary branches */
-#define SH_L1_HAS_NEXT_PAGE (GUEST_L1_PAGETABLE_ENTRIES - L1_PAGETABLE_ENTRIES)
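
SH_L1_HAS_NEXT_PAGE expands to a compile-time constant (zero whenever the guest and host L1s hold the same number of entries), so the many if ( SH_L1_HAS_NEXT_PAGE && ... ) tests in this file can be folded away by the compiler rather than checked at run time. A tiny self-contained illustration of that idiom follows; the entry counts used are example values, not Xen's configuration.

/* Compile-time-constant predicate used to let the compiler drop branches.
 * The entry counts below are example values, not Xen's real constants. */
#define DEMO_GUEST_L1_ENTRIES 512
#define DEMO_HOST_L1_ENTRIES  512
#define DEMO_L1_HAS_NEXT_PAGE (DEMO_GUEST_L1_ENTRIES - DEMO_HOST_L1_ENTRIES)

int demo_shadow_l1_pages(void)
{
    /* With equal entry counts this branch is dead code and is removed. */
    if ( DEMO_L1_HAS_NEXT_PAGE )
        return 2;   /* shadow L1 spans two contiguous pages */
    return 1;       /* shadow L1 fits in a single page */
}
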
-
-extern void free_shadow_pages(struct domain *d);
-
-#if 0 // this code has not been updated for 32pae & 64 bit modes
-#if SHADOW_DEBUG
-static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
-#endif
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3
-static unsigned long shadow_l3_table(
- struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-static unsigned long shadow_l4_table(
- struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void shadow_map_into_current(struct vcpu *v,
- unsigned long va, unsigned int from, unsigned int to);
-static inline void validate_bl2e_change( struct domain *d,
- guest_root_pgentry_t *new_gle_p, pgentry_64_t *shadow_l3, int index);
-static void update_top_level_shadow(struct vcpu *v, unsigned long smfn);
-#endif
-
-/********
-
-There's a per-domain shadow table spin lock which works fine for SMP
-hosts. We don't have to worry about interrupts as no shadow operations
-happen in an interrupt context. It's probably not quite ready for SMP
-guest operation as we have to worry about synchonisation between gpte
-and spte updates. Its possible that this might only happen in a
-hypercall context, in which case we'll probably at have a per-domain
-hypercall lock anyhow (at least initially).
-
-********/
-
-static inline int
-shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
- unsigned long new_type)
-{
- struct page_info *page = mfn_to_page(gmfn);
- int pinned = 0, okay = 1;
-
- if ( page_out_of_sync(page) )
- {
- // Don't know how long ago this snapshot was taken.
- // Can't trust it to be recent enough.
- //
- __shadow_sync_mfn(d, gmfn);
- }
-
- if ( !shadow_mode_refcounts(d) )
- return 1;
-
- if ( unlikely(page_is_page_table(page)) )
- return 1;
-
- FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
-
- if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
- {
- FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
- __func__, gpfn, gmfn);
-#if 1 || defined(LIVE_DANGEROUSLY)
- set_bit(_PGC_page_table, &page->count_info);
- return 1;
-#endif
- return 0;
- }
-
- // To convert this page to use as a page table, the writable count
- // should now be zero. Test this by grabbing the page as a page table,
- // and then immediately releasing. This will also deal with any
- // necessary TLB flushing issues for us.
- //
- // The cruft here about pinning doesn't really work right. This
- // needs rethinking/rewriting... Need to gracefully deal with the
- // TLB flushes required when promoting a writable page, and also deal
- // with any outstanding (external) writable refs to this page (by
- // refusing to promote it). The pinning headache complicates this
- // code -- it would all get much simpler if we stop using
- // shadow_lock() and move the shadow code to BIGLOCK().
- //
- if ( unlikely(!get_page(page, d)) )
- BUG(); // XXX -- needs more thought for a graceful failure
- if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
- {
- pinned = 1;
- put_page_and_type(page);
- }
- if ( get_page_type(page, PGT_base_page_table) )
- {
- set_bit(_PGC_page_table, &page->count_info);
- put_page_type(page);
- }
- else
- {
- printk("shadow_promote: get_page_type failed "
- "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
- d->domain_id, gpfn, gmfn, new_type);
- okay = 0;
- }
-
- // Now put the type back to writable...
- if ( unlikely(!get_page_type(page, PGT_writable_page)) )
- BUG(); // XXX -- needs more thought for a graceful failure
- if ( unlikely(pinned) )
- {
- if ( unlikely(test_and_set_bit(_PGT_pinned,
- &page->u.inuse.type_info)) )
- BUG(); // hmm... someone pinned this again?
- }
- else
- put_page_and_type(page);
-
- return okay;
-}
-
-
-/*
- * Things in shadow mode that collect get_page() refs to the domain's
- * pages are:
- * - PGC_allocated takes a gen count, just like normal.
- * - A writable page can be pinned (paravirtualized guests may consider
- * these pages to be L1s or L2s, and don't know the difference).
- * Pinning a page takes a gen count (but, for domains in shadow mode,
- * it *doesn't* take a type count)
- * - CR3 grabs a ref to whatever it points at, just like normal.
- * - Shadow mode grabs an initial gen count for itself, as a placeholder
- * for whatever references will exist.
- * - Shadow PTEs that point to a page take a gen count, just like regular
- * PTEs. However, they don't get a type count, as get_page_type() is
- * hardwired to keep writable pages' counts at 1 for domains in shadow
- * mode.
- * - Whenever we shadow a page, the entry in the shadow hash grabs a
- * general ref to the page.
- * - Whenever a page goes out of sync, the out of sync entry grabs a
- * general ref to the page.
- */
-/*
- * page_info fields for pages allocated as shadow pages:
- *
- * All 32 bits of count_info are a simple count of refs to this shadow
- * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
- * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
- * references.
- *
- * u.inuse._domain is left NULL, to prevent accidentally allowing some random
- * domain from gaining permissions to map this page.
- *
- * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
- * shadowed.
- * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
- * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
- * currently exists because this is a shadow of a root page, and we
- * don't want to let those disappear just because no CR3 is currently pointing
- * at it.
- *
- * tlbflush_timestamp holds a min & max index of valid page table entries
- * within the shadow page.
- */
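
The last point above, tlbflush_timestamp doubling as a packed min/max range of valid entries, is what the SHADOW_ENCODE_MIN_MAX / SHADOW_MIN / SHADOW_MAX macros used later in this file rely on; their definitions live in the removed shadow headers and are not reproduced here. The sketch below only illustrates the general packing idea, with a field layout chosen for the example rather than taken from Xen.

/* Illustration of packing a (min, max) pair of page-table indices into one
 * 32-bit word, in the spirit of the tlbflush_timestamp reuse described above.
 * The 16/16 split is an example layout, not Xen's SHADOW_* macro definition. */
#include <assert.h>
#include <stdint.h>

#define DEMO_ENCODE_MIN_MAX(min, max) (((uint32_t)(max) << 16) | (uint32_t)(min))
#define DEMO_MIN(mm)                  ((mm) & 0xffffu)
#define DEMO_MAX(mm)                  ((mm) >> 16)

int main(void)
{
    uint32_t mm = DEMO_ENCODE_MIN_MAX(3, 200);

    /* Only entries 3..200 of this shadow page need copying or resyncing. */
    assert(DEMO_MIN(mm) == 3 && DEMO_MAX(mm) == 200);
    return 0;
}
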
-static inline void
-shadow_page_info_init(struct page_info *page,
- unsigned long gmfn,
- u32 psh_type)
-{
- ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
- page->u.inuse.type_info = psh_type | gmfn;
- page->count_info = 0;
- page->tlbflush_timestamp = 0;
-}
-
-static inline unsigned long
-alloc_shadow_page(struct domain *d,
- unsigned long gpfn, unsigned long gmfn,
- u32 psh_type)
-{
- struct page_info *page;
- unsigned long smfn, real_gpfn;
- int pin = 0;
- void *l1, *lp;
- u64 index = 0;
-
- // Currently, we only keep pre-zero'ed pages around for use as L1's...
- // This will change. Soon.
- //
- if ( psh_type == PGT_l1_shadow )
- {
- if ( !list_empty(&d->arch.free_shadow_frames) )
- {
- struct list_head *entry = d->arch.free_shadow_frames.next;
- page = list_entry(entry, struct page_info, list);
- list_del(entry);
- perfc_decr(free_l1_pages);
- }
- else
- {
- if ( SH_L1_HAS_NEXT_PAGE &&
- d->arch.ops->guest_paging_levels == PAGING_L2)
- {
-#if CONFIG_PAGING_LEVELS >= 3
- /*
- * For a 32-bit HVM guest, 2 shadow L1s are required to
- * simulate 1 guest L1, so we need to allocate 2 shadow L1
- * pages each time.
- *
- * --> Need to avoid alloc_domheap_pages.
- */
- page = alloc_domheap_pages(NULL, SL1_ORDER, 0);
- if (!page)
- goto no_shadow_page;
-
- l1 = map_domain_page(page_to_mfn(page));
- memset(l1, 0, PAGE_SIZE);
- unmap_domain_page(l1);
-
- l1 = map_domain_page(page_to_mfn(page + 1));
- memset(l1, 0, PAGE_SIZE);
- unmap_domain_page(l1);
-
- /* we'd like to initialize the second contiguous page here
- * and leave the first page's initialization until later */
-
- shadow_page_info_init(page+1, gmfn, psh_type);
-#else
- page = alloc_domheap_page(NULL);
- if (!page)
- goto no_shadow_page;
-
- l1 = map_domain_page(page_to_mfn(page));
- memset(l1, 0, PAGE_SIZE);
- unmap_domain_page(l1);
-#endif
- }
- else
- {
- page = alloc_domheap_page(NULL);
- if (!page)
- goto no_shadow_page;
-
- l1 = map_domain_page(page_to_mfn(page));
- memset(l1, 0, PAGE_SIZE);
- unmap_domain_page(l1);
- }
- }
- }
- else {
-#if CONFIG_PAGING_LEVELS == 2
- page = alloc_domheap_page(NULL);
-#elif CONFIG_PAGING_LEVELS >= 3
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
- psh_type == PGT_l4_shadow ) /* allocated for PAE PDP page */
- page = alloc_domheap_pages(NULL, 0, MEMF_dma);
- else if ( d->arch.ops->guest_paging_levels == PAGING_L3 &&
- (psh_type == PGT_l3_shadow || psh_type == PGT_l4_shadow) )
- page = alloc_domheap_pages(NULL, 0, MEMF_dma); /* allocated for PAE PDP page */
- else
- page = alloc_domheap_page(NULL);
-#endif
- if (!page)
- goto no_shadow_page;
-
- lp = map_domain_page(page_to_mfn(page));
- memset(lp, 0, PAGE_SIZE);
- unmap_domain_page(lp);
- }
-
- smfn = page_to_mfn(page);
-
- shadow_page_info_init(page, gmfn, psh_type);
-
- switch ( psh_type )
- {
- case PGT_l1_shadow:
- if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
- goto fail;
- perfc_incr(shadow_l1_pages);
- d->arch.shadow_page_count++;
- break;
-
- case PGT_l2_shadow:
- if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
- goto fail;
- perfc_incr(shadow_l2_pages);
- d->arch.shadow_page_count++;
- if ( PGT_l2_page_table == PGT_root_page_table )
- pin = 1;
-
- break;
-
- case PGT_l3_shadow:
- if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
- goto fail;
- perfc_incr(shadow_l3_pages);
- d->arch.shadow_page_count++;
- if ( PGT_l3_page_table == PGT_root_page_table )
- pin = 1;
- break;
-
- case PGT_l4_shadow:
- real_gpfn = gpfn & PGT_mfn_mask;
- if ( !shadow_promote(d, real_gpfn, gmfn, psh_type) )
- goto fail;
- perfc_incr(shadow_l4_pages);
- d->arch.shadow_page_count++;
- if ( PGT_l4_page_table == PGT_root_page_table )
- pin = 1;
-#if CONFIG_PAGING_LEVELS == 3 && defined (GUEST_PGENTRY_32)
- /*
- * We use PGT_l4_shadow for 2-level paging guests on PAE
- */
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- pin = 1;
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3 && defined ( GUEST_32PAE )
- /*
- * We also use PGT_l4_shadow for 3-level PAE paging guests on PAE
- */
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- pin = 1;
-#endif
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- index = get_cr3_idxval(current);
- break;
-
-#if CONFIG_PAGING_LEVELS >= 3
- case PGT_fl1_shadow:
- perfc_incr(shadow_l1_pages);
- d->arch.shadow_page_count++;
- break;
-#else
-
- case PGT_hl2_shadow:
- // Treat an hl2 as an L1 for purposes of promotion.
- // For external mode domains, treat them as an L2 for purposes of
- // pinning.
- //
- if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
- goto fail;
- perfc_incr(hl2_table_pages);
- d->arch.hl2_page_count++;
- if ( shadow_mode_external(d) &&
- (PGT_l2_page_table == PGT_root_page_table) )
- pin = 1;
-
- break;
-#endif
- case PGT_snapshot:
- perfc_incr(snapshot_pages);
- d->arch.snapshot_page_count++;
- break;
-
- default:
- printk("Alloc shadow weird page type type=%08x\n", psh_type);
- BUG();
- break;
- }
-
- // Don't add a new shadow of something that already has a snapshot.
- //
- ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
-
- set_shadow_status(d, gpfn, gmfn, smfn, psh_type, index);
-
- if ( pin )
- shadow_pin(smfn);
-
- return smfn;
-
-fail:
- FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
- gpfn, gmfn);
- if (psh_type == PGT_l1_shadow)
- {
- if (d->arch.ops->guest_paging_levels == PAGING_L2)
- {
-#if CONFIG_PAGING_LEVELS >=3
- free_domheap_pages(page, SL1_ORDER);
-#else
- free_domheap_page(page);
-#endif
- }
- else
- free_domheap_page(page);
- }
- else
- free_domheap_page(page);
-
- return 0;
-
-no_shadow_page:
- ASSERT(page == NULL);
- printk("Couldn't alloc shadow page! dom%d count=%d\n",
- d->domain_id, d->arch.shadow_page_count);
- printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
- perfc_value(shadow_l1_pages),
- perfc_value(shadow_l2_pages),
- perfc_value(hl2_table_pages),
- perfc_value(snapshot_pages));
- /* XXX FIXME: try a shadow flush to free up some memory. */
- domain_crash_synchronous();
-
- return 0;
-}
-
-#if CONFIG_PAGING_LEVELS == 2
-static unsigned long
-shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
- unsigned long smfn)
-{
- unsigned long hl2mfn;
- l1_pgentry_t *hl2;
- int limit;
-
- ASSERT(PGT_base_page_table == PGT_l2_page_table);
-
- if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
- {
- printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
- gpfn, gmfn);
- BUG(); /* XXX Deal gracefully with failure. */
- }
-
- SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
- gpfn, gmfn, smfn, hl2mfn);
- perfc_incrc(shadow_hl2_table_count);
-
- hl2 = map_domain_page(hl2mfn);
-
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
- memset(hl2, 0, limit * sizeof(l1_pgentry_t));
-
- if ( !shadow_mode_external(d) )
- {
- memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
- HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
- // Setup easy access to the GL2, SL2, and HL2 frames.
- //
- hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
- hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
- hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
- l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
- }
-
- unmap_domain_page(hl2);
-
- return hl2mfn;
-}
-
-/*
- * This could take and use a snapshot, and validate the entire page at
- * once, or it could continue to fault in entries one at a time...
- * Might be worth investigating...
- */
-static unsigned long shadow_l2_table(
- struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned long smfn;
- l2_pgentry_t *spl2e;
- struct domain *d = v->domain;
- int i;
-
- SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
- perfc_incrc(shadow_l2_table_count);
-
- if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
- {
- printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
- gpfn, gmfn);
- BUG(); /* XXX Deal gracefully with failure. */
- }
-
- spl2e = (l2_pgentry_t *)map_domain_page(smfn);
-
- /* Install hypervisor and 2x linear p.t. mapings. */
- if ( (PGT_base_page_table == PGT_l2_page_table) &&
- !shadow_mode_external(d) )
- {
- /*
- * We could proactively fill in PDEs for pages that are already
- * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
- * (restriction required for coherence of the accessed bit). However,
- * we tried it and it didn't help performance. This is simpler.
- */
- memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
-
- /* Install hypervisor and 2x linear p.t. mapings. */
- memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
- spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
- for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
- spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
- arch.mm_perdomain_pt) + i,
- __PAGE_HYPERVISOR);
-
- if ( shadow_mode_translate(d) ) // NB: not external
- {
- unsigned long hl2mfn;
-
- spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
- l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
- __PAGE_HYPERVISOR);
-
- if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
- hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
-
- // shadow_mode_translate (but not external) sl2 tables hold a
- // ref to their hl2.
- //
- if ( !get_shadow_ref(hl2mfn) )
- BUG();
-
- spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
- }
- else
- spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
- }
- else
- {
- memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
- }
-
- unmap_domain_page(spl2e);
-
- SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
- return smfn;
-}
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-static void shadow_map_l1_into_current_l2(unsigned long va)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l1_pgentry_t *spl1e, *spl1e_next = 0;
- l2_pgentry_t sl2e;
- guest_l1_pgentry_t *gpl1e;
- guest_l2_pgentry_t gl2e = {0};
- unsigned long gl1pfn, gl1mfn, sl1mfn;
- int i, init_table = 0;
-
- __guest_get_l2e(v, va, &gl2e);
- ASSERT(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT);
- gl1pfn = l2e_get_pfn(gl2e);
-
- if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
- {
- /* This L1 is NOT already shadowed so we need to shadow it. */
- SH_VVLOG("4a: l1 not shadowed");
-
- gl1mfn = gmfn_to_mfn(d, gl1pfn);
- if ( unlikely(!VALID_MFN(gl1mfn)) )
- {
- // Attempt to use an invalid pfn as an L1 page.
- // XXX this needs to be more graceful!
- BUG();
- }
-
- if ( unlikely(!(sl1mfn =
- alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
- {
- printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
- gl1pfn, gl1mfn);
- BUG(); /* XXX Need to deal gracefully with failure. */
- }
-
- perfc_incrc(shadow_l1_table_count);
- init_table = 1;
- }
- else
- {
- /* This L1 is shadowed already, but the L2 entry is missing. */
- SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
- }
-
-#ifndef NDEBUG
- {
- l2_pgentry_t old_sl2e;
- __shadow_get_l2e(v, va, &old_sl2e);
- ASSERT(!(l2e_get_flags(old_sl2e) & _PAGE_PRESENT));
- }
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
- if ( SH_L1_HAS_NEXT_PAGE &&
- d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- /* for a 32-bit HVM guest on a 64-bit or PAE host,
- * we need to update two L2 entries each time
- */
- if ( !get_shadow_ref(sl1mfn))
- BUG();
- l2pde_general(d, &gl2e, &sl2e, sl1mfn);
- __guest_set_l2e(v, va, &gl2e);
- __shadow_set_l2e(v, va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1), &sl2e);
- if ( !get_shadow_ref(sl1mfn+1))
- BUG();
- sl2e = l2e_empty();
- l2pde_general(d, &gl2e, &sl2e, sl1mfn+1);
- __shadow_set_l2e(v,((va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1)) + (1 << L2_PAGETABLE_SHIFT)) , &sl2e);
- } else
-#endif
- {
- if ( !get_shadow_ref(sl1mfn) )
- BUG();
- l2pde_general(d, &gl2e, &sl2e, sl1mfn);
- __guest_set_l2e(v, va, &gl2e);
- __shadow_set_l2e(v, va , &sl2e);
- }
-
- if ( init_table )
- {
- l1_pgentry_t sl1e;
- int index = guest_l1_table_offset(va);
- int min = 1, max = 0;
-
- unsigned long tmp_gmfn;
- l2_pgentry_t tmp_sl2e = {0};
- guest_l2_pgentry_t tmp_gl2e = {0};
-
- __guest_get_l2e(v, va, &tmp_gl2e);
- tmp_gmfn = gmfn_to_mfn(d, l2e_get_pfn(tmp_gl2e));
- gpl1e = (guest_l1_pgentry_t *) map_domain_page(tmp_gmfn);
-
- /* If the PGT_l1_shadow has two contiguous pages */
-#if CONFIG_PAGING_LEVELS >= 3
- if ( SH_L1_HAS_NEXT_PAGE &&
- d->arch.ops->guest_paging_levels == PAGING_L2 )
- __shadow_get_l2e(v, va & ~((1UL << L2_PAGETABLE_SHIFT_32) - 1), &tmp_sl2e);
- else
-#endif
- __shadow_get_l2e(v, va, &tmp_sl2e);
-
- spl1e = (l1_pgentry_t *) map_domain_page(l2e_get_pfn(tmp_sl2e));
-
- if ( SH_L1_HAS_NEXT_PAGE )
- spl1e_next = (l1_pgentry_t *) map_domain_page(
- (l2e_get_pfn(tmp_sl2e) + 1UL));
-
- for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
- {
- l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
- if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
- unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
- sl1e = l1e_empty();
- if ( l1e_get_flags(sl1e) == 0 )
- {
- // First copy entries from 0 until first invalid.
- // Then copy entries from index until first invalid.
- //
- if ( i < index ) {
- i = index - 1;
- continue;
- }
- break;
- }
-
- if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
- spl1e_next[i - L1_PAGETABLE_ENTRIES] = sl1e;
- else
- spl1e[i] = sl1e;
-
- if ( unlikely(i < min) )
- min = i;
- if ( likely(i > max) )
- max = i;
- set_guest_back_ptr(d, sl1e, sl1mfn, i);
- }
-
- mfn_to_page(sl1mfn)->tlbflush_timestamp =
- SHADOW_ENCODE_MIN_MAX(min, max);
-
- unmap_domain_page(gpl1e);
- unmap_domain_page(spl1e);
-
- if ( SH_L1_HAS_NEXT_PAGE )
- unmap_domain_page(spl1e_next);
- }
-}
-
-#if CONFIG_PAGING_LEVELS == 2
-static void
-shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l2_pgentry_t sl2e = {0};
-
- __shadow_get_l2e(v, va, &sl2e);
- if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
- {
- /*
- * Either the L1 is not shadowed, or the shadow isn't linked into
- * the current shadow L2.
- */
- if ( create_l1_shadow )
- {
- perfc_incrc(shadow_set_l1e_force_map);
- shadow_map_l1_into_current_l2(va);
- }
- else /* check to see if it exists; if so, link it in */
- {
- l2_pgentry_t gpde = {0};
- unsigned long gl1pfn;
- unsigned long sl1mfn;
-
- __guest_get_l2e(v, va, &gpde);
-
- if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
- {
- gl1pfn = l2e_get_pfn(gpde);
- sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
- }
- else
- {
- // no shadow exists, so there's nothing to do.
- perfc_incrc(shadow_set_l1e_fail);
- return;
- }
-
- if ( sl1mfn )
- {
- perfc_incrc(shadow_set_l1e_unlinked);
- if ( !get_shadow_ref(sl1mfn) )
- BUG();
- l2pde_general(d, (guest_l2_pgentry_t *)&gpde, &sl2e, sl1mfn);
- __guest_set_l2e(v, va, &gpde);
- __shadow_set_l2e(v, va, &sl2e);
- }
- else
- {
- // no shadow exists, so there's nothing to do.
- perfc_incrc(shadow_set_l1e_fail);
- return;
- }
- }
- }
-
- __shadow_get_l2e(v, va, &sl2e);
-
- if ( shadow_mode_refcounts(d) )
- {
- l1_pgentry_t old_spte;
- __shadow_get_l1e(v, va, &old_spte);
-
- // only do the ref counting if something important changed.
- //
- if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
- {
- if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(new_spte, d) )
- new_spte = l1e_empty();
- if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
- shadow_put_page_from_l1e(old_spte, d);
- }
- }
-
- set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
- __shadow_set_l1e(v, va, &new_spte);
- shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
-}
-
-static void shadow_invlpg_32(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
- l1_pgentry_t gpte, spte;
-
- ASSERT(shadow_mode_enabled(d));
-
- shadow_lock(d);
-
- __shadow_sync_va(v, va);
-
- // XXX mafetter: will need to think about 4MB pages...
-
- // It's not strictly necessary to update the shadow here,
- // but it might save a fault later.
- //
- /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
- sizeof(gpte))) {*/
- if (unlikely(!__guest_get_l1e(v, va, &gpte))) {
- perfc_incrc(shadow_invlpg_faults);
- shadow_unlock(d);
- return;
- }
- l1pte_propagate_from_guest(d, gpte, &spte);
- shadow_set_l1e(va, spte, 1);
-
- shadow_unlock(d);
-}
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void shadow_set_l1e_64(
- unsigned long va, pgentry_64_t *sl1e_p,
- int create_l1_shadow)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- pgentry_64_t sle = { 0 };
- pgentry_64_t sle_up = {0};
- l1_pgentry_t old_spte;
- l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p;
- int i;
- unsigned long orig_va = 0;
-
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- /* This is for 32-bit VMX guest on 64-bit host */
- orig_va = va;
- va = va & (~((1<<L2_PAGETABLE_SHIFT_32)-1));
- }
-
- for ( i = PAGING_L4; i >= PAGING_L2; i-- )
- {
- if ( !__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i) )
- {
- sl1e = l1e_empty();
- goto out;
- }
- if ( !(entry_get_flags(sle) & _PAGE_PRESENT) )
- {
- if ( create_l1_shadow )
- {
- perfc_incrc(shadow_set_l3e_force_map);
- shadow_map_into_current(v, va, i-1, i);
- __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i);
- }
- }
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- {
- if ( i < PAGING_L3 )
- shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
- }
- else
- {
- if ( i < PAGING_L4 )
- shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
- }
-
- sle_up = sle;
- }
-
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- va = orig_va;
- }
-
- if ( shadow_mode_refcounts(d) )
- {
- __shadow_get_l1e(v, va, &old_spte);
- if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) )
- {
- if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(sl1e, d) )
- sl1e = l1e_empty();
- if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
- put_page_from_l1e(old_spte, d);
- }
- }
-
-out:
- __shadow_set_l1e(v, va, &sl1e);
-
- shadow_update_min_max(entry_get_pfn(sle_up), guest_l1_table_offset(va));
-}
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
-
-static struct out_of_sync_entry *
-shadow_alloc_oos_entry(struct domain *d)
-{
- struct out_of_sync_entry *f, *extra;
- unsigned size, i;
-
- if ( unlikely(d->arch.out_of_sync_free == NULL) )
- {
- FSH_LOG("Allocate more fullshadow tuple blocks.");
-
- size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
- extra = xmalloc_bytes(size);
-
- /* XXX Should be more graceful here. */
- if ( extra == NULL )
- BUG();
-
- memset(extra, 0, size);
-
- /* Record the allocation block so it can be correctly freed later. */
- d->arch.out_of_sync_extras_count++;
- *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
- d->arch.out_of_sync_extras;
- d->arch.out_of_sync_extras = &extra[0];
-
- /* Thread a free chain through the newly-allocated nodes. */
- for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
- extra[i].next = &extra[i+1];
- extra[i].next = NULL;
-
- /* Add the new nodes to the free list. */
- d->arch.out_of_sync_free = &extra[0];
- }
-
- /* Allocate a new node from the quicklist. */
- f = d->arch.out_of_sync_free;
- d->arch.out_of_sync_free = f->next;
-
- return f;
-}
-
-static inline unsigned long
-shadow_make_snapshot(
- struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned long smfn, sl1mfn = 0;
- void *original, *snapshot;
- u32 min_max = 0;
- int min, max, length;
-
- if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
- {
- ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
- return SHADOW_SNAPSHOT_ELSEWHERE;
- }
-
- perfc_incrc(shadow_make_snapshot);
-
- if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
- {
- printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
- "Dom%d snapshot_count_count=%d\n",
- gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
- BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
- }
-
- if ( !get_shadow_ref(smfn) )
- BUG();
-
- if ( shadow_mode_refcounts(d) &&
- (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
- min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
- mfn_to_page(smfn)->tlbflush_timestamp = min_max;
-
- min = SHADOW_MIN(min_max);
- max = SHADOW_MAX(min_max);
- length = max - min + 1;
- perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
-
- min *= sizeof(guest_l1_pgentry_t);
- length *= sizeof(guest_l1_pgentry_t);
-
- original = map_domain_page(gmfn);
- snapshot = map_domain_page(smfn);
- memcpy(snapshot + min, original + min, length);
- unmap_domain_page(original);
- unmap_domain_page(snapshot);
-
- return smfn;
-}
-
-static struct out_of_sync_entry *
-__mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
- unsigned long mfn)
-{
- struct domain *d = v->domain;
- struct page_info *page = mfn_to_page(mfn);
- struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(mfn_valid(mfn));
-
-#ifndef NDEBUG
- {
- u32 type = page->u.inuse.type_info & PGT_type_mask;
- if ( shadow_mode_refcounts(d) )
- {
- ASSERT(type == PGT_writable_page);
- }
- else
- {
- ASSERT(type && (type < PGT_l4_page_table));
- }
- }
-#endif
-
- FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
- gpfn, mfn, page->count_info, page->u.inuse.type_info);
-
- // XXX this will require some more thought... Cross-domain sharing and
- // modification of page tables? Hmm...
- //
- if ( d != page_get_owner(page) )
- BUG();
-
- perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
-
- entry->v = v;
- entry->gpfn = gpfn;
- entry->gmfn = mfn;
- entry->writable_pl1e = -1;
-
-#if 0 // this code has not been updated for 32pae & 64 bit modes
-#if SHADOW_DEBUG
- mark_shadows_as_reflecting_snapshot(d, gpfn);
-#endif
-#endif
-
- // increment guest's ref count to represent the entry in the
- // full shadow out-of-sync list.
- //
- get_page(page, d);
-
- return entry;
-}
-
-static struct out_of_sync_entry *
-mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
- unsigned long mfn)
-{
- struct out_of_sync_entry *entry =
- __mark_mfn_out_of_sync(v, gpfn, mfn);
- struct domain *d = v->domain;
-
- entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
- // Add to the out-of-sync list
- //
- entry->next = d->arch.out_of_sync;
- d->arch.out_of_sync = entry;
-
- return entry;
-
-}
-
-static void shadow_mark_va_out_of_sync(
- struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
-{
- struct out_of_sync_entry *entry =
- __mark_mfn_out_of_sync(v, gpfn, mfn);
- l2_pgentry_t sl2e;
- struct domain *d = v->domain;
-
-#if CONFIG_PAGING_LEVELS >= 3
- {
- l4_pgentry_t sl4e;
- l3_pgentry_t sl3e;
-
- __shadow_get_l4e(v, va, &sl4e);
- if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
- shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
- }
-
- if (!__shadow_get_l3e(v, va, &sl3e)) {
- BUG();
- }
-
- if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
- shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
- }
- }
-#endif
-
- // We need the address of the shadow PTE that maps @va.
- // It might not exist yet. Make sure it's there.
- //
- __shadow_get_l2e(v, va, &sl2e);
- if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
- {
- // either this L1 isn't shadowed yet, or the shadow isn't linked into
- // the current L2.
- shadow_map_l1_into_current_l2(va);
- __shadow_get_l2e(v, va, &sl2e);
- }
- ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
-
- entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
- // NB: this is stored as a machine address.
- entry->writable_pl1e =
- l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
- ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
- entry->va = va;
-
- // Increment shadow's page count to represent the reference
- // inherent in entry->writable_pl1e
- //
- if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
- BUG();
-
- // Add to the out-of-sync list
- //
- entry->next = d->arch.out_of_sync;
- d->arch.out_of_sync = entry;
-
- FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
- __func__, va, entry->writable_pl1e);
-}
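
The writable_pl1e field recorded above is, as the code notes, a machine address: the page part comes from the shadow L2 entry and the low bits are the byte offset of the L1 slot for va. A small worked example of that arithmetic follows, using 2-level x86 constants; the concrete numbers are illustrative only.

/* Worked example of the writable_pl1e computation above: machine address of
 * an L1 PTE slot = (page address taken from the shadow L2 entry)
 *                  + (L1 index of va) * sizeof(PTE).
 * Constants below mirror 2-level (non-PAE) x86; values are illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT  12
#define DEMO_L1_ENTRIES  1024
#define DEMO_PTE_SIZE    4        /* sizeof(l1_pgentry_t) on 2-level x86 */

static uint64_t demo_writable_pl1e(uint64_t sl1_page_maddr, uint32_t va)
{
    uint32_t l1_index = (va >> DEMO_PAGE_SHIFT) & (DEMO_L1_ENTRIES - 1);
    return sl1_page_maddr | (uint64_t)(l1_index * DEMO_PTE_SIZE);
}

int main(void)
{
    /* L1 page at machine address 0x1234000, va 0x00403000 -> L1 index 3,
     * so the writable slot lives at 0x1234000 + 3*4 = 0x123400c. */
    printf("%#llx\n",
           (unsigned long long)demo_writable_pl1e(0x1234000u, 0x00403000u));
    return 0;
}
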
-
-/*
- * Returns 1 if the snapshot for @gpfn exists and its @index'th entry matches.
- * Returns 0 otherwise.
- */
-static int snapshot_entry_matches(
- struct domain *d, guest_l1_pgentry_t *guest_pt,
- unsigned long gpfn, unsigned index)
-{
- unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
- guest_l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
- int entries_match;
-
- perfc_incrc(snapshot_entry_matches_calls);
-
- if ( !smfn )
- return 0;
-
- snapshot = map_domain_page(smfn);
-
- if (__copy_from_user(&gpte, &guest_pt[index],
- sizeof(gpte)))
- {
- unmap_domain_page(snapshot);
- return 0;
- }
-
- // This could probably be smarter, but this is sufficient for
- // our current needs.
- //
- entries_match = !guest_l1e_has_changed(gpte, snapshot[index],
- PAGE_FLAG_MASK);
-
- unmap_domain_page(snapshot);
-
-#ifdef PERF_COUNTERS
- if ( entries_match )
- perfc_incrc(snapshot_entry_matches_true);
-#endif
-
- return entries_match;
-}
-
-/*
- * Returns 1 if va's shadow mapping is out-of-sync.
- * Returns 0 otherwise.
- */
-static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */
-{
- struct domain *d = v->domain;
-#if CONFIG_PAGING_LEVELS == 4
- unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)?
- pagetable_get_pfn(v->arch.guest_table) :
- pagetable_get_pfn(v->arch.guest_table_user));
-#else
- unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
-#endif
- unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
- guest_l2_pgentry_t l2e;
- unsigned long l1pfn, l1mfn;
- guest_l1_pgentry_t *guest_pt;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(VALID_M2P(l2pfn));
-
- perfc_incrc(shadow_out_of_sync_calls);
-
-#if CONFIG_PAGING_LEVELS >= 3
-
-#define unmap_and_return(x) \
- if ( guest_pt != (guest_l1_pgentry_t *) v->arch.guest_vtable ) \
- unmap_domain_page(guest_pt); \
- return (x);
-
- if (d->arch.ops->guest_paging_levels >= PAGING_L3)
- {
- pgentry_64_t le;
- unsigned long gmfn;
- unsigned long gpfn;
- int i;
- unsigned int base_idx = 0;
- base_idx = get_cr3_idxval(v);
-
- gmfn = l2mfn;
- gpfn = l2pfn;
- guest_pt = (guest_l1_pgentry_t *)v->arch.guest_vtable;
-
- for ( i = PAGING_L4; i >= PAGING_L3; i-- )
- {
- if (d->arch.ops->guest_paging_levels == PAGING_L3
- && i == PAGING_L4)
- continue; /* skip the top-level for 3-level */
-
- if ( page_out_of_sync(mfn_to_page(gmfn)) &&
- !snapshot_entry_matches(
- d, guest_pt, gpfn, guest_table_offset_64(va, i, base_idx)) )
- {
- unmap_and_return (1);
- }
-
- le = entry_empty();
- __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i);
-
- if ( !(entry_get_flags(le) & _PAGE_PRESENT) )
- {
- unmap_and_return (0);
- }
- gpfn = entry_get_pfn(le);
- gmfn = gmfn_to_mfn(d, gpfn);
- if ( !VALID_MFN(gmfn) )
- {
- unmap_and_return (0);
- }
- if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
- unmap_domain_page(guest_pt);
- guest_pt = (guest_l1_pgentry_t *)map_domain_page(gmfn);
- }
-
- /* L2 */
- if ( page_out_of_sync(mfn_to_page(gmfn)) &&
- !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) )
- {
- unmap_and_return (1);
- }
-
- if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
- unmap_domain_page(guest_pt);
-
- }
- else
-#undef unmap_and_return
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
- {
- if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
- !snapshot_entry_matches(d, (guest_l1_pgentry_t *)v->arch.guest_vtable,
- l2pfn, guest_l2_table_offset(va)) )
- return 1;
- }
-
- __guest_get_l2e(v, va, &l2e);
- if ( !(guest_l2e_get_flags(l2e) & _PAGE_PRESENT) ||
- (guest_l2e_get_flags(l2e) & _PAGE_PSE))
- return 0;
-
- l1pfn = l2e_get_pfn(l2e);
- l1mfn = gmfn_to_mfn(d, l1pfn);
-
- // If the l1 pfn is invalid, it can't be out of sync...
- if ( !VALID_MFN(l1mfn) )
- return 0;
-
- guest_pt = (guest_l1_pgentry_t *) map_domain_page(l1mfn);
-
- if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
- !snapshot_entry_matches(
- d, guest_pt, l1pfn, guest_l1_table_offset(va)) )
- {
- unmap_domain_page(guest_pt);
- return 1;
- }
-
- unmap_domain_page(guest_pt);
- return 0;
-}
-
-static int fix_entry(
- struct domain *d,
- l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
-{
- l1_pgentry_t old = *pt;
- l1_pgentry_t new = old;
-
- l1e_remove_flags(new,_PAGE_RW);
- if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
- BUG();
- (*found)++;
- *pt = new;
- if ( is_l1_shadow )
- shadow_put_page_from_l1e(old, d);
-
- return (*found == max_refs_to_find);
-}
-
-static u32 remove_all_write_access_in_ptpage(
- struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
- unsigned long readonly_gpfn, unsigned long readonly_gmfn,
- u32 max_refs_to_find, unsigned long prediction)
-{
- l1_pgentry_t *pt = map_domain_page(pt_mfn);
- l1_pgentry_t *pt_next = 0, *sl1e_p;
- l1_pgentry_t match;
- unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
- int i;
- u32 found = 0;
- int is_l1_shadow =
- ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
- PGT_l1_shadow);
-#if CONFIG_PAGING_LEVELS >= 3
- is_l1_shadow |=
- ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
- PGT_fl1_shadow);
-#endif
-
- if ( SH_L1_HAS_NEXT_PAGE )
- pt_next = map_domain_page(pt_mfn + 1);
-
- match = l1e_from_pfn(readonly_gmfn, flags);
-
- if ( shadow_mode_external(d) )
- {
- i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
- >> PGT_va_shift;
-
- if ( SH_L1_HAS_NEXT_PAGE &&
- i >= L1_PAGETABLE_ENTRIES )
- sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
- else
- sl1e_p = &pt[i];
-
- if ( (i >= 0 && i < GUEST_L1_PAGETABLE_ENTRIES) &&
- !l1e_has_changed(*sl1e_p, match, flags) &&
- fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) &&
- !prediction )
- goto out;
- }
-
- for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
- {
- if ( SH_L1_HAS_NEXT_PAGE &&
- i >= L1_PAGETABLE_ENTRIES )
- sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
- else
- sl1e_p = &pt[i];
-
- if ( unlikely(!l1e_has_changed(*sl1e_p, match, flags)) &&
- fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) )
- break;
- }
-
-out:
- unmap_domain_page(pt);
- if ( SH_L1_HAS_NEXT_PAGE )
- unmap_domain_page(pt_next);
-
- return found;
-}
-
-static int remove_all_write_access(
- struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
-{
- int i;
- struct shadow_status *a;
- u32 found = 0, write_refs;
- unsigned long predicted_smfn;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(VALID_MFN(readonly_gmfn));
-
- perfc_incrc(remove_write_access);
-
- // If it's not a writable page, then no writable refs can be outstanding.
- //
- if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
- PGT_writable_page )
- {
- perfc_incrc(remove_write_not_writable);
- return 1;
- }
-
- // How many outstanding writable PTEs for this page are there?
- //
- write_refs =
- (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
- if ( write_refs && MFN_PINNED(readonly_gmfn) )
- {
- write_refs--;
- }
-
- if ( write_refs == 0 )
- {
- perfc_incrc(remove_write_no_work);
- return 1;
- }
-
- if ( shadow_mode_external(d) ) {
- if (--write_refs == 0)
- return 0;
-
- // Use the back pointer to locate the shadow page that can contain
- // the PTE of interest
- if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
- found += remove_all_write_access_in_ptpage(
- d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
- if ( found == write_refs )
- return 0;
- }
- }
-
- // Search all the shadow L1 page tables...
- //
- for (i = 0; i < shadow_ht_buckets; i++)
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow
-#if CONFIG_PAGING_LEVELS >= 3
- || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow
-#endif
- )
-
- {
- found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
- if ( found == write_refs )
- return 0;
- }
-
- a = a->next;
- }
- }
-
- FSH_LOG("%s: looking for %d refs, found %d refs",
- __func__, write_refs, found);
-
- return 0;
-}
-
-static void resync_pae_guest_l3(struct domain *d)
-{
- struct out_of_sync_entry *entry;
- unsigned long i, idx;
- unsigned long smfn, gmfn;
- pgentry_64_t *guest, *shadow_l3, *snapshot;
- struct vcpu *v = current;
- int max = -1;
- int unshadow = 0;
-
-
- ASSERT( shadow_mode_external(d) );
-
- gmfn = pagetable_get_pfn(v->arch.guest_table);
-
- for ( entry = d->arch.out_of_sync; entry; entry = entry->next )
- {
- if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
- continue;
- if ( entry->gmfn != gmfn )
- continue;
-
- idx = get_cr3_idxval(v);
-
- smfn = __shadow_status(d, entry->gpfn, PGT_l4_shadow);
-
- if ( !smfn )
- continue;
-
- guest = (pgentry_64_t *)map_domain_page(entry->gmfn);
- snapshot = (pgentry_64_t *)map_domain_page(entry->snapshot_mfn);
- shadow_l3 = (pgentry_64_t *)map_domain_page(smfn);
-
- for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
- {
- int index = i + idx * PAE_L3_PAGETABLE_ENTRIES;
- if ( entry_has_changed(
- guest[index], snapshot[index], PAGE_FLAG_MASK) )
- {
- unsigned long gpfn;
-
- /*
- * Looks like it's no longer a page table.
- */
- if ( unlikely(entry_get_value(guest[index]) & PAE_PDPT_RESERVED) )
- {
- if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(shadow_l3[i]));
-
- shadow_l3[i] = entry_empty();
- continue;
- }
-
- gpfn = entry_get_pfn(guest[index]);
-
- if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
- {
- if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(shadow_l3[i]));
-
- shadow_l3[i] = entry_empty();
- continue;
- }
-
- validate_entry_change(d, &guest[index],
- &shadow_l3[i], PAGING_L3);
- }
-
- if ( entry_get_value(guest[index]) != 0 )
- max = i;
-
- if ( !(entry_get_flags(guest[index]) & _PAGE_PRESENT) &&
- unlikely(entry_get_value(guest[index]) != 0) &&
- !unshadow &&
- (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
- unshadow = 1;
-
- }
- if ( max == -1 )
- unshadow = 1;
-
- unmap_domain_page(guest);
- unmap_domain_page(snapshot);
- unmap_domain_page(shadow_l3);
-
- if ( unlikely(unshadow) )
- shadow_unpin(smfn);
- break;
- }
-}
-
-static int resync_all(struct domain *d, u32 stype)
-{
- struct out_of_sync_entry *entry;
- unsigned i;
- unsigned long smfn;
- void *guest, *shadow, *snapshot;
- int need_flush = 0, external = shadow_mode_external(d);
- int unshadow;
- int changed;
- u32 min_max_shadow, min_max_snapshot;
- int min_shadow, max_shadow, min_snapshot, max_snapshot;
- struct vcpu *v;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
- {
- int max = -1;
-
- if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
- continue;
-
- smfn = __shadow_status(d, entry->gpfn, stype);
-
- if ( !smfn )
- {
- // For heavy weight shadows: no need to update refcounts if
- // there's no shadow page.
- //
- if ( shadow_mode_refcounts(d) )
- continue;
-
- // For light weight shadows: only need to resync the refcounts to
- // the new contents of the guest page iff it has the right
- // page type.
- //
- if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
- continue;
- }
-
- FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
- stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
-
- // Compare guest's new contents to its snapshot, validating
- // and updating its shadow as appropriate.
- //
- guest = map_domain_page(entry->gmfn);
- snapshot = map_domain_page(entry->snapshot_mfn);
-
- if ( smfn )
- shadow = map_domain_page(smfn);
- else
- shadow = NULL;
-
- unshadow = 0;
-
- min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
- min_shadow = SHADOW_MIN(min_max_shadow);
- max_shadow = SHADOW_MAX(min_max_shadow);
-
- min_max_snapshot= mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
- min_snapshot = SHADOW_MIN(min_max_snapshot);
- max_snapshot = SHADOW_MAX(min_max_snapshot);
-
- switch ( stype )
- {
- case PGT_l1_shadow:
- {
- guest_l1_pgentry_t *guest1 = guest;
- l1_pgentry_t *shadow1 = shadow;
- l1_pgentry_t *shadow1_next = 0, *sl1e_p;
- guest_l1_pgentry_t *snapshot1 = snapshot;
- int unshadow_l1 = 0;
-
- ASSERT(shadow_mode_write_l1(d) ||
- shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-
- if ( !shadow_mode_refcounts(d) )
- revalidate_l1(d, (l1_pgentry_t *)guest1, (l1_pgentry_t *)snapshot1);
- if ( !smfn )
- break;
-
- changed = 0;
-
- if ( SH_L1_HAS_NEXT_PAGE && shadow1 )
- shadow1_next = map_domain_page(smfn + 1);
-
- for ( i = min_shadow; i <= max_shadow; i++ )
- {
-
- if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
- sl1e_p = &shadow1_next[i - L1_PAGETABLE_ENTRIES];
- else
- sl1e_p = &shadow1[i];
-
- if ( (i < min_snapshot) || (i > max_snapshot) ||
- guest_l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
- {
- int error;
-
-#if CONFIG_PAGING_LEVELS >= 3
- unsigned long gpfn;
-
- gpfn = guest_l1e_get_paddr(guest1[i]) >> PAGE_SHIFT;
-
- if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
- {
- guest_l1_pgentry_t tmp_gl1e = guest_l1e_empty();
- validate_pte_change(d, tmp_gl1e, sl1e_p);
- unshadow_l1 = 1;
- continue;
- }
-#endif
-
- error = validate_pte_change(d, guest1[i], sl1e_p);
- if ( error == -1 )
- unshadow_l1 = 1;
- else {
- need_flush |= error;
- if ( l1e_get_flags(*sl1e_p) & _PAGE_PRESENT )
- set_guest_back_ptr(d, *sl1e_p, smfn, i);
- }
- // can't update snapshots of linear page tables -- they
- // are used multiple times...
- //
- // snapshot[i] = new_pte;
-
- changed++;
- }
- }
-
- if ( shadow1_next )
- unmap_domain_page(shadow1_next);
-
- perfc_incrc(resync_l1);
- perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
- perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
-
- if ( d->arch.ops->guest_paging_levels >= PAGING_L3 &&
- unshadow_l1 ) {
- pgentry_64_t l2e = { 0 };
-
- __shadow_get_l2e(entry->v, entry->va, &l2e);
-
- if ( entry_get_flags(l2e) & _PAGE_PRESENT ) {
- put_shadow_ref(entry_get_pfn(l2e));
- l2e = entry_empty();
- __shadow_set_l2e(entry->v, entry->va, &l2e);
-
- if (entry->v == current)
- need_flush = 1;
- }
- }
-
- break;
- }
-#if CONFIG_PAGING_LEVELS == 2
- case PGT_l2_shadow:
- {
- l2_pgentry_t *guest2 = guest;
- l2_pgentry_t *shadow2 = shadow;
- l2_pgentry_t *snapshot2 = snapshot;
-
- ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
- BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
- changed = 0;
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- if ( !is_guest_l2_slot(0,i) && !external )
- continue;
-
- l2_pgentry_t new_pde = guest2[i];
- if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
- {
- need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
-
- // can't update snapshots of linear page tables -- they
- // are used multiple times...
- //
- // snapshot[i] = new_pde;
-
- changed++;
- }
- if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
- max = i;
-
- // XXX - This hack works for linux guests.
- // Need a better solution long term.
- if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
- unlikely(l2e_get_intpte(new_pde) != 0) &&
- !unshadow && MFN_PINNED(smfn) )
- unshadow = 1;
- }
- if ( max == -1 )
- unshadow = 1;
- perfc_incrc(resync_l2);
- perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
- break;
- }
- case PGT_hl2_shadow:
- {
- l2_pgentry_t *guest2 = guest;
- l2_pgentry_t *snapshot2 = snapshot;
- l1_pgentry_t *shadow2 = shadow;
-
- ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
- BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
- changed = 0;
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- if ( !is_guest_l2_slot(0, i) && !external )
- continue;
-
- l2_pgentry_t new_pde = guest2[i];
- if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
- {
- need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
-
- // can't update snapshots of linear page tables -- they
- // are used multiple times...
- //
- // snapshot[i] = new_pde;
-
- changed++;
- }
- }
- perfc_incrc(resync_hl2);
- perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
- break;
- }
-#elif CONFIG_PAGING_LEVELS >= 3
- case PGT_l2_shadow:
- case PGT_l3_shadow:
- {
- pgentry_64_t *guest_pt = guest;
- pgentry_64_t *shadow_pt = shadow;
- pgentry_64_t *snapshot_pt = snapshot;
-
- changed = 0;
- for ( i = min_shadow; i <= max_shadow; i++ )
- {
- if ( (i < min_snapshot) || (i > max_snapshot) ||
- entry_has_changed(
- guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) )
- {
- unsigned long gpfn;
-
- gpfn = entry_get_pfn(guest_pt[i]);
- /*
- * Looks like it's no longer a page table.
- */
- if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
- {
- if ( entry_get_flags(shadow_pt[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(shadow_pt[i]));
- shadow_pt[i] = entry_empty();
- continue;
- }
-
- need_flush |= validate_entry_change(
- d, &guest_pt[i], &shadow_pt[i],
- shadow_type_to_level(stype));
- changed++;
- }
-#if CONFIG_PAGING_LEVELS == 3
- if ( stype == PGT_l3_shadow )
- {
- if ( entry_get_value(guest_pt[i]) != 0 )
- max = i;
-
- if ( !(entry_get_flags(guest_pt[i]) & _PAGE_PRESENT) &&
- unlikely(entry_get_value(guest_pt[i]) != 0) &&
- !unshadow &&
- (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
- unshadow = 1;
- }
-#endif
- }
-
- if ( d->arch.ops->guest_paging_levels == PAGING_L3
- && max == -1 && stype == PGT_l3_shadow )
- unshadow = 1;
-
- perfc_incrc(resync_l3);
- perfc_incr_histo(shm_l3_updates, changed, PT_UPDATES);
- break;
- }
- case PGT_l4_shadow:
- {
- guest_root_pgentry_t *guest_root = guest;
- guest_root_pgentry_t *snapshot_root = snapshot;
-
- changed = 0;
- for ( i = 0; i < GUEST_ROOT_PAGETABLE_ENTRIES; i++ )
- {
- guest_root_pgentry_t new_root_e = guest_root[i];
- if ( !is_guest_l4_slot(i) && !external )
- continue;
- if ( root_entry_has_changed(
- new_root_e, snapshot_root[i], PAGE_FLAG_MASK))
- {
-#ifndef GUEST_PGENTRY_32
- l4_pgentry_t *shadow4 = shadow;
- unsigned long gpfn;
-
- gpfn = l4e_get_pfn(new_root_e);
-
- /*
- * Looks like it's no longer a page table.
- */
- if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
- {
- if ( l4e_get_flags(shadow4[i]) & _PAGE_PRESENT )
- put_shadow_ref(l4e_get_pfn(shadow4[i]));
- shadow4[i] = l4e_empty();
- continue;
- }
-
- if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
- {
- need_flush |= validate_entry_change(
- d, (pgentry_64_t *)&new_root_e,
- (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype));
- }
- else
-#endif
- {
- validate_bl2e_change(d, &new_root_e, shadow, i);
- }
- changed++;
- ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i,
- smfn, pagetable_get_paddr(current->arch.shadow_table));
- }
- if ( guest_root_get_intpte(new_root_e) != 0 ) /* FIXME: check flags? */
- max = i;
-
- // Need a better solution in the long term.
- if ( !(guest_root_get_flags(new_root_e) & _PAGE_PRESENT) &&
- unlikely(guest_root_get_intpte(new_root_e) != 0) &&
- !unshadow &&
- (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
- unshadow = 1;
- }
- if ( max == -1 )
- unshadow = 1;
- perfc_incrc(resync_l4);
- perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES);
- break;
- }
-
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
- default:
- BUG();
- }
-
- if ( smfn )
- unmap_domain_page(shadow);
- unmap_domain_page(snapshot);
- unmap_domain_page(guest);
-
- if ( unlikely(unshadow && stype == PGT_root_page_table) )
- {
- for_each_vcpu(d, v)
- if(smfn == pagetable_get_pfn(v->arch.shadow_table))
- return need_flush;
- perfc_incrc(unshadow_l2_count);
- shadow_unpin(smfn);
-#if CONFIG_PAGING_LEVELS == 2
- if ( unlikely(shadow_mode_external(d)) )
- {
- unsigned long hl2mfn;
-
- if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
- MFN_PINNED(hl2mfn) )
- shadow_unpin(hl2mfn);
- }
-#endif
- }
- }
-
- return need_flush;
-}
-
-#if CONFIG_PAGING_LEVELS == 2
-static int resync_all_levels_guest_page(struct domain *d)
-{
- int need_flush = 0;
-
- need_flush |= resync_all(d, PGT_l1_shadow);
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
- shadow_mode_translate(d) )
- {
- need_flush |= resync_all(d, PGT_hl2_shadow);
- }
- return need_flush;
-}
-#elif CONFIG_PAGING_LEVELS == 3
-static int resync_all_levels_guest_page(struct domain *d)
-{
- int need_flush = 0;
-
- need_flush |= resync_all(d, PGT_l1_shadow);
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- need_flush |= resync_all(d, PGT_l4_shadow);
- else
- {
- need_flush |= resync_all(d, PGT_l2_shadow);
- if ( shadow_mode_log_dirty(d) )
- {
- need_flush |= resync_all(d, PGT_l3_shadow);
- need_flush |= resync_all(d, PGT_l4_shadow);
- }
- else
- resync_pae_guest_l3(d);
- }
-
- return need_flush;
-}
-#elif CONFIG_PAGING_LEVELS == 4
-static int resync_all_levels_guest_page(struct domain *d)
-{
- int need_flush = 0;
-
- need_flush |= resync_all(d, PGT_l1_shadow);
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- need_flush |= resync_all(d, PGT_l4_shadow);
- else
- {
- need_flush |= resync_all(d, PGT_l2_shadow);
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- resync_pae_guest_l3(d);
- else
- {
- need_flush |= resync_all(d, PGT_l3_shadow);
- need_flush |= resync_all(d, PGT_l4_shadow);
- }
- }
- return need_flush;
-}
-#endif
-
-static void sync_all(struct domain *d)
-{
- struct out_of_sync_entry *entry;
- int need_flush = 0;
- l1_pgentry_t *ppte, opte, npte;
- cpumask_t other_vcpus_mask;
-
- perfc_incrc(shadow_sync_all);
-
- ASSERT(shadow_lock_is_acquired(d));
-
- // First, remove all write permissions to the page tables
- //
- for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
- {
- // Skip entries that have low bits set... Those aren't
- // real PTEs.
- //
- if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
- continue;
-
- ppte = (l1_pgentry_t *)(
- (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
- (entry->writable_pl1e & ~PAGE_MASK));
- opte = npte = *ppte;
- l1e_remove_flags(npte, _PAGE_RW);
-
- if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(npte, d) )
- BUG();
- *ppte = npte;
- set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
- (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
- shadow_put_page_from_l1e(opte, d);
-
- unmap_domain_page(ppte);
- }
-
- /* Other VCPUs mustn't use the revoked writable mappings. */
- other_vcpus_mask = d->domain_dirty_cpumask;
- cpu_clear(smp_processor_id(), other_vcpus_mask);
- flush_tlb_mask(other_vcpus_mask);
-
- /* Flush ourself later. */
- need_flush = 1;
-
- need_flush |= resync_all_levels_guest_page(d);
-
- if ( need_flush && !unlikely(shadow_mode_external(d)) )
- local_flush_tlb();
-
- free_out_of_sync_state(d);
-}
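
Before resyncing, sync_all() above makes every other VCPU in the domain's dirty set flush its TLB so none of them can keep using the just-revoked writable mappings, while the local flush is deferred until after the resync. A minimal sketch of that "flush everyone but me" step follows; the cpumask type and helpers are simplified stand-ins, not Xen's.

/* Sketch of the "flush all dirty CPUs except the current one" step used by
 * sync_all() above.  The cpumask type and helpers are simplified stand-ins. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_cpumask_t;            /* one bit per CPU, up to 64 CPUs */

static void demo_flush_tlb_mask(demo_cpumask_t mask)
{
    printf("IPI TLB flush to cpus %#llx\n", (unsigned long long)mask);
}

static void demo_flush_others(demo_cpumask_t domain_dirty, unsigned int this_cpu)
{
    demo_cpumask_t others = domain_dirty & ~(1ULL << this_cpu);

    if ( others )
        demo_flush_tlb_mask(others);        /* remote CPUs must flush now */
    /* The local CPU is flushed later, after the resync, as sync_all() does. */
}

int main(void)
{
    demo_flush_others(0x15 /* cpus 0,2,4 dirty */, 2 /* running on cpu 2 */);
    return 0;
}
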
-
-static inline int l1pte_write_fault(
- struct vcpu *v, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
- unsigned long va)
-{
- struct domain *d = v->domain;
- guest_l1_pgentry_t gpte = *gpte_p;
- l1_pgentry_t spte;
- unsigned long gpfn = l1e_get_pfn(gpte);
- unsigned long gmfn = gmfn_to_mfn(d, gpfn);
-
- //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
-
- if ( unlikely(!VALID_MFN(gmfn)) )
- {
- SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
- *spte_p = l1e_empty();
- return 0;
- }
-
- ASSERT(guest_l1e_get_flags(gpte) & _PAGE_RW);
- guest_l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
- spte = l1e_from_pfn(gmfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
- SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
- l1e_get_intpte(spte), l1e_get_intpte(gpte));
-
- __mark_dirty(d, gmfn);
-
- if ( mfn_is_page_table(gmfn) )
- shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
-
- *gpte_p = gpte;
- *spte_p = spte;
-
- return 1;
-}
-
-static inline int l1pte_read_fault(
- struct domain *d, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
-{
- guest_l1_pgentry_t gpte = *gpte_p;
- l1_pgentry_t spte = *spte_p;
- unsigned long pfn = l1e_get_pfn(gpte);
- unsigned long mfn = gmfn_to_mfn(d, pfn);
-
- if ( unlikely(!VALID_MFN(mfn)) )
- {
- SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
- *spte_p = l1e_empty();
- return 0;
- }
-
- guest_l1e_add_flags(gpte, _PAGE_ACCESSED);
- spte = l1e_from_pfn(mfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
- if ( shadow_mode_log_dirty(d) || !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
- mfn_is_page_table(mfn) )
- {
- l1e_remove_flags(spte, _PAGE_RW);
- }
-
- SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
- l1e_get_intpte(spte), l1e_get_intpte(gpte));
- *gpte_p = gpte;
- *spte_p = spte;
-
- return 1;
-}
-#if CONFIG_PAGING_LEVELS == 2
-static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs)
-{
- l1_pgentry_t gpte, spte, orig_gpte;
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l2_pgentry_t gpde;
-
- spte = l1e_empty();
-
- SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
- va, (unsigned long)regs->error_code);
- perfc_incrc(shadow_fault_calls);
-
- check_pagetable(v, "pre-sf");
-
- /*
- * Don't let someone else take the guest's table pages out-of-sync.
- */
- shadow_lock(d);
-
- /* XXX - FIX THIS COMMENT!!!
- * STEP 1. Check to see if this fault might have been caused by an
- * out-of-sync table page entry, or if we should pass this
- * fault onto the guest.
- */
- __shadow_sync_va(v, va);
-
- /*
- * STEP 2. Check the guest PTE.
- */
- __guest_get_l2e(v, va, &gpde);
- if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
- {
- SH_VVLOG("shadow_fault - EXIT: L1 not present");
- perfc_incrc(shadow_fault_bail_pde_not_present);
- goto fail;
- }
-
- // This can't fault because we hold the shadow lock and we've ensured that
- // the mapping is in-sync, so the check of the PDE's present bit, above,
- // covers this access.
- //
- //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
- __guest_get_l1e(v, va, &gpte);
- orig_gpte = gpte;
-
- if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
- {
- SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
- l1e_get_intpte(gpte));
- perfc_incrc(shadow_fault_bail_pte_not_present);
- goto fail;
- }
-
- /* Write fault? */
- if ( regs->error_code & 2 )
- {
- int allow_writes = 0;
-
- if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
- {
- if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
- {
- allow_writes = 1;
- l1e_add_flags(gpte, _PAGE_RW);
- }
- else
- {
- /* Write fault on a read-only mapping. */
- SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
- l1e_get_intpte(gpte));
- perfc_incrc(shadow_fault_bail_ro_mapping);
- goto fail;
- }
- }
- else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
- {
- SH_LOG("l1pte_write_fault: no write access to page table page");
- domain_crash_synchronous();
- }
-
- if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
- {
- SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
- perfc_incrc(write_fault_bail);
- shadow_unlock(d);
- return 0;
- }
-
- if ( allow_writes )
- l1e_remove_flags(gpte, _PAGE_RW);
- }
- else
- {
- if ( !l1pte_read_fault(d, &gpte, &spte) )
- {
- SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
- perfc_incrc(read_fault_bail);
- shadow_unlock(d);
- return 0;
- }
- }
-
- /*
- * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
- */
- if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
- {
- /* XXX Watch out for read-only L2 entries! (not used in Linux). */
- /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
- &gpte, sizeof(gpte))) )*/
- if ( unlikely(!__guest_set_l1e(v, va, &gpte)))
- {
- printk("%s() failed, crashing domain %d "
- "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
- __func__,d->domain_id, l2e_get_intpte(gpde), va);
- domain_crash_synchronous();
- }
-
- __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
- }
-
- shadow_set_l1e(va, spte, 1);
-
- perfc_incrc(shadow_fault_fixed);
- d->arch.shadow_fault_count++;
-
- shadow_unlock(d);
-
- check_pagetable(v, "post-sf");
- return EXCRET_fault_fixed;
-
-fail:
- shadow_unlock(d);
- return 0;
-}
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-static inline unsigned long va_to_l1mfn(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
- guest_l2_pgentry_t gl2e = {0};
-
- __guest_get_l2e(v, va, &gl2e);
-
- if ( unlikely(!(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT)) )
- return INVALID_MFN;
-
- return gmfn_to_mfn(d, l2e_get_pfn(gl2e));
-}
-
-static int do_update_va_mapping(unsigned long va,
- l1_pgentry_t val,
- struct vcpu *v)
-{
- struct domain *d = v->domain;
- l1_pgentry_t spte;
- int rc = 0;
-
- shadow_lock(d);
-
- // This is actually overkill - we don't need to sync the L1 itself,
- // just everything involved in getting to this L1 (i.e. we need
- // linear_pg_table[l1_linear_offset(va)] to be in sync)...
- //
- __shadow_sync_va(v, va);
-
- l1pte_propagate_from_guest(d, *(guest_l1_pgentry_t *)&val, &spte);
-#if CONFIG_PAGING_LEVELS == 2
- shadow_set_l1e(va, spte, 0);
-#elif CONFIG_PAGING_LEVELS >= 3
- shadow_set_l1e_64(va, (pgentry_64_t *) &spte, 0);
-#endif
- /*
- * If we're in log-dirty mode then we need to note that we've updated
- * the PTE in the PT-holding page. We need the machine frame number
- * for this.
- */
- __mark_dirty(d, va_to_l1mfn(v, va));
-
- shadow_unlock(d);
-
- return rc;
-}
-
-
-/*
- * What lives where in the 32-bit address space in the various shadow modes,
- * and what it uses to get/maintain that mapping.
- *
- * SHADOW MODE: none enable translate external
- *
- * 4KB things:
- * guest_vtable lin_l2 mapped per gl2 lin_l2 via hl2 mapped per gl2
- * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gl2
- * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gl2
- * monitor_vtable n/a n/a n/a mapped once
- *
- * 4MB things:
- * guest_linear lin via gl2 lin via gl2 lin via hl2 lin via hl2
- * shadow_linear n/a sh_lin via sl2 sh_lin via sl2 sh_lin via sl2
- * monitor_linear n/a n/a n/a ???
- * perdomain perdomain perdomain perdomain perdomain
- * R/O M2P R/O M2P R/O M2P n/a n/a
- * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P
- * P2M n/a n/a R/O M2P R/O M2P
- *
- * NB:
- * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(),
- * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
- * all play a part in maintaining these mappings.
- */
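-
-/*
- * Editor's illustration (not part of the original source): the lin_l2 /
- * sh_lin_l2 entries in the table above rely on the classic recursive
- * (self-referencing) page-table mapping.  For 32-bit 2-level paging, if L2
- * slot S maps the L2 page itself, the PTE covering any virtual address va
- * becomes visible at a fixed virtual address.  Sketch with invented names:
- */
-#if 0 /* standalone sketch, not built */
-#define EX_LINEAR_BASE(slot)  ((unsigned long)(slot) << 22)
-
-/* Virtual address at which the L1 entry (PTE) mapping 'va' appears. */
-static unsigned long ex_linear_pte_va(unsigned int self_slot, unsigned long va)
-{
-    return EX_LINEAR_BASE(self_slot) + ((va >> 12) << 2);
-}
-
-/*
- * Worked example with self_slot = 0x300 (linear base 0xC0000000):
- *   va = 0x00401000  ->  0xC0000000 + (0x401 << 2) = 0xC0001004
- */
-#endif
-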
-static void shadow_update_pagetables(struct vcpu *v)
-{
- struct domain *d = v->domain;
-#if CONFIG_PAGING_LEVELS == 4
- unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)?
- pagetable_get_pfn(v->arch.guest_table) :
- pagetable_get_pfn(v->arch.guest_table_user));
-#else
- unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
-#endif
-
- unsigned long gpfn = mfn_to_gmfn(d, gmfn);
- unsigned long smfn, old_smfn;
-
-#if CONFIG_PAGING_LEVELS == 2
- unsigned long hl2mfn;
-#endif
- int need_sync = 0;
-
- int max_mode = ( shadow_mode_external(d) ? SHM_external
- : shadow_mode_translate(d) ? SHM_translate
- : shadow_mode_enabled(d) ? SHM_enable
- : 0 );
-
- ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
- ASSERT( max_mode );
-
- /*
- * arch.guest_vtable
- */
- if ( max_mode & (SHM_enable | SHM_external) )
- {
- if ( likely(v->arch.guest_vtable != NULL) )
- unmap_domain_page_global(v->arch.guest_vtable);
- v->arch.guest_vtable = map_domain_page_global(gmfn);
- }
-
- /*
- * arch.shadow_table
- */
-#if CONFIG_PAGING_LEVELS == 3 && defined (GUEST_PGENTRY_32)
- /*
- * We use PGT_l4_shadow for 2-level paging guests on PAE
- */
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
- smfn = shadow_l3_table(v, gpfn, gmfn);
- }
- else
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3 && defined ( GUEST_32PAE )
- /*
-     * We also use PGT_l4_shadow for 3-level (PAE) paging guests
- */
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- {
- if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
- smfn = shadow_l3_table(v, gpfn, gmfn);
- else
- {
- update_top_level_shadow(v, smfn);
- need_sync = 1;
- }
- }
- else
-#endif
- if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
- {
-#if CONFIG_PAGING_LEVELS == 2
- smfn = shadow_l2_table(v, gpfn, gmfn);
-#elif CONFIG_PAGING_LEVELS == 3
- smfn = shadow_l3_table(v, gpfn, gmfn);
-#elif CONFIG_PAGING_LEVELS == 4
- smfn = shadow_l4_table(v, gpfn, gmfn);
-#endif
- }
- else
- {
-#if CONFIG_PAGING_LEVELS >= 3
- if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
- update_top_level_shadow(v, smfn);
-#endif
- /*
-         * Move the sync later in order to avoid this smfn being
-         * unshadowed occasionally.
- */
- need_sync = 1;
- }
-
-
- if ( !get_shadow_ref(smfn) )
- BUG();
- old_smfn = pagetable_get_pfn(v->arch.shadow_table);
- v->arch.shadow_table = pagetable_from_pfn(smfn);
- if ( old_smfn )
- put_shadow_ref(old_smfn);
-
- SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
-
- /*
- * arch.shadow_vtable
- */
- if ( max_mode == SHM_external
-#if CONFIG_PAGING_LEVELS >=3
- || max_mode & SHM_enable
-#endif
- )
- {
- if ( v->arch.shadow_vtable )
- unmap_domain_page_global(v->arch.shadow_vtable);
- v->arch.shadow_vtable = map_domain_page_global(smfn);
- }
-
-#if CONFIG_PAGING_LEVELS == 2
- /*
- * arch.hl2_vtable
- */
-
- // if max_mode == SHM_translate, then the hl2 is already installed
- // correctly in its smfn, and there's nothing to do.
- //
- if ( max_mode == SHM_external )
- {
- if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
- hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
- if ( v->arch.hl2_vtable )
- unmap_domain_page_global(v->arch.hl2_vtable);
- v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
- }
-
- /*
- * fixup pointers in monitor table, as necessary
- */
- if ( max_mode == SHM_external )
- {
- l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
- l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
- l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
- ASSERT( shadow_mode_translate(d) );
-
- if ( !get_shadow_ref(hl2mfn) )
- BUG();
- mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
- if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
- put_shadow_ref(l2e_get_pfn(old_hl2e));
-
- if ( !get_shadow_ref(smfn) )
- BUG();
- mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
- if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
- put_shadow_ref(l2e_get_pfn(old_sl2e));
-
- // XXX - maybe this can be optimized somewhat??
- local_flush_tlb();
- }
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-#if CONFIG_PAGING_LEVELS == 3
- /*
- * fixup pointers in monitor table, as necessary
- */
- if ( max_mode == SHM_external )
- {
- l3_pgentry_t *mpl3e = (l3_pgentry_t *) v->arch.monitor_vtable;
- l2_pgentry_t *spl2e;
- unsigned long s2mfn;
- int i;
-
- ASSERT( shadow_mode_translate(d) );
- s2mfn = l3e_get_pfn(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
-
- ASSERT( s2mfn);
- spl2e = map_domain_page(s2mfn);
-
- for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
- spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
- (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
- l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
- l2e_empty();
-
- unmap_domain_page(spl2e);
- local_flush_tlb();
- }
-#endif
-
-    if ( likely(need_sync) )
- shadow_sync_all(d);
-}
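-
-/*
- * Editor's illustration (not part of the original source): the
- * get_shadow_ref()/put_shadow_ref() sequence in the function above is the
- * usual refcounted pointer swap: pin the new object before publishing it,
- * and only afterwards drop the reference on whatever was installed before.
- * A generic sketch with invented names and plain counters, no locking:
- */
-#if 0 /* standalone sketch, not built */
-struct ex_obj { unsigned long refcount; };
-
-static void ex_swap_ref(struct ex_obj **slot, struct ex_obj *new_obj)
-{
-    struct ex_obj *old_obj = *slot;
-
-    new_obj->refcount++;         /* pin the new object first          */
-    *slot = new_obj;             /* then publish it                   */
-    if ( old_obj != NULL )
-        old_obj->refcount--;     /* now the old one can be released   */
-}
-#endif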
-
-
-/************************************************************************/
-/************************************************************************/
-/************************************************************************/
-
-#if 0 // this code has not been updated for 32pae & 64 bit modes
-#if SHADOW_DEBUG
-
-// The following is entirely for _check_pagetable()'s benefit.
-// _check_pagetable() wants to know whether a given entry in a
-// shadow page table is supposed to be the shadow of the guest's
-// current entry, or the shadow of the entry held in the snapshot
-// taken above.
-//
-// Here, we mark all currently existing entries as reflecting
-// the snapshot, above. All other places in xen that update
-// the shadow will keep the shadow in sync with the guest's
-// entries (via l1pte_propagate_from_guest and friends), which clear
-// the SHADOW_REFLECTS_SNAPSHOT bit.
-//
-static void
-mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
-{
- unsigned long smfn;
- l1_pgentry_t *l1e;
- l2_pgentry_t *l2e;
- unsigned i;
-
- if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
- {
- l1e = map_domain_page(smfn);
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- if ( is_guest_l1_slot(i) &&
- (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
- l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
- unmap_domain_page(l1e);
- }
-
- if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
- {
- l2e = map_domain_page(smfn);
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- if ( is_guest_l2_slot(0, i) &&
- (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
- l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
- unmap_domain_page(l2e);
- }
-}
-
-// BUG: these are not SMP safe...
-static int sh_l2_present;
-static int sh_l1_present;
-static char *sh_check_name;
-// int shadow_status_noswap; // declared in shadow32.c
-
-#define v2m(_v, _adr) ({ \
- unsigned long _a = (unsigned long)(_adr); \
- l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
- unsigned long _pa = -1; \
- if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
- { \
- l1_pgentry_t _pte; \
- _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
- if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
- _pa = l1e_get_paddr(_pte); \
- } \
- _pa | (_a & ~PAGE_MASK); \
-})
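-
-/*
- * Editor's illustration (not part of the original source): v2m() above walks
- * the shadow linear tables to turn a hypervisor virtual address into a
- * machine address for the diagnostics below.  Stripped of the Xen plumbing,
- * it is the standard two-level walk.  Sketch with invented names, assuming
- * a 32-bit non-PAE layout and identity-mapped page tables:
- */
-#if 0 /* standalone sketch, not built */
-static unsigned long ex_v2m(const unsigned long *l2, unsigned long va)
-{
-    unsigned long pde, pte;
-    const unsigned long *l1;
-
-    pde = l2[(va >> 22) & 0x3ff];                    /* pick the L1 table  */
-    if ( !(pde & 1) )                                /* L2 entry absent    */
-        return (unsigned long)-1;
-
-    l1 = (const unsigned long *)(pde & ~0xfffUL);    /* identity-mapped    */
-    pte = l1[(va >> 12) & 0x3ff];                    /* pick the 4K frame  */
-    if ( !(pte & 1) )                                /* L1 entry absent    */
-        return (unsigned long)-1;
-
-    return (pte & ~0xfffUL) | (va & 0xfffUL);        /* frame + offset     */
-}
-#endif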
-
-#define FAIL(_f, _a...) \
- do { \
- printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
- sh_check_name, level, l2_idx, l1_idx, ## _a, \
- __FILE__, __LINE__); \
- printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
- " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
- " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
- " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
- l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
- l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
- p_guest_pte, p_shadow_pte, p_snapshot_pte, \
- (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
- (void *)v2m(v, p_snapshot_pte), \
- (l2_idx << L2_PAGETABLE_SHIFT) | \
- (l1_idx << L1_PAGETABLE_SHIFT)); \
- errors++; \
- } while ( 0 )
-
-static int check_pte(
- struct vcpu *v,
- l1_pgentry_t *p_guest_pte,
- l1_pgentry_t *p_shadow_pte,
- l1_pgentry_t *p_snapshot_pte,
- int level, int l2_idx, int l1_idx)
-{
- struct domain *d = v->domain;
- l1_pgentry_t guest_pte = *p_guest_pte;
- l1_pgentry_t shadow_pte = *p_shadow_pte;
- l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
- l1_pgentry_t eff_guest_pte;
- unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
- int errors = 0, guest_writable;
- int page_table_page;
-
- if ( (l1e_get_intpte(shadow_pte) == 0) ||
- (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
- (l1e_get_intpte(shadow_pte) == 0x00000E00) )
- return errors; /* always safe */
-
- if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
-        FAIL("Non-zero but not-present shadow_pte");
-
- if ( level == 2 ) sh_l2_present++;
- if ( level == 1 ) sh_l1_present++;
-
- if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
- eff_guest_pte = snapshot_pte;
- else
- eff_guest_pte = guest_pte;
-
- if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
- FAIL("Guest not present yet shadow is");
-
- mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
-
- if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
- FAIL("Corrupt?");
-
- if ( (level == 1) &&
- (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
- !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
- FAIL("Dirty coherence");
-
- if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
- !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
- FAIL("Accessed coherence");
-
- if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
- FAIL("global bit set in shadow");
-
- eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
- eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
- shadow_mfn = l1e_get_pfn(shadow_pte);
-
- if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
- FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
- __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
-
- page_table_page = mfn_is_page_table(eff_guest_mfn);
-
- guest_writable =
- (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
- (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
-
- if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
- {
- printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
- eff_guest_pfn, eff_guest_mfn, shadow_mfn,
- mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
- page_table_page);
- FAIL("RW coherence");
- }
-
- if ( (level == 1) &&
- (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
- !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
- {
- printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
- eff_guest_pfn, eff_guest_mfn, shadow_mfn,
- mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
- page_table_page);
- FAIL("RW2 coherence");
- }
-
- if ( eff_guest_mfn == shadow_mfn )
- {
- if ( level > 1 )
- FAIL("Linear map ???"); /* XXX this will fail on BSD */
- }
- else
- {
- if ( level < 2 )
- FAIL("Shadow in L1 entry?");
-
- if ( level == 2 )
- {
- if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
- FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
- __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
- }
- else
- BUG(); // XXX -- not handled yet.
- }
-
- return errors;
-}
-#undef FAIL
-#undef v2m
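-
-/*
- * Editor's illustration (not part of the original source): check_pte() above
- * only demands that guest and shadow PTEs agree on the flag bits the shadow
- * code never rewrites.  The frame number (guest pfn vs. machine mfn) and the
- * bits the shadow manages itself (RW, ACCESSED, DIRTY, GLOBAL, AVAIL) are
- * excluded from the comparison.  Sketch with invented names:
- */
-#if 0 /* standalone sketch, not built */
-#define EX_IGNORED ( 0x002UL /* RW */    | 0x020UL /* ACCESSED */ | \
-                     0x040UL /* DIRTY */ | 0x100UL /* GLOBAL */   | \
-                     0xe00UL /* AVAIL */ | ~0xfffUL /* frame */ )
-
-static int ex_ptes_coherent(unsigned long guest_pte, unsigned long shadow_pte)
-{
-    return ((guest_pte ^ shadow_pte) & ~EX_IGNORED) == 0;
-}
-#endif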
-
-static int check_l1_table(
- struct vcpu *v, unsigned long gpfn,
- unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
-{
- struct domain *d = v->domain;
- int i;
- unsigned long snapshot_mfn;
- l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
- int errors = 0;
-
- if ( page_out_of_sync(mfn_to_page(gmfn)) )
- {
- snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
- ASSERT(snapshot_mfn);
- p_snapshot = map_domain_page(snapshot_mfn);
- }
-
- p_guest = map_domain_page(gmfn);
- p_shadow = map_domain_page(smfn);
-
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- errors += check_pte(v, p_guest+i, p_shadow+i,
- p_snapshot ? p_snapshot+i : NULL,
- 1, l2_idx, i);
-
- unmap_domain_page(p_shadow);
- unmap_domain_page(p_guest);
- if ( p_snapshot )
- unmap_domain_page(p_snapshot);
-
- return errors;
-}
-
-#define FAILPT(_f, _a...) \
- do { \
- printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
- errors++; \
- } while ( 0 )
-
-static int check_l2_table(
- struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
-{
- struct domain *d = v->domain;
- l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
- l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
- l2_pgentry_t match;
- int i;
- int errors = 0;
- int limit;
-
- if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
- FAILPT("domain doesn't own page");
- if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
- FAILPT("bogus owner for snapshot page");
- if ( page_get_owner(mfn_to_page(smfn)) != NULL )
- FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
- smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
-
-#if 0
- if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
- DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
- {
- for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
- i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
- i++ )
- printk("+++ (%d) %lx %lx\n",i,
- l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
- FAILPT("hypervisor entries inconsistent");
- }
-
- if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
- l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
- FAILPT("hypervisor linear map inconsistent");
-#endif
-
- match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
- if ( !shadow_mode_external(d) &&
- l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
- match, PAGE_FLAG_MASK))
- {
- FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
- l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
- L2_PAGETABLE_SHIFT]),
- l2e_get_intpte(match));
- }
-
- match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
- if ( !shadow_mode_external(d) &&
- l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
- match, PAGE_FLAG_MASK))
- {
- FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
- l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
- d->arch.mm_perdomain_pt,
- l2e_get_intpte(match));
- }
-
-#if CONFIG_PAGING_LEVELS == 2
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#else
- limit = 0; /* XXX x86/64 XXX */
-#endif
-
- /* Check the whole L2. */
- for ( i = 0; i < limit; i++ )
- errors += check_pte(v,
- (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
- (l1_pgentry_t*)(&spl2e[i]),
- NULL,
- 2, i, 0);
-
- unmap_domain_page(spl2e);
- unmap_domain_page(gpl2e);
-
-#if 1
- if ( errors )
- printk("check_l2_table returning %d errors\n", errors);
-#endif
-
- return errors;
-}
-#undef FAILPT
-
-int _check_pagetable(struct vcpu *v, char *s)
-{
- struct domain *d = v->domain;
-#if CONFIG_PAGING_LEVELS == 4
- pagetable_t pt = ((v->arch.flags & TF_kernel_mode)?
- v->arch.guest_table : v->arch.guest_table_user);
-#else
- pagetable_t pt = v->arch.guest_table;
-#endif
- unsigned long gptbase = pagetable_get_paddr(pt);
- unsigned long ptbase_pfn, smfn;
- unsigned long i;
- l2_pgentry_t *gpl2e, *spl2e;
- unsigned long ptbase_mfn = 0;
- int errors = 0, limit, oos_pdes = 0;
-
- //_audit_domain(d, AUDIT_QUIET);
- shadow_lock(d);
-
- sh_check_name = s;
- //SH_VVLOG("%s-PT Audit", s);
- sh_l2_present = sh_l1_present = 0;
- perfc_incrc(check_pagetable);
-
- ptbase_mfn = gptbase >> PAGE_SHIFT;
- ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
-
- if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
- {
- printk("%s-PT %lx not shadowed\n", s, gptbase);
- goto out;
- }
- if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
- {
- ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
- oos_pdes = 1;
- ASSERT(ptbase_mfn);
- }
-
- errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
-
- gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
- spl2e = (l2_pgentry_t *) map_domain_page(smfn);
-
- /* Go back and recurse. */
-#if CONFIG_PAGING_LEVELS == 2
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#else
- limit = 0; /* XXX x86/64 XXX */
-#endif
-
- for ( i = 0; i < limit; i++ )
- {
- unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
- unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
- unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
-
- if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
- {
- errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
- }
- }
-
- unmap_domain_page(spl2e);
- unmap_domain_page(gpl2e);
-
-#if 0
- SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
- sh_l2_present, sh_l1_present);
-#endif
-
- out:
- if ( errors )
- BUG();
-
- shadow_unlock(d);
-
- return errors;
-}
-
-int _check_all_pagetables(struct vcpu *v, char *s)
-{
- struct domain *d = v->domain;
- int i;
- struct shadow_status *a;
- unsigned long gmfn;
- int errors = 0;
-
- shadow_status_noswap = 1;
-
- sh_check_name = s;
- SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
- sh_l2_present = sh_l1_present = 0;
- perfc_incrc(check_all_pagetables);
-
- for (i = 0; i < shadow_ht_buckets; i++)
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
-
- switch ( a->gpfn_and_flags & PGT_type_mask )
- {
- case PGT_l1_shadow:
- errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
- gmfn, a->smfn, 0);
- break;
- case PGT_l2_shadow:
- errors += check_l2_table(v, gmfn, a->smfn,
- page_out_of_sync(mfn_to_page(gmfn)));
- break;
- case PGT_l3_shadow:
- case PGT_l4_shadow:
- case PGT_hl2_shadow:
- BUG(); // XXX - ought to fix this...
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- break;
- default:
- errors++;
- printk("unexpected shadow type %lx, gpfn=%lx, "
- "gmfn=%lx smfn=%lx\n",
- a->gpfn_and_flags & PGT_type_mask,
- a->gpfn_and_flags & PGT_mfn_mask,
- gmfn, a->smfn);
- BUG();
- }
- a = a->next;
- }
- }
-
- shadow_status_noswap = 0;
-
- if ( errors )
- BUG();
-
- return errors;
-}
-
-#endif // SHADOW_DEBUG
-#endif // this code has not been updated for 32pae & 64 bit modes
-
-#if CONFIG_PAGING_LEVELS >= 3
-/****************************************************************************/
-/* 64-bit shadow-mode code testing */
-/****************************************************************************/
-/*
- * init_bl2() is for 32-bit VMX guests on a 64-bit host.
- * It uses 1 shadow L4 (acting as L3) and 4 shadow L2s to simulate the guest L2.
- */
-static inline unsigned long init_bl2(
- struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned int count;
- unsigned long sl2mfn;
- unsigned long smfn;
- struct page_info *page;
- l4_pgentry_t *spl4e;
- void *l2;
-
- if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
- {
- printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
- /* XXX Deal gracefully with failure. */
- domain_crash_synchronous();
- }
-
- spl4e = (l4_pgentry_t *)map_domain_page(smfn);
-
- /* Map the self entry, L4&L3 share the same page */
- spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
- /* Allocate 4 shadow L2s */
- page = alloc_domheap_pages(NULL, SL2_ORDER, 0);
- if ( !page )
- domain_crash_synchronous();
-
- for ( count = 0; count < PAE_L3_PAGETABLE_ENTRIES; count++ )
- {
- sl2mfn = page_to_mfn(page+count);
- l2 = map_domain_page(sl2mfn);
- memset(l2, 0, PAGE_SIZE);
- unmap_domain_page(l2);
- spl4e[count] = l4e_from_pfn(sl2mfn, _PAGE_PRESENT);
- }
-
- unmap_domain_page(spl4e);
-
- return smfn;
-}
-
-static inline unsigned long init_l3(
- struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned long smfn;
- l4_pgentry_t *spl4e;
- unsigned long index;
-
- if ( unlikely(!(smfn = alloc_shadow_page(v->domain, gpfn, gmfn, PGT_l4_shadow))) )
- {
-        printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
-        BUG(); /* XXX Deal gracefully with failure. */
- }
-
- /* Map the self entry, L4&L3 share the same page */
- spl4e = (l4_pgentry_t *)map_domain_page(smfn);
-
- /*
-     * The shadow L4's pfn_info->tlbflush_timestamp
-     * should also save its own index.
- */
-
- index = get_cr3_idxval(v);
- frame_table[smfn].tlbflush_timestamp = index;
-
- memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
- spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
- unmap_domain_page(spl4e);
- return smfn;
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3
-static unsigned long shadow_l3_table(
- struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned long smfn;
- l3_pgentry_t *spl3e;
- struct domain *d = v->domain;
-
- perfc_incrc(shadow_l3_table_count);
-
- SH_VVLOG("shadow_l3_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
- if ( SH_L1_HAS_NEXT_PAGE &&
- d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- return init_bl2(d, gpfn, gmfn);
- }
-
- if ( SH_GUEST_32PAE &&
- d->arch.ops->guest_paging_levels == PAGING_L3 )
- {
- return init_l3(v, gpfn, gmfn);
- }
-
- if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l3_shadow))) )
- {
- printk("Couldn't alloc an L3 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
- BUG(); /* XXX Deal gracefully with failure. */
- }
-
- spl3e = (l3_pgentry_t *)map_domain_page(smfn);
-
- /* Make the self entry */
- spl3e[PAE_SHADOW_SELF_ENTRY] = l3e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
- if ( (PGT_base_page_table == PGT_l3_page_table) &&
- !shadow_mode_external(d) ) {
- int i;
- unsigned long g2mfn, s2mfn;
- l2_pgentry_t *spl2e;
- l3_pgentry_t *gpl3e;
-
- /* Get the top entry */
- gpl3e = (l3_pgentry_t *)map_domain_page(gmfn);
-
- if ( !(l3e_get_flags(gpl3e[L3_PAGETABLE_ENTRIES - 1]) & _PAGE_PRESENT) )
- {
- BUG();
- }
-
- g2mfn = l3e_get_pfn(gpl3e[L3_PAGETABLE_ENTRIES - 1]);
-
-        /* NB. g2mfn should be the same as g2pfn */
- if (!(s2mfn = __shadow_status(d, g2mfn, PGT_l2_shadow))) {
- if ( unlikely(!(s2mfn =
- alloc_shadow_page(d, g2mfn, g2mfn, PGT_l2_shadow))) ) {
- printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
- g2mfn, g2mfn);
- BUG(); /* XXX Deal gracefully with failure. */
- }
- }
-
- if (!get_shadow_ref(s2mfn))
- BUG();
-
- /* Map shadow L2 into shadow L3 */
- spl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(s2mfn, _PAGE_PRESENT);
- shadow_update_min_max(smfn, L3_PAGETABLE_ENTRIES -1);
-
- /*
-         * Xen private mappings. Do the same things as
- * create_pae_xen_mappings().
- */
- spl2e = (l2_pgentry_t *)map_domain_page(s2mfn);
-
- /*
-         * When we free L2 pages, we need to tell whether the page contains
-         * Xen private mappings; use the va_mask part for that.
- */
- mfn_to_page(s2mfn)->u.inuse.type_info |=
- (unsigned long) 3 << PGT_score_shift;
-
- memset(spl2e, 0,
- (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)) * sizeof(l2_pgentry_t));
-
- memcpy(&spl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
- &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
- L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
-
- for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
- spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- l2e_from_page(
- virt_to_page(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_pt) + i,
- __PAGE_HYPERVISOR);
- for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
- spl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
- (l3e_get_flags(gpl3e[i]) & _PAGE_PRESENT) ?
- l2e_from_pfn(l3e_get_pfn(gpl3e[i]), __PAGE_HYPERVISOR) :
- l2e_empty();
-
- unmap_domain_page(spl2e);
- unmap_domain_page(gpl3e);
- }
- unmap_domain_page(spl3e);
-
- return smfn;
-}
-#endif /* CONFIG_PAGING_LEVELS == 3 */
-
-#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
-static unsigned long gva_to_gpa_pae(unsigned long gva)
-{
- BUG();
- return 43;
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-static unsigned long shadow_l4_table(
- struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned long smfn;
- l4_pgentry_t *spl4e;
- struct domain *d = v->domain;
-
- SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
- perfc_incrc(shadow_l4_table_count);
-
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- return init_bl2(d, gpfn, gmfn);
- }
-
- if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
- {
- return init_l3(v, gpfn, gmfn);
- }
-
- if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
- {
- printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
- BUG(); /* XXX Deal gracefully with failure. */
- }
-
- spl4e = (l4_pgentry_t *)map_domain_page(smfn);
-
-    /* Install hypervisor and 4x linear p.t. mappings. */
- if ( (PGT_base_page_table == PGT_l4_page_table) &&
- !shadow_mode_external(d) )
- {
- /*
- * We could proactively fill in PDEs for pages that are already
- * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
- * (restriction required for coherence of the accessed bit). However,
- * we tried it and it didn't help performance. This is simpler.
- */
- memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
-
-        /* Install hypervisor and 2x linear p.t. mappings. */
- memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
- &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
- ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
-
- spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
- l4e_from_paddr(__pa(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_l3),
- __PAGE_HYPERVISOR);
-
- if ( shadow_mode_translate(d) ) // NB: not external
- {
- spl4e[l4_table_offset(RO_MPT_VIRT_START)] =
- l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
- __PAGE_HYPERVISOR);
- }
- else
- spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
- l4e_from_pfn(gmfn, __PAGE_HYPERVISOR);
-
- } else
- memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
-
- unmap_domain_page(spl4e);
-
- ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn);
- return smfn;
-}
-#endif /* CONFIG_PAGING_LEVELS == 4 */
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void
-update_top_level_shadow(struct vcpu *v, unsigned long smfn)
-{
- unsigned long index = get_cr3_idxval(v);
- pgentry_64_t *sple = (pgentry_64_t *)map_domain_page(smfn);
- pgentry_64_t *gple = (pgentry_64_t *)&v->arch.guest_vtable;
- int i;
-
- for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
- {
- unsigned long gpfn;
-
- /*
- * Looks like it's no longer a page table.
- */
- if ( unlikely(entry_get_value(gple[index*4+i]) & PAE_PDPT_RESERVED) )
- {
- if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(sple[i]));
-
- sple[i] = entry_empty();
- continue;
- }
-
- gpfn = entry_get_pfn(gple[index*4+i]);
-
- if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
- {
- if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(sple[i]));
-
- sple[i] = entry_empty();
- continue;
- }
-
- validate_entry_change(
- v->domain, &gple[index*4+i], &sple[i], PAGING_L3);
- }
-
- unmap_domain_page(sple);
-}
-
-/*
- * validate_bl2e_change()
- * This code is for a 32-bit HVM guest on a 64-bit host;
- * it keeps the guest L2 in sync.
- */
-
-static inline void
-validate_bl2e_change(
- struct domain *d,
- guest_root_pgentry_t *new_gle_p,
- pgentry_64_t *shadow_l3,
- int index)
-{
- int sl3_idx, sl2_idx;
- unsigned long sl2mfn, sl1mfn;
- pgentry_64_t *sl2_p;
-
-    /* Use the guest L2 PTE index to derive the shadow L3 and L2 indices.
- * index: 0 ~ 1023, PAGETABLE_ENTRIES: 512
- */
- sl3_idx = index / (PAGETABLE_ENTRIES / 2);
- sl2_idx = (index % (PAGETABLE_ENTRIES / 2)) * 2;
-
- sl2mfn = entry_get_pfn(shadow_l3[sl3_idx]);
- sl2_p = (pgentry_64_t *)map_domain_page(sl2mfn);
-
- validate_pde_change(
- d, *(guest_l2_pgentry_t *)new_gle_p, (l2_pgentry_t *)&sl2_p[sl2_idx]);
-
- /* Mapping the second l1 shadow page */
- if (entry_get_flags(sl2_p[sl2_idx]) & _PAGE_PRESENT) {
- sl1mfn = entry_get_pfn(sl2_p[sl2_idx]);
- sl2_p[sl2_idx + 1] =
- entry_from_pfn(sl1mfn + 1, entry_get_flags(sl2_p[sl2_idx]));
- }
- else
- sl2_p[sl2_idx + 1] = (pgentry_64_t){0};
- unmap_domain_page(sl2_p);
-
-}
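-
-/*
- * Editor's worked example (not part of the original source): a 32-bit guest
- * L2 has 1024 entries covering 4MB each, while the shadow tables have 512
- * entries covering 2MB each.  Each guest L2 entry therefore consumes two
- * consecutive shadow L2 slots, and 256 guest entries fill one shadow L2:
- *
- *   sl3_idx = index / 256          e.g. index = 700  ->  sl3_idx = 2
- *   sl2_idx = (index % 256) * 2    e.g. index = 700  ->  sl2_idx = 376
- *
- * The second slot of the pair (sl2_idx + 1) points at sl1mfn + 1, the second
- * half of the pair of shadow L1 pages covering the guest's 1024-entry L1.
- */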
-
-/*
- * This variant of shadow_mark_va_out_of_sync() is for 2M page shadows.
- */
-static void shadow_mark_va_out_of_sync_2mp(
- struct vcpu *v, unsigned long gpfn, unsigned long mfn, paddr_t writable_pl1e)
-{
- struct out_of_sync_entry *entry =
- shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
-
- entry->writable_pl1e = writable_pl1e;
- ESH_LOG("<shadow_mark_va_out_of_sync_2mp> gpfn = %lx\n", gpfn);
- if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) )
- BUG();
-}
-
-static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag)
-{
- unsigned long gmfn;
- if ( !(*spmfn = __shadow_status(d, gpfn, flag)) )
- {
- /* This is NOT already shadowed so we need to shadow it. */
- SH_VVLOG("<get_shadow_mfn>: not shadowed");
-
- gmfn = gmfn_to_mfn(d, gpfn);
- if ( unlikely(!VALID_MFN(gmfn)) )
- {
-            // Attempt to use an invalid pfn as a shadow page.
- // XXX this needs to be more graceful!
- BUG();
- }
-
- if ( unlikely(!(*spmfn =
- alloc_shadow_page(d, gpfn, gmfn, flag))) )
- {
-            printk("<get_shadow_mfn> Couldn't alloc a shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
- BUG(); /* XXX Need to deal gracefully with failure. */
- }
- switch(flag) {
- case PGT_l1_shadow:
- perfc_incrc(shadow_l1_table_count);
- break;
- case PGT_l2_shadow:
- perfc_incrc(shadow_l2_table_count);
- break;
- case PGT_l3_shadow:
- perfc_incrc(shadow_l3_table_count);
- break;
- case PGT_hl2_shadow:
- perfc_incrc(shadow_hl2_table_count);
- break;
- }
-
- return 1;
- } else {
- /* This L1 is shadowed already, but the L2 entry is missing. */
- SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn);
- return 0;
- }
-}
-
-static void shadow_map_into_current(struct vcpu *v,
- unsigned long va, unsigned int from, unsigned int to)
-{
- pgentry_64_t gle = {0}, sle;
- unsigned long gpfn, smfn;
-
- if (from == PAGING_L1 && to == PAGING_L2) {
- shadow_map_l1_into_current_l2(va);
- return;
- }
-
- __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to);
- ASSERT(entry_get_flags(gle) & _PAGE_PRESENT);
- gpfn = entry_get_pfn(gle);
-
- get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from));
-
- if ( !get_shadow_ref(smfn) )
- BUG();
- entry_general(v->domain, &gle, &sle, smfn, to);
- __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to);
- __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to);
-}
-
-/*
- * shadow_set_lxe should be put in shadow.h
- */
-static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e,
- int create_l2_shadow, int put_ref_check)
-{
- struct vcpu *v = current;
- l4_pgentry_t sl4e;
- l3_pgentry_t sl3e;
-
- __shadow_get_l4e(v, va, &sl4e);
- if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
- if (create_l2_shadow) {
- perfc_incrc(shadow_set_l3e_force_map);
- shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
- __shadow_get_l4e(v, va, &sl4e);
- } else {
-            printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
- }
- }
-
- __shadow_get_l3e(v, va, &sl3e);
- if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
- if (create_l2_shadow) {
- perfc_incrc(shadow_set_l2e_force_map);
- shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
- __shadow_get_l3e(v, va, &sl3e);
- } else {
-            printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
- }
-
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L4 )
- shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va));
- }
-
- if ( put_ref_check ) {
- l2_pgentry_t tmp_sl2e;
- if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) {
- if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT )
- if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) {
- put_shadow_ref(l2e_get_pfn(sl2e));
- }
- }
-
- }
-
- if (! __shadow_set_l2e(v, va, &sl2e))
- BUG();
- shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va));
-}
-
-
-/* As 32-bit guests don't support 4MB pages yet,
- * we don't need to compile this function twice.
- */
-static inline int l2e_rw_fault(
- struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw)
-{
- struct domain *d = v->domain;
- l2_pgentry_t gl2e = *gl2e_p;
- l2_pgentry_t tmp_l2e = gl2e;
- unsigned long start_gpfn = l2e_get_pfn(gl2e);
- unsigned long gpfn, mfn;
- unsigned long l1_mfn, gmfn;
- l1_pgentry_t *l1_p;
- l1_pgentry_t sl1e;
- l1_pgentry_t old_sl1e;
- l2_pgentry_t sl2e;
-#ifdef __x86_64__
- u64 nx = 0;
-#endif
- int put_ref_check = 0;
- /* Check if gpfn is 2M aligned */
-
- /* Update guest l2e */
- if (rw) {
- ASSERT(l2e_get_flags(gl2e) & _PAGE_RW);
- l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED);
- } else {
- l2e_add_flags(gl2e, _PAGE_ACCESSED);
- }
-
- l2e_remove_flags(tmp_l2e, _PAGE_PSE);
- if (l2e_get_flags(gl2e) & _PAGE_NX) {
- l2e_remove_flags(tmp_l2e, _PAGE_NX);
-#ifdef __x86_64__
- nx = PGT_high_mfn_nx;
-#endif
- }
-
-
- /* Get the shadow l2 first */
- if ( !__shadow_get_l2e(v, va, &sl2e) )
- sl2e = l2e_empty();
-
-#ifdef __x86_64__
- l1_mfn = __shadow_status(d, start_gpfn | nx, PGT_fl1_shadow);
-#else
- l1_mfn = __shadow_status(d, start_gpfn, PGT_fl1_shadow);
-#endif
-
- /* Check the corresponding l2e */
- if (l1_mfn) {
-        /* Why is it PRESENT? */
- if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) &&
- l2e_get_pfn(sl2e) == l1_mfn) {
-            ESH_LOG("sl2e PRESENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn);
- } else {
- put_ref_check = 1;
- if (!get_shadow_ref(l1_mfn))
- BUG();
- }
- l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
- sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
- } else {
-        /* Allocate a new page as the shadow page table if needed */
- gmfn = gmfn_to_mfn(d, start_gpfn);
-#ifdef __x86_64__
- l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow);
-#else
- l1_mfn = alloc_shadow_page(d, start_gpfn, gmfn, PGT_fl1_shadow);
-#endif
- if (unlikely(!l1_mfn)) {
- BUG();
- }
-
- if (!get_shadow_ref(l1_mfn))
- BUG();
-        l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
- sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
- memset(l1_p, 0, PAGE_SIZE);
- ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn);
- }
-
- ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e));
- /* Map the page to l2*/
- shadow_set_l2e_64(va, sl2e, 1, put_ref_check);
-
- if (l2e_get_flags(gl2e) & _PAGE_NX)
- l2e_add_flags(tmp_l2e, _PAGE_NX);
-
-    /* Propagate into the shadow page table, i.e. set the sl1e entries */
- for (gpfn = start_gpfn;
- gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) {
-
- mfn = gmfn_to_mfn(d, gpfn);
-
- if ( unlikely(!VALID_MFN(mfn)) )
- {
- continue;
- }
-
- sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e));
-
- if (!rw) {
- if ( shadow_mode_log_dirty(d) ||
- !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) )
- {
- l1e_remove_flags(sl1e, _PAGE_RW);
- }
- } else {
- /* __mark_dirty(d, gmfn); */
- }
- // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e));
-        /* The shadow entries need to be set up before shadow_mark_va_out_of_sync() */
- old_sl1e = l1_p[gpfn - start_gpfn];
-
- if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) )
- {
- if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(sl1e, d) ) {
- ESH_LOG("%lx, mfn: %lx why make me empty, start_pfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e),mfn, start_gpfn, gpfn);
- sl1e = l1e_empty();
- }
- if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
- put_page_from_l1e(old_sl1e, d);
- }
-
- if (rw) {
-            /* shadow_mark_va_out_of_sync() needs modification for 2M pages */
- if ( mfn_is_page_table(mfn) )
- shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn,
- l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn)));
- }
-
- l1_p[gpfn - start_gpfn] = sl1e;
- }
-
- unmap_domain_page(l1_p);
- *gl2e_p = gl2e;
- return 1;
-}
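-
-/*
- * Editor's illustration (not part of the original source): the core of
- * l2e_rw_fault() above is splintering one guest superpage mapping into 512
- * small-page shadow PTEs that inherit the superpage's flags.  A minimal
- * sketch of that expansion, with invented names, contiguous frames, and none
- * of the gpfn-to-mfn translation, refcounting or dirty-logging the real code
- * performs:
- */
-#if 0 /* standalone sketch, not built */
-static void ex_splinter_superpage(unsigned long gl2e, unsigned long *sl1_table)
-{
-    unsigned long flags = (gl2e & 0xfffUL) & ~0x080UL;   /* drop _PAGE_PSE  */
-    unsigned long first_pfn = gl2e >> 12;                /* 2MB-aligned     */
-    unsigned int i;
-
-    for ( i = 0; i < 512; i++ )                          /* one 4K PTE each */
-        sl1_table[i] = ((first_pfn + i) << 12) | flags;
-}
-#endif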
-
-/*
- * Check P, R/W, U/S bits in the guest page table.
- * If the fault belongs to the guest, return 1;
- * otherwise return 0.
- */
-#if defined( GUEST_PGENTRY_32 )
-static inline int guest_page_fault(
- struct vcpu *v,
- unsigned long va, unsigned int error_code,
- guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
-{
-    /* The following checks are for a 32-bit guest on a 64-bit host */
-
- __guest_get_l2e(v, va, gpl2e);
-
-    /* Check the guest L2 page-table entry first */
- if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_PRESENT)) )
- return 1;
-
- if ( error_code & ERROR_W )
- {
- if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_RW)) )
- return 1;
- }
-
- if ( error_code & ERROR_U )
- {
- if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_USER)) )
- return 1;
- }
-
- if ( guest_l2e_get_flags(*gpl2e) & _PAGE_PSE )
- {
-        printk("Non-PAE HVM guests cannot use PSE, "
-               "because we don't support 4MB PSE pages.\n");
-        printk("Remove pae=1 from your config file.\n");
- domain_crash_synchronous();
- return 0;
- }
-
- __guest_get_l1e(v, va, gpl1e);
-
- /* Then check the guest L1 page-table entry */
- if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_PRESENT)) )
- return 1;
-
- if ( error_code & ERROR_W )
- {
- if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_RW)) )
- return 1;
- }
-
- if ( error_code & ERROR_U )
- {
- if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_USER)) )
- return 1;
- }
-
- return 0;
-}
-#else
-static inline int guest_page_fault(
- struct vcpu *v,
- unsigned long va, unsigned int error_code,
- guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
-{
- struct domain *d = v->domain;
- pgentry_64_t gle = { 0 };
- unsigned long gpfn = 0, mfn;
- int i;
- unsigned int base_idx = 0;
- base_idx = get_cr3_idxval(v);
-
- ASSERT( d->arch.ops->guest_paging_levels >= PAGING_L3 );
-
-#if CONFIG_PAGING_LEVELS >= 3
- if ( (error_code & (ERROR_I | ERROR_P)) == (ERROR_I | ERROR_P) )
- return 1;
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
- if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
- {
- __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | PAGING_L4);
- if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
- return 1;
-
- if ( error_code & ERROR_W )
- {
- if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
- return 1;
- }
-
- if ( error_code & ERROR_U )
- {
- if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
- return 1;
- }
- gpfn = entry_get_pfn(gle);
- }
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- {
- if ( SH_GUEST_32PAE )
- gpfn = (hvm_get_guest_ctrl_reg(v, 3)) >> PAGE_SHIFT;
- else
- gpfn = pagetable_get_pfn(v->arch.guest_table);
- }
-#endif
-
- for ( i = PAGING_L3; i >= PAGING_L1; i-- )
- {
- pgentry_64_t *lva;
- /*
- * If it's not external mode, then mfn should be machine physical.
- */
- mfn = gmfn_to_mfn(d, gpfn);
-
- lva = (pgentry_64_t *) map_domain_page(mfn);
- gle = lva[guest_table_offset_64(va, i, base_idx)];
-
- unmap_domain_page(lva);
-
- gpfn = entry_get_pfn(gle);
-
- if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
- return 1;
-
- if ( i < PAGING_L3 ||
- d->arch.ops->guest_paging_levels == PAGING_L4 )
- {
- if ( error_code & ERROR_W )
- {
- if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
- {
- if ( i == PAGING_L1 )
- if ( gpl1e )
- gpl1e->l1 = gle.lo;
- return 1;
- }
- }
- if ( error_code & ERROR_U )
- {
- if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
- return 1;
- }
- }
-
- if ( i == PAGING_L2 )
- {
- if ( gpl2e )
- gpl2e->l2 = gle.lo;
- if ( likely(entry_get_flags(gle) & _PAGE_PSE) )
- return 0;
- }
-
- if ( i == PAGING_L1 )
- if ( gpl1e )
- gpl1e->l1 = gle.lo;
- }
-
- return 0;
-
-}
-#endif
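-
-/*
- * Editor's note (not part of the original source): the ERROR_P / ERROR_W /
- * ERROR_U / ERROR_I tests above decode the hardware #PF error code, which on
- * x86 uses bit 0 for "protection fault vs. not-present", bit 1 for write
- * access, bit 2 for user-mode access and bit 4 for instruction fetch.  A
- * standalone decoder for reference, with invented names:
- */
-#if 0 /* standalone sketch, not built */
-struct ex_pf_code { int present, write, user, ifetch; };
-
-static struct ex_pf_code ex_decode_pf(unsigned int error_code)
-{
-    struct ex_pf_code c;
-
-    c.present = !!(error_code & 0x01);  /* clear => page was not present */
-    c.write   = !!(error_code & 0x02);  /* set   => write access         */
-    c.user    = !!(error_code & 0x04);  /* set   => user-mode access     */
-    c.ifetch  = !!(error_code & 0x10);  /* set   => instruction fetch    */
-    return c;
-}
-#endif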
-
-static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- guest_l2_pgentry_t gl2e;
- guest_l1_pgentry_t gl1e, orig_gl1e;
- l1_pgentry_t sl1e;
-
- gl1e = guest_l1e_empty(); gl2e = guest_l2e_empty();
-
- sl1e = l1e_empty();
-
- perfc_incrc(shadow_fault_calls);
-
- ESH_LOG("<shadow_fault_64> va=%lx, rip = %lx, error code = %x\n",
- va, regs->eip, regs->error_code);
-
- /*
- * Don't let someone else take the guest's table pages out-of-sync.
- */
- shadow_lock(d);
-
- /*
- * STEP 1. Check to see if this fault might have been caused by an
- * out-of-sync table page entry, or if we should pass this
- * fault onto the guest.
- */
- __shadow_sync_va(v, va);
-
- /*
- * STEP 2. Check if the fault belongs to guest
- */
- if ( guest_page_fault(v, va, regs->error_code, &gl2e, &gl1e) )
- {
- if ( unlikely(shadow_mode_log_dirty(d)) && l1e_get_intpte(gl1e) != 0 )
- goto check_writeable;
-
- goto fail;
- }
-
- if ( unlikely((guest_l2e_get_flags(gl2e) & _PAGE_PSE)) )
- goto pse;
-
- /*
- * Handle 4K pages here
- */
-check_writeable:
- orig_gl1e = gl1e;
-
- /* Write fault? */
- if ( regs->error_code & 2 )
- {
- int allow_writes = 0;
-
- if ( unlikely(!(guest_l1e_get_flags(gl1e) & _PAGE_RW)) )
- {
- if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gl1e)) )
- {
- allow_writes = 1;
- l1e_add_flags(gl1e, _PAGE_RW);
- }
- else
- {
- /* Write fault on a read-only mapping. */
- SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
- l1e_get_intpte(gl1e));
- perfc_incrc(shadow_fault_bail_ro_mapping);
- goto fail;
- }
- }
-
- if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) )
- {
- SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
- perfc_incrc(write_fault_bail);
- shadow_unlock(d);
- return 0;
- }
-
- if (allow_writes)
- l1e_remove_flags(gl1e, _PAGE_RW);
- }
- else
- {
- if ( !l1pte_read_fault(d, &gl1e, &sl1e) )
- {
- SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
- perfc_incrc(read_fault_bail);
- shadow_unlock(d);
- return 0;
- }
- }
-
- /*
- * STEP 3. Write the modified shadow PTE and guest PTE back to the tables
- */
- if ( l1e_has_changed(orig_gl1e, gl1e, PAGE_FLAG_MASK) )
- {
- if (unlikely(!__guest_set_l1e(v, va, &gl1e)))
- domain_crash_synchronous();
-
- __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gl2e)));
- }
-
- shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1);
-
- perfc_incrc(shadow_fault_fixed);
- d->arch.shadow_fault_count++;
-
- shadow_unlock(d);
-
- return EXCRET_fault_fixed;
-
-pse:
- /*
- * Handle 2M pages here
- */
- if ( unlikely(!shadow_mode_external(d)) )
- BUG();
-
- /* Write fault? */
- if ( regs->error_code & 2 )
- {
- if ( !l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, WRITE_FAULT) )
- {
- goto fail;
- }
- }
- else
- {
- l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, READ_FAULT);
- }
-
- /*
- * STEP 3. Write guest/shadow l2e back
- */
-
- if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) )
- {
- domain_crash_synchronous();
- }
-
- /*
- * Todo: if necessary, record the page table page as dirty
- */
-
- perfc_incrc(shadow_fault_fixed);
- d->arch.shadow_fault_count++;
-
- shadow_unlock(d);
-
- return EXCRET_fault_fixed;
-fail:
- shadow_unlock(d);
- ESH_LOG("Guest fault~~~\n");
- return 0;
-}
-
-static void shadow_invlpg_64(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
- l1_pgentry_t sl1e, old_sl1e;
-
- shadow_lock(d);
-
- __shadow_sync_va(v, va);
-
- if ( shadow_mode_external(d) && __shadow_get_l1e(v, va, &old_sl1e) )
- if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
- put_page_from_l1e(old_sl1e, d);
-
- sl1e = l1e_empty();
- __shadow_set_l1e(v, va, &sl1e);
-
- shadow_unlock(d);
-}
-
-static unsigned long gva_to_gpa_64(unsigned long gva)
-{
- struct vcpu *v = current;
- guest_l1_pgentry_t gl1e = {0};
- guest_l2_pgentry_t gl2e = {0};
- unsigned long gpa;
-
- if (guest_page_fault(v, gva, 0, &gl2e, &gl1e))
- return 0;
-
- if (guest_l2e_get_flags(gl2e) & _PAGE_PSE)
- gpa = guest_l2e_get_paddr(gl2e) + (gva & ((1 << GUEST_L2_PAGETABLE_SHIFT) - 1));
- else
- gpa = guest_l1e_get_paddr(gl1e) + (gva & ~PAGE_MASK);
-
- return gpa;
-}
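-
-/*
- * Editor's worked example (not part of the original source), assuming the
- * usual 2MB superpage shift (GUEST_L2_PAGETABLE_SHIFT == 21):
- *
- *   PSE mapping:  gva = 0x00603456, gl2e frame base = 0x40000000
- *                 gpa = 0x40000000 + (0x00603456 & 0x1fffff) = 0x40003456
- *
- *   4K mapping:   gva = 0x00603456, gl1e frame base = 0x7c081000
- *                 gpa = 0x7c081000 + (0x00603456 & 0xfff)    = 0x7c081456
- */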
-
-/*
- * The naming convention of the shadow_ops:
- * MODE_<pgentry size>_<guest paging levels>_HANDLER
- */
-#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
-struct shadow_ops MODE_64_3_HANDLER = {
- .guest_paging_levels = 3,
- .invlpg = shadow_invlpg_64,
- .fault = shadow_fault_64,
- .update_pagetables = shadow_update_pagetables,
- .sync_all = sync_all,
- .remove_all_write_access = remove_all_write_access,
- .do_update_va_mapping = do_update_va_mapping,
- .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
- .is_out_of_sync = is_out_of_sync,
- .gva_to_gpa = gva_to_gpa_pae,
-};
-
-struct shadow_ops MODE_64_4_HANDLER = {
- .guest_paging_levels = 4,
- .invlpg = shadow_invlpg_64,
- .fault = shadow_fault_64,
- .update_pagetables = shadow_update_pagetables,
- .sync_all = sync_all,
- .remove_all_write_access = remove_all_write_access,
- .do_update_va_mapping = do_update_va_mapping,
- .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
- .is_out_of_sync = is_out_of_sync,
- .gva_to_gpa = gva_to_gpa_64,
-};
-#endif /* GUEST_PGENTRY_32 */
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
-
-
-#if CONFIG_PAGING_LEVELS == 2
-struct shadow_ops MODE_32_2_HANDLER = {
- .guest_paging_levels = 2,
- .invlpg = shadow_invlpg_32,
- .fault = shadow_fault_32,
- .update_pagetables = shadow_update_pagetables,
- .sync_all = sync_all,
- .remove_all_write_access = remove_all_write_access,
- .do_update_va_mapping = do_update_va_mapping,
- .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
- .is_out_of_sync = is_out_of_sync,
- .gva_to_gpa = gva_to_gpa_64,
-};
-#endif
-
-#if ( CONFIG_PAGING_LEVELS == 3 && !defined (GUEST_PGENTRY_32) && !defined (GUEST_32PAE) ) || \
- ( CONFIG_PAGING_LEVELS == 4 && defined (GUEST_PGENTRY_32) )
-
-
-/*
- * Use GUEST_PGENTRY_32 to force PAE_SHADOW_SELF_ENTRY for L4.
- *
- * Very simple shadow code to handle 1:1 direct mapping for guest
- * non-paging code, which is actually running in PAE/vm86 mode with
- * paging enabled.
- *
- * We expect that the top level (L3) page has been allocated and initialized.
- */
-int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l3_pgentry_t sl3e, *sl3e_p;
- l2_pgentry_t sl2e, *sl2e_p;
- l1_pgentry_t sl1e;
- unsigned long mfn, smfn;
- struct page_info *page;
-
- /*
-     * If the faulting address is within the MMIO range (there is no backing
-     * mfn), leave the #PF to be handled as an MMIO access.
- */
- if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
- return 0;
-
- shadow_lock(d);
-
- __direct_get_l3e(v, vpa, &sl3e);
-
- if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- goto nomem;
-
- smfn = page_to_mfn(page);
- sl3e = l3e_from_pfn(smfn, _PAGE_PRESENT);
-
- sl3e_p = (l3_pgentry_t *)map_domain_page(smfn);
- memset(sl3e_p, 0, PAGE_SIZE);
- unmap_domain_page(sl3e_p);
-
- __direct_set_l3e(v, vpa, &sl3e);
- }
-
- __direct_get_l2e(v, vpa, &sl2e);
-
- if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- goto nomem;
-
- smfn = page_to_mfn(page);
- sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
- sl2e_p = (l2_pgentry_t *)map_domain_page(smfn);
- memset(sl2e_p, 0, PAGE_SIZE);
- unmap_domain_page(sl2e_p);
-
- __direct_set_l2e(v, vpa, &sl2e);
- }
-
- __direct_get_l1e(v, vpa, &sl1e);
-
- if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
- {
- sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
- __direct_set_l1e(v, vpa, &sl1e);
- }
-
- shadow_unlock(d);
- return EXCRET_fault_fixed;
-
-nomem:
- shadow_direct_map_clean(d);
- domain_crash_synchronous();
-}
-#endif
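-
-/*
- * Editor's illustration (not part of the original source):
- * shadow_direct_map_fault() above builds the 1:1 mapping lazily: each level
- * that is still empty gets a zeroed table allocated and linked in, and only
- * then is the leaf entry written.  The same allocate-on-demand pattern on a
- * plain two-level software table, with invented names, calloc() instead of
- * domheap pages, and no locking:
- */
-#if 0 /* standalone sketch, not built */
-#include <stdlib.h>
-
-#define EX_ENTRIES 512
-
-static unsigned long *ex_root[EX_ENTRIES];               /* top-level table */
-
-static int ex_direct_map(unsigned long addr)
-{
-    unsigned int hi = (addr >> 21) & (EX_ENTRIES - 1);   /* top-level index */
-    unsigned int lo = (addr >> 12) & (EX_ENTRIES - 1);   /* leaf index      */
-
-    if ( ex_root[hi] == NULL )                 /* allocate the missing level */
-    {
-        ex_root[hi] = calloc(EX_ENTRIES, sizeof(unsigned long));
-        if ( ex_root[hi] == NULL )
-            return -1;
-    }
-
-    if ( ex_root[hi][lo] == 0 )                /* install the 1:1 leaf entry */
-        ex_root[hi][lo] = (addr & ~0xfffUL) | 0x3;    /* present + writable */
-
-    return 0;
-}
-#endif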
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/arch/x86/shadow2-common.c b/xen/arch/x86/shadow2-common.c
new file mode 100644
index 0000000000..eab6361c3d
--- /dev/null
+++ b/xen/arch/x86/shadow2-common.c
@@ -0,0 +1,3394 @@
+/******************************************************************************
+ * arch/x86/shadow2-common.c
+ *
+ * Shadow2 code that does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+
+#if SHADOW2_AUDIT
+int shadow2_audit_enable = 0;
+#endif
+
+static void sh2_free_log_dirty_bitmap(struct domain *d);
+
+int _shadow2_mode_refcounts(struct domain *d)
+{
+ return shadow2_mode_refcounts(d);
+}
+
+
+/**************************************************************************/
+/* x86 emulator support for the shadow2 code
+ */
+
+static int
+sh2_x86_emulate_read_std(unsigned long addr,
+ unsigned long *val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+ if ( hvm_guest(v) )
+ {
+ *val = 0;
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that is only a user vs supervisor access check.
+ //
+ if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
+ {
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id,
+ addr, *val, bytes);
+#endif
+ return X86EMUL_CONTINUE;
+ }
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating. */
+ SHADOW2_PRINTK("read failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_write_std(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that includes user vs supervisor, and
+ // write access.
+ //
+ if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
+ return X86EMUL_CONTINUE;
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating,
+ * which should be handled by sh2_x86_emulate_write_emulated. */
+ SHADOW2_PRINTK("write failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_write_emulated(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt);
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
+ unsigned long old,
+ unsigned long new,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new,
+ bytes, ctxt);
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
+ v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
+ new_hi, new_lo);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
+ new_lo, new_hi, ctxt);
+ }
+ else
+ {
+ SHADOW2_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+
+struct x86_emulate_ops shadow2_emulator_ops = {
+ .read_std = sh2_x86_emulate_read_std,
+ .write_std = sh2_x86_emulate_write_std,
+ .read_emulated = sh2_x86_emulate_read_std,
+ .write_emulated = sh2_x86_emulate_write_emulated,
+ .cmpxchg_emulated = sh2_x86_emulate_cmpxchg_emulated,
+ .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
+};
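+
+/* This ops table is what plugs the shadow2 code into Xen's generic x86
+ * instruction emulator: the per-mode page-fault handlers (in shadow2.c)
+ * are expected to hand it to the emulator entry point -- presumably
+ * x86_emulate_memop() -- when a write to a shadowed pagetable has to be
+ * emulated. The actual call site is not in this file. */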
+
+
+/**************************************************************************/
+/* Code for "promoting" a guest page to the point where the shadow code is
+ * willing to let it be treated as a guest page table. This generally
+ * involves making sure there are no writable mappings available to the guest
+ * for this page.
+ */
+void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ unsigned long type_info;
+
+ ASSERT(valid_mfn(gmfn));
+
+ /* We should never try to promote a gmfn that has writeable mappings */
+ ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
+
+ // Is the page already shadowed?
+ if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+ {
+ // No prior shadow exists...
+
+ // Grab a type-ref. We don't really care if we are racing with another
+ // vcpu or not, or even what kind of type we get; we just want the type
+ // count to be > 0.
+ //
+ do {
+ type_info =
+ page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
+ } while ( !get_page_type(page, type_info) );
+
+ // Now that the type ref is non-zero, we can safely use the
+ // shadow2_flags.
+ //
+ page->shadow2_flags = 0;
+ }
+
+ ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+ set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+}
+
+void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+
+ ASSERT(test_bit(_PGC_page_table, &page->count_info));
+ ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+
+ clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+
+ if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
+ {
+ // release the extra type ref
+ put_page_type(page);
+
+ // clear the is-a-page-table bit.
+ clear_bit(_PGC_page_table, &page->count_info);
+ }
+}
+
+/**************************************************************************/
+/* Validate a pagetable change from the guest and update the shadows.
+ * Returns a bitmask of SHADOW2_SET_* flags. */
+
+static int
+__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+{
+ int result = 0;
+ struct page_info *page = mfn_to_page(gmfn);
+
+ sh2_mark_dirty(v->domain, gmfn);
+
+ // Determine which types of shadows are affected, and update each.
+ //
+ // Always validate L1s before L2s to prevent another cpu with a linear
+ // mapping of this gmfn from seeing a walk that results from
+ // using the new L2 value and the old L1 value. (It is OK for such a
+ // guest to see a walk that uses the old L2 value with the new L1 value,
+ // as hardware could behave this way if one level of the pagewalk occurs
+ // before the store, and the next level of the pagewalk occurs after the
+ // store.)
+ //
+ // Ditto for L2s before L3s, etc.
+ //
+
+ if ( !(page->count_info & PGC_page_table) )
+ return 0; /* Not shadowed at all */
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow2_flags & SH2F_L1_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow2_flags & SH2F_L1_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow2_flags & SH2F_L2_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow2_flags & SH2F_L2_32 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( page->shadow2_flags & SH2F_L1_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L2_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L2H_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L3_PAE )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
+ (v, gmfn, entry, size);
+#else /* 32-bit non-PAE hypervisor does not support PAE guests */
+ ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( page->shadow2_flags & SH2F_L1_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L2_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L3_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow2_flags & SH2F_L4_64 )
+ result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
+ (v, gmfn, entry, size);
+#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
+ ASSERT((page->shadow2_flags
+ & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
+#endif
+
+ return result;
+}
+
+
+int
+shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
+/* This is the entry point from hypercalls. It returns a bitmask of all the
+ * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
+{
+ int rc;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
+ shadow2_audit_tables(v);
+ return rc;
+}
+
+void
+shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+/* This is the entry point for emulated writes to pagetables in HVM guests */
+{
+ struct domain *d = v->domain;
+ int rc;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
+ if ( rc & SHADOW2_SET_FLUSH )
+ {
+ // Flush everyone except the local processor, which will flush when it
+ // re-enters the HVM guest.
+ //
+ cpumask_t mask = d->domain_dirty_cpumask;
+ cpu_clear(v->processor, mask);
+ flush_tlb_mask(mask);
+ }
+ if ( rc & SHADOW2_SET_ERROR )
+ {
+ /* This page is probably not a pagetable any more: tear it out of the
+ * shadows, along with any tables that reference it */
+ shadow2_remove_all_shadows_and_parents(v, gmfn);
+ }
+ /* We ignore the other bits: since we are about to change CR3 on
+ * VMENTER we don't need to do any extra TLB flushes. */
+}
+
+
+/**************************************************************************/
+/* Memory management for shadow pages. */
+
+/* Meaning of the count_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * A count of all references to this page from other shadow pages and
+ * guest CR3s (a.k.a. v->arch.shadow_table).
+ *
+ * The top bits hold the shadow type and the pinned bit. Top-level
+ * shadows are pinned so that they don't disappear when not in a CR3
+ * somewhere.
+ *
+ * We don't need to use get|put_page for this as the updates are all
+ * protected by the shadow lock. We can't use get|put_page for this
+ * as the size of the count on shadow pages is different from that on
+ * normal guest pages.
+ */
+
+/* Meaning of the type_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * type_info use depends on the shadow type (from count_info)
+ *
+ * PGC_SH2_none : This page is in the shadow2 free pool. type_info holds
+ * the chunk order for our freelist allocator.
+ *
+ * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info
+ * holds the mfn of the guest page being shadowed.
+ *
+ * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
+ * type_info holds the gfn being shattered.
+ *
+ * PGC_SH2_monitor_table : This page is part of a monitor table.
+ * type_info is not used.
+ */
+
+/* Meaning of the _domain field in shadow pages
+ * --------------------------------------------
+ *
+ * In shadow pages, this field will always have its least significant bit
+ * set. This ensures that all attempts to get_page() will fail (as all
+ * valid pickled domain pointers have a zero for their least significant bit).
+ * Instead, the remaining upper bits are used to record the shadow generation
+ * counter when the shadow was created.
+ */
+
+/* Meaning of the shadow2_flags field
+ * ----------------------------------
+ *
+ * In guest pages that are shadowed, one bit for each kind of shadow they have.
+ *
+ * In shadow pages, will be used for holding a representation of the populated
+ * entries in this shadow (either a min/max, or a bitmap, or ...)
+ *
+ * In monitor-table pages, holds the level of the particular page (to save
+ * spilling the shadow types into an extra bit by having three types of monitor
+ * page).
+ */
+
+/* Meaning of the list_head struct in shadow pages
+ * -----------------------------------------------
+ *
+ * In free shadow pages, this is used to hold the free-lists of chunks.
+ *
+ * In top-level shadow tables, this holds a linked-list of all top-level
+ * shadows (used for recovering memory and destroying shadows).
+ *
+ * In lower-level shadows, this holds the physical address of a higher-level
+ * shadow entry that holds a reference to this shadow (or zero).
+ */
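+
+/* Illustrative only (not compiled): given the layout described above,
+ * the shadow type and the backpointer of an in-use shadow page would be
+ * read as below. The names are the ones used elsewhere in this file;
+ * this is a reader's aid, not an interface. */
+#if 0
+static inline u32 sh2_example_shadow_type(struct page_info *pg)
+{
+ /* The shadow type lives in the top bits of count_info */
+ return (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
+}
+
+static inline unsigned long sh2_example_backpointer(struct page_info *pg)
+{
+ /* For l* shadows this is the guest mfn; for fl1 shadows, the gfn */
+ return pg->u.inuse.type_info;
+}
+#endif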
+
+/* Allocating shadow pages
+ * -----------------------
+ *
+ * Most shadow pages are allocated singly, but there are two cases where we
+ * need to allocate multiple pages together.
+ *
+ * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
+ * A 32-bit guest l1 table covers 4MB of virtual address space,
+ * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
+ * of virtual address space each). Similarly, a 32-bit guest l2 table
+ * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
+ * each). These multi-page shadows are contiguous and aligned;
+ * functions for handling offsets into them are defined in shadow2.c
+ * (shadow_l1_index() etc.)
+ *
+ * 2: Shadowing PAE top-level pages. Each guest page that contains
+ * any PAE top-level pages requires two shadow pages to shadow it.
+ * They contain alternating l3 tables and pae_l3_bookkeeping structs.
+ *
+ * This table shows the allocation behaviour of the different modes:
+ *
+ *     Xen paging      32b   pae   pae   64b   64b   64b
+ *     Guest paging    32b   32b   pae   32b   pae   64b
+ *     PV or HVM        *    HVM    *    HVM   HVM    *
+ *     Shadow paging   32b   pae   pae   pae   pae   64b
+ *
+ *     sl1 size         4k    8k    4k    8k    4k    4k
+ *     sl2 size         4k   16k    4k   16k    4k    4k
+ *     sl3 size         -     -     8k    -     8k    4k
+ *     sl4 size         -     -     -     -     -     4k
+ *
+ * We allocate memory from xen in four-page units and break them down
+ * with a simple buddy allocator. Can't use the xen allocator to handle
+ * this as it only works for contiguous zones, and a domain's shadow
+ * pool is made of fragments.
+ *
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide
+ * a function for the p2m management to steal pages, in max-order chunks, from
+ * the free pool. We don't provide for giving them back, yet.
+ */
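+
+/* Worked example for case 1 above: a 32-bit guest l1 has 1024 4-byte
+ * entries, each mapping 4KB, so it covers 4MB of VA; a PAE/64-bit l1 has
+ * only 512 8-byte entries (2MB of VA), so two contiguous shadow pages are
+ * needed -- hence the 8k sl1 entries in the table above. The same
+ * arithmetic gives the 16k (order-2) sl2 shadows of 32-bit guest l2s. */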
+
+/* Figure out the least acceptable quantity of shadow memory.
+ * The minimum memory requirement for always being able to free up a
+ * chunk of memory is very small -- only three max-order chunks per
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.
+ *
+ * But for a guest to be guaranteed to successfully execute a single
+ * instruction, we must be able to map a large number (about thirty) of VAs
+ * at the same time, which means that to guarantee progress, we must
+ * allow for more than ninety allocated pages per vcpu. We round that
+ * up to 128 pages, or half a megabyte per vcpu. */
+unsigned int shadow2_min_acceptable_pages(struct domain *d)
+{
+ u32 vcpu_count = 0;
+ struct vcpu *v;
+
+ for_each_vcpu(d, v)
+ vcpu_count++;
+
+ return (vcpu_count * 128);
+}
+
+/* Using the type_info field to store freelist order */
+#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
+#define SH2_SET_PFN_ORDER(_p, _o) \
+ do { (_p)->u.inuse.type_info = (_o); } while (0)
+
+
+/* Figure out the order of allocation needed for a given shadow type */
+static inline u32
+shadow_order(u32 shadow_type)
+{
+#if CONFIG_PAGING_LEVELS > 2
+ static const u32 type_to_order[16] = {
+ 0, /* PGC_SH2_none */
+ 1, /* PGC_SH2_l1_32_shadow */
+ 1, /* PGC_SH2_fl1_32_shadow */
+ 2, /* PGC_SH2_l2_32_shadow */
+ 0, /* PGC_SH2_l1_pae_shadow */
+ 0, /* PGC_SH2_fl1_pae_shadow */
+ 0, /* PGC_SH2_l2_pae_shadow */
+ 0, /* PGC_SH2_l2h_pae_shadow */
+ 1, /* PGC_SH2_l3_pae_shadow */
+ 0, /* PGC_SH2_l1_64_shadow */
+ 0, /* PGC_SH2_fl1_64_shadow */
+ 0, /* PGC_SH2_l2_64_shadow */
+ 0, /* PGC_SH2_l3_64_shadow */
+ 0, /* PGC_SH2_l4_64_shadow */
+ 2, /* PGC_SH2_p2m_table */
+ 0 /* PGC_SH2_monitor_table */
+ };
+ u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
+ return type_to_order[type];
+#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
+ return 0;
+#endif
+}
+
+
+/* Do we have a free chunk of at least this order? */
+static inline int chunk_is_available(struct domain *d, int order)
+{
+ int i;
+
+ for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+ return 1;
+ return 0;
+}
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
+ {
+ case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
+#else
+ SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
+#endif
+ break;
+#if CONFIG_PAGING_LEVELS >= 3
+ case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
+ break;
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
+ break;
+#endif
+ default:
+ SHADOW2_PRINTK("top-level shadow has bad type %08lx\n",
+ (unsigned long)((pg->count_info & PGC_SH2_type_mask)
+ >> PGC_SH2_type_shift));
+ BUG();
+ }
+}
+
+
+/* Make sure there is at least one chunk of the required order available
+ * in the shadow page pool. This must be called before any calls to
+ * shadow2_alloc(). Since this will free existing shadows to make room,
+ * it must be called early enough to avoid freeing shadows that the
+ * caller is currently working on. */
+void shadow2_prealloc(struct domain *d, unsigned int order)
+{
+ /* Need a vcpu for calling unpins; for now, since we don't have
+ * per-vcpu shadows, any will do */
+ struct vcpu *v = d->vcpu[0];
+ struct list_head *l, *t;
+ struct page_info *pg;
+ mfn_t smfn;
+
+ if ( chunk_is_available(d, order) ) return;
+
+ /* Stage one: walk the list of top-level pages, unpinning them */
+ perfc_incrc(shadow2_prealloc_1);
+ list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
+ {
+ /* For PAE, we need to unpin each subshadow on this shadow */
+ SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
+ }
+ else
+#endif /* 32-bit code always takes this branch */
+ {
+ /* Unpin this top-level shadow */
+ sh2_unpin(v, smfn);
+ }
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Stage two: all shadow pages are in use in hierarchies that are
+ * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
+ * mappings. */
+ perfc_incrc(shadow2_prealloc_2);
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+ /* Walk the list from the tail: recently used toplevels have been pulled
+ * to the head */
+ list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+ shadow2_unhook_mappings(v, smfn);
+
+ /* Need to flush TLB if we've altered our own tables */
+ if ( !shadow2_mode_external(d)
+ && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
+ local_flush_tlb();
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Nothing more we can do: all remaining shadows are of pages that
+ * hold Xen mappings for some vcpu. This should never happen. */
+ SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
+ " shadow pages total = %u, free = %u, p2m=%u\n",
+ 1 << order,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ BUG();
+}
+
+
+/* Allocate another shadow's worth of (contiguous, aligned) pages,
+ * and fill in the type and backpointer fields of their page_infos.
+ * Never fails to allocate. */
+mfn_t shadow2_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer)
+{
+ struct page_info *pg = NULL;
+ unsigned int order = shadow_order(shadow_type);
+ cpumask_t mask;
+ void *p;
+ int i;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(order <= SHADOW2_MAX_ORDER);
+ ASSERT(shadow_type != PGC_SH2_none);
+ perfc_incrc(shadow2_alloc);
+
+ /* Find smallest order which can satisfy the request. */
+ for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+ {
+ pg = list_entry(d->arch.shadow2_freelists[i].next,
+ struct page_info, list);
+ list_del(&pg->list);
+
+ /* We may have to halve the chunk a number of times. */
+ while ( i != order )
+ {
+ i--;
+ SH2_SET_PFN_ORDER(pg, i);
+ list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]);
+ pg += 1 << i;
+ }
+ d->arch.shadow2_free_pages -= 1 << order;
+
+ /* Init page info fields and clear the pages */
+ for ( i = 0; i < 1<<order ; i++ )
+ {
+ pg[i].u.inuse.type_info = backpointer;
+ pg[i].count_info = shadow_type;
+ pg[i].shadow2_flags = 0;
+ INIT_LIST_HEAD(&pg[i].list);
+ /* Before we overwrite the old contents of this page,
+ * we need to be sure that no TLB holds a pointer to it. */
+ mask = d->domain_dirty_cpumask;
+ tlbflush_filter(mask, pg[i].tlbflush_timestamp);
+ if ( unlikely(!cpus_empty(mask)) )
+ {
+ perfc_incrc(shadow2_alloc_tlbflush);
+ flush_tlb_mask(mask);
+ }
+ /* Now safe to clear the page for reuse */
+ p = sh2_map_domain_page(page_to_mfn(pg+i));
+ ASSERT(p != NULL);
+ clear_page(p);
+ sh2_unmap_domain_page(p);
+ perfc_incr(shadow2_alloc_count);
+ }
+ return page_to_mfn(pg);
+ }
+
+ /* If we get here, we failed to allocate. This should never happen.
+ * It means that we didn't call shadow2_prealloc() correctly before
+ * we allocated. We can't recover by calling prealloc here, because
+ * we might free up higher-level pages that the caller is working on. */
+ SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
+ BUG();
+}
+
+
+/* Return some shadow pages to the pool. */
+void shadow2_free(struct domain *d, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ u32 shadow_type;
+ unsigned long order;
+ unsigned long mask;
+ int i;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ perfc_incrc(shadow2_free);
+
+ shadow_type = pg->count_info & PGC_SH2_type_mask;
+ ASSERT(shadow_type != PGC_SH2_none);
+ ASSERT(shadow_type != PGC_SH2_p2m_table);
+ order = shadow_order(shadow_type);
+
+ d->arch.shadow2_free_pages += 1 << order;
+
+ for ( i = 0; i < 1<<order; i++ )
+ {
+ /* Strip out the type: this is now a free shadow page */
+ pg[i].count_info = 0;
+ /* Remember the TLB timestamp so we will know whether to flush
+ * TLBs when we reuse the page. Because the destructors leave the
+ * contents of the pages in place, we can delay TLB flushes until
+ * just before the allocator hands the page out again. */
+ pg[i].tlbflush_timestamp = tlbflush_current_time();
+ perfc_decr(shadow2_alloc_count);
+ }
+
+ /* Merge chunks as far as possible. */
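+ /* (Buddy arithmetic: at order o, the buddy of a block starting at mfn m
+ * is at m ^ (1<<o); testing bit o of m below tells us whether that
+ * buddy is the preceding or the following block.) */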
+ while ( order < SHADOW2_MAX_ORDER )
+ {
+ mask = 1 << order;
+ if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
+ /* Merge with predecessor block? */
+ if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none)
+ || (SH2_PFN_ORDER(pg-mask) != order) )
+ break;
+ list_del(&(pg-mask)->list);
+ pg -= mask;
+ } else {
+ /* Merge with successor block? */
+ if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none)
+ || (SH2_PFN_ORDER(pg+mask) != order) )
+ break;
+ list_del(&(pg+mask)->list);
+ }
+ order++;
+ }
+
+ SH2_SET_PFN_ORDER(pg, order);
+ list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]);
+}
+
+/* Divert some memory from the pool to be used by the p2m mapping.
+ * This action is irreversible: the p2m mapping only ever grows.
+ * That's OK because the p2m table only exists for external domains,
+ * and those domains can't ever turn off shadow mode.
+ * Also, we only ever allocate a max-order chunk, so as to preserve
+ * the invariant that shadow2_prealloc() always works.
+ * Returns 0 iff it can't get a chunk (the caller should then
+ * free up some pages in domheap and call set_sh2_allocation);
+ * returns non-zero on success.
+ */
+static int
+shadow2_alloc_p2m_pages(struct domain *d)
+{
+ struct page_info *pg;
+ u32 i;
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ if ( d->arch.shadow2_total_pages
+ < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
+ return 0; /* Not enough shadow memory: need to increase it first */
+
+ pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
+ d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER);
+ d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER);
+ for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
+ {
+ /* Unlike shadow pages, mark p2m pages as owned by the domain */
+ page_set_owner(&pg[i], d);
+ list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist);
+ }
+ return 1;
+}
+
+// Returns 0 if no memory is available...
+mfn_t
+shadow2_alloc_p2m_page(struct domain *d)
+{
+ struct list_head *entry;
+ mfn_t mfn;
+ void *p;
+
+ if ( list_empty(&d->arch.shadow2_p2m_freelist) &&
+ !shadow2_alloc_p2m_pages(d) )
+ return _mfn(0);
+ entry = d->arch.shadow2_p2m_freelist.next;
+ list_del(entry);
+ list_add_tail(entry, &d->arch.shadow2_p2m_inuse);
+ mfn = page_to_mfn(list_entry(entry, struct page_info, list));
+ sh2_get_ref(mfn, 0);
+ p = sh2_map_domain_page(mfn);
+ clear_page(p);
+ sh2_unmap_domain_page(p);
+
+ return mfn;
+}
+
+#if CONFIG_PAGING_LEVELS == 3
+static void p2m_install_entry_in_monitors(struct domain *d,
+ l3_pgentry_t *l3e)
+/* Special case, only used for external-mode domains on PAE hosts:
+ * update the mapping of the p2m table. Once again, this is trivial in
+ * other paging modes (one top-level entry points to the top-level p2m,
+ * no maintenance needed), but PAE makes life difficult by needing a
+ * copy of the eight l3es of the p2m table in eight l2h slots in the
+ * monitor table. This function makes fresh copies when a p2m l3e
+ * changes. */
+{
+ l2_pgentry_t *ml2e;
+ struct vcpu *v;
+ unsigned int index;
+
+ index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
+ ASSERT(index < MACHPHYS_MBYTES>>1);
+
+ for_each_vcpu(d, v)
+ {
+ if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ continue;
+ ASSERT(shadow2_mode_external(v->domain));
+
+ SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
+ d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
+
+ if ( v == current ) /* OK to use linear map of monitor_table */
+ ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
+ else
+ {
+ l3_pgentry_t *ml3e;
+ ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
+ ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
+ ml2e += l2_table_offset(RO_MPT_VIRT_START);
+ sh2_unmap_domain_page(ml3e);
+ }
+ ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
+ if ( v != current )
+ sh2_unmap_domain_page(ml2e);
+ }
+}
+#endif
+
+ // Find the next level's P2M entry, checking for out-of-range gfns...
+// Returns NULL on error.
+//
+static l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, u32 shift, u32 max)
+{
+ u32 index;
+
+ index = *gfn_remainder >> shift;
+ if ( index >= max )
+ {
+ SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
+ "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+ gfn, *gfn_remainder, shift, index, max);
+ return NULL;
+ }
+ *gfn_remainder &= (1 << shift) - 1;
+ return (l1_pgentry_t *)table + index;
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+static int
+p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
+ unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
+ u32 max, unsigned long type)
+{
+ l1_pgentry_t *p2m_entry;
+ void *next;
+
+ if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+ shift, max)) )
+ return 0;
+
+ if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+ {
+ mfn_t mfn = shadow2_alloc_p2m_page(d);
+ if ( mfn_x(mfn) == 0 )
+ return 0;
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
+ mfn_to_page(mfn)->count_info = 1;
+#if CONFIG_PAGING_LEVELS == 3
+ if (type == PGT_l2_page_table)
+ {
+ /* We have written to the p2m l3: need to sync the per-vcpu
+ * copies of it in the monitor tables */
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
+ }
+#endif
+ /* The P2M can be shadowed: keep the shadows synced */
+ if ( d->vcpu[0] )
+ (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+ }
+ *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+ next = sh2_map_domain_page(*table_mfn);
+ sh2_unmap_domain_page(*table);
+ *table = next;
+
+ return 1;
+}
+
+// Returns 0 on error (out of memory)
+int
+shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+ // XXX -- this might be able to be faster iff current->domain == d
+ mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
+ void *table = sh2_map_domain_page(table_mfn);
+ unsigned long gfn_remainder = gfn;
+ l1_pgentry_t *p2m_entry;
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+ return 0;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ // When using PAE Xen, we only allow 33 bits of pseudo-physical
+ // address in translated guests (i.e. 8 GBytes). This restriction
+ // comes from wanting to map the P2M table into the 16MB RO_MPT hole
+ // in Xen's address space for translated PV guests.
+ //
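+ // (Sanity check on the arithmetic: 8GB / 4KB-per-page = 2M p2m entries,
+ //  and 2M entries * 8 bytes per PAE l1e fill exactly the 16MB hole.)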
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ (CONFIG_PAGING_LEVELS == 3
+ ? 8
+ : L3_PAGETABLE_ENTRIES),
+ PGT_l2_page_table) )
+ return 0;
+#endif
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+ return 0;
+
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ 0, L1_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+ if ( valid_mfn(mfn) )
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ else
+ *p2m_entry = l1e_empty();
+
+ /* The P2M can be shadowed: keep the shadows synced */
+ (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+
+ sh2_unmap_domain_page(table);
+
+ return 1;
+}
+
+// Allocate a new p2m table for a domain.
+//
+// The structure of the p2m table is that of a pagetable for xen (i.e. it is
+// controlled by CONFIG_PAGING_LEVELS).
+//
+// Returns 0 if p2m table could not be initialized
+//
+static int
+shadow2_alloc_p2m_table(struct domain *d)
+{
+ mfn_t p2m_top;
+ struct list_head *entry;
+ unsigned int page_count = 0;
+
+ SHADOW2_PRINTK("allocating p2m table\n");
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
+
+ p2m_top = shadow2_alloc_p2m_page(d);
+ if ( mfn_x(p2m_top) == 0 )
+ return 0;
+
+ /* Only touch the page_info once we know the allocation succeeded */
+ mfn_to_page(p2m_top)->count_info = 1;
+ mfn_to_page(p2m_top)->u.inuse.type_info =
+#if CONFIG_PAGING_LEVELS == 4
+ PGT_l4_page_table
+#elif CONFIG_PAGING_LEVELS == 3
+ PGT_l3_page_table
+#elif CONFIG_PAGING_LEVELS == 2
+ PGT_l2_page_table
+#endif
+ | 1 | PGT_validated;
+
+ d->arch.phys_table = pagetable_from_mfn(p2m_top);
+
+ SHADOW2_PRINTK("populating p2m table\n");
+
+ for ( entry = d->page_list.next;
+ entry != &d->page_list;
+ entry = entry->next )
+ {
+ struct page_info *page = list_entry(entry, struct page_info, list);
+ mfn_t mfn = page_to_mfn(page);
+ unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
+ page_count++;
+ if (
+#ifdef __x86_64__
+ (gfn != 0x5555555555555555L)
+#else
+ (gfn != 0x55555555L)
+#endif
+ && gfn != INVALID_M2P_ENTRY
+ && !shadow2_set_p2m_entry(d, gfn, mfn) )
+ {
+ SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n",
+ gfn, mfn_x(mfn));
+ return 0;
+ }
+ }
+
+ SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
+ return 1;
+}
+
+mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+/* Read another domain's p2m entries */
+{
+ mfn_t mfn;
+ unsigned long addr = gpfn << PAGE_SHIFT;
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+
+ ASSERT(shadow2_mode_translate(d));
+ mfn = pagetable_get_mfn(d->arch.phys_table);
+
+
+#if CONFIG_PAGING_LEVELS > 2
+ if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
+ /* This pfn is higher than the p2m map can hold */
+ return _mfn(INVALID_MFN);
+#endif
+
+
+#if CONFIG_PAGING_LEVELS >= 4
+ {
+ l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
+ l4e += l4_table_offset(addr);
+ if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l4e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l4e_get_pfn(*l4e));
+ sh2_unmap_domain_page(l4e);
+ }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ {
+ l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
+ l3e += l3_table_offset(addr);
+ if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l3e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l3e_get_pfn(*l3e));
+ sh2_unmap_domain_page(l3e);
+ }
+#endif
+
+ l2e = sh2_map_domain_page(mfn);
+ l2e += l2_table_offset(addr);
+ if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l2e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l2e_get_pfn(*l2e));
+ sh2_unmap_domain_page(l2e);
+
+ l1e = sh2_map_domain_page(mfn);
+ l1e += l1_table_offset(addr);
+ if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+ {
+ sh2_unmap_domain_page(l1e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l1e_get_pfn(*l1e));
+ sh2_unmap_domain_page(l1e);
+
+ return mfn;
+}
+
+unsigned long
+shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
+{
+ return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
+}
+
+
+static void shadow2_p2m_teardown(struct domain *d)
+/* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+{
+ struct list_head *entry, *n;
+ struct page_info *pg;
+
+ d->arch.phys_table = pagetable_null();
+
+ list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse)
+ {
+ pg = list_entry(entry, struct page_info, list);
+ list_del(entry);
+ /* Should have just the one ref we gave it in alloc_p2m_page() */
+ if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
+ {
+ SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+ pg->count_info, pg->u.inuse.type_info);
+ }
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation, since
+ * these pages were allocated without an owner. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow2_p2m_pages--;
+ perfc_decr(shadow2_alloc_count);
+ }
+ list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist)
+ {
+ list_del(entry);
+ pg = list_entry(entry, struct page_info, list);
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow2_p2m_pages--;
+ perfc_decr(shadow2_alloc_count);
+ }
+ ASSERT(d->arch.shadow2_p2m_pages == 0);
+}
+
+/* Set the pool of shadow pages to the required number of pages.
+ * Input will be rounded up to at least shadow2_min_acceptable_pages(),
+ * plus space for the p2m table.
+ * Returns 0 for success, non-zero for failure. */
+static unsigned int set_sh2_allocation(struct domain *d,
+ unsigned int pages,
+ int *preempted)
+{
+ struct page_info *pg;
+ unsigned int lower_bound;
+ int j;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ /* Don't allocate less than the minimum acceptable, plus one page per
+ * megabyte of RAM (for the p2m table) */
+ lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
+ if ( pages > 0 && pages < lower_bound )
+ pages = lower_bound;
+ /* Round up to largest block size */
+ pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
+
+ SHADOW2_PRINTK("current %i target %i\n",
+ d->arch.shadow2_total_pages, pages);
+
+ while ( d->arch.shadow2_total_pages != pages )
+ {
+ if ( d->arch.shadow2_total_pages < pages )
+ {
+ /* Need to allocate more memory from domheap */
+ pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0);
+ if ( pg == NULL )
+ {
+ SHADOW2_PRINTK("failed to allocate shadow pages.\n");
+ return -ENOMEM;
+ }
+ d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER;
+ d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER;
+ for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ )
+ {
+ pg[j].u.inuse.type_info = 0; /* Free page */
+ pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
+ }
+ SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
+ list_add_tail(&pg->list,
+ &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]);
+ }
+ else if ( d->arch.shadow2_total_pages > pages )
+ {
+ /* Need to return memory to domheap */
+ shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+ ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]));
+ pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next,
+ struct page_info, list);
+ list_del(&pg->list);
+ d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER;
+ d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER;
+ free_domheap_pages(pg, SHADOW2_MAX_ORDER);
+ }
+
+ /* Check to see if we need to yield and try again */
+ if ( preempted && hypercall_preempt_check() )
+ {
+ *preempted = 1;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+unsigned int shadow2_set_allocation(struct domain *d,
+ unsigned int megabytes,
+ int *preempted)
+/* Hypercall interface to set the shadow memory allocation */
+{
+ unsigned int rv;
+ shadow2_lock(d);
+ rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
+ SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ shadow2_get_allocation(d));
+ shadow2_unlock(d);
+ return rv;
+}
+
+/**************************************************************************/
+/* Hash table for storing the guest->shadow mappings */
+
+/* Hash function that takes a gfn or mfn, plus another byte of type info */
+typedef u32 key_t;
+static inline key_t sh2_hash(unsigned long n, u8 t)
+{
+ unsigned char *p = (unsigned char *)&n;
+ key_t k = t;
+ int i;
+ for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
+ return k;
+}
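+
+/* (For reference: the loop above computes k = k * 65599 + p[i] for each
+ * byte of n, seeded with t -- i.e. the classic sdbm string hash, since
+ * (k<<6) + (k<<16) - k == k * 65599.) */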
+
+#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
+
+/* Before we get to the mechanism, define a pair of audit functions
+ * that sanity-check the contents of the hash table. */
+static void sh2_hash_audit_bucket(struct domain *d, int bucket)
+/* Audit one bucket of the hash table */
+{
+ struct shadow2_hash_entry *e, *x;
+ struct page_info *pg;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+ e = &d->arch.shadow2_hash_table[bucket];
+ if ( e->t == 0 ) return; /* Bucket is empty */
+ while ( e )
+ {
+ /* Empty link? */
+ BUG_ON( e->t == 0 );
+ /* Bogus type? */
+ BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
+ /* Wrong bucket? */
+ BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket );
+ /* Duplicate entry? */
+ for ( x = e->next; x; x = x->next )
+ BUG_ON( x->n == e->n && x->t == e->t );
+ /* Bogus MFN? */
+ BUG_ON( !valid_mfn(e->smfn) );
+ pg = mfn_to_page(e->smfn);
+ /* Not a shadow? */
+ BUG_ON( page_get_owner(pg) != 0 );
+ /* Wrong kind of shadow? */
+ BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift
+ != e->t );
+ /* Bad backlink? */
+ BUG_ON( pg->u.inuse.type_info != e->n );
+ if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+ && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+ && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
+ {
+ /* Bad shadow flags on guest page? */
+ BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
+ }
+ /* That entry was OK; on we go */
+ e = e->next;
+ }
+}
+
+#else
+#define sh2_hash_audit_bucket(_d, _b)
+#endif /* Hashtable bucket audit */
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
+
+static void sh2_hash_audit(struct domain *d)
+/* Full audit: audit every bucket in the table */
+{
+ int i;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+ for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
+ {
+ sh2_hash_audit_bucket(d, i);
+ }
+}
+
+#else
+#define sh2_hash_audit(_d)
+#endif /* Hashtable bucket audit */
+
+/* Memory management interface for bucket allocation.
+ * These ought to come out of shadow memory, but at least on 32-bit
+ * machines we are forced to allocate them from xenheap so that we can
+ * address them. */
+static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
+{
+ struct shadow2_hash_entry *extra, *x;
+ int i;
+
+ /* We need to allocate a new node. Ensure the free list is not empty.
+ * Allocate new entries in units the same size as the original table. */
+ if ( unlikely(d->arch.shadow2_hash_freelist == NULL) )
+ {
+ size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
+ extra = xmalloc_bytes(sz);
+
+ if ( extra == NULL )
+ {
+ /* No memory left! */
+ SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
+ domain_crash_synchronous();
+ }
+ memset(extra, 0, sz);
+
+ /* Record the allocation block so it can be correctly freed later. */
+ *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) =
+ d->arch.shadow2_hash_allocations;
+ d->arch.shadow2_hash_allocations = &extra[0];
+
+ /* Thread a free chain through the newly-allocated nodes. */
+ for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
+ extra[i].next = &extra[i+1];
+ extra[i].next = NULL;
+
+ /* Add the new nodes to the free list. */
+ d->arch.shadow2_hash_freelist = &extra[0];
+ }
+
+ /* Allocate a new node from the free list. */
+ x = d->arch.shadow2_hash_freelist;
+ d->arch.shadow2_hash_freelist = x->next;
+ return x;
+}
+
+static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
+{
+ /* Mark the bucket as empty and return it to the free list */
+ e->t = 0;
+ e->next = d->arch.shadow2_hash_freelist;
+ d->arch.shadow2_hash_freelist = e;
+}
+
+
+/* Allocate and initialise the table itself.
+ * Returns 0 for success, 1 for error. */
+static int shadow2_hash_alloc(struct domain *d)
+{
+ struct shadow2_hash_entry *table;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(!d->arch.shadow2_hash_table);
+
+ table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
+ if ( !table ) return 1;
+ memset(table, 0,
+ SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry));
+ d->arch.shadow2_hash_table = table;
+ return 0;
+}
+
+/* Tear down the hash table and return all memory to Xen.
+ * This function does not care whether the table is populated. */
+static void shadow2_hash_teardown(struct domain *d)
+{
+ struct shadow2_hash_entry *a, *n;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+
+ /* Return the table itself */
+ xfree(d->arch.shadow2_hash_table);
+ d->arch.shadow2_hash_table = NULL;
+
+ /* Return any extra allocations */
+ a = d->arch.shadow2_hash_allocations;
+ while ( a )
+ {
+ /* We stored a linked-list pointer at the end of each allocation */
+ n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
+ xfree(a);
+ a = n;
+ }
+ d->arch.shadow2_hash_allocations = NULL;
+ d->arch.shadow2_hash_freelist = NULL;
+}
+
+
+mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
+/* Find an entry in the hash table. Returns the MFN of the shadow,
+ * or INVALID_MFN if it doesn't exist */
+{
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *p, *x, *head;
+ key_t key;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+ ASSERT(t);
+
+ sh2_hash_audit(d);
+
+ perfc_incrc(shadow2_hash_lookups);
+ key = sh2_hash(n, t);
+
+ x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+ p = NULL;
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+ do
+ {
+ ASSERT(x->t || ((x == head) && (x->next == NULL)));
+
+ if ( x->n == n && x->t == t )
+ {
+ /* Pull-to-front if 'x' isn't already the head item */
+ if ( unlikely(x != head) )
+ {
+ if ( unlikely(d->arch.shadow2_hash_walking != 0) )
+ /* Can't reorder: someone is walking the hash chains */
+ return x->smfn;
+ else
+ {
+ /* Delete 'x' from list and reinsert after head. */
+ p->next = x->next;
+ x->next = head->next;
+ head->next = x;
+
+ /* Swap 'x' contents with head contents. */
+ SWAP(head->n, x->n);
+ SWAP(head->t, x->t);
+ SWAP(head->smfn, x->smfn);
+ }
+ }
+ else
+ {
+ perfc_incrc(shadow2_hash_lookup_head);
+ }
+ return head->smfn;
+ }
+
+ p = x;
+ x = x->next;
+ }
+ while ( x != NULL );
+
+ perfc_incrc(shadow2_hash_lookup_miss);
+ return _mfn(INVALID_MFN);
+}
+
+void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Put a mapping (n,t)->smfn into the hash table */
+{
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *x, *head;
+ key_t key;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+ ASSERT(t);
+
+ sh2_hash_audit(d);
+
+ perfc_incrc(shadow2_hash_inserts);
+ key = sh2_hash(n, t);
+
+ head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+ /* If the bucket is empty then insert the new page as the head item. */
+ if ( head->t == 0 )
+ {
+ head->n = n;
+ head->t = t;
+ head->smfn = smfn;
+ ASSERT(head->next == NULL);
+ }
+ else
+ {
+ /* Insert a new entry directly after the head item. */
+ x = sh2_alloc_hash_entry(d);
+ x->n = n;
+ x->t = t;
+ x->smfn = smfn;
+ x->next = head->next;
+ head->next = x;
+ }
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Excise the mapping (n,t)->smfn from the hash table */
+{
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *p, *x, *head;
+ key_t key;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_table);
+ ASSERT(t);
+
+ sh2_hash_audit(d);
+
+ perfc_incrc(shadow2_hash_deletes);
+ key = sh2_hash(n, t);
+
+ head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+ /* Match on head item? */
+ if ( head->n == n && head->t == t )
+ {
+ if ( (x = head->next) != NULL )
+ {
+ /* Overwrite head with contents of following node. */
+ head->n = x->n;
+ head->t = x->t;
+ head->smfn = x->smfn;
+
+ /* Delete following node. */
+ head->next = x->next;
+ sh2_free_hash_entry(d, x);
+ }
+ else
+ {
+ /* This bucket is now empty. Initialise the head node. */
+ head->t = 0;
+ }
+ }
+ else
+ {
+ /* Not at the head; need to walk the chain */
+ p = head;
+ x = head->next;
+
+ while(1)
+ {
+ ASSERT(x); /* We can't have hit the end, since our target is
+ * still in the chain somewhere... */
+ if ( x->n == n && x->t == t )
+ {
+ /* Delete matching node. */
+ p->next = x->next;
+ sh2_free_hash_entry(d, x);
+ break;
+ }
+ p = x;
+ x = x->next;
+ }
+ }
+
+ sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+
+static void hash_foreach(struct vcpu *v,
+ unsigned int callback_mask,
+ hash_callback_t callbacks[],
+ mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and
+ * calling the appropriate callback function for each entry.
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan.
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they
+ * then return non-zero to terminate the scan. */
+{
+ int i, done = 0;
+ struct domain *d = v->domain;
+ struct shadow2_hash_entry *x;
+
+ /* Say we're here, to stop hash-lookups reordering the chains */
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d->arch.shadow2_hash_walking == 0);
+ d->arch.shadow2_hash_walking = 1;
+
+ callback_mask &= ~1; /* Never attempt to call back on empty buckets */
+ for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ )
+ {
+ /* WARNING: This is not safe against changes to the hash table.
+ * The callback *must* return non-zero if it has inserted or
+ * deleted anything from the hash (lookups are OK, though). */
+ for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next )
+ {
+ if ( callback_mask & (1 << x->t) )
+ {
+ ASSERT(x->t <= 15);
+ ASSERT(callbacks[x->t] != NULL);
+ if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
+ break;
+ }
+ }
+ if ( done ) break;
+ }
+ d->arch.shadow2_hash_walking = 0;
+}
+
+
+/**************************************************************************/
+/* Destroy a shadow page: simple dispatcher to call the per-type destructor
+ * which will decrement refcounts appropriately and return memory to the
+ * free pool. */
+
+void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ u32 t = pg->count_info & PGC_SH2_type_mask;
+
+
+ SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn));
+
+ /* Double-check, if we can, that the shadowed page belongs to this
+ * domain, (by following the back-pointer). */
+ ASSERT(t == PGC_SH2_fl1_32_shadow ||
+ t == PGC_SH2_fl1_pae_shadow ||
+ t == PGC_SH2_fl1_64_shadow ||
+ t == PGC_SH2_monitor_table ||
+ (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info)))
+ == v->domain));
+
+ /* The down-shifts here are so that the switch statement is on nice
+ * small numbers that the compiler will enjoy */
+ switch ( t >> PGC_SH2_type_shift )
+ {
+#if CONFIG_PAGING_LEVELS == 2
+ case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn);
+ break;
+ case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn);
+ break;
+#else /* PAE or 64bit */
+ case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn);
+ break;
+ case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn);
+ break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn);
+ break;
+ case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn);
+ break;
+ case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn);
+ break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift:
+ case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+ SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn);
+ break;
+#endif
+ default:
+ SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n",
+ (unsigned long)t);
+ BUG();
+ }
+}
+
+/**************************************************************************/
+/* Remove all writeable mappings of a guest frame from the shadow tables
+ * Returns non-zero if we need to flush TLBs.
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access. */
+
+int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn,
+ unsigned int level,
+ unsigned long fault_addr)
+{
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32 */
+#endif
+ NULL, /* l2_32 */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */
+#else
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#endif
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64 */
+#else
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#endif
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ static unsigned int callback_mask =
+ 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+ ;
+ struct page_info *pg = mfn_to_page(gmfn);
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ /* Only remove writable mappings if we are doing shadow refcounts.
+ * In guest refcounting, we trust Xen to already be restricting
+ * all the writes to the guest page tables, so we do not need to
+ * do more. */
+ if ( !shadow2_mode_refcounts(v->domain) )
+ return 0;
+
+ /* Early exit if it's already a pagetable, or otherwise not writeable */
+ if ( sh2_mfn_is_a_page_table(gmfn)
+ || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
+ return 0;
+
+ perfc_incrc(shadow2_writeable);
+
+ /* If this isn't a "normal" writeable page, the domain is trying to
+ * put pagetables in special memory of some kind. We can't allow that. */
+ if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
+ {
+ SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %"
+ PRtype_info "\n",
+ mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
+ domain_crash(v->domain);
+ }
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+ if ( v == current && level != 0 )
+ {
+ unsigned long gfn;
+ /* Heuristic: there is likely to be only one writeable mapping,
+ * and that mapping is likely to be in the current pagetable,
+ * either in the guest's linear map (linux, windows) or in a
+ * magic slot used to map high memory regions (linux HIGHPTE) */
+
+#define GUESS(_a, _h) do { \
+ if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) ) \
+ perfc_incrc(shadow2_writeable_h_ ## _h); \
+ if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
+ return 1; \
+ } while (0)
+
+
+ /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
+ if ( v == current
+ && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
+ GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
+
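+ /* The shifts in the guesses below come from: an l1e maps 4KB, so the
+ * linear-map slot for fault_addr is at base + (fault_addr>>12) * entry
+ * size, i.e. >>10 for 4-byte 2-level entries and >>9 for 8-byte
+ * PAE/64-bit entries; l2es map 2MB (>>21, *8 => >>18) and l3es map
+ * 1GB (>>30, *8 => >>27). */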
+ if ( v->arch.shadow2->guest_levels == 2 )
+ {
+ if ( level == 1 )
+ /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
+ GUESS(0xC0000000UL + (fault_addr >> 10), 1);
+ }
+#if CONFIG_PAGING_LEVELS >= 3
+ else if ( v->arch.shadow2->guest_levels == 3 )
+ {
+ /* 32bit PAE w2k3: linear map at 0xC0000000 */
+ switch ( level )
+ {
+ case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
+ case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
+ }
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( v->arch.shadow2->guest_levels == 4 )
+ {
+ /* 64bit w2k3: linear map at 0x0000070000000000 */
+ switch ( level )
+ {
+ case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
+ case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
+ case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
+ }
+ }
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS >= 3 */
+
+#undef GUESS
+
+ }
+#endif
+
+ /* Brute-force search of all the shadows, by walking the hash */
+ perfc_incrc(shadow2_writeable_bf);
+ hash_foreach(v, callback_mask, callbacks, gmfn);
+
+ /* If that didn't catch the mapping, something is very wrong */
+ if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: "
+ "%lu left\n", mfn_x(gmfn),
+ (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
+ domain_crash(v->domain);
+ }
+
+ /* We killed at least one writeable mapping, so must flush TLBs. */
+ return 1;
+}
+
+
+
+/**************************************************************************/
+/* Remove all mappings of a guest frame from the shadow tables.
+ * Returns non-zero if we need to flush TLBs. */
+
+int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ int expected_count;
+
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32 */
+#endif
+ NULL, /* l2_32 */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */
+#else
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#endif
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64 */
+#else
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#endif
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ static unsigned int callback_mask =
+ 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+ | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+ ;
+
+ perfc_incrc(shadow2_mappings);
+ if ( (page->count_info & PGC_count_mask) == 0 )
+ return 0;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ /* XXX TODO:
+ * Heuristics for finding the (probably) single mapping of this gmfn */
+
+ /* Brute-force search of all the shadows, by walking the hash */
+ perfc_incrc(shadow2_mappings_bf);
+ hash_foreach(v, callback_mask, callbacks, gmfn);
+
+ /* If that didn't catch the mapping, something is very wrong */
+ expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
+ if ( (page->count_info & PGC_count_mask) != expected_count )
+ {
+ /* Don't complain if we're in HVM and there's one extra mapping:
+ * The qemu helper process has an untyped mapping of this dom's RAM */
+ if ( !(shadow2_mode_external(v->domain)
+ && (page->count_info & PGC_count_mask) <= 2
+ && (page->u.inuse.type_info & PGT_count_mask) == 0) )
+ {
+ SHADOW2_ERROR("can't find all mappings of mfn %lx: "
+ "c=%08x t=%08lx\n", mfn_x(gmfn),
+ page->count_info, page->u.inuse.type_info);
+ }
+ }
+
+ /* We killed at least one mapping, so must flush TLBs. */
+ return 1;
+}
+
+
+/**************************************************************************/
+/* Remove all shadows of a guest frame from the shadow tables */
+
+static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+/* Follow this shadow's up-pointer, if it has one, and remove the reference
+ * found there. Returns 1 if that was the only reference to this shadow */
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ mfn_t pmfn;
+ void *vaddr;
+ int rc;
+
+ ASSERT((pg->count_info & PGC_SH2_type_mask) > 0);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow);
+ ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow);
+
+ if (pg->up == 0) return 0;
+ pmfn = _mfn(pg->up >> PAGE_SHIFT);
+ ASSERT(valid_mfn(pmfn));
+ vaddr = sh2_map_domain_page(pmfn);
+ ASSERT(vaddr);
+ vaddr += pg->up & (PAGE_SIZE-1);
+ ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
+
+ /* Is this the only reference to this shadow? */
+ rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0;
+
+ /* Blank the offending entry */
+ switch ((pg->count_info & PGC_SH2_type_mask))
+ {
+ case PGC_SH2_l1_32_shadow:
+ case PGC_SH2_l2_32_shadow:
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn);
+#else
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn);
+#endif
+ break;
+#if CONFIG_PAGING_LEVELS >=3
+ case PGC_SH2_l1_pae_shadow:
+ case PGC_SH2_l2_pae_shadow:
+ case PGC_SH2_l2h_pae_shadow:
+ case PGC_SH2_l3_pae_shadow:
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn);
+ break;
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH2_l1_64_shadow:
+ case PGC_SH2_l2_64_shadow:
+ case PGC_SH2_l3_64_shadow:
+ case PGC_SH2_l4_64_shadow:
+ SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn);
+ break;
+#endif
+#endif
+    default: BUG(); /* Some weird unknown shadow type */
+ }
+
+ sh2_unmap_domain_page(vaddr);
+ if ( rc )
+ perfc_incrc(shadow2_up_pointer);
+ else
+ perfc_incrc(shadow2_unshadow_bf);
+
+ return rc;
+}
+
+void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
+/* Remove the shadows of this guest page.
+ * If all != 0, find all shadows, if necessary by walking the tables.
+ * Otherwise, just try the (much faster) heuristics, which will remove
+ * at most one reference to each shadow of the page. */
+{
+ struct page_info *pg;
+ mfn_t smfn;
+ u32 sh_flags;
+ unsigned char t;
+
+ /* Dispatch table for getting per-type functions: each level must
+ * be called with the function to remove a lower-level shadow. */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+ NULL, /* l1_32 */
+ NULL, /* fl1_32 */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32 */
+#endif
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae */
+#else
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#endif
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64 */
+ SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64 */
+#else
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+#endif
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ /* Another lookup table, for choosing which mask to use */
+ static unsigned int masks[16] = {
+ 0, /* none */
+ 1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32 */
+ 0, /* fl1_32 */
+ 0, /* l2_32 */
+ ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift))
+ | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae */
+ 0, /* fl1_pae */
+ 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae */
+ 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae */
+ 0, /* l3_pae */
+ 1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64 */
+ 0, /* fl1_64 */
+ 1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64 */
+ 1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64 */
+ 0, /* l4_64 */
+ 0, /* p2m */
+ 0 /* unused */
+ };
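+    /* For example, an l1_32 shadow can only be referenced from l2_32
+     * shadows, so masks[l1_32] has just the l2_32 bit set; l2_pae and
+     * l2h_pae shadows are referenced from l3_pae shadows, and so on up
+     * each paging hierarchy. */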
+
+ SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ pg = mfn_to_page(gmfn);
+
+ /* Bale out now if the page is not shadowed */
+ if ( (pg->count_info & PGC_page_table) == 0 )
+ return;
+
+ /* Search for this shadow in all appropriate shadows */
+ perfc_incrc(shadow2_unshadow);
+ sh_flags = pg->shadow2_flags;
+
+ /* Lower-level shadows need to be excised from upper-level shadows.
+ * This call to hash_foreach() looks dangerous but is in fact OK: each
+ * call will remove at most one shadow, and terminate immediately when
+ * it does remove it, so we never walk the hash after doing a deletion. */
+#define DO_UNSHADOW(_type) do { \
+ t = (_type) >> PGC_SH2_type_shift; \
+ smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
+ if ( !sh2_remove_shadow_via_pointer(v, smfn) && all ) \
+ hash_foreach(v, masks[t], callbacks, smfn); \
+} while (0)
+
+ /* Top-level shadows need to be unpinned */
+#define DO_UNPIN(_type) do { \
+ t = (_type) >> PGC_SH2_type_shift; \
+ smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \
+ if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned ) \
+ sh2_unpin(v, smfn); \
+ if ( (_type) == PGC_SH2_l3_pae_shadow ) \
+ SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \
+} while (0)
+
+ if ( sh_flags & SH2F_L1_32 ) DO_UNSHADOW(PGC_SH2_l1_32_shadow);
+ if ( sh_flags & SH2F_L2_32 ) DO_UNPIN(PGC_SH2_l2_32_shadow);
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( sh_flags & SH2F_L1_PAE ) DO_UNSHADOW(PGC_SH2_l1_pae_shadow);
+ if ( sh_flags & SH2F_L2_PAE ) DO_UNSHADOW(PGC_SH2_l2_pae_shadow);
+ if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow);
+ if ( sh_flags & SH2F_L3_PAE ) DO_UNPIN(PGC_SH2_l3_pae_shadow);
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( sh_flags & SH2F_L1_64 ) DO_UNSHADOW(PGC_SH2_l1_64_shadow);
+ if ( sh_flags & SH2F_L2_64 ) DO_UNSHADOW(PGC_SH2_l2_64_shadow);
+ if ( sh_flags & SH2F_L3_64 ) DO_UNSHADOW(PGC_SH2_l3_64_shadow);
+ if ( sh_flags & SH2F_L4_64 ) DO_UNPIN(PGC_SH2_l4_64_shadow);
+#endif
+#endif
+
+#undef DO_UNSHADOW
+#undef DO_UNPIN
+
+
+#if CONFIG_PAGING_LEVELS > 2
+ /* We may have caused some PAE l3 entries to change: need to
+ * fix up the copies of them in various places */
+ if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) )
+ sh2_pae_recopy(v->domain);
+#endif
+
+ /* If that didn't catch the shadows, something is wrong */
+ if ( all && (pg->count_info & PGC_page_table) )
+ {
+ SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n",
+ mfn_x(gmfn), pg->shadow2_flags);
+ domain_crash(v->domain);
+ }
+}
+
+void
+shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+/* Even harsher: this is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+{
+ shadow2_remove_all_shadows(v, gmfn);
+ /* XXX TODO:
+ * Rework this hashtable walker to return a linked-list of all
+ * the shadows it modified, then do breadth-first recursion
+ * to find the way up to higher-level tables and unshadow them too.
+ *
+ * The current code (just tearing down each page's shadows as we
+ * detect that it is not a pagetable) is correct, but very slow.
+ * It means extra emulated writes and slows down removal of mappings. */
+}
+
+/**************************************************************************/
+
+void sh2_update_paging_modes(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct shadow2_entry_points *old_entries = v->arch.shadow2;
+ mfn_t old_guest_table;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ // Valid transitions handled by this function:
+ // - For PV guests:
+ // - after a shadow mode has been changed
+ // - For HVM guests:
+ // - after a shadow mode has been changed
+ // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
+ //
+
+ // Avoid determining the current shadow2 mode for uninitialized CPUs, as
+ // we can not yet determine whether it is an HVM or PV domain.
+ //
+ if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+ {
+ printk("%s: postponing determination of shadow2 mode\n", __func__);
+ return;
+ }
+
+ // First, tear down any old shadow tables held by this vcpu.
+ //
+ if ( v->arch.shadow2 )
+ shadow2_detach_old_tables(v);
+
+ if ( !hvm_guest(v) )
+ {
+ ///
+ /// PV guest
+ ///
+#if CONFIG_PAGING_LEVELS == 4
+ if ( pv_32bit_guest(v) )
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3);
+ else
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+#elif CONFIG_PAGING_LEVELS == 3
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#elif CONFIG_PAGING_LEVELS == 2
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#else
+#error unexpected paging mode
+#endif
+ }
+ else
+ {
+ ///
+ /// HVM guest
+ ///
+ ASSERT(shadow2_mode_translate(d));
+ ASSERT(shadow2_mode_external(d));
+
+ if ( !hvm_paging_enabled(v) )
+ {
+ // paging disabled...
+ clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+
+ /* Set v->arch.guest_table to use the p2m map, and choose
+ * the appropriate shadow mode */
+ old_guest_table = pagetable_get_mfn(v->arch.guest_table);
+#if CONFIG_PAGING_LEVELS == 2
+ v->arch.guest_table =
+ pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2);
+#elif CONFIG_PAGING_LEVELS == 3
+ v->arch.guest_table =
+ pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#else /* CONFIG_PAGING_LEVELS == 4 */
+ {
+ l4_pgentry_t *l4e;
+ /* Use the start of the first l3 table as a PAE l3 */
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+ l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+ v->arch.guest_table =
+ pagetable_from_pfn(l4e_get_pfn(l4e[0]));
+ sh2_unmap_domain_page(l4e);
+ }
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#endif
+ /* Fix up refcounts on guest_table */
+ get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
+ if ( mfn_x(old_guest_table) != 0 )
+ put_page(mfn_to_page(old_guest_table));
+ }
+ else
+ {
+ set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+
+#ifdef __x86_64__
+ if ( hvm_long_mode_enabled(v) )
+ {
+ // long mode guest...
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+ }
+ else
+#endif
+ if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
+ {
+#if CONFIG_PAGING_LEVELS >= 3
+ // 32-bit PAE mode guest...
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#else
+ SHADOW2_ERROR("PAE not supported in 32-bit Xen\n");
+ domain_crash(d);
+ return;
+#endif
+ }
+ else
+ {
+ // 32-bit 2 level guest...
+#if CONFIG_PAGING_LEVELS >= 3
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2);
+#else
+ v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#endif
+ }
+ }
+
+ if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ {
+ mfn_t mmfn = shadow2_make_monitor_table(v);
+ v->arch.monitor_table = pagetable_from_mfn(mmfn);
+ v->arch.monitor_vtable = sh2_map_domain_page(mmfn);
+ }
+
+ if ( v->arch.shadow2 != old_entries )
+ {
+ SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
+ "(was g=%u s=%u)\n",
+ d->domain_id, v->vcpu_id,
+ v->arch.shadow2->guest_levels,
+ v->arch.shadow2->shadow_levels,
+ old_entries ? old_entries->guest_levels : 0,
+ old_entries ? old_entries->shadow_levels : 0);
+ if ( old_entries &&
+ (v->arch.shadow2->shadow_levels !=
+ old_entries->shadow_levels) )
+ {
+ /* Need to make a new monitor table for the new mode */
+ mfn_t new_mfn, old_mfn;
+
+ if ( v != current )
+ {
+ SHADOW2_ERROR("Some third party (d=%u v=%u) is changing "
+ "this HVM vcpu's (d=%u v=%u) paging mode!\n",
+ current->domain->domain_id, current->vcpu_id,
+ v->domain->domain_id, v->vcpu_id);
+ domain_crash(v->domain);
+ return;
+ }
+
+ sh2_unmap_domain_page(v->arch.monitor_vtable);
+ old_mfn = pagetable_get_mfn(v->arch.monitor_table);
+ v->arch.monitor_table = pagetable_null();
+ new_mfn = v->arch.shadow2->make_monitor_table(v);
+ v->arch.monitor_table = pagetable_from_mfn(new_mfn);
+ v->arch.monitor_vtable = sh2_map_domain_page(new_mfn);
+ SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n",
+ mfn_x(new_mfn));
+
+ /* Don't be running on the old monitor table when we
+ * pull it down! Switch CR3, and warn the HVM code that
+ * its host cr3 has changed. */
+ make_cr3(v, mfn_x(new_mfn));
+ write_ptbase(v);
+ hvm_update_host_cr3(v);
+ old_entries->destroy_monitor_table(v, old_mfn);
+ }
+ }
+
+ // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
+ // These are HARD: think about the case where two CPU's have
+ // different values for CR4.PSE and CR4.PGE at the same time.
+ // This *does* happen, at least for CR4.PGE...
+ }
+
+ v->arch.shadow2->update_cr3(v);
+}
+
+/**************************************************************************/
+/* Turning on and off shadow2 features */
+
+static void sh2_new_mode(struct domain *d, u32 new_mode)
+/* Inform all the vcpus that the shadow mode has been changed */
+{
+ struct vcpu *v;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(d != current->domain);
+ d->arch.shadow2_mode = new_mode;
+ if ( new_mode & SHM2_translate )
+ shadow2_audit_p2m(d);
+ for_each_vcpu(d, v)
+ sh2_update_paging_modes(v);
+}
+
+static int shadow2_enable(struct domain *d, u32 mode)
+/* Turn on "permanent" shadow features: external, translate, refcount.
+ * Can only be called once on a domain, and these features cannot be
+ * disabled.
+ * Returns 0 for success, -errno for failure. */
+{
+ unsigned int old_pages;
+ int rv = 0;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ /* Sanity check the arguments */
+ if ( d == current->domain
+ || shadow2_mode_enabled(d)
+ || !(mode & SHM2_enable)
+ || ((mode & SHM2_external) && !(mode & SHM2_translate)) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ // XXX -- eventually would like to require that all memory be allocated
+ // *after* shadow2_enabled() is called... So here, we would test to make
+ // sure that d->page_list is empty.
+#if 0
+ spin_lock(&d->page_alloc_lock);
+ if ( !list_empty(&d->page_list) )
+ {
+ spin_unlock(&d->page_alloc_lock);
+ rv = -EINVAL;
+ goto out;
+ }
+ spin_unlock(&d->page_alloc_lock);
+#endif
+
+ /* Init the shadow memory allocation if the user hasn't done so */
+ old_pages = d->arch.shadow2_total_pages;
+ if ( old_pages == 0 )
+ if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
+ {
+ set_sh2_allocation(d, 0, NULL);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Init the hash table */
+ if ( shadow2_hash_alloc(d) != 0 )
+ {
+ set_sh2_allocation(d, old_pages, NULL);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Init the P2M table */
+ if ( mode & SHM2_translate )
+ if ( !shadow2_alloc_p2m_table(d) )
+ {
+ shadow2_hash_teardown(d);
+ set_sh2_allocation(d, old_pages, NULL);
+ shadow2_p2m_teardown(d);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Update the bits */
+ sh2_new_mode(d, mode);
+ shadow2_audit_p2m(d);
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+    return rv;
+}
+
+void shadow2_teardown(struct domain *d)
+/* Destroy the shadow pagetables of this domain and free its shadow memory.
+ * Should only be called for dying domains. */
+{
+ struct vcpu *v;
+ mfn_t mfn;
+
+ ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
+ ASSERT(d != current->domain);
+
+ if ( !shadow2_lock_is_acquired(d) )
+ shadow2_lock(d); /* Keep various asserts happy */
+
+ if ( shadow2_mode_enabled(d) )
+ {
+ /* Release the shadow and monitor tables held by each vcpu */
+ for_each_vcpu(d, v)
+ {
+ if ( v->arch.shadow2 )
+ shadow2_detach_old_tables(v);
+ if ( shadow2_mode_external(d) )
+ {
+ mfn = pagetable_get_mfn(v->arch.monitor_table);
+ if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
+ shadow2_destroy_monitor_table(v, mfn);
+ v->arch.monitor_table = pagetable_null();
+ }
+ }
+ }
+
+ if ( d->arch.shadow2_total_pages != 0 )
+ {
+ SHADOW2_PRINTK("teardown of domain %u starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ /* Destroy all the shadows and release memory to domheap */
+ set_sh2_allocation(d, 0, NULL);
+ /* Release the hash table back to xenheap */
+ if (d->arch.shadow2_hash_table)
+ shadow2_hash_teardown(d);
+ /* Release the log-dirty bitmap of dirtied pages */
+ sh2_free_log_dirty_bitmap(d);
+ /* Should not have any more memory held */
+ SHADOW2_PRINTK("teardown done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ ASSERT(d->arch.shadow2_total_pages == 0);
+ }
+
+ /* We leave the "permanent" shadow modes enabled, but clear the
+ * log-dirty mode bit. We don't want any more mark_dirty()
+ * calls now that we've torn down the bitmap */
+ d->arch.shadow2_mode &= ~SHM2_log_dirty;
+
+ shadow2_unlock(d);
+}
+
+void shadow2_final_teardown(struct domain *d)
+/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
+{
+
+ SHADOW2_PRINTK("dom %u final teardown starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+
+ /* Double-check that the domain didn't have any shadow memory.
+ * It is possible for a domain that never got domain_kill()ed
+ * to get here with its shadow allocation intact. */
+ if ( d->arch.shadow2_total_pages != 0 )
+ shadow2_teardown(d);
+
+ /* It is now safe to pull down the p2m map. */
+ if ( d->arch.shadow2_p2m_pages != 0 )
+ shadow2_p2m_teardown(d);
+
+ SHADOW2_PRINTK("dom %u final teardown done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+}
+
+static int shadow2_one_bit_enable(struct domain *d, u32 mode)
+/* Turn on a single shadow mode feature */
+{
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ /* Sanity check the call */
+ if ( d == current->domain || (d->arch.shadow2_mode & mode) )
+ {
+ return -EINVAL;
+ }
+
+ if ( d->arch.shadow2_mode == 0 )
+ {
+ /* Init the shadow memory allocation and the hash table */
+ if ( set_sh2_allocation(d, 1, NULL) != 0
+ || shadow2_hash_alloc(d) != 0 )
+ {
+ set_sh2_allocation(d, 0, NULL);
+ return -ENOMEM;
+ }
+ }
+
+ /* Update the bits */
+ sh2_new_mode(d, d->arch.shadow2_mode | mode);
+
+ return 0;
+}
+
+static int shadow2_one_bit_disable(struct domain *d, u32 mode)
+/* Turn off a single shadow mode feature */
+{
+ struct vcpu *v;
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ /* Sanity check the call */
+ if ( d == current->domain || !(d->arch.shadow2_mode & mode) )
+ {
+ return -EINVAL;
+ }
+
+ /* Update the bits */
+ sh2_new_mode(d, d->arch.shadow2_mode & ~mode);
+ if ( d->arch.shadow2_mode == 0 )
+ {
+ /* Get this domain off shadows */
+ SHADOW2_PRINTK("un-shadowing of domain %u starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ for_each_vcpu(d, v)
+ {
+ if ( v->arch.shadow2 )
+ shadow2_detach_old_tables(v);
+#if CONFIG_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
+ else
+#endif
+ make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
+
+ }
+
+ /* Pull down the memory allocation */
+ if ( set_sh2_allocation(d, 0, NULL) != 0 )
+ {
+ // XXX - How can this occur?
+ // Seems like a bug to return an error now that we've
+ // disabled the relevant shadow mode.
+ //
+ return -ENOMEM;
+ }
+ shadow2_hash_teardown(d);
+ SHADOW2_PRINTK("un-shadowing of domain %u done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow2_total_pages,
+ d->arch.shadow2_free_pages,
+ d->arch.shadow2_p2m_pages);
+ }
+
+ return 0;
+}
+
+/* Enable/disable ops for the "test" and "log-dirty" modes */
+int shadow2_test_enable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ if ( shadow2_mode_enabled(d) )
+ {
+        SHADOW2_ERROR("Don't support enabling test mode "
+ "on already shadowed doms\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = shadow2_one_bit_enable(d, SHM2_enable);
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+int shadow2_test_disable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+ ret = shadow2_one_bit_disable(d, SHM2_enable);
+ shadow2_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+static int
+sh2_alloc_log_dirty_bitmap(struct domain *d)
+{
+ ASSERT(d->arch.shadow_dirty_bitmap == NULL);
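+    /* One bit per guest pfn, rounded up to a whole number of unsigned
+     * longs: e.g. a guest with max_pfn = 0x40000 (1GB of 4kB pages) gets
+     * a 0x40000-bit (32kB) dirty bitmap. */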
+ d->arch.shadow_dirty_bitmap_size =
+ (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
+ ~(BITS_PER_LONG - 1);
+ d->arch.shadow_dirty_bitmap =
+ xmalloc_array(unsigned long,
+ d->arch.shadow_dirty_bitmap_size / BITS_PER_LONG);
+ if ( d->arch.shadow_dirty_bitmap == NULL )
+ {
+ d->arch.shadow_dirty_bitmap_size = 0;
+ return -ENOMEM;
+ }
+ memset(d->arch.shadow_dirty_bitmap, 0, d->arch.shadow_dirty_bitmap_size/8);
+
+ return 0;
+}
+
+static void
+sh2_free_log_dirty_bitmap(struct domain *d)
+{
+ d->arch.shadow_dirty_bitmap_size = 0;
+ if ( d->arch.shadow_dirty_bitmap )
+ {
+ xfree(d->arch.shadow_dirty_bitmap);
+ d->arch.shadow_dirty_bitmap = NULL;
+ }
+}
+
+static int shadow2_log_dirty_enable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ if ( shadow2_mode_log_dirty(d) )
+ {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ( shadow2_mode_enabled(d) )
+ {
+        SHADOW2_ERROR("Don't (yet) support enabling log-dirty "
+ "on already shadowed doms\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sh2_alloc_log_dirty_bitmap(d);
+ if ( ret != 0 )
+ {
+ sh2_free_log_dirty_bitmap(d);
+ goto out;
+ }
+
+ ret = shadow2_one_bit_enable(d, SHM2_log_dirty);
+ if ( ret != 0 )
+ sh2_free_log_dirty_bitmap(d);
+
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+ return ret;
+}
+
+static int shadow2_log_dirty_disable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow2_lock(d);
+ ret = shadow2_one_bit_disable(d, SHM2_log_dirty);
+ if ( !shadow2_mode_log_dirty(d) )
+ sh2_free_log_dirty_bitmap(d);
+ shadow2_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+/**************************************************************************/
+/* P2M map manipulations */
+
+static void
+sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+ struct vcpu *v;
+
+ if ( !shadow2_mode_translate(d) )
+ return;
+
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+
+
+ SHADOW2_PRINTK("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+ ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn);
+ //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn);
+
+ shadow2_remove_all_shadows_and_parents(v, _mfn(mfn));
+ if ( shadow2_remove_all_mappings(v, _mfn(mfn)) )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
+ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+}
+
+void
+shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn)
+{
+ shadow2_lock(d);
+ shadow2_audit_p2m(d);
+ sh2_p2m_remove_page(d, gfn, mfn);
+ shadow2_audit_p2m(d);
+ shadow2_unlock(d);
+}
+
+void
+shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn)
+{
+ struct vcpu *v;
+ unsigned long ogfn;
+ mfn_t omfn;
+
+ if ( !shadow2_mode_translate(d) )
+ return;
+
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+
+ shadow2_lock(d);
+ shadow2_audit_p2m(d);
+
+ SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+ omfn = sh2_gfn_to_mfn(d, gfn);
+ if ( valid_mfn(omfn) )
+ {
+ /* Get rid of the old mapping, especially any shadows */
+ shadow2_remove_all_shadows_and_parents(v, omfn);
+ if ( shadow2_remove_all_mappings(v, omfn) )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+ }
+
+ ogfn = sh2_mfn_to_gfn(d, _mfn(mfn));
+ if (
+#ifdef __x86_64__
+ (ogfn != 0x5555555555555555L)
+#else
+ (ogfn != 0x55555555L)
+#endif
+ && (ogfn != INVALID_M2P_ENTRY)
+ && (ogfn != gfn) )
+ {
+ /* This machine frame is already mapped at another physical address */
+ SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
+ mfn, ogfn, gfn);
+ if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) )
+ {
+ SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
+ ogfn , mfn_x(omfn));
+ if ( mfn_x(omfn) == mfn )
+ sh2_p2m_remove_page(d, ogfn, mfn);
+ }
+ }
+
+ shadow2_set_p2m_entry(d, gfn, _mfn(mfn));
+ set_gpfn_from_mfn(mfn, gfn);
+ shadow2_audit_p2m(d);
+ shadow2_unlock(d);
+}
+
+/**************************************************************************/
+/* Log-dirty mode support */
+
+/* Convert a shadow to log-dirty mode. */
+void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
+{
+ BUG();
+}
+
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats as well. */
+static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc)
+{
+ int i, rv = 0, clean = 0;
+
+ domain_pause(d);
+ shadow2_lock(d);
+
+ if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN
+ || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH )
+ clean = 1;
+ else
+ ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK);
+
+ SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+ (clean) ? "clean" : "peek",
+ d->domain_id,
+ d->arch.shadow_fault_count,
+ d->arch.shadow_dirty_count);
+
+ sc->stats.fault_count = d->arch.shadow_fault_count;
+ sc->stats.dirty_count = d->arch.shadow_dirty_count;
+
+ if ( clean )
+ {
+ struct list_head *l, *t;
+ struct page_info *pg;
+
+ /* Need to revoke write access to the domain's pages again.
+ * In future, we'll have a less heavy-handed approach to this,
+ * but for now, we just unshadow everything except Xen. */
+ list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
+ }
+
+ d->arch.shadow_fault_count = 0;
+ d->arch.shadow_dirty_count = 0;
+ }
+
+ if ( guest_handle_is_null(sc->dirty_bitmap) ||
+ (d->arch.shadow_dirty_bitmap == NULL) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
+ sc->pages = d->arch.shadow_dirty_bitmap_size;
+
+#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
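+    /* 'i' counts pages, i.e. bits in the bitmap; dividing by
+     * 8*sizeof(unsigned long) converts it to an offset in unsigned longs
+     * for copy_to_guest_offset(), and the byte count is rounded up to
+     * whole longs for the same reason. */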
+ for ( i = 0; i < sc->pages; i += CHUNK )
+ {
+ int bytes = ((((sc->pages - i) > CHUNK)
+ ? CHUNK
+ : (sc->pages - i)) + 7) / 8;
+
+ if ( copy_to_guest_offset(
+ sc->dirty_bitmap,
+ i/(8*sizeof(unsigned long)),
+ d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ if ( clean )
+ memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ 0, bytes);
+ }
+#undef CHUNK
+
+ out:
+ shadow2_unlock(d);
+ domain_unpause(d);
+    return rv;
+}
+
+
+/* Mark a page as dirty */
+void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+ unsigned long pfn;
+
+ ASSERT(shadow2_lock_is_acquired(d));
+ ASSERT(shadow2_mode_log_dirty(d));
+
+ if ( !valid_mfn(gmfn) )
+ return;
+
+ ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+ /*
+ * Values with the MSB set denote MFNs that aren't really part of the
+ * domain's pseudo-physical memory map (e.g., the shared info frame).
+ * Nothing to do here...
+ */
+ if ( unlikely(!VALID_M2P(pfn)) )
+ return;
+
+ /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */
+ if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) )
+ {
+ if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
+ {
+ SHADOW2_DEBUG(LOGDIRTY,
+ "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n",
+ mfn_x(gmfn), pfn, d->domain_id);
+ d->arch.shadow_dirty_count++;
+ }
+ }
+ else
+ {
+ SHADOW2_PRINTK("mark_dirty OOR! "
+ "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
+ "owner=%d c=%08x t=%" PRtype_info "\n",
+ mfn_x(gmfn),
+ pfn,
+ d->arch.shadow_dirty_bitmap_size,
+ d->domain_id,
+ (page_get_owner(mfn_to_page(gmfn))
+ ? page_get_owner(mfn_to_page(gmfn))->domain_id
+ : -1),
+ mfn_to_page(gmfn)->count_info,
+ mfn_to_page(gmfn)->u.inuse.type_info);
+ }
+}
+
+
+/**************************************************************************/
+/* Shadow-control DOM0_OP dispatcher */
+
+int shadow2_control_op(struct domain *d,
+ dom0_shadow_control_t *sc,
+ XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
+{
+ int rc, preempted = 0;
+
+ if ( unlikely(d == current->domain) )
+ {
+ DPRINTK("Don't try to do a shadow op on yourself!\n");
+ return -EINVAL;
+ }
+
+ switch ( sc->op )
+ {
+ case DOM0_SHADOW_CONTROL_OP_OFF:
+ if ( shadow2_mode_log_dirty(d) )
+ if ( (rc = shadow2_log_dirty_disable(d)) != 0 )
+ return rc;
+ if ( d->arch.shadow2_mode & SHM2_enable )
+ if ( (rc = shadow2_test_disable(d)) != 0 )
+ return rc;
+ return 0;
+
+ case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+ return shadow2_test_enable(d);
+
+ case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+ return shadow2_log_dirty_enable(d);
+
+ case DOM0_SHADOW_CONTROL_OP_FLUSH:
+ case DOM0_SHADOW_CONTROL_OP_CLEAN:
+ case DOM0_SHADOW_CONTROL_OP_PEEK:
+ return shadow2_log_dirty_op(d, sc);
+
+
+
+ case DOM0_SHADOW2_CONTROL_OP_ENABLE:
+ return shadow2_enable(d, sc->mode << SHM2_shift);
+
+ case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION:
+ sc->mb = shadow2_get_allocation(d);
+ return 0;
+
+ case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION:
+ rc = shadow2_set_allocation(d, sc->mb, &preempted);
+ if ( preempted )
+ /* Not finished. Set up to re-run the call. */
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_dom0_op, "h", u_dom0_op);
+ else
+ /* Finished. Return the new allocation */
+ sc->mb = shadow2_get_allocation(d);
+ return rc;
+
+
+ default:
+ SHADOW2_ERROR("Bad shadow op %u\n", sc->op);
+ return -EINVAL;
+ }
+}
+
+
+/**************************************************************************/
+/* Auditing shadow tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
+
+void shadow2_audit_tables(struct vcpu *v)
+{
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2), /* l2_32 */
+#else
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2), /* l1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2), /* l2_32 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3), /* l1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2h_pae */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3), /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4), /* l1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4), /* l2_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4), /* l3_64 */
+ SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4), /* l4_64 */
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS > 2 */
+ NULL /* All the rest */
+ };
+ unsigned int mask;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL )
+ mask = ~1; /* Audit every table in the system */
+ else
+ {
+ /* Audit only the current mode's tables */
+ switch (v->arch.shadow2->guest_levels)
+ {
+ case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break;
+ case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE
+ |SH2F_L2H_PAE|SH2F_L3_PAE); break;
+ case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64
+ |SH2F_L3_64|SH2F_L4_64); break;
+ default: BUG();
+ }
+ }
+
+    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
+}
+
+#endif /* Shadow audit */
+
+
+/**************************************************************************/
+/* Auditing p2m tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
+
+void shadow2_audit_p2m(struct domain *d)
+{
+ struct list_head *entry;
+ struct page_info *page;
+ struct domain *od;
+ unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+ mfn_t p2mfn;
+ unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+ int test_linear;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) )
+ return;
+
+ //SHADOW2_PRINTK("p2m audit starts\n");
+
+ test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
+ if ( test_linear )
+ local_flush_tlb();
+
+ /* Audit part one: walk the domain's page allocation list, checking
+ * the m2p entries. */
+ for ( entry = d->page_list.next;
+ entry != &d->page_list;
+ entry = entry->next )
+ {
+ page = list_entry(entry, struct page_info, list);
+ mfn = mfn_x(page_to_mfn(page));
+
+ // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+ od = page_get_owner(page);
+
+ if ( od != d )
+ {
+ SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+ mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+ continue;
+ }
+
+ gfn = get_gpfn_from_mfn(mfn);
+ if ( gfn == INVALID_M2P_ENTRY )
+ {
+ orphans_i++;
+ //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+ // mfn);
+ continue;
+ }
+
+ if ( gfn == 0x55555555 )
+ {
+ orphans_d++;
+ //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+ // mfn);
+ continue;
+ }
+
+ p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
+ if ( mfn_x(p2mfn) != mfn )
+ {
+ mpbad++;
+ SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+ " (-> gfn %#lx)\n",
+ mfn, gfn, mfn_x(p2mfn),
+ (mfn_valid(p2mfn)
+ ? get_gpfn_from_mfn(mfn_x(p2mfn))
+ : -1u));
+ /* This m2p entry is stale: the domain has another frame in
+ * this physical slot. No great disaster, but for neatness,
+ * blow away the m2p entry. */
+ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+ }
+
+ if ( test_linear )
+ {
+ lp2mfn = get_mfn_from_gpfn(gfn);
+ if ( lp2mfn != mfn_x(p2mfn) )
+ {
+ SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                               "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+ }
+ }
+
+ // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+ // mfn, gfn, p2mfn, lp2mfn);
+ }
+
+ /* Audit part two: walk the domain's p2m table, checking the entries. */
+ if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+ {
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+ int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+ l4_pgentry_t *l4e;
+ l3_pgentry_t *l3e;
+ int i3, i4;
+ l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+ l3_pgentry_t *l3e;
+ int i3;
+ l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+ l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+ gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+ for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+ {
+ if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+ for ( i3 = 0;
+ i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+ i3++ )
+ {
+ if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+ {
+ if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+
+ for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+ {
+ if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+ continue;
+ mfn = l1e_get_pfn(l1e[i1]);
+ ASSERT(valid_mfn(_mfn(mfn)));
+ m2pfn = get_gpfn_from_mfn(mfn);
+ if ( m2pfn != gfn )
+ {
+ pmbad++;
+ SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn, mfn, m2pfn);
+ BUG();
+ }
+ }
+ sh2_unmap_domain_page(l1e);
+ }
+#if CONFIG_PAGING_LEVELS >= 3
+ sh2_unmap_domain_page(l2e);
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ sh2_unmap_domain_page(l3e);
+ }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+ sh2_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+ sh2_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+ sh2_unmap_domain_page(l2e);
+#endif
+
+ }
+
+ //SHADOW2_PRINTK("p2m audit complete\n");
+ //if ( orphans_i | orphans_d | mpbad | pmbad )
+ // SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+ // orphans_i + orphans_d, orphans_i, orphans_d,
+ if ( mpbad | pmbad )
+ SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+ pmbad, mpbad);
+}
+
+#endif /* p2m audit */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/shadow2.c b/xen/arch/x86/shadow2.c
new file mode 100644
index 0000000000..9d845cb797
--- /dev/null
+++ b/xen/arch/x86/shadow2.c
@@ -0,0 +1,4469 @@
+/******************************************************************************
+ * arch/x86/shadow2.c
+ *
+ * Simple, mostly-synchronous shadow page tables.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+// backpointer
+// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
+// figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+// space for both PV and HVM guests.
+//
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+#include <asm/shadow2-types.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode).
+ *
+ * THINGS TO DO LATER:
+ *
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE. Should return a gfn instead.
+ *
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's
+ * shadows. When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory we want to drop those shadows. Start with
+ * shadows in a page in two modes as a hint, but beware of clever tricks
+ * like reusing a pagetable for both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps. Add appropriate unmap_l*e calls in the users.
+ * Then we can test the speed difference made by linear maps. If the
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
+ * to share l2h pages again.
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an
+ * entry in it, and every time we change CR3. We copy it for the linear
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3. Maybe we can avoid some of this recopying
+ * by using the shadow directly in some places.
+ * Also, for SMP, need to actually respond to seeing shadow2_pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable. Should coalesce the flushes to the end,
+ * and if we do flush, re-do the walk. If anything has changed, then
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to
+ * deal with shadows made in one mode and used in another.
+ */
+
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND 2
+#define FETCH_TYPE_WRITE 4
+typedef enum {
+ ft_prefetch = FETCH_TYPE_PREFETCH,
+ ft_demand_read = FETCH_TYPE_DEMAND,
+ ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
+
+#ifndef NDEBUG
+static char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
+#endif
+static inline void sh2_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
+ * shadow L1 which maps its "splinters".
+ * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ * PAE L3 info page for that CR3 value.
+ */
+
+static inline mfn_t
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look for FL1 shadows in the hash table */
+{
+ mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn),
+ PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH2_log_dirty) )
+ shadow2_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline mfn_t
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look for shadows in the hash table */
+{
+ mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn),
+ shadow_type >> PGC_SH2_type_shift);
+ perfc_incrc(shadow2_get_shadow_status);
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH2_log_dirty) )
+ shadow2_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline void
+set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Put an FL1 shadow into the hash table */
+{
+ SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+ gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain)) )
+ // mark this shadow as a log dirty shadow...
+ set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+ else
+ clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+
+ shadow2_hash_insert(v, gfn_x(gfn),
+ PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
+}
+
+static inline void
+set_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Put a shadow into the hash table */
+{
+ struct domain *d = v->domain;
+ int res;
+
+ SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+ d->domain_id, v->vcpu_id, mfn_x(gmfn),
+ shadow_type, mfn_x(smfn));
+
+ if ( unlikely(shadow2_mode_log_dirty(d)) )
+ // mark this shadow as a log dirty shadow...
+ set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+ else
+ clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+
+ res = get_page(mfn_to_page(gmfn), d);
+ ASSERT(res == 1);
+
+ shadow2_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH2_type_shift,
+ smfn);
+}
+
+static inline void
+delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Remove a shadow from the hash table */
+{
+ SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+ gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+ shadow2_hash_delete(v, gfn_x(gfn),
+ PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
+}
+
+static inline void
+delete_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Remove a shadow from the hash table */
+{
+ SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id,
+ mfn_x(gmfn), shadow_type, mfn_x(smfn));
+ shadow2_hash_delete(v, mfn_x(gmfn),
+ shadow_type >> PGC_SH2_type_shift, smfn);
+ put_page(mfn_to_page(gmfn));
+}
+
+
+/**************************************************************************/
+/* Functions for walking the guest page tables */
+
+
+/* Walk the guest pagetables, filling the walk_t with what we see.
+ * Takes an uninitialised walk_t. The caller must call unmap_walk()
+ * on the walk_t before discarding it or calling guest_walk_tables again.
+ * If "guest_op" is non-zero, we are serving a genuine guest memory access,
+ * and must (a) be under the shadow2 lock, and (b) remove write access
+ * from any guest PT pages we see, as we will be using their contents to
+ * perform shadow updates.
+ * Returns 0 for success or non-zero if the guest pagetables are malformed.
+ * N.B. Finding a not-present entry does not cause a non-zero return code. */
+static inline int
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
+{
+ ASSERT(!guest_op || shadow2_lock_is_acquired(v->domain));
+
+ perfc_incrc(shadow2_guest_walk);
+ memset(gw, 0, sizeof(*gw));
+ gw->va = va;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ /* Get l4e from the top level table */
+ gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
+ /* Walk down to the l3e */
+ if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
+ gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
+ if ( !valid_mfn(gw->l3mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op && shadow2_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l3e = ((guest_l3e_t *)sh2_map_domain_page(gw->l3mfn))
+ + guest_l3_table_offset(va);
+#else /* PAE only... */
+ /* Get l3e from the top level table */
+ gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
+#endif /* PAE or 64... */
+ /* Walk down to the l2e */
+ if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
+ gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
+ if ( !valid_mfn(gw->l2mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op && shadow2_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l2e = ((guest_l2e_t *)sh2_map_domain_page(gw->l2mfn))
+ + guest_l2_table_offset(va);
+#else /* 32-bit only... */
+ /* Get l2e from the top level table */
+ gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
+#endif /* All levels... */
+
+ if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
+ if ( guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
+ {
+ /* Special case: this guest VA is in a PSE superpage, so there's
+ * no guest l1e. We make one up so that the propagation code
+ * can generate a shadow l1 table. Start with the gfn of the
+ * first 4k-page of the superpage. */
+ gfn_t start = guest_l2e_get_gfn(*gw->l2e);
+ /* Grant full access in the l1e, since all the guest entry's
+ * access controls are enforced in the shadow l2e. This lets
+ * us reflect l2 changes later without touching the l1s. */
+ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+ _PAGE_ACCESSED|_PAGE_DIRTY);
+ /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+ * of the level 1 */
+ if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
+ flags |= _PAGE_PAT;
+ /* Increment the pfn by the right number of 4k pages.
+ * The ~0x1 is to mask out the PAT bit mentioned above. */
+ start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+ gw->eff_l1e = guest_l1e_from_gfn(start, flags);
+ gw->l1e = NULL;
+ gw->l1mfn = _mfn(INVALID_MFN);
+ }
+ else
+ {
+ /* Not a superpage: carry on and find the l1e. */
+ gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
+ if ( !valid_mfn(gw->l1mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op
+ && shadow2_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l1e = ((guest_l1e_t *)sh2_map_domain_page(gw->l1mfn))
+ + guest_l1_table_offset(va);
+ gw->eff_l1e = *gw->l1e;
+ }
+
+ return 0;
+}
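+
+/* Typical calling pattern, as a sketch: v and va are assumed to be the
+ * faulting vcpu and virtual address, and the caller must hold the shadow2
+ * lock since guest_op is non-zero here.
+ *
+ *     walk_t gw;
+ *     if ( guest_walk_tables(v, va, &gw, 1) == 0 )
+ *     {
+ *         gfn_t gfn = guest_walk_to_gfn(&gw);  // INVALID_GFN if not present
+ *         // ... use gw.eff_l1e / gfn to build or update shadow entries ...
+ *     }
+ *     unmap_walk(v, &gw);  // always unmap, even if the walk found bad tables
+ */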
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+ return _gfn(INVALID_GFN);
+ return guest_l1e_get_gfn(gw->eff_l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+ return 0;
+ return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
+}
+
+
+/* Unmap (and reinitialise) a guest walk.
+ * Call this to dispose of any walk filled in by guest_walk_tables() */
+static void unmap_walk(struct vcpu *v, walk_t *gw)
+{
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ if ( gw->l3e != NULL ) sh2_unmap_domain_page(gw->l3e);
+#endif
+ if ( gw->l2e != NULL ) sh2_unmap_domain_page(gw->l2e);
+#endif
+ if ( gw->l1e != NULL ) sh2_unmap_domain_page(gw->l1e);
+#ifdef DEBUG
+ memset(gw, 0, sizeof(*gw));
+#endif
+}
+
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+ SHADOW2_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ SHADOW2_PRINTK(" l4mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l4mfn));
+ SHADOW2_PRINTK(" l4e=%p\n", gw->l4e);
+ if ( gw->l4e )
+ SHADOW2_PRINTK(" *l4e=%" SH2_PRI_gpte "\n", gw->l4e->l4);
+#endif /* PAE or 64... */
+ SHADOW2_PRINTK(" l3mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l3mfn));
+ SHADOW2_PRINTK(" l3e=%p\n", gw->l3e);
+ if ( gw->l3e )
+ SHADOW2_PRINTK(" *l3e=%" SH2_PRI_gpte "\n", gw->l3e->l3);
+#endif /* All levels... */
+ SHADOW2_PRINTK(" l2mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l2mfn));
+ SHADOW2_PRINTK(" l2e=%p\n", gw->l2e);
+ if ( gw->l2e )
+ SHADOW2_PRINTK(" *l2e=%" SH2_PRI_gpte "\n", gw->l2e->l2);
+ SHADOW2_PRINTK(" l1mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l1mfn));
+ SHADOW2_PRINTK(" l1e=%p\n", gw->l1e);
+ if ( gw->l1e )
+ SHADOW2_PRINTK(" *l1e=%" SH2_PRI_gpte "\n", gw->l1e->l1);
+ SHADOW2_PRINTK(" eff_l1e=%" SH2_PRI_gpte "\n", gw->eff_l1e.l1);
+}
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+/* Lightweight audit: pass all the shadows associated with this guest walk
+ * through the audit mechanisms */
+static void sh2_audit_gw(struct vcpu *v, walk_t *gw)
+{
+ mfn_t smfn;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ if ( valid_mfn(gw->l4mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn,
+ PGC_SH2_l4_shadow))) )
+ (void) sh2_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* PAE or 64... */
+ if ( valid_mfn(gw->l3mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn,
+ PGC_SH2_l3_shadow))) )
+ (void) sh2_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* All levels... */
+ if ( valid_mfn(gw->l2mfn) )
+ {
+ if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+ PGC_SH2_l2_shadow))) )
+ (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#if GUEST_PAGING_LEVELS == 3
+ if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+ PGC_SH2_l2h_shadow))) )
+ (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#endif
+ }
+ if ( valid_mfn(gw->l1mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn,
+ PGC_SH2_l1_shadow))) )
+ (void) sh2_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
+ else if ( gw->l2e
+ && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
+ && valid_mfn(
+ (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
+ (void) sh2_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
+}
+
+#else
+#define sh2_audit_gw(_v, _gw) do {} while(0)
+#endif /* audit code */
+
+
+
+/**************************************************************************/
+/* Function to write to the guest tables, for propagating accessed and
+ * dirty bits from the shadow to the guest.
+ * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
+ * and an operation type. The guest entry is always passed as an l1e:
+ * since we only ever write flags, that's OK.
+ * Returns the new flag bits of the guest entry. */
+
+static u32 guest_set_ad_bits(struct vcpu *v,
+ mfn_t gmfn,
+ guest_l1e_t *ep,
+ unsigned int level,
+ fetch_type_t ft)
+{
+ u32 flags, shflags, bit;
+ struct page_info *pg;
+ int res = 0;
+
+ ASSERT(valid_mfn(gmfn)
+ && (sh2_mfn_is_a_page_table(gmfn)
+ || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
+ == 0)));
+ ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
+ ASSERT(level <= GUEST_PAGING_LEVELS);
+ ASSERT(ft == ft_demand_read || ft == ft_demand_write);
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ flags = guest_l1e_get_flags(*ep);
+
+ /* PAE l3s do not have A and D bits */
+ if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
+ return flags;
+
+ /* Need the D bit as well for writes, in l1es and PSE l2es. */
+ if ( ft == ft_demand_write
+ && (level == 1 || (level == 2 && (flags & _PAGE_PSE))) )
+ {
+ if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
+ == (_PAGE_DIRTY | _PAGE_ACCESSED) )
+ return flags; /* Guest already has A and D bits set */
+ flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
+ perfc_incrc(shadow2_ad_update);
+ }
+ else
+ {
+ if ( flags & _PAGE_ACCESSED )
+ return flags; /* Guest already has A bit set */
+ flags |= _PAGE_ACCESSED;
+ perfc_incrc(shadow2_a_update);
+ }
+
+ /* Set the bit(s) */
+ sh2_mark_dirty(v->domain, gmfn);
+ SHADOW2_DEBUG(A_AND_D, "gfn = %"SH2_PRI_gfn", "
+ "old flags = %#x, new flags = %#x\n",
+ guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
+ *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
+
+ /* May need to propagate this change forward to other kinds of shadow */
+ pg = mfn_to_page(gmfn);
+ if ( !sh2_mfn_is_a_page_table(gmfn) )
+ {
+ /* This guest pagetable is not yet shadowed at all. */
+ // MAF: I think this assert is busted... If this gmfn has not yet
+ // been promoted, then it seems perfectly reasonable for there to be
+ // outstanding type refs to it...
+ /* TJD: No. If the gmfn has not been promoted, we must at least
+ * have recognised that it is a pagetable, and pulled write access.
+ * The type count should only be non-zero if it is actually a page
+ * table. The test above was incorrect, though, so I've fixed it. */
+ ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
+ return flags;
+ }
+
+ shflags = pg->shadow2_flags & SH2F_page_type_mask;
+ while ( shflags )
+ {
+ bit = find_first_set_bit(shflags);
+ ASSERT(shflags & (1u << bit));
+ shflags &= ~(1u << bit);
+ if ( !(pg->shadow2_flags & (1u << bit)) )
+ continue;
+ switch ( bit )
+ {
+ case PGC_SH2_type_to_index(PGC_SH2_l1_shadow):
+ if (level != 1)
+ res |= sh2_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
+ break;
+ case PGC_SH2_type_to_index(PGC_SH2_l2_shadow):
+ if (level != 2)
+ res |= sh2_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
+ break;
+#if GUEST_PAGING_LEVELS == 3 /* PAE only */
+ case PGC_SH2_type_to_index(PGC_SH2_l2h_shadow):
+ if (level != 2)
+ res |= sh2_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
+ break;
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+ case PGC_SH2_type_to_index(PGC_SH2_l3_shadow):
+ if (level != 3)
+ res |= sh2_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
+ break;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ case PGC_SH2_type_to_index(PGC_SH2_l4_shadow):
+ if (level != 4)
+ res |= sh2_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
+ break;
+#endif
+#endif
+ default:
+ SHADOW2_ERROR("mfn %"SH2_PRI_mfn" is shadowed in multiple "
+ "modes: A&D bits may be out of sync (flags=%#x).\n",
+ mfn_x(gmfn), pg->shadow2_flags);
+ /* XXX Shadows in other modes will not be updated, so will
+ * have their A and D bits out of sync. */
+ }
+ }
+
+ /* We should never need to flush the TLB or recopy PAE entries */
+ ASSERT( res == 0 || res == SHADOW2_SET_CHANGED );
+ return flags;
+}
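+
+/* A typical call, as made from the propagation path below (a sketch only;
+ * the real call sites pass whatever guest entry the walk found):
+ *
+ *     flags = guest_set_ad_bits(v, gw->l1mfn, gw->l1e, 1, ft_demand_write);
+ *
+ * i.e. a demand write through an l1e sets both _PAGE_ACCESSED and
+ * _PAGE_DIRTY in the guest entry and revalidates any other shadows of
+ * that guest frame. */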
+
+/**************************************************************************/
+/* Functions to compute the correct index into a shadow page, given an
+ * index into the guest page (as returned by guest_get_index()).
+ * This is trivial when the shadow and guest use the same sized PTEs, but
+ * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
+ * PAE- or 64-bit shadows).
+ *
+ * These functions also increment the shadow mfn, when necessary. When PTE
+ * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
+ * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
+ * use simple pointer arithmetic on a pointer to the guest L1e to figure out
+ * which shadow page we really want. Similarly, when PTE sizes are
+ * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
+ * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
+ * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
+ * space.)
+ *
+ * For PAE guests, for every 32 bytes of guest L3 page table, we use 64 bytes
+ * of shadow (to store both the shadow, and the info that would normally be
+ * stored in page_info fields). This arrangement allows the shadow and the
+ * "page_info" fields to always be stored in the same page (in fact, in
+ * the same cache line), avoiding an extra call to map_domain_page().
+ */
+
+static inline u32
+guest_index(void *ptr)
+{
+ return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
+}
+
+static inline u32
+shadow_l1_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
+ return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
+#else
+ return guest_index;
+#endif
+}
+
+static inline u32
+shadow_l2_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+ // Because we use 2 shadow l2 entries for each guest entry, the number of
+ // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
+ //
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+ // We multiply by two to get the index of the first of the two entries
+ // used to shadow the specified guest entry.
+ return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
+#else
+ return guest_index;
+#endif
+}
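+
+/* Worked example for the 2-on-3 case (a sketch; the numbers assume the
+ * usual 1024-entry 32-bit guest l2 and 512-entry PAE shadow l2):
+ * guest index 341 gives 341 / 256 = 1, so we advance *smfn to the second
+ * of the four shadow pages, and return (341 % 256) * 2 = 170, the index
+ * of the first of the pair of shadow l2es for that guest entry. */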
+
+#if GUEST_PAGING_LEVELS >= 3
+
+static inline u32
+shadow_l3_index(mfn_t *smfn, u32 guest_index)
+{
+#if GUEST_PAGING_LEVELS == 3
+ u32 group_id;
+
+ // Because we use twice the space in L3 shadows as was consumed in guest
+ // L3s, the number of guest entries per shadow page is
+ // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not*
+ // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
+ //
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+ // We store PAE L3 shadows in groups of 4, alternating shadows and
+ // pae_l3_bookkeeping structs. So the effective shadow index is
+ // the group_id * 8 + the offset within the group.
+ //
+ guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
+ group_id = guest_index / 4;
+ return (group_id * 8) + (guest_index % 4);
+#else
+ return guest_index;
+#endif
+}
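+
+/* Worked example for the PAE case (a sketch): guest l3 index 5 falls in
+ * group_id 5 / 4 = 1, so it maps to shadow index (1 * 8) + (5 % 4) = 9,
+ * i.e. the second shadow l3e within the second 64-byte
+ * shadow-plus-bookkeeping group. */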
+
+#endif // GUEST_PAGING_LEVELS >= 3
+
+#if GUEST_PAGING_LEVELS >= 4
+
+static inline u32
+shadow_l4_index(mfn_t *smfn, u32 guest_index)
+{
+ return guest_index;
+}
+
+#endif // GUEST_PAGING_LEVELS >= 4
+
+
+/**************************************************************************/
+/* Functions which compute shadow entries from their corresponding guest
+ * entries.
+ *
+ * These are the "heart" of the shadow code.
+ *
+ * There are two sets of these: those that are called on demand faults (read
+ * faults and write faults), and those that are essentially called to
+ * "prefetch" (or propagate) entries from the guest into the shadow. The read
+ * fault and write fault are handled as two separate cases for L1 entries (due
+ * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
+ * into the respective demand_fault functions.
+ */
+
+#define CHECK(_cond) \
+do { \
+ if (unlikely(!(_cond))) \
+ { \
+ printk("%s %s %d ASSERTION (%s) FAILED\n", \
+ __func__, __FILE__, __LINE__, #_cond); \
+ return -1; \
+ } \
+} while (0)
+
+// The function below tries to capture all of the flag manipulation for the
+// demand and propagate functions into one place.
+//
+static always_inline u32
+sh2_propagate_flags(struct vcpu *v, mfn_t target_mfn,
+ u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn,
+ int mmio, int level, fetch_type_t ft)
+{
+ struct domain *d = v->domain;
+ u32 pass_thru_flags;
+ u32 sflags;
+ int lowest_level_guest_mapping;
+
+ // XXX -- might want to think about PAT support for HVM guests...
+
+#ifndef NDEBUG
+ // MMIO can only occur from L1e's
+ //
+ if ( mmio )
+ CHECK(level == 1);
+
+ // We should always have a pointer to the guest entry if it's a non-PSE
+ // non-MMIO demand access.
+ if ( ft & FETCH_TYPE_DEMAND )
+ CHECK(guest_entry_ptr || level == 1);
+#endif
+
+ // A not-present guest entry has a special signature in the shadow table,
+ // so that we do not have to consult the guest tables multiple times...
+ //
+ if ( unlikely(!(gflags & _PAGE_PRESENT)) )
+ return _PAGE_SHADOW_GUEST_NOT_PRESENT;
+
+ // Must have a valid target_mfn, unless this is mmio, or unless this is a
+ // prefetch. In the case of a prefetch, an invalid mfn means that we can
+ // not usefully shadow anything, and so we return early.
+ //
+ if ( !valid_mfn(target_mfn) )
+ {
+ CHECK((ft == ft_prefetch) || mmio);
+ if ( !mmio )
+ return 0;
+ }
+
+ // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
+ //
+ if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
+ pass_thru_flags = _PAGE_PRESENT;
+ else
+ {
+ pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
+ _PAGE_RW | _PAGE_PRESENT);
+ if ( guest_supports_nx(v) )
+ pass_thru_flags |= _PAGE_NX_BIT;
+ }
+
+ // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
+ // L3e's; they are all implied. So we emulate them here.
+ //
+ if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
+ gflags = pass_thru_flags;
+
+ // Propagate bits from the guest to the shadow.
+ // Some of these may be overwritten, below.
+ // Since we know the guest's PRESENT bit is set, we also set the shadow's
+ // SHADOW_PRESENT bit.
+ //
+ sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
+
+ // Copy the guest's RW bit into the SHADOW_RW bit.
+ //
+ if ( gflags & _PAGE_RW )
+ sflags |= _PAGE_SHADOW_RW;
+
+ // Set the A&D bits for higher level shadows.
+ // Higher level entries do not, strictly speaking, have dirty bits, but
+ // since we use shadow linear tables, each of these entries may, at some
+ // point in time, also serve as a shadow L1 entry.
+ // By setting both the A&D bits in each of these, we eliminate the burden
+ // on the hardware to update these bits on initial accesses.
+ //
+ if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
+ sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
+
+ lowest_level_guest_mapping =
+ ((level == 1) ||
+ ((level == 2) && guest_supports_superpages(v) &&
+ (gflags & _PAGE_PSE)));
+
+ // Set the A and D bits in the guest entry, if we need to.
+ if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
+ gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
+
+ // If the A or D bit has not yet been set in the guest, then we must
+ // prevent the corresponding kind of access.
+ //
+ if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
+ !(gflags & _PAGE_ACCESSED)) )
+ sflags &= ~_PAGE_PRESENT;
+
+ if ( unlikely(lowest_level_guest_mapping &&
+ !(gflags & _PAGE_DIRTY)) )
+ sflags &= ~_PAGE_RW;
+
+ // MMIO caching
+ //
+ // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
+ // to cache the fact that this entry is in MMIO space.
+ //
+ if ( (level == 1) && mmio )
+ {
+ sflags &= ~(_PAGE_PRESENT);
+ sflags |= _PAGE_SHADOW_MMIO;
+ }
+ else
+ {
+ // shadow2_mode_log_dirty support
+ //
+ // Only allow the guest write access to a page a) on a demand fault,
+ // or b) if the page is already marked as dirty.
+ //
+ if ( unlikely((level == 1) &&
+ !(ft & FETCH_TYPE_WRITE) &&
+ shadow2_mode_log_dirty(d) &&
+ !sh2_mfn_is_dirty(d, target_mfn)) )
+ {
+ sflags &= ~_PAGE_RW;
+ }
+
+ // protect guest page tables
+ //
+ if ( unlikely((level == 1) &&
+ sh2_mfn_is_a_page_table(target_mfn)) )
+ {
+ if ( shadow2_mode_trap_reads(d) )
+ {
+ // if we are trapping both reads & writes, then mark this page
+ // as not present...
+ //
+ sflags &= ~_PAGE_PRESENT;
+ }
+ else
+ {
+ // otherwise, just prevent any writes...
+ //
+ sflags &= ~_PAGE_RW;
+ }
+ }
+ }
+
+ return sflags;
+}
+
+#undef CHECK
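+
+/* Worked example (informal): consider a demand read through a guest l1e
+ * with PRESENT, RW and USER set but A and D both clear.
+ * guest_set_ad_bits() sets _PAGE_ACCESSED in the guest entry; the shadow
+ * entry then keeps PRESENT, USER and SHADOW_RW, but has _PAGE_RW
+ * stripped because the guest's D bit is still clear -- so the guest's
+ * first write will fault again, and that fault will set the dirty bit. */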
+
+#if GUEST_PAGING_LEVELS >= 4
+static void
+l4e_propagate_from_guest(struct vcpu *v,
+ guest_l4e_t *gl4e,
+ mfn_t gl4mfn,
+ mfn_t sl3mfn,
+ shadow_l4e_t *sl4p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l4e_get_flags(*gl4e);
+ u32 sflags = sh2_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
+ gl4mfn, 0, 4, ft);
+
+ *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "%s gl4e=%" SH2_PRI_gpte " sl4e=%" SH2_PRI_pte "\n",
+ fetch_type_names[ft], gl4e->l4, sl4p->l4);
+ ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static void
+l3e_propagate_from_guest(struct vcpu *v,
+ guest_l3e_t *gl3e,
+ mfn_t gl3mfn,
+ mfn_t sl2mfn,
+ shadow_l3e_t *sl3p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l3e_get_flags(*gl3e);
+ u32 sflags = sh2_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
+ gl3mfn, 0, 3, ft);
+
+ *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "%s gl3e=%" SH2_PRI_gpte " sl3e=%" SH2_PRI_pte "\n",
+ fetch_type_names[ft], gl3e->l3, sl3p->l3);
+ ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static void
+l2e_propagate_from_guest(struct vcpu *v,
+ guest_l2e_t *gl2e,
+ mfn_t gl2mfn,
+ mfn_t sl1mfn,
+ shadow_l2e_t *sl2p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l2e_get_flags(*gl2e);
+ u32 sflags = sh2_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e,
+ gl2mfn, 0, 2, ft);
+
+ *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "%s gl2e=%" SH2_PRI_gpte " sl2e=%" SH2_PRI_pte "\n",
+ fetch_type_names[ft], gl2e->l2, sl2p->l2);
+ ASSERT(sflags != -1);
+}
+
+static inline int
+l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+ int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+ struct domain *d = v->domain;
+ u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+ u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+ mmio, 1, ft_demand_read);
+
+ if ( shadow2_mode_trap_reads(d) && !mmio && sh2_mfn_is_a_page_table(gmfn) )
+ {
+ // emulation required!
+ *sl1p = shadow_l1e_empty();
+ return 1;
+ }
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+ (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+ return 0;
+}
+
+static inline int
+l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+ int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+ struct domain *d = v->domain;
+ u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+ u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+ mmio, 1, ft_demand_write);
+
+ sh2_mark_dirty(d, gmfn);
+
+ if ( !mmio && sh2_mfn_is_a_page_table(gmfn) )
+ {
+ // emulation required!
+ *sl1p = shadow_l1e_empty();
+ return 1;
+ }
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+ (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+ return 0;
+}
+
+static inline void
+l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
+ int mmio)
+{
+ gfn_t gfn = guest_l1e_get_gfn(gl1e);
+ mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
+ u32 gflags = guest_l1e_get_flags(gl1e);
+ u32 sflags = sh2_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN),
+ mmio, 1, ft_prefetch);
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+ gl1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+}
+
+
+/**************************************************************************/
+/* These functions update shadow entries (and do bookkeeping on the shadow
+ * tables they are in). It is intended that they are the only
+ * functions which ever write (non-zero) data onto a shadow page.
+ *
+ * They return a set of flags:
+ * SHADOW2_SET_CHANGED -- we actually wrote a new value to the shadow.
+ * SHADOW2_SET_FLUSH -- the caller must cause a TLB flush.
+ * SHADOW2_SET_ERROR -- the input is not a valid entry (for example, if
+ * shadow2_get_page_from_l1e() fails).
+ * SHADOW2_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
+ * copies of their PAE L3 entries re-copied.
+ */
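+
+/* For example (a sketch of how the callers below use these flags), after
+ *
+ *     r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
+ *
+ * a caller tests (r & SHADOW2_SET_FLUSH) to decide whether a TLB flush is
+ * needed, and on PAE tests (r & SHADOW2_SET_L3PAE_RECOPY) to know that
+ * sh2_pae_recopy() must be called before the new mapping is used. */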
+
+static inline void safe_write_entry(void *dst, void *src)
+/* Copy one PTE safely when processors might be running on the
+ * destination pagetable. This does *not* give safety against
+ * concurrent writes (that's what the shadow lock is for), just
+ * stops the hardware picking up partially written entries. */
+{
+ volatile unsigned long *d = dst;
+ unsigned long *s = src;
+ ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
+#if CONFIG_PAGING_LEVELS == 3
+ /* In PAE mode, pagetable entries are larger
+ * than machine words, so won't get written atomically. We need to make
+ * sure any other cpu running on these shadows doesn't see a
+ * half-written entry. Do this by marking the entry not-present first,
+ * then writing the high word before the low word. */
+ BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
+ d[0] = 0;
+ d[1] = s[1];
+ d[0] = s[0];
+#else
+ /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
+ * which will be an atomic write, since the entry is aligned. */
+ BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
+ *d = *s;
+#endif
+}
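+
+/* Why the PAE ordering above matters (informal): if we wrote the low word
+ * first, another cpu could briefly see a present entry whose high word
+ * (the upper frame-number bits and NX) still belonged to the old entry.
+ * Clearing the low word first makes the entry not-present for the
+ * duration of the update, which is a safe intermediate state. */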
+
+
+static inline void
+shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+/* This function does the actual writes to shadow pages.
+ * It must not be called directly, since it doesn't do the bookkeeping
+ * that shadow_set_l*e() functions do. */
+{
+ shadow_l1e_t *dst = d;
+ shadow_l1e_t *src = s;
+ void *map = NULL;
+ int i;
+
+ /* Because we mirror access rights at all levels in the shadow, an
+ * l2 (or higher) entry with the RW bit cleared will leave us with
+ * no write access through the linear map.
+ * We detect that by writing to the shadow with copy_to_user() and
+ * using map_domain_page() to get a writeable mapping if we need to. */
+ if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
+ {
+ perfc_incrc(shadow2_linear_map_failed);
+ map = sh2_map_domain_page(mfn);
+ ASSERT(map != NULL);
+ dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
+ }
+
+
+ for ( i = 0; i < entries; i++ )
+ safe_write_entry(dst++, src++);
+
+ if ( map != NULL ) sh2_unmap_domain_page(map);
+
+ /* XXX TODO:
+ * Update min/max field in page_info struct of this mfn */
+}
+
+static inline int
+perms_strictly_increased(u32 old_flags, u32 new_flags)
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones? */
+{
+ u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+ u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+ /* Flip the NX bit, since it's the only one that decreases rights;
+ * we calculate as if it were an "X" bit. */
+ of ^= _PAGE_NX_BIT;
+ nf ^= _PAGE_NX_BIT;
+ /* If the changed bits are all set in the new flags, then rights strictly
+ * increased between old and new. */
+ return ((of | (of ^ nf)) == nf);
+}
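+
+/* Two quick examples (informal): going from PRESENT|RW to PRESENT|RW|USER
+ * only adds rights, so this returns 1; going from PRESENT|RW to
+ * PRESENT|RW|NX removes execute permission (NX is treated as an inverted
+ * "X" bit), so this returns 0 and the caller must flush. */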
+
+static int inline
+shadow2_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+{
+ int res;
+ mfn_t mfn;
+ struct domain *owner;
+ shadow_l1e_t sanitized_sl1e =
+ shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
+
+ //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
+ //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
+
+ if ( !shadow2_mode_refcounts(d) )
+ return 1;
+
+ res = get_page_from_l1e(sanitized_sl1e, d);
+
+ // If a privileged domain is attempting to install a map of a page it does
+ // not own, we let it succeed anyway.
+ //
+ if ( unlikely(!res) &&
+ IS_PRIV(d) &&
+ !shadow2_mode_translate(d) &&
+ valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
+ (owner = page_get_owner(mfn_to_page(mfn))) &&
+ (d != owner) )
+ {
+ res = get_page_from_l1e(sanitized_sl1e, owner);
+ SHADOW2_PRINTK("privileged domain %d installs map of mfn %05lx "
+ "which is owned by domain %d: %s\n",
+ d->domain_id, mfn_x(mfn), owner->domain_id,
+ res ? "success" : "failed");
+ }
+
+ if ( unlikely(!res) )
+ {
+ perfc_incrc(shadow2_get_page_fail);
+ SHADOW2_PRINTK("failed: l1e=%" SH2_PRI_pte "\n", sl1e.l1);
+ }
+
+ return res;
+}
+
+static void inline
+shadow2_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+{
+ if ( !shadow2_mode_refcounts(d) )
+ return;
+
+ put_page_from_l1e(sl1e, d);
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+static int shadow_set_l4e(struct vcpu *v,
+ shadow_l4e_t *sl4e,
+ shadow_l4e_t new_sl4e,
+ mfn_t sl4mfn)
+{
+ int flags = 0;
+ shadow_l4e_t old_sl4e;
+ paddr_t paddr;
+ ASSERT(sl4e != NULL);
+ old_sl4e = *sl4e;
+
+ if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl4e) & ~PAGE_MASK));
+
+ if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh2_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+ flags |= SHADOW2_SET_CHANGED;
+
+ if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
+ if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
+ || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
+ shadow_l4e_get_flags(new_sl4e)) )
+ {
+ flags |= SHADOW2_SET_FLUSH;
+ }
+ sh2_put_ref(v, osl3mfn, paddr);
+ }
+ return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if GUEST_PAGING_LEVELS >= 3
+static int shadow_set_l3e(struct vcpu *v,
+ shadow_l3e_t *sl3e,
+ shadow_l3e_t new_sl3e,
+ mfn_t sl3mfn)
+{
+ int flags = 0;
+ shadow_l3e_t old_sl3e;
+ paddr_t paddr;
+ ASSERT(sl3e != NULL);
+ old_sl3e = *sl3e;
+
+ if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl3e) & ~PAGE_MASK));
+
+ if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh2_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+ flags |= SHADOW2_SET_CHANGED;
+
+#if GUEST_PAGING_LEVELS == 3
+ /* We wrote a guest l3e in a PAE pagetable. This table is copied in
+ * the linear pagetable entries of its l2s, and may also be copied
+ * to a low memory location to make it fit in CR3. Report that we
+ * need to resync those copies (we can't wait for the guest to flush
+ * the TLB because it might be an increase in rights). */
+ {
+ struct vcpu *vcpu;
+
+ struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
+ for_each_vcpu(v->domain, vcpu)
+ {
+ if (info->vcpus & (1 << vcpu->vcpu_id))
+ {
+ // Remember that this flip/update needs to occur.
+ vcpu->arch.shadow2_pae_flip_pending = 1;
+ flags |= SHADOW2_SET_L3PAE_RECOPY;
+ }
+ }
+ }
+#endif
+
+ if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
+ if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
+ !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
+ shadow_l3e_get_flags(new_sl3e)) )
+ {
+ flags |= SHADOW2_SET_FLUSH;
+ }
+ sh2_put_ref(v, osl2mfn, paddr);
+ }
+ return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static int shadow_set_l2e(struct vcpu *v,
+ shadow_l2e_t *sl2e,
+ shadow_l2e_t new_sl2e,
+ mfn_t sl2mfn)
+{
+ int flags = 0;
+ shadow_l2e_t old_sl2e;
+ paddr_t paddr;
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+ /* In 2-on-3 we work with pairs of l2es pointing at two-page
+ * shadows. Reference counting and up-pointers track from the first
+ * page of the shadow to the first l2e, so make sure that we're
+ * working with those:
+ * Align the pointer down so it's pointing at the first of the pair */
+ sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
+ /* Align the mfn of the shadow entry too */
+ new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
+#endif
+
+ ASSERT(sl2e != NULL);
+ old_sl2e = *sl2e;
+
+ if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl2e) & ~PAGE_MASK));
+
+ if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh2_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
+ }
+
+ /* Write the new entry */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+ {
+ shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
+ /* The l1 shadow is two pages long and needs to be pointed to by
+ * two adjacent l2es. The pair have the same flags, but point
+ * at odd and even MFNs */
+ ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
+ pair[1].l2 |= (1<<PAGE_SHIFT);
+ shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+ }
+#else /* normal case */
+ shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+#endif
+ flags |= SHADOW2_SET_CHANGED;
+
+ if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
+ if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
+ !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
+ shadow_l2e_get_flags(new_sl2e)) )
+ {
+ flags |= SHADOW2_SET_FLUSH;
+ }
+ sh2_put_ref(v, osl1mfn, paddr);
+ }
+ return flags;
+}
+
+static int shadow_set_l1e(struct vcpu *v,
+ shadow_l1e_t *sl1e,
+ shadow_l1e_t new_sl1e,
+ mfn_t sl1mfn)
+{
+ int flags = 0;
+ struct domain *d = v->domain;
+ shadow_l1e_t old_sl1e;
+ ASSERT(sl1e != NULL);
+
+ old_sl1e = *sl1e;
+
+ if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
+
+ if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ if ( shadow2_mode_refcounts(d) ) {
+ if ( shadow2_get_page_from_l1e(new_sl1e, d) == 0 )
+ {
+ /* Doesn't look like a pagetable. */
+ flags |= SHADOW2_SET_ERROR;
+ new_sl1e = shadow_l1e_empty();
+ }
+ }
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+ flags |= SHADOW2_SET_CHANGED;
+
+ if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ /* N.B. Unlike higher-level sets, never need an extra flush
+ * when writing an l1e. Because it points to the same guest frame
+ * as the guest l1e did, it's the guest's responsibility to
+ * trigger a flush later. */
+ if ( shadow2_mode_refcounts(d) )
+ {
+ shadow2_put_page_from_l1e(old_sl1e, d);
+ }
+ }
+ return flags;
+}
+
+
+/**************************************************************************/
+/* These functions take a vcpu and a virtual address, and return a pointer
+ * to the appropriate level N entry from the shadow tables.
+ * If the necessary tables are not present in the shadow, they return NULL. */
+
+/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
+ * more levels than the guest, the upper levels are always fixed and do not
+ * reflect any information from the guest, so we do not use these functions
+ * to access them. */
+
+#if GUEST_PAGING_LEVELS >= 4
+static shadow_l4e_t *
+shadow_get_l4e(struct vcpu *v, unsigned long va)
+{
+ /* Reading the top level table is always valid. */
+ return sh2_linear_l4_table(v) + shadow_l4_linear_offset(va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+static shadow_l3e_t *
+shadow_get_l3e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+ /* Get the l4 */
+ shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
+ ASSERT(sl4e != NULL);
+ if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
+ /* l4 was present; OK to get the l3 */
+ return sh2_linear_l3_table(v) + shadow_l3_linear_offset(va);
+#else /* PAE... */
+ /* Top level is always mapped */
+ ASSERT(v->arch.shadow_vtable);
+ return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
+#endif
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+static shadow_l2e_t *
+shadow_get_l2e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... */
+ /* Get the l3 */
+ shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
+ if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
+ /* l3 was present; OK to get the l2 */
+#endif
+ return sh2_linear_l2_table(v) + shadow_l2_linear_offset(va);
+}
+
+
+#if 0 // avoid the compiler warning for now...
+
+static shadow_l1e_t *
+shadow_get_l1e(struct vcpu *v, unsigned long va)
+{
+ /* Get the l2 */
+ shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
+ if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
+ /* l2 was present; OK to get the l1 */
+ return sh2_linear_l1_table(v) + shadow_l1_linear_offset(va);
+}
+
+#endif
+
+
+/**************************************************************************/
+/* Macros to walk pagetables. These take the shadow of a pagetable and
+ * walk every "interesting" entry. That is, they don't touch Xen mappings,
+ * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
+ * second entry (since pairs of entries are managed together). For multi-page
+ * shadows they walk all pages.
+ *
+ * Arguments are an MFN, the variable to point to each entry, a variable
+ * to indicate that we are done (we will shortcut to the end of the scan
+ * when _done != 0), a variable to indicate that we should avoid Xen mappings,
+ * and the code.
+ *
+ * WARNING: These macros have side-effects. They change the values of both
+ * the pointer and the MFN. */
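+
+/* A minimal usage sketch (the destructors later in this file are the real
+ * callers): count the present entries in an l1 shadow:
+ *
+ *     int n = 0;
+ *     shadow_l1e_t *sl1e;
+ *     SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0, { n++; });
+ *
+ * The _code block only runs for entries with _PAGE_PRESENT set; passing 0
+ * for _gl1p and _done means "no guest pointer to advance" and "never stop
+ * early". */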
+
+static inline void increment_ptr_to_guest_entry(void *ptr)
+{
+ if ( ptr )
+ {
+ guest_l1e_t **entry = ptr;
+ (*entry)++;
+ }
+}
+
+/* All kinds of l1: touch all entries */
+#define _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+do { \
+ int _i; \
+ shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
+ ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l1_shadow \
+ || (mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_fl1_shadow); \
+ for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl1e) = _sp + _i; \
+ if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl1p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+do { \
+ int __done = 0; \
+ _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
+ ({ (__done = _done); }), _code); \
+ _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
+ if ( !__done ) \
+ _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
+ ({ (__done = _done); }), _code); \
+} while (0)
+#else /* Everything else; l1 shadows are only one page */
+#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+ _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
+#endif
+
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+
+/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i, _j, __done = 0; \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_32_shadow); \
+ for ( _j = 0; _j < 4 && !__done; _j++ ) \
+ { \
+ shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
+ if ( (!(_xen)) \
+ || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
+ < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( (__done = (_done)) ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+ _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
+ } \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 2
+
+/* 32-bit on 32-bit: avoid Xen entries */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_32_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ if ( (!(_xen)) \
+ || \
+ (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 3
+
+/* PAE: if it's an l2h, don't touch Xen mappings */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_pae_shadow \
+ || (mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2h_pae_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ if ( (!(_xen)) \
+ || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ != PGC_SH2_l2h_pae_shadow) \
+ || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
+ < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#else
+
+/* 64-bit l2: touch all entries */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#endif /* different kinds of l2 */
+
+#if GUEST_PAGING_LEVELS == 3
+
+/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */
+#define SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \
+do { \
+ int _i; \
+ for ( _i = 0; _i < 4; _i++ ) \
+ { \
+ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ _sl3e++; \
+ increment_ptr_to_guest_entry(_gl3p); \
+ } \
+} while (0)
+
+/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */
+#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
+do { \
+ int _i, _j, _k, __done = 0; \
+ ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l3_pae_shadow); \
+ /* The subshadows are split, 64 on each page of the shadow */ \
+ for ( _j = 0; _j < 2 && !__done; _j++ ) \
+ { \
+ void *_sp = sh2_map_domain_page(_sl3mfn); \
+ for ( _i = 0; _i < 64; _i++ ) \
+ { \
+ /* Every second 32-byte region is a bookkeeping entry */ \
+ _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \
+ if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \
+ SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, \
+ ({ __done = (_done); __done; }), \
+ _code); \
+ else \
+ for ( _k = 0 ; _k < 4 ; _k++ ) \
+ increment_ptr_to_guest_entry(_gl3p); \
+ if ( __done ) break; \
+ } \
+ sh2_unmap_domain_page(_sp); \
+ _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \
+ } \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 4
+
+/* 64-bit l3: touch all entries */
+#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
+do { \
+ int _i; \
+ shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
+ ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l3_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl3e) = _sp + _i; \
+ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl3p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+/* 64-bit l4: avoid Xen mappings */
+#define SHADOW2_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
+ ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l4_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
+ { \
+ (_sl4e) = _sp + _i; \
+ if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ } \
+ increment_ptr_to_guest_entry(_gl4p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#endif
+
+
+
+/**************************************************************************/
+/* Functions to install Xen mappings and linear mappings in shadow pages */
+
+static mfn_t sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
+
+// XXX -- this function should probably be moved to shadow2-common.c, but that
+// probably wants to wait until the shadow types have been moved from
+// shadow2-types.h to shadow2-private.h
+//
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
+{
+ struct domain *d = v->domain;
+ shadow_l4e_t *sl4e;
+
+ sl4e = sh2_map_domain_page(sl4mfn);
+ ASSERT(sl4e != NULL);
+ ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+ &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+ ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
+ shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
+ __PAGE_HYPERVISOR);
+
+ /* Linear mapping */
+ sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
+ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
+
+ if ( shadow2_mode_translate(v->domain) )
+ {
+ /* install domain-specific P2M table */
+ sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
+ shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+ __PAGE_HYPERVISOR);
+ }
+
+ sh2_unmap_domain_page(sl4e);
+}
+#endif
+
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+// For 3-on-3 PV guests, we need to make sure the xen mappings are in
+// place, which means that we need to populate the l2h entry in the l3
+// table.
+
+void sh2_install_xen_entries_in_l2h(struct vcpu *v,
+ mfn_t sl2hmfn)
+{
+ struct domain *d = v->domain;
+ shadow_l2e_t *sl2e;
+ int i;
+
+ sl2e = sh2_map_domain_page(sl2hmfn);
+ ASSERT(sl2e != NULL);
+ ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
+ &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
+ L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ shadow_l2e_from_mfn(
+ page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+ __PAGE_HYPERVISOR);
+
+ /* We don't set up a linear mapping here because we can't until this
+ * l2h is installed in an l3e. sh2_update_linear_entries() handles
+ * the linear mappings when the l3 is loaded. */
+
+ if ( shadow2_mode_translate(d) )
+ {
+ /* Install the domain-specific p2m table */
+ l3_pgentry_t *p2m;
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+ p2m = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+ for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
+ {
+ sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
+ shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
+ __PAGE_HYPERVISOR);
+ }
+ sh2_unmap_domain_page(p2m);
+ }
+
+ sh2_unmap_domain_page(sl2e);
+}
+
+void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
+{
+ shadow_l3e_t *sl3e;
+ guest_l3e_t *gl3e = v->arch.guest_vtable;
+ shadow_l3e_t new_sl3e;
+ gfn_t l2gfn;
+ mfn_t l2gmfn, l2smfn;
+ int r;
+
+ ASSERT(!shadow2_mode_external(v->domain));
+ ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
+ l2gfn = guest_l3e_get_gfn(gl3e[3]);
+ l2gmfn = sh2_gfn_to_mfn(v->domain, gfn_x(l2gfn));
+ l2smfn = get_shadow_status(v, l2gmfn, PGC_SH2_l2h_shadow);
+ if ( !valid_mfn(l2smfn) )
+ {
+ l2smfn = sh2_make_shadow(v, l2gmfn, PGC_SH2_l2h_shadow);
+ }
+ l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
+ ft_prefetch);
+ sl3e = sh2_map_domain_page(sl3mfn);
+ r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
+ sh2_unmap_domain_page(sl3e);
+}
+#endif
+
+
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
+{
+ struct domain *d = v->domain;
+ shadow_l2e_t *sl2e;
+ int i;
+
+ sl2e = sh2_map_domain_page(sl2mfn);
+ ASSERT(sl2e != NULL);
+ ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
+ &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
+ L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ shadow_l2e_from_mfn(
+ page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+ __PAGE_HYPERVISOR);
+
+ /* Linear mapping */
+ sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
+ sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
+
+ if ( shadow2_mode_translate(d) )
+ {
+ /* install domain-specific P2M table */
+ sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
+ shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+ __PAGE_HYPERVISOR);
+ }
+
+ sh2_unmap_domain_page(sl2e);
+}
+#endif
+
+
+
+
+
+/**************************************************************************/
+/* Create a shadow of a given guest page.
+ */
+static mfn_t
+sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+{
+ mfn_t smfn = shadow2_alloc(v->domain, shadow_type, mfn_x(gmfn));
+ SHADOW2_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
+ mfn_x(gmfn), shadow_type, mfn_x(smfn));
+
+ if ( shadow_type != PGC_SH2_guest_root_type )
+ /* Lower-level shadow, not yet linked from a higher level */
+ mfn_to_page(smfn)->up = 0;
+
+ // Create the Xen mappings...
+ if ( !shadow2_mode_external(v->domain) )
+ {
+ switch (shadow_type)
+ {
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+ case PGC_SH2_l4_shadow:
+ sh2_install_xen_entries_in_l4(v, gmfn, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+ case PGC_SH2_l3_shadow:
+ sh2_install_xen_entries_in_l3(v, gmfn, smfn); break;
+ case PGC_SH2_l2h_shadow:
+ sh2_install_xen_entries_in_l2h(v, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+ case PGC_SH2_l2_shadow:
+ sh2_install_xen_entries_in_l2(v, gmfn, smfn); break;
+#endif
+ default: /* Do nothing */ break;
+ }
+ }
+
+ shadow2_promote(v, gmfn, shadow_type);
+ set_shadow2_status(v, gmfn, shadow_type, smfn);
+
+ return smfn;
+}
+
+/* Make a splintered superpage shadow */
+static mfn_t
+make_fl1_shadow(struct vcpu *v, gfn_t gfn)
+{
+ mfn_t smfn = shadow2_alloc(v->domain, PGC_SH2_fl1_shadow,
+ (unsigned long) gfn_x(gfn));
+
+ SHADOW2_DEBUG(MAKE_SHADOW, "(%" SH2_PRI_gfn ")=>%" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(smfn));
+
+ set_fl1_shadow_status(v, gfn, smfn);
+ return smfn;
+}
+
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+mfn_t
+sh2_make_monitor_table(struct vcpu *v)
+{
+
+ ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
+
+#if CONFIG_PAGING_LEVELS == 4
+ {
+ struct domain *d = v->domain;
+ mfn_t m4mfn;
+ m4mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ sh2_install_xen_entries_in_l4(v, m4mfn, m4mfn);
+ /* Remember the level of this table */
+ mfn_to_page(m4mfn)->shadow2_flags = 4;
+#if SHADOW_PAGING_LEVELS < 4
+ // Install a monitor l3 table in slot 0 of the l4 table.
+ // This is used for shadow linear maps.
+ {
+ mfn_t m3mfn;
+ l4_pgentry_t *l4e;
+ m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ mfn_to_page(m3mfn)->shadow2_flags = 3;
+ l4e = sh2_map_domain_page(m4mfn);
+ l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
+ sh2_unmap_domain_page(l4e);
+ }
+#endif /* SHADOW_PAGING_LEVELS < 4 */
+ return m4mfn;
+ }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+ {
+ struct domain *d = v->domain;
+ mfn_t m3mfn, m2mfn;
+ l3_pgentry_t *l3e;
+ l2_pgentry_t *l2e;
+ int i;
+
+ m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ /* Remember the level of this table */
+ mfn_to_page(m3mfn)->shadow2_flags = 3;
+
+ // Install a monitor l2 table in slot 3 of the l3 table.
+ // This is used for all Xen entries, including linear maps
+ m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ mfn_to_page(m2mfn)->shadow2_flags = 2;
+ l3e = sh2_map_domain_page(m3mfn);
+ l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
+ sh2_install_xen_entries_in_l2h(v, m2mfn);
+ /* Install the monitor's own linear map */
+ l2e = sh2_map_domain_page(m2mfn);
+ for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
+ (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
+ : l2e_empty();
+ sh2_unmap_domain_page(l2e);
+ sh2_unmap_domain_page(l3e);
+
+ SHADOW2_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
+ return m3mfn;
+ }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+ {
+ struct domain *d = v->domain;
+ mfn_t m2mfn;
+ m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ sh2_install_xen_entries_in_l2(v, m2mfn, m2mfn);
+ /* Remember the level of this table */
+ mfn_to_page(m2mfn)->shadow2_flags = 2;
+ return m2mfn;
+ }
+
+#else
+#error this should not happen
+#endif /* CONFIG_PAGING_LEVELS */
+}
+#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
+
+/**************************************************************************/
+/* These functions also take a virtual address and return the level-N
+ * shadow table mfn and entry, but they create the shadow pagetables if
+ * they are needed. The fetch type argument tells us whether we are handling
+ * a demand fault (so we know what to do about accessed bits &c).
+ * If the necessary tables are not present in the guest, they return NULL. */
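+/* A caller might use these roughly as follows (a sketch only; "gw" stands
+ * for a completed guest walk of the faulting address):
+ *
+ *     mfn_t sl1mfn;
+ *     shadow_l1e_t *sl1p =
+ *         shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft_demand_write);
+ *
+ * A NULL result means the guest's own pagetables were not present; any
+ * missing intermediate shadow tables are created on the way down. */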
+#if GUEST_PAGING_LEVELS >= 4
+static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl4mfn)
+{
+ /* There is always a shadow of the top level table. Get it. */
+ *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* Reading the top level table is always valid. */
+ return sh2_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl3mfn,
+ fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+ mfn_t sl4mfn;
+ shadow_l4e_t *sl4e;
+ if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
+ /* Get the l4e */
+ sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
+ ASSERT(sl4e != NULL);
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ {
+ *sl3mfn = shadow_l4e_get_mfn(*sl4e);
+ ASSERT(valid_mfn(*sl3mfn));
+ }
+ else
+ {
+ int r;
+ shadow_l4e_t new_sl4e;
+ /* No l3 shadow installed: find and install it. */
+ *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH2_l3_shadow);
+ if ( !valid_mfn(*sl3mfn) )
+ {
+ /* No l3 shadow of this page exists at all: make one. */
+ *sl3mfn = sh2_make_shadow(v, gw->l3mfn, PGC_SH2_l3_shadow);
+ }
+ /* Install the new sl3 table in the sl4e */
+ l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
+ *sl3mfn, &new_sl4e, ft);
+ r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
+ ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh2_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
+#else /* PAE... */
+ /* There is always a shadow of the top level table. Get it. */
+ *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* This next line is important: the shadow l3 table is in an 8k
+ * shadow and we need to return the right mfn of the pair. This call
+ * will set it for us as a side-effect. */
+ (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
+ ASSERT(v->arch.shadow_vtable);
+ return ((shadow_l3e_t *)v->arch.shadow_vtable)
+ + shadow_l3_table_offset(gw->va);
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl2mfn,
+ fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
+ mfn_t sl3mfn = _mfn(INVALID_MFN);
+ shadow_l3e_t *sl3e;
+ if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+ /* Get the l3e */
+ sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
+ ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ {
+ *sl2mfn = shadow_l3e_get_mfn(*sl3e);
+ ASSERT(valid_mfn(*sl2mfn));
+ }
+ else
+ {
+ int r;
+ shadow_l3e_t new_sl3e;
+ /* No l2 shadow installed: find and install it. */
+ *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH2_l2_shadow);
+ if ( !valid_mfn(*sl2mfn) )
+ {
+ /* No l2 shadow of this page exists at all: make one. */
+ *sl2mfn = sh2_make_shadow(v, gw->l2mfn, PGC_SH2_l2_shadow);
+ }
+ /* Install the new sl2 table in the sl3e */
+ l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
+ *sl2mfn, &new_sl3e, ft);
+ r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
+ ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+#if GUEST_PAGING_LEVELS == 3
+ /* Need to sync up the linear maps, as we are about to use them */
+ ASSERT( r & SHADOW2_SET_L3PAE_RECOPY );
+ sh2_pae_recopy(v->domain);
+#endif
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#else /* 32bit... */
+ /* There is always a shadow of the top level table. Get it. */
+ *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* This next line is important: the guest l2 has a 16k
+ * shadow, and we need to return the right mfn of the four. This
+ * call will set it for us as a side-effect. */
+ (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
+ /* Reading the top level table is always valid. */
+ return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#endif
+}
+
+
+static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl1mfn,
+ fetch_type_t ft)
+{
+ mfn_t sl2mfn;
+ shadow_l2e_t *sl2e;
+
+ /* Get the l2e */
+ sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
+ if ( sl2e == NULL ) return NULL;
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ {
+ *sl1mfn = shadow_l2e_get_mfn(*sl2e);
+ ASSERT(valid_mfn(*sl1mfn));
+ }
+ else
+ {
+ shadow_l2e_t new_sl2e;
+ int r, flags = guest_l2e_get_flags(*gw->l2e);
+ /* No l1 shadow installed: find and install it. */
+ if ( !(flags & _PAGE_PRESENT) )
+ return NULL; /* No guest page. */
+ if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
+ {
+ /* Splintering a superpage */
+ gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
+ *sl1mfn = get_fl1_shadow_status(v, l2gfn);
+ if ( !valid_mfn(*sl1mfn) )
+ {
+ /* No fl1 shadow of this superpage exists at all: make one. */
+ *sl1mfn = make_fl1_shadow(v, l2gfn);
+ }
+ }
+ else
+ {
+ /* Shadowing an actual guest l1 table */
+ if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+ *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH2_l1_shadow);
+ if ( !valid_mfn(*sl1mfn) )
+ {
+ /* No l1 shadow of this page exists at all: make one. */
+ *sl1mfn = sh2_make_shadow(v, gw->l1mfn, PGC_SH2_l1_shadow);
+ }
+ }
+ /* Install the new sl1 table in the sl2e */
+ l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
+ *sl1mfn, &new_sl2e, ft);
+ r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
+ ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+ /* This next line is important: in 32-on-PAE and 32-on-64 modes,
+ * the guest l1 table has an 8k shadow, and we need to return
+ * the right mfn of the pair. This call will set it for us as a
+ * side-effect. (In all other cases, it's a no-op and will be
+ * compiled out.) */
+ (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh2_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
+}
+
+
+
+/**************************************************************************/
+/* Destructors for shadow tables:
+ * Unregister the shadow, decrement refcounts of any entries present in it,
+ * and release the memory.
+ *
+ * N.B. These destructors do not clear the contents of the shadows.
+ * This allows us to delay TLB shootdowns until the page is being reused.
+ * See shadow2_alloc() and shadow2_free() for how this is handled.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+void sh2_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l4e_t *sl4e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+ mfn_t gmfn, sl4mfn;
+ int xen_mappings;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l4_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+
+ /* Decrement refcounts of all the old entries */
+ xen_mappings = (!shadow2_mode_external(v->domain));
+ sl4mfn = smfn;
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ {
+ sh2_put_ref(v, shadow_l4e_get_mfn(*sl4e),
+ (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl4e & ~PAGE_MASK));
+ }
+ });
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+#endif
+
+#if GUEST_PAGING_LEVELS >= 3
+void sh2_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l3e_t *sl3e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+ mfn_t gmfn, sl3mfn;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l3_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 3
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ sl3mfn = smfn;
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ sh2_put_ref(v, shadow_l3e_get_mfn(*sl3e),
+ (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl3e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+#endif
+
+
+#if GUEST_PAGING_LEVELS == 3
+static void sh2_destroy_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e)
+/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
+{
+ int i;
+ ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0);
+ for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ )
+ if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT )
+ sh2_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
+ mapped_domain_page_to_maddr(sl3e));
+}
+#endif
+
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh2_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
+/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
+{
+ int i, j;
+ struct pae_l3_bookkeeping *bk;
+
+ ASSERT((mfn_to_page(smfn)->count_info & PGC_SH2_type_mask)
+ == PGC_SH2_l3_pae_shadow);
+ /* The subshadows are split, 64 on each page of the shadow */
+ for ( i = 0; i < 2; i++ )
+ {
+ void *p = sh2_map_domain_page(_mfn(mfn_x(smfn) + i));
+ for ( j = 0; j < 64; j++ )
+ {
+ /* Every second 32-byte region is a bookkeeping entry */
+ bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
+ if ( bk->pinned )
+ sh2_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
+ /* Check whether we've just freed the whole shadow */
+ if ( (mfn_to_page(smfn)->count_info & PGC_SH2_count_mask) == 0 )
+ {
+ sh2_unmap_domain_page(p);
+ return;
+ }
+ }
+ sh2_unmap_domain_page(p);
+ }
+}
+#endif
+
+void sh2_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l2e_t *sl2e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+ mfn_t gmfn, sl2mfn;
+ int xen_mappings;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l2_shadow
+ || t == PGC_SH2_l2h_pae_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 2
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ sl2mfn = smfn;
+ xen_mappings = (!shadow2_mode_external(v->domain) &&
+ ((GUEST_PAGING_LEVELS == 2) ||
+ ((GUEST_PAGING_LEVELS == 3) &&
+ (t == PGC_SH2_l2h_pae_shadow))));
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ sh2_put_ref(v, shadow_l2e_get_mfn(*sl2e),
+ (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl2e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+
+void sh2_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
+{
+ struct domain *d = v->domain;
+ shadow_l1e_t *sl1e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l1_shadow || t == PGC_SH2_fl1_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ if ( t == PGC_SH2_fl1_shadow )
+ {
+ gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_fl1_shadow_status(v, gfn, smfn);
+ }
+ else
+ {
+ mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+ }
+
+ if ( shadow2_mode_refcounts(d) )
+ {
+ /* Decrement refcounts of all the old entries */
+ mfn_t sl1mfn = smfn;
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
+ if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT )
+ shadow2_put_page_from_l1e(*sl1e, d);
+ });
+ }
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+void sh2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+ struct domain *d = v->domain;
+ ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH2_type_mask)
+ == PGC_SH2_monitor_table);
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
+ /* Need to destroy the l3 monitor page in slot 0 too */
+ {
+ l4_pgentry_t *l4e = sh2_map_domain_page(mmfn);
+ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+ shadow2_free(d, _mfn(l4e_get_pfn(l4e[0])));
+ sh2_unmap_domain_page(l4e);
+ }
+#elif CONFIG_PAGING_LEVELS == 3
+ /* Need to destroy the l2 monitor page in slot 4 too */
+ {
+ l3_pgentry_t *l3e = sh2_map_domain_page(mmfn);
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ shadow2_free(d, _mfn(l3e_get_pfn(l3e[3])));
+ sh2_unmap_domain_page(l3e);
+ }
+#endif
+
+ /* Put the memory back in the pool */
+ shadow2_free(d, mmfn);
+}
+#endif
+
+/**************************************************************************/
+/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
+ * These are called from common code when we are running out of shadow
+ * memory, and unpinning all the top-level shadows hasn't worked.
+ *
+ * This implementation is pretty crude and slow, but we hope that it won't
+ * be called very often. */
+
+#if GUEST_PAGING_LEVELS == 2
+
+void sh2_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
+{
+ shadow_l2e_t *sl2e;
+ int xen_mappings = !shadow2_mode_external(v->domain);
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+}
+
+#elif GUEST_PAGING_LEVELS == 3
+
+void sh2_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
+/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
+{
+ shadow_l3e_t *sl3e;
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
+ mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
+ if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask)
+ == PGC_SH2_l2h_pae_shadow )
+ {
+ /* High l2: need to pick particular l2es to unhook */
+ shadow_l2e_t *sl2e;
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+ }
+ else
+ {
+ /* Normal l2: can safely unhook the whole l3e */
+ (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+ }
+ }
+ });
+ /* We've changed PAE L3 entries: must sync up various copies of them */
+ sh2_pae_recopy(v->domain);
+}
+
+#elif GUEST_PAGING_LEVELS == 4
+
+void sh2_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
+{
+ shadow_l4e_t *sl4e;
+ int xen_mappings = !shadow2_mode_external(v->domain);
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+ (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ });
+}
+
+#endif
+
+/**************************************************************************/
+/* Internal translation functions.
+ * These functions require a pointer to the shadow entry that will be updated.
+ */
+
+/* These functions take a new guest entry, translate it to shadow and write
+ * the shadow entry.
+ *
+ * They return the same bitmaps as the shadow_set_lXe() functions.
+ */
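+/* (For reference: the result bits tested elsewhere in this file are
+ * SHADOW2_SET_ERROR, SHADOW2_SET_CHANGED and, for PAE l3 writes,
+ * SHADOW2_SET_L3PAE_RECOPY; each validate_glXe() ORs them into its
+ * return value, so a single call may report several conditions.) */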
+
+#if GUEST_PAGING_LEVELS >= 4
+static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
+{
+ shadow_l4e_t new_sl4e;
+ guest_l4e_t *new_gl4e = new_ge;
+ shadow_l4e_t *sl4p = se;
+ mfn_t sl3mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl4e_calls);
+
+ if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
+ {
+ gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
+ mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
+ if ( valid_mfn(gl3mfn) )
+ sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH2_l3_shadow);
+ else
+ result |= SHADOW2_SET_ERROR;
+ }
+ l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
+ sl3mfn, &new_sl4e, ft_prefetch);
+ result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
+{
+ shadow_l3e_t new_sl3e;
+ guest_l3e_t *new_gl3e = new_ge;
+ shadow_l3e_t *sl3p = se;
+ mfn_t sl2mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl3e_calls);
+
+ if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
+ {
+ gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
+ mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
+ if ( valid_mfn(gl2mfn) )
+ sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH2_l2_shadow);
+ else
+ result |= SHADOW2_SET_ERROR;
+ }
+ l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
+ sl2mfn, &new_sl3e, ft_prefetch);
+ result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
+
+#if GUEST_PAGING_LEVELS == 3
+ /* We have changed a PAE l3 entry: need to sync up the possible copies
+ * of it */
+ if ( result & SHADOW2_SET_L3PAE_RECOPY )
+ sh2_pae_recopy(v->domain);
+#endif
+
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
+{
+ shadow_l2e_t new_sl2e;
+ guest_l2e_t *new_gl2e = new_ge;
+ shadow_l2e_t *sl2p = se;
+ mfn_t sl1mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl2e_calls);
+
+ if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
+ {
+ gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
+ if ( guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
+ {
+ // superpage -- need to look up the shadow L1 which holds the
+ // splitters...
+ sl1mfn = get_fl1_shadow_status(v, gl1gfn);
+#if 0
+ // XXX - it's possible that we want to do some kind of prefetch
+ // for superpage fl1's here, but this is *not* on the demand path,
+ // so we'll hold off trying that for now...
+ //
+ if ( !valid_mfn(sl1mfn) )
+ sl1mfn = make_fl1_shadow(v, gl1gfn);
+#endif
+ }
+ else
+ {
+ mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
+ if ( valid_mfn(gl1mfn) )
+ sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH2_l1_shadow);
+ else
+ result |= SHADOW2_SET_ERROR;
+ }
+ }
+ l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
+ sl1mfn, &new_sl2e, ft_prefetch);
+ result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
+
+ return result;
+}
+
+static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
+{
+ shadow_l1e_t new_sl1e;
+ guest_l1e_t *new_gl1e = new_ge;
+ shadow_l1e_t *sl1p = se;
+ gfn_t gfn;
+ mfn_t mfn;
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl1e_calls);
+
+ gfn = guest_l1e_get_gfn(*new_gl1e);
+ mfn = vcpu_gfn_to_mfn(v, gfn);
+
+ l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e,
+ /* mmio? */ !valid_mfn(mfn));
+
+ result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+ return result;
+}
+
+
+/**************************************************************************/
+/* Functions which translate and install the shadows of arbitrary guest
+ * entries that we have just seen the guest write. */
+
+
+static inline int
+sh2_map_and_validate(struct vcpu *v, mfn_t gmfn,
+ void *new_gp, u32 size, u32 sh_type,
+ u32 (*shadow_index)(mfn_t *smfn, u32 idx),
+ int (*validate_ge)(struct vcpu *v, void *ge,
+ mfn_t smfn, void *se))
+/* Generic function for mapping and validating. */
+{
+ mfn_t smfn, smfn2, map_mfn;
+ shadow_l1e_t *sl1p;
+ u32 shadow_idx, guest_idx;
+ int result = 0;
+
+ /* Align address and size to guest entry boundaries */
+ size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
+ new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
+ size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
+ ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
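+ /* (Worked example, illustrative only: with sizeof (guest_l1e_t) == 8,
+ * a 6-byte write at page offset 0xff4 straddles two l1es; new_gp is
+ * rounded down to offset 0xff0 and size rounded up to 16, so the loop
+ * below revalidates both entries.) */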
+
+ /* Map the shadow page */
+ smfn = get_shadow_status(v, gmfn, sh_type);
+ ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
+ guest_idx = guest_index(new_gp);
+ map_mfn = smfn;
+ shadow_idx = shadow_index(&map_mfn, guest_idx);
+ sl1p = map_shadow_page(map_mfn);
+
+ /* Validate one entry at a time */
+ while ( size )
+ {
+ smfn2 = smfn;
+ guest_idx = guest_index(new_gp);
+ shadow_idx = shadow_index(&smfn2, guest_idx);
+ if ( mfn_x(smfn2) != mfn_x(map_mfn) )
+ {
+ /* We have moved to another page of the shadow */
+ map_mfn = smfn2;
+ unmap_shadow_page(sl1p);
+ sl1p = map_shadow_page(map_mfn);
+ }
+ result |= validate_ge(v,
+ new_gp,
+ map_mfn,
+ &sl1p[shadow_idx]);
+ size -= sizeof(guest_l1e_t);
+ new_gp += sizeof(guest_l1e_t);
+ }
+ unmap_shadow_page(sl1p);
+ return result;
+}
+
+
+int
+sh2_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
+ void *new_gl4p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 4
+ return sh2_map_and_validate(v, gl4mfn, new_gl4p, size,
+ PGC_SH2_l4_shadow,
+ shadow_l4_index,
+ validate_gl4e);
+#else // ! GUEST_PAGING_LEVELS >= 4
+ SHADOW2_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
+ void *new_gl3p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 3
+ return sh2_map_and_validate(v, gl3mfn, new_gl3p, size,
+ PGC_SH2_l3_shadow,
+ shadow_l3_index,
+ validate_gl3e);
+#else // ! GUEST_PAGING_LEVELS >= 3
+ SHADOW2_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+ return sh2_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH2_l2_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+}
+
+int
+sh2_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+#if GUEST_PAGING_LEVELS == 3
+ return sh2_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH2_l2h_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+#else /* Non-PAE guests don't have different kinds of l2 table */
+ SHADOW2_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
+ void *new_gl1p, u32 size)
+{
+ return sh2_map_and_validate(v, gl1mfn, new_gl1p, size,
+ PGC_SH2_l1_shadow,
+ shadow_l1_index,
+ validate_gl1e);
+}
+
+
+/**************************************************************************/
+/* Optimization: If we see two emulated writes of zeros to the same
+ * page-table without another kind of page fault in between, we guess
+ * that this is a batch of changes (for process destruction) and
+ * unshadow the page so we don't take a pagefault on every entry. This
+ * should also make finding writeable mappings of pagetables much
+ * easier. */
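+/* (Concretely: check_for_early_unshadow() below acts when two emulated
+ * zero-writes in a row hit the same pagetable gmfn -- e.g. a guest
+ * zeroing a whole pagetable while tearing down a process -- while
+ * reset_early_unshadow() clears the tracked mfn whenever another kind
+ * of fault is handled.) */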
+
+/* Look to see if this is the second emulated write in a row to this
+ * page, and unshadow/unhook if it is */
+static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
+{
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+ if ( v->arch.last_emulated_mfn == mfn_x(gmfn) &&
+ sh2_mfn_is_a_page_table(gmfn) )
+ {
+ u32 flags = mfn_to_page(gmfn)->shadow2_flags;
+ mfn_t smfn;
+ if ( !(flags & (SH2F_L2_32|SH2F_L3_PAE|SH2F_L4_64)) )
+ {
+ perfc_incrc(shadow2_early_unshadow);
+ sh2_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
+ return;
+ }
+ /* SH2F_unhooked_mappings is set to make sure we only unhook
+ * once in a single batch of updates. It is reset when this
+ * top-level page is loaded into CR3 again */
+ if ( !(flags & SH2F_unhooked_mappings) )
+ {
+ perfc_incrc(shadow2_early_unshadow_top);
+ mfn_to_page(gmfn)->shadow2_flags |= SH2F_unhooked_mappings;
+ if ( flags & SH2F_L2_32 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_l2_32_shadow);
+ shadow2_unhook_mappings(v, smfn);
+ }
+ if ( flags & SH2F_L3_PAE )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_l3_pae_shadow);
+ shadow2_unhook_mappings(v, smfn);
+ }
+ if ( flags & SH2F_L4_64 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_l4_64_shadow);
+ shadow2_unhook_mappings(v, smfn);
+ }
+ }
+ }
+ v->arch.last_emulated_mfn = mfn_x(gmfn);
+#endif
+}
+
+/* Stop counting towards early unshadows, as we've seen a real page fault */
+static inline void reset_early_unshadow(struct vcpu *v)
+{
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+ v->arch.last_emulated_mfn = INVALID_MFN;
+#endif
+}
+
+
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults. Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+
+static int sh2_page_fault(struct vcpu *v,
+ unsigned long va,
+ struct cpu_user_regs *regs)
+{
+ struct domain *d = v->domain;
+ walk_t gw;
+ u32 accumulated_gflags;
+ gfn_t gfn;
+ mfn_t gmfn, sl1mfn=_mfn(0);
+ shadow_l1e_t sl1e, *ptr_sl1e;
+ paddr_t gpa;
+ struct cpu_user_regs emul_regs;
+ struct x86_emulate_ctxt emul_ctxt;
+ int r, mmio;
+ fetch_type_t ft = 0;
+
+ //
+ // XXX: Need to think about eventually mapping superpages directly in the
+ // shadow (when possible), as opposed to splintering them into a
+ // bunch of 4K maps.
+ //
+
+ SHADOW2_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
+ v->domain->domain_id, v->vcpu_id, va, regs->error_code);
+
+ shadow2_lock(d);
+
+ shadow2_audit_tables(v);
+
+ if ( guest_walk_tables(v, va, &gw, 1) != 0 )
+ {
+ SHADOW2_PRINTK("malformed guest pagetable!");
+ print_gw(&gw);
+ }
+
+ sh2_audit_gw(v, &gw);
+
+ // We do not look at the gw->l1e, as that will not exist for superpages.
+ // Instead, we use the gw->eff_l1e...
+ //
+ // We need not check all the levels of the guest page table entries for
+ // present vs not-present, as the eff_l1e will always be not present if
+ // one of the higher level entries is not present.
+ //
+ if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
+ {
+ if ( hvm_guest(v) && !shadow2_vcpu_mode_translate(v) )
+ {
+ /* Not present in p2m map, means this is mmio */
+ gpa = va;
+ goto mmio;
+ }
+
+ perfc_incrc(shadow2_fault_bail_not_present);
+ goto not_a_shadow_fault;
+ }
+
+ // All levels of the guest page table are now known to be present.
+ accumulated_gflags = accumulate_guest_flags(&gw);
+
+ // Check for attempts to access supervisor-only pages from user mode,
+ // i.e. ring 3. Such errors are not caused or dealt with by the shadow
+ // code.
+ //
+ if ( (regs->error_code & X86_PFEC_SUPERVISOR_FAULT) &&
+ !(accumulated_gflags & _PAGE_USER) )
+ {
+ /* illegal user-mode access to supervisor-only page */
+ perfc_incrc(shadow2_fault_bail_user_supervisor);
+ goto not_a_shadow_fault;
+ }
+
+ // Was it a write fault?
+ //
+ if ( regs->error_code & X86_PFEC_WRITE_FAULT )
+ {
+ if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
+ {
+ perfc_incrc(shadow2_fault_bail_ro_mapping);
+ goto not_a_shadow_fault;
+ }
+ }
+ else // must have been either an insn fetch or read fault
+ {
+ // Check for NX bit violations: attempts to execute code that is
+ // marked "do not execute". Such errors are not caused or dealt with
+ // by the shadow code.
+ //
+ if ( regs->error_code & X86_PFEC_INSN_FETCH_FAULT )
+ {
+ if ( accumulated_gflags & _PAGE_NX_BIT )
+ {
+ /* NX prevented this code fetch */
+ perfc_incrc(shadow2_fault_bail_nx);
+ goto not_a_shadow_fault;
+ }
+ }
+ }
+
+ /* Is this an MMIO access? */
+ gfn = guest_l1e_get_gfn(gw.eff_l1e);
+ mmio = ( hvm_guest(v)
+ && shadow2_vcpu_mode_translate(v)
+ && mmio_space(gfn_to_paddr(gfn)) );
+
+ /* For MMIO, the shadow holds the *gfn*; for normal accesses, it holds
+ * the equivalent mfn. */
+ if ( mmio )
+ gmfn = _mfn(gfn_x(gfn));
+ else
+ {
+ gmfn = vcpu_gfn_to_mfn(v, gfn);
+ if ( !valid_mfn(gmfn) )
+ {
+ perfc_incrc(shadow2_fault_bail_bad_gfn);
+ SHADOW2_PRINTK("BAD gfn=%"SH2_PRI_gfn" gmfn=%"SH2_PRI_mfn"\n",
+ gfn_x(gfn), mfn_x(gmfn));
+ goto not_a_shadow_fault;
+ }
+ }
+
+ /* Make sure there is enough free shadow memory to build a chain of
+ * shadow tables: one SHADOW2_MAX_ORDER chunk will always be enough
+ * to allocate all we need. (We never allocate a top-level shadow
+ * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
+ shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+
+ /* Acquire the shadow. This must happen before we figure out the rights
+ * for the shadow entry, since we might promote a page here. */
+ // XXX -- this code will need to change somewhat if/when the shadow code
+ // can directly map superpages...
+ ft = ((regs->error_code & X86_PFEC_WRITE_FAULT)
+ ? ft_demand_write : ft_demand_read);
+ ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
+ ASSERT(ptr_sl1e);
+
+ /* Calculate the shadow entry */
+ if ( ft == ft_demand_write )
+ {
+ if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow2_fault_emulate_write);
+ goto emulate;
+ }
+ }
+ else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow2_fault_emulate_read);
+ goto emulate;
+ }
+
+ /* Quick sanity check: we never make an MMIO entry that's got the
+ * _PAGE_PRESENT flag set in it. */
+ ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
+
+ r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+
+ if ( mmio )
+ {
+ gpa = guest_walk_to_gpa(&gw);
+ goto mmio;
+ }
+
+#if 0
+ if ( !(r & SHADOW2_SET_CHANGED) )
+ debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH2_PRI_pte
+ ") did not change anything\n",
+ __func__, gw.va, l1e_get_intpte(sl1e));
+#endif
+
+ perfc_incrc(shadow2_fault_fixed);
+ d->arch.shadow_fault_count++;
+ reset_early_unshadow(v);
+
+ done:
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW2_PRINTK("fixed\n");
+ shadow2_audit_tables(v);
+ shadow2_unlock(d);
+ return EXCRET_fault_fixed;
+
+ emulate:
+
+ /* Take the register set we were called with */
+ emul_regs = *regs;
+ if ( hvm_guest(v) )
+ {
+ /* Add the guest's segment selectors, rip, rsp, rflags */
+ hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
+ }
+ emul_ctxt.regs = &emul_regs;
+ emul_ctxt.cr2 = va;
+ emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
+
+ SHADOW2_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
+
+ v->arch.shadow2_propagate_fault = 0;
+ if ( x86_emulate_memop(&emul_ctxt, &shadow2_emulator_ops) )
+ {
+ SHADOW2_PRINTK("emulator failure, unshadowing mfn %#lx\n",
+ mfn_x(gmfn));
+ perfc_incrc(shadow2_fault_emulate_failed);
+ /* If this is actually a page table, then we have a bug, and need
+ * to support more operations in the emulator. More likely,
+ * though, this is a hint that this page should not be shadowed. */
+ shadow2_remove_all_shadows(v, gmfn);
+ /* This means that actual missing operations will cause the
+ * guest to loop on the same page fault. */
+ goto done;
+ }
+ if ( v->arch.shadow2_propagate_fault )
+ {
+ /* Emulation triggered another page fault */
+ goto not_a_shadow_fault;
+ }
+
+ /* Emulator has changed the user registers: write back */
+ if ( hvm_guest(v) )
+ {
+ /* Write back the guest's segment selectors, rip, rsp, rflags */
+ hvm_load_cpu_guest_regs(v, &emul_regs);
+ /* And don't overwrite those in the caller's regs. */
+ emul_regs.eip = regs->eip;
+ emul_regs.cs = regs->cs;
+ emul_regs.eflags = regs->eflags;
+ emul_regs.esp = regs->esp;
+ emul_regs.ss = regs->ss;
+ emul_regs.es = regs->es;
+ emul_regs.ds = regs->ds;
+ emul_regs.fs = regs->fs;
+ emul_regs.gs = regs->gs;
+ }
+ *regs = emul_regs;
+
+ goto done;
+
+ mmio:
+ perfc_incrc(shadow2_fault_mmio);
+ if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
+ {
+ /* Need to deal with these disabled-APIC accesses, as
+ * handle_mmio() apparently does not currently do that. */
+ /* TJD: What about it, then? For now, I'm turning this BUG()
+ * into a domain_crash() since we don't want to kill Xen. */
+ SHADOW2_ERROR("disabled-APIC access: not supported\n.");
+ domain_crash(d);
+ }
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW2_PRINTK("mmio\n");
+ shadow2_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow2_unlock(d);
+ sh2_log_mmio(v, gpa);
+ handle_mmio(va, gpa);
+ return EXCRET_fault_fixed;
+
+ not_a_shadow_fault:
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW2_PRINTK("not a shadow fault\n");
+ shadow2_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow2_unlock(d);
+ return 0;
+}
+
+
+static int
+sh2_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg. Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+ shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va);
+
+ // XXX -- might be a good thing to prefetch the va into the shadow
+
+ // no need to flush anything if there's no SL2...
+ //
+ if ( !ptr_sl2e )
+ return 0;
+
+ // If there's nothing shadowed for this particular sl2e, then
+ // there is no need to do an invlpg, either...
+ //
+ if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) )
+ return 0;
+
+ // Check to see if the SL2 is a splintered superpage...
+ // If so, then we'll need to flush the entire TLB (because that's
+ // easier than invalidating all of the individual 4K pages).
+ //
+ if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info &
+ PGC_SH2_type_mask) == PGC_SH2_fl1_shadow )
+ {
+ local_flush_tlb();
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned long
+sh2_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+{
+ walk_t gw;
+ gfn_t gfn;
+
+ guest_walk_tables(v, va, &gw, 0);
+ gfn = guest_walk_to_gfn(&gw);
+ unmap_walk(v, &gw);
+
+ return gfn_x(gfn);
+}
+
+
+static unsigned long
+sh2_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to the guest physical
+ * address that the *guest* pagetables would map it to. */
+{
+ unsigned long gfn = sh2_gva_to_gfn(v, va);
+ if ( gfn == INVALID_GFN )
+ return 0;
+ else
+ return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
+}
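+/* (Example, for illustration only: with 4k pages, a va with page offset
+ * 0xa10 whose gfn translates to 0x1234 yields gpa
+ * (0x1234 << PAGE_SHIFT) | 0xa10 = 0x1234a10.) */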
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow2-common.c?
+//
+/* returns a lowmem machine address of the copied HVM L3 root table
+ * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
+ * otherwise blank out any entries with reserved bits in them. */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long
+hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
+{
+ int i, f;
+ int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
+ l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
+ for ( i = 0; i < 4; i++ )
+ {
+ f = l3e_get_flags(l3tab[i]);
+ if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) )
+ new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res);
+ else
+ new_l3e = l3e_empty();
+ safe_write_entry(&copy[i], &new_l3e);
+ }
+ return __pa(copy);
+}
+#endif
+
+
+static inline void
+sh2_update_linear_entries(struct vcpu *v)
+/* Sync up all the linear mappings for this vcpu's pagetables */
+{
+ struct domain *d = v->domain;
+
+ /* Linear pagetables in PV guests
+ * ------------------------------
+ *
+ * Guest linear pagetables, which map the guest pages, are at
+ * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
+ * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
+ * are set up at shadow creation time, but (of course!) the PAE case
+ * is subtler. Normal linear mappings are made by having an entry
+ * in the top-level table that points to itself (shadow linear) or
+ * to the guest top-level table (guest linear). For PAE, to set up
+ * a linear map requires us to copy the four top-level entries into
+ * level-2 entries. That means that every time we change a PAE l3e,
+ * we need to reflect the change into the copy.
+ *
+ * Linear pagetables in HVM guests
+ * -------------------------------
+ *
+ * For HVM guests, the linear pagetables are installed in the monitor
+ * tables (since we can't put them in the shadow). Shadow linear
+ * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
+ * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
+ * a linear pagetable of the monitor tables themselves. We have
+ * the same issue of having to re-copy PAE l3 entries whenever we use
+ * PAE shadows.
+ *
+ * Because HVM guests run on the same monitor tables regardless of the
+ * shadow tables in use, the linear mapping of the shadow tables has to
+ * be updated every time v->arch.shadow_table changes.
+ */
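+ /* (Sketch of the PAE copy described above, as performed by the
+ * CONFIG_PAGING_LEVELS == 3 branch below: the four l3 entries are
+ * rewritten as four consecutive l2 entries, starting at
+ * l2_table_offset(LINEAR_PT_VIRT_START) for the guest linear map and
+ * at l2_table_offset(SH_LINEAR_PT_VIRT_START) for the shadow one.) */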
+
+ /* Don't try to update the monitor table if it doesn't exist */
+ if ( shadow2_mode_external(d)
+ && pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ return;
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
+
+ /* For PV, one l4e points at the guest l4, one points at the shadow
+ * l4. No maintenance required.
+ * For HVM, just need to update the l4e that points to the shadow l4. */
+
+ if ( shadow2_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh2_unmap_domain_page(ml4e);
+ }
+ }
+
+#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
+
+ /* This case only exists in HVM. To give ourselves a linear map of the
+ * shadows, we need to extend a PAE shadow to 4 levels. We do this by
+ * having a monitor l3 in slot 0 of the monitor l4 table, and
+ * copying the PAE l3 entries into it. Then, by having the monitor l4e
+ * for shadow pagetables also point to the monitor l4, we can use it
+ * to access the shadows. */
+
+ if ( shadow2_mode_external(d) )
+ {
+ /* Install copies of the shadow l3es into the monitor l3 table.
+ * The monitor l3 table is hooked into slot 0 of the monitor
+ * l4 table, so we use l3 linear indices 0 to 3 */
+ shadow_l3e_t *sl3e;
+ l3_pgentry_t *ml3e;
+ mfn_t l3mfn;
+ int i;
+
+ /* Use linear mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ ml3e = __linear_l3_table;
+ l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = v->arch.shadow_vtable;
+#endif
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
+ l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
+ ml3e = sh2_map_domain_page(l3mfn);
+ sh2_unmap_domain_page(ml4e);
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
+#endif
+ }
+
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ ml3e[i] =
+ (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
+ ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
+ __PAGE_HYPERVISOR)
+ : l3e_empty();
+ }
+
+ if ( v != current )
+ {
+ sh2_unmap_domain_page(ml3e);
+#if GUEST_PAGING_LEVELS != 2
+ sh2_unmap_domain_page(sl3e);
+#endif
+ }
+ }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+ /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
+ * entries in the shadow, and the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the shadow. This is safe to do
+ * because Xen does not let guests share high-slot l2 tables between l3s,
+ * so we know we're not treading on anyone's toes.
+ *
+ * HVM: need to copy the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the monitor table. This is safe
+ * because we have one monitor table for each vcpu. The monitor's
+ * own l3es don't need to be copied because they never change.
+ * XXX That might change if we start stuffing things into the rest
+ * of the monitor's virtual address space.
+ */
+ {
+ l2_pgentry_t *l2e, new_l2e;
+ shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
+ int i;
+
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables were built by update_cr3 */
+ if ( shadow2_mode_external(d) )
+ shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ else
+ BUG(); /* PV 2-on-3 is not supported yet */
+
+#else /* GUEST_PAGING_LEVELS == 3 */
+
+ /* Use local vcpu's mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ shadow_l3e = v->arch.shadow_vtable;
+ if ( !shadow2_mode_external(d) )
+ guest_l3e = v->arch.guest_vtable;
+ }
+ else
+ {
+ mfn_t smfn;
+ int idx;
+
+ /* Map the shadow l3 */
+ smfn = pagetable_get_mfn(v->arch.shadow_table);
+ idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
+ shadow_l3e = sh2_map_domain_page(smfn);
+ shadow_l3e += idx;
+ if ( !shadow2_mode_external(d) )
+ {
+ /* Also the guest l3 */
+ mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table);
+ guest_l3e = sh2_map_domain_page(gmfn);
+ guest_l3e += guest_index(v->arch.guest_vtable);
+ }
+ }
+#endif /* GUEST_PAGING_LEVELS */
+
+ /* Choose where to write the entries, using linear maps if possible */
+ if ( v == current && shadow2_mode_external(d) )
+ {
+ /* From the monitor tables, it's safe to use linear maps to update
+ * monitor l2s */
+ l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
+ }
+ else if ( shadow2_mode_external(d) )
+ {
+ /* Map the monitor table's high l2 */
+ l3_pgentry_t *l3e;
+ l3e = sh2_map_domain_page(
+ pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
+ sh2_unmap_domain_page(l3e);
+ }
+ else
+ {
+ /* Map the shadow table's high l2 */
+ ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
+ l2e = sh2_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
+ }
+
+
+ if ( !shadow2_mode_external(d) )
+ {
+ /* Write linear mapping of guest. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+ }
+
+ /* Write linear mapping of shadow. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+
+ if ( v != current || !shadow2_mode_external(d) )
+ sh2_unmap_domain_page(l2e);
+
+#if GUEST_PAGING_LEVELS == 3
+ if ( v != current)
+ {
+ sh2_unmap_domain_page(shadow_l3e);
+ if ( !shadow2_mode_external(d) )
+ sh2_unmap_domain_page(guest_l3e);
+ }
+#endif
+ }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+ /* For PV, one l2e points at the guest l2, one points at the shadow
+ * l2. No maintenance required.
+ * For HVM, just need to update the l2e that points to the shadow l2. */
+
+ if ( shadow2_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l2_pgentry_t *ml2e;
+ ml2e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh2_unmap_domain_page(ml2e);
+ }
+ }
+
+#else
+#error this should not happen
+#endif
+}
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow2-common.c?
+//
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh2_pae_recopy(struct domain *d)
+/* Called whenever we write to the l3 entries of a PAE pagetable which
+ * is currently in use. Each vcpu that is using the table needs to
+ * resync its copies of the l3s in linear maps and any low-memory
+ * copies it might have made for fitting into 32bit CR3.
+ * Since linear maps are also resynced when we change CR3, we don't
+ * need to worry about changes to PAE l3es that are not currently in use. */
+{
+ struct vcpu *v;
+ cpumask_t flush_mask = CPU_MASK_NONE;
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ for_each_vcpu(d, v)
+ {
+ if ( !v->arch.shadow2_pae_flip_pending )
+ continue;
+
+ cpu_set(v->processor, flush_mask);
+
+ SHADOW2_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
+
+ /* This vcpu has a copy in its linear maps */
+ sh2_update_linear_entries(v);
+ if ( hvm_guest(v) )
+ {
+ /* This vcpu has a copy in its HVM PAE l3 */
+ v->arch.hvm_vcpu.hw_cr3 =
+ hvm_pae_copy_root(v, v->arch.shadow_vtable,
+ !shadow2_vcpu_mode_translate(v));
+ }
+#if CONFIG_PAGING_LEVELS == 3
+ else
+ {
+ /* This vcpu might have copied the l3 to below 4GB */
+ if ( v->arch.cr3 >> PAGE_SHIFT
+ != pagetable_get_pfn(v->arch.shadow_table) )
+ {
+ /* Recopy to where that copy is. */
+ int i;
+ l3_pgentry_t *dst, *src;
+ dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
+ src = v->arch.shadow_vtable;
+ for ( i = 0 ; i < 4 ; i++ )
+ safe_write_entry(dst + i, src + i);
+ }
+ }
+#endif
+ v->arch.shadow2_pae_flip_pending = 0;
+ }
+
+ flush_tlb_mask(flush_mask);
+}
+#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
+
+
+/* removes:
+ * vcpu->arch.guest_vtable
+ * vcpu->arch.shadow_table
+ * vcpu->arch.shadow_vtable
+ * Does all appropriate management/bookkeeping/refcounting/etc...
+ */
+static void
+sh2_detach_old_tables(struct vcpu *v)
+{
+ mfn_t smfn;
+
+ ////
+ //// vcpu->arch.guest_vtable
+ ////
+ if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+ v->arch.guest_vtable )
+ {
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ sh2_unmap_domain_page_global(v->arch.guest_vtable);
+ v->arch.guest_vtable = NULL;
+ }
+
+ ////
+ //// vcpu->arch.shadow_table
+ ////
+ smfn = pagetable_get_mfn(v->arch.shadow_table);
+ if ( mfn_x(smfn) )
+ {
+ ASSERT(v->arch.shadow_vtable);
+
+#if GUEST_PAGING_LEVELS == 3
+ // PAE guests do not (necessarily) use an entire page for their
+ // 4-entry L3s, so we have to deal with them specially.
+ //
+ sh2_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
+#else
+ sh2_put_ref(v, smfn, 0);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ {
+ struct pae_l3_bookkeeping *info =
+ sl3p_to_info(v->arch.shadow_vtable);
+ ASSERT(test_bit(v->vcpu_id, &info->vcpus));
+ clear_bit(v->vcpu_id, &info->vcpus);
+ }
+#endif
+ v->arch.shadow_table = pagetable_null();
+ }
+
+ ////
+ //// vcpu->arch.shadow_vtable
+ ////
+ if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+ v->arch.shadow_vtable )
+ {
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ //
+ sh2_unmap_domain_page_global(v->arch.shadow_vtable);
+ v->arch.shadow_vtable = NULL;
+ }
+}
+
+static void
+sh2_update_cr3(struct vcpu *v)
+/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
+ * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
+ * if appropriate).
+ * HVM guests should also make sure hvm_get_guest_ctrl_reg(v, 3) returns the new CR3 value...
+ */
+{
+ struct domain *d = v->domain;
+ mfn_t gmfn, smfn;
+#if GUEST_PAGING_LEVELS == 3
+ u32 guest_idx=0;
+#endif
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ ASSERT(v->arch.shadow2);
+
+ ////
+ //// vcpu->arch.guest_table is already set
+ ////
+
+#ifndef NDEBUG
+ /* Double-check that the HVM code has sent us a sane guest_table */
+ if ( hvm_guest(v) )
+ {
+ gfn_t gfn;
+
+ ASSERT(shadow2_mode_external(d));
+
+ // Is paging enabled on this vcpu?
+ if ( shadow2_vcpu_mode_translate(v) )
+ {
+ gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
+ gmfn = vcpu_gfn_to_mfn(v, gfn);
+ ASSERT(valid_mfn(gmfn));
+ ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
+ }
+ else
+ {
+ /* Paging disabled: guest_table points at (part of) p2m */
+#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
+ /* For everything else, they should be the same */
+ ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
+#endif
+ }
+ }
+#endif
+
+ SHADOW2_PRINTK("d=%u v=%u guest_table=%05lx\n",
+ d->domain_id, v->vcpu_id,
+ (unsigned long)pagetable_get_pfn(v->arch.guest_table));
+
+#if GUEST_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ gmfn = pagetable_get_mfn(v->arch.guest_table_user);
+ else
+#endif
+ gmfn = pagetable_get_mfn(v->arch.guest_table);
+
+ sh2_detach_old_tables(v);
+
+ if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+ {
+ ASSERT(v->arch.cr3 == 0);
+ return;
+ }
+
+ ////
+ //// vcpu->arch.guest_vtable
+ ////
+ if ( shadow2_mode_external(d) )
+ {
+#if GUEST_PAGING_LEVELS == 3
+ if ( shadow2_vcpu_mode_translate(v) )
+ /* Paging enabled: find where in the page the l3 table is */
+ guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
+ else
+ /* Paging disabled: l3 is at the start of a page (in the p2m) */
+ guest_idx = 0;
+
+ // Ignore the low 2 bits of guest_idx -- they are really just
+ // cache control.
+ guest_idx &= ~3;
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable =
+ (guest_l3e_t *)sh2_map_domain_page_global(gmfn) + guest_idx;
+#else
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
+#endif
+ }
+ else
+ {
+#ifdef __x86_64__
+ v->arch.guest_vtable = __linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
+#else
+ v->arch.guest_vtable = __linear_l2_table;
+#endif
+ }
+
+#if 0
+ printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
+ __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
+#endif
+
+ ////
+ //// vcpu->arch.shadow_table
+ ////
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_guest_root_type);
+ if ( valid_mfn(smfn) )
+ {
+ /* Pull this root shadow to the front of the list of roots. */
+ list_del(&mfn_to_page(smfn)->list);
+ list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows);
+ }
+ else
+ {
+ /* This guest MFN is a pagetable. Must revoke write access. */
+ if ( shadow2_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0)
+ != 0 )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ /* Make sure there's enough free shadow memory. */
+ shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+ /* Shadow the page. */
+ smfn = sh2_make_shadow(v, gmfn, PGC_SH2_guest_root_type);
+ list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows);
+ }
+ ASSERT(valid_mfn(smfn));
+ v->arch.shadow_table = pagetable_from_mfn(smfn);
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+ /* Once again OK to unhook entries from this table if we see fork/exit */
+ ASSERT(sh2_mfn_is_a_page_table(gmfn));
+ mfn_to_page(gmfn)->shadow2_flags &= ~SH2F_unhooked_mappings;
+#endif
+
+
+ ////
+ //// vcpu->arch.shadow_vtable
+ ////
+ if ( shadow2_mode_external(d) )
+ {
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ mfn_t adjusted_smfn = smfn;
+ u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ v->arch.shadow_vtable =
+ (shadow_l3e_t *)sh2_map_domain_page_global(adjusted_smfn) +
+ shadow_idx;
+#else
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
+#endif
+ }
+ else
+ {
+#if SHADOW_PAGING_LEVELS == 4
+ v->arch.shadow_vtable = __sh2_linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+ // XXX - why does this need a global map?
+ v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
+#else
+ v->arch.shadow_vtable = __sh2_linear_l2_table;
+#endif
+ }
+
+ ////
+ //// Take a ref to the new shadow table, and pin it.
+ ////
+ //
+ // This ref is logically "held" by v->arch.shadow_table entry itself.
+ // Release the old ref.
+ //
+#if GUEST_PAGING_LEVELS == 3
+ // PAE guests do not (necessarily) use an entire page for their
+ // 4-entry L3s, so we have to deal with them specially.
+ //
+ // XXX - might want to revisit this if/when we do multiple compilation for
+ // HVM-vs-PV guests, as PAE PV guests could get away without doing
+ // subshadows.
+ //
+ sh2_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
+ sh2_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
+#else
+ sh2_get_ref(smfn, 0);
+ sh2_pin(smfn);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ // PAE 3-on-3 shadows have to keep track of which vcpus are using
+ // which l3 subshadow, in order to handle the SHADOW2_SET_L3PAE_RECOPY
+ // case from validate_gl3e(). Search for SHADOW2_SET_L3PAE_RECOPY
+ // in the code for more info.
+ //
+ {
+ struct pae_l3_bookkeeping *info =
+ sl3p_to_info(v->arch.shadow_vtable);
+ ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
+ set_bit(v->vcpu_id, &info->vcpus);
+ }
+#endif
+
+ debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
+ __func__, gmfn, smfn);
+
+ ///
+ /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
+ ///
+ if ( shadow2_mode_external(d) )
+ {
+ ASSERT(hvm_guest(v));
+ make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
+
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+#if SHADOW_PAGING_LEVELS != 3
+#error unexpected combination of GUEST and SHADOW paging levels
+#endif
+ /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
+ int i;
+
+ ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
+ virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
+ for (i = 0; i < 4; i++)
+ {
+ v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
+ shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
+ }
+ }
+#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+ /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
+ * If paging is disabled, clear l3e reserved bits; otherwise
+ * remove entries that have reserved bits set. */
+ v->arch.hvm_vcpu.hw_cr3 =
+ hvm_pae_copy_root(v, v->arch.shadow_vtable,
+ !shadow2_vcpu_mode_translate(v));
+#else
+ /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
+ v->arch.hvm_vcpu.hw_cr3 =
+ pagetable_get_paddr(v->arch.shadow_table);
+#endif
+ }
+ else // not shadow2_mode_external...
+ {
+ /* We don't support PV except guest == shadow == config levels */
+ BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
+ make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
+ }
+
+ /* Fix up the linear pagetable mappings */
+ sh2_update_linear_entries(v);
+}
+
+
+/**************************************************************************/
+/* Functions to revoke guest rights */
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+static int sh2_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
+/* Look up this vaddr in the current shadow and see if it's a writeable
+ * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
+{
+ shadow_l1e_t sl1e, *sl1p;
+ shadow_l2e_t *sl2p;
+#if GUEST_PAGING_LEVELS >= 3
+ shadow_l3e_t *sl3p;
+#if GUEST_PAGING_LEVELS >= 4
+ shadow_l4e_t *sl4p;
+#endif
+#endif
+ mfn_t sl1mfn;
+
+
+ /* Carefully look in the shadow linear map for the l1e we expect */
+ if ( v->arch.shadow_vtable == NULL ) return 0;
+#if GUEST_PAGING_LEVELS >= 4
+ sl4p = sh2_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
+ if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
+ return 0;
+ sl3p = sh2_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
+ if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+ return 0;
+#elif GUEST_PAGING_LEVELS == 3
+ sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable)
+ + shadow_l3_linear_offset(vaddr);
+ if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+ return 0;
+#endif
+ sl2p = sh2_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
+ if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
+ return 0;
+ sl1p = sh2_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
+ sl1e = *sl1p;
+ if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
+ != (_PAGE_PRESENT|_PAGE_RW))
+ || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
+ return 0;
+
+ /* Found it! Need to remove its write permissions. */
+ sl1mfn = shadow_l2e_get_mfn(*sl2p);
+ sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
+ shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
+ return 1;
+}
+#endif
+
+int sh2_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
+/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
+{
+ shadow_l1e_t *sl1e;
+ int done = 0;
+ int flags;
+
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+ {
+ flags = shadow_l1e_get_flags(*sl1e);
+ if ( (flags & _PAGE_PRESENT)
+ && (flags & _PAGE_RW)
+ && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
+ {
+ shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+ if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
+ & PGT_count_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+
+int sh2_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
+/* Excises all mappings to guest frame from this shadow l1 table */
+{
+ shadow_l1e_t *sl1e;
+ int done = 0;
+ int flags;
+
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+ {
+ flags = shadow_l1e_get_flags(*sl1e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
+ {
+ shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+ if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+/**************************************************************************/
+/* Functions to excise all pointers to shadows from higher-level shadows. */
+
+void sh2_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
+/* Blank out a single shadow entry */
+{
+ switch (mfn_to_page(smfn)->count_info & PGC_SH2_type_mask)
+ {
+ case PGC_SH2_l1_shadow:
+ shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
+ case PGC_SH2_l2_shadow:
+#if GUEST_PAGING_LEVELS == 3
+ case PGC_SH2_l2h_shadow:
+#endif
+ shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 3
+ case PGC_SH2_l3_shadow:
+ shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 4
+ case PGC_SH2_l4_shadow:
+ shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
+#endif
+#endif
+ default: BUG(); /* Called with the wrong kind of shadow. */
+ }
+}
+
+int sh2_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
+/* Remove all mappings of this l1 shadow from this l2 shadow */
+{
+ shadow_l2e_t *sl2e;
+ int done = 0;
+ int flags;
+#if GUEST_PAGING_LEVELS != 4
+ int xen_mappings = !shadow2_mode_external(v->domain);
+#endif
+
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
+ {
+ flags = shadow_l2e_get_flags(*sl2e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
+ {
+ shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH2_type_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh2_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
+/* Remove all mappings of this l2 shadow from this l3 shadow */
+{
+ shadow_l3e_t *sl3e;
+ int done = 0;
+ int flags;
+
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, done,
+ {
+ flags = shadow_l3e_get_flags(*sl3e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
+ {
+ shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+ if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh2_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
+/* Remove all mappings of this l3 shadow from this l4 shadow */
+{
+ shadow_l4e_t *sl4e;
+ int done = 0;
+ int flags, xen_mappings = !shadow2_mode_external(v->domain);
+
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
+ {
+ flags = shadow_l4e_get_flags(*sl4e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
+ {
+ shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH2_type_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+#endif /* 64bit guest */
+#endif /* PAE guest */
+
+/**************************************************************************/
+/* Handling HVM guest writes to pagetables */
+
+/* Check that the user is allowed to perform this write.
+ * Returns a mapped pointer to write to, and the mfn it's on,
+ * or NULL for error. */
+static inline void * emulate_map_dest(struct vcpu *v,
+ unsigned long vaddr,
+ struct x86_emulate_ctxt *ctxt,
+ mfn_t *mfnp)
+{
+ walk_t gw;
+ u32 flags;
+ gfn_t gfn;
+ mfn_t mfn;
+
+ guest_walk_tables(v, vaddr, &gw, 1);
+ flags = accumulate_guest_flags(&gw);
+ gfn = guest_l1e_get_gfn(gw.eff_l1e);
+ mfn = vcpu_gfn_to_mfn(v, gfn);
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+
+ if ( !(flags & _PAGE_PRESENT)
+ || !(flags & _PAGE_RW)
+ || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
+ {
+ /* This write would have faulted even on bare metal */
+ v->arch.shadow2_propagate_fault = 1;
+ return NULL;
+ }
+
+ if ( !valid_mfn(mfn) )
+ {
+ /* Attempted a write to a bad gfn. This should never happen:
+ * after all, we're here because this write is to a page table. */
+ BUG();
+ }
+
+ ASSERT(sh2_mfn_is_a_page_table(mfn));
+ *mfnp = mfn;
+ return sh2_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
+}
+
+int
+sh2_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
+ u32 bytes, struct x86_emulate_ctxt *ctxt)
+{
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ while ( bytes > 0 )
+ {
+ mfn_t mfn;
+ int bytes_on_page;
+ void *addr;
+
+ bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
+ if ( bytes_on_page > bytes )
+ bytes_on_page = bytes;
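+ /* (Illustration: a 16-byte write starting at page offset 0xff8 is
+ * split into an 8-byte chunk on this page and an 8-byte chunk at the
+ * start of the next, each mapped and validated separately.) */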
+
+ if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+ return X86EMUL_PROPAGATE_FAULT;
+ memcpy(addr, src, bytes_on_page);
+ shadow2_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
+ bytes -= bytes_on_page;
+ /* Advance past the chunk just written, in case the write
+ * straddles a page boundary */
+ vaddr += bytes_on_page;
+ src += bytes_on_page;
+ /* If we are writing zeros to this page, might want to unshadow */
+ if ( *(u8 *)addr == 0 )
+ check_for_early_unshadow(v, mfn);
+ sh2_unmap_domain_page(addr);
+ }
+ shadow2_audit_tables(v);
+ return X86EMUL_CONTINUE;
+}
+
+int
+sh2_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
+ unsigned long old, unsigned long new,
+ unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+{
+ mfn_t mfn;
+ void *addr;
+ unsigned long prev;
+ int rv = X86EMUL_CONTINUE;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ ASSERT(bytes <= sizeof (unsigned long));
+
+ if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+ return X86EMUL_PROPAGATE_FAULT;
+
+ switch (bytes)
+ {
+ case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
+ case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
+ case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
+ case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
+ default:
+ SHADOW2_PRINTK("cmpxchg of size %i is not supported\n", bytes);
+ prev = ~old;
+ }
+
+ if ( (prev == old) )
+ shadow2_validate_guest_pt_write(v, mfn, addr, bytes);
+ else
+ rv = X86EMUL_CMPXCHG_FAILED;
+
+ SHADOW2_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
+ " wanted %#lx now %#lx bytes %u\n",
+ vaddr, prev, old, new, *(unsigned long *)addr, bytes);
+
+ /* If we are writing zeros to this page, might want to unshadow */
+ if ( *(u8 *)addr == 0 )
+ check_for_early_unshadow(v, mfn);
+
+ sh2_unmap_domain_page(addr);
+ shadow2_audit_tables(v);
+ check_for_early_unshadow(v, mfn);
+ return rv;
+}
+
+int
+sh2_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
+ unsigned long old_lo, unsigned long old_hi,
+ unsigned long new_lo, unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt)
+{
+ mfn_t mfn;
+ void *addr;
+ u64 old, new, prev;
+ int rv = X86EMUL_CONTINUE;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+ return X86EMUL_PROPAGATE_FAULT;
+
+ old = (((u64) old_hi) << 32) | (u64) old_lo;
+ new = (((u64) new_hi) << 32) | (u64) new_lo;
+ prev = cmpxchg(((u64 *)addr), old, new);
+
+ if ( (prev == old) )
+ shadow2_validate_guest_pt_write(v, mfn, addr, 8);
+ else
+ rv = X86EMUL_CMPXCHG_FAILED;
+
+ /* If we are writing zeros to this page, might want to unshadow */
+ if ( *(u8 *)addr == 0 )
+ check_for_early_unshadow(v, mfn);
+
+ sh2_unmap_domain_page(addr);
+ shadow2_audit_tables(v);
+ check_for_early_unshadow(v, mfn);
+ return rv;
+}
+
+
+/**************************************************************************/
+/* Audit tools */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+
+#define AUDIT_FAIL(_level, _fmt, _a...) do { \
+ printk("Shadow2 %u-on-%u audit failed at level %i, index %i\n" \
+ "gl" #_level "mfn = %" SH2_PRI_mfn \
+ " sl" #_level "mfn = %" SH2_PRI_mfn \
+ " &gl" #_level "e = %p &sl" #_level "e = %p" \
+ " gl" #_level "e = %" SH2_PRI_gpte \
+ " sl" #_level "e = %" SH2_PRI_pte "\nError: " _fmt "\n", \
+ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
+ _level, guest_index(gl ## _level ## e), \
+ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
+ gl ## _level ## e, sl ## _level ## e, \
+ gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+ ##_a); \
+ BUG(); \
+ done = 1; \
+} while (0)
+
+
+static char * sh2_audit_flags(struct vcpu *v, int level,
+ int gflags, int sflags)
+/* Common code for auditing flag bits */
+{
+ if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
+ return "shadow is present but guest is not present";
+ if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) )
+ return "global bit set in PV shadow";
+ if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
+ && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) )
+ return "dirty bit not propagated";
+ if ( level == 2 && (sflags & _PAGE_PSE) )
+ return "PS bit set in shadow";
+#if SHADOW_PAGING_LEVELS == 3
+ if ( level == 3 ) return NULL; /* All the other bits are blank in PAE l3 */
+#endif
+ if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
+ return "user/supervisor bit does not match";
+ if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
+ return "NX bit does not match";
+ if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
+ return "shadow grants write access but guest does not";
+ if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) )
+ return "accessed bit not propagated";
+ return NULL;
+}
+
+static inline mfn_t
+audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
+/* Convert this gfn to an mfn in the manner appropriate for the
+ * guest pagetable it's used in (gmfn) */
+{
+ if ( !shadow2_mode_translate(v->domain) )
+ return _mfn(gfn_x(gfn));
+
+ if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
+ != PGT_writable_page )
+ return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
+ else
+ return sh2_gfn_to_mfn(v->domain, gfn_x(gfn));
+}
+
+
+int sh2_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+{
+ guest_l1e_t *gl1e, *gp;
+ shadow_l1e_t *sl1e;
+ mfn_t mfn, gmfn, gl1mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+
+ /* Follow the backpointer */
+ gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
+ gl1e = gp = sh2_map_domain_page(gl1mfn);
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
+
+ s = sh2_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
+ shadow_l1e_get_flags(*sl1e));
+ if ( s ) AUDIT_FAIL(1, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l1e_get_gfn(*gl1e);
+ mfn = shadow_l1e_get_mfn(*sl1e);
+ gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(1, "bad translation: gfn %" SH2_PRI_gfn
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return done;
+}
+
+int sh2_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+{
+ guest_l1e_t *gl1e, e;
+ shadow_l1e_t *sl1e;
+ mfn_t gl1mfn = _mfn(INVALID_MFN);
+ int f;
+ int done = 0;
+
+ /* fl1 has no useful backpointer: all we can check are flags */
+ e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
+ f = shadow_l1e_get_flags(*sl1e);
+ f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
+ if ( !(f == 0
+ || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+ _PAGE_ACCESSED|_PAGE_DIRTY)
+ || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
+ AUDIT_FAIL(1, "fl1e has bad flags");
+ });
+ return 0;
+}
+
+int sh2_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
+{
+ guest_l2e_t *gl2e, *gp;
+ shadow_l2e_t *sl2e;
+ mfn_t mfn, gmfn, gl2mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+#if GUEST_PAGING_LEVELS != 4
+ int xen_mappings = !shadow2_mode_external(v->domain);
+#endif
+
+ /* Follow the backpointer */
+ gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
+ gl2e = gp = sh2_map_domain_page(gl2mfn);
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
+
+ s = sh2_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
+ shadow_l2e_get_flags(*sl2e));
+ if ( s ) AUDIT_FAIL(2, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l2e_get_gfn(*gl2e);
+ mfn = shadow_l2e_get_mfn(*sl2e);
+ gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
+ ? get_fl1_shadow_status(v, gfn)
+ : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
+ PGC_SH2_l1_shadow);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(2, "bad translation: gfn %" SH2_PRI_gfn
+ " (--> %" SH2_PRI_mfn ")"
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn),
+ (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
+ : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
+ mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return 0;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh2_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
+{
+ guest_l3e_t *gl3e, *gp;
+ shadow_l3e_t *sl3e;
+ mfn_t mfn, gmfn, gl3mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+
+ /* Follow the backpointer */
+ gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
+ gl3e = gp = sh2_map_domain_page(gl3mfn);
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
+
+ s = sh2_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
+ shadow_l3e_get_flags(*sl3e));
+ if ( s ) AUDIT_FAIL(3, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l3e_get_gfn(*gl3e);
+ mfn = shadow_l3e_get_mfn(*sl3e);
+ gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
+ (GUEST_PAGING_LEVELS == 3
+ && !shadow2_mode_external(v->domain)
+ && (guest_index(gl3e) % 4) == 3)
+ ? PGC_SH2_l2h_pae_shadow
+ : PGC_SH2_l2_shadow);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(3, "bad translation: gfn %" SH2_PRI_gfn
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh2_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
+{
+ guest_l4e_t *gl4e, *gp;
+ shadow_l4e_t *sl4e;
+ mfn_t mfn, gmfn, gl4mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+ int xen_mappings = !shadow2_mode_external(v->domain);
+
+ /* Follow the backpointer */
+ gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
+ gl4e = gp = sh2_map_domain_page(gl4mfn);
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
+ {
+ s = sh2_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
+ shadow_l4e_get_flags(*sl4e));
+ if ( s ) AUDIT_FAIL(4, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l4e_get_gfn(*gl4e);
+ mfn = shadow_l4e_get_mfn(*sl4e);
+ gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
+ PGC_SH2_l3_shadow);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(4, "bad translation: gfn %" SH2_PRI_gfn
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#undef AUDIT_FAIL
+
+#endif /* Audit code */
+
+/**************************************************************************/
+/* Entry points into this mode of the shadow code.
+ * This will all be mangled by the preprocessor to uniquify everything. */
+struct shadow2_entry_points shadow2_entry = {
+ .page_fault = sh2_page_fault,
+ .invlpg = sh2_invlpg,
+ .gva_to_gpa = sh2_gva_to_gpa,
+ .gva_to_gfn = sh2_gva_to_gfn,
+ .update_cr3 = sh2_update_cr3,
+ .map_and_validate_gl1e = sh2_map_and_validate_gl1e,
+ .map_and_validate_gl2e = sh2_map_and_validate_gl2e,
+ .map_and_validate_gl2he = sh2_map_and_validate_gl2he,
+ .map_and_validate_gl3e = sh2_map_and_validate_gl3e,
+ .map_and_validate_gl4e = sh2_map_and_validate_gl4e,
+ .detach_old_tables = sh2_detach_old_tables,
+ .x86_emulate_write = sh2_x86_emulate_write,
+ .x86_emulate_cmpxchg = sh2_x86_emulate_cmpxchg,
+ .x86_emulate_cmpxchg8b = sh2_x86_emulate_cmpxchg8b,
+ .make_monitor_table = sh2_make_monitor_table,
+ .destroy_monitor_table = sh2_destroy_monitor_table,
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+ .guess_wrmap = sh2_guess_wrmap,
+#endif
+ .guest_levels = GUEST_PAGING_LEVELS,
+ .shadow_levels = SHADOW_PAGING_LEVELS,
+};
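Because this file is compiled once per (guest levels, shadow levels) pair, the preprocessor gives each build its own uniquely named copy of shadow2_entry; common code then picks the table that matches the vcpu's paging mode and calls through its function pointers. A cut-down sketch of that dispatch pattern; the struct, handlers and level values are illustrative, not the real shadow2 interface.

#include <stdio.h>

/* One instance of this table would exist per compiled paging mode. */
struct paging_mode_model {
    int guest_levels;
    int shadow_levels;
    void (*page_fault)(const char *who);
};

static void pf_2_on_2(const char *who) { printf("%s: 2-on-2 fault path\n", who); }
static void pf_3_on_3(const char *who) { printf("%s: 3-on-3 fault path\n", who); }
static void pf_4_on_4(const char *who) { printf("%s: 4-on-4 fault path\n", who); }

static const struct paging_mode_model modes[] = {
    { 2, 2, pf_2_on_2 },
    { 3, 3, pf_3_on_3 },
    { 4, 4, pf_4_on_4 },
};

/* Pick the table whose guest level count matches the vcpu we are running. */
static const struct paging_mode_model *pick_mode(int guest_levels)
{
    for ( unsigned int i = 0; i < sizeof(modes) / sizeof(modes[0]); i++ )
        if ( modes[i].guest_levels == guest_levels )
            return &modes[i];
    return NULL;
}

int main(void)
{
    const struct paging_mode_model *m = pick_mode(3);

    if ( m )
        m->page_fault("vcpu0");   /* dispatches to the 3-on-3 handler */
    return 0;
}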
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/shadow32.c b/xen/arch/x86/shadow32.c
deleted file mode 100644
index 392669746e..0000000000
--- a/xen/arch/x86/shadow32.c
+++ /dev/null
@@ -1,3782 +0,0 @@
-/******************************************************************************
- * arch/x86/shadow.c
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <xen/event.h>
-#include <xen/sched.h>
-#include <xen/trace.h>
-#include <xen/guest_access.h>
-
-#define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
-#define va_to_l1mfn(_ed, _va) \
- (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
-
-static void shadow_free_snapshot(struct domain *d,
- struct out_of_sync_entry *entry);
-static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
-static void free_writable_pte_predictions(struct domain *d);
-
-#if SHADOW_DEBUG
-static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
-#endif
-
-static int alloc_p2m_table(struct domain *d);
-static void free_p2m_table(struct domain *d);
-
-/********
-
-There's a per-domain shadow table spin lock which works fine for SMP
-hosts. We don't have to worry about interrupts as no shadow operations
-happen in an interrupt context. It's probably not quite ready for SMP
-guest operation as we have to worry about synchonisation between gpte
-and spte updates. Its possible that this might only happen in a
-hypercall context, in which case we'll probably at have a per-domain
-hypercall lock anyhow (at least initially).
-
-********/
-
-static inline int
-shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
- unsigned long new_type)
-{
- struct page_info *page = mfn_to_page(gmfn);
- int pinned = 0, okay = 1;
-
- if ( page_out_of_sync(page) )
- {
- // Don't know how long ago this snapshot was taken.
- // Can't trust it to be recent enough.
- //
- __shadow_sync_mfn(d, gmfn);
- }
-
- if ( !shadow_mode_refcounts(d) )
- return 1;
-
- if ( unlikely(page_is_page_table(page)) )
- return 1;
-
- FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
-
- if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
- {
- FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
- __func__, gpfn, gmfn);
-#if 1 || defined(LIVE_DANGEROUSLY)
- set_bit(_PGC_page_table, &page->count_info);
- return 1;
-#endif
- return 0;
-
- }
-
- // To convert this page to use as a page table, the writable count
- // should now be zero. Test this by grabbing the page as a page table,
- // and then immediately releasing. This will also deal with any
- // necessary TLB flushing issues for us.
- //
- // The cruft here about pinning doesn't really work right. This
- // needs rethinking/rewriting... Need to gracefully deal with the
- // TLB flushes required when promoting a writable page, and also deal
- // with any outstanding (external) writable refs to this page (by
- // refusing to promote it). The pinning headache complicates this
- // code -- it would all get much simpler if we stop using
- // shadow_lock() and move the shadow code to BIGLOCK().
- //
- if ( unlikely(!get_page(page, d)) )
- BUG(); // XXX -- needs more thought for a graceful failure
- if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
- {
- pinned = 1;
- put_page_and_type(page);
- }
- if ( get_page_type(page, PGT_base_page_table) )
- {
- set_bit(_PGC_page_table, &page->count_info);
- put_page_type(page);
- }
- else
- {
- printk("shadow_promote: get_page_type failed "
- "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
- d->domain_id, gpfn, gmfn, new_type);
- okay = 0;
- }
-
- // Now put the type back to writable...
- if ( unlikely(!get_page_type(page, PGT_writable_page)) )
- BUG(); // XXX -- needs more thought for a graceful failure
- if ( unlikely(pinned) )
- {
- if ( unlikely(test_and_set_bit(_PGT_pinned,
- &page->u.inuse.type_info)) )
- BUG(); // hmm... someone pinned this again?
- }
- else
- put_page_and_type(page);
-
- return okay;
-}
-
-static inline void
-shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
-
- ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
-
- if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
- {
- clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
-
- if ( page_out_of_sync(mfn_to_page(gmfn)) )
- {
- remove_out_of_sync_entries(d, gmfn);
- }
- }
-}
-
-/*
- * Things in shadow mode that collect get_page() refs to the domain's
- * pages are:
- * - PGC_allocated takes a gen count, just like normal.
- * - A writable page can be pinned (paravirtualized guests may consider
- * these pages to be L1s or L2s, and don't know the difference).
- * Pinning a page takes a gen count (but, for domains in shadow mode,
- * it *doesn't* take a type count)
- * - CR3 grabs a ref to whatever it points at, just like normal.
- * - Shadow mode grabs an initial gen count for itself, as a placeholder
- * for whatever references will exist.
- * - Shadow PTEs that point to a page take a gen count, just like regular
- * PTEs. However, they don't get a type count, as get_page_type() is
- * hardwired to keep writable pages' counts at 1 for domains in shadow
- * mode.
- * - Whenever we shadow a page, the entry in the shadow hash grabs a
- * general ref to the page.
- * - Whenever a page goes out of sync, the out of sync entry grabs a
- * general ref to the page.
- */
-/*
- * page_info fields for pages allocated as shadow pages:
- *
- * All 32 bits of count_info are a simple count of refs to this shadow
- * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
- * c) a pin, if it's a pinned shadow root pgtable, and d) outstanding out-of-sync
- * references.
- *
- * u.inuse._domain is left NULL, to prevent accidentally allowing some random
- * domain from gaining permissions to map this page.
- *
- * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
- * shadowed.
- * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
- * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
- * currently exists because this is a shadow of a root page, and we
- * don't want to let those disappear just because no CR3 is currently pointing
- * at it.
- *
- * tlbflush_timestamp holds a min & max index of valid page table entries
- * within the shadow page.
- */
-
-static inline unsigned long
-alloc_shadow_page(struct domain *d,
- unsigned long gpfn, unsigned long gmfn,
- u32 psh_type)
-{
- struct page_info *page;
- unsigned long smfn;
- int pin = 0;
- void *l1;
-
- // Currently, we only keep pre-zero'ed pages around for use as L1's...
- // This will change. Soon.
- //
- if ( psh_type == PGT_l1_shadow )
- {
- if ( !list_empty(&d->arch.free_shadow_frames) )
- {
- struct list_head *entry = d->arch.free_shadow_frames.next;
- page = list_entry(entry, struct page_info, list);
- list_del(entry);
- perfc_decr(free_l1_pages);
- }
- else
- {
- page = alloc_domheap_page(NULL);
- l1 = map_domain_page(page_to_mfn(page));
- memset(l1, 0, PAGE_SIZE);
- unmap_domain_page(l1);
- }
- }
- else
- page = alloc_domheap_page(NULL);
-
- if ( unlikely(page == NULL) )
- {
- printk("Couldn't alloc shadow page! dom%d count=%d\n",
- d->domain_id, d->arch.shadow_page_count);
- printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
- perfc_value(shadow_l1_pages),
- perfc_value(shadow_l2_pages),
- perfc_value(hl2_table_pages),
- perfc_value(snapshot_pages));
- /* XXX FIXME: try a shadow flush to free up some memory. */
- domain_crash_synchronous();
- }
-
- smfn = page_to_mfn(page);
-
- ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
- page->u.inuse.type_info = psh_type | gmfn;
- page->count_info = 0;
- page->tlbflush_timestamp = 0;
-
- switch ( psh_type )
- {
- case PGT_l1_shadow:
- if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
- goto fail;
- perfc_incr(shadow_l1_pages);
- d->arch.shadow_page_count++;
- break;
-
- case PGT_l2_shadow:
- if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
- goto fail;
- perfc_incr(shadow_l2_pages);
- d->arch.shadow_page_count++;
- if ( PGT_l2_page_table == PGT_root_page_table )
- pin = 1;
-
- break;
-
- case PGT_hl2_shadow:
- // Treat an hl2 as an L1 for purposes of promotion.
- // For external mode domains, treat them as an L2 for purposes of
- // pinning.
- //
- if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
- goto fail;
- perfc_incr(hl2_table_pages);
- d->arch.hl2_page_count++;
- if ( shadow_mode_external(d) &&
- (PGT_l2_page_table == PGT_root_page_table) )
- pin = 1;
-
- break;
-
- case PGT_snapshot:
- perfc_incr(snapshot_pages);
- d->arch.snapshot_page_count++;
- break;
-
- default:
- printk("Alloc shadow weird page type type=%08x\n", psh_type);
- BUG();
- break;
- }
-
- // Don't add a new shadow of something that already has a snapshot.
- //
- ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
-
- set_shadow_status(d, gpfn, gmfn, smfn, psh_type, 0);
-
- if ( pin )
- shadow_pin(smfn);
-
- return smfn;
-
- fail:
- FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?",
- gpfn, gmfn);
- free_domheap_page(page);
- return 0;
-}
-
-static void inline
-free_shadow_l1_table(struct domain *d, unsigned long smfn)
-{
- l1_pgentry_t *pl1e = map_domain_page(smfn);
- int i;
- struct page_info *spage = mfn_to_page(smfn);
- u32 min_max = spage->tlbflush_timestamp;
- int min = SHADOW_MIN(min_max);
- int max = SHADOW_MAX(min_max);
-
- for ( i = min; i <= max; i++ )
- {
- shadow_put_page_from_l1e(pl1e[i], d);
- pl1e[i] = l1e_empty();
- }
-
- unmap_domain_page(pl1e);
-}
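free_shadow_l1_table only walks the slots between the min and max indices recorded when the shadow L1 was populated; per the comment further up, both indices are packed into the 32-bit tlbflush_timestamp field. A stand-alone sketch of one way such a pack/unpack pair can work; this is an illustrative encoding, not necessarily the exact SHADOW_ENCODE_MIN_MAX layout from shadow.h.

#include <stdio.h>
#include <stdint.h>

/* Pack two small indices into one 32-bit word: min in the low half,
 * max in the high half.  (Illustrative layout only.) */
#define ENC_MIN_MAX(min, max)  ((uint32_t)(min) | ((uint32_t)(max) << 16))
#define DEC_MIN(mm)            ((int)((mm) & 0xffff))
#define DEC_MAX(mm)            ((int)((mm) >> 16))

int main(void)
{
    uint32_t mm = ENC_MIN_MAX(7, 300);   /* slots 7..300 were populated */

    printf("only slots %d..%d need to be visited, not the whole table\n",
           DEC_MIN(mm), DEC_MAX(mm));
    return 0;
}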
-
-static void inline
-free_shadow_hl2_table(struct domain *d, unsigned long smfn)
-{
- l1_pgentry_t *hl2 = map_domain_page(smfn);
- int i, limit;
-
- SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
-
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
- for ( i = 0; i < limit; i++ )
- {
- if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
- put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
- }
-
- unmap_domain_page(hl2);
-}
-
-static void inline
-free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
-{
- l2_pgentry_t *pl2e = map_domain_page(smfn);
- int i, external = shadow_mode_external(d);
-
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- if ( external || is_guest_l2_slot(type, i) )
- if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
- put_shadow_ref(l2e_get_pfn(pl2e[i]));
-
- if ( (PGT_base_page_table == PGT_l2_page_table) &&
- shadow_mode_translate(d) && !external )
- {
- // free the ref to the hl2
- //
- put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
- }
-
- unmap_domain_page(pl2e);
-}
-
-void free_shadow_page(unsigned long smfn)
-{
- struct page_info *page = mfn_to_page(smfn);
- unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
- struct domain *d = page_get_owner(mfn_to_page(gmfn));
- unsigned long gpfn = mfn_to_gmfn(d, gmfn);
- unsigned long type = page->u.inuse.type_info & PGT_type_mask;
-
- SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
-
- ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
-
- delete_shadow_status(d, gpfn, gmfn, type, 0);
-
- switch ( type )
- {
- case PGT_l1_shadow:
- perfc_decr(shadow_l1_pages);
- shadow_demote(d, gpfn, gmfn);
- free_shadow_l1_table(d, smfn);
- d->arch.shadow_page_count--;
- break;
-
- case PGT_l2_shadow:
- perfc_decr(shadow_l2_pages);
- shadow_demote(d, gpfn, gmfn);
- free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
- d->arch.shadow_page_count--;
- break;
-
- case PGT_hl2_shadow:
- perfc_decr(hl2_table_pages);
- shadow_demote(d, gpfn, gmfn);
- free_shadow_hl2_table(d, smfn);
- d->arch.hl2_page_count--;
- break;
-
- case PGT_snapshot:
- perfc_decr(snapshot_pages);
- d->arch.snapshot_page_count--;
- break;
-
- default:
- printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
- page_to_mfn(page), page->u.inuse.type_info);
- break;
- }
-
- // No TLB flushes are needed the next time this page gets allocated.
- //
- page->tlbflush_timestamp = 0;
- page->u.free.cpumask = CPU_MASK_NONE;
-
- if ( type == PGT_l1_shadow )
- {
- list_add(&page->list, &d->arch.free_shadow_frames);
- perfc_incr(free_l1_pages);
- }
- else
- free_domheap_page(page);
-}
-
-void
-remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
-{
- unsigned long smfn;
-
- //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
-
- shadow_lock(d);
-
- while ( stype >= PGT_l1_shadow )
- {
- smfn = __shadow_status(d, gpfn, stype);
- if ( smfn && MFN_PINNED(smfn) )
- shadow_unpin(smfn);
- stype -= PGT_l1_shadow;
- }
-
- shadow_unlock(d);
-}
-
-static void inline
-release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
-{
- struct page_info *page;
-
- page = mfn_to_page(entry->gmfn);
-
- // Decrement ref count of guest & shadow pages
- //
- put_page(page);
-
- // Only use entries that have low bits clear...
- //
- if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
- {
- put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
- entry->writable_pl1e = -2;
- }
- else
- ASSERT( entry->writable_pl1e == -1 );
-
- // Free the snapshot
- //
- shadow_free_snapshot(d, entry);
-}
-
-static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
-{
- struct out_of_sync_entry *entry = d->arch.out_of_sync;
- struct out_of_sync_entry **prev = &d->arch.out_of_sync;
- struct out_of_sync_entry *found = NULL;
-
- // NB: Be careful not to call something that manipulates this list
- // while walking it. Collect the results into a separate list
- // first, then walk that list.
- //
- while ( entry )
- {
- if ( entry->gmfn == gmfn )
- {
- // remove from out of sync list
- *prev = entry->next;
-
- // add to found list
- entry->next = found;
- found = entry;
-
- entry = *prev;
- continue;
- }
- prev = &entry->next;
- entry = entry->next;
- }
-
- prev = NULL;
- entry = found;
- while ( entry )
- {
- release_out_of_sync_entry(d, entry);
-
- prev = &entry->next;
- entry = entry->next;
- }
-
- // Add found list to free list
- if ( prev )
- {
- *prev = d->arch.out_of_sync_free;
- d->arch.out_of_sync_free = found;
- }
-}
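The NB comment above is the key point: entries cannot be released while the out-of-sync list is being walked, because releasing one can itself manipulate the list, so matching entries are first unlinked onto a private "found" list and only then processed. A self-contained sketch of that unlink-then-process pattern on a plain singly linked list; the node type and match condition are invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct node {
    int key;
    struct node *next;
};

/* Phase 1: unlink every node with the given key onto a private list.
 * Phase 2: process the private list, where the work done per node may
 * safely touch the original list again. */
static void remove_matching(struct node **head, int key)
{
    struct node **prev = head, *entry = *head, *found = NULL;

    while ( entry )
    {
        if ( entry->key == key )
        {
            *prev = entry->next;      /* unlink from the main list */
            entry->next = found;      /* push onto the found list */
            found = entry;
            entry = *prev;
            continue;
        }
        prev = &entry->next;
        entry = entry->next;
    }

    while ( found )                   /* now it is safe to process/free */
    {
        struct node *next = found->next;
        printf("releasing node with key %d\n", found->key);
        free(found);
        found = next;
    }
}

int main(void)
{
    struct node *head = NULL;

    for ( int k = 0; k < 5; k++ )
    {
        struct node *n = malloc(sizeof(*n));
        n->key = k % 2;               /* keys alternate 0,1,0,1,0 */
        n->next = head;
        head = n;
    }
    remove_matching(&head, 1);        /* strips the two key==1 nodes */
    while ( head ) { struct node *n = head->next; free(head); head = n; }
    return 0;
}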
-
-static void free_out_of_sync_state(struct domain *d)
-{
- struct out_of_sync_entry *entry;
-
- // NB: Be careful not to call something that manipulates this list
- // while walking it. Remove one item at a time, and always
- // restart from start of list.
- //
- while ( (entry = d->arch.out_of_sync) )
- {
- d->arch.out_of_sync = entry->next;
- release_out_of_sync_entry(d, entry);
-
- entry->next = d->arch.out_of_sync_free;
- d->arch.out_of_sync_free = entry;
- }
-}
-
-static void free_shadow_pages(struct domain *d)
-{
- int i;
- struct shadow_status *x;
- struct vcpu *v;
- struct list_head *list_ent, *tmp;
-
- /*
- * WARNING! The shadow page table must not currently be in use!
- * e.g., You are expected to have paused the domain and synchronized CR3.
- */
-
- if( !d->arch.shadow_ht ) return;
-
- shadow_audit(d, 1);
-
- // first, remove any outstanding refs from out_of_sync entries...
- //
- free_out_of_sync_state(d);
-
- // second, remove any outstanding refs from v->arch.shadow_table
- // and CR3.
- //
- for_each_vcpu(d, v)
- {
- if ( pagetable_get_paddr(v->arch.shadow_table) )
- {
- put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
- v->arch.shadow_table = pagetable_null();
-
- if ( shadow_mode_external(d) )
- {
- if ( v->arch.shadow_vtable )
- unmap_domain_page_global(v->arch.shadow_vtable);
- v->arch.shadow_vtable = NULL;
- }
- }
-
- if ( v->arch.monitor_shadow_ref )
- {
- put_shadow_ref(v->arch.monitor_shadow_ref);
- v->arch.monitor_shadow_ref = 0;
- }
- }
-
- // For external shadows, remove the monitor table's refs
- //
- if ( shadow_mode_external(d) )
- {
- for_each_vcpu(d, v)
- {
- l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
-
- if ( mpl2e )
- {
- l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
- l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
- if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
- {
- put_shadow_ref(l2e_get_pfn(hl2e));
- mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
- }
- if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
- {
- put_shadow_ref(l2e_get_pfn(smfn));
- mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
- }
- }
- }
- }
-
- // Now, the only refs to shadow pages that are left are from the shadow
- // pages themselves. We just unpin the pinned pages, and the rest
- // should automatically disappear.
- //
- // NB: Beware: each explicit or implicit call to free_shadow_page
- // can/will result in the hash bucket getting rewritten out from
- // under us... First, collect the list of pinned pages, then
- // free them.
- //
- // FIXME: it would be good to just free all the pages referred to in
- // the hash table without going through each of them to decrement their
- // reference counts. In shadow_mode_refcount(), we've gotta do the hard
- // work, but only for L1 shadows. If we're not in refcount mode, then
- // there's no real hard work to do at all. Need to be careful with the
- // writable_pte_predictions and snapshot entries in the hash table, but
- // that's about it.
- //
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- u32 count;
- unsigned long *mfn_list;
-
- /* Skip empty buckets. */
- if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
- continue;
-
- count = 0;
-
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
- /* Skip entries that are writable_pred */
- switch(x->gpfn_and_flags & PGT_type_mask){
- case PGT_l1_shadow:
- case PGT_l2_shadow:
- case PGT_l3_shadow:
- case PGT_l4_shadow:
- case PGT_hl2_shadow:
- if ( MFN_PINNED(x->smfn) )
- count++;
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- break;
- default:
- BUG();
-
- }
- }
-
- if ( !count )
- continue;
-
- mfn_list = xmalloc_array(unsigned long, count);
- count = 0;
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
- /* Skip entries that are writable_pred */
- switch(x->gpfn_and_flags & PGT_type_mask){
- case PGT_l1_shadow:
- case PGT_l2_shadow:
- case PGT_l3_shadow:
- case PGT_l4_shadow:
- case PGT_hl2_shadow:
- if ( MFN_PINNED(x->smfn) )
- mfn_list[count++] = x->smfn;
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- break;
- default:
- BUG();
-
- }
- }
-
- while ( count )
- {
- shadow_unpin(mfn_list[--count]);
- }
- xfree(mfn_list);
- }
-
- /* Now free the pre-zero'ed pages from the domain */
- list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
- {
- struct page_info *page = list_entry(list_ent, struct page_info, list);
-
- list_del(list_ent);
- perfc_decr(free_l1_pages);
-
- free_domheap_page(page);
- }
-
- shadow_audit(d, 0);
-
- SH_VLOG("Free shadow table.");
-}
-
-void shadow_mode_init(void)
-{
-}
-
-int _shadow_mode_refcounts(struct domain *d)
-{
- return shadow_mode_refcounts(d);
-}
-
-static void alloc_monitor_pagetable(struct vcpu *v)
-{
- unsigned long mmfn;
- l2_pgentry_t *mpl2e;
- struct page_info *mmfn_info;
- struct domain *d = v->domain;
- int i;
-
- ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
-
- mmfn_info = alloc_domheap_page(NULL);
- ASSERT(mmfn_info != NULL);
-
- mmfn = page_to_mfn(mmfn_info);
- mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
- memset(mpl2e, 0, PAGE_SIZE);
-
- memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
- for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
- mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
- __PAGE_HYPERVISOR);
-
- // Don't (yet) have mappings for these...
- // Don't want to accidentally see the idle_pg_table's linear mapping.
- //
- mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
- mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
- mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty();
-
- v->arch.monitor_table = pagetable_from_pfn(mmfn);
- v->arch.monitor_vtable = mpl2e;
-
- if ( v->vcpu_id == 0 )
- alloc_p2m_table(d);
- else
- {
- unsigned long mfn;
-
- mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- if ( mfn )
- {
- l2_pgentry_t *l2tab;
-
- l2tab = map_domain_page(mfn);
-
- mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
- l2tab[l2_table_offset(RO_MPT_VIRT_START)];
-
- unmap_domain_page(l2tab);
- }
- }
-}
-
-/*
- * Free the pages for monitor_table and hl2_table
- */
-void free_monitor_pagetable(struct vcpu *v)
-{
- l2_pgentry_t *mpl2e, hl2e, sl2e;
- unsigned long mfn;
-
- ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
-
- mpl2e = v->arch.monitor_vtable;
-
- /*
- * First get the mfn for hl2_table by looking at monitor_table
- */
- hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
- if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
- {
- mfn = l2e_get_pfn(hl2e);
- ASSERT(mfn);
- put_shadow_ref(mfn);
- }
-
- sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
- if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
- {
- mfn = l2e_get_pfn(sl2e);
- ASSERT(mfn);
- put_shadow_ref(mfn);
- }
-
- if ( v->vcpu_id == 0 )
- free_p2m_table(v->domain);
-
- /*
- * Then free monitor_table.
- */
- mfn = pagetable_get_pfn(v->arch.monitor_table);
- unmap_domain_page_global(v->arch.monitor_vtable);
- free_domheap_page(mfn_to_page(mfn));
-
- v->arch.monitor_table = pagetable_null();
- v->arch.monitor_vtable = 0;
-}
-
-static int
-map_p2m_entry(l1_pgentry_t *l1tab, unsigned long gpfn, unsigned long mfn)
-{
- unsigned long *l0tab = NULL;
- l1_pgentry_t l1e = { 0 };
- struct page_info *page;
- unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
- l1e = l1tab[l1_table_offset(va)];
- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- return 0;
-
- l0tab = map_domain_page(page_to_mfn(page));
- memset(l0tab, 0, PAGE_SIZE);
-
- l1e = l1tab[l1_table_offset(va)] =
- l1e_from_page(page, __PAGE_HYPERVISOR);
- }
- else
- l0tab = map_domain_page(l1e_get_pfn(l1e));
-
- l0tab[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
-
- unmap_domain_page(l0tab);
-
- return 1;
-}
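map_p2m_entry treats the p2m as a flat array of mfns mapped at RO_MPT_VIRT_START: a gpfn selects a virtual address (RO_MPT_VIRT_START + gpfn * sizeof(mfn)), the L1 slot covering that address, and finally an index within the leaf page (gpfn modulo the number of entries per page). A small worked example of that index arithmetic under assumed constants (4 KiB pages, 4-byte entries, an arbitrary base address).

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_MODEL    4096u
#define ENTRY_SIZE_MODEL   4u                  /* sizeof(mfn) on a 32-bit build */
#define ENTRIES_PER_PAGE   (PAGE_SIZE_MODEL / ENTRY_SIZE_MODEL)   /* 1024 */
#define RO_MPT_BASE_MODEL  0xFC000000u         /* assumed base, for illustration */

int main(void)
{
    uint32_t gpfn = 0x12345;

    uint32_t va        = RO_MPT_BASE_MODEL + gpfn * ENTRY_SIZE_MODEL;
    uint32_t leaf_page = gpfn / ENTRIES_PER_PAGE;            /* which l0 page */
    uint32_t leaf_slot = gpfn & (ENTRIES_PER_PAGE - 1);      /* index inside it */

    printf("gpfn %#x -> va %#x, leaf page %u, slot %u\n",
           gpfn, va, leaf_page, leaf_slot);
    return 0;
}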
-
-int
-set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
- struct domain_mmap_cache *l2cache,
- struct domain_mmap_cache *l1cache)
-{
- unsigned long tabpfn;
- l2_pgentry_t *l2, l2e;
- l1_pgentry_t *l1;
- struct page_info *l1page;
- unsigned long va = pfn << PAGE_SHIFT;
-
- if ( shadow_mode_external(d) )
- tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- else
- tabpfn = pagetable_get_pfn(d->arch.phys_table);
-
- ASSERT(tabpfn != 0);
- ASSERT(shadow_lock_is_acquired(d));
-
- l2 = map_domain_page_with_cache(tabpfn, l2cache);
-
- /*
- * The following code covers (SHM_translate | SHM_external) mode.
- */
-
- if ( shadow_mode_external(d) )
- {
- int error;
- l1_pgentry_t *l1tab = NULL;
- l2_pgentry_t l2e;
-
- l2e = l2[l2_table_offset(RO_MPT_VIRT_START)];
-
- ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT );
-
- l1tab = map_domain_page(l2e_get_pfn(l2e));
- if ( !(error = map_p2m_entry(l1tab, pfn, mfn)) )
- domain_crash(d);
-
- unmap_domain_page(l1tab);
- unmap_domain_page_with_cache(l2, l2cache);
-
- return error;
- }
-
- /*
- * The following code covers SHM_translate mode.
- */
- ASSERT(shadow_mode_translate(d));
-
- l2e = l2[l2_table_offset(va)];
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- {
- l1page = alloc_domheap_page(NULL);
- if ( !l1page )
- {
- unmap_domain_page_with_cache(l2, l2cache);
- return 0;
- }
-
- l1 = map_domain_page_with_cache(page_to_mfn(l1page), l1cache);
- /* Initialise entries to INVALID_MFN = ~0 */
- memset(l1, -1, PAGE_SIZE);
- unmap_domain_page_with_cache(l1, l1cache);
-
- l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
- l2[l2_table_offset(va)] = l2e;
- }
- unmap_domain_page_with_cache(l2, l2cache);
-
- l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
- l1[l1_table_offset(va)] = (l1_pgentry_t) { mfn };
- unmap_domain_page_with_cache(l1, l1cache);
-
- return 1;
-}
-
-static int
-alloc_p2m_table(struct domain *d)
-{
- struct list_head *list_ent;
-
- l2_pgentry_t *l2tab = NULL;
- l1_pgentry_t *l1tab = NULL;
- l2_pgentry_t l2e = { 0 };
- struct page_info *page;
- unsigned long gpfn, mfn;
- int error = 0;
-
- if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) )
- {
- l2tab = map_domain_page(
- pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
- l2e = l2tab[l2_table_offset(RO_MPT_VIRT_START)];
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
-
- l1tab = map_domain_page(page_to_mfn(page));
- memset(l1tab, 0, PAGE_SIZE);
- l2e = l2tab[l2_table_offset(RO_MPT_VIRT_START)] =
- l2e_from_page(page, __PAGE_HYPERVISOR);
- }
- else
- l1tab = map_domain_page(l2e_get_pfn(l2e));
-
- if ( l2tab )
- unmap_domain_page(l2tab);
- }
- else
- {
- page = alloc_domheap_page(NULL);
- if (!page)
- {
- printk("Alloc p2m table fail\n");
- domain_crash(d);
- }
-
- l1tab = map_domain_page(page_to_mfn(page));
- memset(l1tab, 0, PAGE_SIZE);
- d->arch.phys_table = pagetable_from_page(page);
- }
-
- list_ent = d->page_list.next;
-
- while ( list_ent != &d->page_list )
- {
- page = list_entry(list_ent, struct page_info, list);
- mfn = page_to_mfn(page);
-
- gpfn = get_gpfn_from_mfn(mfn);
-
- if ( !(error = map_p2m_entry(l1tab, gpfn, mfn)) )
- {
- domain_crash(d);
- break;
- }
-
- list_ent = page->list.next;
- }
-
- unmap_domain_page(l1tab);
-
- return error;
-}
-
-static void
-free_p2m_table(struct domain *d)
-{
- unsigned long va;
- l2_pgentry_t *l2tab;
- l1_pgentry_t *l1tab;
- l2_pgentry_t l2e;
- l1_pgentry_t l1e;
-
- ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
-
- l2tab = map_domain_page(
- pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-
- for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
- {
- int i;
-
- l2e = l2tab[l2_table_offset(va)];
- if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
- {
- l1tab = map_domain_page(l2e_get_pfn(l2e));
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- {
- l1e = l1tab[l1_table_offset(va)];
-
- if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
- free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
- va += PAGE_SIZE;
- }
- unmap_domain_page(l1tab);
- free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
- }
- else
- va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
- }
- unmap_domain_page(l2tab);
-}
-
-int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l2_pgentry_t sl2e;
- l1_pgentry_t sl1e;
- l1_pgentry_t *sple = NULL;
- unsigned long mfn, smfn;
- struct page_info *page;
-
- /*
- * If the faulting address is within the MMIO range, we continue
- * on handling the #PF as such.
- */
- if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
- return 0;
-
- shadow_lock(d);
-
- __direct_get_l2e(v, vpa, &sl2e);
-
- if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- goto nomem;
-
- smfn = page_to_mfn(page);
- sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
-
- sple = (l1_pgentry_t *)map_domain_page(smfn);
- memset(sple, 0, PAGE_SIZE);
- __direct_set_l2e(v, vpa, sl2e);
- }
-
- if ( !sple )
- sple = (l1_pgentry_t *)map_domain_page(l2e_get_pfn(sl2e));
-
- sl1e = sple[l1_table_offset(vpa)];
-
- if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
- {
- sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
- sple[l1_table_offset(vpa)] = sl1e;
- }
-
- if (sple)
- unmap_domain_page(sple);
-
- shadow_unlock(d);
- return EXCRET_fault_fixed;
-
-nomem:
- shadow_direct_map_clean(d);
- domain_crash_synchronous();
-}
-
-
-int shadow_direct_map_init(struct domain *d)
-{
- struct page_info *page;
- l2_pgentry_t *root;
-
- if ( !(page = alloc_domheap_page(NULL)) )
- return 0;
-
- root = map_domain_page(page_to_mfn(page));
- memset(root, 0, PAGE_SIZE);
- unmap_domain_page(root);
-
- d->arch.phys_table = pagetable_from_page(page);
-
- return 1;
-}
-
-void shadow_direct_map_clean(struct domain *d)
-{
- int i;
- unsigned long mfn;
- l2_pgentry_t *l2e;
-
- mfn = pagetable_get_pfn(d->arch.phys_table);
-
- /*
- * We may fail very early before direct map is built.
- */
- if ( !mfn )
- return;
-
- l2e = map_domain_page(mfn);
-
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- if ( l2e_get_flags(l2e[i]) & _PAGE_PRESENT )
- free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[i])));
- }
- free_domheap_page(mfn_to_page(mfn));
-
- unmap_domain_page(l2e);
-
- d->arch.phys_table = pagetable_null();
-}
-
-int __shadow_mode_enable(struct domain *d, unsigned int mode)
-{
- struct vcpu *v;
- int new_modes = (mode & ~d->arch.shadow_mode);
-
- if(!new_modes) /* Nothing to do - return success */
- return 0;
-
- // can't take anything away by calling this function.
- ASSERT(!(d->arch.shadow_mode & ~mode));
-
- for_each_vcpu(d, v)
- {
- invalidate_shadow_ldt(v);
-
- // We need to set these up for __update_pagetables().
- // See the comment there.
-
- /*
- * arch.guest_vtable
- */
- if ( v->arch.guest_vtable &&
- (v->arch.guest_vtable != __linear_l2_table) )
- {
- unmap_domain_page_global(v->arch.guest_vtable);
- }
- if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
- v->arch.guest_vtable = __linear_l2_table;
- else
- v->arch.guest_vtable = NULL;
-
- /*
- * arch.shadow_vtable
- */
- if ( v->arch.shadow_vtable &&
- (v->arch.shadow_vtable != __shadow_linear_l2_table) )
- {
- unmap_domain_page_global(v->arch.shadow_vtable);
- }
- if ( !(mode & SHM_external) )
- v->arch.shadow_vtable = __shadow_linear_l2_table;
- else
- v->arch.shadow_vtable = NULL;
-
- /*
- * arch.hl2_vtable
- */
- if ( v->arch.hl2_vtable &&
- (v->arch.hl2_vtable != __linear_hl2_table) )
- {
- unmap_domain_page_global(v->arch.hl2_vtable);
- }
- if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
- v->arch.hl2_vtable = __linear_hl2_table;
- else
- v->arch.hl2_vtable = NULL;
-
- /*
- * arch.monitor_table & arch.monitor_vtable
- */
- if ( v->arch.monitor_vtable )
- {
- free_monitor_pagetable(v);
- }
- if ( mode & SHM_external )
- {
- alloc_monitor_pagetable(v);
- }
- }
-
- if ( new_modes & SHM_enable )
- {
- ASSERT( !d->arch.shadow_ht );
- d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
- if ( d->arch.shadow_ht == NULL )
- goto nomem;
-
- memset(d->arch.shadow_ht, 0,
- shadow_ht_buckets * sizeof(struct shadow_status));
- }
-
- if ( new_modes & SHM_log_dirty )
- {
- ASSERT( !d->arch.shadow_dirty_bitmap );
- d->arch.shadow_dirty_bitmap_size =
- (d->shared_info->arch.max_pfn + 63) & ~63;
- d->arch.shadow_dirty_bitmap =
- xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
- (8 * sizeof(unsigned long)));
- if ( d->arch.shadow_dirty_bitmap == NULL )
- {
- d->arch.shadow_dirty_bitmap_size = 0;
- goto nomem;
- }
- memset(d->arch.shadow_dirty_bitmap, 0,
- d->arch.shadow_dirty_bitmap_size/8);
- }
-
- if ( new_modes & SHM_translate )
- {
- if ( !(new_modes & SHM_external) )
- {
- ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
- if ( !alloc_p2m_table(d) )
- {
- printk("alloc_p2m_table failed (out-of-memory?)\n");
- goto nomem;
- }
- }
- }
-
- // Get rid of any shadow pages from any previous shadow mode.
- //
- free_shadow_pages(d);
-
- d->arch.shadow_mode = mode;
-
- if ( shadow_mode_refcounts(d) )
- {
- struct list_head *list_ent;
- struct page_info *page;
-
- /*
- * Tear down its counts by disassembling its page-table-based refcounts.
- * Also remove CR3's gcount/tcount.
- * That leaves things like GDTs and LDTs and external refs intact.
- *
- * Most pages will be writable tcount=0.
- * Some will still be L1 tcount=0 or L2 tcount=0.
- * Maybe some pages will be type none tcount=0.
- * Pages granted external writable refs (via grant tables?) will
- * still have a non-zero tcount. That's OK.
- *
- * gcounts will generally be 1 for PGC_allocated.
- * GDTs and LDTs will have additional gcounts.
- * Any grant-table based refs will still be in the gcount.
- *
- * We attempt to grab writable refs to each page, thus setting its type.
- * Immediately put back those type refs.
- *
- * Assert that no pages are left with L1/L2/L3/L4 type.
- */
- audit_adjust_pgtables(d, -1, 1);
-
-
- for (list_ent = d->page_list.next; list_ent != &d->page_list;
- list_ent = page->list.next) {
-
- page = list_entry(list_ent, struct page_info, list);
-
- if ( !get_page_type(page, PGT_writable_page) )
- BUG();
- put_page_type(page);
- /*
- * We use tlbflush_timestamp as a back pointer to smfn, and need to
- * clean it up.
- */
- if (shadow_mode_external(d))
- page->tlbflush_timestamp = 0;
- }
-
- audit_adjust_pgtables(d, 1, 1);
-
- }
-
- return 0;
-
- nomem:
- if ( (new_modes & SHM_enable) )
- {
- xfree(d->arch.shadow_ht);
- d->arch.shadow_ht = NULL;
- }
- if ( (new_modes & SHM_log_dirty) )
- {
- xfree(d->arch.shadow_dirty_bitmap);
- d->arch.shadow_dirty_bitmap = NULL;
- }
-
- return -ENOMEM;
-}
-
-int shadow_mode_enable(struct domain *d, unsigned int mode)
-{
- int rc;
- shadow_lock(d);
- rc = __shadow_mode_enable(d, mode);
- shadow_unlock(d);
- return rc;
-}
-
-static void
-translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
-{
- int i;
- l1_pgentry_t *l1;
-
- l1 = map_domain_page(l1mfn);
- for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- {
- if ( is_guest_l1_slot(i) &&
- (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
- {
- unsigned long mfn = l1e_get_pfn(l1[i]);
- unsigned long gpfn = mfn_to_gmfn(d, mfn);
- ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
- l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
- }
- }
- unmap_domain_page(l1);
-}
-
-// This is not general enough to handle arbitrary pagetables
-// with shared L1 pages, etc., but it is sufficient for bringing
-// up dom0.
-//
-void
-translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
- unsigned int type)
-{
- int i;
- l2_pgentry_t *l2;
-
- ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
-
- l2 = map_domain_page(l2mfn);
- for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
- {
- if ( is_guest_l2_slot(type, i) &&
- (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
- {
- unsigned long mfn = l2e_get_pfn(l2[i]);
- unsigned long gpfn = mfn_to_gmfn(d, mfn);
- ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
- l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
- translate_l1pgtable(d, p2m, mfn);
- }
- }
- unmap_domain_page(l2);
-}
-
-static void free_shadow_ht_entries(struct domain *d)
-{
- struct shadow_status *x, *n;
-
- SH_VLOG("freed tables count=%d l1=%d l2=%d",
- d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
- perfc_value(shadow_l2_pages));
-
- n = d->arch.shadow_ht_extras;
- while ( (x = n) != NULL )
- {
- d->arch.shadow_extras_count--;
- n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
- xfree(x);
- }
-
- d->arch.shadow_ht_extras = NULL;
- d->arch.shadow_ht_free = NULL;
-
- ASSERT(d->arch.shadow_extras_count == 0);
- SH_VLOG("freed extras, now %d", d->arch.shadow_extras_count);
-
- if ( d->arch.shadow_dirty_bitmap != NULL )
- {
- xfree(d->arch.shadow_dirty_bitmap);
- d->arch.shadow_dirty_bitmap = 0;
- d->arch.shadow_dirty_bitmap_size = 0;
- }
-
- xfree(d->arch.shadow_ht);
- d->arch.shadow_ht = NULL;
-}
-
-static void free_out_of_sync_entries(struct domain *d)
-{
- struct out_of_sync_entry *x, *n;
-
- n = d->arch.out_of_sync_extras;
- while ( (x = n) != NULL )
- {
- d->arch.out_of_sync_extras_count--;
- n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
- xfree(x);
- }
-
- d->arch.out_of_sync_extras = NULL;
- d->arch.out_of_sync_free = NULL;
- d->arch.out_of_sync = NULL;
-
- ASSERT(d->arch.out_of_sync_extras_count == 0);
- FSH_LOG("freed extra out_of_sync entries, now %d",
- d->arch.out_of_sync_extras_count);
-}
-
-void __shadow_mode_disable(struct domain *d)
-{
- struct vcpu *v;
-#ifndef NDEBUG
- int i;
-#endif
-
- if ( unlikely(!shadow_mode_enabled(d)) )
- return;
-
- free_shadow_pages(d);
- free_writable_pte_predictions(d);
-
-#ifndef NDEBUG
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
- {
- printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
- __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
- BUG();
- }
- }
-#endif
-
- d->arch.shadow_mode = 0;
-
- free_shadow_ht_entries(d);
- free_out_of_sync_entries(d);
-
- for_each_vcpu(d, v)
- update_pagetables(v);
-}
-
-static int shadow_mode_table_op(
- struct domain *d, dom0_shadow_control_t *sc)
-{
- unsigned int op = sc->op;
- int i, rc = 0;
- struct vcpu *v;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- SH_VLOG("shadow mode table op %lx %lx count %d",
- (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */
- (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
- d->arch.shadow_page_count);
-
- shadow_audit(d, 1);
-
- switch ( op )
- {
- case DOM0_SHADOW_CONTROL_OP_FLUSH:
- free_shadow_pages(d);
-
- d->arch.shadow_fault_count = 0;
- d->arch.shadow_dirty_count = 0;
-
- break;
-
- case DOM0_SHADOW_CONTROL_OP_CLEAN:
- free_shadow_pages(d);
-
- sc->stats.fault_count = d->arch.shadow_fault_count;
- sc->stats.dirty_count = d->arch.shadow_dirty_count;
-
- d->arch.shadow_fault_count = 0;
- d->arch.shadow_dirty_count = 0;
-
- if ( guest_handle_is_null(sc->dirty_bitmap) ||
- (d->arch.shadow_dirty_bitmap == NULL) )
- {
- rc = -EINVAL;
- break;
- }
-
- if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
- sc->pages = d->arch.shadow_dirty_bitmap_size;
-
-#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
- for ( i = 0; i < sc->pages; i += chunk )
- {
- int bytes = ((((sc->pages - i) > chunk) ?
- chunk : (sc->pages - i)) + 7) / 8;
-
- if ( copy_to_guest_offset(
- sc->dirty_bitmap, i/(8*sizeof(unsigned long)),
- d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
- (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
- {
- rc = -EINVAL;
- break;
- }
-
- memset(
- d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
- 0, bytes);
- }
-
- break;
-
- case DOM0_SHADOW_CONTROL_OP_PEEK:
- sc->stats.fault_count = d->arch.shadow_fault_count;
- sc->stats.dirty_count = d->arch.shadow_dirty_count;
-
- if ( guest_handle_is_null(sc->dirty_bitmap) ||
- (d->arch.shadow_dirty_bitmap == NULL) )
- {
- rc = -EINVAL;
- break;
- }
-
- if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
- sc->pages = d->arch.shadow_dirty_bitmap_size;
-
- if ( copy_to_guest(sc->dirty_bitmap,
- d->arch.shadow_dirty_bitmap,
- (((sc->pages+7)/8)+sizeof(unsigned long)-1) /
- sizeof(unsigned long)) )
- {
- rc = -EINVAL;
- break;
- }
-
- break;
-
- default:
- rc = -EINVAL;
- break;
- }
-
- SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
- shadow_audit(d, 1);
-
- for_each_vcpu(d,v)
- __update_pagetables(v);
-
- return rc;
-}
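The CLEAN operation above transfers and clears the dirty bitmap in chunks of 8*1024 bits (1 KiB of bitmap) so that each memset hits data that is still warm in the L1 cache; the bytes copied per iteration are the remaining pages rounded up to whole bytes. A stand-alone sketch of that chunking arithmetic; the page count is made up for the example.

#include <stdio.h>

#define CHUNK_BITS (8 * 1024)     /* 1 KiB of bitmap per iteration */

int main(void)
{
    int pages = 20000;            /* assumed number of tracked pages */

    for ( int i = 0; i < pages; i += CHUNK_BITS )
    {
        int this_chunk = (pages - i) > CHUNK_BITS ? CHUNK_BITS : (pages - i);
        int bytes = (this_chunk + 7) / 8;   /* round up to whole bytes */

        printf("copy and clear %d bitmap bytes covering pages %d..%d\n",
               bytes, i, i + this_chunk - 1);
        /* the real code copies to the guest buffer and memsets the source */
    }
    return 0;
}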
-
-int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
-{
- unsigned int op = sc->op;
- int rc = 0;
- struct vcpu *v;
-
- if ( unlikely(d == current->domain) )
- {
- DPRINTK("Don't try to do a shadow op on yourself!\n");
- return -EINVAL;
- }
-
- domain_pause(d);
-
- shadow_lock(d);
-
- switch ( op )
- {
- case DOM0_SHADOW_CONTROL_OP_OFF:
- if ( shadow_mode_enabled(d) )
- {
- __shadow_sync_all(d);
- __shadow_mode_disable(d);
- }
- break;
-
- case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
- free_shadow_pages(d);
- rc = __shadow_mode_enable(d, SHM_enable);
- break;
-
- case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
- free_shadow_pages(d);
- rc = __shadow_mode_enable(
- d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
- break;
-
- case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
- free_shadow_pages(d);
- rc = __shadow_mode_enable(
- d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate|SHM_wr_pt_pte);
- break;
-
- default:
- rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
- break;
- }
-
- shadow_unlock(d);
-
- for_each_vcpu(d,v)
- update_pagetables(v);
-
- domain_unpause(d);
-
- return rc;
-}
-
-unsigned long
-get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
-{
- unsigned long va, tabpfn;
- l1_pgentry_t *l1, l1e;
- l2_pgentry_t *l2, l2e;
-
- ASSERT(shadow_mode_translate(d));
-
- perfc_incrc(get_mfn_from_gpfn_foreign);
-
- if ( shadow_mode_external(d) )
- {
- unsigned long mfn;
- unsigned long *l0;
-
- va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
- tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- if ( !tabpfn )
- return INVALID_MFN;
-
- l2 = map_domain_page(tabpfn);
- l2e = l2[l2_table_offset(va)];
- unmap_domain_page(l2);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return INVALID_MFN;
-
- l1 = map_domain_page(l2e_get_pfn(l2e));
- l1e = l1[l1_table_offset(va)];
- unmap_domain_page(l1);
- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
- return INVALID_MFN;
-
- l0 = map_domain_page(l1e_get_pfn(l1e));
- mfn = l0[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)];
- unmap_domain_page(l0);
- return mfn;
- }
- else
- {
- va = gpfn << PAGE_SHIFT;
- tabpfn = pagetable_get_pfn(d->arch.phys_table);
- l2 = map_domain_page(tabpfn);
- l2e = l2[l2_table_offset(va)];
- unmap_domain_page(l2);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- {
-#if 0
- printk("%s(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n",
- __func__, d->domain_id, gpfn, l2e_get_intpte(l2e));
-#endif
- return INVALID_MFN;
- }
- l1 = map_domain_page(l2e_get_pfn(l2e));
- l1e = l1[l1_table_offset(va)];
- unmap_domain_page(l1);
-#if 0
- printk("%s(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n",
- __func__, d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e);
-#endif
-
- return l1e_get_intpte(l1e);
- }
-
-}
-
-static unsigned long
-shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
- unsigned long smfn)
-{
- unsigned long hl2mfn;
- l1_pgentry_t *hl2;
- l2_pgentry_t *gpgd;
- int limit;
- int x;
-
- ASSERT(PGT_base_page_table == PGT_l2_page_table);
-
- if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
- {
- printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
- gpfn, gmfn);
- BUG(); /* XXX Deal gracefully with failure. */
- }
-
- SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
- gpfn, gmfn, smfn, hl2mfn);
- perfc_incrc(shadow_hl2_table_count);
-
- hl2 = map_domain_page(hl2mfn);
-
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
- memset(hl2, 0, limit * sizeof(l1_pgentry_t));
-
- if ( !shadow_mode_external(d) )
- {
- memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
- HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
- // Setup easy access to the GL2, SL2, and HL2 frames.
- //
- hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
- hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
- hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
- l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
- }
-
- gpgd = map_domain_page(gmfn);
- for (x = 0; x < DOMAIN_ENTRIES_PER_L2_PAGETABLE; x++)
- validate_hl2e_change(d, gpgd[x], &hl2[x]);
- unmap_domain_page(gpgd);
-
- unmap_domain_page(hl2);
-
- return hl2mfn;
-}
-
-/*
- * This could take and use a snapshot, and validate the entire page at
- * once, or it could continue to fault in entries one at a time...
- * Might be worth investigating...
- */
-static unsigned long shadow_l2_table(
- struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned long smfn;
- l2_pgentry_t *spl2e;
- int i;
-
- SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
- perfc_incrc(shadow_l2_table_count);
-
- if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
- {
- printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
- gpfn, gmfn);
- BUG(); /* XXX Deal gracefully with failure. */
- }
-
- spl2e = (l2_pgentry_t *)map_domain_page(smfn);
-
- /* Install hypervisor and 2x linear p.t. mappings. */
- if ( (PGT_base_page_table == PGT_l2_page_table) &&
- !shadow_mode_external(d) )
- {
- /*
- * We could proactively fill in PDEs for pages that are already
- * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
- * (restriction required for coherence of the accessed bit). However,
- * we tried it and it didn't help performance. This is simpler.
- */
- memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
-
- /* Install hypervisor and 2x linear p.t. mappings. */
- memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
- spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
- for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
- spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
- arch.mm_perdomain_pt) + i,
- __PAGE_HYPERVISOR);
-
- if ( shadow_mode_translate(d) ) // NB: not external
- {
- unsigned long hl2mfn;
-
- ASSERT(pagetable_get_paddr(d->arch.phys_table));
- spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
- l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
- __PAGE_HYPERVISOR);
-
- if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
- hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
-
- // shadow_mode_translate (but not external) sl2 tables hold a
- // ref to their hl2.
- //
- if ( !get_shadow_ref(hl2mfn) )
- BUG();
-
- spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
- }
- else
- spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
- }
- else
- {
- memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
- }
-
- unmap_domain_page(spl2e);
-
- SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
- return smfn;
-}
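The "linear p.t. mappings" installed here are the recursive-pagetable trick: one L2 slot points at the guest's tables and another at the shadow's own L2, so the PTE covering any virtual address can itself be read at a fixed virtual window. A worked sketch of the address arithmetic for a 32-bit, 2-level layout; the window base is an assumption for the example.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define PTE_SIZE          4u            /* 32-bit, non-PAE entries */
#define LINEAR_PT_BASE    0xC0000000u   /* assumed 4MB-aligned window */

/* Virtual address at which the PTE mapping 'va' can be read, given that
 * one L2 slot maps the page tables themselves at LINEAR_PT_BASE. */
static uint32_t pte_vaddr(uint32_t va)
{
    return LINEAR_PT_BASE + (va >> PAGE_SHIFT) * PTE_SIZE;
}

int main(void)
{
    uint32_t va = 0x0804A123;

    printf("the PTE for %#x lives at %#x in the linear window\n",
           va, pte_vaddr(va));
    return 0;
}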
-
-void shadow_map_l1_into_current_l2(unsigned long va)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l1_pgentry_t *gpl1e, *spl1e;
- l2_pgentry_t gl2e, sl2e;
- unsigned long gl1pfn, gl1mfn, sl1mfn;
- int i, init_table = 0;
-
- __guest_get_l2e(v, va, &gl2e);
- ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
- gl1pfn = l2e_get_pfn(gl2e);
-
- if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
- {
- /* This L1 is NOT already shadowed so we need to shadow it. */
- SH_VVLOG("4a: l1 not shadowed");
-
- gl1mfn = gmfn_to_mfn(d, gl1pfn);
- if ( unlikely(!VALID_MFN(gl1mfn)) )
- {
- // Attempt to use an invalid pfn as an L1 page.
- // XXX this needs to be more graceful!
- BUG();
- }
-
- if ( unlikely(!(sl1mfn =
- alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
- {
- printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
- gl1pfn, gl1mfn);
- BUG(); /* XXX Need to deal gracefully with failure. */
- }
-
- perfc_incrc(shadow_l1_table_count);
- init_table = 1;
- }
- else
- {
- /* This L1 is shadowed already, but the L2 entry is missing. */
- SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
- }
-
-#ifndef NDEBUG
- {
- l2_pgentry_t old_sl2e;
- __shadow_get_l2e(v, va, &old_sl2e);
- ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
- }
-#endif
-
- if ( !get_shadow_ref(sl1mfn) )
- BUG();
- l2pde_general(d, &gl2e, &sl2e, sl1mfn);
- __guest_set_l2e(v, va, gl2e);
- __shadow_set_l2e(v, va, sl2e);
-
- if ( init_table )
- {
- l1_pgentry_t sl1e;
- int index = l1_table_offset(va);
- int min = 1, max = 0;
-
- gpl1e = &(linear_pg_table[l1_linear_offset(va) &
- ~(L1_PAGETABLE_ENTRIES-1)]);
-
- spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
- ~(L1_PAGETABLE_ENTRIES-1)]);
-
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- {
- l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
- if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
- unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
- sl1e = l1e_empty();
- if ( l1e_get_flags(sl1e) == 0 )
- {
- // First copy entries from 0 until first invalid.
- // Then copy entries from index until first invalid.
- //
- if ( i < index ) {
- i = index - 1;
- continue;
- }
- break;
- }
- spl1e[i] = sl1e;
- if ( unlikely(i < min) )
- min = i;
- if ( likely(i > max) )
- max = i;
- set_guest_back_ptr(d, sl1e, sl1mfn, i);
- }
-
- mfn_to_page(sl1mfn)->tlbflush_timestamp =
- SHADOW_ENCODE_MIN_MAX(min, max);
- }
-}
-
-void shadow_invlpg(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
- l1_pgentry_t gpte, spte;
-
- ASSERT(shadow_mode_enabled(d));
-
- shadow_lock(d);
-
- __shadow_sync_va(v, va);
-
- // XXX mafetter: will need to think about 4MB pages...
-
- // It's not strictly necessary to update the shadow here,
- // but it might save a fault later.
- //
- if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
- sizeof(gpte))) {
- perfc_incrc(shadow_invlpg_faults);
- shadow_unlock(d);
- return;
- }
- l1pte_propagate_from_guest(d, gpte, &spte);
- shadow_set_l1e(va, spte, 1);
-
- shadow_unlock(d);
-}
-
-struct out_of_sync_entry *
-shadow_alloc_oos_entry(struct domain *d)
-{
- struct out_of_sync_entry *f, *extra;
- unsigned size, i;
-
- if ( unlikely(d->arch.out_of_sync_free == NULL) )
- {
- FSH_LOG("Allocate more fullshadow tuple blocks.");
-
- size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
- extra = xmalloc_bytes(size);
-
- /* XXX Should be more graceful here. */
- if ( extra == NULL )
- BUG();
-
- memset(extra, 0, size);
-
- /* Record the allocation block so it can be correctly freed later. */
- d->arch.out_of_sync_extras_count++;
- *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
- d->arch.out_of_sync_extras;
- d->arch.out_of_sync_extras = &extra[0];
-
- /* Thread a free chain through the newly-allocated nodes. */
- for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
- extra[i].next = &extra[i+1];
- extra[i].next = NULL;
-
- /* Add the new nodes to the free list. */
- d->arch.out_of_sync_free = &extra[0];
- }
-
- /* Allocate a new node from the quicklist. */
- f = d->arch.out_of_sync_free;
- d->arch.out_of_sync_free = f->next;
-
- return f;
-}
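shadow_alloc_oos_entry keeps a quicklist of free out-of-sync entries and, when it runs dry, allocates a whole block, threads a free chain through the new nodes, and records the block so it can be freed at teardown. The same pattern in self-contained form; the block size and node type are invented for the sketch.

#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 4              /* entries allocated per refill (assumed) */

struct entry {
    int payload;
    struct entry *next;
};

static struct entry *free_list;

static struct entry *alloc_entry(void)
{
    if ( free_list == NULL )
    {
        /* Refill: allocate a block and thread a free chain through it. */
        struct entry *block = calloc(BLOCK_SIZE, sizeof(*block));
        if ( block == NULL )
            return NULL;
        for ( int i = 0; i < BLOCK_SIZE - 1; i++ )
            block[i].next = &block[i + 1];
        block[BLOCK_SIZE - 1].next = NULL;
        free_list = &block[0];
        /* (the real code also links the block onto an "extras" list so the
         *  whole allocation can be released when the domain is torn down) */
    }

    struct entry *e = free_list;  /* pop one node off the quicklist */
    free_list = e->next;
    return e;
}

int main(void)
{
    for ( int i = 0; i < 6; i++ )     /* forces a refill part-way through */
    {
        struct entry *e = alloc_entry();
        printf("got entry %p\n", (void *)e);
    }
    return 0;
}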
-
-static inline unsigned long
-shadow_make_snapshot(
- struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
- unsigned long smfn, sl1mfn = 0;
- void *original, *snapshot;
- u32 min_max = 0;
- int min, max, length;
-
- if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
- {
- ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
- return SHADOW_SNAPSHOT_ELSEWHERE;
- }
-
- perfc_incrc(shadow_make_snapshot);
-
- if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
- {
- printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
- "Dom%d snapshot_count_count=%d\n",
- gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
- BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
- }
-
- if ( !get_shadow_ref(smfn) )
- BUG();
-
- if ( shadow_mode_refcounts(d) &&
- (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
- min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
- mfn_to_page(smfn)->tlbflush_timestamp = min_max;
-
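- // Copy only the slice of the guest page bounded by the recorded
- // min/max L1 indices into the snapshot.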
- min = SHADOW_MIN(min_max);
- max = SHADOW_MAX(min_max);
- length = max - min + 1;
- perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
-
- min *= sizeof(l1_pgentry_t);
- length *= sizeof(l1_pgentry_t);
-
- original = map_domain_page(gmfn);
- snapshot = map_domain_page(smfn);
- memcpy(snapshot + min, original + min, length);
- unmap_domain_page(original);
- unmap_domain_page(snapshot);
-
- return smfn;
-}
-
-static void
-shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
-{
- void *snapshot;
-
- if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
- return;
-
- // Clear the out_of_sync bit.
- //
- clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
-
- // XXX Need to think about how to protect the domain's
- // information less expensively.
- //
- snapshot = map_domain_page(entry->snapshot_mfn);
- memset(snapshot, 0, PAGE_SIZE);
- unmap_domain_page(snapshot);
-
- put_shadow_ref(entry->snapshot_mfn);
-}
-
-struct out_of_sync_entry *
-__shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
- unsigned long mfn)
-{
- struct domain *d = v->domain;
- struct page_info *page = mfn_to_page(mfn);
- struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(mfn_valid(mfn));
-
-#ifndef NDEBUG
- {
- u32 type = page->u.inuse.type_info & PGT_type_mask;
- if ( shadow_mode_refcounts(d) )
- {
- ASSERT(type == PGT_writable_page);
- }
- else
- {
- ASSERT(type && (type < PGT_l4_page_table));
- }
- }
-#endif
-
- FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08lx", __func__,
- gpfn, mfn, page->count_info, page->u.inuse.type_info);
-
- // XXX this will require some more thought... Cross-domain sharing and
- // modification of page tables? Hmm...
- //
- if ( d != page_get_owner(page) )
- BUG();
-
- perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
-
- entry->v = v;
- entry->gpfn = gpfn;
- entry->gmfn = mfn;
- entry->writable_pl1e = -1;
-
-#if SHADOW_DEBUG
- mark_shadows_as_reflecting_snapshot(d, gpfn);
-#endif
-
- // increment guest's ref count to represent the entry in the
- // full shadow out-of-sync list.
- //
- get_page(page, d);
-
- return entry;
-}
-
-struct out_of_sync_entry *
-shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
- unsigned long mfn)
-{
- struct out_of_sync_entry *entry =
- __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
- struct domain *d = v->domain;
-
- entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
- // Add to the out-of-sync list
- //
- entry->next = d->arch.out_of_sync;
- d->arch.out_of_sync = entry;
-
- return entry;
-}
-
-void shadow_mark_va_out_of_sync(
- struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
-{
- struct out_of_sync_entry *entry =
- __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
- l2_pgentry_t sl2e;
- struct domain *d = v->domain;
-
- // We need the address of the shadow PTE that maps @va.
- // It might not exist yet. Make sure it's there.
- //
- __shadow_get_l2e(v, va, &sl2e);
- if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
- {
- // either this L1 isn't shadowed yet, or the shadow isn't linked into
- // the current L2.
- shadow_map_l1_into_current_l2(va);
- __shadow_get_l2e(v, va, &sl2e);
- }
- ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
-
- entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
- // NB: this is stored as a machine address.
- entry->writable_pl1e =
- l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
- ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
- entry->va = va;
-
- // Increment shadow's page count to represent the reference
- // inherent in entry->writable_pl1e
- //
- if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
- BUG();
-
- // Add to the out-of-sync list
- //
- entry->next = d->arch.out_of_sync;
- d->arch.out_of_sync = entry;
-
- FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
- __func__, va, entry->writable_pl1e);
-}
-
-/*
- * Returns 1 if the snapshot for @gpfn exists and its @index'th entry matches.
- * Returns 0 otherwise.
- */
-static int snapshot_entry_matches(
- struct domain *d, l1_pgentry_t *guest_pt,
- unsigned long gpfn, unsigned index)
-{
- unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
- l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
- int entries_match;
-
- perfc_incrc(snapshot_entry_matches_calls);
-
- if ( !smfn )
- return 0;
-
- snapshot = map_domain_page(smfn);
-
- if (__copy_from_user(&gpte, &guest_pt[index],
- sizeof(gpte))) {
- unmap_domain_page(snapshot);
- return 0;
- }
-
- // This could probably be smarter, but this is sufficient for
- // our current needs.
- //
- entries_match = !l1e_has_changed(gpte, snapshot[index],
- PAGE_FLAG_MASK);
-
- unmap_domain_page(snapshot);
-
-#ifdef PERF_COUNTERS
- if ( entries_match )
- perfc_incrc(snapshot_entry_matches_true);
-#endif
-
- return entries_match;
-}
-
-/*
- * Returns 1 if va's shadow mapping is out-of-sync.
- * Returns 0 otherwise.
- */
-int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
- unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
- unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
- l2_pgentry_t l2e;
- unsigned long l1pfn, l1mfn;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(VALID_M2P(l2pfn));
-
- perfc_incrc(shadow_out_of_sync_calls);
-
- if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
- !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable,
- l2pfn, l2_table_offset(va)) )
- return 1;
-
- __guest_get_l2e(v, va, &l2e);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return 0;
-
- l1pfn = l2e_get_pfn(l2e);
- l1mfn = gmfn_to_mfn(d, l1pfn);
-
- // If the l1 pfn is invalid, it can't be out of sync...
- if ( !VALID_MFN(l1mfn) )
- return 0;
-
- if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
- !snapshot_entry_matches(
- d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)],
- l1pfn, l1_table_offset(va)) )
- return 1;
-
- return 0;
-}
-
-#define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
-static inline unsigned long
-predict_writable_pte_page(struct domain *d, unsigned long gpfn)
-{
- return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
-}
-
-static inline void
-increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
-{
- unsigned long score = prediction & PGT_score_mask;
- int create = (score == 0);
-
- // saturating addition
- score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
- score = score ? score : PGT_score_mask;
-
- prediction = (prediction & PGT_mfn_mask) | score;
-
- //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
- set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0);
-
- if ( create )
- perfc_incr(writable_pte_predictions);
-}
-
-static inline void
-decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
-{
- unsigned long score = prediction & PGT_score_mask;
- ASSERT(score);
-
- // divide score by 2... We don't like bad predictions.
- //
- score = (score >> 1) & PGT_score_mask;
-
- prediction = (prediction & PGT_mfn_mask) | score;
-
- //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
-
- if ( score )
- set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0);
- else
- {
- delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred, 0);
- perfc_decr(writable_pte_predictions);
- }
-}
-
-static void
-free_writable_pte_predictions(struct domain *d)
-{
- int i;
- struct shadow_status *x;
-
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- u32 count;
- unsigned long *gpfn_list;
-
- /* Skip empty buckets. */
- if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
- continue;
-
- count = 0;
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
- if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
- count++;
-
- gpfn_list = xmalloc_array(unsigned long, count);
- count = 0;
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
- if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
- gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
-
- while ( count )
- {
- count--;
- /* delete_shadow_status() may do a shadow_audit(), so we need to
- * keep an accurate count of writable_pte_predictions to keep it
- * happy.
- */
- delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred, 0);
- perfc_decr(writable_pte_predictions);
- }
-
- xfree(gpfn_list);
- }
-}
-
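- // Clear _PAGE_RW in a single shadow L1 entry, adjusting the shadow
- // refcounts for L1 shadows, and report whether max_refs_to_find
- // entries have now been downgraded.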
-static int fix_entry(
- struct domain *d,
- l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
-{
- l1_pgentry_t old = *pt;
- l1_pgentry_t new = old;
-
- l1e_remove_flags(new,_PAGE_RW);
- if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
- BUG();
- (*found)++;
- *pt = new;
- if ( is_l1_shadow )
- shadow_put_page_from_l1e(old, d);
-
- return (*found == max_refs_to_find);
-}
-
-static u32 remove_all_write_access_in_ptpage(
- struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
- unsigned long readonly_gpfn, unsigned long readonly_gmfn,
- u32 max_refs_to_find, unsigned long prediction)
-{
- l1_pgentry_t *pt = map_domain_page(pt_mfn);
- l1_pgentry_t match;
- unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
- int i;
- u32 found = 0;
- int is_l1_shadow =
- ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
- PGT_l1_shadow);
-
- match = l1e_from_pfn(readonly_gmfn, flags);
-
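- // Fast path for external mode: the back pointer recorded in the guest
- // page's type_info gives the likely L1 slot of the writable mapping,
- // so try that slot before scanning the whole table.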
- if ( shadow_mode_external(d) ) {
- i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
- >> PGT_va_shift;
-
- if ( (i >= 0 && i < L1_PAGETABLE_ENTRIES) &&
- !l1e_has_changed(pt[i], match, flags) &&
- fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) &&
- !prediction )
- goto out;
- }
-
- for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- {
- if ( unlikely(!l1e_has_changed(pt[i], match, flags)) &&
- fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) )
- break;
- }
-
-out:
- unmap_domain_page(pt);
-
- return found;
-}
-
-int shadow_remove_all_write_access(
- struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
-{
- int i;
- struct shadow_status *a;
- u32 found = 0, write_refs;
- unsigned long predicted_smfn;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(VALID_MFN(readonly_gmfn));
-
- perfc_incrc(remove_write_access);
-
- // If it's not a writable page, then no writable refs can be outstanding.
- //
- if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
- PGT_writable_page )
- {
- perfc_incrc(remove_write_not_writable);
- return 1;
- }
-
- // How many outstanding writable PTEs for this page are there?
- //
- write_refs =
- (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
- if ( write_refs && MFN_PINNED(readonly_gmfn) )
- {
- write_refs--;
- }
-
- if ( write_refs == 0 )
- {
- perfc_incrc(remove_write_no_work);
- return 1;
- }
-
- if ( shadow_mode_external(d) ) {
- if (--write_refs == 0)
- return 0;
-
- // Use the back pointer to locate the shadow page that can contain
- // the PTE of interest
- if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
- found += remove_all_write_access_in_ptpage(
- d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
- if ( found == write_refs )
- return 0;
- }
- }
-
- // Search all the shadow L1 page tables...
- //
- for (i = 0; i < shadow_ht_buckets; i++)
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
- {
- found += remove_all_write_access_in_ptpage(
- d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn,
- readonly_gpfn, readonly_gmfn, write_refs - found,
- a->gpfn_and_flags & PGT_mfn_mask);
- if ( found == write_refs )
- return 0;
- }
-
- a = a->next;
- }
- }
-
- FSH_LOG("%s: looking for %d refs, found %d refs",
- __func__, write_refs, found);
-
- return 0;
-}
-
-static u32 remove_all_access_in_page(
- struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
-{
- l1_pgentry_t *pl1e = map_domain_page(l1mfn);
- l1_pgentry_t match, ol2e;
- unsigned long flags = _PAGE_PRESENT;
- int i;
- u32 count = 0;
- int is_l1_shadow =
- ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
- PGT_l1_shadow);
-
- match = l1e_from_pfn(forbidden_gmfn, flags);
-
- for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- {
- if ( l1e_has_changed(pl1e[i], match, flags) )
- continue;
-
- ol2e = pl1e[i];
- pl1e[i] = l1e_empty();
- count++;
-
- if ( is_l1_shadow )
- shadow_put_page_from_l1e(ol2e, d);
- else /* must be an hl2 page */
- put_page(mfn_to_page(forbidden_gmfn));
- }
-
- unmap_domain_page(pl1e);
-
- return count;
-}
-
-u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
-{
- int i;
- struct shadow_status *a;
- u32 count = 0;
-
- if ( unlikely(!shadow_mode_enabled(d)) )
- return 0;
-
- ASSERT(shadow_lock_is_acquired(d));
- perfc_incrc(remove_all_access);
-
- for (i = 0; i < shadow_ht_buckets; i++)
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- switch (a->gpfn_and_flags & PGT_type_mask)
- {
- case PGT_l1_shadow:
- case PGT_l2_shadow:
- case PGT_l3_shadow:
- case PGT_l4_shadow:
- case PGT_hl2_shadow:
- count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- // these can't hold refs to the forbidden page
- break;
- default:
- BUG();
- }
-
- a = a->next;
- }
- }
-
- return count;
-}
-
-static int resync_all(struct domain *d, u32 stype)
-{
- struct out_of_sync_entry *entry;
- unsigned i;
- unsigned long smfn;
- void *guest, *shadow, *snapshot;
- int need_flush = 0, external = shadow_mode_external(d);
- int unshadow;
- int changed;
- u32 min_max_shadow, min_max_snapshot;
- int min_shadow, max_shadow, min_snapshot, max_snapshot;
- struct vcpu *v;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
- {
- if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
- continue;
-
- smfn = __shadow_status(d, entry->gpfn, stype);
-
- if ( !smfn )
- {
- // For heavyweight shadows: no need to update refcounts if
- // there's no shadow page.
- //
- if ( shadow_mode_refcounts(d) )
- continue;
-
- // For lightweight shadows: we only need to resync the refcounts
- // against the new contents of the guest page iff it has the right
- // page type.
- //
- if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
- continue;
- }
-
- FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
- stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
-
- // Compare guest's new contents to its snapshot, validating
- // and updating its shadow as appropriate.
- //
- guest = map_domain_page(entry->gmfn);
- snapshot = map_domain_page(entry->snapshot_mfn);
-
- if ( smfn )
- shadow = map_domain_page(smfn);
- else
- shadow = NULL;
-
- unshadow = 0;
-
- switch ( stype ) {
- case PGT_l1_shadow:
- {
- l1_pgentry_t *guest1 = guest;
- l1_pgentry_t *shadow1 = shadow;
- l1_pgentry_t *snapshot1 = snapshot;
- int unshadow_l1 = 0;
-
- ASSERT(shadow_mode_write_l1(d) ||
- shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-
- if ( !shadow_mode_refcounts(d) )
- revalidate_l1(d, guest1, snapshot1);
-
- if ( !smfn )
- break;
-
- min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
- min_shadow = SHADOW_MIN(min_max_shadow);
- max_shadow = SHADOW_MAX(min_max_shadow);
-
- min_max_snapshot =
- mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
- min_snapshot = SHADOW_MIN(min_max_snapshot);
- max_snapshot = SHADOW_MAX(min_max_snapshot);
-
- changed = 0;
-
- for ( i = min_shadow; i <= max_shadow; i++ )
- {
- if ( (i < min_snapshot) || (i > max_snapshot) ||
- l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
- {
- int error;
-
- error = validate_pte_change(d, guest1[i], &shadow1[i]);
- if ( error == -1 )
- unshadow_l1 = 1;
- else {
- need_flush |= error;
- if ( l1e_get_flags(shadow1[i]) & _PAGE_PRESENT )
- set_guest_back_ptr(d, shadow1[i], smfn, i);
- }
-
- // can't update snapshots of linear page tables -- they
- // are used multiple times...
- //
- // snapshot[i] = new_pte;
- changed++;
- }
- }
- perfc_incrc(resync_l1);
- perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
- perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
- if (unshadow_l1) {
- l2_pgentry_t l2e;
-
- __shadow_get_l2e(entry->v, entry->va, &l2e);
- if (l2e_get_flags(l2e) & _PAGE_PRESENT) {
- put_shadow_ref(l2e_get_pfn(l2e));
- l2e = l2e_empty();
- __shadow_set_l2e(entry->v, entry->va, l2e);
-
- if (entry->v == current)
- need_flush = 1;
- }
- }
-
- break;
- }
- case PGT_l2_shadow:
- {
- int max = -1;
-
- l2_pgentry_t *guest2 = guest;
- l2_pgentry_t *shadow2 = shadow;
- l2_pgentry_t *snapshot2 = snapshot;
-
- ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
- BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
- changed = 0;
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- l2_pgentry_t new_pde = guest2[i];
-
- if ( !is_guest_l2_slot(0,i) && !external )
- continue;
-
- if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
- {
- need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
-
- // can't update snapshots of linear page tables -- they
- // are used multiple times...
- //
- // snapshot[i] = new_pde;
-
- changed++;
- }
- if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
- max = i;
-
- // XXX - This hack works for Linux guests.
- // Need a better solution long term.
- if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
- unlikely(l2e_get_intpte(new_pde) != 0) &&
- !unshadow && MFN_PINNED(smfn) )
- unshadow = 1;
- }
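- // A guest L2 with no non-zero entries is treated as no longer being
- // used as a page table, so mark it for unshadowing.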
- if ( max == -1 )
- unshadow = 1;
- perfc_incrc(resync_l2);
- perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
- break;
- }
- case PGT_hl2_shadow:
- {
- l2_pgentry_t *guest2 = guest;
- l2_pgentry_t *snapshot2 = snapshot;
- l1_pgentry_t *shadow2 = shadow;
-
- ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
- BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
- changed = 0;
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- {
- l2_pgentry_t new_pde = guest2[i];
-
- if ( !is_guest_l2_slot(0, i) && !external )
- continue;
-
- if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
- {
- need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
-
- // can't update snapshots of linear page tables -- they
- // are used multiple times...
- //
- // snapshot[i] = new_pde;
-
- changed++;
- }
- }
- perfc_incrc(resync_hl2);
- perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
- break;
- }
- default:
- BUG();
- }
-
- if ( smfn )
- unmap_domain_page(shadow);
- unmap_domain_page(snapshot);
- unmap_domain_page(guest);
-
- if ( unlikely(unshadow) )
- {
- for_each_vcpu(d, v)
- if(smfn == pagetable_get_pfn(v->arch.shadow_table))
- return need_flush;
- perfc_incrc(unshadow_l2_count);
- shadow_unpin(smfn);
- if ( unlikely(shadow_mode_external(d)) )
- {
- unsigned long hl2mfn;
-
- if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
- MFN_PINNED(hl2mfn) )
- shadow_unpin(hl2mfn);
- }
- }
- }
-
- return need_flush;
-}
-
-void __shadow_sync_all(struct domain *d)
-{
- struct out_of_sync_entry *entry;
- int need_flush = 0;
- l1_pgentry_t *ppte, opte, npte;
- cpumask_t other_vcpus_mask;
-
- perfc_incrc(shadow_sync_all);
-
- ASSERT(shadow_lock_is_acquired(d));
-
- // First, remove all write permissions to the page tables
- //
- for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
- {
- // Skip entries that have low bits set... Those aren't
- // real PTEs.
- //
- if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
- continue;
-
- ppte = (l1_pgentry_t *)(
- (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
- (entry->writable_pl1e & ~PAGE_MASK));
- opte = npte = *ppte;
- l1e_remove_flags(npte, _PAGE_RW);
-
- if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(npte, d) )
- BUG();
- *ppte = npte;
- set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
- (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
- shadow_put_page_from_l1e(opte, d);
-
- unmap_domain_page(ppte);
- }
-
- /* Other VCPUs mustn't use the revoked writable mappings. */
- other_vcpus_mask = d->domain_dirty_cpumask;
- cpu_clear(smp_processor_id(), other_vcpus_mask);
- flush_tlb_mask(other_vcpus_mask);
-
- /* Flush ourself later. */
- need_flush = 1;
-
- /* Second, resync all L1 pages, then L2 pages, etc... */
- need_flush |= resync_all(d, PGT_l1_shadow);
- if ( shadow_mode_translate(d) )
- need_flush |= resync_all(d, PGT_hl2_shadow);
- need_flush |= resync_all(d, PGT_l2_shadow);
-
- if ( need_flush && !unlikely(shadow_mode_external(d)) )
- local_flush_tlb();
-
- free_out_of_sync_state(d);
-}
-
-int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
-{
- l1_pgentry_t gpte, spte, orig_gpte;
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l2_pgentry_t gpde;
-
- spte = l1e_empty();
-
- SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
- va, (unsigned long)regs->error_code);
- perfc_incrc(shadow_fault_calls);
-
- check_pagetable(v, "pre-sf");
-
- /*
- * Don't let someone else take the guest's table pages out-of-sync.
- */
- shadow_lock(d);
-
- /* XXX - FIX THIS COMMENT!!!
- * STEP 1. Check to see if this fault might have been caused by an
- * out-of-sync table page entry, or if we should pass this
- * fault onto the guest.
- */
- __shadow_sync_va(v, va);
-
- /*
- * STEP 2. Check the guest PTE.
- */
- __guest_get_l2e(v, va, &gpde);
- if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
- {
- SH_VVLOG("shadow_fault - EXIT: L2 not present (%x)",
- l2e_get_intpte(gpde));
- perfc_incrc(shadow_fault_bail_pde_not_present);
- goto fail;
- }
-
- // This can't fault because we hold the shadow lock and we've ensured that
- // the mapping is in-sync, so the check of the PDE's present bit, above,
- // covers this access.
- //
- if ( __copy_from_user(&gpte,
- &linear_pg_table[l1_linear_offset(va)],
- sizeof(gpte)) ) {
- printk("%s() failed, crashing domain %d "
- "due to a unaccessible linear page table (gpde=%" PRIpte "), va=%lx\n",
- __func__, d->domain_id, l2e_get_intpte(gpde), va);
- domain_crash_synchronous();
- }
- orig_gpte = gpte;
-
- if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
- {
- SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ") (gpde %" PRIpte ")",
- l1e_get_intpte(gpte),
- l2e_get_intpte(gpde));
- perfc_incrc(shadow_fault_bail_pte_not_present);
- goto fail;
- }
-
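- // Page-fault error code bits: bit 0 = protection violation (page was
- // present), bit 1 = write access, bit 2 = fault while in user mode.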
- /* Write fault? */
- if ( regs->error_code & 2 )
- {
- int allow_writes = 0;
-
- if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
- {
- if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
- {
- allow_writes = 1;
- l1e_add_flags(gpte, _PAGE_RW);
- }
- else
- {
- /* Write fault on a read-only mapping. */
- SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
- l1e_get_intpte(gpte));
- perfc_incrc(shadow_fault_bail_ro_mapping);
- goto fail;
- }
- }
- else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
- {
- SH_LOG("l1pte_write_fault: no write access to page table page");
- domain_crash_synchronous();
- }
-
- /* User access violation in guest? */
- if ( unlikely((regs->error_code & 4) &&
- !(l1e_get_flags(gpte) & _PAGE_USER)))
- {
- SH_VVLOG("shadow_fault - EXIT: wr fault on super page (%" PRIpte ")",
- l1e_get_intpte(gpte));
- goto fail;
-
- }
-
- if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
- {
- SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
- perfc_incrc(write_fault_bail);
- shadow_unlock(d);
- return 0;
- }
-
- if ( allow_writes )
- l1e_remove_flags(gpte, _PAGE_RW);
- }
- else
- {
- /* Read-protection violation in guest? */
- if ( unlikely((regs->error_code & 1) ))
- {
- SH_VVLOG("shadow_fault - EXIT: read fault on super page (%" PRIpte ")",
- l1e_get_intpte(gpte));
- goto fail;
-
- }
-
-
- if ( !l1pte_read_fault(d, &gpte, &spte) )
- {
- SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
- perfc_incrc(read_fault_bail);
- shadow_unlock(d);
- return 0;
- }
- }
-
- /*
- * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
- */
- if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
- {
- /* XXX Watch out for read-only L2 entries! (not used in Linux). */
- if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
- &gpte, sizeof(gpte))) )
- {
- printk("%s() failed, crashing domain %d "
- "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
- __func__,d->domain_id, l2e_get_intpte(gpde), va);
- domain_crash_synchronous();
- }
-
- __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
- }
-
- shadow_set_l1e(va, spte, 1);
-
- perfc_incrc(shadow_fault_fixed);
- d->arch.shadow_fault_count++;
-
- shadow_unlock(d);
-
- check_pagetable(v, "post-sf");
- return EXCRET_fault_fixed;
-
- fail:
- shadow_unlock(d);
- return 0;
-}
-
-void shadow_l1_normal_pt_update(
- struct domain *d,
- unsigned long pa, l1_pgentry_t gpte,
- struct domain_mmap_cache *cache)
-{
- unsigned long sl1mfn;
- l1_pgentry_t *spl1e, spte;
-
- shadow_lock(d);
-
- sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
- if ( sl1mfn )
- {
- SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
- (void *)pa, l1e_get_intpte(gpte));
- l1pte_propagate_from_guest(current->domain, gpte, &spte);
-
- spl1e = map_domain_page_with_cache(sl1mfn, cache);
- spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
- unmap_domain_page_with_cache(spl1e, cache);
- }
-
- shadow_unlock(d);
-}
-
-void shadow_l2_normal_pt_update(
- struct domain *d,
- unsigned long pa, l2_pgentry_t gpde,
- struct domain_mmap_cache *cache)
-{
- unsigned long sl2mfn, hl2mfn;
- l2_pgentry_t *spl2e;
- l1_pgentry_t *hl2e;
-
- shadow_lock(d);
-
- sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
- if ( sl2mfn )
- {
- SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
- (void *)pa, l2e_get_intpte(gpde));
- spl2e = map_domain_page_with_cache(sl2mfn, cache);
- validate_pde_change(d, gpde,
- &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
- unmap_domain_page_with_cache(spl2e, cache);
- }
- hl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT,
- PGT_hl2_shadow);
- if ( hl2mfn )
- {
- hl2e = map_domain_page(hl2mfn);
- validate_hl2e_change(d, gpde,
- &hl2e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)]);
- unmap_domain_page(hl2e);
- }
-
- shadow_unlock(d);
-}
-
-#if CONFIG_PAGING_LEVELS >= 3
-void shadow_l3_normal_pt_update(
- struct domain *d,
- unsigned long pa, l3_pgentry_t gpde,
- struct domain_mmap_cache *cache)
-{
- BUG(); // not yet implemented
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-void shadow_l4_normal_pt_update(
- struct domain *d,
- unsigned long pa, l4_pgentry_t gpde,
- struct domain_mmap_cache *cache)
-{
- BUG(); // not yet implemented
-}
-#endif
-
-int shadow_do_update_va_mapping(unsigned long va,
- l1_pgentry_t val,
- struct vcpu *v)
-{
- struct domain *d = v->domain;
- l1_pgentry_t spte;
- int rc = 0;
-
- shadow_lock(d);
-
- // This is actually overkill - we don't need to sync the L1 itself,
- // just everything involved in getting to this L1 (i.e. we need
- // linear_pg_table[l1_linear_offset(va)] to be in sync)...
- //
- __shadow_sync_va(v, va);
-
- l1pte_propagate_from_guest(d, val, &spte);
- shadow_set_l1e(va, spte, 0);
-
- /*
- * If we're in log-dirty mode then we need to note that we've updated
- * the PTE in the PT-holding page. We need the machine frame number
- * for this.
- */
- __mark_dirty(d, va_to_l1mfn(v, va));
-
- shadow_unlock(d);
-
- return rc;
-}
-
-
-/*
- * What lives where in the 32-bit address space in the various shadow modes,
- * and what it uses to get/maintain that mapping.
- *
- * SHADOW MODE: none enable translate external
- *
- * 4KB things:
- * guest_vtable lin_l2 mapped per gl2 lin_l2 via hl2 mapped per gl2
- * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gl2
- * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gl2
- * monitor_vtable n/a n/a n/a mapped once
- *
- * 4MB things:
- * guest_linear lin via gl2 lin via gl2 lin via hl2 lin via hl2
- * shadow_linear n/a sh_lin via sl2 sh_lin via sl2 sh_lin via sl2
- * monitor_linear n/a n/a n/a ???
- * perdomain perdomain perdomain perdomain perdomain
- * R/O M2P R/O M2P R/O M2P n/a n/a
- * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P
- * P2M n/a n/a R/O M2P R/O M2P
- *
- * NB:
- * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
- * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
- * all play a part in maintaining these mappings.
- */
-void __update_pagetables(struct vcpu *v)
-{
- struct domain *d = v->domain;
- unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
- unsigned long gpfn = mfn_to_gmfn(d, gmfn);
- unsigned long smfn, hl2mfn, old_smfn;
- int need_sync = 0;
-
- int max_mode = ( shadow_mode_external(d) ? SHM_external
- : shadow_mode_translate(d) ? SHM_translate
- : shadow_mode_enabled(d) ? SHM_enable
- : 0 );
-
- ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
- ASSERT( max_mode );
-
- /*
- * arch.guest_vtable
- */
- if ( max_mode & (SHM_enable | SHM_external) )
- {
- if ( likely(v->arch.guest_vtable != NULL) )
- unmap_domain_page_global(v->arch.guest_vtable);
- v->arch.guest_vtable = map_domain_page_global(gmfn);
- }
-
- /*
- * arch.shadow_table
- */
- if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
- smfn = shadow_l2_table(d, gpfn, gmfn);
- else
- {
- /*
- * Do the sync later, in order to avoid this smfn occasionally
- * being unshadowed in the meantime.
- */
- need_sync = 1;
- }
- if ( !get_shadow_ref(smfn) )
- BUG();
- old_smfn = pagetable_get_pfn(v->arch.shadow_table);
- v->arch.shadow_table = pagetable_from_pfn(smfn);
- if ( old_smfn )
- put_shadow_ref(old_smfn);
-
- SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
-
- /*
- * arch.shadow_vtable
- */
- if ( max_mode == SHM_external )
- {
- if ( v->arch.shadow_vtable )
- unmap_domain_page_global(v->arch.shadow_vtable);
- v->arch.shadow_vtable = map_domain_page_global(smfn);
- }
-
- /*
- * arch.hl2_vtable
- */
-
- // if max_mode == SHM_translate, then the hl2 is already installed
- // correctly in its smfn, and there's nothing to do.
- //
- if ( max_mode == SHM_external )
- {
- if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
- hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
- if ( v->arch.hl2_vtable )
- unmap_domain_page_global(v->arch.hl2_vtable);
- v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
- }
-
- /*
- * fixup pointers in monitor table, as necessary
- */
- if ( max_mode == SHM_external )
- {
- l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
- l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
- l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
- ASSERT( shadow_mode_translate(d) );
-
- if ( !get_shadow_ref(hl2mfn) )
- BUG();
- mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
- if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
- put_shadow_ref(l2e_get_pfn(old_hl2e));
-
- if ( !get_shadow_ref(smfn) )
- BUG();
- mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
- l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
- if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
- put_shadow_ref(l2e_get_pfn(old_sl2e));
-
- // XXX - maybe this can be optimized somewhat??
- local_flush_tlb();
- }
-
- if ( likely(need_sync) )
- shadow_sync_all(d);
-}
-
-void clear_all_shadow_status(struct domain *d)
-{
- struct vcpu *v = current;
-
- /*
- * Don't clean up while other vcpus are working.
- */
- if ( v->vcpu_id )
- return;
-
- shadow_lock(d);
-
- free_shadow_pages(d);
- free_shadow_ht_entries(d);
- d->arch.shadow_ht =
- xmalloc_array(struct shadow_status, shadow_ht_buckets);
- if ( d->arch.shadow_ht == NULL ) {
- printk("clear all shadow status: xmalloc failed\n");
- domain_crash_synchronous();
- }
- memset(d->arch.shadow_ht, 0,
- shadow_ht_buckets * sizeof(struct shadow_status));
-
- free_out_of_sync_entries(d);
-
- shadow_unlock(d);
-}
-
-/************************************************************************/
-/************************************************************************/
-/************************************************************************/
-
-#if SHADOW_DEBUG
-
-// The following is entirely for _check_pagetable()'s benefit.
-// _check_pagetable() wants to know whether a given entry in a
-// shadow page table is supposed to be the shadow of the guest's
-// current entry, or the shadow of the entry held in the snapshot
-// taken above.
-//
-// Here, we mark all currently existing entries as reflecting
-// the snapshot, above. All other places in xen that update
-// the shadow will keep the shadow in sync with the guest's
-// entries (via l1pte_propagate_from_guest and friends), which clear
-// the SHADOW_REFLECTS_SNAPSHOT bit.
-//
-static void
-mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
-{
- unsigned long smfn;
- l1_pgentry_t *l1e;
- l2_pgentry_t *l2e;
- unsigned i;
-
- if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
- {
- l1e = map_domain_page(smfn);
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- if ( is_guest_l1_slot(i) &&
- (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
- l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
- unmap_domain_page(l1e);
- }
-
- if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
- {
- l2e = map_domain_page(smfn);
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- if ( is_guest_l2_slot(0, i) &&
- (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
- l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
- unmap_domain_page(l2e);
- }
-}
-
-// BUG: these are not SMP safe...
-static int sh_l2_present;
-static int sh_l1_present;
-static char *sh_check_name;
-int shadow_status_noswap;
-
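- // v2m(): walk the shadow linear tables to turn a hypervisor virtual
- // address into a machine address, for the diagnostic output below.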
-#define v2m(_v, _adr) ({ \
- unsigned long _a = (unsigned long)(_adr); \
- l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \
- unsigned long _pa = -1; \
- if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \
- { \
- l1_pgentry_t _pte; \
- _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \
- if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \
- _pa = l1e_get_paddr(_pte); \
- } \
- _pa | (_a & ~PAGE_MASK); \
-})
-
-#define FAIL(_f, _a...) \
- do { \
- printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \
- sh_check_name, level, l2_idx, l1_idx, ## _a, \
- __FILE__, __LINE__); \
- printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \
- " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \
- " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \
- " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \
- l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \
- l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \
- p_guest_pte, p_shadow_pte, p_snapshot_pte, \
- (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \
- (void *)v2m(v, p_snapshot_pte), \
- (l2_idx << L2_PAGETABLE_SHIFT) | \
- (l1_idx << L1_PAGETABLE_SHIFT)); \
- errors++; \
- } while ( 0 )
-
-static int check_pte(
- struct vcpu *v,
- l1_pgentry_t *p_guest_pte,
- l1_pgentry_t *p_shadow_pte,
- l1_pgentry_t *p_snapshot_pte,
- int level, int l2_idx, int l1_idx)
-{
- struct domain *d = v->domain;
- l1_pgentry_t guest_pte = *p_guest_pte;
- l1_pgentry_t shadow_pte = *p_shadow_pte;
- l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
- l1_pgentry_t eff_guest_pte = l1e_empty();
- unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
- int errors = 0, guest_writable;
- int page_table_page;
-
- if ( (l1e_get_intpte(shadow_pte) == 0) ||
- (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
- (l1e_get_intpte(shadow_pte) == 0x00000E00) )
- return errors; /* always safe */
-
- if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
- FAIL("Non zero not present shadow_pte");
-
- if ( level == 2 ) sh_l2_present++;
- if ( level == 1 ) sh_l1_present++;
-
- if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
- eff_guest_pte = snapshot_pte;
- else
- eff_guest_pte = guest_pte;
-
- if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
- FAIL("Guest not present yet shadow is");
-
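- // Compare only the flag bits that must agree between the shadow and the
- // (effective) guest PTE; the frame number and the bits the shadow code
- // legitimately changes are masked out.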
- mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
-
- if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
- FAIL("Corrupt?");
-
- if ( (level == 1) &&
- (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
- !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
- FAIL("Dirty coherence");
-
- if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
- !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
- FAIL("Accessed coherence");
-
- if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
- FAIL("global bit set in shadow");
-
- eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
- eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
- shadow_mfn = l1e_get_pfn(shadow_pte);
-
- if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
- FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
- __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
-
- page_table_page = mfn_is_page_table(eff_guest_mfn);
-
- guest_writable =
- (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
- (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
-
- if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
- {
- printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
- eff_guest_pfn, eff_guest_mfn, shadow_mfn,
- mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
- page_table_page);
- FAIL("RW coherence");
- }
-
- if ( (level == 1) &&
- (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
- !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
- {
- printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
- eff_guest_pfn, eff_guest_mfn, shadow_mfn,
- mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
- page_table_page);
- FAIL("RW2 coherence");
- }
-
- if ( eff_guest_mfn == shadow_mfn )
- {
- if ( level > 1 )
- FAIL("Linear map ???"); /* XXX this will fail on BSD */
- }
- else
- {
- if ( level < 2 )
- FAIL("Shadow in L1 entry?");
-
- if ( level == 2 )
- {
- if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
- FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
- __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
- }
- else
- BUG(); // XXX -- not handled yet.
- }
-
- return errors;
-}
-#undef FAIL
-#undef v2m
-
-static int check_l1_table(
- struct vcpu *v, unsigned long gpfn,
- unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
-{
- struct domain *d = v->domain;
- int i;
- unsigned long snapshot_mfn;
- l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
- int errors = 0;
-
- if ( page_out_of_sync(mfn_to_page(gmfn)) )
- {
- snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
- ASSERT(snapshot_mfn);
- p_snapshot = map_domain_page(snapshot_mfn);
- }
-
- p_guest = map_domain_page(gmfn);
- p_shadow = map_domain_page(smfn);
-
- for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
- errors += check_pte(v, p_guest+i, p_shadow+i,
- p_snapshot ? p_snapshot+i : NULL,
- 1, l2_idx, i);
-
- unmap_domain_page(p_shadow);
- unmap_domain_page(p_guest);
- if ( p_snapshot )
- unmap_domain_page(p_snapshot);
-
- return errors;
-}
-
-#define FAILPT(_f, _a...) \
- do { \
- printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
- errors++; \
- } while ( 0 )
-
-int check_l2_table(
- struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
-{
- struct domain *d = v->domain;
- l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
- l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
- l2_pgentry_t match;
- int i;
- int errors = 0;
- int limit;
-
- if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
- FAILPT("domain doesn't own page");
- if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
- FAILPT("bogus owner for snapshot page");
- if ( page_get_owner(mfn_to_page(smfn)) != NULL )
- FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
- smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
-
-#if 0
- if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
- ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
- DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
- {
- for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
- i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
- i++ )
- printk("+++ (%d) %lx %lx\n",i,
- l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
- FAILPT("hypervisor entries inconsistent");
- }
-
- if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
- l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
- FAILPT("hypervisor linear map inconsistent");
-#endif
-
- match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
- if ( !shadow_mode_external(d) &&
- l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
- match, PAGE_FLAG_MASK))
- {
- FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
- l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
- L2_PAGETABLE_SHIFT]),
- l2e_get_intpte(match));
- }
-
- match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
- if ( !shadow_mode_external(d) &&
- l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
- match, PAGE_FLAG_MASK))
- {
- FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
- l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
- d->arch.mm_perdomain_pt,
- l2e_get_intpte(match));
- }
-
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
- /* Check the whole L2. */
- for ( i = 0; i < limit; i++ )
- errors += check_pte(v,
- (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
- (l1_pgentry_t*)(&spl2e[i]),
- NULL,
- 2, i, 0);
-
- unmap_domain_page(spl2e);
- unmap_domain_page(gpl2e);
-
-#if 1
- if ( errors )
- printk("check_l2_table returning %d errors\n", errors);
-#endif
-
- return errors;
-}
-#undef FAILPT
-
-int _check_pagetable(struct vcpu *v, char *s)
-{
- struct domain *d = v->domain;
- pagetable_t pt = v->arch.guest_table;
- unsigned long gptbase = pagetable_get_paddr(pt);
- unsigned long ptbase_pfn, smfn;
- unsigned long i;
- l2_pgentry_t *gpl2e, *spl2e;
- unsigned long ptbase_mfn = 0;
- int errors = 0, limit, oos_pdes = 0;
-
- //_audit_domain(d, AUDIT_QUIET);
- shadow_lock(d);
-
- sh_check_name = s;
- //SH_VVLOG("%s-PT Audit", s);
- sh_l2_present = sh_l1_present = 0;
- perfc_incrc(check_pagetable);
-
- ptbase_mfn = gptbase >> PAGE_SHIFT;
- ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
-
- if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
- {
- printk("%s-PT %lx not shadowed\n", s, gptbase);
- goto out;
- }
- if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
- {
- ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
- oos_pdes = 1;
- ASSERT(ptbase_mfn);
- }
-
- errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
-
- gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
- spl2e = (l2_pgentry_t *) map_domain_page(smfn);
-
- /* Go back and recurse. */
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
- for ( i = 0; i < limit; i++ )
- {
- unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
- unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
- unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
-
- if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */
- {
- errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
- }
- }
-
- unmap_domain_page(spl2e);
- unmap_domain_page(gpl2e);
-
- out:
- if ( errors )
- BUG();
-
- shadow_unlock(d);
-
- return errors;
-}
-
-int _check_all_pagetables(struct vcpu *v, char *s)
-{
- struct domain *d = v->domain;
- int i;
- struct shadow_status *a;
- unsigned long gmfn;
- int errors = 0;
-
- shadow_status_noswap = 1;
-
- sh_check_name = s;
- SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
- sh_l2_present = sh_l1_present = 0;
- perfc_incrc(check_all_pagetables);
-
- for (i = 0; i < shadow_ht_buckets; i++)
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
-
- switch ( a->gpfn_and_flags & PGT_type_mask )
- {
- case PGT_l1_shadow:
- errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
- gmfn, a->smfn, 0);
- break;
- case PGT_l2_shadow:
- errors += check_l2_table(v, gmfn, a->smfn,
- page_out_of_sync(mfn_to_page(gmfn)));
- break;
- case PGT_l3_shadow:
- case PGT_l4_shadow:
- case PGT_hl2_shadow:
- BUG(); // XXX - ought to fix this...
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- break;
- default:
- errors++;
- printk("unexpected shadow type %lx, gpfn=%lx, "
- "gmfn=%lx smfn=%lx\n",
- a->gpfn_and_flags & PGT_type_mask,
- a->gpfn_and_flags & PGT_mfn_mask,
- gmfn, a->smfn);
- BUG();
- }
- a = a->next;
- }
- }
-
- shadow_status_noswap = 0;
-
- if ( errors )
- BUG();
-
- return errors;
-}
-
-#endif // SHADOW_DEBUG
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/arch/x86/shadow_guest32.c b/xen/arch/x86/shadow_guest32.c
deleted file mode 100644
index bdc58257cd..0000000000
--- a/xen/arch/x86/shadow_guest32.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#define GUEST_PGENTRY_32
-
-#include "shadow.c"
-struct shadow_ops MODE_64_2_HANDLER = {
- .guest_paging_levels = 2,
- .invlpg = shadow_invlpg_64,
- .fault = shadow_fault_64,
- .update_pagetables = shadow_update_pagetables,
- .sync_all = sync_all,
- .remove_all_write_access = remove_all_write_access,
- .do_update_va_mapping = do_update_va_mapping,
- .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
- .is_out_of_sync = is_out_of_sync,
- .gva_to_gpa = gva_to_gpa_64,
-};
-
diff --git a/xen/arch/x86/shadow_guest32pae.c b/xen/arch/x86/shadow_guest32pae.c
deleted file mode 100644
index 432c9b9cb4..0000000000
--- a/xen/arch/x86/shadow_guest32pae.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#define GUEST_32PAE
-
-#include "shadow.c"
-struct shadow_ops MODE_64_PAE_HANDLER = {
- .guest_paging_levels = 3,
- .invlpg = shadow_invlpg_64,
- .fault = shadow_fault_64,
- .update_pagetables = shadow_update_pagetables,
- .sync_all = sync_all,
- .remove_all_write_access = remove_all_write_access,
- .do_update_va_mapping = do_update_va_mapping,
- .mark_mfn_out_of_sync = mark_mfn_out_of_sync,
- .is_out_of_sync = is_out_of_sync,
- .gva_to_gpa = gva_to_gpa_64,
-};
-
diff --git a/xen/arch/x86/shadow_public.c b/xen/arch/x86/shadow_public.c
deleted file mode 100644
index 40aa22e4ea..0000000000
--- a/xen/arch/x86/shadow_public.c
+++ /dev/null
@@ -1,2143 +0,0 @@
-/******************************************************************************
- * arch/x86/shadow_public.c
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <xen/event.h>
-#include <xen/sched.h>
-#include <xen/trace.h>
-#include <xen/guest_access.h>
-#include <asm/shadow_64.h>
-
-static int alloc_p2m_table(struct domain *d);
-static void free_p2m_table(struct domain *d);
-
-#define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
-
-
-int shadow_direct_map_init(struct domain *d)
-{
- struct page_info *page;
- l3_pgentry_t *root;
-
- if ( !(page = alloc_domheap_pages(NULL, 0, MEMF_dma)) )
- return 0;
-
- root = map_domain_page(page_to_mfn(page));
- memset(root, 0, PAGE_SIZE);
- root[PAE_SHADOW_SELF_ENTRY] = l3e_from_page(page, __PAGE_HYPERVISOR);
-
- d->arch.phys_table = pagetable_from_page(page);
-
- unmap_domain_page(root);
- return 1;
-}
-
-void shadow_direct_map_clean(struct domain *d)
-{
- unsigned long mfn;
- l2_pgentry_t *l2e;
- l3_pgentry_t *l3e;
- int i, j;
-
- mfn = pagetable_get_pfn(d->arch.phys_table);
-
- /*
- * We may fail very early, before the direct map has been built.
- */
- if ( !mfn )
- return;
-
- l3e = (l3_pgentry_t *)map_domain_page(mfn);
-
- for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
- {
- if ( l3e_get_flags(l3e[i]) & _PAGE_PRESENT )
- {
- l2e = map_domain_page(l3e_get_pfn(l3e[i]));
-
- for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ )
- {
- if ( l2e_get_flags(l2e[j]) & _PAGE_PRESENT )
- free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[j])));
- }
- unmap_domain_page(l2e);
- free_domheap_page(mfn_to_page(l3e_get_pfn(l3e[i])));
- }
- }
- free_domheap_page(mfn_to_page(mfn));
-
- unmap_domain_page(l3e);
-
- d->arch.phys_table = pagetable_null();
-}
-
-/****************************************************************************/
-/************* export interface functions ***********************************/
-/****************************************************************************/
-void free_shadow_pages(struct domain *d);
-
-int shadow_set_guest_paging_levels(struct domain *d, int levels)
-{
- struct vcpu *v = current;
-
- /*
- * Need to wait for VCPU0 to complete the on-going shadow ops.
- */
-
- if ( v->domain == d && v->vcpu_id )
- return 1;
-
- shadow_lock(d);
-
- switch(levels) {
-#if CONFIG_PAGING_LEVELS == 4
- case 4:
- if ( d->arch.ops != &MODE_64_4_HANDLER )
- d->arch.ops = &MODE_64_4_HANDLER;
- shadow_unlock(d);
- return 1;
-#endif
-#if CONFIG_PAGING_LEVELS == 3
- case 3:
- if ( d->arch.ops == NULL ||
- shadow_mode_log_dirty(d) )
- {
- if ( d->arch.ops != &MODE_64_3_HANDLER )
- d->arch.ops = &MODE_64_3_HANDLER;
- }
- else
- {
- if ( d->arch.ops == &MODE_64_2_HANDLER )
- free_shadow_pages(d);
- if ( d->arch.ops != &MODE_64_PAE_HANDLER )
- d->arch.ops = &MODE_64_PAE_HANDLER;
- }
- shadow_unlock(d);
- return 1;
-#endif
-#if CONFIG_PAGING_LEVELS == 4
- case 3:
- if ( d->arch.ops == &MODE_64_2_HANDLER )
- free_shadow_pages(d);
- if ( d->arch.ops != &MODE_64_PAE_HANDLER )
- d->arch.ops = &MODE_64_PAE_HANDLER;
- shadow_unlock(d);
- return 1;
-#endif
- case 2:
-#if CONFIG_PAGING_LEVELS == 2
- if ( d->arch.ops != &MODE_32_2_HANDLER )
- d->arch.ops = &MODE_32_2_HANDLER;
-#elif CONFIG_PAGING_LEVELS >= 3
- if ( d->arch.ops != &MODE_64_2_HANDLER )
- d->arch.ops = &MODE_64_2_HANDLER;
-#endif
- shadow_unlock(d);
- return 1;
- default:
- shadow_unlock(d);
- return 0;
- }
-}
-
-void shadow_invlpg(struct vcpu *v, unsigned long va)
-{
- struct domain *d = current->domain;
- d->arch.ops->invlpg(v, va);
-}
-
-int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
-{
- struct domain *d = current->domain;
- return d->arch.ops->fault(va, regs);
-}
-
-void __update_pagetables(struct vcpu *v)
-{
- struct domain *d = v->domain;
- d->arch.ops->update_pagetables(v);
-}
-
-void __shadow_sync_all(struct domain *d)
-{
- d->arch.ops->sync_all(d);
-}
-
-int shadow_remove_all_write_access(
- struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
-{
- return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn);
-}
-
-int shadow_do_update_va_mapping(unsigned long va,
- l1_pgentry_t val,
- struct vcpu *v)
-{
- struct domain *d = v->domain;
- return d->arch.ops->do_update_va_mapping(va, val, v);
-}
-
-struct out_of_sync_entry *
-shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
- unsigned long mfn)
-{
- struct domain *d = v->domain;
- return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn);
-}
-
-/*
- * Returns 1 if va's shadow mapping is out-of-sync.
- * Returns 0 otherwise.
- */
-int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
- return d->arch.ops->is_out_of_sync(v, va);
-}
-
-unsigned long gva_to_gpa(unsigned long gva)
-{
- struct domain *d = current->domain;
- return d->arch.ops->gva_to_gpa(gva);
-}
-/****************************************************************************/
-/****************************************************************************/
-#if CONFIG_PAGING_LEVELS >= 3
-
-static inline void
-free_shadow_fl1_table(struct domain *d, unsigned long smfn)
-{
- l1_pgentry_t *pl1e = map_domain_page(smfn);
- int i;
-
- for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- put_page_from_l1e(pl1e[i], d);
-
- unmap_domain_page(pl1e);
-}
-
-/*
- * Free l2, l3, l4 shadow tables
- */
-
-void free_fake_shadow_l2(struct domain *d,unsigned long smfn);
-
-static inline void
-free_shadow_tables(struct domain *d, unsigned long smfn, u32 level)
-{
- pgentry_64_t *ple = map_domain_page(smfn);
- int i, external = shadow_mode_external(d);
-
-#if CONFIG_PAGING_LEVELS >= 3
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- struct page_info *page = mfn_to_page(smfn);
- for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
- {
- if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
- free_fake_shadow_l2(d, entry_get_pfn(ple[i]));
- }
-
- page = mfn_to_page(entry_get_pfn(ple[0]));
- free_domheap_pages(page, SL2_ORDER);
- unmap_domain_page(ple);
- }
- else
-#endif
- {
- /*
- * No Xen mappings in external pages
- */
- if ( external )
- {
- for ( i = 0; i < PAGETABLE_ENTRIES; i++ ) {
- if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(ple[i]));
- if (d->arch.ops->guest_paging_levels == PAGING_L3)
- {
-#if CONFIG_PAGING_LEVELS >= 3
- if ( i == PAE_L3_PAGETABLE_ENTRIES && level == PAGING_L4 )
-#endif
- break;
- }
- }
- }
- else
- {
- for ( i = 0; i < PAGETABLE_ENTRIES; i++ )
- {
- /*
- * List the skip/break conditions to avoid freeing
- * Xen private mappings.
- */
-#if CONFIG_PAGING_LEVELS == 2
- if ( level == PAGING_L2 && !is_guest_l2_slot(0, i) )
- continue;
-#endif
-#if CONFIG_PAGING_LEVELS == 3
- if ( level == PAGING_L3 && i == L3_PAGETABLE_ENTRIES )
- break;
- if ( level == PAGING_L2 )
- {
- struct page_info *page = mfn_to_page(smfn);
- if ( is_xen_l2_slot(page->u.inuse.type_info, i) )
- continue;
- }
-#endif
-#if CONFIG_PAGING_LEVELS == 4
- if ( level == PAGING_L4 && !is_guest_l4_slot(i))
- continue;
-#endif
- if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(ple[i]));
- }
- }
- unmap_domain_page(ple);
- }
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-static void alloc_monitor_pagetable(struct vcpu *v)
-{
- unsigned long mmfn;
- l4_pgentry_t *mpl4e;
- struct page_info *mmfn_info;
- struct domain *d = v->domain;
-
- ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
-
- mmfn_info = alloc_domheap_page(NULL);
- ASSERT( mmfn_info );
- if (!mmfn_info)
- {
- printk("Fail to allocate monitor pagetable\n");
- domain_crash(v->domain);
- }
-
- mmfn = page_to_mfn(mmfn_info);
- mpl4e = (l4_pgentry_t *) map_domain_page_global(mmfn);
- memcpy(mpl4e, idle_pg_table, PAGE_SIZE);
- mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
- l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
-
- /* map the phys_to_machine map into the per domain Read-Only MPT space */
-
- v->arch.monitor_table = pagetable_from_pfn(mmfn);
- v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e;
- mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
-
- if ( v->vcpu_id == 0 )
- alloc_p2m_table(d);
- else
- {
- unsigned long mfn;
-
- mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- if ( mfn )
- {
- l4_pgentry_t *l4tab;
-
- l4tab = map_domain_page(mfn);
-
- mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
- l4tab[l4_table_offset(RO_MPT_VIRT_START)];
-
- unmap_domain_page(l4tab);
- }
- }
-}
-
-void free_monitor_pagetable(struct vcpu *v)
-{
- unsigned long mfn;
-
- /*
- * First free the p2m table (vcpu 0 only).
- */
- if ( v->vcpu_id == 0 )
- free_p2m_table(v->domain);
-
- /*
- * Then free monitor_table.
- */
- mfn = pagetable_get_pfn(v->arch.monitor_table);
- unmap_domain_page_global(v->arch.monitor_vtable);
- free_domheap_page(mfn_to_page(mfn));
-
- v->arch.monitor_table = pagetable_null();
- v->arch.monitor_vtable = 0;
-}
-#elif CONFIG_PAGING_LEVELS == 3
-static void alloc_monitor_pagetable(struct vcpu *v)
-{
- unsigned long m2mfn, m3mfn;
- l2_pgentry_t *mpl2e;
- l3_pgentry_t *mpl3e;
- struct page_info *m2mfn_info, *m3mfn_info;
- struct domain *d = v->domain;
- int i;
-
- ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
-
- m3mfn_info = alloc_domheap_pages(NULL, 0, MEMF_dma);
- ASSERT( m3mfn_info );
-
- m3mfn = page_to_mfn(m3mfn_info);
- mpl3e = (l3_pgentry_t *) map_domain_page_global(m3mfn);
- memset(mpl3e, 0, L3_PAGETABLE_ENTRIES * sizeof(l3_pgentry_t));
-
- v->arch.monitor_table = pagetable_from_pfn(m3mfn);
- v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
-
- m2mfn_info = alloc_domheap_page(NULL);
- ASSERT( m2mfn_info );
-
- m2mfn = page_to_mfn(m2mfn_info);
- mpl2e = (l2_pgentry_t *) map_domain_page(m2mfn);
- memset(mpl2e, 0, PAGE_SIZE);
-
- /* Map L2 page into L3 */
- mpl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(m2mfn, _PAGE_PRESENT);
-
- memcpy(&mpl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
- &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
- L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
-
- for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
- mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- l2e_from_page(
- virt_to_page(d->arch.mm_perdomain_pt) + i,
- __PAGE_HYPERVISOR);
- for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
- mpl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
- (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
- l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
- l2e_empty();
- for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
- mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] = l2e_empty();
-
- if ( v->vcpu_id == 0 )
- {
- unsigned long m1mfn;
- l1_pgentry_t *mpl1e;
- struct page_info *m1mfn_info;
-
- /*
- * Two L2 slots (4MB) are allocated here for the p2m table, so that
- * PCI MMIO p2m entries (notably Cirrus VGA's) are guaranteed to be
- * visible to all other vcpus.
- */
- for ( i = 0; i < 2; i++ )
- {
- m1mfn_info = alloc_domheap_page(NULL);
- ASSERT( m1mfn_info );
-
- m1mfn = page_to_mfn(m1mfn_info);
- mpl1e = (l1_pgentry_t *) map_domain_page(m1mfn);
- memset(mpl1e, 0, PAGE_SIZE);
- unmap_domain_page(mpl1e);
-
- /* Map L1 page into L2 */
- mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] =
- l2e_from_pfn(m1mfn, __PAGE_HYPERVISOR);
- }
-
- alloc_p2m_table(d);
- }
- else
- {
- unsigned long mfn;
-
- mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- if ( mfn )
- {
- l3_pgentry_t *l3tab, l3e;
- l2_pgentry_t *l2tab;
-
- l3tab = map_domain_page(mfn);
- l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
-
- /*
- * NB: when CONFIG_PAGING_LEVELS == 3,
- * (l3e_get_flags(l3e) & _PAGE_PRESENT) is always true here.
- * alloc_monitor_pagetable should guarantee this.
- */
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- BUG();
-
- l2tab = map_domain_page(l3e_get_pfn(l3e));
-
- for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
- mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] =
- l2tab[l2_table_offset(RO_MPT_VIRT_START) + i];
-
- unmap_domain_page(l2tab);
- unmap_domain_page(l3tab);
- }
- }
-
- unmap_domain_page(mpl2e);
-}
-
-void free_monitor_pagetable(struct vcpu *v)
-{
- unsigned long m2mfn, m3mfn;
- /*
- * First free the p2m table, then the monitor table itself.
- */
- if ( v->vcpu_id == 0 )
- free_p2m_table(v->domain);
-
- m3mfn = pagetable_get_pfn(v->arch.monitor_table);
- m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
-
- free_domheap_page(mfn_to_page(m2mfn));
- unmap_domain_page_global(v->arch.monitor_vtable);
- free_domheap_page(mfn_to_page(m3mfn));
-
- v->arch.monitor_table = pagetable_null();
- v->arch.monitor_vtable = 0;
-}
-#endif
-
-static void
-shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
-{
- void *snapshot;
-
- if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
- return;
-
- // Clear the out_of_sync bit.
- //
- clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
-
- // XXX Need to think about how to protect the domain's
- // information less expensively.
- //
- snapshot = map_domain_page(entry->snapshot_mfn);
- memset(snapshot, 0, PAGE_SIZE);
- unmap_domain_page(snapshot);
-
- put_shadow_ref(entry->snapshot_mfn);
-}
-
-void
-release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
-{
- struct page_info *page;
-
- page = mfn_to_page(entry->gmfn);
-
- // Decrement ref count of guest & shadow pages
- //
- put_page(page);
-
- // Only use entries that have low bits clear...
- //
- if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
- {
- put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
- entry->writable_pl1e = -2;
- }
- else
- ASSERT( entry->writable_pl1e == -1 );
-
- // Free the snapshot
- //
- shadow_free_snapshot(d, entry);
-}
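The check on the low bits above works because the machine address of a shadow L1 entry is always sizeof(l1_pgentry_t)-aligned, so the sentinel values -1 ("never held a writable ref") and -2 ("ref already dropped") can never collide with a real entry address. A minimal standalone sketch of that encoding; the constant and helper names are hypothetical, not from this file:

    /* Sentinels stored in writable_pl1e: any value with low bits set cannot
     * be the (8-byte-aligned) machine address of a 64-bit PTE. */
    #include <stdint.h>

    #define PL1E_UNUSED   ((uintptr_t)-1)  /* never held a writable PTE ref */
    #define PL1E_RELEASED ((uintptr_t)-2)  /* ref already dropped */

    static inline int pl1e_holds_ref(uintptr_t writable_pl1e)
    {
        /* valid iff aligned to the size of an entry */
        return (writable_pl1e & (sizeof(uint64_t) - 1)) == 0;
    }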
-
-static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
-{
- struct out_of_sync_entry *entry = d->arch.out_of_sync;
- struct out_of_sync_entry **prev = &d->arch.out_of_sync;
- struct out_of_sync_entry *found = NULL;
-
- // NB: Be careful not to call something that manipulates this list
- // while walking it. Collect the results into a separate list
- // first, then walk that list.
- //
- while ( entry )
- {
- if ( entry->gmfn == gmfn )
- {
- // remove from out of sync list
- *prev = entry->next;
-
- // add to found list
- entry->next = found;
- found = entry;
-
- entry = *prev;
- continue;
- }
- prev = &entry->next;
- entry = entry->next;
- }
-
- prev = NULL;
- entry = found;
- while ( entry )
- {
- release_out_of_sync_entry(d, entry);
-
- prev = &entry->next;
- entry = entry->next;
- }
-
- // Add found list to free list
- if ( prev )
- {
- *prev = d->arch.out_of_sync_free;
- d->arch.out_of_sync_free = found;
- }
-}
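The comment above is the key design point: matching entries are first unlinked onto a private "found" list, and only then released, so release_out_of_sync_entry() can safely touch the original list. The same unlink-onto-a-private-list idiom in a small self-contained sketch (generic node type, not the Xen structures):

    struct node { struct node *next; int key; };

    /* Detach every node whose key matches and return them as a private list;
     * the caller processes/frees that list afterwards. */
    static struct node *collect_matches(struct node **head, int key)
    {
        struct node *found = NULL, **prev = head, *n = *head;
        while ( n )
        {
            if ( n->key == key )
            {
                *prev = n->next;      /* unlink from the source list */
                n->next = found;      /* push onto the private list  */
                found = n;
                n = *prev;
                continue;
            }
            prev = &n->next;
            n = n->next;
        }
        return found;
    }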
-
-static inline void
-shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
-
- ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
-
- if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
- {
- clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
-
- if ( page_out_of_sync(mfn_to_page(gmfn)) )
- {
- remove_out_of_sync_entries(d, gmfn);
- }
- }
-}
-
-static void inline
-free_shadow_l1_table(struct domain *d, unsigned long smfn)
-{
- l1_pgentry_t *pl1e = map_domain_page(smfn);
- l1_pgentry_t *pl1e_next = 0, *sl1e_p;
- int i;
- struct page_info *spage = mfn_to_page(smfn);
- u32 min_max = spage->tlbflush_timestamp;
- int min = SHADOW_MIN(min_max);
- int max;
-
- if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
- {
- max = SHADOW_MAX_GUEST32(min_max);
- pl1e_next = map_domain_page(smfn + 1);
- }
- else
- max = SHADOW_MAX(min_max);
-
- for ( i = min; i <= max; i++ )
- {
- if ( pl1e_next && i >= L1_PAGETABLE_ENTRIES )
- sl1e_p = &pl1e_next[i - L1_PAGETABLE_ENTRIES];
- else
- sl1e_p = &pl1e[i];
-
- shadow_put_page_from_l1e(*sl1e_p, d);
- *sl1e_p = l1e_empty();
- }
-
- unmap_domain_page(pl1e);
- if ( pl1e_next )
- unmap_domain_page(pl1e_next);
-}
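free_shadow_l1_table() only walks the slice of the shadow L1 between the "min" and "max" indices that earlier code packed into the page's tlbflush_timestamp field, which is what the SHADOW_MIN/SHADOW_MAX/SHADOW_MAX_GUEST32 macros extract. A rough sketch of that kind of packing; the 16/16 bit split here is an assumption, not necessarily the real macro layout:

    #include <stdint.h>

    /* Pack the lowest and highest populated L1 indices into one 32-bit word
     * so that a later free only has to scan [min, max]. */
    static inline uint32_t pack_min_max(uint32_t min, uint32_t max)
    {
        return (min << 16) | (max & 0xffff);   /* assumed 16/16 split */
    }
    static inline uint32_t unpack_min(uint32_t mm) { return mm >> 16;    }
    static inline uint32_t unpack_max(uint32_t mm) { return mm & 0xffff; }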
-
-static void inline
-free_shadow_hl2_table(struct domain *d, unsigned long smfn)
-{
- l1_pgentry_t *hl2 = map_domain_page(smfn);
- int i, limit;
-
- SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
-
-#if CONFIG_PAGING_LEVELS == 2
- if ( shadow_mode_external(d) )
- limit = L2_PAGETABLE_ENTRIES;
- else
- limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#endif
-
- for ( i = 0; i < limit; i++ )
- {
- if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
- put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
- }
-
- unmap_domain_page(hl2);
-}
-
-static void inline
-free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
-{
- l2_pgentry_t *pl2e = map_domain_page(smfn);
- int i, external = shadow_mode_external(d);
-
- for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
- if ( external || is_guest_l2_slot(type, i) )
- if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
- put_shadow_ref(l2e_get_pfn(pl2e[i]));
-
- if ( (PGT_base_page_table == PGT_l2_page_table) &&
- shadow_mode_translate(d) && !external )
- {
- // free the ref to the hl2
- //
- put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
- }
-
- unmap_domain_page(pl2e);
-}
-
-void free_fake_shadow_l2(struct domain *d, unsigned long smfn)
-{
- pgentry_64_t *ple = map_domain_page(smfn);
- int i;
-
- for ( i = 0; i < PAGETABLE_ENTRIES; i = i + 2 )
- if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(ple[i]));
-
- unmap_domain_page(ple);
-}
-
-void free_shadow_page(unsigned long smfn)
-{
- struct page_info *page = mfn_to_page(smfn);
- unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
- struct domain *d = page_get_owner(mfn_to_page(gmfn));
- unsigned long gpfn = mfn_to_gmfn(d, gmfn);
- unsigned long type = page->u.inuse.type_info & PGT_type_mask;
- u64 index = 0;
-
- SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
-
- ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
-#if CONFIG_PAGING_LEVELS >= 4
- if ( type == PGT_fl1_shadow )
- {
- unsigned long mfn;
- mfn = __shadow_status(d, gpfn, PGT_fl1_shadow);
- if ( !mfn )
- gpfn |= PGT_high_mfn_nx;
- }
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- {
- if ( type == PGT_l4_shadow )
- index = page->tlbflush_timestamp;
- }
-#endif
-
- delete_shadow_status(d, gpfn, gmfn, type, index);
-
- switch ( type )
- {
- case PGT_l1_shadow:
- perfc_decr(shadow_l1_pages);
- shadow_demote(d, gpfn, gmfn);
- free_shadow_l1_table(d, smfn);
- d->arch.shadow_page_count--;
- break;
-#if CONFIG_PAGING_LEVELS == 2
- case PGT_l2_shadow:
- perfc_decr(shadow_l2_pages);
- shadow_demote(d, gpfn, gmfn);
- free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
- d->arch.shadow_page_count--;
- break;
-
- case PGT_hl2_shadow:
- perfc_decr(hl2_table_pages);
- shadow_demote(d, gpfn, gmfn);
- free_shadow_hl2_table(d, smfn);
- d->arch.hl2_page_count--;
- break;
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
- case PGT_l2_shadow:
- case PGT_l3_shadow:
- shadow_demote(d, gpfn, gmfn);
- free_shadow_tables(d, smfn, shadow_type_to_level(type));
- d->arch.shadow_page_count--;
- break;
-
- case PGT_l4_shadow:
- gpfn = gpfn & PGT_mfn_mask;
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
- {
- /*
- * Since a single PDPT page can have multiple PDPs, it's possible
- * that shadow_demote() has been already called for gmfn.
- */
- if ( mfn_is_page_table(gmfn) )
- shadow_demote(d, gpfn, gmfn);
- } else
- shadow_demote(d, gpfn, gmfn);
-
- free_shadow_tables(d, smfn, shadow_type_to_level(type));
- d->arch.shadow_page_count--;
- break;
-
- case PGT_fl1_shadow:
- free_shadow_fl1_table(d, smfn);
- d->arch.shadow_page_count--;
- break;
-#endif
- case PGT_snapshot:
- perfc_decr(snapshot_pages);
- break;
-
- default:
- printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
- page_to_mfn(page), page->u.inuse.type_info);
- break;
- }
-
- // No TLB flushes are needed the next time this page gets allocated.
- //
- page->tlbflush_timestamp = 0;
- page->u.free.cpumask = CPU_MASK_NONE;
-
- if ( type == PGT_l1_shadow )
- {
- list_add(&page->list, &d->arch.free_shadow_frames);
- perfc_incr(free_l1_pages);
- }
- else
- free_domheap_page(page);
-}
-
-static void
-free_writable_pte_predictions(struct domain *d)
-{
- int i;
- struct shadow_status *x;
-
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- u32 count;
- unsigned long *gpfn_list;
-
- /* Skip empty buckets. */
- if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
- continue;
-
- count = 0;
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
- if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
- count++;
-
- gpfn_list = xmalloc_array(unsigned long, count);
- count = 0;
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
- if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
- gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
-
- while ( count )
- {
- count--;
- delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred, 0);
- }
-
- xfree(gpfn_list);
- }
-}
-
-static void free_shadow_ht_entries(struct domain *d)
-{
- struct shadow_status *x, *n;
-
- SH_VLOG("freed tables count=%d l1=%d l2=%d",
- d->arch.shadow_page_count, perfc_value(shadow_l1_pages),
- perfc_value(shadow_l2_pages));
-
- n = d->arch.shadow_ht_extras;
- while ( (x = n) != NULL )
- {
- d->arch.shadow_extras_count--;
- n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
- xfree(x);
- }
-
- d->arch.shadow_ht_extras = NULL;
- d->arch.shadow_ht_free = NULL;
-
- ASSERT(d->arch.shadow_extras_count == 0);
- SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
-
- if ( d->arch.shadow_dirty_bitmap != NULL )
- {
- xfree(d->arch.shadow_dirty_bitmap);
- d->arch.shadow_dirty_bitmap = 0;
- d->arch.shadow_dirty_bitmap_size = 0;
- }
-
- xfree(d->arch.shadow_ht);
- d->arch.shadow_ht = NULL;
-}
-
-static void free_out_of_sync_entries(struct domain *d)
-{
- struct out_of_sync_entry *x, *n;
-
- n = d->arch.out_of_sync_extras;
- while ( (x = n) != NULL )
- {
- d->arch.out_of_sync_extras_count--;
- n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
- xfree(x);
- }
-
- d->arch.out_of_sync_extras = NULL;
- d->arch.out_of_sync_free = NULL;
- d->arch.out_of_sync = NULL;
-
- ASSERT(d->arch.out_of_sync_extras_count == 0);
- FSH_LOG("freed extra out_of_sync entries, now %d",
- d->arch.out_of_sync_extras_count);
-}
-
-void free_shadow_pages(struct domain *d)
-{
- int i;
- struct shadow_status *x;
- struct vcpu *v;
- struct list_head *list_ent, *tmp;
-
- /*
- * WARNING! The shadow page table must not currently be in use!
- * e.g., You are expected to have paused the domain and synchronized CR3.
- */
-
- if( !d->arch.shadow_ht ) return;
-
- shadow_audit(d, 1);
-
- // first, remove any outstanding refs from out_of_sync entries...
- //
- free_out_of_sync_state(d);
-
- // second, remove any outstanding refs from v->arch.shadow_table
- // and CR3.
- //
- for_each_vcpu(d, v)
- {
- if ( pagetable_get_paddr(v->arch.shadow_table) )
- {
- put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
- v->arch.shadow_table = pagetable_null();
-
- if ( shadow_mode_external(d) )
- {
- if ( v->arch.shadow_vtable )
- unmap_domain_page_global(v->arch.shadow_vtable);
- v->arch.shadow_vtable = NULL;
- }
- }
-
- if ( v->arch.monitor_shadow_ref )
- {
- put_shadow_ref(v->arch.monitor_shadow_ref);
- v->arch.monitor_shadow_ref = 0;
- }
- }
-
-#if CONFIG_PAGING_LEVELS == 2
- // For external shadows, remove the monitor table's refs
- //
- if ( shadow_mode_external(d) )
- {
- for_each_vcpu(d, v)
- {
- l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
-
- if ( mpl2e )
- {
- l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
- l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
- if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
- {
- put_shadow_ref(l2e_get_pfn(hl2e));
- mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
- }
- if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
- {
- put_shadow_ref(l2e_get_pfn(smfn));
- mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
- }
- }
- }
- }
-#endif
- // Now, the only refs to shadow pages that are left are from the shadow
- // pages themselves. We just unpin the pinned pages, and the rest
- // should automatically disappear.
- //
- // NB: Beware: each explicit or implicit call to free_shadow_page
- // can/will result in the hash bucket getting rewritten out from
- // under us... First, collect the list of pinned pages, then
- // free them.
- //
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- u32 count;
- unsigned long *mfn_list;
-
- /* Skip empty buckets. */
- if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
- continue;
-
- count = 0;
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
- if ( MFN_PINNED(x->smfn) )
- count++;
- if ( !count )
- continue;
-
- mfn_list = xmalloc_array(unsigned long, count);
- count = 0;
- for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
- if ( MFN_PINNED(x->smfn) )
- mfn_list[count++] = x->smfn;
-
- while ( count )
- {
- shadow_unpin(mfn_list[--count]);
- }
- xfree(mfn_list);
- }
-
- /* Now free the pre-zero'ed pages from the domain. */
- list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
- {
- struct page_info *page = list_entry(list_ent, struct page_info, list);
-
- list_del(list_ent);
- perfc_decr(free_l1_pages);
-
- if (d->arch.ops->guest_paging_levels == PAGING_L2)
- {
-#if CONFIG_PAGING_LEVELS >=3
- free_domheap_pages(page, SL1_ORDER);
-#else
- free_domheap_page(page);
-#endif
- }
- else
- free_domheap_page(page);
- }
-
- shadow_audit(d, 0);
-
- SH_LOG("Free shadow table.");
-}
-
-void __shadow_mode_disable(struct domain *d)
-{
- struct vcpu *v;
-#ifndef NDEBUG
- int i;
-#endif
-
- if ( unlikely(!shadow_mode_enabled(d)) )
- return;
-
- free_shadow_pages(d);
- free_writable_pte_predictions(d);
-
-#ifndef NDEBUG
- for ( i = 0; i < shadow_ht_buckets; i++ )
- {
- if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
- {
- printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%"PRIx64"\n",
- __FILE__, i, (u64)d->arch.shadow_ht[i].gpfn_and_flags);
- BUG();
- }
- }
-#endif
-
- d->arch.shadow_mode = 0;
-
- free_shadow_ht_entries(d);
- free_out_of_sync_entries(d);
-
- for_each_vcpu(d, v)
- update_pagetables(v);
-}
-
-
-int __shadow_mode_enable(struct domain *d, unsigned int mode)
-{
- struct vcpu *v;
- int new_modes = (mode & ~d->arch.shadow_mode);
-#if defined(CONFIG_PAGING_LEVELS)
- int initial_paging_levels = 3;
-#endif
-
- // Gotta be adding something to call this function.
- ASSERT(new_modes);
-
- // can't take anything away by calling this function.
- ASSERT(!(d->arch.shadow_mode & ~mode));
-
-#if defined(CONFIG_PAGING_LEVELS)
- if ( CONFIG_PAGING_LEVELS == 2 )
- initial_paging_levels = CONFIG_PAGING_LEVELS;
- if ( !shadow_set_guest_paging_levels(d,
- initial_paging_levels) ) {
- printk("Unsupported guest paging levels\n");
- domain_crash_synchronous(); /* need to take a clean path */
- }
-#endif
-
- for_each_vcpu(d, v)
- {
- invalidate_shadow_ldt(v);
-
- // We need to set these up for __update_pagetables().
- // See the comment there.
-
- /*
- * arch.guest_vtable
- */
- if ( v->arch.guest_vtable &&
- (v->arch.guest_vtable != __linear_l2_table) )
- {
- unmap_domain_page_global(v->arch.guest_vtable);
- }
- if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
- v->arch.guest_vtable = __linear_l2_table;
- else
- v->arch.guest_vtable = NULL;
-
- /*
- * arch.shadow_vtable
- */
- if ( v->arch.shadow_vtable &&
- (v->arch.shadow_vtable != __shadow_linear_l2_table) )
- {
- unmap_domain_page_global(v->arch.shadow_vtable);
- }
- if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2)
- v->arch.shadow_vtable = __shadow_linear_l2_table;
- else
- v->arch.shadow_vtable = NULL;
-
-#if CONFIG_PAGING_LEVELS == 2
- /*
- * arch.hl2_vtable
- */
- if ( v->arch.hl2_vtable &&
- (v->arch.hl2_vtable != __linear_hl2_table) )
- {
- unmap_domain_page_global(v->arch.hl2_vtable);
- }
- if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
- v->arch.hl2_vtable = __linear_hl2_table;
- else
- v->arch.hl2_vtable = NULL;
-#endif
- /*
- * arch.monitor_table & arch.monitor_vtable
- */
- if ( v->arch.monitor_vtable )
- {
- free_monitor_pagetable(v);
- }
- if ( mode & SHM_external )
- {
- alloc_monitor_pagetable(v);
- }
- }
-
- if ( new_modes & SHM_enable )
- {
- ASSERT( !d->arch.shadow_ht );
- d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
- if ( d->arch.shadow_ht == NULL )
- goto nomem;
-
- memset(d->arch.shadow_ht, 0,
- shadow_ht_buckets * sizeof(struct shadow_status));
- }
-
- if ( new_modes & SHM_log_dirty )
- {
- ASSERT( !d->arch.shadow_dirty_bitmap );
- d->arch.shadow_dirty_bitmap_size =
- (d->shared_info->arch.max_pfn + 63) & ~63;
- d->arch.shadow_dirty_bitmap =
- xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
- (8 * sizeof(unsigned long)));
- if ( d->arch.shadow_dirty_bitmap == NULL )
- {
- d->arch.shadow_dirty_bitmap_size = 0;
- goto nomem;
- }
- memset(d->arch.shadow_dirty_bitmap, 0,
- d->arch.shadow_dirty_bitmap_size/8);
- }
-
- if ( new_modes & SHM_translate )
- {
- if ( !(new_modes & SHM_external) )
- {
- ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
- if ( !alloc_p2m_table(d) )
- {
- printk("alloc_p2m_table failed (out-of-memory?)\n");
- goto nomem;
- }
- }
- }
-
- // Get rid of any shadow pages from any previous shadow mode.
- //
- free_shadow_pages(d);
-
- d->arch.shadow_mode = mode;
-
- if ( shadow_mode_refcounts(d) )
- {
- struct list_head *list_ent;
- struct page_info *page;
-
- /*
- * Tear down its counts by disassembling its page-table-based refcounts
- * Also remove CR3's gcount/tcount.
- * That leaves things like GDTs and LDTs and external refs intact.
- *
- * Most pages will be writable tcount=0.
- * Some will still be L1 tcount=0 or L2 tcount=0.
- * Maybe some pages will be type none tcount=0.
- * Pages granted external writable refs (via grant tables?) will
- * still have a non-zero tcount. That's OK.
- *
- * gcounts will generally be 1 for PGC_allocated.
- * GDTs and LDTs will have additional gcounts.
- * Any grant-table based refs will still be in the gcount.
- *
- * We attempt to grab a writable type ref to each page (thus setting
- * its type) and immediately put those type refs back.
- *
- * Assert that no pages are left with L1/L2/L3/L4 type.
- */
- audit_adjust_pgtables(d, -1, 1);
-
-
- for (list_ent = d->page_list.next; list_ent != &d->page_list;
- list_ent = page->list.next) {
-
- page = list_entry(list_ent, struct page_info, list);
- if ( !get_page_type(page, PGT_writable_page) )
- BUG();
- put_page_type(page);
- /*
- * We use tlbflush_timestamp as a back pointer to the smfn, and need
- * to clean it up.
- */
- if (shadow_mode_external(d))
- page->tlbflush_timestamp = 0;
- }
-
- audit_adjust_pgtables(d, 1, 1);
-
- }
-
- return 0;
-
- nomem:
- if ( (new_modes & SHM_enable) )
- {
- xfree(d->arch.shadow_ht);
- d->arch.shadow_ht = NULL;
- }
- if ( (new_modes & SHM_log_dirty) )
- {
- xfree(d->arch.shadow_dirty_bitmap);
- d->arch.shadow_dirty_bitmap = NULL;
- }
-
- return -ENOMEM;
-}
-
-
-int shadow_mode_enable(struct domain *d, unsigned int mode)
-{
- int rc;
- shadow_lock(d);
- rc = __shadow_mode_enable(d, mode);
- shadow_unlock(d);
- return rc;
-}
-
-static int shadow_mode_table_op(
- struct domain *d, dom0_shadow_control_t *sc)
-{
- unsigned int op = sc->op;
- int i, rc = 0;
- struct vcpu *v;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- SH_VLOG("shadow mode table op %lx %lx count %d",
- (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */
- (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
- d->arch.shadow_page_count);
-
- shadow_audit(d, 1);
-
- switch ( op )
- {
- case DOM0_SHADOW_CONTROL_OP_FLUSH:
- free_shadow_pages(d);
-
- d->arch.shadow_fault_count = 0;
- d->arch.shadow_dirty_count = 0;
-
- break;
-
- case DOM0_SHADOW_CONTROL_OP_CLEAN:
- free_shadow_pages(d);
-
- sc->stats.fault_count = d->arch.shadow_fault_count;
- sc->stats.dirty_count = d->arch.shadow_dirty_count;
-
- d->arch.shadow_fault_count = 0;
- d->arch.shadow_dirty_count = 0;
-
- if ( guest_handle_is_null(sc->dirty_bitmap) ||
- (d->arch.shadow_dirty_bitmap == NULL) )
- {
- rc = -EINVAL;
- break;
- }
-
- if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
- sc->pages = d->arch.shadow_dirty_bitmap_size;
-
-#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
- for ( i = 0; i < sc->pages; i += chunk )
- {
- int bytes = ((((sc->pages - i) > chunk) ?
- chunk : (sc->pages - i)) + 7) / 8;
-
- if ( copy_to_guest_offset(
- sc->dirty_bitmap, i/(8*sizeof(unsigned long)),
- d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
- (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
- {
- rc = -EINVAL;
- break;
- }
- memset(
- d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
- 0, bytes);
- }
-
- break;
-
- case DOM0_SHADOW_CONTROL_OP_PEEK:
- sc->stats.fault_count = d->arch.shadow_fault_count;
- sc->stats.dirty_count = d->arch.shadow_dirty_count;
-
- if ( guest_handle_is_null(sc->dirty_bitmap) ||
- (d->arch.shadow_dirty_bitmap == NULL) )
- {
- rc = -EINVAL;
- break;
- }
-
- if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
- sc->pages = d->arch.shadow_dirty_bitmap_size;
-
- if ( copy_to_guest(sc->dirty_bitmap,
- d->arch.shadow_dirty_bitmap,
- (((sc->pages+7)/8)+sizeof(unsigned long)-1) /
- sizeof(unsigned long)) )
- {
- rc = -EINVAL;
- break;
- }
-
- break;
-
- default:
- rc = -EINVAL;
- break;
- }
-
- SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
- shadow_audit(d, 1);
-
- for_each_vcpu(d,v)
- __update_pagetables(v);
-
- return rc;
-}
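The OP_CLEAN case above transfers and clears the log-dirty bitmap in chunk-sized pieces (8*1024 bits, i.e. 1kB of bitmap per iteration), rounding the final partial chunk up to whole bytes and then to whole unsigned longs for copy_to_guest_offset(). A standalone sketch of just that arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned int pages = 10000, chunk = 8 * 1024, i;
        for ( i = 0; i < pages; i += chunk )
        {
            unsigned int bits  = (pages - i) > chunk ? chunk : (pages - i);
            unsigned int bytes = (bits + 7) / 8;
            unsigned int longs = (bytes + sizeof(unsigned long) - 1) /
                                 sizeof(unsigned long);
            printf("chunk at bit %u: %u bits -> %u bytes -> %u longs\n",
                   i, bits, bytes, longs);
        }
        return 0;
    }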
-
-int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
-{
- unsigned int op = sc->op;
- int rc = 0;
- struct vcpu *v;
-
- if ( unlikely(d == current->domain) )
- {
- DPRINTK("Don't try to do a shadow op on yourself!\n");
- return -EINVAL;
- }
-
- domain_pause(d);
-
- shadow_lock(d);
-
- switch ( op )
- {
- case DOM0_SHADOW_CONTROL_OP_OFF:
- if ( shadow_mode_enabled(d) )
- {
- __shadow_sync_all(d);
- __shadow_mode_disable(d);
- }
- break;
-
- case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
- free_shadow_pages(d);
- rc = __shadow_mode_enable(d, SHM_enable);
- break;
-
- case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
- free_shadow_pages(d);
- rc = __shadow_mode_enable(
- d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
- break;
-
- case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
- free_shadow_pages(d);
- rc = __shadow_mode_enable(
- d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
- break;
-
- default:
- rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
- break;
- }
-
- shadow_unlock(d);
-
- for_each_vcpu(d,v)
- update_pagetables(v);
-
- domain_unpause(d);
-
- return rc;
-}
-
-void shadow_mode_init(void)
-{
-}
-
-int _shadow_mode_refcounts(struct domain *d)
-{
- return shadow_mode_refcounts(d);
-}
-
-static int
-map_p2m_entry(pgentry_64_t *top_tab, unsigned long gpfn, unsigned long mfn)
-{
-#if CONFIG_PAGING_LEVELS >= 4
- pgentry_64_t l4e = { 0 };
- pgentry_64_t *l3tab = NULL;
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
- pgentry_64_t l3e = { 0 };
-#endif
- l2_pgentry_t *l2tab = NULL;
- l1_pgentry_t *l1tab = NULL;
- unsigned long *l0tab = NULL;
- l2_pgentry_t l2e = { 0 };
- l1_pgentry_t l1e = { 0 };
- struct page_info *page;
- unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
-#if CONFIG_PAGING_LEVELS >= 4
- l4e = top_tab[l4_table_offset(va)];
- if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- goto nomem;
-
- l3tab = map_domain_page(page_to_mfn(page));
- memset(l3tab, 0, PAGE_SIZE);
- l4e = top_tab[l4_table_offset(va)] =
- entry_from_page(page, __PAGE_HYPERVISOR);
- }
- else
- l3tab = map_domain_page(entry_get_pfn(l4e));
-
- l3e = l3tab[l3_table_offset(va)];
- if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- goto nomem;
-
- l2tab = map_domain_page(page_to_mfn(page));
- memset(l2tab, 0, PAGE_SIZE);
- l3e = l3tab[l3_table_offset(va)] =
- entry_from_page(page, __PAGE_HYPERVISOR);
- }
- else
- l2tab = map_domain_page(entry_get_pfn(l3e));
-
- unmap_domain_page(l3tab);
-#else
- l3e = top_tab[l3_table_offset(va)];
-
- /*
- * NB: when CONFIG_PAGING_LEVELS == 3,
- * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
- * alloc_monitor_pagetable should guarantee this.
- */
- if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
- BUG();
-
- l2tab = map_domain_page(entry_get_pfn(l3e));
-#endif
-
- l2e = l2tab[l2_table_offset(va)];
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- goto nomem;
-
- l1tab = map_domain_page(page_to_mfn(page));
- memset(l1tab, 0, PAGE_SIZE);
- l2e = l2tab[l2_table_offset(va)] =
- l2e_from_page(page, __PAGE_HYPERVISOR);
- }
- else
- l1tab = map_domain_page(l2e_get_pfn(l2e));
-
- unmap_domain_page(l2tab);
-
- l1e = l1tab[l1_table_offset(va)];
- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
- {
- page = alloc_domheap_page(NULL);
- if ( !page )
- goto nomem;
-
- l0tab = map_domain_page(page_to_mfn(page));
- memset(l0tab, 0, PAGE_SIZE);
- l1e = l1tab[l1_table_offset(va)] =
- l1e_from_page(page, __PAGE_HYPERVISOR);
- }
- else
- l0tab = map_domain_page(l1e_get_pfn(l1e));
-
- unmap_domain_page(l1tab);
-
- l0tab[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
-
- unmap_domain_page(l0tab);
-
- return 1;
-
-nomem:
- return 0;
-}
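map_p2m_entry() treats the p2m as a flat array of unsigned longs starting at RO_MPT_VIRT_START, so a gpfn's virtual address is RO_MPT_VIRT_START + gpfn * sizeof(mfn) and its slot within the leaf page is the gpfn masked by (PAGE_SIZE / sizeof(mfn)) - 1. A small worked check of that arithmetic, assuming 64-bit unsigned longs and 4kB pages:

    #include <assert.h>
    #include <stddef.h>

    int main(void)
    {
        unsigned long gpfn = 0x12345;
        size_t entries_per_page = 4096 / sizeof(unsigned long); /* 512 */
        size_t leaf_slot = gpfn & (entries_per_page - 1);

        assert(entries_per_page == 512);
        assert(leaf_slot == 0x145);       /* low 9 bits of the gpfn */
        return 0;
    }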
-
-int
-set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
- struct domain_mmap_cache *l2cache,
- struct domain_mmap_cache *l1cache)
-{
- unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- pgentry_64_t *top_tab;
- int error;
-
- ASSERT(tabmfn != 0);
- ASSERT(shadow_lock_is_acquired(d));
-
- top_tab = map_domain_page_with_cache(tabmfn, l2cache);
-
- if ( !(error = map_p2m_entry(top_tab, gpfn, mfn)) )
- domain_crash(d);
-
- unmap_domain_page_with_cache(top_tab, l2cache);
-
- return error;
-}
-
-static int
-alloc_p2m_table(struct domain *d)
-{
- struct list_head *list_ent;
- pgentry_64_t *top_tab = NULL;
- unsigned long gpfn, mfn;
- int error = 0;
-
- ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
-
- top_tab = map_domain_page(
- pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-
- list_ent = d->page_list.next;
-
- while ( list_ent != &d->page_list )
- {
- struct page_info *page;
-
- page = list_entry(list_ent, struct page_info, list);
- mfn = page_to_mfn(page);
-
- gpfn = get_gpfn_from_mfn(mfn);
-
- if ( !(error = map_p2m_entry(top_tab, gpfn, mfn)) )
- {
- domain_crash(d);
- break;
- }
-
- list_ent = page->list.next;
- }
-
- unmap_domain_page(top_tab);
-
- return error;
-}
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void
-free_p2m_table(struct domain *d)
-{
- unsigned long va;
- l1_pgentry_t *l1tab;
- l1_pgentry_t l1e;
- l2_pgentry_t *l2tab;
- l2_pgentry_t l2e;
-#if CONFIG_PAGING_LEVELS >= 3
- l3_pgentry_t *l3tab;
- l3_pgentry_t l3e;
-#endif
-#if CONFIG_PAGING_LEVELS == 4
- int i3;
- l4_pgentry_t *l4tab;
- l4_pgentry_t l4e;
-#endif
-
- ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
-
-#if CONFIG_PAGING_LEVELS == 4
- l4tab = map_domain_page(
- pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-#endif
-#if CONFIG_PAGING_LEVELS == 3
- l3tab = map_domain_page(
- pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-
- l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
-
- /*
- * NB: when CONFIG_PAGING_LEVELS == 3,
- * (l3e_get_flags(l3e) & _PAGE_PRESENT) is always true here.
- * alloc_monitor_pagetable should guarantee this.
- */
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- BUG();
-
- l2tab = map_domain_page(l3e_get_pfn(l3e));
-#endif
-
- for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
- {
-#if CONFIG_PAGING_LEVELS == 4
- l4e = l4tab[l4_table_offset(va)];
-
- if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
- {
- l3tab = map_domain_page(l4e_get_pfn(l4e));
-
- for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
- {
- l3e = l3tab[l3_table_offset(va)];
-
- if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
- {
- int i2;
-
- l2tab = map_domain_page(l3e_get_pfn(l3e));
-
- for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
- {
-#endif
- l2e = l2tab[l2_table_offset(va)];
-
- if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
- {
- int i1;
-
- l1tab = map_domain_page(l2e_get_pfn(l2e));
-
- /*
- * unsigned long phys_to_machine_mapping[]
- */
- for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++ )
- {
- l1e = l1tab[l1_table_offset(va)];
-
- if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
- free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
-
- va += PAGE_SIZE;
- }
- unmap_domain_page(l1tab);
- free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
- }
- else
- va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
-
-#if CONFIG_PAGING_LEVELS == 4
- }
- unmap_domain_page(l2tab);
- free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
- }
- else
- va += PAGE_SIZE * L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES;
- }
- unmap_domain_page(l3tab);
- free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
- }
- else
- va += PAGE_SIZE *
- L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES;
-#endif
- }
-
-#if CONFIG_PAGING_LEVELS == 4
- unmap_domain_page(l4tab);
-#endif
-#if CONFIG_PAGING_LEVELS == 3
- unmap_domain_page(l3tab);
-#endif
-}
-#endif
-
-void shadow_l1_normal_pt_update(
- struct domain *d,
- paddr_t pa, l1_pgentry_t gpte,
- struct domain_mmap_cache *cache)
-{
- unsigned long sl1mfn;
- l1_pgentry_t *spl1e, spte;
-
- shadow_lock(d);
-
- sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
- if ( sl1mfn )
- {
- SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpde=%" PRIpte,
- (void *)pa, l1e_get_intpte(gpte));
- l1pte_propagate_from_guest(current->domain, gpte, &spte);
-
- spl1e = map_domain_page_with_cache(sl1mfn, cache);
- spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
- unmap_domain_page_with_cache(spl1e, cache);
- }
-
- shadow_unlock(d);
-}
-
-void shadow_l2_normal_pt_update(
- struct domain *d,
- paddr_t pa, l2_pgentry_t gpde,
- struct domain_mmap_cache *cache)
-{
- unsigned long sl2mfn;
- l2_pgentry_t *spl2e;
-
- shadow_lock(d);
-
- sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
- if ( sl2mfn )
- {
- SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
- (void *)pa, l2e_get_intpte(gpde));
- spl2e = map_domain_page_with_cache(sl2mfn, cache);
- validate_pde_change(d, gpde,
- &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
- unmap_domain_page_with_cache(spl2e, cache);
- }
-
- shadow_unlock(d);
-}
-
-#if CONFIG_PAGING_LEVELS >= 3
-void shadow_l3_normal_pt_update(
- struct domain *d,
- paddr_t pa, l3_pgentry_t l3e,
- struct domain_mmap_cache *cache)
-{
- unsigned long sl3mfn;
- pgentry_64_t *spl3e;
-
- shadow_lock(d);
-
- sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow);
- if ( sl3mfn )
- {
- SH_VVLOG("shadow_l3_normal_pt_update pa=%p, l3e=%" PRIpte,
- (void *)pa, l3e_get_intpte(l3e));
- spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache);
- validate_entry_change(d, (pgentry_64_t *) &l3e,
- &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)],
- shadow_type_to_level(PGT_l3_shadow));
- unmap_domain_page_with_cache(spl3e, cache);
- }
-
- shadow_unlock(d);
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-void shadow_l4_normal_pt_update(
- struct domain *d,
- paddr_t pa, l4_pgentry_t l4e,
- struct domain_mmap_cache *cache)
-{
- unsigned long sl4mfn;
- pgentry_64_t *spl4e;
-
- shadow_lock(d);
-
- sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow);
- if ( sl4mfn )
- {
- SH_VVLOG("shadow_l4_normal_pt_update pa=%p, l4e=%" PRIpte,
- (void *)pa, l4e_get_intpte(l4e));
- spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache);
- validate_entry_change(d, (pgentry_64_t *)&l4e,
- &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)],
- shadow_type_to_level(PGT_l4_shadow));
- unmap_domain_page_with_cache(spl4e, cache);
- }
-
- shadow_unlock(d);
-}
-#endif
-
-static void
-translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
-{
- int i;
- l1_pgentry_t *l1;
-
- l1 = map_domain_page(l1mfn);
- for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- {
- if ( is_guest_l1_slot(i) &&
- (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
- {
- unsigned long mfn = l1e_get_pfn(l1[i]);
- unsigned long gpfn = mfn_to_gmfn(d, mfn);
- ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
- l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
- }
- }
- unmap_domain_page(l1);
-}
-
-// This is not general enough to handle arbitrary pagetables
-// with shared L1 pages, etc., but it is sufficient for bringing
-// up dom0.
-//
-void
-translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
- unsigned int type)
-{
- int i;
- l2_pgentry_t *l2;
-
- ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
-
- l2 = map_domain_page(l2mfn);
- for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
- {
- if ( is_guest_l2_slot(type, i) &&
- (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
- {
- unsigned long mfn = l2e_get_pfn(l2[i]);
- unsigned long gpfn = mfn_to_gmfn(d, mfn);
- ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
- l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
- translate_l1pgtable(d, p2m, mfn);
- }
- }
- unmap_domain_page(l2);
-}
-
-void
-remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
-{
- unsigned long smfn;
-
- shadow_lock(d);
-
- while ( stype >= PGT_l1_shadow )
- {
- smfn = __shadow_status(d, gpfn, stype);
- if ( smfn && MFN_PINNED(smfn) )
- shadow_unpin(smfn);
- stype -= PGT_l1_shadow;
- }
-
- shadow_unlock(d);
-}
-
-unsigned long
-get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
-{
- unsigned long va, tabpfn;
- l1_pgentry_t *l1, l1e;
- l2_pgentry_t *l2, l2e;
-#if CONFIG_PAGING_LEVELS >= 4
- pgentry_64_t *l4 = NULL;
- pgentry_64_t l4e = { 0 };
-#endif
- pgentry_64_t *l3 = NULL;
- pgentry_64_t l3e = { 0 };
- unsigned long *l0tab = NULL;
- unsigned long mfn;
-
- ASSERT(shadow_mode_translate(d));
-
- perfc_incrc(get_mfn_from_gpfn_foreign);
-
- va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
- tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
- if ( !tabpfn )
- return INVALID_MFN;
-
-#if CONFIG_PAGING_LEVELS >= 4
- l4 = map_domain_page(tabpfn);
- l4e = l4[l4_table_offset(va)];
- unmap_domain_page(l4);
- if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
- return INVALID_MFN;
-
- l3 = map_domain_page(entry_get_pfn(l4e));
-#else
- l3 = map_domain_page(tabpfn);
-#endif
- l3e = l3[l3_table_offset(va)];
- unmap_domain_page(l3);
- if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
- return INVALID_MFN;
- l2 = map_domain_page(entry_get_pfn(l3e));
- l2e = l2[l2_table_offset(va)];
- unmap_domain_page(l2);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return INVALID_MFN;
-
- l1 = map_domain_page(l2e_get_pfn(l2e));
- l1e = l1[l1_table_offset(va)];
- unmap_domain_page(l1);
- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
- return INVALID_MFN;
-
- l0tab = map_domain_page(l1e_get_pfn(l1e));
- mfn = l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1)];
- unmap_domain_page(l0tab);
- return mfn;
-}
-
-static u32 remove_all_access_in_page(
- struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
-{
- l1_pgentry_t *pl1e = map_domain_page(l1mfn);
- l1_pgentry_t match, ol2e;
- unsigned long flags = _PAGE_PRESENT;
- int i;
- u32 count = 0;
- int is_l1_shadow =
- ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
- PGT_l1_shadow);
-
- match = l1e_from_pfn(forbidden_gmfn, flags);
-
- for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- {
- if ( l1e_has_changed(pl1e[i], match, flags) )
- continue;
-
- ol2e = pl1e[i];
- pl1e[i] = l1e_empty();
- count++;
-
- if ( is_l1_shadow )
- shadow_put_page_from_l1e(ol2e, d);
- else /* must be an hl2 page */
- put_page(mfn_to_page(forbidden_gmfn));
- }
-
- unmap_domain_page(pl1e);
-
- return count;
-}
-
-static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
-{
- int i;
- struct shadow_status *a;
- u32 count = 0;
-
- if ( unlikely(!shadow_mode_enabled(d)) )
- return 0;
-
- ASSERT(shadow_lock_is_acquired(d));
- perfc_incrc(remove_all_access);
-
- for (i = 0; i < shadow_ht_buckets; i++)
- {
- a = &d->arch.shadow_ht[i];
- while ( a && a->gpfn_and_flags )
- {
- switch (a->gpfn_and_flags & PGT_type_mask)
- {
- case PGT_l1_shadow:
- case PGT_l2_shadow:
- case PGT_l3_shadow:
- case PGT_l4_shadow:
- case PGT_hl2_shadow:
- count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- // these can't hold refs to the forbidden page
- break;
- default:
- BUG();
- }
-
- a = a->next;
- }
- }
-
- return count;
-}
-
-void shadow_drop_references(
- struct domain *d, struct page_info *page)
-{
- if ( likely(!shadow_mode_refcounts(d)) ||
- ((page->u.inuse.type_info & PGT_count_mask) == 0) )
- return;
-
- /* XXX This needs more thought... */
- printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n",
- __func__, page_to_mfn(page));
- printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
- page->count_info, page->u.inuse.type_info);
-
- shadow_lock(d);
- __shadow_remove_all_access(d, page_to_mfn(page));
- shadow_unlock(d);
-
- printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
- page->count_info, page->u.inuse.type_info);
-}
-
-/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
-void shadow_sync_and_drop_references(
- struct domain *d, struct page_info *page)
-{
- if ( likely(!shadow_mode_refcounts(d)) )
- return;
-
- shadow_lock(d);
-
- if ( page_out_of_sync(page) )
- __shadow_sync_mfn(d, page_to_mfn(page));
-
- __shadow_remove_all_access(d, page_to_mfn(page));
-
- shadow_unlock(d);
-}
-
-void clear_all_shadow_status(struct domain *d)
-{
- struct vcpu *v = current;
-
- /*
- * Don't clean up while other vcpus are working.
- */
- if ( v->vcpu_id )
- return;
-
- shadow_lock(d);
-
- free_shadow_pages(d);
- free_shadow_ht_entries(d);
- d->arch.shadow_ht =
- xmalloc_array(struct shadow_status, shadow_ht_buckets);
- if ( d->arch.shadow_ht == NULL ) {
- printk("clear all shadow status:xmalloc fail\n");
- domain_crash_synchronous();
- }
- memset(d->arch.shadow_ht, 0,
- shadow_ht_buckets * sizeof(struct shadow_status));
-
- free_out_of_sync_entries(d);
-
- shadow_unlock(d);
-}
-
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index a78ed07d26..734bd41797 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -896,7 +896,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
v = alloc_idle_vcpu(cpu);
BUG_ON(v == NULL);
- v->arch.monitor_table = pagetable_from_paddr(__pa(idle_pg_table));
+ v->arch.cr3 = __pa(idle_pg_table);
/* start_eip had better be page-aligned! */
start_eip = setup_trampoline();
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 87f9a4fd42..2d398712fe 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -277,6 +277,21 @@ void show_stack(struct cpu_user_regs *regs)
show_trace(regs);
}
+void show_xen_trace(void)
+{
+ struct cpu_user_regs regs;
+#ifdef __x86_64__
+ __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
+ __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
+ __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
+#else
+ __asm__("movl %%esp,%0" : "=m" (regs.esp));
+ __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
+ __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
+#endif
+ show_trace(&regs);
+}
+
void show_stack_overflow(unsigned long esp)
{
#ifdef MEMORY_GUARD
@@ -861,8 +876,8 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
{
- if ( shadow_mode_external(d) && guest_mode(regs) )
- return shadow_fault(addr, regs);
+ if ( shadow2_mode_external(d) && guest_mode(regs) )
+ return shadow2_fault(addr, regs);
if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
return handle_gdt_ldt_mapping_fault(
addr - GDT_LDT_VIRT_START, regs);
@@ -873,15 +888,15 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
}
- if ( unlikely(shadow_mode_enabled(d)) )
- return shadow_fault(addr, regs);
-
if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) &&
guest_kernel_mode(v, regs) &&
((regs->error_code & (PGERR_write_access|PGERR_page_present)) ==
(PGERR_write_access|PGERR_page_present)) )
return ptwr_do_page_fault(d, addr, regs) ? EXCRET_fault_fixed : 0;
+ if ( shadow2_mode_enabled(d) )
+ return shadow2_fault(addr, regs);
+
return 0;
}
@@ -906,6 +921,13 @@ asmlinkage int do_page_fault(struct cpu_user_regs *regs)
perfc_incrc(page_faults);
+ if ( shadow2_mode_enabled(current->domain) )
+ debugtrace_printk("%s %s %d dom=%d eip=%p cr2=%p code=%d cs=%x\n",
+ __func__, __FILE__, __LINE__,
+ current->domain->domain_id,
+ (void *)regs->eip, (void *)addr, regs->error_code,
+ regs->cs);
+
if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
return rc;
diff --git a/xen/arch/x86/x86_32/domain_page.c b/xen/arch/x86/x86_32/domain_page.c
index db3237242c..8fe7b9b344 100644
--- a/xen/arch/x86/x86_32/domain_page.c
+++ b/xen/arch/x86/x86_32/domain_page.c
@@ -15,6 +15,7 @@
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/hardirq.h>
+#include <asm/hvm/support.h>
static inline struct vcpu *mapcache_current_vcpu(void)
{
@@ -58,10 +59,10 @@ void *map_domain_page(unsigned long pfn)
cache = &v->domain->arch.mapcache;
hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)];
- if ( hashent->pfn == pfn )
+ if ( hashent->pfn == pfn && (idx = hashent->idx) != MAPHASHENT_NOTINUSE )
{
- idx = hashent->idx;
hashent->refcnt++;
+ ASSERT(idx < MAPCACHE_ENTRIES);
ASSERT(hashent->refcnt != 0);
ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn);
goto out;
@@ -178,6 +179,30 @@ void mapcache_init(struct domain *d)
MAPHASHENT_NOTINUSE;
}
+paddr_t mapped_domain_page_to_maddr(void *va)
+/* Convert a pointer in a mapped domain page to a machine address.
+ * Takes any pointer that's valid for use in unmap_domain_page() */
+{
+ unsigned int idx;
+ struct vcpu *v;
+ struct mapcache *cache;
+ unsigned long pfn;
+
+ ASSERT(!in_irq());
+
+ ASSERT((void *)MAPCACHE_VIRT_START <= va);
+ ASSERT(va < (void *)MAPCACHE_VIRT_END);
+
+ v = mapcache_current_vcpu();
+
+ cache = &v->domain->arch.mapcache;
+
+ idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
+ pfn = l1e_get_pfn(cache->l1tab[idx]);
+ return ((paddr_t) pfn << PAGE_SHIFT
+ | ((unsigned long) va & ~PAGE_MASK));
+}
+
#define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT))
static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)];
static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)];
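mapped_domain_page_to_maddr() above inverts the mapcache mapping: the slot index is the page-aligned offset of the pointer into the mapcache region, the pfn comes from that slot's l1 entry, and the low 12 bits of the pointer are carried over unchanged. A toy check of the arithmetic; the base address and pfn below are made up for illustration:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t mapcache_base = 0xffc00000ULL;           /* illustrative */
        uint64_t va    = mapcache_base + (7ULL << 12) + 0x123;  /* slot 7 */
        uint64_t idx   = (va - mapcache_base) >> 12;
        uint64_t pfn   = 0xabcde;            /* would come from l1tab[idx] */
        uint64_t maddr = (pfn << 12) | (va & 0xfff);

        assert(idx == 7);
        assert((maddr >> 12) == pfn && (maddr & 0xfff) == 0x123);
        return 0;
    }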
@@ -233,6 +258,8 @@ void unmap_domain_page_global(void *va)
l1_pgentry_t *pl1e;
unsigned int idx;
+ ASSERT((__va >= IOREMAP_VIRT_START) && (__va <= (IOREMAP_VIRT_END - 1)));
+
/* /First/, we zap the PTE. */
pl2e = virt_to_xen_l2e(__va);
pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va);
diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c
index 868140e586..dc2450201a 100644
--- a/xen/arch/x86/x86_32/mm.c
+++ b/xen/arch/x86/x86_32/mm.c
@@ -75,8 +75,7 @@ void __init paging_init(void)
printk("PAE disabled.\n");
#endif
- idle_vcpu[0]->arch.monitor_table =
- pagetable_from_paddr(__pa(idle_pg_table));
+ idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
if ( cpu_has_pge )
{
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index d5db7f3b30..f173c05d83 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -81,8 +81,7 @@ void __init paging_init(void)
l2_pgentry_t *l2_ro_mpt;
struct page_info *pg;
- idle_vcpu[0]->arch.monitor_table =
- pagetable_from_paddr(__pa(idle_pg_table));
+ idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
/* Create user-accessible L2 directory to map the MPT for guests. */
l3_ro_mpt = alloc_xenheap_page();
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index cfe2a6a5a0..84c9c35952 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -84,7 +84,8 @@ void show_page_walk(unsigned long addr)
l4e = l4t[l4_table_offset(addr)];
mfn = l4e_get_pfn(l4e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L4 = %"PRIpte" %016lx\n", l4e_get_intpte(l4e), pfn);
+ printk(" L4[0x%lx] = %"PRIpte" %016lx\n",
+ l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
return;
@@ -92,7 +93,8 @@ void show_page_walk(unsigned long addr)
l3e = l3t[l3_table_offset(addr)];
mfn = l3e_get_pfn(l3e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L3 = %"PRIpte" %016lx\n", l3e_get_intpte(l3e), pfn);
+ printk(" L3[0x%lx] = %"PRIpte" %016lx\n",
+ l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
return;
@@ -100,7 +102,8 @@ void show_page_walk(unsigned long addr)
l2e = l2t[l2_table_offset(addr)];
mfn = l2e_get_pfn(l2e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L2 = %"PRIpte" %016lx %s\n", l2e_get_intpte(l2e), pfn,
+ printk(" L2[0x%lx] = %"PRIpte" %016lx %s\n",
+ l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
(l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
(l2e_get_flags(l2e) & _PAGE_PSE) )
@@ -110,7 +113,8 @@ void show_page_walk(unsigned long addr)
l1e = l1t[l1_table_offset(addr)];
mfn = l1e_get_pfn(l1e);
pfn = get_gpfn_from_mfn(mfn);
- printk(" L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
+ printk(" L1[0x%lx] = %"PRIpte" %016lx\n",
+ l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
}
asmlinkage void double_fault(void);
@@ -162,7 +166,7 @@ void toggle_guest_mode(struct vcpu *v)
{
v->arch.flags ^= TF_kernel_mode;
__asm__ __volatile__ ( "swapgs" );
- update_pagetables(v);
+ update_cr3(v);
write_ptbase(v);
}
diff --git a/xen/common/acm_ops.c b/xen/common/acm_ops.c
index 6c65612799..3692577873 100644
--- a/xen/common/acm_ops.c
+++ b/xen/common/acm_ops.c
@@ -26,7 +26,6 @@
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/guest_access.h>
-#include <asm/shadow.h>
#include <public/sched_ctl.h>
#include <acm/acm_hooks.h>
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index ad33217711..c8ba260711 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -434,7 +434,7 @@ __gnttab_unmap_grant_ref(
/* If just unmapped a writable mapping, mark as dirtied */
if ( !(flags & GNTMAP_readonly) )
- gnttab_log_dirty(rd, frame);
+ gnttab_mark_dirty(rd, frame);
if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) &&
!(flags & GNTMAP_readonly) )
@@ -731,7 +731,7 @@ __release_grant_for_copy(
const unsigned long r_frame = act->frame;
if ( !readonly )
- gnttab_log_dirty(rd, r_frame);
+ gnttab_mark_dirty(rd, r_frame);
spin_lock(&rd->grant_table->lock);
if ( readonly )
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index fb7118e71f..1fb50b6bd2 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -241,9 +241,6 @@ static void read_clocks(unsigned char key)
}
extern void dump_runq(unsigned char key);
-#ifndef NDEBUG
-extern void audit_domains_key(unsigned char key);
-#endif
#ifdef PERF_COUNTERS
extern void perfc_printall(unsigned char key);
@@ -261,10 +258,16 @@ static void do_debug_key(unsigned char key, struct cpu_user_regs *regs)
#ifndef NDEBUG
static void debugtrace_key(unsigned char key)
{
- debugtrace_send_to_console = !debugtrace_send_to_console;
- debugtrace_dump();
- printk("debugtrace_printk now writing to %s.\n",
- debugtrace_send_to_console ? "console" : "buffer");
+ debugtrace_toggle();
+}
+
+static void shadow2_audit_key(unsigned char key)
+{
+ extern int shadow2_audit_enable;
+
+ shadow2_audit_enable = !shadow2_audit_enable;
+ printk("%s shadow2_audit_enable=%d\n",
+ __func__, shadow2_audit_enable);
}
#endif
@@ -288,7 +291,7 @@ void initialize_keytable(void)
#ifndef NDEBUG
register_keyhandler(
- 'o', audit_domains_key, "audit domains >0 EXPERIMENTAL");
+ 'O', shadow2_audit_key, "toggle shadow2 audits");
register_keyhandler(
'T', debugtrace_key, "toggle debugtrace to console/buffer");
#endif
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 0a631ca83e..9962c2e89a 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -126,6 +126,11 @@ populate_physmap(
for ( j = 0; j < (1 << extent_order); j++ )
guest_physmap_add_page(d, gpfn + j, mfn + j);
}
+ else if ( unlikely(shadow2_mode_translate(d)) )
+ {
+ for ( j = 0; j < (1 << extent_order); j++ )
+ shadow2_guest_physmap_add_page(d, gpfn + j, mfn + j);
+ }
else
{
for ( j = 0; j < (1 << extent_order); j++ )
@@ -153,7 +158,7 @@ guest_remove_page(
if ( unlikely(!mfn_valid(mfn)) )
{
DPRINTK("Domain %u page number %lx invalid\n",
- d->domain_id, mfn);
+ d->domain_id, gmfn);
return 0;
}
@@ -179,7 +184,7 @@ guest_remove_page(
(unsigned long)page->count_info, page->u.inuse.type_info);
}
- guest_physmap_remove_page(d, gmfn, mfn);
+ shadow2_guest_physmap_remove_page(d, gmfn, mfn);
put_page(page);
@@ -250,7 +255,7 @@ translate_gpfn_list(
if ( (d = find_domain_by_id(op.domid)) == NULL )
return -ESRCH;
- if ( !shadow_mode_translate(d) )
+ if ( !(shadow_mode_translate(d) || shadow2_mode_translate(d)) )
{
put_domain(d);
return -EINVAL;
diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c
index 8bd1c28915..974f6e3d8e 100644
--- a/xen/drivers/char/console.c
+++ b/xen/drivers/char/console.c
@@ -569,7 +569,7 @@ int console_getc(void)
#ifndef NDEBUG
/* Send output direct to console, or buffer it? */
-int debugtrace_send_to_console;
+static volatile int debugtrace_send_to_console;
static char *debugtrace_buf; /* Debug-trace buffer */
static unsigned int debugtrace_prd; /* Producer index */
@@ -578,17 +578,11 @@ static unsigned int debugtrace_used;
static DEFINE_SPINLOCK(debugtrace_lock);
integer_param("debugtrace", debugtrace_kilobytes);
-void debugtrace_dump(void)
+static void debugtrace_dump_worker(void)
{
- unsigned long flags;
-
if ( (debugtrace_bytes == 0) || !debugtrace_used )
return;
- watchdog_disable();
-
- spin_lock_irqsave(&debugtrace_lock, flags);
-
printk("debugtrace_dump() starting\n");
/* Print oldest portion of the ring. */
@@ -602,15 +596,47 @@ void debugtrace_dump(void)
memset(debugtrace_buf, '\0', debugtrace_bytes);
printk("debugtrace_dump() finished\n");
+}
+
+void debugtrace_toggle(void)
+{
+ unsigned long flags;
+
+ watchdog_disable();
+ spin_lock_irqsave(&debugtrace_lock, flags);
+
+ // dump the buffer *before* toggling, in case the act of dumping the
+ // buffer itself causes more printk's...
+ //
+ printk("debugtrace_printk now writing to %s.\n",
+ !debugtrace_send_to_console ? "console": "buffer");
+ if ( !debugtrace_send_to_console )
+ debugtrace_dump_worker();
+
+ debugtrace_send_to_console = !debugtrace_send_to_console;
spin_unlock_irqrestore(&debugtrace_lock, flags);
+ watchdog_enable();
+
+}
+
+void debugtrace_dump(void)
+{
+ unsigned long flags;
+ watchdog_disable();
+ spin_lock_irqsave(&debugtrace_lock, flags);
+
+ debugtrace_dump_worker();
+
+ spin_unlock_irqrestore(&debugtrace_lock, flags);
watchdog_enable();
}
void debugtrace_printk(const char *fmt, ...)
{
static char buf[1024];
+ static u32 count;
va_list args;
char *p;
@@ -625,8 +651,10 @@ void debugtrace_printk(const char *fmt, ...)
ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0);
+ sprintf(buf, "%u ", ++count);
+
va_start(args, fmt);
- (void)vsnprintf(buf, sizeof(buf), fmt, args);
+ (void)vsnprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), fmt, args);
va_end(args);
if ( debugtrace_send_to_console )
diff --git a/xen/include/asm-x86/bitops.h b/xen/include/asm-x86/bitops.h
index b2ee953361..b9fd2557d0 100644
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -75,6 +75,24 @@ static __inline__ void clear_bit(int nr, volatile void * addr)
:"=m" (ADDR)
:"dIr" (nr));
}
+
+/**
+ * __clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Unlike clear_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+ __asm__(
+ "btrl %1,%0"
+ :"=m" (ADDR)
+ :"dIr" (nr));
+}
+
#define smp_mb__before_clear_bit() barrier()
#define smp_mb__after_clear_bit() barrier()
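As the new comment notes, __clear_bit() drops the lock prefix and is therefore only safe on data that a single CPU owns or that is protected by a lock. A portable stand-in (plain C, not the Xen inline asm) showing the equivalent operation on a caller-private bitmap:

    #include <limits.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    /* Non-atomic clear, equivalent in effect to __clear_bit(nr, addr). */
    static inline void portable_clear_bit(int nr, unsigned long *addr)
    {
        addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
    }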
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 99c74cf5ad..74a123de6f 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -79,9 +79,14 @@
#ifndef __ASSEMBLY__
extern unsigned long _end; /* standard ELF symbol */
-#endif /* __ASSEMBLY__ */
-#define FORCE_CRASH() __asm__ __volatile__ ( "ud2" )
+static inline void FORCE_CRASH(void) __attribute__((noreturn,always_inline));
+static inline void FORCE_CRASH(void)
+{
+ __asm__ __volatile__ ( "ud2" );
+ while(1);
+}
+#endif /* __ASSEMBLY__ */
#if defined(__x86_64__)
@@ -149,9 +154,14 @@ extern unsigned long _end; /* standard ELF symbol */
/* Slot 256: read-only guest-accessible machine-to-phys translation table. */
#define RO_MPT_VIRT_START (PML4_ADDR(256))
#define RO_MPT_VIRT_END (RO_MPT_VIRT_START + PML4_ENTRY_BYTES/2)
+
+// currently unused?
+#if 0
/* Slot 257: read-only guest-accessible linear page table. */
#define RO_LINEAR_PT_VIRT_START (PML4_ADDR(257))
#define RO_LINEAR_PT_VIRT_END (RO_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
+#endif
+
/* Slot 258: linear page table (guest table). */
#define LINEAR_PT_VIRT_START (PML4_ADDR(258))
#define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
@@ -175,7 +185,7 @@ extern unsigned long _end; /* standard ELF symbol */
#define DIRECTMAP_VIRT_START (PML4_ADDR(262))
#define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2)
-#define PGT_base_page_table PGT_l4_page_table
+#define PGT_base_page_table PGT_l4_page_table
#define __HYPERVISOR_CS64 0xe010
#define __HYPERVISOR_CS32 0xe008
@@ -274,9 +284,9 @@ extern unsigned long _end; /* standard ELF symbol */
(L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1)
#ifdef CONFIG_X86_PAE
-# define PGT_base_page_table PGT_l3_page_table
+# define PGT_base_page_table PGT_l3_page_table
#else
-# define PGT_base_page_table PGT_l2_page_table
+# define PGT_base_page_table PGT_l2_page_table
#endif
#define __HYPERVISOR_CS 0xe008
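Turning FORCE_CRASH() from a bare asm statement into a noreturn always_inline function (ud2 followed by an infinite loop) tells the compiler that control never comes back, which keeps call sites such as switch defaults free of "missing return" warnings. A small standalone sketch of the same idea; my_force_crash and lookup are illustrative names, not code from this patch:

    #include <stdio.h>

    static inline void my_force_crash(void) __attribute__((noreturn, always_inline));
    static inline void my_force_crash(void)
    {
        __asm__ __volatile__ ( "ud2" );  /* undefined instruction: #UD fault */
        while (1)
            ;                            /* satisfies the noreturn contract */
    }

    /* Because my_force_crash() is noreturn, the compiler knows the default:
     * branch never falls through and does not warn about a missing return. */
    static int lookup(int key)
    {
        switch (key)
        {
        case 0: return 100;
        case 1: return 200;
        default:
            my_force_crash();
        }
    }

    int main(void)
    {
        printf("%d\n", lookup(1));
        return 0;
    }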
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index a0efe89f0a..2ef0775795 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -73,42 +73,42 @@ struct arch_domain
/* I/O-port admin-specified access capabilities. */
struct rangeset *ioport_caps;
- /* Shadow mode status and controls. */
- struct shadow_ops *ops;
- unsigned int shadow_mode; /* flags to control shadow table operation */
- unsigned int shadow_nest; /* Recursive depth of shadow_lock() nesting */
-
- /* shadow hashtable */
- struct shadow_status *shadow_ht;
- struct shadow_status *shadow_ht_free;
- struct shadow_status *shadow_ht_extras; /* extra allocation units */
- unsigned int shadow_extras_count;
-
- /* shadow dirty bitmap */
+ /* HVM stuff */
+ struct hvm_domain hvm_domain;
+
+ /* Shadow-translated guest: Pseudophys base address of reserved area. */
+ unsigned long first_reserved_pfn;
+
+ /* Shadow2 stuff */
+ u32 shadow2_mode; /* flags to control shadow operation */
+ spinlock_t shadow2_lock; /* shadow2 domain lock */
+ int shadow2_locker; /* processor which holds the lock */
+ const char *shadow2_locker_function; /* Func that took it */
+ struct list_head shadow2_freelists[SHADOW2_MAX_ORDER + 1];
+ struct list_head shadow2_p2m_freelist;
+ struct list_head shadow2_p2m_inuse;
+ struct list_head shadow2_toplevel_shadows;
+ unsigned int shadow2_total_pages; /* number of pages allocated */
+ unsigned int shadow2_free_pages; /* number of pages on freelists */
+ unsigned int shadow2_p2m_pages; /* number of pages in p2m map */
+
+ /* Shadow2 hashtable */
+ struct shadow2_hash_entry *shadow2_hash_table;
+ struct shadow2_hash_entry *shadow2_hash_freelist;
+ struct shadow2_hash_entry *shadow2_hash_allocations;
+ int shadow2_hash_walking; /* Some function is walking the hash table */
+
+ /* Shadow log-dirty bitmap */
unsigned long *shadow_dirty_bitmap;
unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */
- /* shadow mode stats */
- unsigned int shadow_page_count;
- unsigned int hl2_page_count;
- unsigned int snapshot_page_count;
-
+ /* Shadow log-dirty mode stats */
unsigned int shadow_fault_count;
unsigned int shadow_dirty_count;
- /* full shadow mode */
- struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
- struct out_of_sync_entry *out_of_sync_free;
- struct out_of_sync_entry *out_of_sync_extras;
- unsigned int out_of_sync_extras_count;
+ /* Shadow translated domain: P2M mapping */
+ pagetable_t phys_table;
- struct list_head free_shadow_frames;
-
- pagetable_t phys_table; /* guest 1:1 pagetable */
- struct hvm_domain hvm_domain;
-
- /* Shadow-translated guest: Pseudophys base address of reserved area. */
- unsigned long first_reserved_pfn;
} __cacheline_aligned;
#ifdef CONFIG_X86_PAE
@@ -166,25 +166,34 @@ struct arch_vcpu
*/
l1_pgentry_t *perdomain_ptes;
- pagetable_t guest_table_user; /* x86/64: user-space pagetable. */
- pagetable_t guest_table; /* (MA) guest notion of cr3 */
- pagetable_t shadow_table; /* (MA) shadow of guest */
- pagetable_t monitor_table; /* (MA) used in hypervisor */
-
- l2_pgentry_t *guest_vtable; /* virtual address of pagetable */
- l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */
- l2_pgentry_t *monitor_vtable; /* virtual address of monitor_table */
- l1_pgentry_t *hl2_vtable; /* virtual address of hl2_table */
-
#ifdef CONFIG_X86_64
- l3_pgentry_t *guest_vl3table;
- l4_pgentry_t *guest_vl4table;
+ pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */
#endif
+ pagetable_t guest_table; /* (MFN) guest notion of cr3 */
+ /* guest_table holds a ref to the page, and also a type-count unless
+ * shadow refcounts are in use */
+ pagetable_t shadow_table; /* (MFN) shadow of guest */
+ pagetable_t monitor_table; /* (MFN) hypervisor PT (for HVM) */
+ unsigned long cr3; /* (MA) value to install in HW CR3 */
- unsigned long monitor_shadow_ref;
+ void *guest_vtable; /* virtual address of pagetable */
+ void *shadow_vtable; /* virtual address of shadow_table */
+ root_pgentry_t *monitor_vtable; /* virtual address of monitor_table */
/* Current LDT details. */
unsigned long shadow_ldt_mapcnt;
+
+ /* Shadow2 stuff */
+ /* -- pointers to mode-specific entry points */
+ struct shadow2_entry_points *shadow2;
+ unsigned long last_emulated_mfn; /* last mfn we emulated a write to */
+ u8 shadow2_propagate_fault; /* emulated fault needs to be */
+ /* propagated to guest */
+#if CONFIG_PAGING_LEVELS >= 3
+ u8 shadow2_pae_flip_pending; /* shadow update requires this PAE cpu
+ * to recopy/install its L3 table.
+ */
+#endif
} __cacheline_aligned;
/* shorthands to improve code legibility */
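The new shadow2_lock is accompanied by shadow2_locker and shadow2_locker_function, i.e. the lock remembers which CPU took it and where. The macros that maintain those fields live in shadow2-private.h and are not part of this hunk; the sketch below only illustrates the general "lock with owner tracking for recursion detection" idea, using pthreads and invented names (owner_lock, ACQUIRE):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative only: a lock that records who holds it, so a second
     * acquisition from the same context is reported instead of deadlocking. */
    struct owner_lock {
        pthread_mutex_t lock;
        int             locker;            /* holder's CPU/thread id, -1 if free */
        const char     *locker_function;   /* function that took the lock        */
    };

    static void owner_lock_acquire(struct owner_lock *l, int cpu, const char *fn)
    {
        if (l->locker == cpu) {
            /* Only this context could have stored its own id here, so this
             * test reliably catches self-recursion. */
            fprintf(stderr, "recursive lock: already taken by %s\n",
                    l->locker_function);
            abort();
        }
        pthread_mutex_lock(&l->lock);
        l->locker = cpu;
        l->locker_function = fn;
    }

    static void owner_lock_release(struct owner_lock *l)
    {
        l->locker = -1;
        l->locker_function = "nobody";
        pthread_mutex_unlock(&l->lock);
    }

    #define ACQUIRE(l, cpu) owner_lock_acquire((l), (cpu), __func__)

    int main(void)
    {
        struct owner_lock l = { PTHREAD_MUTEX_INITIALIZER, -1, "nobody" };

        ACQUIRE(&l, 0);
        owner_lock_release(&l);
        return 0;
    }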
diff --git a/xen/include/asm-x86/grant_table.h b/xen/include/asm-x86/grant_table.h
index 5c6600ac7e..277b93ca0c 100644
--- a/xen/include/asm-x86/grant_table.h
+++ b/xen/include/asm-x86/grant_table.h
@@ -31,7 +31,7 @@ int destroy_grant_host_mapping(
#define gnttab_shared_gmfn(d, t, i) \
(mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i)))
-#define gnttab_log_dirty(d, f) mark_dirty((d), (f))
+#define gnttab_mark_dirty(d, f) mark_dirty((d), (f))
static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr)
{
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 73f3b31275..cb573e5d9c 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -56,9 +56,16 @@ struct hvm_function_table {
*/
int (*realmode)(struct vcpu *v);
int (*paging_enabled)(struct vcpu *v);
+ int (*long_mode_enabled)(struct vcpu *v);
+ int (*guest_x86_mode)(struct vcpu *v);
int (*instruction_length)(struct vcpu *v);
unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
+ /*
+ * Re-set the value of CR3 that Xen runs on when handling VM exits
+ */
+ void (*update_host_cr3)(struct vcpu *v);
+
/*
* Update specifics of the guest state:
* 1) TS bit in guest cr0
@@ -134,11 +141,29 @@ hvm_paging_enabled(struct vcpu *v)
}
static inline int
+hvm_long_mode_enabled(struct vcpu *v)
+{
+ return hvm_funcs.long_mode_enabled(v);
+}
+
+static inline int
+hvm_guest_x86_mode(struct vcpu *v)
+{
+ return hvm_funcs.guest_x86_mode(v);
+}
+
+static inline int
hvm_instruction_length(struct vcpu *v)
{
return hvm_funcs.instruction_length(v);
}
+static inline void
+hvm_update_host_cr3(struct vcpu *v)
+{
+ hvm_funcs.update_host_cr3(v);
+}
+
void hvm_hypercall_page_initialise(struct domain *d,
void *hypercall_page);
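The table above gains three hooks (long_mode_enabled, guest_x86_mode, update_host_cr3) that each vendor backend must now provide; the VMX versions appear later in this patch, and SVM supplies its own in svm.c. A hedged sketch of how such a hook table is populated and called through, with fake_* standing in for the real backend functions:

    #include <stddef.h>

    struct vcpu;   /* opaque here */

    /* Trimmed-down version of the hook table, for illustration only. */
    struct hvm_hooks {
        int  (*paging_enabled)(struct vcpu *v);
        int  (*long_mode_enabled)(struct vcpu *v);
        int  (*guest_x86_mode)(struct vcpu *v);
        void (*update_host_cr3)(struct vcpu *v);
    };

    static int  fake_paging_enabled(struct vcpu *v)    { (void)v; return 1; }
    static int  fake_long_mode_enabled(struct vcpu *v) { (void)v; return 0; }
    static int  fake_guest_x86_mode(struct vcpu *v)    { (void)v; return 4; }
    static void fake_update_host_cr3(struct vcpu *v)   { (void)v; }

    static const struct hvm_hooks fake_hooks = {
        .paging_enabled    = fake_paging_enabled,
        .long_mode_enabled = fake_long_mode_enabled,
        .guest_x86_mode    = fake_guest_x86_mode,
        .update_host_cr3   = fake_update_host_cr3,
    };

    /* Caller-side wrapper, mirroring the hvm_*() inlines in hvm.h. */
    static inline int hooks_guest_x86_mode(struct vcpu *v)
    {
        return fake_hooks.guest_x86_mode(v);
    }

    int main(void)
    {
        return hooks_guest_x86_mode(NULL) == 4 ? 0 : 1;
    }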
diff --git a/xen/include/asm-x86/hvm/support.h b/xen/include/asm-x86/hvm/support.h
index 35a0bfe464..6ccfdee678 100644
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -116,10 +116,13 @@ enum hval_bitmaps {
#define DBG_LEVEL_IOAPIC (1 << 9)
extern unsigned int opt_hvm_debug_level;
-#define HVM_DBG_LOG(level, _f, _a...) \
- if ( (level) & opt_hvm_debug_level ) \
- printk("[HVM:%d.%d] <%s> " _f "\n", \
- current->domain->domain_id, current->vcpu_id, __func__, ## _a)
+#define HVM_DBG_LOG(level, _f, _a...) \
+ do { \
+ if ( (level) & opt_hvm_debug_level ) \
+ printk("[HVM:%d.%d] <%s> " _f "\n", \
+ current->domain->domain_id, current->vcpu_id, __func__, \
+ ## _a); \
+ } while (0)
#else
#define HVM_DBG_LOG(level, _f, _a...)
#endif
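Wrapping HVM_DBG_LOG() in do { ... } while (0) makes the macro expand to exactly one statement, so an if/else around a call site keeps its intended pairing. A standalone demonstration of the failure mode the wrapper prevents (LOG_BAD and LOG_GOOD are illustrative names):

    #include <stdio.h>

    static int verbose = 1;

    /* Without the wrapper, the macro expands to an if-statement that can
     * steal the caller's 'else' branch. */
    #define LOG_BAD(msg) \
        if (verbose) printf("%s\n", msg)

    /* With do { } while (0), the expansion is one statement and the
     * caller's if/else pairs up as written. */
    #define LOG_GOOD(msg) \
        do { if (verbose) printf("%s\n", msg); } while (0)

    int main(void)
    {
        int fault = 0;

        if (fault)
            LOG_GOOD("handling fault");
        else
            printf("no fault\n");    /* pairs with if (fault), as intended */

        /* The LOG_BAD form would attach this 'else' to the macro's internal
         * 'if (verbose)' instead, silently changing control flow:
         *
         *   if (fault)
         *       LOG_BAD("handling fault");
         *   else
         *       printf("no fault\n");
         */
        return 0;
    }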
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index f89b6ad787..b607a4578b 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -29,6 +29,7 @@
#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1
struct hvm_vcpu {
+ unsigned long hw_cr3; /* value we give to HW to use */
unsigned long ioflags;
struct hvm_io_op io_op;
struct vlapic *vlapic;
@@ -40,6 +41,11 @@ struct hvm_vcpu {
int xen_port;
+#if CONFIG_PAGING_LEVELS >= 3
+ l3_pgentry_t hvm_lowmem_l3tab[4]
+ __attribute__((__aligned__(32)));
+#endif
+
/* Flags */
int flag_dr_dirty;
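hvm_lowmem_l3tab is a four-entry PAE page-directory-pointer table: PAE CR3 carries only bits 31:5 of the table's address, so the table must be 32-byte aligned and reachable with a 32-bit pointer, hence the alignment attribute and the "lowmem" copy kept in the vcpu. A small sketch of that layout constraint (l3e_t is an illustrative stand-in for l3_pgentry_t):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative PAE L3 entry type; the real l3_pgentry_t lives in Xen. */
    typedef struct { uint64_t l3; } l3e_t;

    /* The PAE page-directory-pointer table is exactly four 8-byte entries,
     * and CR3 only carries bits 31:5 of its address, so the table must be
     * 32-byte aligned (and, for the HVM copy above, below 4GB). */
    static l3e_t pdpt[4] __attribute__((__aligned__(32)));

    int main(void)
    {
        uintptr_t addr = (uintptr_t)pdpt;

        printf("pdpt at %#lx, 32-byte aligned: %s\n",
               (unsigned long)addr, (addr & 0x1f) ? "no" : "yes");
        return 0;
    }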
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 85ee7046fd..524411be34 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -87,6 +87,7 @@ struct arch_vmx_struct {
unsigned long cpu_cr0; /* copy of guest CR0 */
unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+ unsigned long cpu_shadow_cr4; /* copy of guest read shadow CR4 */
unsigned long cpu_cr2; /* save CR2 */
unsigned long cpu_cr3;
unsigned long cpu_state;
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 38ae0e3b0f..38e447259c 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -298,6 +298,9 @@ static always_inline void __vmwrite_vcpu(
case GUEST_CR0:
v->arch.hvm_vmx.cpu_cr0 = value;
break;
+ case CR4_READ_SHADOW:
+ v->arch.hvm_vmx.cpu_shadow_cr4 = value;
+ break;
case CPU_BASED_VM_EXEC_CONTROL:
v->arch.hvm_vmx.cpu_based_exec_control = value;
break;
@@ -317,11 +320,14 @@ static always_inline void __vmread_vcpu(
case GUEST_CR0:
*value = v->arch.hvm_vmx.cpu_cr0;
break;
+ case CR4_READ_SHADOW:
+ *value = v->arch.hvm_vmx.cpu_shadow_cr4;
+ break;
case CPU_BASED_VM_EXEC_CONTROL:
*value = v->arch.hvm_vmx.cpu_based_exec_control;
break;
default:
- printk("__vmread_cpu: invalid field %lx\n", field);
+ printk("__vmread_vcpu: invalid field %lx\n", field);
break;
}
}
@@ -342,6 +348,7 @@ static inline int __vmwrite(unsigned long field, unsigned long value)
switch ( field ) {
case CR0_READ_SHADOW:
case GUEST_CR0:
+ case CR4_READ_SHADOW:
case CPU_BASED_VM_EXEC_CONTROL:
__vmwrite_vcpu(v, field, value);
break;
@@ -404,6 +411,46 @@ static inline int vmx_paging_enabled(struct vcpu *v)
return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
}
+/* Works only for vcpu == current */
+static inline int vmx_long_mode_enabled(struct vcpu *v)
+{
+ ASSERT(v == current);
+ return VMX_LONG_GUEST(current);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_realmode(struct vcpu *v)
+{
+ unsigned long rflags;
+ ASSERT(v == current);
+
+ __vmread(GUEST_RFLAGS, &rflags);
+ return rflags & X86_EFLAGS_VM;
+}
+
+/* Works only for vcpu == current */
+static inline void vmx_update_host_cr3(struct vcpu *v)
+{
+ ASSERT(v == current);
+ __vmwrite(HOST_CR3, v->arch.cr3);
+}
+
+static inline int vmx_guest_x86_mode(struct vcpu *v)
+{
+ unsigned long cs_ar_bytes;
+ ASSERT(v == current);
+
+ if ( vmx_long_mode_enabled(v) )
+ {
+ __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+ return (cs_ar_bytes & (1u<<13)) ? 8 : 4;
+ }
+ if ( vmx_realmode(v) )
+ return 2;
+ __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+ return (cs_ar_bytes & (1u<<14)) ? 4 : 2;
+}
+
static inline int vmx_pgbit_test(struct vcpu *v)
{
unsigned long cr0;
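vmx_guest_x86_mode() above decodes the guest's execution mode from the VMCS CS access-rights field: bit 13 is the L (64-bit code) bit and bit 14 is the D/B (default operand size) bit, giving 8, 4 or 2. A standalone re-statement of that decode, with sample access-rights values chosen for illustration:

    #include <stdio.h>

    /* Decode a VMX-style CS access-rights value into a rough x86 mode:
     * 8 = 64-bit code, 4 = 32-bit code, 2 = 16-bit/real/vm86 code. */
    static int guest_x86_mode(unsigned int cs_ar_bytes,
                              int long_mode_enabled, int realmode)
    {
        if (long_mode_enabled)
            return (cs_ar_bytes & (1u << 13)) ? 8 : 4;  /* CS.L bit   */
        if (realmode)
            return 2;
        return (cs_ar_bytes & (1u << 14)) ? 4 : 2;      /* CS.D/B bit */
    }

    int main(void)
    {
        /* Example access-rights values (illustrative, not dumped from HW). */
        printf("%d\n", guest_x86_mode(0x209b, 1, 0));  /* L=1  -> 8 */
        printf("%d\n", guest_x86_mode(0x409b, 0, 0));  /* D=1  -> 4 */
        printf("%d\n", guest_x86_mode(0x009b, 0, 1));  /* real -> 2 */
        return 0;
    }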
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 06ea598754..0b19fbe7ec 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -20,7 +20,11 @@
struct page_info
{
/* Each frame can be threaded onto a doubly-linked list. */
- struct list_head list;
+ union {
+ struct list_head list;
+ /* Shadow2 uses this field as an up-pointer in lower-level shadows */
+ paddr_t up;
+ };
/* Reference count and various PGC_xxx flags and fields. */
u32 count_info;
@@ -46,8 +50,20 @@ struct page_info
} u;
- /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
- u32 tlbflush_timestamp;
+ union {
+ /* Timestamp from 'TLB clock', used to reduce need for safety
+ * flushes. Only valid on a) free pages, and b) guest pages with a
+ * zero type count. */
+ u32 tlbflush_timestamp;
+
+ /* Only used on guest pages with a shadow.
+ * Guest pages with a shadow must have a non-zero type count, so this
+ * does not conflict with the tlbflush timestamp. */
+ u32 shadow2_flags;
+
+ // XXX -- we expect to add another field here, to be used for min/max
+ // purposes, which is only used for shadow pages.
+ };
};
/* The following page types are MUTUALLY EXCLUSIVE. */
@@ -60,6 +76,7 @@ struct page_info
#define PGT_ldt_page (6U<<29) /* using this page in an LDT? */
#define PGT_writable_page (7U<<29) /* has writable mappings of this page? */
+#ifndef SHADOW2
#define PGT_l1_shadow PGT_l1_page_table
#define PGT_l2_shadow PGT_l2_page_table
#define PGT_l3_shadow PGT_l3_page_table
@@ -69,14 +86,16 @@ struct page_info
#define PGT_writable_pred (7U<<29) /* predicted gpfn with writable ref */
#define PGT_fl1_shadow (5U<<29)
+#endif
+
#define PGT_type_mask (7U<<29) /* Bits 29-31. */
- /* Has this page been validated for use as its current type? */
-#define _PGT_validated 28
-#define PGT_validated (1U<<_PGT_validated)
/* Owning guest has pinned this page to its current type? */
-#define _PGT_pinned 27
+#define _PGT_pinned 28
#define PGT_pinned (1U<<_PGT_pinned)
+ /* Has this page been validated for use as its current type? */
+#define _PGT_validated 27
+#define PGT_validated (1U<<_PGT_validated)
#if defined(__i386__)
/* The 11 most significant bits of virt address if this is a page table. */
#define PGT_va_shift 16
@@ -98,6 +117,7 @@ struct page_info
/* 16-bit count of uses of this frame as its current type. */
#define PGT_count_mask ((1U<<16)-1)
+#ifndef SHADOW2
#ifdef __x86_64__
#define PGT_high_mfn_shift 52
#define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift)
@@ -112,19 +132,53 @@ struct page_info
#define PGT_score_shift 23
#define PGT_score_mask (((1U<<4)-1)<<PGT_score_shift)
#endif
+#endif /* SHADOW2 */
/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated 31
#define PGC_allocated (1U<<_PGC_allocated)
- /* Set when fullshadow mode marks a page out-of-sync */
+ /* Set on a *guest* page to mark it out-of-sync with its shadow */
#define _PGC_out_of_sync 30
#define PGC_out_of_sync (1U<<_PGC_out_of_sync)
- /* Set when fullshadow mode is using a page as a page table */
+ /* Set when a page is in use as a page table */
#define _PGC_page_table 29
#define PGC_page_table (1U<<_PGC_page_table)
/* 29-bit count of references to this frame. */
#define PGC_count_mask ((1U<<29)-1)
+/* shadow2 uses the count_info on shadow pages somewhat differently */
+/* NB: please coordinate any changes here with the SH2F's in shadow2.h */
+#define PGC_SH2_none (0U<<28) /* on the shadow2 free list */
+#define PGC_SH2_min_shadow (1U<<28)
+#define PGC_SH2_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */
+#define PGC_SH2_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */
+#define PGC_SH2_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */
+#define PGC_SH2_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */
+#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
+#define PGC_SH2_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */
+#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
+#define PGC_SH2_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */
+#define PGC_SH2_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */
+#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
+#define PGC_SH2_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */
+#define PGC_SH2_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */
+#define PGC_SH2_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */
+#define PGC_SH2_max_shadow (13U<<28)
+#define PGC_SH2_p2m_table (14U<<28) /* in use as the p2m table */
+#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */
+#define PGC_SH2_unused (15U<<28)
+
+#define PGC_SH2_type_mask (15U<<28)
+#define PGC_SH2_type_shift 28
+
+#define PGC_SH2_pinned (1U<<27)
+
+#define _PGC_SH2_log_dirty 26
+#define PGC_SH2_log_dirty (1U<<26)
+
+/* 26 bit ref count for shadow pages */
+#define PGC_SH2_count_mask ((1U<<26) - 1)
+
/* We trust the slab allocator in slab.c, and our use of it. */
#define PageSlab(page) (1)
#define PageSetSlab(page) ((void)0)
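For shadow2 pages, count_info is repartitioned as listed above: bits 31:28 hold the shadow type, bit 27 the pinned flag, bit 26 the log-dirty flag, and bits 25:0 the reference count. A small sketch of packing and unpacking those fields (the helper names are invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    #define SH2_TYPE_MASK   (15u << 28)       /* bits 31:28 - shadow type    */
    #define SH2_TYPE_SHIFT  28
    #define SH2_PINNED      (1u << 27)        /* bit 27     - pinned flag    */
    #define SH2_LOG_DIRTY   (1u << 26)        /* bit 26     - log-dirty flag */
    #define SH2_COUNT_MASK  ((1u << 26) - 1)  /* bits 25:0  - reference count */

    static unsigned int sh2_type(uint32_t count_info)
    {
        return (count_info & SH2_TYPE_MASK) >> SH2_TYPE_SHIFT;
    }

    static unsigned int sh2_refcount(uint32_t count_info)
    {
        return count_info & SH2_COUNT_MASK;
    }

    int main(void)
    {
        /* e.g. a pae L2-low shadow (type 6), pinned, log-dirty, 3 references */
        uint32_t ci = (6u << SH2_TYPE_SHIFT) | SH2_PINNED | SH2_LOG_DIRTY | 3u;

        printf("type=%u pinned=%u logdirty=%u refs=%u\n",
               sh2_type(ci),
               (unsigned int)!!(ci & SH2_PINNED),
               (unsigned int)!!(ci & SH2_LOG_DIRTY),
               sh2_refcount(ci));
        return 0;
    }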
@@ -134,16 +188,24 @@ struct page_info
#if defined(__i386__)
#define pickle_domptr(_d) ((u32)(unsigned long)(_d))
-#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d))
+static inline struct domain *unpickle_domptr(u32 _domain)
+{ return (_domain & 1) ? NULL : (void *)_domain; }
#define PRtype_info "08lx" /* should only be used for printk's */
#elif defined(__x86_64__)
static inline struct domain *unpickle_domptr(u32 _domain)
-{ return (_domain == 0) ? NULL : __va(_domain); }
+{ return ((_domain == 0) || (_domain & 1)) ? NULL : __va(_domain); }
static inline u32 pickle_domptr(struct domain *domain)
{ return (domain == NULL) ? 0 : (u32)__pa(domain); }
#define PRtype_info "016lx"/* should only be used for printk's */
#endif
+/* The order of the largest allocation unit we use for shadow pages */
+#if CONFIG_PAGING_LEVELS == 2
+#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */
+#else
+#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
+#endif
+
#define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain))
#define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
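SHADOW2_MAX_ORDER is the largest allocation order the shadow pool ever asks for: order n means 2^n contiguous 4KB pages, so order 0 is 4KB and order 2 is 16KB, matching the "16k allocs for 32-bit on PAE/64" comment. A one-off check of that arithmetic:

    #include <stdio.h>

    #define PAGE_SIZE 4096u

    int main(void)
    {
        unsigned int order;

        for (order = 0; order <= 2; order++)
            printf("order %u -> %u bytes\n", order, (1u << order) * PAGE_SIZE);
        /* order 0 -> 4096, order 1 -> 8192, order 2 -> 16384 (16KB) */
        return 0;
    }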
@@ -165,7 +227,7 @@ extern void invalidate_shadow_ldt(struct vcpu *d);
extern int shadow_remove_all_write_access(
struct domain *d, unsigned long gmfn, unsigned long mfn);
extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow_mode_refcounts(struct domain *d);
+extern int _shadow2_mode_refcounts(struct domain *d);
static inline void put_page(struct page_info *page)
{
@@ -197,8 +259,8 @@ static inline int get_page(struct page_info *page,
unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
unlikely(d != _domain) ) /* Wrong owner? */
{
- if ( !_shadow_mode_refcounts(domain) )
- DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
+ if ( !_shadow2_mode_refcounts(domain) )
+ DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
PRtype_info "\n",
page_to_mfn(page), domain, unpickle_domptr(d),
x, page->u.inuse.type_info);
@@ -254,6 +316,16 @@ static inline int page_is_removable(struct page_info *page)
ASSERT(((_p)->count_info & PGC_count_mask) != 0); \
ASSERT(page_get_owner(_p) == (_d))
+// Quick test for whether a given page can be represented directly in CR3.
+//
+#if CONFIG_PAGING_LEVELS == 3
+#define MFN_FITS_IN_CR3(_MFN) (!(mfn_x(_MFN) >> 20))
+
+/* returns a lowmem machine address of the copied L3 root table */
+unsigned long
+pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
int check_descriptor(struct desc_struct *d);
/*
@@ -271,29 +343,44 @@ int check_descriptor(struct desc_struct *d);
#define set_gpfn_from_mfn(mfn, pfn) (machine_to_phys_mapping[(mfn)] = (pfn))
#define get_gpfn_from_mfn(mfn) (machine_to_phys_mapping[(mfn)])
+
+#define mfn_to_gmfn(_d, mfn) \
+ ( (shadow2_mode_translate(_d)) \
+ ? get_gpfn_from_mfn(mfn) \
+ : (mfn) )
+
+#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn))
+
+
/*
* The phys_to_machine_mapping is the reversed mapping of MPT for full
* virtualization. It is only used by shadow_mode_translate()==true
* guests, so we steal the address space that would have normally
* been used by the read-only MPT map.
*/
-#define phys_to_machine_mapping ((unsigned long *)RO_MPT_VIRT_START)
-#define NR_P2M_TABLE_ENTRIES ((unsigned long *)RO_MPT_VIRT_END \
- - phys_to_machine_mapping)
+#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START)
#define INVALID_MFN (~0UL)
#define VALID_MFN(_mfn) (!((_mfn) & (1U<<31)))
-#define set_mfn_from_gpfn(pfn, mfn) (phys_to_machine_mapping[(pfn)] = (mfn))
static inline unsigned long get_mfn_from_gpfn(unsigned long pfn)
{
- unsigned long mfn;
+ l1_pgentry_t l1e = l1e_empty();
+ int ret;
+
+#if CONFIG_PAGING_LEVELS > 2
+ if ( pfn >= (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
+ /* This pfn is higher than the p2m map can hold */
+ return INVALID_MFN;
+#endif
+
+ ret = __copy_from_user(&l1e,
+ &phys_to_machine_mapping[pfn],
+ sizeof(l1e));
- if ( unlikely(pfn >= NR_P2M_TABLE_ENTRIES) ||
- unlikely(__copy_from_user(&mfn, &phys_to_machine_mapping[pfn],
- sizeof(mfn))) )
- mfn = INVALID_MFN;
+ if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) )
+ return l1e_get_pfn(l1e);
- return mfn;
+ return INVALID_MFN;
}
#ifdef MEMORY_GUARD
@@ -333,6 +420,7 @@ void audit_domains(void);
#endif
int new_guest_cr3(unsigned long pfn);
+void make_cr3(struct vcpu *v, unsigned long mfn);
void propagate_page_fault(unsigned long addr, u16 error_code);
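get_mfn_from_gpfn() now treats the read-only p2m area as an array of L1 entries: bounds-check the pfn, copy the entry out, and only trust the frame number if the copy succeeded and the entry's present bit is set. A hedged standalone sketch of that copy-then-validate pattern (the toy p2m array and memcpy stand in for the RO_MPT mapping and __copy_from_user):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_PRESENT  0x1u
    #define INVALID_MFN   (~0UL)

    /* Toy p2m table standing in for the read-only RO_MPT mapping. */
    static uint64_t p2m[16];

    /* Mirror of the new lookup logic: bounds-check the pfn, copy the entry
     * out, and only believe it if the present bit is set. */
    static unsigned long gpfn_to_mfn(unsigned long pfn)
    {
        uint64_t l1e;

        if (pfn >= sizeof(p2m) / sizeof(p2m[0]))
            return INVALID_MFN;                 /* beyond what the map holds */

        memcpy(&l1e, &p2m[pfn], sizeof(l1e));   /* stands in for __copy_from_user */

        if (l1e & PAGE_PRESENT)
            return (unsigned long)(l1e >> 12);  /* frame number from the entry */

        return INVALID_MFN;
    }

    int main(void)
    {
        p2m[3] = (0x1234ULL << 12) | PAGE_PRESENT;

        printf("gpfn 3  -> mfn %#lx\n", gpfn_to_mfn(3));
        printf("gpfn 5  -> mfn %#lx\n", gpfn_to_mfn(5));   /* not present  */
        printf("gpfn 99 -> mfn %#lx\n", gpfn_to_mfn(99));  /* out of range */
        return 0;
    }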
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index f1c08cf500..07c09b2ae2 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -112,6 +112,10 @@ static inline void wrmsrl(unsigned int msr, __u64 val)
#define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483
#define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484
#define MSR_IA32_VMX_MISC_MSR 0x485
+#define MSR_IA32_VMX_CR0_FIXED0 0x486
+#define MSR_IA32_VMX_CR0_FIXED1 0x487
+#define MSR_IA32_VMX_CR4_FIXED0 0x488
+#define MSR_IA32_VMX_CR4_FIXED1 0x489
#define IA32_FEATURE_CONTROL_MSR 0x3a
#define IA32_FEATURE_CONTROL_MSR_LOCK 0x1
#define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON 0x4
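The four new MSRs describe which CR0/CR4 bits are forced on (FIXED0) and which are allowed to be on (FIXED1) while VMX is active; a candidate value is legal only if it contains every FIXED0 bit and nothing outside FIXED1. A sketch of that check with made-up register values (reading the real MSRs needs rdmsr in ring 0):

    #include <stdint.h>
    #include <stdio.h>

    /* A value is acceptable if every FIXED0 bit is set in it and it sets
     * nothing outside FIXED1. */
    static int cr_value_ok(uint64_t val, uint64_t fixed0, uint64_t fixed1)
    {
        return ((val & fixed0) == fixed0) && ((val & ~fixed1) == 0);
    }

    int main(void)
    {
        /* Illustrative values only; real ones come from the MSRs above. */
        uint64_t cr4_fixed0 = 0x00002000;   /* CR4.VMXE must be set   */
        uint64_t cr4_fixed1 = 0x000027ff;   /* bits allowed to be set */

        printf("%d\n", cr_value_ok(0x00002020, cr4_fixed0, cr4_fixed1)); /* 1 */
        printf("%d\n", cr_value_ok(0x00000020, cr4_fixed0, cr4_fixed1)); /* 0: VMXE clear */
        return 0;
    }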
diff --git a/xen/include/asm-x86/page-guest32.h b/xen/include/asm-x86/page-guest32.h
index cf5595b078..e93206169a 100644
--- a/xen/include/asm-x86/page-guest32.h
+++ b/xen/include/asm-x86/page-guest32.h
@@ -89,15 +89,8 @@ static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags)
#define linear_l1_table_32 \
((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table_32 \
- ((l2_pgentry_32_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0))))
#define linear_pg_table_32 linear_l1_table_32
-#define linear_l2_table_32(_ed) ((_ed)->arch.guest_vtable)
-
-#define va_to_l1mfn_32(_ed, _va) \
- (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
#endif /* __X86_PAGE_GUEST_H__ */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 6432402066..94158c7f3d 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -233,26 +233,18 @@ typedef struct { u64 pfn; } pagetable_t;
+ DOMAIN_ENTRIES_PER_L4_PAGETABLE)
#endif
-#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK)
-#define linear_l1_table \
- ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0))))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1))))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) + \
- (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2))))
-
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
+#define linear_l1_table __linear_l1_table
#define linear_pg_table linear_l1_table
-#define linear_l2_table(v) ((v)->arch.guest_vtable)
-#define linear_l3_table(v) ((v)->arch.guest_vl3table)
-#define linear_l4_table(v) ((v)->arch.guest_vl4table)
+#define linear_l2_table(v) ((l2_pgentry_t *)(v)->arch.guest_vtable)
#ifndef __ASSEMBLY__
#if CONFIG_PAGING_LEVELS == 3
@@ -294,6 +286,7 @@ extern void paging_init(void);
#define _PAGE_AVAIL1 0x400U
#define _PAGE_AVAIL2 0x800U
#define _PAGE_AVAIL 0xE00U
+#define _PAGE_PSE_PAT 0x1000U
/*
* Debug option: Ensure that granted mappings are not implicitly unmapped.
@@ -307,9 +300,9 @@ extern void paging_init(void);
#endif
/*
- * Disallow unused flag bits plus PAT, PSE and GLOBAL. Also disallow GNTTAB
- * if we are using it for grant-table debugging. Permit the NX bit if the
- * hardware supports it.
+ * Disallow unused flag bits plus PAT, PSE and GLOBAL.
+ * Also disallow GNTTAB if we are using it for grant-table debugging.
+ * Permit the NX bit if the hardware supports it.
*/
#define BASE_DISALLOW_MASK ((0xFFFFF180U | _PAGE_GNTTAB) & ~_PAGE_NX)
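The rewritten __linear_lN_table macros all apply one rule: indexing the level-N linear table by LINEAR_PT_VIRT_START's own level-N offset lands on the level-(N+1) table, because the page tables are mapped recursively through a single top-level slot (slot 258 in the config.h hunk above). A standalone illustration of that offset arithmetic for a 4-level, 9-bits-per-level layout; the slot number is taken from this patch, the remaining constants are the usual x86-64 ones, and the sign extension of the real address is ignored for simplicity:

    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define PT_BITS     9                      /* 512 entries per level */
    #define L1_SHIFT    PAGE_SHIFT             /* 12 */
    #define L2_SHIFT    (L1_SHIFT + PT_BITS)   /* 21 */
    #define L3_SHIFT    (L2_SHIFT + PT_BITS)   /* 30 */

    #define l1_linear_offset(va) ((va) >> L1_SHIFT)
    #define l2_linear_offset(va) ((va) >> L2_SHIFT)
    #define l3_linear_offset(va) ((va) >> L3_SHIFT)

    int main(void)
    {
        /* Low 48 bits of PML4_ADDR(258); the real constant is sign-extended. */
        unsigned long long linear_pt_virt_start = 258ULL << 39;

        /* __linear_l2_table = __linear_l1_table + l1_linear_offset(START),
         * __linear_l3_table = __linear_l2_table + l2_linear_offset(START),
         * and so on: each offset below is the number of entries to step. */
        printf("l1 offset: %#llx entries\n", l1_linear_offset(linear_pt_virt_start));
        printf("l2 offset: %#llx entries\n", l2_linear_offset(linear_pt_virt_start));
        printf("l3 offset: %#llx entries\n", l3_linear_offset(linear_pt_virt_start));
        return 0;
    }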
diff --git a/xen/include/asm-x86/perfc_defn.h b/xen/include/asm-x86/perfc_defn.h
index 54bc01ea7c..d6e24b207d 100644
--- a/xen/include/asm-x86/perfc_defn.h
+++ b/xen/include/asm-x86/perfc_defn.h
@@ -144,4 +144,57 @@ PERFCOUNTER_CPU(remove_write_predicted, "remove_write predict hit&exit")
PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction")
PERFCOUNTER_CPU(update_hl2e_invlpg, "update_hl2e calls invlpg")
+/* Shadow2 counters */
+PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc")
+PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs")
+PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use")
+PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free")
+PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows")
+PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows")
+PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map")
+PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update")
+PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update")
+PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn")
+PERFCOUNTER_CPU(shadow2_fault_bail_not_present,
+ "shadow2_fault guest not-present")
+PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor,
+ "shadow2_fault guest U/S fault")
+PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read")
+PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write")
+PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails")
+PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio")
+PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault")
+PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate")
+PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e")
+PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e")
+PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e")
+PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e")
+PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup")
+PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head")
+PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses")
+PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status")
+PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert")
+PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete")
+PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access")
+PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low")
+PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force")
+PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings")
+PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force")
+PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit")
+PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit")
+PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 unshadows a page")
+PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer")
+PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force")
+PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed")
+PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables")
+PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits")
+PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses")
+
+
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index d460544d3e..81c8757f8e 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -545,6 +545,7 @@ extern always_inline void prefetchw(const void *x)
#endif
void show_stack(struct cpu_user_regs *regs);
+void show_xen_trace(void);
void show_stack_overflow(unsigned long esp);
void show_registers(struct cpu_user_regs *regs);
void show_execution_state(struct cpu_user_regs *regs);
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index 7144b24d8b..efade3021c 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -1,8 +1,7 @@
/******************************************************************************
* include/asm-x86/shadow.h
*
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
+ * Copyright (c) 2006 by XenSource Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,1782 +21,28 @@
#ifndef _XEN_SHADOW_H
#define _XEN_SHADOW_H
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/perfc.h>
-#include <xen/sched.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/current.h>
-#include <asm/flushtlb.h>
-#include <asm/processor.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/support.h>
-#include <asm/regs.h>
-#include <public/dom0_ops.h>
-#include <asm/shadow_public.h>
-#include <asm/page-guest32.h>
-#include <asm/shadow_ops.h>
+/* This file is just a wrapper around the new Shadow2 header,
+ * providing names that must be defined in any shadow implementation. */
-/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+#include <asm/shadow2.h>
-#define SHM_enable (1<<0) /* we're in one of the shadow modes */
-#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of
- guest tables */
-#define SHM_write_all (1<<2) /* allow write access to all guest pt pages,
- regardless of pte write permissions */
-#define SHM_log_dirty (1<<3) /* enable log dirty mode */
-#define SHM_translate (1<<4) /* Xen does p2m translation, not guest */
-#define SHM_external (1<<5) /* Xen does not steal address space from the
- domain for its own booking; requires VT or
- similar mechanisms */
-#define SHM_wr_pt_pte (1<<6) /* guest allowed to set PAGE_RW bit in PTEs which
- point to page table pages. */
+/* How to make sure a page is not referred to in a shadow PT */
+/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */
+#define shadow_drop_references(_d, _p) \
+ shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+#define shadow_sync_and_drop_references(_d, _p) \
+ shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
-#define shadow_mode_enabled(_d) ((_d)->arch.shadow_mode)
-#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts)
-#define shadow_mode_write_l1(_d) (VM_ASSIST(_d, VMASST_TYPE_writable_pagetables))
-#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all)
-#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
-#define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
-#define shadow_mode_external(_d) ((_d)->arch.shadow_mode & SHM_external)
-#define shadow_mode_wr_pt_pte(_d) ((_d)->arch.shadow_mode & SHM_wr_pt_pte)
+/* Whether we are translating the domain's frame numbers for it */
+#define shadow_mode_translate(d) shadow2_mode_translate(d)
-#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
- (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-#define shadow_linear_l2_table(_v) ((_v)->arch.shadow_vtable)
+/* ...and if so, how to add and remove entries in the mapping */
+#define guest_physmap_add_page(_d, _p, _m) \
+ shadow2_guest_physmap_add_page((_d), (_p), (_m))
+#define guest_physmap_remove_page(_d, _p, _m ) \
+ shadow2_guest_physmap_remove_page((_d), (_p), (_m))
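These wrappers keep old shadow_* call sites elsewhere in Xen compiling while everything is delegated to shadow2. A minimal standalone sketch of that "old name forwards to new implementation" pattern; the _stub function and the call site are invented for illustration, not taken from the patch:

    #include <stdio.h>

    /* New-style implementation (stands in for shadow2_remove_all_mappings). */
    static void shadow2_remove_all_mappings_stub(int vcpu_id, unsigned long mfn)
    {
        printf("vcpu%d: tearing down shadow mappings of mfn %#lx\n", vcpu_id, mfn);
    }

    /* Compatibility wrappers keep old call sites unchanged, just as the
     * shadow.h macros above forward shadow_drop_references() to shadow2. */
    #define shadow_drop_references(d, mfn) \
        shadow2_remove_all_mappings_stub((d), (mfn))
    #define shadow_sync_and_drop_references(d, mfn) \
        shadow2_remove_all_mappings_stub((d), (mfn))

    int main(void)
    {
        /* An old call site, e.g. just before a guest page is freed. */
        shadow_drop_references(0, 0x1a2bUL);
        return 0;
    }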
-// easy access to the hl2 table (for translated but not external modes only)
-#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
- (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-
-/*
- * For now we use the per-domain BIGLOCK rather than a shadow-specific lock.
- * We usually have the BIGLOCK already acquired anyway, so this is unlikely
- * to cause much unnecessary extra serialisation. Also it's a recursive
- * lock, and there are some code paths containing nested shadow_lock().
- * The #if0'ed code below is therefore broken until such nesting is removed.
- */
-#if 0
-#define shadow_lock_init(_d) \
- spin_lock_init(&(_d)->arch.shadow_lock)
-#define shadow_lock_is_acquired(_d) \
- spin_is_locked(&(_d)->arch.shadow_lock)
-#define shadow_lock(_d) \
-do { \
- ASSERT(!shadow_lock_is_acquired(_d)); \
- spin_lock(&(_d)->arch.shadow_lock); \
-} while (0)
-#define shadow_unlock(_d) \
-do { \
- ASSERT(!shadow_lock_is_acquired(_d)); \
- spin_unlock(&(_d)->arch.shadow_lock); \
-} while (0)
-#else
-#define shadow_lock_init(_d) \
- ((_d)->arch.shadow_nest = 0)
-#define shadow_lock_is_acquired(_d) \
- (spin_is_locked(&(_d)->big_lock) && ((_d)->arch.shadow_nest != 0))
-#define shadow_lock(_d) \
-do { \
- LOCK_BIGLOCK(_d); \
- (_d)->arch.shadow_nest++; \
-} while (0)
-#define shadow_unlock(_d) \
-do { \
- ASSERT(shadow_lock_is_acquired(_d)); \
- (_d)->arch.shadow_nest--; \
- UNLOCK_BIGLOCK(_d); \
-} while (0)
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-static inline u64 get_cr3_idxval(struct vcpu *v)
-{
- u64 pae_cr3;
-
- if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 &&
- !shadow_mode_log_dirty(v->domain) )
- {
- pae_cr3 = hvm_get_guest_ctrl_reg(v, 3); /* get CR3 */
- return (pae_cr3 >> PAE_CR3_ALIGN) & PAE_CR3_IDX_MASK;
- }
- else
- return 0;
-}
-
-#define shadow_key_t u64
-#define index_to_key(x) ((x) << 32)
-#else
-#define get_cr3_idxval(v) (0)
-#define shadow_key_t unsigned long
-#define index_to_key(x) (0)
-#endif
-
-
-#define SHADOW_ENCODE_MIN_MAX(_min, _max) ((((GUEST_L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min))
-#define SHADOW_MIN(_encoded) ((_encoded) & ((1u<<16) - 1))
-#define SHADOW_MAX(_encoded) ((GUEST_L1_PAGETABLE_ENTRIES - 1) - ((_encoded) >> 16))
-extern void shadow_direct_map_clean(struct domain *d);
-extern int shadow_direct_map_init(struct domain *d);
-extern int shadow_direct_map_fault(
- unsigned long vpa, struct cpu_user_regs *regs);
-extern void shadow_mode_init(void);
-extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
-extern int shadow_fault(unsigned long va, struct cpu_user_regs *regs);
-extern int shadow_mode_enable(struct domain *p, unsigned int mode);
-extern void shadow_invlpg(struct vcpu *, unsigned long);
-extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
- struct vcpu *v, unsigned long gpfn, unsigned long mfn);
-extern void free_monitor_pagetable(struct vcpu *v);
-extern void __shadow_sync_all(struct domain *d);
-extern int __shadow_out_of_sync(struct vcpu *v, unsigned long va);
-extern int set_p2m_entry(
- struct domain *d, unsigned long pfn, unsigned long mfn,
- struct domain_mmap_cache *l2cache,
- struct domain_mmap_cache *l1cache);
-extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype);
-
-extern void free_shadow_page(unsigned long smfn);
-
-extern void shadow_l1_normal_pt_update(struct domain *d,
- paddr_t pa, l1_pgentry_t l1e,
- struct domain_mmap_cache *cache);
-extern void shadow_l2_normal_pt_update(struct domain *d,
- paddr_t pa, l2_pgentry_t l2e,
- struct domain_mmap_cache *cache);
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/page-guest32.h>
-/*
- * va_mask cannot be used because it's used by the shadow hash.
- * Use the score area for for now.
- */
-#define is_xen_l2_slot(t,s) \
- ( ((((t) & PGT_score_mask) >> PGT_score_shift) == 3) && \
- ((s) >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES - 1))) )
-
-extern unsigned long gva_to_gpa(unsigned long gva);
-extern void shadow_l3_normal_pt_update(struct domain *d,
- paddr_t pa, l3_pgentry_t l3e,
- struct domain_mmap_cache *cache);
-#endif
-#if CONFIG_PAGING_LEVELS >= 4
-extern void shadow_l4_normal_pt_update(struct domain *d,
- paddr_t pa, l4_pgentry_t l4e,
- struct domain_mmap_cache *cache);
-#endif
-extern int shadow_do_update_va_mapping(unsigned long va,
- l1_pgentry_t val,
- struct vcpu *v);
-
-
-static inline unsigned long __shadow_status(
- struct domain *d, unsigned long gpfn, unsigned long stype);
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void update_hl2e(struct vcpu *v, unsigned long va);
-#endif
-
-static inline int page_is_page_table(struct page_info *page)
-{
- struct domain *owner = page_get_owner(page);
- u32 type_info;
-
- if ( owner && shadow_mode_refcounts(owner) )
- return page->count_info & PGC_page_table;
-
- type_info = page->u.inuse.type_info & PGT_type_mask;
- return type_info && (type_info <= PGT_l4_page_table);
-}
-
-static inline int mfn_is_page_table(unsigned long mfn)
-{
- if ( !mfn_valid(mfn) )
- return 0;
-
- return page_is_page_table(mfn_to_page(mfn));
-}
-
-static inline int page_out_of_sync(struct page_info *page)
-{
- return page->count_info & PGC_out_of_sync;
-}
-
-static inline int mfn_out_of_sync(unsigned long mfn)
-{
- if ( !mfn_valid(mfn) )
- return 0;
-
- return page_out_of_sync(mfn_to_page(mfn));
-}
-
-
-/************************************************************************/
-
-static void inline
-__shadow_sync_mfn(struct domain *d, unsigned long mfn)
-{
- if ( d->arch.out_of_sync )
- {
- // XXX - could be smarter
- //
- __shadow_sync_all(d);
- }
-}
-
-static void inline
-__shadow_sync_va(struct vcpu *v, unsigned long va)
-{
- struct domain *d = v->domain;
-
- if ( d->arch.out_of_sync && __shadow_out_of_sync(v, va) )
- {
- perfc_incrc(shadow_sync_va);
-
- // XXX - could be smarter
- //
- __shadow_sync_all(v->domain);
- }
-#if CONFIG_PAGING_LEVELS <= 2
- // Also make sure the HL2 is up-to-date for this address.
- //
- if ( unlikely(shadow_mode_translate(v->domain)) )
- update_hl2e(v, va);
-#endif
-}
-
-static void inline
-shadow_sync_all(struct domain *d)
-{
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- shadow_lock(d);
-
- if ( d->arch.out_of_sync )
- __shadow_sync_all(d);
-
- ASSERT(d->arch.out_of_sync == NULL);
-
- shadow_unlock(d);
- }
-}
-
-// SMP BUG: This routine can't ever be used properly in an SMP context.
-// It should be something like get_shadow_and_sync_va().
-// This probably shouldn't exist.
-//
-static void inline
-shadow_sync_va(struct vcpu *v, unsigned long gva)
-{
- struct domain *d = v->domain;
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- shadow_lock(d);
- __shadow_sync_va(v, gva);
- shadow_unlock(d);
- }
-}
-
-extern void __shadow_mode_disable(struct domain *d);
-static inline void shadow_mode_disable(struct domain *d)
-{
- if ( unlikely(shadow_mode_enabled(d)) )
- {
- shadow_lock(d);
- __shadow_mode_disable(d);
- shadow_unlock(d);
- }
-}
-
-/************************************************************************/
-
-#define mfn_to_gmfn(_d, mfn) \
- ( (shadow_mode_translate(_d)) \
- ? get_gpfn_from_mfn(mfn) \
- : (mfn) )
-
-#define gmfn_to_mfn(_d, gpfn) \
- ({ \
- unlikely(shadow_mode_translate(_d)) \
- ? (likely(current->domain == (_d)) \
- ? get_mfn_from_gpfn(gpfn) \
- : get_mfn_from_gpfn_foreign(_d, gpfn)) \
- : (gpfn); \
- })
-
-extern unsigned long get_mfn_from_gpfn_foreign(
- struct domain *d, unsigned long gpfn);
-
-/************************************************************************/
-
-struct shadow_status {
- struct shadow_status *next; /* Pull-to-front list per hash bucket. */
- shadow_key_t gpfn_and_flags; /* Guest pfn plus flags. */
- unsigned long smfn; /* Shadow mfn. */
-};
-
-#define shadow_ht_extra_size 128
-#define shadow_ht_buckets 256
-
-struct out_of_sync_entry {
- struct out_of_sync_entry *next;
- struct vcpu *v;
- unsigned long gpfn; /* why is this here? */
- unsigned long gmfn;
- unsigned long snapshot_mfn;
- paddr_t writable_pl1e; /* NB: this is a machine address */
- unsigned long va;
-};
-
-#define out_of_sync_extra_size 127
-
-#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
-
-/************************************************************************/
-#define SHADOW_DEBUG 0
-#define SHADOW_VERBOSE_DEBUG 0
-#define SHADOW_VVERBOSE_DEBUG 0
-#define SHADOW_VVVERBOSE_DEBUG 0
-#define SHADOW_HASH_DEBUG 0
-#define FULLSHADOW_DEBUG 0
-
-#if SHADOW_DEBUG
-extern int shadow_status_noswap;
-#define SHADOW_REFLECTS_SNAPSHOT _PAGE_AVAIL0
-#endif
-
-#if SHADOW_VERBOSE_DEBUG
-#define SH_LOG(_f, _a...) \
- printk("DOM%uP%u: SH_LOG(%d): " _f "\n", \
- current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a )
-#define SH_VLOG(_f, _a...) \
- printk("DOM%uP%u: SH_VLOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_LOG(_f, _a...) ((void)0)
-#define SH_VLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVERBOSE_DEBUG
-#define SH_VVLOG(_f, _a...) \
- printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVVERBOSE_DEBUG
-#define SH_VVVLOG(_f, _a...) \
- printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVVLOG(_f, _a...) ((void)0)
-#endif
-
-#if FULLSHADOW_DEBUG
-#define FSH_LOG(_f, _a...) \
- printk("DOM%uP%u: FSH_LOG(%d): " _f "\n", \
- current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define FSH_LOG(_f, _a...) ((void)0)
-#endif
-
-
-/************************************************************************/
-
-static inline int
-shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
- l1_pgentry_t nl1e;
- int res;
- unsigned long mfn;
- struct domain *owner;
-
- ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
-
- if ( !shadow_mode_refcounts(d) )
- return 1;
-
- nl1e = l1e;
- l1e_remove_flags(nl1e, _PAGE_GLOBAL);
-
- if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
- return 0;
-
- res = get_page_from_l1e(nl1e, d);
-
- if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) &&
- !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) &&
- (mfn = l1e_get_pfn(nl1e)) &&
- mfn_valid(mfn) &&
- (owner = page_get_owner(mfn_to_page(mfn))) &&
- (d != owner) )
- {
- res = get_page_from_l1e(nl1e, owner);
- printk("tried to map mfn %lx from domain %d into shadow page tables "
- "of domain %d; %s\n",
- mfn, owner->domain_id, d->domain_id,
- res ? "success" : "failed");
- }
-
- if ( unlikely(!res) )
- {
- perfc_incrc(shadow_get_page_fail);
- FSH_LOG("%s failed to get ref l1e=%" PRIpte "\n",
- __func__, l1e_get_intpte(l1e));
- }
-
- return res;
-}
-
-static inline void
-shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
-
- put_page_from_l1e(l1e, d);
-}
-
-static inline void
-shadow_put_page_type(struct domain *d, struct page_info *page)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
-
- put_page_type(page);
-}
-
-static inline int shadow_get_page(struct domain *d,
- struct page_info *page,
- struct domain *owner)
-{
- if ( !shadow_mode_refcounts(d) )
- return 1;
- return get_page(page, owner);
-}
-
-static inline void shadow_put_page(struct domain *d,
- struct page_info *page)
-{
- if ( !shadow_mode_refcounts(d) )
- return;
- put_page(page);
-}
-
-/************************************************************************/
-
-static inline void __mark_dirty(struct domain *d, unsigned long mfn)
-{
- unsigned long pfn;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- if ( likely(!shadow_mode_log_dirty(d)) || !VALID_MFN(mfn) )
- return;
-
- ASSERT(d->arch.shadow_dirty_bitmap != NULL);
-
- /* We /really/ mean PFN here, even for non-translated guests. */
- pfn = get_gpfn_from_mfn(mfn);
-
- /*
- * Values with the MSB set denote MFNs that aren't really part of the
- * domain's pseudo-physical memory map (e.g., the shared info frame).
- * Nothing to do here...
- */
- if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) )
- return;
-
- /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
- if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) &&
- !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
- {
- d->arch.shadow_dirty_count++;
- }
-#ifndef NDEBUG
- else if ( mfn_valid(mfn) )
- {
- SH_VLOG("mark_dirty OOR! mfn=%lx pfn=%lx max=%x (dom %p)",
- mfn, pfn, d->arch.shadow_dirty_bitmap_size, d);
- SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info,
- page_get_owner(mfn_to_page(mfn)),
- mfn_to_page(mfn)->count_info,
- mfn_to_page(mfn)->u.inuse.type_info );
- }
-#endif
-}
-
-
-static inline void mark_dirty(struct domain *d, unsigned int mfn)
-{
- if ( unlikely(shadow_mode_log_dirty(d)) )
- {
- shadow_lock(d);
- __mark_dirty(d, mfn);
- shadow_unlock(d);
- }
-}
-
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void
-__shadow_get_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
- ASSERT(shadow_mode_enabled(v->domain));
-
- *psl2e = v->arch.shadow_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__shadow_set_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
- ASSERT(shadow_mode_enabled(v->domain));
-
- v->arch.shadow_vtable[l2_table_offset(va)] = value;
-}
-
-static inline void
-__guest_get_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t *pl2e)
-{
- *pl2e = v->arch.guest_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__guest_set_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
- struct domain *d = v->domain;
-
- v->arch.guest_vtable[l2_table_offset(va)] = value;
-
- if ( unlikely(shadow_mode_translate(d)) )
- update_hl2e(v, va);
-
- __mark_dirty(d, pagetable_get_pfn(v->arch.guest_table));
-}
-
-static inline void
-__direct_get_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
- l2_pgentry_t *phys_vtable;
-
- ASSERT(shadow_mode_enabled(v->domain));
-
- phys_vtable = map_domain_page(
- pagetable_get_pfn(v->domain->arch.phys_table));
-
- *psl2e = phys_vtable[l2_table_offset(va)];
-
- unmap_domain_page(phys_vtable);
-}
-
-static inline void
-__direct_set_l2e(
- struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
- l2_pgentry_t *phys_vtable;
-
- ASSERT(shadow_mode_enabled(v->domain));
-
- phys_vtable = map_domain_page(
- pagetable_get_pfn(v->domain->arch.phys_table));
-
- phys_vtable[l2_table_offset(va)] = value;
-
- unmap_domain_page(phys_vtable);
-}
-
-static inline void
-update_hl2e(struct vcpu *v, unsigned long va)
-{
- int index = l2_table_offset(va);
- unsigned long mfn;
- l2_pgentry_t gl2e = v->arch.guest_vtable[index];
- l1_pgentry_t old_hl2e, new_hl2e;
- int need_flush = 0;
-
- ASSERT(shadow_mode_translate(v->domain));
-
- old_hl2e = v->arch.hl2_vtable[index];
-
- if ( (l2e_get_flags(gl2e) & _PAGE_PRESENT) &&
- VALID_MFN(mfn = get_mfn_from_gpfn(l2e_get_pfn(gl2e))) )
- new_hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
- else
- new_hl2e = l1e_empty();
-
- // only do the ref counting if something has changed.
- //
- if ( (l1e_has_changed(old_hl2e, new_hl2e, PAGE_FLAG_MASK)) )
- {
- if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
- !shadow_get_page(v->domain, mfn_to_page(l1e_get_pfn(new_hl2e)),
- v->domain) )
- new_hl2e = l1e_empty();
- if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
- {
- shadow_put_page(v->domain, mfn_to_page(l1e_get_pfn(old_hl2e)));
- need_flush = 1;
- }
-
- v->arch.hl2_vtable[l2_table_offset(va)] = new_hl2e;
-
- if ( need_flush )
- {
- perfc_incrc(update_hl2e_invlpg);
- flush_tlb_one_mask(v->domain->domain_dirty_cpumask,
- &linear_pg_table[l1_linear_offset(va)]);
- }
- }
-}
-
-static inline void shadow_drop_references(
- struct domain *d, struct page_info *page)
-{
- if ( likely(!shadow_mode_refcounts(d)) ||
- ((page->u.inuse.type_info & PGT_count_mask) == 0) )
- return;
-
- /* XXX This needs more thought... */
- printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n",
- __func__, page_to_mfn(page));
- printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
- page->count_info, page->u.inuse.type_info);
-
- shadow_lock(d);
- shadow_remove_all_access(d, page_to_mfn(page));
- shadow_unlock(d);
-
- printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
- page->count_info, page->u.inuse.type_info);
-}
-
-/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
-static inline void shadow_sync_and_drop_references(
- struct domain *d, struct page_info *page)
-{
- if ( likely(!shadow_mode_refcounts(d)) )
- return;
-
- if ( page_out_of_sync(page) )
- __shadow_sync_mfn(d, page_to_mfn(page));
-
- shadow_remove_all_access(d, page_to_mfn(page));
-}
-#endif
-
-/************************************************************************/
-
-/*
- * Add another shadow reference to smfn.
- */
-static inline int
-get_shadow_ref(unsigned long smfn)
-{
- u32 x, nx;
-
- ASSERT(mfn_valid(smfn));
-
- x = mfn_to_page(smfn)->count_info;
- nx = x + 1;
-
- if ( unlikely(nx == 0) )
- {
- printk("get_shadow_ref overflow, gmfn=%" PRtype_info " smfn=%lx\n",
- mfn_to_page(smfn)->u.inuse.type_info & PGT_mfn_mask,
- smfn);
- BUG();
- }
-
- // Guarded by the shadow lock...
- //
- mfn_to_page(smfn)->count_info = nx;
-
- return 1;
-}
-
-/*
- * Drop a shadow reference to smfn.
- */
-static inline void
-put_shadow_ref(unsigned long smfn)
-{
- u32 x, nx;
-
- ASSERT(mfn_valid(smfn));
-
- x = mfn_to_page(smfn)->count_info;
- nx = x - 1;
-
- if ( unlikely(x == 0) )
- {
- printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%"
- PRtype_info "\n",
- smfn,
- mfn_to_page(smfn)->count_info,
- mfn_to_page(smfn)->u.inuse.type_info);
- BUG();
- }
-
- // Guarded by the shadow lock...
- //
- mfn_to_page(smfn)->count_info = nx;
-
- if ( unlikely(nx == 0) )
- {
- free_shadow_page(smfn);
- }
-}
-
-static inline void
-shadow_pin(unsigned long smfn)
-{
- ASSERT( !(mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
- mfn_to_page(smfn)->u.inuse.type_info |= PGT_pinned;
- if ( unlikely(!get_shadow_ref(smfn)) )
- BUG();
-}
-
-static inline void
-shadow_unpin(unsigned long smfn)
-{
- ASSERT( (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
- mfn_to_page(smfn)->u.inuse.type_info &= ~PGT_pinned;
- put_shadow_ref(smfn);
-}
-
-/*
- * SMP issue. The following code assumes the shadow lock is held. Re-visit
- * when working on finer-gained locks for shadow.
- */
-static inline void set_guest_back_ptr(
- struct domain *d, l1_pgentry_t spte,
- unsigned long smfn, unsigned int index)
-{
- struct page_info *gpage;
-
- ASSERT(shadow_lock_is_acquired(d));
-
- if ( !shadow_mode_external(d) ||
- ((l1e_get_flags(spte) & (_PAGE_PRESENT|_PAGE_RW)) !=
- (_PAGE_PRESENT|_PAGE_RW)) )
- return;
-
- gpage = l1e_get_page(spte);
-
- ASSERT(smfn != 0);
- ASSERT(page_to_mfn(gpage) != 0);
-
- gpage->tlbflush_timestamp = smfn;
- gpage->u.inuse.type_info &= ~PGT_va_mask;
- gpage->u.inuse.type_info |= (unsigned long)index << PGT_va_shift;
-}
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_mark_va_out_of_sync(
- struct vcpu *v, unsigned long gpfn, unsigned long mfn,
- unsigned long va);
-
-static inline int l1pte_write_fault(
- struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
- unsigned long va)
-{
- struct domain *d = v->domain;
- l1_pgentry_t gpte = *gpte_p;
- l1_pgentry_t spte;
- unsigned long gpfn = l1e_get_pfn(gpte);
- unsigned long gmfn = gmfn_to_mfn(d, gpfn);
-
- //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
-
- if ( unlikely(!VALID_MFN(gmfn)) )
- {
- SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
- *spte_p = l1e_empty();
- return 0;
- }
-
- ASSERT(l1e_get_flags(gpte) & _PAGE_RW);
- l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
- spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
- SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
- l1e_get_intpte(spte), l1e_get_intpte(gpte));
-
- __mark_dirty(d, gmfn);
-
- if ( mfn_is_page_table(gmfn) )
- shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
-
- *gpte_p = gpte;
- *spte_p = spte;
-
- return 1;
-}
-
-static inline int l1pte_read_fault(
- struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
-{
- l1_pgentry_t gpte = *gpte_p;
- l1_pgentry_t spte = *spte_p;
- unsigned long pfn = l1e_get_pfn(gpte);
- unsigned long mfn = gmfn_to_mfn(d, pfn);
-
- if ( unlikely(!VALID_MFN(mfn)) )
- {
- SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
- *spte_p = l1e_empty();
- return 0;
- }
-
- l1e_add_flags(gpte, _PAGE_ACCESSED);
- spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
- if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) ||
- mfn_is_page_table(mfn) )
- {
- l1e_remove_flags(spte, _PAGE_RW);
- }
-
- SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
- l1e_get_intpte(spte), l1e_get_intpte(gpte));
- *gpte_p = gpte;
- *spte_p = spte;
-
- return 1;
-}
-#endif
-
-static inline void l1pte_propagate_from_guest(
- struct domain *d, guest_l1_pgentry_t gpte, l1_pgentry_t *spte_p)
-{
- unsigned long mfn;
- l1_pgentry_t spte;
-
- spte = l1e_empty();
-
- if ( ((guest_l1e_get_flags(gpte) & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
- (_PAGE_PRESENT|_PAGE_ACCESSED)) &&
- VALID_MFN(mfn = gmfn_to_mfn(d, l1e_get_pfn(gpte))) )
- {
- spte = l1e_from_pfn(
- mfn, guest_l1e_get_flags(gpte) & ~(_PAGE_GLOBAL | _PAGE_AVAIL));
-
- if ( shadow_mode_log_dirty(d) ||
- !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
- mfn_is_page_table(mfn) )
- {
- l1e_remove_flags(spte, _PAGE_RW);
- }
- }
-
- if ( l1e_get_intpte(spte) || l1e_get_intpte(gpte) )
- SH_VVVLOG("%s: gpte=%" PRIpte ", new spte=%" PRIpte,
- __func__, l1e_get_intpte(gpte), l1e_get_intpte(spte));
-
- *spte_p = spte;
-}
-
-static inline void hl2e_propagate_from_guest(
- struct domain *d, l2_pgentry_t gpde, l1_pgentry_t *hl2e_p)
-{
- unsigned long pfn = l2e_get_pfn(gpde);
- unsigned long mfn;
- l1_pgentry_t hl2e;
-
- hl2e = l1e_empty();
-
- if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
- {
- mfn = gmfn_to_mfn(d, pfn);
- if ( VALID_MFN(mfn) && mfn_valid(mfn) )
- hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
- }
-
- if ( l1e_get_intpte(hl2e) || l2e_get_intpte(gpde) )
- SH_VVLOG("%s: gpde=%" PRIpte " hl2e=%" PRIpte, __func__,
- l2e_get_intpte(gpde), l1e_get_intpte(hl2e));
-
- *hl2e_p = hl2e;
-}
-
-static inline void l2pde_general(
- struct domain *d,
- guest_l2_pgentry_t *gpde_p,
- l2_pgentry_t *spde_p,
- unsigned long sl1mfn)
-{
- guest_l2_pgentry_t gpde = *gpde_p;
- l2_pgentry_t spde;
-
- spde = l2e_empty();
- if ( (guest_l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) )
- {
- spde = l2e_from_pfn(
- sl1mfn,
- (guest_l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL);
-
- /* N.B. PDEs do not have a dirty bit. */
- guest_l2e_add_flags(gpde, _PAGE_ACCESSED);
-
- *gpde_p = gpde;
- }
-
- if ( l2e_get_intpte(spde) || l2e_get_intpte(gpde) )
- SH_VVLOG("%s: gpde=%" PRIpte ", new spde=%" PRIpte, __func__,
- l2e_get_intpte(gpde), l2e_get_intpte(spde));
-
- *spde_p = spde;
-}
-
-static inline void l2pde_propagate_from_guest(
- struct domain *d, guest_l2_pgentry_t *gpde_p, l2_pgentry_t *spde_p)
-{
- guest_l2_pgentry_t gpde = *gpde_p;
- unsigned long sl1mfn = 0;
-
- if ( guest_l2e_get_flags(gpde) & _PAGE_PRESENT )
- sl1mfn = __shadow_status(d, l2e_get_pfn(gpde), PGT_l1_shadow);
- l2pde_general(d, gpde_p, spde_p, sl1mfn);
-}
-
-/************************************************************************/
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pte_change(
- struct domain *d,
- guest_l1_pgentry_t new_pte,
- l1_pgentry_t *shadow_pte_p)
-{
- l1_pgentry_t old_spte, new_spte;
- int need_flush = 0;
-
- perfc_incrc(validate_pte_calls);
-
- l1pte_propagate_from_guest(d, new_pte, &new_spte);
-
- if ( shadow_mode_refcounts(d) )
- {
- old_spte = *shadow_pte_p;
-
- if ( l1e_get_intpte(old_spte) == l1e_get_intpte(new_spte) )
- {
- // No accounting required...
- //
- perfc_incrc(validate_pte_changes1);
- }
- else if ( l1e_get_intpte(old_spte) == (l1e_get_intpte(new_spte)|_PAGE_RW) )
- {
- // Fast path for PTEs that have merely been write-protected
- // (e.g., during a Unix fork()). A strict reduction in privilege.
- //
- perfc_incrc(validate_pte_changes2);
- if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) )
- shadow_put_page_type(d, mfn_to_page(l1e_get_pfn(new_spte)));
- }
- else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) &
- _PAGE_PRESENT ) &&
- l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
- {
- // only do the ref counting if something important changed.
- //
- perfc_incrc(validate_pte_changes3);
-
- if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
- {
- shadow_put_page_from_l1e(old_spte, d);
- need_flush = 1;
- }
- if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(new_spte, d) ) {
- new_spte = l1e_empty();
- need_flush = -1; /* need to unshadow the page */
- }
- }
- else
- {
- perfc_incrc(validate_pte_changes4);
- }
- }
-
- *shadow_pte_p = new_spte;
-
- return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_hl2e_change(
- struct domain *d,
- l2_pgentry_t new_gpde,
- l1_pgentry_t *shadow_hl2e_p)
-{
- l1_pgentry_t old_hl2e, new_hl2e;
- int need_flush = 0;
-
- perfc_incrc(validate_hl2e_calls);
-
- old_hl2e = *shadow_hl2e_p;
- hl2e_propagate_from_guest(d, new_gpde, &new_hl2e);
-
- // Only do the ref counting if something important changed.
- //
- if ( ((l1e_get_flags(old_hl2e) | l1e_get_flags(new_hl2e)) & _PAGE_PRESENT) &&
- l1e_has_changed(old_hl2e, new_hl2e, _PAGE_PRESENT) )
- {
- perfc_incrc(validate_hl2e_changes);
-
- if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
- !get_page(mfn_to_page(l1e_get_pfn(new_hl2e)), d) )
- new_hl2e = l1e_empty();
- if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
- {
- put_page(mfn_to_page(l1e_get_pfn(old_hl2e)));
- need_flush = 1;
- }
- }
-
- *shadow_hl2e_p = new_hl2e;
-
- return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pde_change(
- struct domain *d,
- guest_l2_pgentry_t new_gpde,
- l2_pgentry_t *shadow_pde_p)
-{
- l2_pgentry_t old_spde, new_spde;
- int need_flush = 0;
-
- perfc_incrc(validate_pde_calls);
-
- old_spde = *shadow_pde_p;
- l2pde_propagate_from_guest(d, &new_gpde, &new_spde);
-
- // Only do the ref counting if something important changed.
- //
- if ( ((l2e_get_intpte(old_spde) | l2e_get_intpte(new_spde)) & _PAGE_PRESENT) &&
- l2e_has_changed(old_spde, new_spde, _PAGE_PRESENT) )
- {
- perfc_incrc(validate_pde_changes);
-
- if ( (l2e_get_flags(new_spde) & _PAGE_PRESENT) &&
- !get_shadow_ref(l2e_get_pfn(new_spde)) )
- BUG();
- if ( l2e_get_flags(old_spde) & _PAGE_PRESENT )
- {
- put_shadow_ref(l2e_get_pfn(old_spde));
- need_flush = 1;
- }
- }
-
- *shadow_pde_p = new_spde;
-
- return need_flush;
-}
-
-/*********************************************************************/
-
-#if SHADOW_HASH_DEBUG
-
-static void shadow_audit(struct domain *d, int print)
-{
- int live = 0, free = 0, j = 0, abs;
- struct shadow_status *a;
-
- for ( j = 0; j < shadow_ht_buckets; j++ )
- {
- a = &d->arch.shadow_ht[j];
- if ( a->gpfn_and_flags )
- {
- live++;
- ASSERT(a->smfn);
- }
- else
- ASSERT(!a->next);
-
- a = a->next;
- while ( a && (live < 9999) )
- {
- live++;
- if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
- {
- printk("XXX live=%d gpfn+flags=%lx sp=%lx next=%p\n",
- live, a->gpfn_and_flags, a->smfn, a->next);
- BUG();
- }
- ASSERT(a->smfn);
- a = a->next;
- }
- ASSERT(live < 9999);
- }
-
- for ( a = d->arch.shadow_ht_free; a != NULL; a = a->next )
- free++;
-
- if ( print )
- printk("Xlive=%d free=%d\n", live, free);
-
- // BUG: this only works if there's only a single domain which is
- // using shadow tables.
- //
- abs = (
- perfc_value(shadow_l1_pages) +
- perfc_value(shadow_l2_pages) +
- perfc_value(hl2_table_pages) +
- perfc_value(snapshot_pages) +
- perfc_value(writable_pte_predictions)
- ) - live;
-#ifdef PERF_COUNTERS
- if ( (abs < -1) || (abs > 1) )
- {
- printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d writable_ptes=%d\n",
- live, free,
- perfc_value(shadow_l1_pages),
- perfc_value(shadow_l2_pages),
- perfc_value(hl2_table_pages),
- perfc_value(snapshot_pages),
- perfc_value(writable_pte_predictions));
- BUG();
- }
-#endif
-
- // XXX ought to add some code to audit the out-of-sync entries, too.
- //
-}
-#else
-#define shadow_audit(p, print) ((void)0)
-#endif
-
-
-static inline struct shadow_status *hash_bucket(
- struct domain *d, unsigned int gpfn)
-{
- return &d->arch.shadow_ht[gpfn % shadow_ht_buckets];
-}
-
-
-/*
- * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
- * which, depending on full shadow mode, may or may not equal
- * its mfn).
- * It returns the shadow's mfn, or zero if it doesn't exist.
- */
-static inline unsigned long __shadow_status(
- struct domain *d, unsigned long gpfn, unsigned long stype)
-{
- struct shadow_status *p, *x, *head;
- shadow_key_t key;
-#if CONFIG_PAGING_LEVELS >= 3
- if ( d->arch.ops->guest_paging_levels == PAGING_L3 && stype == PGT_l4_shadow )
- key = gpfn | stype | index_to_key(get_cr3_idxval(current));
- else
-#endif
- key = gpfn | stype;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(gpfn == (gpfn & PGT_mfn_mask));
- ASSERT(stype && !(stype & ~PGT_type_mask));
-
- perfc_incrc(shadow_status_calls);
-
- x = head = hash_bucket(d, gpfn);
- p = NULL;
-
- shadow_audit(d, 0);
-
- do
- {
- ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
-
- if ( x->gpfn_and_flags == key )
- {
-#if SHADOW_DEBUG
- if ( unlikely(shadow_status_noswap) )
- return x->smfn;
-#endif
- /* Pull-to-front if 'x' isn't already the head item. */
- if ( unlikely(x != head) )
- {
- /* Delete 'x' from list and reinsert immediately after head. */
- p->next = x->next;
- x->next = head->next;
- head->next = x;
-
- /* Swap 'x' contents with head contents. */
- SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
- SWAP(head->smfn, x->smfn);
- }
- else
- {
- perfc_incrc(shadow_status_hit_head);
- }
-
- return head->smfn;
- }
-
- p = x;
- x = x->next;
- }
- while ( x != NULL );
-
- perfc_incrc(shadow_status_miss);
- return 0;
-}
-
-/*
- * Not clear if pull-to-front is worthwhile for this or not,
- * as it generally needs to scan the entire bucket anyway.
- * Much simpler without.
- *
- * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
- */
-static inline u32
-shadow_max_pgtable_type(struct domain *d, unsigned long gpfn,
- unsigned long *smfn)
-{
- struct shadow_status *x;
- u32 pttype = PGT_none, type;
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-
- perfc_incrc(shadow_max_type);
-
- x = hash_bucket(d, gpfn);
-
- while ( x && x->gpfn_and_flags )
- {
- if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
- {
- type = x->gpfn_and_flags & PGT_type_mask;
-
- switch ( type )
- {
- case PGT_hl2_shadow:
- // Treat an HL2 as if it's an L1
- //
- type = PGT_l1_shadow;
- break;
- case PGT_snapshot:
- case PGT_writable_pred:
- // Ignore snapshots -- they don't in and of themselves constitute
- // treating a page as a page table
- //
- goto next;
- case PGT_base_page_table:
- // Early exit if we found the max possible value
- //
- return type;
- default:
- break;
- }
-
- if ( type > pttype )
- {
- pttype = type;
- if ( smfn )
- *smfn = x->smfn;
- }
- }
- next:
- x = x->next;
- }
-
- return pttype;
-}
-
-static inline void delete_shadow_status(
- struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned int stype, u64 index)
-{
- struct shadow_status *p, *x, *n, *head;
-
- shadow_key_t key = gpfn | stype | index_to_key(index);
-
- ASSERT(shadow_lock_is_acquired(d));
- ASSERT(!(gpfn & ~PGT_mfn_mask));
- ASSERT(stype && !(stype & ~PGT_type_mask));
-
- head = hash_bucket(d, gpfn);
-
- SH_VLOG("delete gpfn=%lx t=%08x bucket=%p", gpfn, stype, head);
- shadow_audit(d, 0);
-
- /* Match on head item? */
- if ( head->gpfn_and_flags == key )
- {
- if ( (n = head->next) != NULL )
- {
- /* Overwrite head with contents of following node. */
- head->gpfn_and_flags = n->gpfn_and_flags;
- head->smfn = n->smfn;
-
- /* Delete following node. */
- head->next = n->next;
-
- /* Add deleted node to the free list. */
- n->gpfn_and_flags = 0;
- n->smfn = 0;
- n->next = d->arch.shadow_ht_free;
- d->arch.shadow_ht_free = n;
- }
- else
- {
- /* This bucket is now empty. Initialise the head node. */
- head->gpfn_and_flags = 0;
- head->smfn = 0;
- }
-
- goto found;
- }
-
- p = head;
- x = head->next;
-
- do
- {
- if ( x->gpfn_and_flags == key )
- {
- /* Delete matching node. */
- p->next = x->next;
-
- /* Add deleted node to the free list. */
- x->gpfn_and_flags = 0;
- x->smfn = 0;
- x->next = d->arch.shadow_ht_free;
- d->arch.shadow_ht_free = x;
-
- goto found;
- }
-
- p = x;
- x = x->next;
- }
- while ( x != NULL );
-
- /* If we got here, it wasn't in the list! */
- BUG();
-
- found:
- // release ref to page
- if ( stype != PGT_writable_pred )
- put_page(mfn_to_page(gmfn));
-
- shadow_audit(d, 0);
-}
-
-static inline void set_shadow_status(
- struct domain *d, unsigned long gpfn, unsigned long gmfn,
- unsigned long smfn, unsigned long stype, u64 index)
-{
- struct shadow_status *x, *head, *extra;
- int i;
-
- shadow_key_t key = gpfn | stype | index_to_key(index);
-
- SH_VVLOG("set gpfn=%lx gmfn=%lx smfn=%lx t=%lx", gpfn, gmfn, smfn, stype);
-
- ASSERT(shadow_lock_is_acquired(d));
-
- ASSERT(shadow_mode_translate(d) || gpfn);
- ASSERT(!(gpfn & ~PGT_mfn_mask));
-
- // XXX - need to be more graceful.
- ASSERT(VALID_MFN(gmfn));
-
- ASSERT(stype && !(stype & ~PGT_type_mask));
-
- x = head = hash_bucket(d, gpfn);
-
- SH_VLOG("set gpfn=%lx smfn=%lx t=%lx bucket=%p(%p)",
- gpfn, smfn, stype, x, x->next);
- shadow_audit(d, 0);
-
- // grab a reference to the guest page to represent the entry in the shadow
- // hash table
- //
- // XXX - Should PGT_writable_pred grab a page ref?
- // - Who/how are these hash table entry refs flushed if/when a page
- // is given away by the domain?
- //
- if ( stype != PGT_writable_pred )
- get_page(mfn_to_page(gmfn), d);
-
- /*
- * STEP 1. If page is already in the table, update it in place.
- */
- do
- {
- if ( unlikely(x->gpfn_and_flags == key) )
- {
- if ( stype != PGT_writable_pred )
- BUG(); // we should never replace entries into the hash table
- x->smfn = smfn;
- if ( stype != PGT_writable_pred )
- put_page(mfn_to_page(gmfn)); // already had a ref...
- goto done;
- }
-
- x = x->next;
- }
- while ( x != NULL );
-
- /*
- * STEP 2. The page must be inserted into the table.
- */
-
- /* If the bucket is empty then insert the new page as the head item. */
- if ( head->gpfn_and_flags == 0 )
- {
- head->gpfn_and_flags = key;
- head->smfn = smfn;
- ASSERT(head->next == NULL);
- goto done;
- }
-
- /* We need to allocate a new node. Ensure the quicklist is non-empty. */
- if ( unlikely(d->arch.shadow_ht_free == NULL) )
- {
- SH_VLOG("Allocate more shadow hashtable blocks.");
-
- extra = xmalloc_bytes(
- sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
- /* XXX Should be more graceful here. */
- if ( extra == NULL )
- BUG();
-
- memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
- /* Record the allocation block so it can be correctly freed later. */
- d->arch.shadow_extras_count++;
- *((struct shadow_status **)&extra[shadow_ht_extra_size]) =
- d->arch.shadow_ht_extras;
- d->arch.shadow_ht_extras = &extra[0];
-
- /* Thread a free chain through the newly-allocated nodes. */
- for ( i = 0; i < (shadow_ht_extra_size - 1); i++ )
- extra[i].next = &extra[i+1];
- extra[i].next = NULL;
-
- /* Add the new nodes to the free list. */
- d->arch.shadow_ht_free = &extra[0];
- }
-
- /* Allocate a new node from the quicklist. */
- x = d->arch.shadow_ht_free;
- d->arch.shadow_ht_free = x->next;
-
- /* Initialise the new node and insert directly after the head item. */
- x->gpfn_and_flags = key;
- x->smfn = smfn;
- x->next = head->next;
- head->next = x;
-
- done:
- shadow_audit(d, 0);
-
- if ( stype <= PGT_l4_shadow )
- {
- // add to front of list of pages to check when removing write
- // permissions for a page...
- //
- }
-}
-
-/************************************************************************/
-
-static inline void guest_physmap_add_page(
- struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
- struct domain_mmap_cache c1, c2;
-
- if ( likely(!shadow_mode_translate(d)) )
- return;
-
- domain_mmap_cache_init(&c1);
- domain_mmap_cache_init(&c2);
- shadow_lock(d);
- shadow_sync_and_drop_references(d, mfn_to_page(mfn));
- set_p2m_entry(d, gpfn, mfn, &c1, &c2);
- set_gpfn_from_mfn(mfn, gpfn);
- shadow_unlock(d);
- domain_mmap_cache_destroy(&c1);
- domain_mmap_cache_destroy(&c2);
-}
-
-static inline void guest_physmap_remove_page(
- struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
- struct domain_mmap_cache c1, c2;
- unsigned long type;
-
- if ( likely(!shadow_mode_translate(d)) )
- return;
-
- domain_mmap_cache_init(&c1);
- domain_mmap_cache_init(&c2);
- shadow_lock(d);
- shadow_sync_and_drop_references(d, mfn_to_page(mfn));
- while ( (type = shadow_max_pgtable_type(d, gpfn, NULL)) != PGT_none )
- free_shadow_page(__shadow_status(d, gpfn, type));
- set_p2m_entry(d, gpfn, -1, &c1, &c2);
- set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
- shadow_unlock(d);
- domain_mmap_cache_destroy(&c1);
- domain_mmap_cache_destroy(&c2);
-}
-
-/************************************************************************/
-
-void static inline
-shadow_update_min_max(unsigned long smfn, int index)
-{
- struct page_info *sl1page = mfn_to_page(smfn);
- u32 min_max = sl1page->tlbflush_timestamp;
- int min = SHADOW_MIN(min_max);
- int max = SHADOW_MAX(min_max);
- int update = 0;
-
- if ( index < min )
- {
- min = index;
- update = 1;
- }
- if ( index > max )
- {
- max = index;
- update = 1;
- }
- if ( update )
- sl1page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max);
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_map_l1_into_current_l2(unsigned long va);
-
-void static inline
-shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- l2_pgentry_t sl2e = {0};
-
- __shadow_get_l2e(v, va, &sl2e);
- if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
- {
- /*
- * Either the L1 is not shadowed, or the shadow isn't linked into
- * the current shadow L2.
- */
- if ( create_l1_shadow )
- {
- perfc_incrc(shadow_set_l1e_force_map);
- shadow_map_l1_into_current_l2(va);
- }
- else /* check to see if it exists; if so, link it in */
- {
- l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)];
- unsigned long gl1pfn = l2e_get_pfn(gpde);
- unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
-
- ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT );
-
- if ( sl1mfn )
- {
- perfc_incrc(shadow_set_l1e_unlinked);
- if ( !get_shadow_ref(sl1mfn) )
- BUG();
- l2pde_general(d, &gpde, &sl2e, sl1mfn);
- __guest_set_l2e(v, va, gpde);
- __shadow_set_l2e(v, va, sl2e);
- }
- else
- {
- // no shadow exists, so there's nothing to do.
- perfc_incrc(shadow_set_l1e_fail);
- return;
- }
- }
- }
-
- __shadow_get_l2e(v, va, &sl2e);
-
- if ( shadow_mode_refcounts(d) )
- {
- l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)];
-
- // only do the ref counting if something important changed.
- //
- if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
- {
- if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
- !shadow_get_page_from_l1e(new_spte, d) )
- new_spte = l1e_empty();
- if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
- shadow_put_page_from_l1e(old_spte, d);
- }
-
- }
-
- set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
- shadow_linear_pg_table[l1_linear_offset(va)] = new_spte;
- shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
-}
-#endif
-/************************************************************************/
-
-static inline int
-shadow_mode_page_writable(unsigned long va, struct cpu_user_regs *regs, unsigned long gpfn)
-{
- struct vcpu *v = current;
- struct domain *d = v->domain;
- unsigned long mfn = gmfn_to_mfn(d, gpfn);
- u32 type = mfn_to_page(mfn)->u.inuse.type_info & PGT_type_mask;
-
- if ( shadow_mode_refcounts(d) &&
- (type == PGT_writable_page) )
- type = shadow_max_pgtable_type(d, gpfn, NULL);
-
- // Strange but true: writable page tables allow kernel-mode access
- // to L1 page table pages via write-protected PTEs... Similarly, write
- // access to all page table pages is granted for shadow_mode_write_all
- // clients.
- //
- if ( ((shadow_mode_write_l1(d) && (type == PGT_l1_page_table)) ||
- (shadow_mode_write_all(d) && type && (type <= PGT_l4_page_table))) &&
- ((va < HYPERVISOR_VIRT_START)
-#if defined(__x86_64__)
- || (va >= HYPERVISOR_VIRT_END)
-#endif
- ) &&
- guest_kernel_mode(v, regs) )
- return 1;
-
- return 0;
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline l1_pgentry_t gva_to_gpte(unsigned long gva)
-{
- l2_pgentry_t gpde;
- l1_pgentry_t gpte;
- struct vcpu *v = current;
-
- ASSERT( shadow_mode_translate(current->domain) );
-
- __guest_get_l2e(v, gva, &gpde);
- if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
-        return l1e_empty();
-
- // This is actually overkill - we only need to make sure the hl2
- // is in-sync.
- //
- shadow_sync_va(v, gva);
-
- if ( unlikely(__copy_from_user(&gpte,
- &linear_pg_table[gva >> PAGE_SHIFT],
- sizeof(gpte))) )
- {
- FSH_LOG("gva_to_gpte got a fault on gva=%lx", gva);
- return l1e_empty();
- }
-
- return gpte;
-}
-
-static inline unsigned long gva_to_gpa(unsigned long gva)
-{
- l1_pgentry_t gpte;
-
- gpte = gva_to_gpte(gva);
- if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
- return 0;
-
- return l1e_get_paddr(gpte) + (gva & ~PAGE_MASK);
-}
-#endif
-
-static inline unsigned long gva_to_mfn(unsigned long gva)
-{
- unsigned long gpa = gva_to_gpa(gva);
- return get_mfn_from_gpfn(gpa >> PAGE_SHIFT);
-}
-
-/************************************************************************/
-
-extern void __update_pagetables(struct vcpu *v);
-static inline void update_pagetables(struct vcpu *v)
-{
- struct domain *d = v->domain;
- int paging_enabled;
-
- if ( hvm_guest(v) )
- paging_enabled = hvm_paging_enabled(v);
- else
- // HACK ALERT: there's currently no easy way to figure out if a domU
- // has set its arch.guest_table to zero, vs not yet initialized it.
- //
- paging_enabled = !!pagetable_get_paddr(v->arch.guest_table);
-
- /*
- * We don't call __update_pagetables() when hvm guest paging is
- * disabled as we want the linear_pg_table to be inaccessible so that
-     * we bail out of shadow_fault() early if the hvm guest tries illegal
- * accesses while it thinks paging is turned off.
- */
- if ( unlikely(shadow_mode_enabled(d)) && paging_enabled )
- {
- shadow_lock(d);
- __update_pagetables(v);
- shadow_unlock(d);
- }
-
- if ( likely(!shadow_mode_external(d)) )
- {
- if ( shadow_mode_enabled(d) )
- v->arch.monitor_table = v->arch.shadow_table;
- else
-#if CONFIG_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) )
- v->arch.monitor_table = v->arch.guest_table_user;
- else
-#endif
- v->arch.monitor_table = v->arch.guest_table;
- }
-}
-
-void clear_all_shadow_status(struct domain *d);
-
-#if SHADOW_DEBUG
-extern int _check_pagetable(struct vcpu *v, char *s);
-extern int _check_all_pagetables(struct vcpu *v, char *s);
-
-#define check_pagetable(_v, _s) _check_pagetable(_v, _s)
-//#define check_pagetable(_v, _s) _check_all_pagetables(_v, _s)
-
-#else
-#define check_pagetable(_v, _s) ((void)0)
-#endif
-
-#endif /* XEN_SHADOW_H */
+#endif /* _XEN_SHADOW_H */
/*
* Local variables:
diff --git a/xen/include/asm-x86/shadow2-multi.h b/xen/include/asm-x86/shadow2-multi.h
new file mode 100644
index 0000000000..3b23a2f198
--- /dev/null
+++ b/xen/include/asm-x86/shadow2-multi.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+ * arch/x86/shadow2-multi.h
+ *
+ * Shadow2 declarations which will be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+extern int
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size);
+
+extern void
+SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t smfn);
+extern void
+SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t smfn);
+extern void
+SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t smfn);
+extern void
+SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+ struct vcpu *v, mfn_t smfn);
+
+extern void
+SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows, 3, 3)
+ (struct vcpu *v, mfn_t smfn);
+
+extern void
+SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl2mfn);
+extern void
+SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl3mfn);
+extern void
+SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl4mfn);
+
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_write_access, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn);
+
+extern void
+SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, void *ep, mfn_t smfn);
+
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn);
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+int
+SHADOW2_INTERNAL_NAME(sh2_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
+int
+SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
+int
+SHADOW2_INTERNAL_NAME(sh2_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl2mfn, mfn_t x);
+int
+SHADOW2_INTERNAL_NAME(sh2_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl3mfn, mfn_t x);
+int
+SHADOW2_INTERNAL_NAME(sh2_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t sl4mfn, mfn_t x);
+#endif
+
+#if SHADOW_LEVELS == GUEST_LEVELS
+extern mfn_t
+SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v);
+extern void
+SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
+ (struct vcpu *v, mfn_t mmfn);
+#endif
+
+extern struct shadow2_entry_points
+SHADOW2_INTERNAL_NAME(shadow2_entry, SHADOW_LEVELS, GUEST_LEVELS);
diff --git a/xen/include/asm-x86/shadow2-private.h b/xen/include/asm-x86/shadow2-private.h
new file mode 100644
index 0000000000..7b2ac57572
--- /dev/null
+++ b/xen/include/asm-x86/shadow2-private.h
@@ -0,0 +1,612 @@
+/******************************************************************************
+ * arch/x86/shadow2-private.h
+ *
+ * Shadow2 code that is private, and does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _XEN_SHADOW2_PRIVATE_H
+#define _XEN_SHADOW2_PRIVATE_H
+
+// In order to override the definition of mfn_to_page, we make sure page.h has
+// been included...
+#include <asm/page.h>
+#include <xen/domain_page.h>
+#include <asm/x86_emulate.h>
+#include <asm/hvm/support.h>
+
+
+/******************************************************************************
+ * Definitions for the use of the "available" bits in the shadow PTEs.
+ *
+ * Review of the low 12 bits of a shadow page table entry:
+ *
+ * in a guest: in a shadow:
+ * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB
+ * Bit 10: _PAGE_AVAIL1 _PAGE_SHADOW_RW ("SW" below)
+ * Bit 9: _PAGE_AVAIL0 _PAGE_SHADOW_PRESENT ("SP" below)
+ * Bit 8: _PAGE_GLOBAL _PAGE_SHADOW_MMIO ("MMIO" below),
+ * aka _PAGE_SHADOW_GUEST_NOT_PRESENT
+ * Bit 7: _PAGE_PSE, aka _PAGE_PAT
+ * Bit 6: _PAGE_DIRTY
+ * Bit 5: _PAGE_ACCESSED
+ * Bit 4: _PAGE_PCD
+ * Bit 3: _PAGE_PWT
+ * Bit 2: _PAGE_USER
+ * Bit 1: _PAGE_RW ("GW" below)
+ * Bit 0: _PAGE_PRESENT ("GP" below)
+ *
+ * Given a guest entry, as shown below, we can expect the following in the
+ * corresponding shadow entry:
+ *
+ * Guest entry Shadow entry Commentary
+ * ----------- ---------------- ---------------------------------------------
+ * Maps
+ * GP GW IO GP SP GW SW MMIO
+ * -- -- ---- -- -- -- -- ----
+ * - - - 0 0 0 0 0 The guest entry has not yet been shadowed.
+ * 0 - - 0 0 0 0 1 The guest entry is marked not-present.
+ * 1 1 no ? 1 ? 1 0 Writable entry in the guest.
+ * 1 0 no ? 1 0 0 0 Read-only entry in the guest.
+ * 1 1 yes 0 1 ? 1 1 Writable MMIO mapping in the guest.
+ * 1 0 yes 0 1 0 0 1 Read-only MMIO mapping in the guest.
+ *
+ * Normally, we would expect GP=1 in the guest to imply GP=1 in the
+ * shadow, and similarly for GW=1. However, various functionality that may be
+ * implemented via the shadow can cause GP or GW to be cleared in such cases.
+ * A & D bit emulation is a prime example of such functionality.
+ *
+ * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same
+ * entry will always be zero, too.
+ *
+ * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests. It is
+ * currently available for random (ab)use in shadow entries.
+ *
+ * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow,
+ * but currently there is no benefit, as the guest's TLB is flushed on every
+ * transition of CR3 anyway due to the HVM exit/re-entry.
+ *
+ * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used
+ * as the _PAGE_SHADOW_MMIO bit. In such entries, if _PAGE_SHADOW_MMIO is
+ * set, then the entry contains the *gfn* directly from the corresponding
+ * guest entry (not an mfn!!).
+ *
+ * Bit 7 is set in a guest L2 to signify a superpage entry. The current
+ * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the
+ * resulting shadow L1 table is called an FL1. Note that there is no guest
+ * page that corresponds to an FL1.
+ *
+ * Bit 7 in a guest L1 is the PAT2 bit. Currently we do not support PAT in
+ * this shadow code.
+ *
+ * Bit 6 is the dirty bit.
+ *
+ * Bit 5 is the accessed bit.
+ *
+ * Bit 4 is the cache disable bit. If set in a guest, the hardware is
+ * supposed to refuse to cache anything found via this entry. It can be set
+ * in an L4e, L3e, L2e, or L1e. This shadow code currently does not support
+ * cache disable bits. They are silently ignored.
+ *
+ * Bit 4 in a guest L1 is also the PAT1 bit.  Currently we do not support PAT
+ * in this shadow code.
+ *
+ * Bit 3 is the cache write-thru bit. If set in a guest, the hardware is
+ * supposed to use write-thru instead of write-back caching for anything found
+ * via this entry. It can be set in an L4e, L3e, L2e, or L1e. This shadow
+ * code currently does not support cache write-thru bits. They are silently
+ * ignored.
+ *
+ * Bit 3 in a guest L1 is also the PAT0 bit.  Currently we do not support PAT
+ * in this shadow code.
+ *
+ * Bit 2 is the user bit.
+ *
+ * Bit 1 is the read-write bit.
+ *
+ * Bit 0 is the present bit.
+ */
+
+// Copy of the _PAGE_RW bit from the guest's PTE, zeroed where the
+// shadow rules require it.
+#define _PAGE_SHADOW_RW _PAGE_AVAIL1
+
+// Copy of the _PAGE_PRESENT bit from the guest's PTE
+#define _PAGE_SHADOW_PRESENT _PAGE_AVAIL0
+
+// The matching guest entry maps MMIO space
+#define _PAGE_SHADOW_MMIO _PAGE_GLOBAL
+
+// Shadow flags value used when the guest is not present
+#define _PAGE_SHADOW_GUEST_NOT_PRESENT _PAGE_GLOBAL
+
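
To make the flag table above concrete, here is a minimal illustrative sketch of how the available bits of a shadow entry follow from the guest entry; the helper name sh2_example_avail_bits and its arguments are hypothetical and not part of this patch, and only the macros defined just above are used:

    static inline u32 sh2_example_avail_bits(u32 gflags, int maps_mmio)
    {
        u32 sflags;

        /* Guest entry not present: the shadow records just that fact. */
        if ( !(gflags & _PAGE_PRESENT) )
            return _PAGE_SHADOW_GUEST_NOT_PRESENT;

        /* Guest entry present: remember GP, and GW if the guest had it. */
        sflags = _PAGE_SHADOW_PRESENT;
        if ( gflags & _PAGE_RW )
            sflags |= _PAGE_SHADOW_RW;

        /* MMIO mappings also carry the MMIO marker (the entry holds a gfn). */
        if ( maps_mmio )
            sflags |= _PAGE_SHADOW_MMIO;

        return sflags;
    }
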
+
+/******************************************************************************
+ * Debug and error-message output
+ */
+#define SHADOW2_PRINTK(_f, _a...) \
+ debugtrace_printk("sh2: %s(): " _f, __func__, ##_a)
+#define SHADOW2_ERROR(_f, _a...) \
+ printk("sh2 error: %s(): " _f, __func__, ##_a)
+#define SHADOW2_DEBUG(flag, _f, _a...) \
+ do { \
+ if (SHADOW2_DEBUG_ ## flag) \
+ debugtrace_printk("sh2debug: %s(): " _f, __func__, ##_a); \
+ } while (0)
+
+// The flags for use with SHADOW2_DEBUG:
+#define SHADOW2_DEBUG_PROPAGATE 0
+#define SHADOW2_DEBUG_MAKE_SHADOW 0
+#define SHADOW2_DEBUG_DESTROY_SHADOW 0
+#define SHADOW2_DEBUG_P2M 0
+#define SHADOW2_DEBUG_A_AND_D 0
+#define SHADOW2_DEBUG_EMULATE 0
+#define SHADOW2_DEBUG_LOGDIRTY 1
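
For instance, a propagation-path message could be emitted as below (an illustrative call only; gflags and sflags stand in for whatever values the caller has); the statement compiles away to nothing while SHADOW2_DEBUG_PROPAGATE is 0:

    SHADOW2_DEBUG(PROPAGATE, "gl1e flags %x -> sl1e flags %x\n", gflags, sflags);
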
+
+
+/******************************************************************************
+ * Auditing routines
+ */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
+extern void shadow2_audit_tables(struct vcpu *v);
+#else
+#define shadow2_audit_tables(_v) do {} while(0)
+#endif
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
+extern void shadow2_audit_p2m(struct domain *d);
+#else
+#define shadow2_audit_p2m(_d) do {} while(0)
+#endif
+
+
+/******************************************************************************
+ * Mechanism for double-checking the optimized pagefault path: this
+ * structure contains a record of actions taken by the fault handling
+ * code. In paranoid mode, the fast-path code fills out one of these
+ * structures (but doesn't take any actual action) and then the normal
+ * path fills in another. When the fault handler finishes, the
+ * two are compared */
+
+#ifdef SHADOW2_OPTIMIZATION_PARANOIA
+
+typedef struct shadow2_action_log sh2_log_t;
+struct shadow2_action_log {
+ paddr_t ad[CONFIG_PAGING_LEVELS]; /* A & D bits propagated here */
+ paddr_t mmio; /* Address of an mmio operation */
+ int rv; /* Result of the fault handler */
+};
+
+/* There are two logs, one for the fast path, one for the normal path */
+enum sh2_log_type { log_slow = 0, log_fast = 1 };
+
+/* Alloc and zero the logs */
+static inline void sh2_init_log(struct vcpu *v)
+{
+ if ( unlikely(!v->arch.shadow2_action_log) )
+ v->arch.shadow2_action_log = xmalloc_array(sh2_log_t, 2);
+ ASSERT(v->arch.shadow2_action_log);
+ memset(v->arch.shadow2_action_log, 0, 2 * sizeof (sh2_log_t));
+}
+
+/* Log an A&D-bit update */
+static inline void sh2_log_ad(struct vcpu *v, paddr_t e, unsigned int level)
+{
+ v->arch.shadow2_action_log[v->arch.shadow2_action_index].ad[level] = e;
+}
+
+/* Log an MMIO address */
+static inline void sh2_log_mmio(struct vcpu *v, paddr_t m)
+{
+ v->arch.shadow2_action_log[v->arch.shadow2_action_index].mmio = m;
+}
+
+/* Log the result */
+static inline void sh2_log_rv(struct vcpu *v, int rv)
+{
+ v->arch.shadow2_action_log[v->arch.shadow2_action_index].rv = rv;
+}
+
+/* Set which mode we're in */
+static inline void sh2_set_log_mode(struct vcpu *v, enum sh2_log_type t)
+{
+ v->arch.shadow2_action_index = t;
+}
+
+/* Whether to take no action, because we're only checking the mechanism */
+static inline int sh2_take_no_action(struct vcpu *v)
+{
+ return (v->arch.shadow2_action_index == log_fast);
+}
+
+#else /* Non-paranoid mode: these logs do not exist */
+
+#define sh2_init_log(_v) do { (void)(_v); } while(0)
+#define sh2_set_log_mode(_v,_t) do { (void)(_v); } while(0)
+#define sh2_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0)
+#define sh2_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0)
+#define sh2_log_rv(_v,_r) do { (void)(_v),(void)(_r); } while (0)
+#define sh2_take_no_action(_v) (((void)(_v)), 0)
+
+#endif /* SHADOW2_OPTIMIZATION_PARANOIA */
+
+
+/******************************************************************************
+ * Macro for generating the internal names of the
+ * shadow code's external entry points.
+ */
+#define SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \
+ name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels
+#define SHADOW2_INTERNAL_NAME(name, shadow_levels, guest_levels) \
+ SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels)
+
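
To make the token pasting concrete: with SHADOW_LEVELS and GUEST_LEVELS both defined as 3, a use such as

    SHADOW2_INTERNAL_NAME(sh2_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS)

expands to the symbol sh2_audit_l1_table__shadow_3_guest_3. The extra _HIDDEN level of indirection is what forces the level macros to be expanded before ## is applied.
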
+#if CONFIG_PAGING_LEVELS == 2
+#define GUEST_LEVELS 2
+#define SHADOW_LEVELS 2
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 2 */
+
+#if CONFIG_PAGING_LEVELS == 3
+#define GUEST_LEVELS 2
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 3
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
+#if CONFIG_PAGING_LEVELS == 4
+#define GUEST_LEVELS 2
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 3
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 3
+#define SHADOW_LEVELS 4
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS 4
+#define SHADOW_LEVELS 4
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 4 */
+
+
+/******************************************************************************
+ * Various function declarations
+ */
+
+/* x86 emulator support */
+extern struct x86_emulate_ops shadow2_emulator_ops;
+
+/* Hash table functions */
+mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t);
+void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
+void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
+
+/* shadow promotion */
+void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type);
+void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type);
+
+/* Shadow page allocation functions */
+void shadow2_prealloc(struct domain *d, unsigned int order);
+mfn_t shadow2_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer);
+void shadow2_free(struct domain *d, mfn_t smfn);
+
+/* Function to convert a shadow to log-dirty */
+void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn);
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn);
+
+/* Re-sync copies of PAE shadow L3 tables if they have been changed */
+void sh2_pae_recopy(struct domain *d);
+
+/* Install the xen mappings in various flavours of shadow */
+void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn);
+void sh2_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn);
+void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn);
+void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn);
+
+
+/******************************************************************************
+ * MFN/page-info handling
+ */
+
+// Override mfn_to_page from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef mfn_to_page
+#define mfn_to_page(_mfn) (frame_table + mfn_x(_mfn))
+
+// Override page_to_mfn from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef page_to_mfn
+#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+
+// Override mfn_valid from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef mfn_valid
+#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+
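
A small usage sketch (illustrative only, not code from this patch) of what the reworked macros give us: they take and return mfn_t rather than raw frame numbers, so conversions go through _mfn()/mfn_x() explicitly.

    mfn_t m = _mfn(42);                       /* wrap a raw frame number    */
    struct page_info *pg = mfn_to_page(m);    /* override: takes an mfn_t   */
    mfn_t back = page_to_mfn(pg);             /* override: returns an mfn_t */
    ASSERT(mfn_x(back) == mfn_x(m));
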
+// Provide mfn_t-aware versions of common xen functions
+static inline void *
+sh2_map_domain_page(mfn_t mfn)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ return map_domain_page(mfn_x(mfn));
+}
+
+static inline void
+sh2_unmap_domain_page(void *p)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ unmap_domain_page(p);
+}
+
+static inline void *
+sh2_map_domain_page_global(mfn_t mfn)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ return map_domain_page_global(mfn_x(mfn));
+}
+
+static inline void
+sh2_unmap_domain_page_global(void *p)
+{
+ /* XXX Using the monitor-table as a map will happen here */
+ unmap_domain_page_global(p);
+}
+
+static inline int
+sh2_mfn_is_dirty(struct domain *d, mfn_t gmfn)
+/* Is this guest page dirty? Call only in log-dirty mode. */
+{
+ unsigned long pfn;
+ ASSERT(shadow2_mode_log_dirty(d));
+ ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+ if ( likely(VALID_M2P(pfn))
+ && likely(pfn < d->arch.shadow_dirty_bitmap_size)
+ && test_bit(pfn, d->arch.shadow_dirty_bitmap) )
+ return 1;
+
+ return 0;
+}
+
+static inline int
+sh2_mfn_is_a_page_table(mfn_t gmfn)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ struct domain *owner;
+ unsigned long type_info;
+
+ if ( !valid_mfn(gmfn) )
+ return 0;
+
+ owner = page_get_owner(page);
+ if ( owner && shadow2_mode_refcounts(owner)
+ && (page->count_info & PGC_page_table) )
+ return 1;
+
+ type_info = page->u.inuse.type_info & PGT_type_mask;
+ return type_info && (type_info <= PGT_l4_page_table);
+}
+
+
+/**************************************************************************/
+/* Shadow-page refcounting. See comment in shadow2-common.c about the
+ * use of struct page_info fields for shadow pages */
+
+void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn);
+
+/* Increase the refcount of a shadow page. Arguments are the mfn to refcount,
+ * and the physical address of the shadow entry that holds the ref (or zero
+ * if the ref is held by something else) */
+static inline void sh2_get_ref(mfn_t smfn, paddr_t entry_pa)
+{
+ u32 x, nx;
+ struct page_info *page = mfn_to_page(smfn);
+
+ ASSERT(mfn_valid(smfn));
+
+ x = page->count_info & PGC_SH2_count_mask;
+ nx = x + 1;
+
+ if ( unlikely(nx & ~PGC_SH2_count_mask) )
+ {
+ SHADOW2_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n",
+ page->u.inuse.type_info, mfn_x(smfn));
+ domain_crash_synchronous();
+ }
+
+ /* Guarded by the shadow lock, so no need for atomic update */
+ page->count_info &= ~PGC_SH2_count_mask;
+ page->count_info |= nx;
+
+ /* We remember the first shadow entry that points to each shadow. */
+ if ( entry_pa != 0 && page->up == 0 )
+ page->up = entry_pa;
+}
+
+
+/* Decrease the refcount of a shadow page. As for get_ref, takes the
+ * physical address of the shadow entry that held this reference. */
+static inline void sh2_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
+{
+ u32 x, nx;
+ struct page_info *page = mfn_to_page(smfn);
+
+ ASSERT(mfn_valid(smfn));
+ ASSERT(page_get_owner(page) == NULL);
+
+ /* If this is the entry in the up-pointer, remove it */
+ if ( entry_pa != 0 && page->up == entry_pa )
+ page->up = 0;
+
+ x = page->count_info & PGC_SH2_count_mask;
+ nx = x - 1;
+
+ if ( unlikely(x == 0) )
+ {
+ SHADOW2_PRINTK("shadow ref underflow, smfn=%lx oc=%08x t=%"
+ PRtype_info "\n",
+ mfn_x(smfn),
+ page->count_info & PGC_SH2_count_mask,
+ page->u.inuse.type_info);
+ domain_crash_synchronous();
+ }
+
+ /* Guarded by the shadow lock, so no need for atomic update */
+ page->count_info &= ~PGC_SH2_count_mask;
+ page->count_info |= nx;
+
+ if ( unlikely(nx == 0) )
+ sh2_destroy_shadow(v, smfn);
+}
+
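
As a hedged usage sketch (hypothetical, not code from this patch): a caller that makes a shadow entry point at a lower-level shadow takes a ref recording that entry's physical address, and drops it against the same address when the entry is cleared; entry_pa == 0 is reserved for refs, such as pins, that are not held by a shadow entry.

    /* Hypothetical helper: sl1mfn and entry_pa come from the caller. */
    static inline void sh2_example_link_unlink(struct vcpu *v, mfn_t sl1mfn,
                                               paddr_t entry_pa)
    {
        sh2_get_ref(sl1mfn, entry_pa);    /* the sl2e at entry_pa now points here */
        /* ... the shadow is in use for a while ... */
        sh2_put_ref(v, sl1mfn, entry_pa); /* the sl2e has been cleared again */
    }
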
+
+/* Pin a shadow page: take an extra refcount and set the pin bit. */
+static inline void sh2_pin(mfn_t smfn)
+{
+ struct page_info *page;
+
+ ASSERT(mfn_valid(smfn));
+ page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH2_pinned) )
+ {
+ sh2_get_ref(smfn, 0);
+ page->count_info |= PGC_SH2_pinned;
+ }
+}
+
+/* Unpin a shadow page: unset the pin bit and release the extra ref. */
+static inline void sh2_unpin(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *page;
+
+ ASSERT(mfn_valid(smfn));
+ page = mfn_to_page(smfn);
+ if ( page->count_info & PGC_SH2_pinned )
+ {
+ page->count_info &= ~PGC_SH2_pinned;
+ sh2_put_ref(v, smfn, 0);
+ }
+}
+
+/**************************************************************************/
+/* CPU feature support querying */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+ return hvm_guest(v) && (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE);
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+ if ( !hvm_guest(v) )
+ return cpu_has_nx;
+
+ // XXX - fix this!
+ return 1;
+}
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Read our own P2M table, checking in the linear pagetables first to be
+ * sure that we will succeed. Call this function if you expect it to
+ * fail often, as it avoids page faults. If you expect to succeed, use
+ * vcpu_gfn_to_mfn, which copy_from_user()s the entry */
+static inline mfn_t
+vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn)
+{
+ unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn];
+#if CONFIG_PAGING_LEVELS >= 4
+ l4_pgentry_t *l4e;
+ l3_pgentry_t *l3e;
+#endif
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+
+ ASSERT(current == v);
+ if ( !shadow2_vcpu_mode_translate(v) )
+ return _mfn(gfn);
+
+#if CONFIG_PAGING_LEVELS > 2
+ if ( gfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
+ /* This pfn is higher than the p2m map can hold */
+ return _mfn(INVALID_MFN);
+#endif
+
+ /* Walk the linear pagetables. Note that this is *not* the same as
+ * the walk in sh2_gfn_to_mfn_foreign, which is walking the p2m map */
+#if CONFIG_PAGING_LEVELS >= 4
+ l4e = __linear_l4_table + l4_linear_offset(entry_addr);
+ if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+ l3e = __linear_l3_table + l3_linear_offset(entry_addr);
+ if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+#endif
+ l2e = __linear_l2_table + l2_linear_offset(entry_addr);
+ if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+ l1e = __linear_l1_table + l1_linear_offset(entry_addr);
+ if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+
+ /* Safe to look at this part of the table */
+ if ( l1e_get_flags(phys_to_machine_mapping[gfn]) & _PAGE_PRESENT )
+ return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn]));
+
+ return _mfn(INVALID_MFN);
+}
+
+
+#endif /* _XEN_SHADOW2_PRIVATE_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/shadow2-types.h b/xen/include/asm-x86/shadow2-types.h
new file mode 100644
index 0000000000..f593c97822
--- /dev/null
+++ b/xen/include/asm-x86/shadow2-types.h
@@ -0,0 +1,705 @@
+/******************************************************************************
+ * include/asm-x86/shadow2-types.h
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _XEN_SHADOW2_TYPES_H
+#define _XEN_SHADOW2_TYPES_H
+
+// Map a shadow page
+static inline void *
+map_shadow_page(mfn_t smfn)
+{
+ // XXX -- Possible optimization/measurement question for 32-bit and PAE
+ // hypervisors:
+ // How often is this smfn already available in the shadow linear
+ // table? Might it be worth checking that table first,
+ // presumably using the reverse map hint in the page_info of this
+ // smfn, rather than calling map_domain_page()?
+ //
+ return sh2_map_domain_page(smfn);
+}
+
+// matching unmap for map_shadow_page()
+static inline void
+unmap_shadow_page(void *p)
+{
+ sh2_unmap_domain_page(p);
+}
+
+/*
+ * Define various types for handling pagetables, based on these options:
+ * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables
+ * GUEST_PAGING_LEVELS : Number of levels of guest pagetables
+ */
+
+#if (CONFIG_PAGING_LEVELS < SHADOW_PAGING_LEVELS)
+#error Cannot have more levels of shadow pagetables than host pagetables
+#endif
+
+#if (SHADOW_PAGING_LEVELS < GUEST_PAGING_LEVELS)
+#error Cannot have more levels of guest pagetables than shadow pagetables
+#endif
+
+#if SHADOW_PAGING_LEVELS == 2
+#define SHADOW_L1_PAGETABLE_ENTRIES 1024
+#define SHADOW_L2_PAGETABLE_ENTRIES 1024
+#define SHADOW_L1_PAGETABLE_SHIFT 12
+#define SHADOW_L2_PAGETABLE_SHIFT 22
+#endif
+
+#if SHADOW_PAGING_LEVELS == 3
+#define SHADOW_L1_PAGETABLE_ENTRIES 512
+#define SHADOW_L2_PAGETABLE_ENTRIES 512
+#define SHADOW_L3_PAGETABLE_ENTRIES 4
+#define SHADOW_L1_PAGETABLE_SHIFT 12
+#define SHADOW_L2_PAGETABLE_SHIFT 21
+#define SHADOW_L3_PAGETABLE_SHIFT 30
+#endif
+
+#if SHADOW_PAGING_LEVELS == 4
+#define SHADOW_L1_PAGETABLE_ENTRIES 512
+#define SHADOW_L2_PAGETABLE_ENTRIES 512
+#define SHADOW_L3_PAGETABLE_ENTRIES 512
+#define SHADOW_L4_PAGETABLE_ENTRIES 512
+#define SHADOW_L1_PAGETABLE_SHIFT 12
+#define SHADOW_L2_PAGETABLE_SHIFT 21
+#define SHADOW_L3_PAGETABLE_SHIFT 30
+#define SHADOW_L4_PAGETABLE_SHIFT 39
+#endif
+
+/* Types of the shadow page tables */
+typedef l1_pgentry_t shadow_l1e_t;
+typedef l2_pgentry_t shadow_l2e_t;
+#if SHADOW_PAGING_LEVELS >= 3
+typedef l3_pgentry_t shadow_l3e_t;
+#if SHADOW_PAGING_LEVELS >= 4
+typedef l4_pgentry_t shadow_l4e_t;
+#endif
+#endif
+
+/* Access functions for them */
+static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e)
+{ return l1e_get_paddr(sl1e); }
+static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e)
+{ return l2e_get_paddr(sl2e); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e)
+{ return l3e_get_paddr(sl3e); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e)
+{ return l4e_get_paddr(sl4e); }
+#endif
+#endif
+
+static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e)
+{ return _mfn(l1e_get_pfn(sl1e)); }
+static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e)
+{ return _mfn(l2e_get_pfn(sl2e)); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e)
+{ return _mfn(l3e_get_pfn(sl3e)); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e)
+{ return _mfn(l4e_get_pfn(sl4e)); }
+#endif
+#endif
+
+static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e)
+{ return l1e_get_flags(sl1e); }
+static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e)
+{ return l2e_get_flags(sl2e); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e)
+{ return l3e_get_flags(sl3e); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e)
+{ return l4e_get_flags(sl4e); }
+#endif
+#endif
+
+static inline shadow_l1e_t
+shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags)
+{ l1e_remove_flags(sl1e, flags); return sl1e; }
+
+static inline shadow_l1e_t shadow_l1e_empty(void)
+{ return l1e_empty(); }
+static inline shadow_l2e_t shadow_l2e_empty(void)
+{ return l2e_empty(); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline shadow_l3e_t shadow_l3e_empty(void)
+{ return l3e_empty(); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline shadow_l4e_t shadow_l4e_empty(void)
+{ return l4e_empty(); }
+#endif
+#endif
+
+static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags)
+{ return l1e_from_pfn(mfn_x(mfn), flags); }
+static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags)
+{ return l2e_from_pfn(mfn_x(mfn), flags); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline shadow_l3e_t shadow_l3e_from_mfn(mfn_t mfn, u32 flags)
+{ return l3e_from_pfn(mfn_x(mfn), flags); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags)
+{ return l4e_from_pfn(mfn_x(mfn), flags); }
+#endif
+#endif
+
+#define shadow_l1_table_offset(a) l1_table_offset(a)
+#define shadow_l2_table_offset(a) l2_table_offset(a)
+#define shadow_l3_table_offset(a) l3_table_offset(a)
+#define shadow_l4_table_offset(a) l4_table_offset(a)
+
+/**************************************************************************/
+/* Access to the linear mapping of shadow page tables. */
+
+/* Offsets into each level of the linear mapping for a virtual address. */
+#define shadow_l1_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT)
+#define shadow_l2_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT)
+#define shadow_l3_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT)
+#define shadow_l4_linear_offset(_a) \
+ (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT)
+
+/* Where to find each level of the linear mapping. For PV guests, we use
+ * the shadow linear-map self-entry as many times as we need. For HVM
+ * guests, the shadow doesn't have a linear-map self-entry so we must use
+ * the monitor-table's linear-map entry N-1 times and then the shadow-map
+ * entry once. */
+#define __sh2_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START))
+#define __sh2_linear_l2_table ((shadow_l2e_t *) \
+ (__sh2_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)))
+
+// shadow linear L3 and L4 tables only exist in 4 level paging...
+#if SHADOW_PAGING_LEVELS == 4
+#define __sh2_linear_l3_table ((shadow_l3e_t *) \
+ (__sh2_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)))
+#define __sh2_linear_l4_table ((shadow_l4e_t *) \
+ (__sh2_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)))
+#endif
+
+#define sh2_linear_l1_table(v) ({ \
+ ASSERT(current == (v)); \
+ __sh2_linear_l1_table; \
+})
+
+#define sh2_linear_l2_table(v) ({ \
+ ASSERT(current == (v)); \
+ ((shadow_l2e_t *) \
+ (hvm_guest(v) ? __linear_l1_table : __sh2_linear_l1_table) + \
+ shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+
+// shadow linear L3 and L4 tables only exist in 4 level paging...
+#if SHADOW_PAGING_LEVELS == 4
+#define sh2_linear_l3_table(v) ({ \
+ ASSERT(current == (v)); \
+ ((shadow_l3e_t *) \
+ (hvm_guest(v) ? __linear_l2_table : __sh2_linear_l2_table) + \
+ shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+
+// we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is
+// not defined for when xen_levels==4 & shadow_levels==3...
+#define sh2_linear_l4_table(v) ({ \
+ ASSERT(current == (v)); \
+ ((l4_pgentry_t *) \
+ (hvm_guest(v) ? __linear_l3_table : __sh2_linear_l3_table) + \
+ shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+#endif
+
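
As a concrete (editorial) illustration of how the self-map is used: the shadow l1e covering a guest virtual address va on the current vcpu can be reached as

    shadow_l1e_t *sl1p = sh2_linear_l1_table(v) + shadow_l1_linear_offset(va);

where v == current and va is whatever address is being shadowed; the l2/l3/l4 variants work the same way at their own shifts.
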
+#if GUEST_PAGING_LEVELS == 2
+
+#include <asm/page-guest32.h>
+
+#define GUEST_L1_PAGETABLE_ENTRIES 1024
+#define GUEST_L2_PAGETABLE_ENTRIES 1024
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 22
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(u32,gfn)
+#define INVALID_GFN ((u32)(-1u))
+#define SH2_PRI_gfn "05x"
+
+/* Types of the guest's page tables */
+typedef l1_pgentry_32_t guest_l1e_t;
+typedef l2_pgentry_32_t guest_l2e_t;
+
+/* Access functions for them */
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr_32(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr_32(gl2e); }
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); }
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags_32(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags_32(gl2e); }
+
+static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
+{ l1e_add_flags_32(gl1e, flags); return gl1e; }
+static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
+{ l2e_add_flags_32(gl2e, flags); return gl2e; }
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn_32(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn_32(gfn_x(gfn), flags); }
+
+#define guest_l1_table_offset(a) l1_table_offset_32(a)
+#define guest_l2_table_offset(a) l2_table_offset_32(a)
+
+/* The shadow types needed for the various levels. */
+#define PGC_SH2_l1_shadow PGC_SH2_l1_32_shadow
+#define PGC_SH2_l2_shadow PGC_SH2_l2_32_shadow
+#define PGC_SH2_fl1_shadow PGC_SH2_fl1_32_shadow
+
+#else /* GUEST_PAGING_LEVELS != 2 */
+
+#if GUEST_PAGING_LEVELS == 3
+#define GUEST_L1_PAGETABLE_ENTRIES 512
+#define GUEST_L2_PAGETABLE_ENTRIES 512
+#define GUEST_L3_PAGETABLE_ENTRIES 4
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 21
+#define GUEST_L3_PAGETABLE_SHIFT 30
+#else /* GUEST_PAGING_LEVELS == 4 */
+#define GUEST_L1_PAGETABLE_ENTRIES 512
+#define GUEST_L2_PAGETABLE_ENTRIES 512
+#define GUEST_L3_PAGETABLE_ENTRIES 512
+#define GUEST_L4_PAGETABLE_ENTRIES 512
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 21
+#define GUEST_L3_PAGETABLE_SHIFT 30
+#define GUEST_L4_PAGETABLE_SHIFT 39
+#endif
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(unsigned long,gfn)
+#define INVALID_GFN ((unsigned long)(-1ul))
+#define SH2_PRI_gfn "05lx"
+
+/* Types of the guest's page tables */
+typedef l1_pgentry_t guest_l1e_t;
+typedef l2_pgentry_t guest_l2e_t;
+typedef l3_pgentry_t guest_l3e_t;
+#if GUEST_PAGING_LEVELS >= 4
+typedef l4_pgentry_t guest_l4e_t;
+#endif
+
+/* Access functions for them */
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr(gl2e); }
+static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
+{ return l3e_get_paddr(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
+{ return l4e_get_paddr(gl4e); }
+#endif
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
+{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
+{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
+#endif
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags(gl2e); }
+static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
+{ return l3e_get_flags(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
+{ return l4e_get_flags(gl4e); }
+#endif
+
+static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
+{ l1e_add_flags(gl1e, flags); return gl1e; }
+static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
+{ l2e_add_flags(gl2e, flags); return gl2e; }
+static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags)
+{ l3e_add_flags(gl3e, flags); return gl3e; }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags)
+{ l4e_add_flags(gl4e, flags); return gl4e; }
+#endif
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
+{ return l3e_from_pfn(gfn_x(gfn), flags); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
+{ return l4e_from_pfn(gfn_x(gfn), flags); }
+#endif
+
+#define guest_l1_table_offset(a) l1_table_offset(a)
+#define guest_l2_table_offset(a) l2_table_offset(a)
+#define guest_l3_table_offset(a) l3_table_offset(a)
+#define guest_l4_table_offset(a) l4_table_offset(a)
+
+/* The shadow types needed for the various levels. */
+#if GUEST_PAGING_LEVELS == 3
+#define PGC_SH2_l1_shadow PGC_SH2_l1_pae_shadow
+#define PGC_SH2_fl1_shadow PGC_SH2_fl1_pae_shadow
+#define PGC_SH2_l2_shadow PGC_SH2_l2_pae_shadow
+#define PGC_SH2_l2h_shadow PGC_SH2_l2h_pae_shadow
+#define PGC_SH2_l3_shadow PGC_SH2_l3_pae_shadow
+#else
+#define PGC_SH2_l1_shadow PGC_SH2_l1_64_shadow
+#define PGC_SH2_fl1_shadow PGC_SH2_fl1_64_shadow
+#define PGC_SH2_l2_shadow PGC_SH2_l2_64_shadow
+#define PGC_SH2_l3_shadow PGC_SH2_l3_64_shadow
+#define PGC_SH2_l4_shadow PGC_SH2_l4_64_shadow
+#endif
+
+#endif /* GUEST_PAGING_LEVELS != 2 */
+
+#define VALID_GFN(m) (m != INVALID_GFN)
+
+static inline int
+valid_gfn(gfn_t m)
+{
+ return VALID_GFN(gfn_x(m));
+}
+
+#if GUEST_PAGING_LEVELS == 2
+#define PGC_SH2_guest_root_type PGC_SH2_l2_32_shadow
+#elif GUEST_PAGING_LEVELS == 3
+#define PGC_SH2_guest_root_type PGC_SH2_l3_pae_shadow
+#else
+#define PGC_SH2_guest_root_type PGC_SH2_l4_64_shadow
+#endif
+
+/* Translation between mfns and gfns */
+static inline mfn_t
+vcpu_gfn_to_mfn(struct vcpu *v, gfn_t gfn)
+{
+ return sh2_vcpu_gfn_to_mfn(v, gfn_x(gfn));
+}
+
+static inline gfn_t
+mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+ return _gfn(sh2_mfn_to_gfn(d, mfn));
+}
+
+static inline paddr_t
+gfn_to_paddr(gfn_t gfn)
+{
+ return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
+}
+
+/* Type used for recording a walk through guest pagetables. It is
+ * filled in by the pagetable walk function, and also used as a cache
+ * for later walks.
+ * Any non-null pointer in this structure represents a mapping of guest
+ * memory. We must always call walk_init() before using a walk_t, and
+ * call walk_unmap() when we're done.
+ * The "Effective l1e" field is used when there isn't an l1e to point to,
+ * but we have fabricated an l1e for propagation to the shadow (e.g.,
+ * for splintering guest superpages into many shadow l1 entries). */
+typedef struct shadow2_walk_t walk_t;
+struct shadow2_walk_t
+{
+ unsigned long va; /* Address we were looking for */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ guest_l4e_t *l4e; /* Pointer to guest's level 4 entry */
+#endif
+ guest_l3e_t *l3e; /* Pointer to guest's level 3 entry */
+#endif
+ guest_l2e_t *l2e; /* Pointer to guest's level 2 entry */
+ guest_l1e_t *l1e; /* Pointer to guest's level 1 entry */
+ guest_l1e_t eff_l1e; /* Effective level 1 entry */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ mfn_t l4mfn; /* MFN that the level 4 entry is in */
+#endif
+ mfn_t l3mfn; /* MFN that the level 3 entry is in */
+#endif
+ mfn_t l2mfn; /* MFN that the level 2 entry is in */
+ mfn_t l1mfn; /* MFN that the level 1 entry is in */
+};
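For illustration, the lifecycle described in the comment above might look roughly like the following sketch; walk_init() and walk_unmap() are only named, not defined, in this header, so their exact signatures are assumed here, and the middle step stands in for whichever per-mode walker actually fills the structure.

static inline u32 example_eff_flags_sketch(struct vcpu *v, unsigned long va)
{
    walk_t gw;
    u32 flags;

    walk_init(&gw);        /* must precede any use of gw (see comment above) */
    /* ... the per-mode guest pagetable walk fills in gw for (v, va) here ... */
    flags = guest_l1e_get_flags(gw.eff_l1e);
    walk_unmap(&gw);       /* releases gw's guest mappings; its pointers die  */
    return flags;
}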
+
+
+/* X86 error code bits:
+ * These bits certainly ought to be defined somewhere other than here,
+ * but until that place is determined, here they sit.
+ *
+ * "PFEC" == "Page Fault Error Code"
+ */
+#define X86_PFEC_PRESENT 1 /* 0 == page was not present */
+#define X86_PFEC_WRITE_FAULT 2 /* 0 == reading, 1 == writing */
+#define X86_PFEC_SUPERVISOR_FAULT 4 /* 0 == supervisor-mode, 1 == user */
+#define X86_PFEC_RESERVED_BIT_FAULT 8 /* 1 == reserved bits set in pte */
+#define X86_PFEC_INSN_FETCH_FAULT 16 /* 0 == normal, 1 == instr'n fetch */
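As a small illustration of how these bits combine, a handler can classify a fault by masking them; for example, a write to a page that is actually present looks like this (sketch only):

static inline int pfec_is_write_to_present_page(u32 error_code)
{
    /* Both bits must be set: the page was present and the access was a write. */
    return (error_code & X86_PFEC_PRESENT) &&
           (error_code & X86_PFEC_WRITE_FAULT);
}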
+
+/* macros for dealing with the internal names of the shadow code's
+ * external entry points.
+ */
+#define INTERNAL_NAME(name) \
+ SHADOW2_INTERNAL_NAME(name, SHADOW_PAGING_LEVELS, GUEST_PAGING_LEVELS)

+
+/* macros for renaming the primary entry points, so that they are more
+ * easily distinguished in a debugger
+ */
+#define sh2_page_fault INTERNAL_NAME(sh2_page_fault)
+#define sh2_invlpg INTERNAL_NAME(sh2_invlpg)
+#define sh2_gva_to_gpa INTERNAL_NAME(sh2_gva_to_gpa)
+#define sh2_gva_to_gfn INTERNAL_NAME(sh2_gva_to_gfn)
+#define sh2_update_cr3 INTERNAL_NAME(sh2_update_cr3)
+#define sh2_remove_write_access INTERNAL_NAME(sh2_remove_write_access)
+#define sh2_remove_all_mappings INTERNAL_NAME(sh2_remove_all_mappings)
+#define sh2_remove_l1_shadow INTERNAL_NAME(sh2_remove_l1_shadow)
+#define sh2_remove_l2_shadow INTERNAL_NAME(sh2_remove_l2_shadow)
+#define sh2_remove_l3_shadow INTERNAL_NAME(sh2_remove_l3_shadow)
+#define sh2_map_and_validate_gl4e INTERNAL_NAME(sh2_map_and_validate_gl4e)
+#define sh2_map_and_validate_gl3e INTERNAL_NAME(sh2_map_and_validate_gl3e)
+#define sh2_map_and_validate_gl2e INTERNAL_NAME(sh2_map_and_validate_gl2e)
+#define sh2_map_and_validate_gl2he INTERNAL_NAME(sh2_map_and_validate_gl2he)
+#define sh2_map_and_validate_gl1e INTERNAL_NAME(sh2_map_and_validate_gl1e)
+#define sh2_destroy_l4_shadow INTERNAL_NAME(sh2_destroy_l4_shadow)
+#define sh2_destroy_l3_shadow INTERNAL_NAME(sh2_destroy_l3_shadow)
+#define sh2_destroy_l3_subshadow INTERNAL_NAME(sh2_destroy_l3_subshadow)
+#define sh2_unpin_all_l3_subshadows INTERNAL_NAME(sh2_unpin_all_l3_subshadows)
+#define sh2_destroy_l2_shadow INTERNAL_NAME(sh2_destroy_l2_shadow)
+#define sh2_destroy_l1_shadow INTERNAL_NAME(sh2_destroy_l1_shadow)
+#define sh2_unhook_32b_mappings INTERNAL_NAME(sh2_unhook_32b_mappings)
+#define sh2_unhook_pae_mappings INTERNAL_NAME(sh2_unhook_pae_mappings)
+#define sh2_unhook_64b_mappings INTERNAL_NAME(sh2_unhook_64b_mappings)
+#define shadow2_entry INTERNAL_NAME(shadow2_entry)
+#define sh2_detach_old_tables INTERNAL_NAME(sh2_detach_old_tables)
+#define sh2_x86_emulate_write INTERNAL_NAME(sh2_x86_emulate_write)
+#define sh2_x86_emulate_cmpxchg INTERNAL_NAME(sh2_x86_emulate_cmpxchg)
+#define sh2_x86_emulate_cmpxchg8b INTERNAL_NAME(sh2_x86_emulate_cmpxchg8b)
+#define sh2_audit_l1_table INTERNAL_NAME(sh2_audit_l1_table)
+#define sh2_audit_fl1_table INTERNAL_NAME(sh2_audit_fl1_table)
+#define sh2_audit_l2_table INTERNAL_NAME(sh2_audit_l2_table)
+#define sh2_audit_l3_table INTERNAL_NAME(sh2_audit_l3_table)
+#define sh2_audit_l4_table INTERNAL_NAME(sh2_audit_l4_table)
+#define sh2_guess_wrmap INTERNAL_NAME(sh2_guess_wrmap)
+#define sh2_clear_shadow_entry INTERNAL_NAME(sh2_clear_shadow_entry)
+
+/* sh2_make_monitor_table only depends on the number of shadow levels */
+#define sh2_make_monitor_table \
+ SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, \
+ SHADOW_PAGING_LEVELS, \
+ SHADOW_PAGING_LEVELS)
+#define sh2_destroy_monitor_table \
+ SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, \
+ SHADOW_PAGING_LEVELS, \
+ SHADOW_PAGING_LEVELS)
+
+
+#if GUEST_PAGING_LEVELS == 3
+/*
+ * Accounting information stored in the shadow of PAE Guest L3 pages.
+ * Because these "L3 pages" are only 32 bytes, it is inconvenient to keep
+ * various refcounts, etc., on the page_info of their page. We provide extra
+ * bookkeeping space in the shadow itself, and this is the structure
+ * definition for that bookkeeping information.
+ */
+struct pae_l3_bookkeeping {
+ u32 vcpus; /* bitmap of which vcpus are currently storing
+ * copies of this 32-byte page */
+ u32 refcount; /* refcount for this 32-byte page */
+ u8 pinned; /* is this 32-byte page pinned or not? */
+};
+
+// Convert a shadow entry pointer into a pae_l3_bookkeeping pointer.
+#define sl3p_to_info(_ptr) ((struct pae_l3_bookkeeping *) \
+ (((unsigned long)(_ptr) & ~31) + 32))
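To make the pointer arithmetic concrete, here is a worked example (illustrative only): a PAE L3 shadow is four 8-byte entries in one 32-byte-aligned block, and sl3p_to_info() points 32 bytes past the start of that block.

/* If a shadow_l3e_t lives at page offset 0x148 (the second entry of the
 * block that starts at 0x140), then sl3p_to_info() computes
 *     ((0x148 & ~31) + 32) == 0x160
 * so the bookkeeping sits immediately after the 32-byte block, suggesting
 * each shadow-plus-bookkeeping pair occupies 64 bytes (which matches the
 * "offset / 64" used in the debugtrace calls further down). */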
+
+static void sh2_destroy_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e);
+
+/* Increment a subshadow ref
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. */
+static inline void sh2_get_ref_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+ /* First ref to the subshadow takes a ref to the full shadow */
+ if ( bk->refcount == 0 )
+ sh2_get_ref(smfn, 0);
+ if ( unlikely(++(bk->refcount) == 0) )
+ {
+ SHADOW2_PRINTK("shadow l3 subshadow ref overflow, smfn=%" SH2_PRI_mfn " sh=%p\n",
+ mfn_x(smfn), sl3e);
+ domain_crash_synchronous();
+ }
+}
+
+/* Decrement a subshadow ref.
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. Calling this may cause the
+ * entire shadow to disappear, so the caller must immediately unmap
+ * the pointer after calling. */
+static inline void sh2_put_ref_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e,
+ mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk;
+
+ bk = sl3p_to_info(sl3e);
+
+ ASSERT(bk->refcount > 0);
+ if ( --(bk->refcount) == 0 )
+ {
+ /* Need to destroy this subshadow */
+ sh2_destroy_l3_subshadow(v, sl3e);
+ /* Last ref to the subshadow had a ref to the full shadow */
+ sh2_put_ref(v, smfn, 0);
+ }
+}
+
+/* Pin a subshadow
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. */
+static inline void sh2_pin_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+#if 0
+ debugtrace_printk("%s smfn=%05lx offset=%ld\n",
+ __func__, mfn_x(smfn),
+ ((unsigned long)sl3e & ~PAGE_MASK) / 64);
+#endif
+
+ if ( !bk->pinned )
+ {
+ bk->pinned = 1;
+ sh2_get_ref_l3_subshadow(sl3e, smfn);
+ }
+}
+
+/* Unpin a sub-shadow.
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. Calling this may cause the
+ * entire shadow to disappear, so the caller must immediately unmap
+ * the pointer after calling. */
+static inline void sh2_unpin_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e,
+ mfn_t smfn)
+{
+ struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+#if 0
+ debugtrace_printk("%s smfn=%05lx offset=%ld\n",
+ __func__, mfn_x(smfn),
+ ((unsigned long)sl3e & ~PAGE_MASK) / 64);
+#endif
+
+ if ( bk->pinned )
+ {
+ bk->pinned = 0;
+ sh2_put_ref_l3_subshadow(v, sl3e, smfn);
+ }
+}
+
+#endif /* GUEST_PAGING_LEVELS == 3 */
+
+#if SHADOW_PAGING_LEVELS == 3
+#define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20)
+#endif
+
+#if SHADOW_PAGING_LEVELS == 2
+#define SH2_PRI_pte "08x"
+#else /* SHADOW_PAGING_LEVELS >= 3 */
+#ifndef __x86_64__
+#define SH2_PRI_pte "016llx"
+#else
+#define SH2_PRI_pte "016lx"
+#endif
+#endif /* SHADOW_PAGING_LEVELS >= 3 */
+
+#if GUEST_PAGING_LEVELS == 2
+#define SH2_PRI_gpte "08x"
+#else /* GUEST_PAGING_LEVELS >= 3 */
+#ifndef __x86_64__
+#define SH2_PRI_gpte "016llx"
+#else
+#define SH2_PRI_gpte "016lx"
+#endif
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static inline u32
+accumulate_guest_flags(walk_t *gw)
+{
+ u32 accumulated_flags;
+
+ // We accumulate the permission flags with bitwise ANDing.
+ // This works for the PRESENT bit, RW bit, and USER bit.
+ // For the NX bit, however, the polarity is wrong, so we accumulate the
+ // inverse of the NX bit.
+ //
+ accumulated_flags = guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT;
+ accumulated_flags &= guest_l2e_get_flags(*gw->l2e) ^ _PAGE_NX_BIT;
+
+ // Note that PAE guests do not have USER or RW or NX bits in their L3s.
+ //
+#if GUEST_PAGING_LEVELS == 3
+ accumulated_flags &=
+ ~_PAGE_PRESENT | (guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT);
+#elif GUEST_PAGING_LEVELS >= 4
+ accumulated_flags &= guest_l3e_get_flags(*gw->l3e) ^ _PAGE_NX_BIT;
+ accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT;
+#endif
+
+ // Finally, revert the NX bit back to its original polarity
+ accumulated_flags ^= _PAGE_NX_BIT;
+
+ return accumulated_flags;
+}
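A worked example of the NX trick, for illustration: suppose the effective l1e carries PRESENT, RW and USER with NX clear, while the l2e carries PRESENT and USER with NX set.

/* l1e ^ NX  ->  PRESENT | RW | USER | NX  (NX position now means "exec ok")
 * l2e ^ NX  ->  PRESENT | USER            (its NX was set, so XOR clears it)
 * AND       ->  PRESENT | USER            (RW dropped, NX position cleared)
 * ^ NX      ->  PRESENT | USER | NX       (final result is no-execute)
 * i.e. NX at any level makes the accumulated mapping no-execute, while the
 * other permission bits keep their usual "must be set at every level" rule. */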
+
+#endif /* _XEN_SHADOW2_TYPES_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/shadow2.h b/xen/include/asm-x86/shadow2.h
new file mode 100644
index 0000000000..94de7781f8
--- /dev/null
+++ b/xen/include/asm-x86/shadow2.h
@@ -0,0 +1,627 @@
+/******************************************************************************
+ * include/asm-x86/shadow2.h
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _XEN_SHADOW2_H
+#define _XEN_SHADOW2_H
+
+#include <public/dom0_ops.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <asm/flushtlb.h>
+
+/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+
+#define SHM2_shift 10
+/* We're in one of the shadow modes */
+#define SHM2_enable (DOM0_SHADOW2_CONTROL_FLAG_ENABLE << SHM2_shift)
+/* Refcounts based on shadow tables instead of guest tables */
+#define SHM2_refcounts (DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT << SHM2_shift)
+/* Enable log dirty mode */
+#define SHM2_log_dirty (DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY << SHM2_shift)
+/* Xen does p2m translation, not guest */
+#define SHM2_translate (DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE << SHM2_shift)
+/* Xen does not steal address space from the domain for its own bookkeeping;
+ * requires VT or similar mechanisms */
+#define SHM2_external (DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL << SHM2_shift)
+
+#define shadow2_mode_enabled(_d) ((_d)->arch.shadow2_mode)
+#define shadow2_mode_refcounts(_d) ((_d)->arch.shadow2_mode & SHM2_refcounts)
+#define shadow2_mode_log_dirty(_d) ((_d)->arch.shadow2_mode & SHM2_log_dirty)
+#define shadow2_mode_translate(_d) ((_d)->arch.shadow2_mode & SHM2_translate)
+#define shadow2_mode_external(_d) ((_d)->arch.shadow2_mode & SHM2_external)
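For illustration only (this particular combination is an assumption, not a definition from the patch), a fully-virtualised guest would plausibly run with all of these modes at once:

/*  d->arch.shadow2_mode = SHM2_enable | SHM2_refcounts |
 *                         SHM2_translate | SHM2_external;
 *
 * after which shadow2_mode_translate(d) and shadow2_mode_external(d) are
 * non-zero, and SHM2_log_dirty can still be toggled independently (e.g.
 * during live migration). */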
+
+/* Xen traps & emulates all reads of all page table pages:
+ * not yet supported
+ */
+#define shadow2_mode_trap_reads(_d) ({ (void)(_d); 0; })
+
+// flags used in the return value of the shadow_set_lXe() functions...
+#define SHADOW2_SET_CHANGED 0x1
+#define SHADOW2_SET_FLUSH 0x2
+#define SHADOW2_SET_ERROR 0x4
+#define SHADOW2_SET_L3PAE_RECOPY 0x8
+
+// How do we tell that we have a 32-bit PV guest in a 64-bit Xen?
+#ifdef __x86_64__
+#define pv_32bit_guest(_v) 0 // not yet supported
+#else
+#define pv_32bit_guest(_v) !hvm_guest(_v)
+#endif
+
+/* The shadow2 lock.
+ *
+ * This lock is per-domain. It is intended to allow us to make atomic
+ * updates to the software TLB that the shadow tables provide.
+ *
+ * Specifically, it protects:
+ * - all changes to shadow page table pages
+ * - the shadow hash table
+ * - the shadow page allocator
+ * - all changes to guest page table pages; if/when the notion of
+ * out-of-sync pages is added to this code, then the shadow lock is
+ * protecting all guest page table pages which are not currently
+ * listed as both guest-writable and out-of-sync...
+ * XXX -- need to think about this relative to writable page tables.
+ * - all changes to the page_info->tlbflush_timestamp
+ * - the page_info->count fields on shadow pages
+ * - the shadow dirty bit array and count
+ * - XXX
+ */
+#ifndef CONFIG_SMP
+#error shadow2.h currently requires CONFIG_SMP
+#endif
+
+#define shadow2_lock_init(_d) \
+ do { \
+ spin_lock_init(&(_d)->arch.shadow2_lock); \
+ (_d)->arch.shadow2_locker = -1; \
+ (_d)->arch.shadow2_locker_function = "nobody"; \
+ } while (0)
+
+#define shadow2_lock_is_acquired(_d) \
+ (current->processor == (_d)->arch.shadow2_locker)
+
+#define shadow2_lock(_d) \
+ do { \
+ if ( unlikely((_d)->arch.shadow2_locker == current->processor) ) \
+ { \
+ printk("Error: shadow2 lock held by %s\n", \
+ (_d)->arch.shadow2_locker_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_d)->arch.shadow2_lock); \
+ ASSERT((_d)->arch.shadow2_locker == -1); \
+ (_d)->arch.shadow2_locker = current->processor; \
+ (_d)->arch.shadow2_locker_function = __func__; \
+ } while (0)
+
+#define shadow2_unlock(_d) \
+ do { \
+ ASSERT((_d)->arch.shadow2_locker == current->processor); \
+ (_d)->arch.shadow2_locker = -1; \
+ (_d)->arch.shadow2_locker_function = "nobody"; \
+ spin_unlock(&(_d)->arch.shadow2_lock); \
+ } while (0)
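A minimal usage sketch of the locking discipline above (illustration only):

/*  shadow2_lock(d);
 *  ASSERT(shadow2_lock_is_acquired(d));
 *  ... update shadow pagetables, the hash table or the allocator ...
 *  shadow2_unlock(d);
 *
 * Re-taking the lock on the same CPU prints the holder's function name and
 * BUG()s, so the lock is deliberately non-recursive. */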
+
+/*
+ * Levels of self-test and paranoia
+ * XXX should go in config files somewhere?
+ */
+#define SHADOW2_AUDIT_HASH 0x01 /* Check current hash bucket */
+#define SHADOW2_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */
+#define SHADOW2_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */
+#define SHADOW2_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */
+#define SHADOW2_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */
+#define SHADOW2_AUDIT_P2M 0x20 /* Check the p2m table */
+
+#ifdef NDEBUG
+#define SHADOW2_AUDIT 0
+#define SHADOW2_AUDIT_ENABLE 0
+#else
+#define SHADOW2_AUDIT 0x15 /* Basic audit of all except p2m. */
+#define SHADOW2_AUDIT_ENABLE shadow2_audit_enable
+extern int shadow2_audit_enable;
+#endif
+
+/*
+ * Levels of optimization
+ * XXX should go in config files somewhere?
+ */
+#define SH2OPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */
+#define SH2OPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */
+
+#define SHADOW2_OPTIMIZATIONS 0x03
+
+
+/* With shadow pagetables, the different kinds of address start
+ * to get confusing.
+ *
+ * Virtual addresses are what they usually are: the addresses that are used
+ * to access memory while the guest is running. The MMU translates from
+ * virtual addresses to machine addresses.
+ *
+ * (Pseudo-)physical addresses are the abstraction of physical memory the
+ * guest uses for allocation and so forth. For the purposes of this code,
+ * we can largely ignore them.
+ *
+ * Guest frame numbers (gfns) are the entries that the guest puts in its
+ * pagetables. For normal paravirtual guests, they are actual frame numbers,
+ * with the translation done by the guest.
+ *
+ * Machine frame numbers (mfns) are the entries that the hypervisor puts
+ * in the shadow page tables.
+ *
+ * Elsewhere in the xen code base, the name "gmfn" is generally used to refer
+ * to a "machine frame number, from the guest's perspective", or in other
+ * words, pseudo-physical frame numbers. However, in the shadow code, the
+ * term "gmfn" means "the mfn of a guest page"; this combines naturally with
+ * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a
+ * guest L2 page), etc...
+ */
+
+/* With this defined, we do some ugly things to force the compiler to
+ * give us type safety between mfns and gfns and other integers.
+ * TYPE_SAFE(int, foo) defines a foo_t, and _foo() and foo_x() functions
+ * that translate between int and foo_t.
+ *
+ * It does have some performance cost because the types now have
+ * a different storage attribute, so we may not want it on all the time. */
+#ifndef NDEBUG
+#define TYPE_SAFETY 1
+#endif
+
+#ifdef TYPE_SAFETY
+#define TYPE_SAFE(_type,_name) \
+typedef struct { _type _name; } _name##_t; \
+static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
+static inline _type _name##_x(_name##_t n) { return n._name; }
+#else
+#define TYPE_SAFE(_type,_name) \
+typedef _type _name##_t; \
+static inline _name##_t _##_name(_type n) { return n; } \
+static inline _type _name##_x(_name##_t n) { return n; }
+#endif
+
+TYPE_SAFE(unsigned long,mfn)
+#define SH2_PRI_mfn "05lx"
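A short illustration of how the type-safe wrappers are meant to be used (sketch only):

/*  mfn_t smfn = _mfn(0x42);                          // wrap a raw frame number
 *  unsigned long raw = mfn_x(smfn);                  // unwrap it again
 *  printk("smfn=%" SH2_PRI_mfn "\n", mfn_x(smfn));   // prints "smfn=00042"
 *
 * With TYPE_SAFETY defined, passing smfn where an unsigned long is expected
 * (or vice versa) becomes a compile-time error instead of a silent bug. */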
+
+static inline int
+valid_mfn(mfn_t m)
+{
+ return VALID_MFN(mfn_x(m));
+}
+
+static inline mfn_t
+pagetable_get_mfn(pagetable_t pt)
+{
+ return _mfn(pagetable_get_pfn(pt));
+}
+
+static inline pagetable_t
+pagetable_from_mfn(mfn_t mfn)
+{
+ return pagetable_from_pfn(mfn_x(mfn));
+}
+
+static inline int
+shadow2_vcpu_mode_translate(struct vcpu *v)
+{
+ // Returns true if this VCPU needs to be using the P2M table to translate
+ // between GFNs and MFNs.
+ //
+ // This is true of translated HVM domains on a vcpu which has paging
+ // enabled. (HVM vcpus with paging disabled use the p2m table as
+ // their paging table, so no translation occurs in this case.)
+ //
+ return v->vcpu_flags & VCPUF_shadow2_translate;
+}
+
+
+/**************************************************************************/
+/* Mode-specific entry points into the shadow code */
+
+struct x86_emulate_ctxt;
+struct shadow2_entry_points {
+ int (*page_fault )(struct vcpu *v, unsigned long va,
+ struct cpu_user_regs *regs);
+ int (*invlpg )(struct vcpu *v, unsigned long va);
+ unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va);
+ unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va);
+ void (*update_cr3 )(struct vcpu *v);
+ int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ void (*detach_old_tables )(struct vcpu *v);
+ int (*x86_emulate_write )(struct vcpu *v, unsigned long va,
+ void *src, u32 bytes,
+ struct x86_emulate_ctxt *ctxt);
+ int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va,
+ unsigned long old,
+ unsigned long new,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt);
+ int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt);
+ mfn_t (*make_monitor_table )(struct vcpu *v);
+ void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+ int (*guess_wrmap )(struct vcpu *v,
+ unsigned long vaddr, mfn_t gmfn);
+#endif
+ /* For outsiders to tell what mode we're in */
+ unsigned int shadow_levels;
+ unsigned int guest_levels;
+};
+
+static inline int shadow2_guest_paging_levels(struct vcpu *v)
+{
+ ASSERT(v->arch.shadow2 != NULL);
+ return v->arch.shadow2->guest_levels;
+}
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Turning on shadow2 test mode */
+int shadow2_test_enable(struct domain *d);
+
+/* Handler for shadow control ops: enabling and disabling shadow modes,
+ * and log-dirty bitmap ops all happen through here. */
+int shadow2_control_op(struct domain *d,
+ dom0_shadow_control_t *sc,
+ XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op);
+
+/* Call when destroying a domain */
+void shadow2_teardown(struct domain *d);
+
+/* Call once all of the references to the domain have gone away */
+void shadow2_final_teardown(struct domain *d);
+
+
+/* Mark a page as dirty in the bitmap */
+void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn);
+static inline void mark_dirty(struct domain *d, unsigned long gmfn)
+{
+ if ( shadow2_mode_log_dirty(d) )
+ {
+ shadow2_lock(d);
+ sh2_do_mark_dirty(d, _mfn(gmfn));
+ shadow2_unlock(d);
+ }
+}
+
+/* Internal version, for when the shadow lock is already held */
+static inline void sh2_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+ ASSERT(shadow2_lock_is_acquired(d));
+ if ( shadow2_mode_log_dirty(d) )
+ sh2_do_mark_dirty(d, gmfn);
+}
+
+static inline int
+shadow2_fault(unsigned long va, struct cpu_user_regs *regs)
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults. Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+{
+ struct vcpu *v = current;
+ perfc_incrc(shadow2_fault);
+ return v->arch.shadow2->page_fault(v, va, regs);
+}
+
+static inline int
+shadow2_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg. Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+ return v->arch.shadow2->invlpg(v, va);
+}
+
+static inline unsigned long
+shadow2_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to the guest physical
+ * address that the *guest* pagetables would map it to. */
+{
+ return v->arch.shadow2->gva_to_gpa(v, va);
+}
+
+static inline unsigned long
+shadow2_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to the guest frame number
+ * that the *guest* pagetables would map it to. */
+{
+ return v->arch.shadow2->gva_to_gfn(v, va);
+}
+
+static inline void
+shadow2_update_cr3(struct vcpu *v)
+/* Updates all the things that are derived from the guest's CR3.
+ * Called when the guest changes CR3. */
+{
+ shadow2_lock(v->domain);
+ v->arch.shadow2->update_cr3(v);
+ shadow2_unlock(v->domain);
+}
+
+
+/* Should be called after CR3 is updated.
+ * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
+ *
+ * Also updates other state derived from CR3 (vcpu->arch.guest_vtable,
+ * shadow_vtable, etc).
+ *
+ * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
+ * for HVM guests, arch.monitor_table and hvm's guest CR3.
+ *
+ * Update ref counts to shadow tables appropriately.
+ * For PAE, relocate L3 entries, if necessary, into low memory.
+ */
+static inline void update_cr3(struct vcpu *v)
+{
+ unsigned long cr3_mfn=0;
+
+ if ( shadow2_mode_enabled(v->domain) )
+ {
+ shadow2_update_cr3(v);
+ return;
+ }
+
+#if CONFIG_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
+ else
+#endif
+ cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
+
+ /* Update vcpu->arch.cr3 */
+ BUG_ON(cr3_mfn == 0);
+ make_cr3(v, cr3_mfn);
+}
+
+extern void sh2_update_paging_modes(struct vcpu *v);
+
+/* Should be called to initialise paging structures if the paging mode
+ * has changed, and when bringing up a VCPU for the first time. */
+static inline void shadow2_update_paging_modes(struct vcpu *v)
+{
+ ASSERT(shadow2_mode_enabled(v->domain));
+ shadow2_lock(v->domain);
+ sh2_update_paging_modes(v);
+ shadow2_unlock(v->domain);
+}
+
+static inline void
+shadow2_detach_old_tables(struct vcpu *v)
+{
+ v->arch.shadow2->detach_old_tables(v);
+}
+
+static inline mfn_t
+shadow2_make_monitor_table(struct vcpu *v)
+{
+ return v->arch.shadow2->make_monitor_table(v);
+}
+
+static inline void
+shadow2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+ v->arch.shadow2->destroy_monitor_table(v, mmfn);
+}
+
+/* Validate a pagetable change from the guest and update the shadows. */
+extern int shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry);
+
+/* Update the shadows in response to a pagetable write from a HVM guest */
+extern void shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size);
+
+/* Remove all writeable mappings of a guest frame from the shadows.
+ * Returns non-zero if we need to flush TLBs.
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access. */
+extern int shadow2_remove_write_access(struct vcpu *v, mfn_t readonly_mfn,
+ unsigned int level,
+ unsigned long fault_addr);
+
+/* Remove all mappings of the guest mfn from the shadows.
+ * Returns non-zero if we need to flush TLBs. */
+extern int shadow2_remove_all_mappings(struct vcpu *v, mfn_t target_mfn);
+
+void
+shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn);
+/* This is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+
+/* Remove all shadows of the guest mfn. */
+extern void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all);
+static inline void shadow2_remove_all_shadows(struct vcpu *v, mfn_t gmfn)
+{
+ sh2_remove_shadows(v, gmfn, 1);
+}
+
+/* Add a page to a domain */
+void
+shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn);
+
+/* Remove a page from a domain */
+void
+shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn);
+
+/*
+ * Definitions for the shadow2_flags field in page_info.
+ * These flags are stored on *guest* pages...
+ * Bits 1-13 are encodings for the shadow types.
+ */
+#define PGC_SH2_type_to_index(_type) ((_type) >> PGC_SH2_type_shift)
+#define SH2F_page_type_mask \
+ (((1u << PGC_SH2_type_to_index(PGC_SH2_max_shadow + 1u)) - 1u) - \
+ ((1u << PGC_SH2_type_to_index(PGC_SH2_min_shadow)) - 1u))
+
+#define SH2F_L1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_32_shadow))
+#define SH2F_FL1_32 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_32_shadow))
+#define SH2F_L2_32 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_32_shadow))
+#define SH2F_L1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l1_pae_shadow))
+#define SH2F_FL1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_pae_shadow))
+#define SH2F_L2_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2_pae_shadow))
+#define SH2F_L2H_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2h_pae_shadow))
+#define SH2F_L3_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l3_pae_shadow))
+#define SH2F_L1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l1_64_shadow))
+#define SH2F_FL1_64 (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_64_shadow))
+#define SH2F_L2_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l2_64_shadow))
+#define SH2F_L3_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l3_64_shadow))
+#define SH2F_L4_64 (1u << PGC_SH2_type_to_index(PGC_SH2_l4_64_shadow))
+
+/* Used for hysteresis when automatically unhooking mappings on fork/exit */
+#define SH2F_unhooked_mappings (1u<<31)
+
+/*
+ * Allocation of shadow pages
+ */
+
+/* Return the minimum acceptable number of shadow pages a domain needs */
+unsigned int shadow2_min_acceptable_pages(struct domain *d);
+
+/* Set the pool of shadow pages to the required number of MB.
+ * Input will be rounded up to at least shadow2_min_acceptable_pages().
+ * Returns 0 for success, 1 for failure. */
+unsigned int shadow2_set_allocation(struct domain *d,
+ unsigned int megabytes,
+ int *preempted);
+
+/* Return the size of the shadow2 pool, rounded up to the nearest MB */
+static inline unsigned int shadow2_get_allocation(struct domain *d)
+{
+ unsigned int pg = d->arch.shadow2_total_pages;
+ return ((pg >> (20 - PAGE_SHIFT))
+ + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
+}
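A worked example of the rounding above (illustration only, assuming 4kB pages so PAGE_SHIFT == 12):

/* 20 - PAGE_SHIFT == 8, i.e. 256 shadow pages per MB.  A pool of 300 pages
 * gives (300 >> 8) == 1 whole MB plus a non-zero remainder (300 & 255 == 44),
 * so shadow2_get_allocation() reports 2 MB. */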
+
+/*
+ * Linked list for chaining entries in the shadow hash table.
+ */
+struct shadow2_hash_entry {
+ struct shadow2_hash_entry *next;
+ mfn_t smfn; /* MFN of the shadow */
+#ifdef __x86_64__ /* Shorten 'n' so we don't waste a whole word on storing 't' */
+ unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */
+#else
+ unsigned long n; /* MFN of guest PT or GFN of guest superpage */
+#endif
+ unsigned char t; /* shadow type bits, or 0 for empty */
+};
+
+#define SHADOW2_HASH_BUCKETS 251
+/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
+
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_CACHE_WALKS
+/* Optimization: cache the results of guest walks. This helps with MMIO
+ * and emulated writes, which tend to issue very similar walk requests
+ * repeatedly. We keep the results of the last few walks, and blow
+ * away the cache on guest cr3 write, mode change, or page fault. */
+
+#define SH2_WALK_CACHE_ENTRIES 4
+
+/* Rather than cache a guest walk, which would include mapped pointers
+ * to pages, we cache what a TLB would remember about the walk: the
+ * permissions and the l1 gfn */
+struct shadow2_walk_cache {
+ unsigned long va; /* The virtual address (or 0 == unused) */
+ unsigned long gfn; /* The gfn from the effective l1e */
+ u32 permissions; /* The aggregated permission bits */
+};
+#endif
+
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Walk another domain's P2M table, mapping pages as we go */
+extern mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
+
+
+/* General conversion function from gfn to mfn */
+static inline mfn_t
+sh2_gfn_to_mfn(struct domain *d, unsigned long gfn)
+{
+ if ( !shadow2_mode_translate(d) )
+ return _mfn(gfn);
+ else if ( likely(current->domain == d) )
+ return _mfn(get_mfn_from_gpfn(gfn));
+ else
+ return sh2_gfn_to_mfn_foreign(d, gfn);
+}
+
+// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty
+// little secret that, for hvm guests with paging disabled, nearly all of the
+// shadow code actually thinks that the guest is running on *untranslated* page
+// tables (which is actually domain->phys_table).
+//
+static inline mfn_t
+sh2_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn)
+{
+ if ( !shadow2_vcpu_mode_translate(v) )
+ return _mfn(gfn);
+ if ( likely(current->domain == v->domain) )
+ return _mfn(get_mfn_from_gpfn(gfn));
+ return sh2_gfn_to_mfn_foreign(v->domain, gfn);
+}
+
+static inline unsigned long
+sh2_mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+ if ( shadow2_mode_translate(d) )
+ return get_gpfn_from_mfn(mfn_x(mfn));
+ else
+ return mfn_x(mfn);
+}
+
+
+
+#endif /* _XEN_SHADOW2_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff --git a/xen/include/asm-x86/shadow_64.h b/xen/include/asm-x86/shadow_64.h
deleted file mode 100644
index d9afbdca18..0000000000
--- a/xen/include/asm-x86/shadow_64.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/******************************************************************************
- * include/asm-x86/shadow_64.h
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-/*
- * Jun Nakajima <jun.nakajima@intel.com>
- * Chengyuan Li <chengyuan.li@intel.com>
- *
- * Extended to support 64-bit guests.
- */
-#ifndef _XEN_SHADOW_64_H
-#define _XEN_SHADOW_64_H
-#include <asm/shadow.h>
-#include <asm/shadow_ops.h>
-#include <asm/hvm/hvm.h>
-
-/*
- * The naming convention of the shadow_ops:
- * MODE_<pgentry size>_<guest paging levels>_HANDLER
- */
-extern struct shadow_ops MODE_64_2_HANDLER;
-extern struct shadow_ops MODE_64_3_HANDLER;
-extern struct shadow_ops MODE_64_PAE_HANDLER;
-#if CONFIG_PAGING_LEVELS == 4
-extern struct shadow_ops MODE_64_4_HANDLER;
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3
-#define L4_PAGETABLE_SHIFT 39
-#define L4_PAGETABLE_ENTRIES (1<<PAGETABLE_ORDER)
-typedef struct { intpte_t l4; } l4_pgentry_t;
-#define is_guest_l4_slot(_s) (1)
-#endif
-
-#define READ_FAULT 0
-#define WRITE_FAULT 1
-
-#define ERROR_P 1
-#define ERROR_W 2
-#define ERROR_U 4
-#define ERROR_I (1 << 4)
-
-#define X86_64_SHADOW_DEBUG 0
-
-#if X86_64_SHADOW_DEBUG
-#define ESH_LOG(_f, _a...) \
- printk(_f, ##_a)
-#else
-#define ESH_LOG(_f, _a...) ((void)0)
-#endif
-
-#define L_MASK 0xff
-
-#define PAE_PAGING_LEVELS 3
-
-#define ROOT_LEVEL_64 PAGING_L4
-#define ROOT_LEVEL_32 PAGING_L2
-
-#define DIRECT_ENTRY (4UL << 16)
-#define SHADOW_ENTRY (2UL << 16)
-#define GUEST_ENTRY (1UL << 16)
-
-#define GET_ENTRY (2UL << 8)
-#define SET_ENTRY (1UL << 8)
-
-#define PAGETABLE_ENTRIES (1<<PAGETABLE_ORDER)
-
-/* For 32-bit VMX guest to allocate shadow L1 & L2*/
-#define SL1_ORDER 1
-#define SL2_ORDER 2
-
-typedef struct { intpte_t lo; } pgentry_64_t;
-#define shadow_level_to_type(l) (l << 29)
-#define shadow_type_to_level(t) (t >> 29)
-
-#define entry_get_value(_x) ((_x).lo)
-#define entry_get_pfn(_x) \
- (((_x).lo & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT)
-#define entry_get_paddr(_x) (((_x).lo & (PADDR_MASK&PAGE_MASK)))
-#define entry_get_flags(_x) (get_pte_flags((_x).lo))
-
-#define entry_empty() ((pgentry_64_t) { 0 })
-#define entry_from_pfn(pfn, flags) \
- ((pgentry_64_t) { ((intpte_t)(pfn) << PAGE_SHIFT) | put_pte_flags(flags) })
-#define entry_from_page(page, flags) (entry_from_pfn(page_to_mfn(page),(flags)))
-#define entry_add_flags(x, flags) ((x).lo |= put_pte_flags(flags))
-#define entry_remove_flags(x, flags) ((x).lo &= ~put_pte_flags(flags))
-#define entry_has_changed(x,y,flags) \
- ( !!(((x).lo ^ (y).lo) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags(flags))) )
-
-/******************************************************************************/
-/*
- * The macro and inlines are for 32-bit PAE guest
- */
-#define PAE_PDPT_RESERVED 0x1e6 /* [8:5], [2,1] */
-
-#define PAE_SHADOW_SELF_ENTRY 259
-#define PAE_L3_PAGETABLE_ENTRIES 4
-
-/******************************************************************************/
-static inline int table_offset_64(unsigned long va, int level)
-{
- switch(level) {
- case 1:
- return (((va) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1));
- case 2:
- return (((va) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1));
- case 3:
- return (((va) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1));
-#if CONFIG_PAGING_LEVELS == 3
- case 4:
- return PAE_SHADOW_SELF_ENTRY;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-#ifndef GUEST_PGENTRY_32
-#ifndef GUEST_32PAE
- case 4:
- return (((va) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1));
-#else
- case 4:
- return PAE_SHADOW_SELF_ENTRY;
-#endif
-#else
- case 4:
- return PAE_SHADOW_SELF_ENTRY;
-#endif
-#endif
- default:
- return -1;
- }
-}
-
-/*****************************************************************************/
-
-#if defined( GUEST_32PAE )
-static inline int guest_table_offset_64(unsigned long va, int level, unsigned int index)
-{
- switch(level) {
- case 1:
- return (((va) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1));
- case 2:
- return (((va) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1));
- case 3:
- return (index * 4 + ((va) >> L3_PAGETABLE_SHIFT));
-#if CONFIG_PAGING_LEVELS == 3
- case 4:
- return PAE_SHADOW_SELF_ENTRY;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-#ifndef GUEST_PGENTRY_32
- case 4:
- return (((va) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1));
-#else
- case 4:
- return PAE_SHADOW_SELF_ENTRY;
-#endif
-#endif
- default:
- return -1;
- }
-}
-
-#define SH_GUEST_32PAE 1
-#else
-#define guest_table_offset_64(va, level, index) \
- table_offset_64((va),(level))
-#define SH_GUEST_32PAE 0
-#endif
-
-/********************************************************************************/
-
-static inline void free_out_of_sync_state(struct domain *d)
-{
- struct out_of_sync_entry *entry;
-
- // NB: Be careful not to call something that manipulates this list
- // while walking it. Remove one item at a time, and always
- // restart from start of list.
- //
- while ( (entry = d->arch.out_of_sync) )
- {
- d->arch.out_of_sync = entry->next;
- release_out_of_sync_entry(d, entry);
-
- entry->next = d->arch.out_of_sync_free;
- d->arch.out_of_sync_free = entry;
- }
-}
-
-static inline int __entry(
- struct vcpu *v, unsigned long va, pgentry_64_t *e_p, u32 flag)
-{
- int i;
- pgentry_64_t *le_e;
- pgentry_64_t *le_p = NULL;
- pgentry_64_t *phys_vtable = NULL;
- unsigned long mfn;
- int index;
- u32 level = flag & L_MASK;
- struct domain *d = v->domain;
- int root_level;
- unsigned int base_idx;
-
- base_idx = get_cr3_idxval(v);
-
- if ( flag & SHADOW_ENTRY )
- {
- root_level = ROOT_LEVEL_64;
- index = table_offset_64(va, root_level);
- le_e = (pgentry_64_t *)&v->arch.shadow_vtable[index];
- }
- else if ( flag & GUEST_ENTRY )
- {
- root_level = v->domain->arch.ops->guest_paging_levels;
- if ( root_level == PAGING_L3 )
- index = guest_table_offset_64(va, PAGING_L3, base_idx);
- else
- index = guest_table_offset_64(va, root_level, base_idx);
- le_e = (pgentry_64_t *)&v->arch.guest_vtable[index];
- }
- else /* direct mode */
- {
- root_level = PAE_PAGING_LEVELS;
- index = table_offset_64(va, root_level);
- phys_vtable = (pgentry_64_t *)map_domain_page(
- pagetable_get_pfn(v->domain->arch.phys_table));
- le_e = &phys_vtable[index];
- }
-
- /*
- * If it's not external mode, then mfn should be machine physical.
- */
- for ( i = root_level - level; i > 0; i-- )
- {
- if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) )
- {
- if ( le_p )
- unmap_domain_page(le_p);
-
- if ( phys_vtable )
- unmap_domain_page(phys_vtable);
-
- return 0;
- }
-
- mfn = entry_get_pfn(*le_e);
- if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) )
- mfn = get_mfn_from_gpfn(mfn);
-
- if ( le_p )
- unmap_domain_page(le_p);
- le_p = (pgentry_64_t *)map_domain_page(mfn);
-
- if ( flag & SHADOW_ENTRY )
- index = table_offset_64(va, (level + i - 1));
- else
- index = guest_table_offset_64(va, (level + i - 1), base_idx);
- le_e = &le_p[index];
- }
-
- if ( flag & SET_ENTRY )
- *le_e = *e_p;
- else
- *e_p = *le_e;
-
- if ( le_p )
- unmap_domain_page(le_p);
-
- if ( phys_vtable )
- unmap_domain_page(phys_vtable);
-
- return 1;
-}
-
-static inline int __rw_entry(
- struct vcpu *v, unsigned long va, void *e_p, u32 flag)
-{
- pgentry_64_t *e = (pgentry_64_t *)e_p;
-
- if (e) {
- return __entry(v, va, e, flag);
- }
-
- return 0;
-}
-
-#define __shadow_set_l4e(v, va, value) \
- __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L4)
-#define __shadow_get_l4e(v, va, sl4e) \
- __rw_entry(v, va, sl4e, SHADOW_ENTRY | GET_ENTRY | PAGING_L4)
-#define __shadow_set_l3e(v, va, value) \
- __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L3)
-#define __shadow_get_l3e(v, va, sl3e) \
- __rw_entry(v, va, sl3e, SHADOW_ENTRY | GET_ENTRY | PAGING_L3)
-#define __shadow_set_l2e(v, va, value) \
- __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L2)
-#define __shadow_get_l2e(v, va, sl2e) \
- __rw_entry(v, va, sl2e, SHADOW_ENTRY | GET_ENTRY | PAGING_L2)
-#define __shadow_set_l1e(v, va, value) \
- __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L1)
-#define __shadow_get_l1e(v, va, sl1e) \
- __rw_entry(v, va, sl1e, SHADOW_ENTRY | GET_ENTRY | PAGING_L1)
-
-#define __guest_set_l4e(v, va, value) \
- __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L4)
-#define __guest_get_l4e(v, va, gl4e) \
- __rw_entry(v, va, gl4e, GUEST_ENTRY | GET_ENTRY | PAGING_L4)
-#define __guest_set_l3e(v, va, value) \
- __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L3)
-#define __guest_get_l3e(v, va, sl3e) \
- __rw_entry(v, va, gl3e, GUEST_ENTRY | GET_ENTRY | PAGING_L3)
-
-#define __direct_set_l3e(v, va, value) \
- __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L3)
-#define __direct_get_l3e(v, va, sl3e) \
- __rw_entry(v, va, sl3e, DIRECT_ENTRY | GET_ENTRY | PAGING_L3)
-#define __direct_set_l2e(v, va, value) \
- __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L2)
-#define __direct_get_l2e(v, va, sl2e) \
- __rw_entry(v, va, sl2e, DIRECT_ENTRY | GET_ENTRY | PAGING_L2)
-#define __direct_set_l1e(v, va, value) \
- __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L1)
-#define __direct_get_l1e(v, va, sl1e) \
- __rw_entry(v, va, sl1e, DIRECT_ENTRY | GET_ENTRY | PAGING_L1)
-
-
-static inline int __guest_set_l2e(
- struct vcpu *v, unsigned long va, void *value, int size)
-{
- switch(size) {
- case 4:
- // 32-bit guest
- {
- l2_pgentry_32_t *l2va;
-
- l2va = (l2_pgentry_32_t *)v->arch.guest_vtable;
- if (value)
- l2va[l2_table_offset_32(va)] = *(l2_pgentry_32_t *)value;
- return 1;
- }
- case 8:
- return __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L2);
- default:
- BUG();
- return 0;
- }
- return 0;
-}
-
-#define __guest_set_l2e(v, va, value) \
- __guest_set_l2e(v, (unsigned long)va, value, sizeof(*value))
-
-static inline int __guest_get_l2e(
- struct vcpu *v, unsigned long va, void *gl2e, int size)
-{
- switch(size) {
- case 4:
- // 32-bit guest
- {
- l2_pgentry_32_t *l2va;
- l2va = (l2_pgentry_32_t *)v->arch.guest_vtable;
- if (gl2e)
- *(l2_pgentry_32_t *)gl2e = l2va[l2_table_offset_32(va)];
- return 1;
- }
- case 8:
- return __rw_entry(v, va, gl2e, GUEST_ENTRY | GET_ENTRY | PAGING_L2);
- default:
- BUG();
- return 0;
- }
- return 0;
-}
-
-#define __guest_get_l2e(v, va, gl2e) \
- __guest_get_l2e(v, (unsigned long)va, gl2e, sizeof(*gl2e))
-
-static inline int __guest_set_l1e(
- struct vcpu *v, unsigned long va, void *value, int size)
-{
- switch(size) {
- case 4:
- // 32-bit guest
- {
- l2_pgentry_32_t gl2e;
- l1_pgentry_32_t *l1va;
- unsigned long l1mfn;
-
- if (!__guest_get_l2e(v, va, &gl2e))
- return 0;
- if (unlikely(!(l2e_get_flags_32(gl2e) & _PAGE_PRESENT)))
- return 0;
-
- l1mfn = get_mfn_from_gpfn(
- l2e_get_pfn(gl2e));
-
- l1va = (l1_pgentry_32_t *)map_domain_page(l1mfn);
- if (value)
- l1va[l1_table_offset_32(va)] = *(l1_pgentry_32_t *)value;
- unmap_domain_page(l1va);
-
- return 1;
- }
-
- case 8:
- return __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L1);
- default:
- BUG();
- return 0;
- }
- return 0;
-}
-
-#define __guest_set_l1e(v, va, value) \
- __guest_set_l1e(v, (unsigned long)va, value, sizeof(*value))
-
-static inline int __guest_get_l1e(
- struct vcpu *v, unsigned long va, void *gl1e, int size)
-{
- switch(size) {
- case 4:
- // 32-bit guest
- {
- l2_pgentry_32_t gl2e;
- l1_pgentry_32_t *l1va;
- unsigned long l1mfn;
-
- if (!(__guest_get_l2e(v, va, &gl2e)))
- return 0;
-
-
- if (unlikely(!(l2e_get_flags_32(gl2e) & _PAGE_PRESENT)))
- return 0;
-
-
- l1mfn = get_mfn_from_gpfn(
- l2e_get_pfn(gl2e));
- l1va = (l1_pgentry_32_t *) map_domain_page(l1mfn);
- if (gl1e)
- *(l1_pgentry_32_t *)gl1e = l1va[l1_table_offset_32(va)];
- unmap_domain_page(l1va);
- return 1;
- }
- case 8:
- // 64-bit guest
- return __rw_entry(v, va, gl1e, GUEST_ENTRY | GET_ENTRY | PAGING_L1);
- default:
- BUG();
- return 0;
- }
- return 0;
-}
-
-#define __guest_get_l1e(v, va, gl1e) \
- __guest_get_l1e(v, (unsigned long)va, gl1e, sizeof(*gl1e))
-
-static inline void entry_general(
- struct domain *d,
- pgentry_64_t *gle_p,
- pgentry_64_t *sle_p,
- unsigned long smfn, u32 level)
-
-{
- pgentry_64_t gle = *gle_p;
- pgentry_64_t sle;
-
- sle = entry_empty();
- if ( (entry_get_flags(gle) & _PAGE_PRESENT) && (smfn != 0) )
- {
- if ((entry_get_flags(gle) & _PAGE_PSE) && level == PAGING_L2) {
- sle = entry_from_pfn(smfn, entry_get_flags(gle));
- entry_remove_flags(sle, _PAGE_PSE);
-
- if ( shadow_mode_log_dirty(d) ||
- !(entry_get_flags(gle) & _PAGE_DIRTY) )
- {
- pgentry_64_t *l1_p;
- int i;
-
- l1_p =(pgentry_64_t *)map_domain_page(smfn);
- for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
- {
- if ( mfn_is_page_table(entry_get_pfn(l1_p[i])) )
- entry_remove_flags(l1_p[i], _PAGE_RW);
- }
-
- unmap_domain_page(l1_p);
- }
- } else {
- if (d->arch.ops->guest_paging_levels <= PAGING_L3
- && level == PAGING_L3) {
- sle = entry_from_pfn(smfn, entry_get_flags(gle));
- } else {
-
- sle = entry_from_pfn(
- smfn,
- (entry_get_flags(gle) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL);
- entry_add_flags(gle, _PAGE_ACCESSED);
- }
- }
- // XXX mafetter: Hmm...
- // Shouldn't the dirty log be checked/updated here?
- // Actually, it needs to be done in this function's callers.
- //
- *gle_p = gle;
- }
-
- if ( entry_get_value(sle) || entry_get_value(gle) )
- SH_VVLOG("%s: gpde=%lx, new spde=%lx", __func__,
- entry_get_value(gle), entry_get_value(sle));
-
- *sle_p = sle;
-}
-
-static inline void entry_propagate_from_guest(
- struct domain *d, pgentry_64_t *gle_p, pgentry_64_t *sle_p, u32 level)
-{
- pgentry_64_t gle = *gle_p;
- unsigned long smfn = 0;
-
- if ( entry_get_flags(gle) & _PAGE_PRESENT ) {
- if ((entry_get_flags(gle) & _PAGE_PSE) && level == PAGING_L2) {
- smfn = __shadow_status(d, entry_get_pfn(gle), PGT_fl1_shadow);
- } else {
- smfn = __shadow_status(d, entry_get_pfn(gle),
- shadow_level_to_type((level -1 )));
- }
- }
- entry_general(d, gle_p, sle_p, smfn, level);
-
-}
-
-static int inline
-validate_entry_change(
- struct domain *d,
- pgentry_64_t *new_gle_p,
- pgentry_64_t *shadow_le_p,
- u32 level)
-{
- pgentry_64_t old_sle, new_sle;
- pgentry_64_t new_gle = *new_gle_p;
-
- old_sle = *shadow_le_p;
- entry_propagate_from_guest(d, &new_gle, &new_sle, level);
-
- ESH_LOG("old_sle: %lx, new_gle: %lx, new_sle: %lx\n",
- entry_get_value(old_sle), entry_get_value(new_gle),
- entry_get_value(new_sle));
-
- if ( ((entry_get_value(old_sle) | entry_get_value(new_sle)) & _PAGE_PRESENT) &&
- entry_has_changed(old_sle, new_sle, _PAGE_PRESENT) )
- {
- perfc_incrc(validate_entry_changes);
-
- if ( (entry_get_flags(new_sle) & _PAGE_PRESENT) &&
- !get_shadow_ref(entry_get_pfn(new_sle)) )
- BUG();
- if ( entry_get_flags(old_sle) & _PAGE_PRESENT )
- put_shadow_ref(entry_get_pfn(old_sle));
- }
-
- *shadow_le_p = new_sle;
-
- return 1;
-}
-
-#endif
-
-
diff --git a/xen/include/asm-x86/shadow_ops.h b/xen/include/asm-x86/shadow_ops.h
deleted file mode 100644
index 8765ed8b10..0000000000
--- a/xen/include/asm-x86/shadow_ops.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/******************************************************************************
- * include/asm-x86/shadow_ops.h
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _XEN_SHADOW_OPS_H
-#define _XEN_SHADOW_OPS_H
-
-#define PAGING_L4 4UL
-#define PAGING_L3 3UL
-#define PAGING_L2 2UL
-#define PAGING_L1 1UL
-
-#define PAE_CR3_ALIGN 5
-#define PAE_CR3_IDX_MASK 0x7f
-
-#if defined( GUEST_PGENTRY_32 )
-
-#define GUEST_L1_PAGETABLE_ENTRIES L1_PAGETABLE_ENTRIES_32
-#define GUEST_L2_PAGETABLE_ENTRIES L2_PAGETABLE_ENTRIES_32
-#define GUEST_ROOT_PAGETABLE_ENTRIES ROOT_PAGETABLE_ENTRIES_32
-#define GUEST_L2_PAGETABLE_SHIFT L2_PAGETABLE_SHIFT_32
-
-#define guest_l1_pgentry_t l1_pgentry_32_t
-#define guest_l2_pgentry_t l2_pgentry_32_t
-#define guest_root_pgentry_t l2_pgentry_32_t
-
-#define guest_l1e_get_paddr l1e_get_paddr_32
-#define guest_l2e_get_paddr l2e_get_paddr_32
-
-#define guest_get_pte_flags get_pte_flags_32
-#define guest_put_pte_flags put_pte_flags_32
-
-#define guest_l1e_get_flags l1e_get_flags_32
-#define guest_l2e_get_flags l2e_get_flags_32
-#define guest_root_get_flags l2e_get_flags_32
-#define guest_root_get_intpte l2e_get_intpte
-
-#define guest_l1e_empty l1e_empty_32
-#define guest_l2e_empty l2e_empty_32
-
-#define guest_l1e_from_pfn l1e_from_pfn_32
-#define guest_l2e_from_pfn l2e_from_pfn_32
-
-#define guest_l1e_from_paddr l1e_from_paddr_32
-#define guest_l2e_from_paddr l2e_from_paddr_32
-
-#define guest_l1e_from_page l1e_from_page_32
-#define guest_l2e_from_page l2e_from_page_32
-
-#define guest_l1e_add_flags l1e_add_flags_32
-#define guest_l2e_add_flags l2e_add_flags_32
-
-#define guest_l1e_remove_flag l1e_remove_flags_32
-#define guest_l2e_remove_flag l2e_remove_flags_32
-
-#define guest_l1e_has_changed l1e_has_changed_32
-#define guest_l2e_has_changed l2e_has_changed_32
-#define root_entry_has_changed l2e_has_changed_32
-
-#define guest_l1_table_offset l1_table_offset_32
-#define guest_l2_table_offset l2_table_offset_32
-
-#define guest_linear_l1_table linear_pg_table_32
-#define guest_linear_l2_table linear_l2_table_32
-
-#define guest_va_to_l1mfn va_to_l1mfn_32
-
-#else
-
-#define GUEST_L1_PAGETABLE_ENTRIES L1_PAGETABLE_ENTRIES
-#define GUEST_L2_PAGETABLE_ENTRIES L2_PAGETABLE_ENTRIES
-#define GUEST_ROOT_PAGETABLE_ENTRIES ROOT_PAGETABLE_ENTRIES
-#define GUEST_L2_PAGETABLE_SHIFT L2_PAGETABLE_SHIFT
-
-#define guest_l1_pgentry_t l1_pgentry_t
-#define guest_l2_pgentry_t l2_pgentry_t
-#define guest_root_pgentry_t l4_pgentry_t
-
-#define guest_l1e_get_paddr l1e_get_paddr
-#define guest_l2e_get_paddr l2e_get_paddr
-
-#define guest_get_pte_flags get_pte_flags
-#define guest_put_pte_flags put_pte_flags
-
-#define guest_l1e_get_flags l1e_get_flags
-#define guest_l2e_get_flags l2e_get_flags
-#define guest_root_get_flags l4e_get_flags
-#define guest_root_get_intpte l4e_get_intpte
-
-#define guest_l1e_empty l1e_empty
-#define guest_l2e_empty l2e_empty
-
-#define guest_l1e_from_pfn l1e_from_pfn
-#define guest_l2e_from_pfn l2e_from_pfn
-
-#define guest_l1e_from_paddr l1e_from_paddr
-#define guest_l2e_from_paddr l2e_from_paddr
-
-#define guest_l1e_from_page l1e_from_page
-#define guest_l2e_from_page l2e_from_page
-
-#define guest_l1e_add_flags l1e_add_flags
-#define guest_l2e_add_flags l2e_add_flags
-
-#define guest_l1e_remove_flag l1e_remove_flags
-#define guest_l2e_remove_flag l2e_remove_flags
-
-#define guest_l1e_has_changed l1e_has_changed
-#define guest_l2e_has_changed l2e_has_changed
-#define root_entry_has_changed l4e_has_changed
-
-#define guest_l1_table_offset l1_table_offset
-#define guest_l2_table_offset l2_table_offset
-
-#define guest_linear_l1_table linear_pg_table
-#define guest_linear_l2_table linear_l2_table
-
-#define guest_va_to_l1mfn va_to_l1mfn
-#endif
-
-#endif /* _XEN_SHADOW_OPS_H */
diff --git a/xen/include/asm-x86/shadow_public.h b/xen/include/asm-x86/shadow_public.h
deleted file mode 100644
index e2b4b5fd57..0000000000
--- a/xen/include/asm-x86/shadow_public.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/******************************************************************************
- * include/asm-x86/shadow_public.h
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _XEN_SHADOW_PUBLIC_H
-#define _XEN_SHADOW_PUBLIC_H
-
-#if CONFIG_PAGING_LEVELS >= 3
-#define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
-
-extern void shadow_sync_and_drop_references(
- struct domain *d, struct page_info *page);
-extern void shadow_drop_references(
- struct domain *d, struct page_info *page);
-
-extern int shadow_set_guest_paging_levels(struct domain *d, int levels);
-
-extern void release_out_of_sync_entry(
- struct domain *d, struct out_of_sync_entry *entry);
-
-struct shadow_ops {
- unsigned long guest_paging_levels; /* guest paging levels */
- void (*invlpg)(struct vcpu *v, unsigned long va);
- int (*fault)(unsigned long va, struct cpu_user_regs *regs);
- void (*update_pagetables)(struct vcpu *v);
- void (*sync_all)(struct domain *d);
- int (*remove_all_write_access)(struct domain *d,
- unsigned long readonly_gpfn, unsigned long readonly_gmfn);
- int (*do_update_va_mapping)(unsigned long va, l1_pgentry_t val, struct vcpu *v);
- struct out_of_sync_entry *
- (*mark_mfn_out_of_sync)(struct vcpu *v, unsigned long gpfn,
- unsigned long mfn);
- int (*is_out_of_sync)(struct vcpu *v, unsigned long va);
- unsigned long (*gva_to_gpa)(unsigned long gva);
-};
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-extern void shadow_l4_normal_pt_update(struct domain *d,
- unsigned long pa, l4_pgentry_t l4e,
- struct domain_mmap_cache *cache);
-#endif
-
-#endif
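
struct shadow_ops, deleted with this header, was a per-guest-paging-level function table: the old code selected a vector once (via shadow_set_guest_paging_levels()) and dispatched all shadow work through it. A sketch of that dispatch pattern, assuming the vector hangs off the domain's arch state; the field name below is an assumption, not taken from this patch:

/* Illustrative dispatch through the removed ops table.  'd->arch.ops' is
 * an assumed location for the vector chosen at guest setup time. */
static int dispatch_shadow_fault(struct vcpu *v, unsigned long va,
                                 struct cpu_user_regs *regs)
{
    struct shadow_ops *ops = v->domain->arch.ops;  /* assumed field */
    return ops->fault(va, regs);
}
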
diff --git a/xen/include/asm-x86/x86_32/page-2level.h b/xen/include/asm-x86/x86_32/page-2level.h
index 764b1c2c05..7f450c4624 100644
--- a/xen/include/asm-x86/x86_32/page-2level.h
+++ b/xen/include/asm-x86/x86_32/page-2level.h
@@ -46,6 +46,7 @@ typedef l2_pgentry_t root_pgentry_t;
* 12-bit flags = (pte[11:0])
*/
+#define _PAGE_NX_BIT 0U
#define _PAGE_NX 0U
/* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
diff --git a/xen/include/asm-x86/x86_32/page-3level.h b/xen/include/asm-x86/x86_32/page-3level.h
index 43e73033e3..e0187478cc 100644
--- a/xen/include/asm-x86/x86_32/page-3level.h
+++ b/xen/include/asm-x86/x86_32/page-3level.h
@@ -59,7 +59,8 @@ typedef l3_pgentry_t root_pgentry_t;
* 32-bit flags = (pte[63:44],pte[11:0])
*/
-#define _PAGE_NX (cpu_has_nx ? (1<<31) : 0)
+#define _PAGE_NX_BIT (1U<<31)
+#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0)
/* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */
#define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
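
Splitting _PAGE_NX_BIT out of _PAGE_NX lets code name the raw bit position even when cpu_has_nx is false and _PAGE_NX therefore evaluates to 0. A sketch of the kind of use this enables; the helper is illustrative, not part of the patch:

/* Illustrative helper: clear the NX bit from guest-supplied flags before
 * they reach hardware without NX support.  _PAGE_NX_BIT names the bit
 * unconditionally; _PAGE_NX is 0 on non-NX CPUs. */
static inline unsigned int strip_nx_if_unsupported(unsigned int flags)
{
    if ( !cpu_has_nx )
        flags &= ~_PAGE_NX_BIT;
    return flags;
}
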
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 0afb5e719b..429cfb8c5d 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -44,6 +44,8 @@ typedef l4_pgentry_t root_pgentry_t;
/* Given a virtual address, get an entry offset into a linear page table. */
#define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT)
#define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT)
+#define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT)
+#define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT)
#define is_guest_l1_slot(_s) (1)
#define is_guest_l2_slot(_t, _s) (1)
@@ -70,7 +72,8 @@ typedef l4_pgentry_t root_pgentry_t;
#define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF))
/* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
-#define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U)
+#define _PAGE_NX_BIT (1U<<23)
+#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0U)
#define L1_DISALLOW_MASK BASE_DISALLOW_MASK
#define L2_DISALLOW_MASK BASE_DISALLOW_MASK
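
l3_linear_offset() and l4_linear_offset() complete the set of linear-pagetable index helpers for 64-bit. A brief sketch of their use; 'linear_l3_table' stands for a linear mapping of the L3 tables and is an assumed name, not something this patch defines:

/* Illustrative lookup of the L3 entry covering 'va' through an assumed
 * linear mapping of the page tables. */
l3_pgentry_t l3e = linear_l3_table[l3_linear_offset(va)];
int present = l3e_get_flags(l3e) & _PAGE_PRESENT;
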
diff --git a/xen/include/public/dom0_ops.h b/xen/include/public/dom0_ops.h
index d211ca1624..f12cc93108 100644
--- a/xen/include/public/dom0_ops.h
+++ b/xen/include/public/dom0_ops.h
@@ -262,6 +262,18 @@ DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t);
#define DOM0_SHADOW_CONTROL_OP_CLEAN 11
#define DOM0_SHADOW_CONTROL_OP_PEEK 12
+/* Shadow2 operations */
+#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION 30
+#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION 31
+#define DOM0_SHADOW2_CONTROL_OP_ENABLE 32
+
+/* Mode flags for Shadow2 enable op */
+#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE (1 << 0)
+#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT (1 << 1)
+#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2)
+#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3)
+#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL (1 << 4)
+
struct dom0_shadow_control_stats {
uint32_t fault_count;
uint32_t dirty_count;
@@ -277,7 +289,9 @@ struct dom0_shadow_control {
uint32_t op;
XEN_GUEST_HANDLE(ulong) dirty_bitmap;
/* IN/OUT variables. */
- uint64_t pages; /* size of buffer, updated with actual size */
+ uint64_t pages; /* size of buffer, updated with actual size */
+ uint32_t mb; /* Shadow2 memory allocation in MB */
+ uint32_t mode; /* Shadow2 mode to enable */
/* OUT variables. */
struct dom0_shadow_control_stats stats;
};
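
The DOM0_SHADOW2_CONTROL_* values extend the existing shadow-control sub-op space (from 30 upwards) rather than reusing the old 1-12 codes, and the struct gains 'mb' and 'mode' fields to carry them. A sketch of the payload a toolstack might build to enable shadow2 log-dirty tracking; the surrounding dom0_op wrapping and hypercall issue are omitted:

/* Illustrative payload for DOM0_SHADOW2_CONTROL_OP_ENABLE in log-dirty
 * mode; only fields touched by this patch are shown. */
struct dom0_shadow_control sc = {
    .op   = DOM0_SHADOW2_CONTROL_OP_ENABLE,
    .mode = DOM0_SHADOW2_CONTROL_FLAG_ENABLE |
            DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT |
            DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY,
};
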
diff --git a/xen/include/xen/domain_page.h b/xen/include/xen/domain_page.h
index 03d7af5f0f..2a51fcbacb 100644
--- a/xen/include/xen/domain_page.h
+++ b/xen/include/xen/domain_page.h
@@ -26,6 +26,13 @@ extern void *map_domain_page(unsigned long pfn);
*/
extern void unmap_domain_page(void *va);
+/*
+ * Convert a VA (within a page previously mapped in the context of the
+ * currently-executing VCPU via a call to map_domain_page()) to a machine
+ * address.
+ */
+extern paddr_t mapped_domain_page_to_maddr(void *va);
+
/*
* Similar to the above calls, except the mapping is accessible in all
* address spaces (not just within the VCPU that created the mapping). Global
@@ -98,6 +105,7 @@ domain_mmap_cache_destroy(struct domain_mmap_cache *cache)
#define map_domain_page(pfn) maddr_to_virt((pfn)<<PAGE_SHIFT)
#define unmap_domain_page(va) ((void)(va))
+#define mapped_domain_page_to_maddr(va) (virt_to_maddr(va))
#define map_domain_page_global(pfn) maddr_to_virt((pfn)<<PAGE_SHIFT)
#define unmap_domain_page_global(va) ((void)(va))
@@ -112,4 +120,9 @@ struct domain_mmap_cache {
#endif /* !CONFIG_DOMAIN_PAGE */
+#define HERE_I_AM \
+do { \
+ printk("HERE I AM: %s %s %d\n", __func__, __FILE__, __LINE__); \
+} while (0)
+
#endif /* __XEN_DOMAIN_PAGE_H__ */
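
mapped_domain_page_to_maddr() reverses a transient map_domain_page() mapping back to a machine address, and collapses to virt_to_maddr() when CONFIG_DOMAIN_PAGE is not set; HERE_I_AM is a throwaway trace macro that prints the function, file and line each time a path is hit. A minimal usage sketch for the former; 'pfn' and 'offset' are illustrative values:

/* Illustrative round trip: map a frame, take the machine address of a
 * byte within it, then drop the mapping again. */
void *va = map_domain_page(pfn);
paddr_t maddr = mapped_domain_page_to_maddr(va) + offset;
unmap_domain_page(va);
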
diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
index e2c67a1d46..e7d84afd92 100644
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -18,7 +18,7 @@ extern void __bug(char *file, int line) __attribute__((noreturn));
#ifndef NDEBUG
#define ASSERT(_p) \
do { \
- if ( !(_p) ) \
+ if ( unlikely(!(_p)) ) \
{ \
printk("Assertion '%s' failed, line %d, file %s\n", #_p , \
__LINE__, __FILE__); \
@@ -41,7 +41,7 @@ struct domain;
void cmdline_parse(char *cmdline);
#ifndef NDEBUG
-extern int debugtrace_send_to_console;
+extern void debugtrace_toggle(void);
extern void debugtrace_dump(void);
extern void debugtrace_printk(const char *fmt, ...);
#else
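
Wrapping the ASSERT predicate in unlikely() marks the failure branch as cold, so debug builds keep the common path straight-line; the debugtrace change replaces direct pokes at the exported debugtrace_send_to_console variable with a debugtrace_toggle() call (presumably flipping the same switch). Callers of ASSERT are unaffected, as in this illustrative check:

/* Illustrative ASSERT use; with unlikely(), the printk/BUG path is laid
 * out off the hot path in debug builds. */
ASSERT(mfn_valid(mfn));   /* 'mfn' is an illustrative local */
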
diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h
index 66cdfc814b..5072d0b924 100644
--- a/xen/include/xen/list.h
+++ b/xen/include/xen/list.h
@@ -162,6 +162,16 @@ static __inline__ void list_splice(struct list_head *list, struct list_head *hea
pos = n, n = pos->next)
/**
+ * list_for_each_backwards_safe - iterate backwards over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop counter.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_backwards_safe(pos, n, head) \
+ for (pos = (head)->prev, n = pos->prev; pos != (head); \
+ pos = n, n = pos->prev)
+
+/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop counter.
* @head: the head for your list.
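
list_for_each_backwards_safe() mirrors list_for_each_safe() but walks from tail to head, caching ->prev so the current node may be unlinked mid-loop. A usage sketch; the entry type and list head below are hypothetical:

/* Illustrative tail-to-head teardown; safe because 'tmp' already holds
 * the previous node before 'pos' is unlinked. */
struct my_item { struct list_head link; };          /* hypothetical entry type */
struct list_head *pos, *tmp;
list_for_each_backwards_safe(pos, tmp, &my_list)    /* hypothetical list head */
{
    struct my_item *it = list_entry(pos, struct my_item, link);
    list_del(pos);
    xfree(it);
}
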
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index c37e60f23a..d90b27adc7 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -376,9 +376,12 @@ extern struct domain *domain_list;
/* VCPU is paused by the hypervisor? */
#define _VCPUF_paused 11
#define VCPUF_paused (1UL<<_VCPUF_paused)
- /* VCPU is blocked awaiting an event to be consumed by Xen. */
+/* VCPU is blocked awaiting an event to be consumed by Xen. */
#define _VCPUF_blocked_in_xen 12
#define VCPUF_blocked_in_xen (1UL<<_VCPUF_blocked_in_xen)
+ /* HVM vcpu thinks CR0.PG == 0 */
+#define _VCPUF_shadow2_translate 13
+#define VCPUF_shadow2_translate (1UL<<_VCPUF_shadow2_translate)
/*
* Per-domain flags (domain_flags).