Diffstat (limited to 'xen/arch/x86/shadow2.c')
-rw-r--r--  xen/arch/x86/shadow2.c  4469
1 files changed, 4469 insertions, 0 deletions
diff --git a/xen/arch/x86/shadow2.c b/xen/arch/x86/shadow2.c
new file mode 100644
index 0000000000..9d845cb797
--- /dev/null
+++ b/xen/arch/x86/shadow2.c
@@ -0,0 +1,4469 @@
+/******************************************************************************
+ * arch/x86/shadow2.c
+ *
+ * Simple, mostly-synchronous shadow page tables.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+// backpointer
+// - independent shadows of 32-byte chunks make it non-obvious how to quickly
+// figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+// space for both PV and HVM guests.
+//
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+#include <asm/shadow2-types.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode).
+ *
+ * THINGS TO DO LATER:
+ *
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE. Should return a gfn instead.
+ *
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's
+ * shadows. When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory, we want to drop those shadows.  Start by treating
+ * a page that is shadowed in two modes as a hint, but beware of clever
+ * tricks like reusing a pagetable for both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps. Add appropriate unmap_l*e calls in the users.
+ * Then we can test the speed difference made by linear maps. If the
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
+ * to share l2h pages again.
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an
+ * entry in it, and every time we change CR3. We copy it for the linear
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3. Maybe we can avoid some of this recopying
+ * by using the shadow directly in some places.
+ * Also, for SMP, need to actually respond to seeing shadow2_pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable. Should coalesce the flushes to the end,
+ * and if we do flush, re-do the walk. If anything has changed, then
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to
+ * deal with shadows made in one mode and used in another.
+ */
+
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND 2
+#define FETCH_TYPE_WRITE 4
+typedef enum {
+ ft_prefetch = FETCH_TYPE_PREFETCH,
+ ft_demand_read = FETCH_TYPE_DEMAND,
+ ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
+
+#ifndef NDEBUG
+static char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
+#endif
+static inline void sh2_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
+ * shadow L1 which maps its "splinters".
+ * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ * PAE L3 info page for that CR3 value.
+ */
+
+static inline mfn_t
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look for FL1 shadows in the hash table */
+{
+ mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn),
+ PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH2_log_dirty) )
+ shadow2_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline mfn_t
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look for shadows in the hash table */
+{
+ mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn),
+ shadow_type >> PGC_SH2_type_shift);
+ perfc_incrc(shadow2_get_shadow_status);
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH2_log_dirty) )
+ shadow2_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline void
+set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Put an FL1 shadow into the hash table */
+{
+ SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+ gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+ if ( unlikely(shadow2_mode_log_dirty(v->domain)) )
+ // mark this shadow as a log dirty shadow...
+ set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+ else
+ clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+
+ shadow2_hash_insert(v, gfn_x(gfn),
+ PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
+}
+
+static inline void
+set_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Put a shadow into the hash table */
+{
+ struct domain *d = v->domain;
+ int res;
+
+ SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+ d->domain_id, v->vcpu_id, mfn_x(gmfn),
+ shadow_type, mfn_x(smfn));
+
+ if ( unlikely(shadow2_mode_log_dirty(d)) )
+ // mark this shadow as a log dirty shadow...
+ set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+ else
+ clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+
+ res = get_page(mfn_to_page(gmfn), d);
+ ASSERT(res == 1);
+
+ shadow2_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH2_type_shift,
+ smfn);
+}
+
+static inline void
+delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Remove a shadow from the hash table */
+{
+ SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+ gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+ shadow2_hash_delete(v, gfn_x(gfn),
+ PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
+}
+
+static inline void
+delete_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Remove a shadow from the hash table */
+{
+ SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id,
+ mfn_x(gmfn), shadow_type, mfn_x(smfn));
+ shadow2_hash_delete(v, mfn_x(gmfn),
+ shadow_type >> PGC_SH2_type_shift, smfn);
+ put_page(mfn_to_page(gmfn));
+}
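+
+#if 0
+/* Illustrative usage sketch (not built, and not part of the interface):
+ * how the lookup and insert halves pair up for FL1 shadows, which are keyed
+ * by the superpage's *gfn* rather than by an mfn.  demand_fl1_shadow() and
+ * sh2_make_fl1_shadow() are made-up names standing in for the real
+ * allocation path. */
+static mfn_t demand_fl1_shadow(struct vcpu *v, gfn_t gfn)
+{
+    mfn_t smfn = get_fl1_shadow_status(v, gfn);
+    if ( !valid_mfn(smfn) )
+    {
+        smfn = sh2_make_fl1_shadow(v, gfn);   /* hypothetical allocator */
+        set_fl1_shadow_status(v, gfn, smfn);
+    }
+    return smfn;
+}
+#endif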
+
+
+/**************************************************************************/
+/* Functions for walking the guest page tables */
+
+
+/* Walk the guest pagetables, filling the walk_t with what we see.
+ * Takes an uninitialised walk_t. The caller must call unmap_walk()
+ * on the walk_t before discarding it or calling guest_walk_tables again.
+ * If "guest_op" is non-zero, we are serving a genuine guest memory access,
+ * and must (a) be under the shadow2 lock, and (b) remove write access
+ * from any guest PT pages we see, as we will be using their contents to
+ * perform shadow updates.
+ * Returns 0 for success or non-zero if the guest pagetables are malformed.
+ * N.B. Finding a not-present entry does not cause a non-zero return code. */
+static inline int
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
+{
+ ASSERT(!guest_op || shadow2_lock_is_acquired(v->domain));
+
+ perfc_incrc(shadow2_guest_walk);
+ memset(gw, 0, sizeof(*gw));
+ gw->va = va;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ /* Get l4e from the top level table */
+ gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
+ /* Walk down to the l3e */
+ if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
+ gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
+ if ( !valid_mfn(gw->l3mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op && shadow2_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l3e = ((guest_l3e_t *)sh2_map_domain_page(gw->l3mfn))
+ + guest_l3_table_offset(va);
+#else /* PAE only... */
+ /* Get l3e from the top level table */
+ gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
+#endif /* PAE or 64... */
+ /* Walk down to the l2e */
+ if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
+ gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
+ if ( !valid_mfn(gw->l2mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op && shadow2_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l2e = ((guest_l2e_t *)sh2_map_domain_page(gw->l2mfn))
+ + guest_l2_table_offset(va);
+#else /* 32-bit only... */
+ /* Get l2e from the top level table */
+ gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
+#endif /* All levels... */
+
+ if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
+ if ( guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
+ {
+ /* Special case: this guest VA is in a PSE superpage, so there's
+ * no guest l1e. We make one up so that the propagation code
+ * can generate a shadow l1 table. Start with the gfn of the
+ * first 4k-page of the superpage. */
+ gfn_t start = guest_l2e_get_gfn(*gw->l2e);
+ /* Grant full access in the l1e, since all the guest entry's
+ * access controls are enforced in the shadow l2e. This lets
+ * us reflect l2 changes later without touching the l1s. */
+ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+ _PAGE_ACCESSED|_PAGE_DIRTY);
+ /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+ * of the level 1 */
+ if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
+ flags |= _PAGE_PAT;
+ /* Increment the pfn by the right number of 4k pages.
+ * The ~0x1 is to mask out the PAT bit mentioned above. */
+ start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+ gw->eff_l1e = guest_l1e_from_gfn(start, flags);
+ gw->l1e = NULL;
+ gw->l1mfn = _mfn(INVALID_MFN);
+ }
+ else
+ {
+ /* Not a superpage: carry on and find the l1e. */
+ gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
+ if ( !valid_mfn(gw->l1mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op
+ && shadow2_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l1e = ((guest_l1e_t *)sh2_map_domain_page(gw->l1mfn))
+ + guest_l1_table_offset(va);
+ gw->eff_l1e = *gw->l1e;
+ }
+
+ return 0;
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+ return _gfn(INVALID_GFN);
+ return guest_l1e_get_gfn(gw->eff_l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+ return 0;
+ return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
+}
+
+
+/* Unmap (and reinitialise) a guest walk.
+ * Call this to dispose of any walk filled in by guest_walk_tables() */
+static void unmap_walk(struct vcpu *v, walk_t *gw)
+{
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ if ( gw->l3e != NULL ) sh2_unmap_domain_page(gw->l3e);
+#endif
+ if ( gw->l2e != NULL ) sh2_unmap_domain_page(gw->l2e);
+#endif
+ if ( gw->l1e != NULL ) sh2_unmap_domain_page(gw->l1e);
+#ifdef DEBUG
+ memset(gw, 0, sizeof(*gw));
+#endif
+}
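+
+#if 0
+/* Illustrative usage sketch (not built): the intended walk/unmap
+ * discipline.  Every guest_walk_tables() call is paired with an
+ * unmap_walk() before the walk_t is discarded or reused; unmap_walk() is
+ * safe to call even if the walk stopped early.  example_va_to_gpa() is a
+ * made-up name. */
+static paddr_t example_va_to_gpa(struct vcpu *v, unsigned long va)
+{
+    walk_t gw;
+    paddr_t gpa = 0;
+
+    if ( guest_walk_tables(v, va, &gw, 0 /* not a guest op */) == 0 )
+        gpa = guest_walk_to_gpa(&gw);
+    unmap_walk(v, &gw);
+    return gpa;
+}
+#endif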
+
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+ SHADOW2_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ SHADOW2_PRINTK(" l4mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l4mfn));
+ SHADOW2_PRINTK(" l4e=%p\n", gw->l4e);
+ if ( gw->l4e )
+ SHADOW2_PRINTK(" *l4e=%" SH2_PRI_gpte "\n", gw->l4e->l4);
+#endif /* PAE or 64... */
+ SHADOW2_PRINTK(" l3mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l3mfn));
+ SHADOW2_PRINTK(" l3e=%p\n", gw->l3e);
+ if ( gw->l3e )
+ SHADOW2_PRINTK(" *l3e=%" SH2_PRI_gpte "\n", gw->l3e->l3);
+#endif /* All levels... */
+ SHADOW2_PRINTK(" l2mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l2mfn));
+ SHADOW2_PRINTK(" l2e=%p\n", gw->l2e);
+ if ( gw->l2e )
+ SHADOW2_PRINTK(" *l2e=%" SH2_PRI_gpte "\n", gw->l2e->l2);
+ SHADOW2_PRINTK(" l1mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l1mfn));
+ SHADOW2_PRINTK(" l1e=%p\n", gw->l1e);
+ if ( gw->l1e )
+ SHADOW2_PRINTK(" *l1e=%" SH2_PRI_gpte "\n", gw->l1e->l1);
+ SHADOW2_PRINTK(" eff_l1e=%" SH2_PRI_gpte "\n", gw->eff_l1e.l1);
+}
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+/* Lightweight audit: pass all the shadows associated with this guest walk
+ * through the audit mechanisms */
+static void sh2_audit_gw(struct vcpu *v, walk_t *gw)
+{
+ mfn_t smfn;
+
+ if ( !(SHADOW2_AUDIT_ENABLE) )
+ return;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ if ( valid_mfn(gw->l4mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn,
+ PGC_SH2_l4_shadow))) )
+ (void) sh2_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* PAE or 64... */
+ if ( valid_mfn(gw->l3mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn,
+ PGC_SH2_l3_shadow))) )
+ (void) sh2_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* All levels... */
+ if ( valid_mfn(gw->l2mfn) )
+ {
+ if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+ PGC_SH2_l2_shadow))) )
+ (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#if GUEST_PAGING_LEVELS == 3
+ if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+ PGC_SH2_l2h_shadow))) )
+ (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#endif
+ }
+ if ( valid_mfn(gw->l1mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn,
+ PGC_SH2_l1_shadow))) )
+ (void) sh2_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
+ else if ( gw->l2e
+ && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
+ && valid_mfn(
+ (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
+ (void) sh2_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
+}
+
+#else
+#define sh2_audit_gw(_v, _gw) do {} while(0)
+#endif /* audit code */
+
+
+
+/**************************************************************************/
+/* Function to write to the guest tables, for propagating accessed and
+ * dirty bits from the shadow to the guest.
+ * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
+ * and an operation type. The guest entry is always passed as an l1e:
+ * since we only ever write flags, that's OK.
+ * Returns the new flag bits of the guest entry. */
+
+static u32 guest_set_ad_bits(struct vcpu *v,
+ mfn_t gmfn,
+ guest_l1e_t *ep,
+ unsigned int level,
+ fetch_type_t ft)
+{
+ u32 flags, shflags, bit;
+ struct page_info *pg;
+ int res = 0;
+
+ ASSERT(valid_mfn(gmfn)
+ && (sh2_mfn_is_a_page_table(gmfn)
+ || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
+ == 0)));
+ ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
+ ASSERT(level <= GUEST_PAGING_LEVELS);
+ ASSERT(ft == ft_demand_read || ft == ft_demand_write);
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ flags = guest_l1e_get_flags(*ep);
+
+ /* PAE l3s do not have A and D bits */
+ if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
+ return flags;
+
+ /* Need the D bit as well for writes, in l1es and PSE l2es. */
+ if ( ft == ft_demand_write
+ && (level == 1 || (level == 2 && (flags & _PAGE_PSE))) )
+ {
+ if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
+ == (_PAGE_DIRTY | _PAGE_ACCESSED) )
+ return flags; /* Guest already has A and D bits set */
+ flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
+ perfc_incrc(shadow2_ad_update);
+ }
+ else
+ {
+ if ( flags & _PAGE_ACCESSED )
+ return flags; /* Guest already has A bit set */
+ flags |= _PAGE_ACCESSED;
+ perfc_incrc(shadow2_a_update);
+ }
+
+ /* Set the bit(s) */
+ sh2_mark_dirty(v->domain, gmfn);
+ SHADOW2_DEBUG(A_AND_D, "gfn = %"SH2_PRI_gfn", "
+ "old flags = %#x, new flags = %#x\n",
+ guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
+ *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
+
+ /* May need to propagate this change forward to other kinds of shadow */
+ pg = mfn_to_page(gmfn);
+ if ( !sh2_mfn_is_a_page_table(gmfn) )
+ {
+ /* This guest pagetable is not yet shadowed at all. */
+ // MAF: I think this assert is busted... If this gmfn has not yet
+ // been promoted, then it seems perfectly reasonable for there to be
+ // outstanding type refs to it...
+ /* TJD: No. If the gmfn has not been promoted, we must at least
+ * have recognised that it is a pagetable, and pulled write access.
+ * The type count should only be non-zero if it is actually a page
+ * table. The test above was incorrect, though, so I've fixed it. */
+ ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
+ return flags;
+ }
+
+ shflags = pg->shadow2_flags & SH2F_page_type_mask;
+ while ( shflags )
+ {
+ bit = find_first_set_bit(shflags);
+ ASSERT(shflags & (1u << bit));
+ shflags &= ~(1u << bit);
+ if ( !(pg->shadow2_flags & (1u << bit)) )
+ continue;
+ switch ( bit )
+ {
+ case PGC_SH2_type_to_index(PGC_SH2_l1_shadow):
+ if (level != 1)
+ res |= sh2_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
+ break;
+ case PGC_SH2_type_to_index(PGC_SH2_l2_shadow):
+ if (level != 2)
+ res |= sh2_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
+ break;
+#if GUEST_PAGING_LEVELS == 3 /* PAE only */
+ case PGC_SH2_type_to_index(PGC_SH2_l2h_shadow):
+ if (level != 2)
+ res |= sh2_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
+ break;
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+ case PGC_SH2_type_to_index(PGC_SH2_l3_shadow):
+ if (level != 3)
+ res |= sh2_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
+ break;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ case PGC_SH2_type_to_index(PGC_SH2_l4_shadow):
+ if (level != 4)
+ res |= sh2_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
+ break;
+#endif
+#endif
+ default:
+ SHADOW2_ERROR("mfn %"SH2_PRI_mfn" is shadowed in multiple "
+ "modes: A&D bits may be out of sync (flags=%#x).\n",
+ mfn_x(gmfn), pg->shadow2_flags);
+ /* XXX Shadows in other modes will not be updated, so will
+ * have their A and D bits out of sync. */
+ }
+ }
+
+ /* We should never need to flush the TLB or recopy PAE entries */
+ ASSERT( res == 0 || res == SHADOW2_SET_CHANGED );
+ return flags;
+}
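+
+/* Example (illustrative): a demand write (ft_demand_write, level 1) through
+ * a guest l1e with _PAGE_ACCESSED set but _PAGE_DIRTY clear does not return
+ * early above; both A and D are ORed into the guest entry, the frame is
+ * marked dirty for log-dirty mode, and any shadows of this guest page at
+ * other levels are revalidated so their copies of the entry stay in sync. */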
+
+/**************************************************************************/
+/* Functions to compute the correct index into a shadow page, given an
+ * index into the guest page (as returned by guest_get_index()).
+ * This is trivial when the shadow and guest use the same sized PTEs, but
+ * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
+ * PAE- or 64-bit shadows).
+ *
+ * These functions also increment the shadow mfn, when necessary. When PTE
+ * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
+ * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
+ * use simple pointer arithmetic on a pointer to the guest L1e to figure out
+ * which shadow page we really want. Similarly, when PTE sizes are
+ * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
+ * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
+ * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
+ * space.)
+ *
+ * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
+ * of shadow (to store both the shadow, and the info that would normally be
+ * stored in page_info fields). This arrangement allows the shadow and the
+ * "page_info" fields to always be stored in the same page (in fact, in
+ * the same cache line), avoiding an extra call to map_domain_page().
+ */
+
+static inline u32
+guest_index(void *ptr)
+{
+ return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
+}
+
+static inline u32
+shadow_l1_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
+ return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
+#else
+ return guest_index;
+#endif
+}
+
+static inline u32
+shadow_l2_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+ // Because we use 2 shadow l2 entries for each guest entry, the number of
+ // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
+ //
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+    // We multiply by two to get the index of the first of the two entries
+ // used to shadow the specified guest entry.
+ return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
+#else
+ return guest_index;
+#endif
+}
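+
+/* Worked example (illustrative, assuming 512-entry PAE/64-bit shadow
+ * tables): for a 32-bit guest, guest l1 index 700 lands on the second page
+ * of the two-page shadow l1 (700 / 512 == 1) at shadow index 700 % 512 ==
+ * 188, while guest l2 index 700 lands on the third page of the four-page
+ * shadow l2 (700 / 256 == 2) at shadow index (700 % 256) * 2 == 376, the
+ * first of the pair of shadow l2es for that guest entry. */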
+
+#if GUEST_PAGING_LEVELS >= 3
+
+static inline u32
+shadow_l3_index(mfn_t *smfn, u32 guest_index)
+{
+#if GUEST_PAGING_LEVELS == 3
+ u32 group_id;
+
+ // Because we use twice the space in L3 shadows as was consumed in guest
+ // L3s, the number of guest entries per shadow page is
+ // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not*
+ // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
+ //
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+ // We store PAE L3 shadows in groups of 4, alternating shadows and
+ // pae_l3_bookkeeping structs. So the effective shadow index is
+    // the group_id * 8 + the offset within the group.
+ //
+ guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
+ group_id = guest_index / 4;
+ return (group_id * 8) + (guest_index % 4);
+#else
+ return guest_index;
+#endif
+}
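+
+/* Worked example (illustrative, assuming 8-byte PAE entries): a guest L3
+ * table at byte offset 0x160 of its page has guest_index 0x160 / 8 == 44;
+ * 44 / 256 == 0 keeps us on the first shadow page, group_id == 44 / 4 == 11,
+ * so entry 0 of that table lands at shadow index 11 * 8 + 0 == 88, i.e.
+ * byte offset 0x2c0: the 32-byte subshadow followed by its 32-byte
+ * bookkeeping slot. */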
+
+#endif // GUEST_PAGING_LEVELS >= 3
+
+#if GUEST_PAGING_LEVELS >= 4
+
+static inline u32
+shadow_l4_index(mfn_t *smfn, u32 guest_index)
+{
+ return guest_index;
+}
+
+#endif // GUEST_PAGING_LEVELS >= 4
+
+
+/**************************************************************************/
+/* Functions which compute shadow entries from their corresponding guest
+ * entries.
+ *
+ * These are the "heart" of the shadow code.
+ *
+ * There are two sets of these: those that are called on demand faults (read
+ * faults and write faults), and those that are essentially called to
+ * "prefetch" (or propagate) entries from the guest into the shadow. The read
+ * fault and write fault are handled as two separate cases for L1 entries (due
+ * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
+ * into the respective demand_fault functions.
+ */
+
+#define CHECK(_cond) \
+do { \
+ if (unlikely(!(_cond))) \
+ { \
+ printk("%s %s %d ASSERTION (%s) FAILED\n", \
+ __func__, __FILE__, __LINE__, #_cond); \
+ return -1; \
+ } \
+} while (0)
+
+// The function below tries to capture all of the flag manipulation for the
+// demand and propagate functions into one place.
+//
+static always_inline u32
+sh2_propagate_flags(struct vcpu *v, mfn_t target_mfn,
+ u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn,
+ int mmio, int level, fetch_type_t ft)
+{
+ struct domain *d = v->domain;
+ u32 pass_thru_flags;
+ u32 sflags;
+ int lowest_level_guest_mapping;
+
+ // XXX -- might want to think about PAT support for HVM guests...
+
+#ifndef NDEBUG
+ // MMIO can only occur from L1e's
+ //
+ if ( mmio )
+ CHECK(level == 1);
+
+ // We should always have a pointer to the guest entry if it's a non-PSE
+ // non-MMIO demand access.
+ if ( ft & FETCH_TYPE_DEMAND )
+ CHECK(guest_entry_ptr || level == 1);
+#endif
+
+ // A not-present guest entry has a special signature in the shadow table,
+ // so that we do not have to consult the guest tables multiple times...
+ //
+ if ( unlikely(!(gflags & _PAGE_PRESENT)) )
+ return _PAGE_SHADOW_GUEST_NOT_PRESENT;
+
+ // Must have a valid target_mfn, unless this is mmio, or unless this is a
+ // prefetch. In the case of a prefetch, an invalid mfn means that we can
+ // not usefully shadow anything, and so we return early.
+ //
+ if ( !valid_mfn(target_mfn) )
+ {
+ CHECK((ft == ft_prefetch) || mmio);
+ if ( !mmio )
+ return 0;
+ }
+
+ // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
+ //
+ if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
+ pass_thru_flags = _PAGE_PRESENT;
+ else
+ {
+ pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
+ _PAGE_RW | _PAGE_PRESENT);
+ if ( guest_supports_nx(v) )
+ pass_thru_flags |= _PAGE_NX_BIT;
+ }
+
+ // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
+ // L3e's; they are all implied. So we emulate them here.
+ //
+ if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
+ gflags = pass_thru_flags;
+
+ // Propagate bits from the guest to the shadow.
+ // Some of these may be overwritten, below.
+ // Since we know the guest's PRESENT bit is set, we also set the shadow's
+ // SHADOW_PRESENT bit.
+ //
+ sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
+
+ // Copy the guest's RW bit into the SHADOW_RW bit.
+ //
+ if ( gflags & _PAGE_RW )
+ sflags |= _PAGE_SHADOW_RW;
+
+ // Set the A&D bits for higher level shadows.
+ // Higher level entries do not, strictly speaking, have dirty bits, but
+ // since we use shadow linear tables, each of these entries may, at some
+ // point in time, also serve as a shadow L1 entry.
+ // By setting both the A&D bits in each of these, we eliminate the burden
+ // on the hardware to update these bits on initial accesses.
+ //
+ if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
+ sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
+
+ lowest_level_guest_mapping =
+ ((level == 1) ||
+ ((level == 2) && guest_supports_superpages(v) &&
+ (gflags & _PAGE_PSE)));
+
+ // Set the A and D bits in the guest entry, if we need to.
+ if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
+ gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
+
+ // If the A or D bit has not yet been set in the guest, then we must
+ // prevent the corresponding kind of access.
+ //
+ if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
+ !(gflags & _PAGE_ACCESSED)) )
+ sflags &= ~_PAGE_PRESENT;
+
+ if ( unlikely(lowest_level_guest_mapping &&
+ !(gflags & _PAGE_DIRTY)) )
+ sflags &= ~_PAGE_RW;
+
+ // MMIO caching
+ //
+ // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
+ // to cache the fact that this entry is in MMIO space.
+ //
+ if ( (level == 1) && mmio )
+ {
+ sflags &= ~(_PAGE_PRESENT);
+ sflags |= _PAGE_SHADOW_MMIO;
+ }
+ else
+ {
+ // shadow2_mode_log_dirty support
+ //
+ // Only allow the guest write access to a page a) on a demand fault,
+ // or b) if the page is already marked as dirty.
+ //
+ if ( unlikely((level == 1) &&
+ !(ft & FETCH_TYPE_WRITE) &&
+ shadow2_mode_log_dirty(d) &&
+ !sh2_mfn_is_dirty(d, target_mfn)) )
+ {
+ sflags &= ~_PAGE_RW;
+ }
+
+ // protect guest page tables
+ //
+ if ( unlikely((level == 1) &&
+ sh2_mfn_is_a_page_table(target_mfn)) )
+ {
+ if ( shadow2_mode_trap_reads(d) )
+ {
+ // if we are trapping both reads & writes, then mark this page
+ // as not present...
+ //
+ sflags &= ~_PAGE_PRESENT;
+ }
+ else
+ {
+ // otherwise, just prevent any writes...
+ //
+ sflags &= ~_PAGE_RW;
+ }
+ }
+ }
+
+ return sflags;
+}
+
+#undef CHECK
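+
+/* Example (illustrative): on a prefetch (ft_prefetch) of a guest l1e with
+ * _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED set but _PAGE_DIRTY
+ * clear, the shadow l1e gets the pass-through bits plus
+ * _PAGE_SHADOW_PRESENT and _PAGE_SHADOW_RW, but has _PAGE_RW stripped
+ * because D is not yet set.  The guest's first write then faults,
+ * l1e_write_fault() propagates with ft_demand_write, guest_set_ad_bits()
+ * sets D, and the replacement shadow entry is writeable. */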
+
+#if GUEST_PAGING_LEVELS >= 4
+static void
+l4e_propagate_from_guest(struct vcpu *v,
+ guest_l4e_t *gl4e,
+ mfn_t gl4mfn,
+ mfn_t sl3mfn,
+ shadow_l4e_t *sl4p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l4e_get_flags(*gl4e);
+ u32 sflags = sh2_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
+ gl4mfn, 0, 4, ft);
+
+ *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "%s gl4e=%" SH2_PRI_gpte " sl4e=%" SH2_PRI_pte "\n",
+ fetch_type_names[ft], gl4e->l4, sl4p->l4);
+ ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static void
+l3e_propagate_from_guest(struct vcpu *v,
+ guest_l3e_t *gl3e,
+ mfn_t gl3mfn,
+ mfn_t sl2mfn,
+ shadow_l3e_t *sl3p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l3e_get_flags(*gl3e);
+ u32 sflags = sh2_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
+ gl3mfn, 0, 3, ft);
+
+ *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "%s gl3e=%" SH2_PRI_gpte " sl3e=%" SH2_PRI_pte "\n",
+ fetch_type_names[ft], gl3e->l3, sl3p->l3);
+ ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static void
+l2e_propagate_from_guest(struct vcpu *v,
+ guest_l2e_t *gl2e,
+ mfn_t gl2mfn,
+ mfn_t sl1mfn,
+ shadow_l2e_t *sl2p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l2e_get_flags(*gl2e);
+ u32 sflags = sh2_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e,
+ gl2mfn, 0, 2, ft);
+
+ *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "%s gl2e=%" SH2_PRI_gpte " sl2e=%" SH2_PRI_pte "\n",
+ fetch_type_names[ft], gl2e->l2, sl2p->l2);
+ ASSERT(sflags != -1);
+}
+
+static inline int
+l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+ int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+ struct domain *d = v->domain;
+ u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+ u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+ mmio, 1, ft_demand_read);
+
+ if ( shadow2_mode_trap_reads(d) && !mmio && sh2_mfn_is_a_page_table(gmfn) )
+ {
+ // emulation required!
+ *sl1p = shadow_l1e_empty();
+ return 1;
+ }
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+ (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+ return 0;
+}
+
+static inline int
+l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+ int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+ struct domain *d = v->domain;
+ u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+ u32 sflags = sh2_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+ mmio, 1, ft_demand_write);
+
+ sh2_mark_dirty(d, gmfn);
+
+ if ( !mmio && sh2_mfn_is_a_page_table(gmfn) )
+ {
+ // emulation required!
+ *sl1p = shadow_l1e_empty();
+ return 1;
+ }
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+ (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+ return 0;
+}
+
+static inline void
+l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
+ int mmio)
+{
+ gfn_t gfn = guest_l1e_get_gfn(gl1e);
+ mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
+ u32 gflags = guest_l1e_get_flags(gl1e);
+ u32 sflags = sh2_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN),
+ mmio, 1, ft_prefetch);
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW2_DEBUG(PROPAGATE,
+ "gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+ gl1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+}
+
+
+/**************************************************************************/
+/* These functions update shadow entries (and do bookkeeping on the shadow
+ * tables they are in). It is intended that they are the only
+ * functions which ever write (non-zero) data onto a shadow page.
+ *
+ * They return a set of flags:
+ * SHADOW2_SET_CHANGED -- we actually wrote a new value to the shadow.
+ * SHADOW2_SET_FLUSH -- the caller must cause a TLB flush.
+ * SHADOW2_SET_ERROR -- the input is not a valid entry (for example, if
+ * shadow2_get_page_from_l1e() fails).
+ * SHADOW2_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
+ * copies of their PAE L3 entries re-copied.
+ */
+
+static inline void safe_write_entry(void *dst, void *src)
+/* Copy one PTE safely when processors might be running on the
+ * destination pagetable. This does *not* give safety against
+ * concurrent writes (that's what the shadow lock is for), just
+ * stops the hardware picking up partially written entries. */
+{
+ volatile unsigned long *d = dst;
+ unsigned long *s = src;
+ ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
+#if CONFIG_PAGING_LEVELS == 3
+ /* In PAE mode, pagetable entries are larger
+ * than machine words, so won't get written atomically. We need to make
+ * sure any other cpu running on these shadows doesn't see a
+ * half-written entry. Do this by marking the entry not-present first,
+ * then writing the high word before the low word. */
+ BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
+ d[0] = 0;
+ d[1] = s[1];
+ d[0] = s[0];
+#else
+ /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
+ * which will be an atomic write, since the entry is aligned. */
+ BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
+ *d = *s;
+#endif
+}
+
+
+static inline void
+shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+/* This function does the actual writes to shadow pages.
+ * It must not be called directly, since it doesn't do the bookkeeping
+ * that shadow_set_l*e() functions do. */
+{
+ shadow_l1e_t *dst = d;
+ shadow_l1e_t *src = s;
+ void *map = NULL;
+ int i;
+
+ /* Because we mirror access rights at all levels in the shadow, an
+ * l2 (or higher) entry with the RW bit cleared will leave us with
+ * no write access through the linear map.
+ * We detect that by writing to the shadow with copy_to_user() and
+ * using map_domain_page() to get a writeable mapping if we need to. */
+ if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
+ {
+ perfc_incrc(shadow2_linear_map_failed);
+ map = sh2_map_domain_page(mfn);
+ ASSERT(map != NULL);
+ dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
+ }
+
+
+ for ( i = 0; i < entries; i++ )
+ safe_write_entry(dst++, src++);
+
+ if ( map != NULL ) sh2_unmap_domain_page(map);
+
+ /* XXX TODO:
+ * Update min/max field in page_info struct of this mfn */
+}
+
+static inline int
+perms_strictly_increased(u32 old_flags, u32 new_flags)
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones? */
+{
+ u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+ u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+ /* Flip the NX bit, since it's the only one that decreases rights;
+ * we calculate as if it were an "X" bit. */
+ of ^= _PAGE_NX_BIT;
+ nf ^= _PAGE_NX_BIT;
+ /* If the changed bits are all set in the new flags, then rights strictly
+ * increased between old and new. */
+ return ((of | (of ^ nf)) == nf);
+}
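+
+/* Examples (illustrative): PRESENT -> PRESENT|RW is a strict increase;
+ * PRESENT|RW -> PRESENT|RW|NX is not, since gaining NX takes away execute
+ * rights; and PRESENT|USER -> PRESENT|RW is not either, since USER is lost
+ * even though RW is gained.  The shadow_set_l*e() functions below use this
+ * to decide when a TLB flush can safely be skipped. */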
+
+static inline int
+shadow2_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+{
+ int res;
+ mfn_t mfn;
+ struct domain *owner;
+ shadow_l1e_t sanitized_sl1e =
+ shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
+
+ //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
+ //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
+
+ if ( !shadow2_mode_refcounts(d) )
+ return 1;
+
+ res = get_page_from_l1e(sanitized_sl1e, d);
+
+ // If a privileged domain is attempting to install a map of a page it does
+ // not own, we let it succeed anyway.
+ //
+ if ( unlikely(!res) &&
+ IS_PRIV(d) &&
+ !shadow2_mode_translate(d) &&
+ valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
+ (owner = page_get_owner(mfn_to_page(mfn))) &&
+ (d != owner) )
+ {
+ res = get_page_from_l1e(sanitized_sl1e, owner);
+ SHADOW2_PRINTK("privileged domain %d installs map of mfn %05lx "
+ "which is owned by domain %d: %s\n",
+ d->domain_id, mfn_x(mfn), owner->domain_id,
+ res ? "success" : "failed");
+ }
+
+ if ( unlikely(!res) )
+ {
+ perfc_incrc(shadow2_get_page_fail);
+        SHADOW2_PRINTK("failed: l1e=%" SH2_PRI_pte "\n", sl1e.l1);
+ }
+
+ return res;
+}
+
+static inline void
+shadow2_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+{
+ if ( !shadow2_mode_refcounts(d) )
+ return;
+
+ put_page_from_l1e(sl1e, d);
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+static int shadow_set_l4e(struct vcpu *v,
+ shadow_l4e_t *sl4e,
+ shadow_l4e_t new_sl4e,
+ mfn_t sl4mfn)
+{
+ int flags = 0;
+ shadow_l4e_t old_sl4e;
+ paddr_t paddr;
+ ASSERT(sl4e != NULL);
+ old_sl4e = *sl4e;
+
+ if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl4e) & ~PAGE_MASK));
+
+ if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh2_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+ flags |= SHADOW2_SET_CHANGED;
+
+ if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
+ if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
+ || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
+ shadow_l4e_get_flags(new_sl4e)) )
+ {
+ flags |= SHADOW2_SET_FLUSH;
+ }
+ sh2_put_ref(v, osl3mfn, paddr);
+ }
+ return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if GUEST_PAGING_LEVELS >= 3
+static int shadow_set_l3e(struct vcpu *v,
+ shadow_l3e_t *sl3e,
+ shadow_l3e_t new_sl3e,
+ mfn_t sl3mfn)
+{
+ int flags = 0;
+ shadow_l3e_t old_sl3e;
+ paddr_t paddr;
+ ASSERT(sl3e != NULL);
+ old_sl3e = *sl3e;
+
+ if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl3e) & ~PAGE_MASK));
+
+ if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh2_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+ flags |= SHADOW2_SET_CHANGED;
+
+#if GUEST_PAGING_LEVELS == 3
+    /* We wrote an entry of a PAE l3 shadow.  This table is copied in
+ * the linear pagetable entries of its l2s, and may also be copied
+ * to a low memory location to make it fit in CR3. Report that we
+ * need to resync those copies (we can't wait for the guest to flush
+ * the TLB because it might be an increase in rights). */
+ {
+ struct vcpu *vcpu;
+
+ struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
+ for_each_vcpu(v->domain, vcpu)
+ {
+ if (info->vcpus & (1 << vcpu->vcpu_id))
+ {
+ // Remember that this flip/update needs to occur.
+ vcpu->arch.shadow2_pae_flip_pending = 1;
+ flags |= SHADOW2_SET_L3PAE_RECOPY;
+ }
+ }
+ }
+#endif
+
+ if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
+ if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
+ !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
+ shadow_l3e_get_flags(new_sl3e)) )
+ {
+ flags |= SHADOW2_SET_FLUSH;
+ }
+ sh2_put_ref(v, osl2mfn, paddr);
+ }
+ return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static int shadow_set_l2e(struct vcpu *v,
+ shadow_l2e_t *sl2e,
+ shadow_l2e_t new_sl2e,
+ mfn_t sl2mfn)
+{
+ int flags = 0;
+ shadow_l2e_t old_sl2e;
+ paddr_t paddr;
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+ /* In 2-on-3 we work with pairs of l2es pointing at two-page
+ * shadows. Reference counting and up-pointers track from the first
+ * page of the shadow to the first l2e, so make sure that we're
+ * working with those:
+ * Align the pointer down so it's pointing at the first of the pair */
+ sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
+ /* Align the mfn of the shadow entry too */
+ new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
+#endif
+
+ ASSERT(sl2e != NULL);
+ old_sl2e = *sl2e;
+
+ if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl2e) & ~PAGE_MASK));
+
+ if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh2_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
+ }
+
+ /* Write the new entry */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+ {
+ shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
+        /* The l1 shadow is two pages long and needs to be pointed to by
+         * two adjacent l2es.  The pair have the same flags, but point
+ * at odd and even MFNs */
+ ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
+ pair[1].l2 |= (1<<PAGE_SHIFT);
+ shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+ }
+#else /* normal case */
+ shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+#endif
+ flags |= SHADOW2_SET_CHANGED;
+
+ if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
+ if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
+ !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
+ shadow_l2e_get_flags(new_sl2e)) )
+ {
+ flags |= SHADOW2_SET_FLUSH;
+ }
+ sh2_put_ref(v, osl1mfn, paddr);
+ }
+ return flags;
+}
+
+static int shadow_set_l1e(struct vcpu *v,
+ shadow_l1e_t *sl1e,
+ shadow_l1e_t new_sl1e,
+ mfn_t sl1mfn)
+{
+ int flags = 0;
+ struct domain *d = v->domain;
+ shadow_l1e_t old_sl1e;
+ ASSERT(sl1e != NULL);
+
+ old_sl1e = *sl1e;
+
+ if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
+
+ if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ if ( shadow2_mode_refcounts(d) ) {
+ if ( shadow2_get_page_from_l1e(new_sl1e, d) == 0 )
+ {
+ /* Doesn't look like a pagetable. */
+ flags |= SHADOW2_SET_ERROR;
+ new_sl1e = shadow_l1e_empty();
+ }
+ }
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+ flags |= SHADOW2_SET_CHANGED;
+
+ if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ /* N.B. Unlike higher-level sets, never need an extra flush
+ * when writing an l1e. Because it points to the same guest frame
+ * as the guest l1e did, it's the guest's responsibility to
+ * trigger a flush later. */
+ if ( shadow2_mode_refcounts(d) )
+ {
+ shadow2_put_page_from_l1e(old_sl1e, d);
+ }
+ }
+ return flags;
+}
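+
+#if 0
+/* Illustrative calling-convention sketch (not built): callers are expected
+ * to act on the SHADOW2_SET_* flags returned above.  example_write_sl2e()
+ * is a made-up name. */
+static void example_write_sl2e(struct vcpu *v, shadow_l2e_t *sl2p,
+                               shadow_l2e_t new_sl2e, mfn_t sl2mfn)
+{
+    int flags = shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
+    if ( flags & SHADOW2_SET_FLUSH )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+    /* SHADOW2_SET_ERROR (from l1 writes) and SHADOW2_SET_L3PAE_RECOPY (from
+     * PAE l3 writes) are handled similarly by their respective callers. */
+}
+#endif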
+
+
+/**************************************************************************/
+/* These functions take a vcpu and a virtual address, and return a pointer
+ * to the appropriate level N entry from the shadow tables.
+ * If the necessary tables are not present in the shadow, they return NULL. */
+
+/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
+ * more levels than the guest, the upper levels are always fixed and do not
+ * reflect any information from the guest, so we do not use these functions
+ * to access them. */
+
+#if GUEST_PAGING_LEVELS >= 4
+static shadow_l4e_t *
+shadow_get_l4e(struct vcpu *v, unsigned long va)
+{
+ /* Reading the top level table is always valid. */
+ return sh2_linear_l4_table(v) + shadow_l4_linear_offset(va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+static shadow_l3e_t *
+shadow_get_l3e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+ /* Get the l4 */
+ shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
+ ASSERT(sl4e != NULL);
+ if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
+ /* l4 was present; OK to get the l3 */
+ return sh2_linear_l3_table(v) + shadow_l3_linear_offset(va);
+#else /* PAE... */
+ /* Top level is always mapped */
+ ASSERT(v->arch.shadow_vtable);
+ return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
+#endif
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+static shadow_l2e_t *
+shadow_get_l2e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... */
+ /* Get the l3 */
+ shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
+ if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
+ /* l3 was present; OK to get the l2 */
+#endif
+ return sh2_linear_l2_table(v) + shadow_l2_linear_offset(va);
+}
+
+
+#if 0 // avoid the compiler warning for now...
+
+static shadow_l1e_t *
+shadow_get_l1e(struct vcpu *v, unsigned long va)
+{
+ /* Get the l2 */
+ shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
+ if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
+ /* l2 was present; OK to get the l1 */
+ return sh2_linear_l1_table(v) + shadow_l1_linear_offset(va);
+}
+
+#endif
+
+
+/**************************************************************************/
+/* Macros to walk pagetables. These take the shadow of a pagetable and
+ * walk every "interesting" entry. That is, they don't touch Xen mappings,
+ * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
+ * second entry (since pairs of entries are managed together). For multi-page
+ * shadows they walk all pages.
+ *
+ * Arguments are an MFN, the variable to point to each entry, a variable
+ * to indicate that we are done (we will shortcut to the end of the scan
+ * when _done != 0), a variable to indicate that we should avoid Xen mappings,
+ * and the code.
+ *
+ * WARNING: These macros have side-effects. They change the values of both
+ * the pointer and the MFN. */
+
+static inline void increment_ptr_to_guest_entry(void *ptr)
+{
+ if ( ptr )
+ {
+ guest_l1e_t **entry = ptr;
+ (*entry)++;
+ }
+}
+
+/* All kinds of l1: touch all entries */
+#define _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+do { \
+ int _i; \
+ shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
+ ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l1_shadow \
+ || (mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_fl1_shadow); \
+ for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl1e) = _sp + _i; \
+ if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl1p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+do { \
+ int __done = 0; \
+ _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
+ ({ (__done = _done); }), _code); \
+ _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
+ if ( !__done ) \
+ _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
+ ({ (__done = _done); }), _code); \
+} while (0)
+#else /* Everything else; l1 shadows are only one page */
+#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+ _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
+#endif
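+
+#if 0
+/* Illustrative usage sketch (not built): clear every present entry in one
+ * shadow l1, dropping references as we go.  example_blank_sl1() is a
+ * made-up name; note that the macro may advance sl1mfn onto the second
+ * page of a two-page shadow, which is what shadow_set_l1e() expects. */
+static void example_blank_sl1(struct vcpu *v, mfn_t sl1mfn)
+{
+    shadow_l1e_t *sl1e;
+    SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0,
+    {
+        (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+    });
+}
+#endif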
+
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+
+/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i, _j, __done = 0; \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_32_shadow); \
+ for ( _j = 0; _j < 4 && !__done; _j++ ) \
+ { \
+ shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
+ if ( (!(_xen)) \
+ || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
+ < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( (__done = (_done)) ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+ _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
+ } \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 2
+
+/* 32-bit on 32-bit: avoid Xen entries */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_32_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ if ( (!(_xen)) \
+ || \
+ (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 3
+
+/* PAE: if it's an l2h, don't touch Xen mappings */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_pae_shadow \
+ || (mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2h_pae_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ if ( (!(_xen)) \
+ || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ != PGC_SH2_l2h_pae_shadow) \
+ || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
+ < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#else
+
+/* 64-bit l2: touch all entries */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l2_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#endif /* different kinds of l2 */
+
+#if GUEST_PAGING_LEVELS == 3
+
+/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */
+#define SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \
+do { \
+ int _i; \
+ for ( _i = 0; _i < 4; _i++ ) \
+ { \
+ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ _sl3e++; \
+ increment_ptr_to_guest_entry(_gl3p); \
+ } \
+} while (0)
+
+/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */
+#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
+do { \
+ int _i, _j, _k, __done = 0; \
+ ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l3_pae_shadow); \
+ /* The subshadows are split, 64 on each page of the shadow */ \
+ for ( _j = 0; _j < 2 && !__done; _j++ ) \
+ { \
+ void *_sp = sh2_map_domain_page(_sl3mfn); \
+ for ( _i = 0; _i < 64; _i++ ) \
+ { \
+ /* Every second 32-byte region is a bookkeeping entry */ \
+ _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \
+ if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \
+ SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, \
+ ({ __done = (_done); __done; }), \
+ _code); \
+ else \
+ for ( _k = 0 ; _k < 4 ; _k++ ) \
+ increment_ptr_to_guest_entry(_gl3p); \
+ if ( __done ) break; \
+ } \
+ sh2_unmap_domain_page(_sp); \
+ _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \
+ } \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 4
+
+/* 64-bit l3: touch all entries */
+#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
+do { \
+ int _i; \
+ shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
+ ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l3_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl3e) = _sp + _i; \
+ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl3p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+/* 64-bit l4: avoid Xen mappings */
+#define SHADOW2_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
+ ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH2_type_mask) \
+ == PGC_SH2_l4_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
+ { \
+ (_sl4e) = _sp + _i; \
+ if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ } \
+ increment_ptr_to_guest_entry(_gl4p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#endif
+
+
+
+/**************************************************************************/
+/* Functions to install Xen mappings and linear mappings in shadow pages */
+
+static mfn_t sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
+
+// XXX -- this function should probably be moved to shadow2-common.c, but that
+// probably wants to wait until the shadow types have been moved from
+// shadow2-types.h to shadow2-private.h
+//
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
+{
+ struct domain *d = v->domain;
+ shadow_l4e_t *sl4e;
+
+ sl4e = sh2_map_domain_page(sl4mfn);
+ ASSERT(sl4e != NULL);
+ ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+ &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+ ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
+ shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
+ __PAGE_HYPERVISOR);
+
+ /* Linear mapping */
+ sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
+ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
+
+ if ( shadow2_mode_translate(v->domain) )
+ {
+ /* install domain-specific P2M table */
+ sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
+ shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+ __PAGE_HYPERVISOR);
+ }
+
+ sh2_unmap_domain_page(sl4e);
+}
+#endif
+
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+// For 3-on-3 PV guests, we need to make sure the xen mappings are in
+// place, which means that we need to populate the l2h entry in the l3
+// table.
+
+void sh2_install_xen_entries_in_l2h(struct vcpu *v,
+ mfn_t sl2hmfn)
+{
+ struct domain *d = v->domain;
+ shadow_l2e_t *sl2e;
+ int i;
+
+ sl2e = sh2_map_domain_page(sl2hmfn);
+ ASSERT(sl2e != NULL);
+ ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
+ &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
+ L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ shadow_l2e_from_mfn(
+ page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+ __PAGE_HYPERVISOR);
+
+ /* We don't set up a linear mapping here because we can't until this
+ * l2h is installed in an l3e. sh2_update_linear_entries() handles
+ * the linear mappings when the l3 is loaded. */
+
+ if ( shadow2_mode_translate(d) )
+ {
+ /* Install the domain-specific p2m table */
+ l3_pgentry_t *p2m;
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+ p2m = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+ for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
+ {
+ sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
+ shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
+ __PAGE_HYPERVISOR);
+ }
+ sh2_unmap_domain_page(p2m);
+ }
+
+ sh2_unmap_domain_page(sl2e);
+}
+
+void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
+{
+ shadow_l3e_t *sl3e;
+ guest_l3e_t *gl3e = v->arch.guest_vtable;
+ shadow_l3e_t new_sl3e;
+ gfn_t l2gfn;
+ mfn_t l2gmfn, l2smfn;
+ int r;
+
+ ASSERT(!shadow2_mode_external(v->domain));
+ ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
+ l2gfn = guest_l3e_get_gfn(gl3e[3]);
+ l2gmfn = sh2_gfn_to_mfn(v->domain, gfn_x(l2gfn));
+ l2smfn = get_shadow_status(v, l2gmfn, PGC_SH2_l2h_shadow);
+ if ( !valid_mfn(l2smfn) )
+ {
+ l2smfn = sh2_make_shadow(v, l2gmfn, PGC_SH2_l2h_shadow);
+ }
+ l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
+ ft_prefetch);
+ sl3e = sh2_map_domain_page(sl3mfn);
+ r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
+ sh2_unmap_domain_page(sl3e);
+}
+#endif
+
+
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
+{
+ struct domain *d = v->domain;
+ shadow_l2e_t *sl2e;
+ int i;
+
+ sl2e = sh2_map_domain_page(sl2mfn);
+ ASSERT(sl2e != NULL);
+ ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
+ &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
+ L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ shadow_l2e_from_mfn(
+ page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+ __PAGE_HYPERVISOR);
+
+ /* Linear mapping */
+ sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
+ sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
+
+ if ( shadow2_mode_translate(d) )
+ {
+ /* install domain-specific P2M table */
+ sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
+ shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+ __PAGE_HYPERVISOR);
+ }
+
+ sh2_unmap_domain_page(sl2e);
+}
+#endif
+
+
+
+
+
+/**************************************************************************/
+/* Create a shadow of a given guest page.
+ */
+static mfn_t
+sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+{
+ mfn_t smfn = shadow2_alloc(v->domain, shadow_type, mfn_x(gmfn));
+ SHADOW2_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
+ mfn_x(gmfn), shadow_type, mfn_x(smfn));
+
+ if ( shadow_type != PGC_SH2_guest_root_type )
+        /* Lower-level shadow, not yet linked from a higher level */
+ mfn_to_page(smfn)->up = 0;
+
+ // Create the Xen mappings...
+ if ( !shadow2_mode_external(v->domain) )
+ {
+ switch (shadow_type)
+ {
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+ case PGC_SH2_l4_shadow:
+ sh2_install_xen_entries_in_l4(v, gmfn, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+ case PGC_SH2_l3_shadow:
+ sh2_install_xen_entries_in_l3(v, gmfn, smfn); break;
+ case PGC_SH2_l2h_shadow:
+ sh2_install_xen_entries_in_l2h(v, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+ case PGC_SH2_l2_shadow:
+ sh2_install_xen_entries_in_l2(v, gmfn, smfn); break;
+#endif
+ default: /* Do nothing */ break;
+ }
+ }
+
+ shadow2_promote(v, gmfn, shadow_type);
+ set_shadow2_status(v, gmfn, shadow_type, smfn);
+
+ return smfn;
+}
+
+/* Make a splintered superpage shadow */
+static mfn_t
+make_fl1_shadow(struct vcpu *v, gfn_t gfn)
+{
+ mfn_t smfn = shadow2_alloc(v->domain, PGC_SH2_fl1_shadow,
+ (unsigned long) gfn_x(gfn));
+
+ SHADOW2_DEBUG(MAKE_SHADOW, "(%" SH2_PRI_gfn ")=>%" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(smfn));
+
+ set_fl1_shadow_status(v, gfn, smfn);
+ return smfn;
+}
+
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+mfn_t
+sh2_make_monitor_table(struct vcpu *v)
+{
+
+ ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
+
+#if CONFIG_PAGING_LEVELS == 4
+ {
+ struct domain *d = v->domain;
+ mfn_t m4mfn;
+ m4mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ sh2_install_xen_entries_in_l4(v, m4mfn, m4mfn);
+ /* Remember the level of this table */
+ mfn_to_page(m4mfn)->shadow2_flags = 4;
+#if SHADOW_PAGING_LEVELS < 4
+ // Install a monitor l3 table in slot 0 of the l4 table.
+ // This is used for shadow linear maps.
+ {
+ mfn_t m3mfn;
+ l4_pgentry_t *l4e;
+ m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ mfn_to_page(m3mfn)->shadow2_flags = 3;
+ l4e = sh2_map_domain_page(m4mfn);
+ l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
+ sh2_unmap_domain_page(l4e);
+ }
+#endif /* SHADOW_PAGING_LEVELS < 4 */
+ return m4mfn;
+ }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+ {
+ struct domain *d = v->domain;
+ mfn_t m3mfn, m2mfn;
+ l3_pgentry_t *l3e;
+ l2_pgentry_t *l2e;
+ int i;
+
+ m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ /* Remember the level of this table */
+ mfn_to_page(m3mfn)->shadow2_flags = 3;
+
+ // Install a monitor l2 table in slot 3 of the l3 table.
+ // This is used for all Xen entries, including linear maps
+ m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ mfn_to_page(m2mfn)->shadow2_flags = 2;
+ l3e = sh2_map_domain_page(m3mfn);
+ l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
+ sh2_install_xen_entries_in_l2h(v, m2mfn);
+ /* Install the monitor's own linear map */
+ l2e = sh2_map_domain_page(m2mfn);
+ for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
+ (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
+ : l2e_empty();
+ sh2_unmap_domain_page(l2e);
+ sh2_unmap_domain_page(l3e);
+
+ SHADOW2_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
+ return m3mfn;
+ }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+ {
+ struct domain *d = v->domain;
+ mfn_t m2mfn;
+ m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+ sh2_install_xen_entries_in_l2(v, m2mfn, m2mfn);
+ /* Remember the level of this table */
+ mfn_to_page(m2mfn)->shadow2_flags = 2;
+ return m2mfn;
+ }
+
+#else
+#error this should not happen
+#endif /* CONFIG_PAGING_LEVELS */
+}
+#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
+
+/**************************************************************************/
+/* These functions also take a virtual address and return the level-N
+ * shadow table mfn and entry, but they create the shadow pagetables if
+ * they are needed.  The "ft" fetch-type argument tells us whether we are
+ * handling a demand fault (so we know what to do about accessed bits &c).
+ * If the necessary tables are not present in the guest, they return NULL. */
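+/* A rough sketch (not executed here) of how these helpers are used on the
+ * demand-fault path; the real sequence is in sh2_page_fault() below, and
+ * all names are ones already defined in this file:
+ *
+ *     ft = ft_demand_write;   // or ft_demand_read
+ *     ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
+ *
+ * Each level's helper either finds an existing shadow with
+ * get_shadow_status(), or builds one with sh2_make_shadow() and links it
+ * into the level above with the appropriate shadow_set_lNe() call. */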
+#if GUEST_PAGING_LEVELS >= 4
+static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl4mfn)
+{
+ /* There is always a shadow of the top level table. Get it. */
+ *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* Reading the top level table is always valid. */
+ return sh2_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl3mfn,
+ fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+ mfn_t sl4mfn;
+ shadow_l4e_t *sl4e;
+ if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
+ /* Get the l4e */
+ sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
+ ASSERT(sl4e != NULL);
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ {
+ *sl3mfn = shadow_l4e_get_mfn(*sl4e);
+ ASSERT(valid_mfn(*sl3mfn));
+ }
+ else
+ {
+ int r;
+ shadow_l4e_t new_sl4e;
+ /* No l3 shadow installed: find and install it. */
+ *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH2_l3_shadow);
+ if ( !valid_mfn(*sl3mfn) )
+ {
+ /* No l3 shadow of this page exists at all: make one. */
+ *sl3mfn = sh2_make_shadow(v, gw->l3mfn, PGC_SH2_l3_shadow);
+ }
+ /* Install the new sl3 table in the sl4e */
+ l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
+ *sl3mfn, &new_sl4e, ft);
+ r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
+ ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh2_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
+#else /* PAE... */
+ /* There is always a shadow of the top level table. Get it. */
+ *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* This next line is important: the shadow l3 table is in an 8k
+ * shadow and we need to return the right mfn of the pair. This call
+ * will set it for us as a side-effect. */
+ (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
+ ASSERT(v->arch.shadow_vtable);
+ return ((shadow_l3e_t *)v->arch.shadow_vtable)
+ + shadow_l3_table_offset(gw->va);
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl2mfn,
+ fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
+ mfn_t sl3mfn = _mfn(INVALID_MFN);
+ shadow_l3e_t *sl3e;
+ if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+ /* Get the l3e */
+ sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
+ ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ {
+ *sl2mfn = shadow_l3e_get_mfn(*sl3e);
+ ASSERT(valid_mfn(*sl2mfn));
+ }
+ else
+ {
+ int r;
+ shadow_l3e_t new_sl3e;
+ /* No l2 shadow installed: find and install it. */
+ *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH2_l2_shadow);
+ if ( !valid_mfn(*sl2mfn) )
+ {
+ /* No l2 shadow of this page exists at all: make one. */
+ *sl2mfn = sh2_make_shadow(v, gw->l2mfn, PGC_SH2_l2_shadow);
+ }
+ /* Install the new sl2 table in the sl3e */
+ l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
+ *sl2mfn, &new_sl3e, ft);
+ r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
+ ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+#if GUEST_PAGING_LEVELS == 3
+ /* Need to sync up the linear maps, as we are about to use them */
+ ASSERT( r & SHADOW2_SET_L3PAE_RECOPY );
+ sh2_pae_recopy(v->domain);
+#endif
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#else /* 32bit... */
+ /* There is always a shadow of the top level table. Get it. */
+ *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* This next line is important: the guest l2 has a 16k
+     * shadow, so we need to return the right mfn of the four. This
+ * call will set it for us as a side-effect. */
+ (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
+ /* Reading the top level table is always valid. */
+ return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#endif
+}
+
+
+static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl1mfn,
+ fetch_type_t ft)
+{
+ mfn_t sl2mfn;
+ shadow_l2e_t *sl2e;
+
+ /* Get the l2e */
+ sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
+ if ( sl2e == NULL ) return NULL;
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ {
+ *sl1mfn = shadow_l2e_get_mfn(*sl2e);
+ ASSERT(valid_mfn(*sl1mfn));
+ }
+ else
+ {
+ shadow_l2e_t new_sl2e;
+ int r, flags = guest_l2e_get_flags(*gw->l2e);
+ /* No l1 shadow installed: find and install it. */
+ if ( !(flags & _PAGE_PRESENT) )
+ return NULL; /* No guest page. */
+ if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
+ {
+ /* Splintering a superpage */
+ gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
+ *sl1mfn = get_fl1_shadow_status(v, l2gfn);
+ if ( !valid_mfn(*sl1mfn) )
+ {
+ /* No fl1 shadow of this superpage exists at all: make one. */
+ *sl1mfn = make_fl1_shadow(v, l2gfn);
+ }
+ }
+ else
+ {
+ /* Shadowing an actual guest l1 table */
+ if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+ *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH2_l1_shadow);
+ if ( !valid_mfn(*sl1mfn) )
+ {
+ /* No l1 shadow of this page exists at all: make one. */
+ *sl1mfn = sh2_make_shadow(v, gw->l1mfn, PGC_SH2_l1_shadow);
+ }
+ }
+ /* Install the new sl1 table in the sl2e */
+ l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
+ *sl1mfn, &new_sl2e, ft);
+ r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
+ ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+ /* This next line is important: in 32-on-PAE and 32-on-64 modes,
+ * the guest l1 table has an 8k shadow, and we need to return
+ * the right mfn of the pair. This call will set it for us as a
+ * side-effect. (In all other cases, it's a no-op and will be
+ * compiled out.) */
+ (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh2_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
+}
+
+
+
+/**************************************************************************/
+/* Destructors for shadow tables:
+ * Unregister the shadow, decrement refcounts of any entries present in it,
+ * and release the memory.
+ *
+ * N.B. These destructors do not clear the contents of the shadows.
+ * This allows us to delay TLB shootdowns until the page is being reused.
+ * See shadow2_alloc() and shadow2_free() for how this is handled.
+ */
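+/* Note on the sh2_put_ref() calls below: their last argument, built as
+ *
+ *     (((paddr_t)mfn_x(slNmfn)) << PAGE_SHIFT)
+ *         | ((unsigned long)slNe & ~PAGE_MASK)
+ *
+ * (where N is the level, e.g. sl4mfn/sl4e), works out to the machine
+ * address of the shadow entry that held the reference being dropped:
+ * the shadow page's mfn shifted up to a physical address, plus the
+ * entry's byte offset within that page. */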
+
+#if GUEST_PAGING_LEVELS >= 4
+void sh2_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l4e_t *sl4e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+ mfn_t gmfn, sl4mfn;
+ int xen_mappings;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l4_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+
+ /* Decrement refcounts of all the old entries */
+ xen_mappings = (!shadow2_mode_external(v->domain));
+ sl4mfn = smfn;
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ {
+ sh2_put_ref(v, shadow_l4e_get_mfn(*sl4e),
+ (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl4e & ~PAGE_MASK));
+ }
+ });
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+#endif
+
+#if GUEST_PAGING_LEVELS >= 3
+void sh2_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l3e_t *sl3e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+ mfn_t gmfn, sl3mfn;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l3_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 3
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ sl3mfn = smfn;
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ sh2_put_ref(v, shadow_l3e_get_mfn(*sl3e),
+ (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl3e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+#endif
+
+
+#if GUEST_PAGING_LEVELS == 3
+static void sh2_destroy_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e)
+/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
+{
+ int i;
+ ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0);
+ for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ )
+ if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT )
+ sh2_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
+ mapped_domain_page_to_maddr(sl3e));
+}
+#endif
+
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh2_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
+/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
+{
+ int i, j;
+ struct pae_l3_bookkeeping *bk;
+
+ ASSERT((mfn_to_page(smfn)->count_info & PGC_SH2_type_mask)
+ == PGC_SH2_l3_pae_shadow);
+ /* The subshadows are split, 64 on each page of the shadow */
+ for ( i = 0; i < 2; i++ )
+ {
+ void *p = sh2_map_domain_page(_mfn(mfn_x(smfn) + i));
+ for ( j = 0; j < 64; j++ )
+ {
+ /* Every second 32-byte region is a bookkeeping entry */
+ bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
+ if ( bk->pinned )
+ sh2_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
+ /* Check whether we've just freed the whole shadow */
+ if ( (mfn_to_page(smfn)->count_info & PGC_SH2_count_mask) == 0 )
+ {
+ sh2_unmap_domain_page(p);
+ return;
+ }
+ }
+ sh2_unmap_domain_page(p);
+ }
+}
+#endif
+
+void sh2_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l2e_t *sl2e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+ mfn_t gmfn, sl2mfn;
+ int xen_mappings;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l2_shadow
+ || t == PGC_SH2_l2h_pae_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 2
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ sl2mfn = smfn;
+ xen_mappings = (!shadow2_mode_external(v->domain) &&
+ ((GUEST_PAGING_LEVELS == 2) ||
+ ((GUEST_PAGING_LEVELS == 3) &&
+ (t == PGC_SH2_l2h_pae_shadow))));
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ sh2_put_ref(v, shadow_l2e_get_mfn(*sl2e),
+ (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl2e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+
+void sh2_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
+{
+ struct domain *d = v->domain;
+ shadow_l1e_t *sl1e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+
+ SHADOW2_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH2_l1_shadow || t == PGC_SH2_fl1_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ if ( t == PGC_SH2_fl1_shadow )
+ {
+ gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_fl1_shadow_status(v, gfn, smfn);
+ }
+ else
+ {
+ mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow2_status(v, gmfn, t, smfn);
+ shadow2_demote(v, gmfn, t);
+ }
+
+ if ( shadow2_mode_refcounts(d) )
+ {
+ /* Decrement refcounts of all the old entries */
+ mfn_t sl1mfn = smfn;
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
+ if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT )
+ shadow2_put_page_from_l1e(*sl1e, d);
+ });
+ }
+
+ /* Put the memory back in the pool */
+ shadow2_free(v->domain, smfn);
+}
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+void sh2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+ struct domain *d = v->domain;
+ ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH2_type_mask)
+ == PGC_SH2_monitor_table);
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
+ /* Need to destroy the l3 monitor page in slot 0 too */
+ {
+ l4_pgentry_t *l4e = sh2_map_domain_page(mmfn);
+ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+ shadow2_free(d, _mfn(l4e_get_pfn(l4e[0])));
+ sh2_unmap_domain_page(l4e);
+ }
+#elif CONFIG_PAGING_LEVELS == 3
+ /* Need to destroy the l2 monitor page in slot 4 too */
+ {
+ l3_pgentry_t *l3e = sh2_map_domain_page(mmfn);
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ shadow2_free(d, _mfn(l3e_get_pfn(l3e[3])));
+ sh2_unmap_domain_page(l3e);
+ }
+#endif
+
+ /* Put the memory back in the pool */
+ shadow2_free(d, mmfn);
+}
+#endif
+
+/**************************************************************************/
+/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
+ * These are called from common code when we are running out of shadow
+ * memory, and unpinning all the top-level shadows hasn't worked.
+ *
+ * This implementation is pretty crude and slow, but we hope that it won't
+ * be called very often. */
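+/* "Unhooking" here just means writing empty entries into a top-level
+ * shadow (shadow_set_l2e/l3e/l4e with an *_empty() entry, below), which
+ * drops the references to the lower-level shadows so that the common
+ * code can reclaim them. */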
+
+#if GUEST_PAGING_LEVELS == 2
+
+void sh2_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
+{
+ shadow_l2e_t *sl2e;
+ int xen_mappings = !shadow2_mode_external(v->domain);
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+}
+
+#elif GUEST_PAGING_LEVELS == 3
+
+void sh2_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
+/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
+{
+ shadow_l3e_t *sl3e;
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
+ mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
+ if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask)
+ == PGC_SH2_l2h_pae_shadow )
+ {
+ /* High l2: need to pick particular l2es to unhook */
+ shadow_l2e_t *sl2e;
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+ }
+ else
+ {
+ /* Normal l2: can safely unhook the whole l3e */
+ (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+ }
+ }
+ });
+ /* We've changed PAE L3 entries: must sync up various copies of them */
+ sh2_pae_recopy(v->domain);
+}
+
+#elif GUEST_PAGING_LEVELS == 4
+
+void sh2_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
+{
+ shadow_l4e_t *sl4e;
+ int xen_mappings = !shadow2_mode_external(v->domain);
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+ (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ });
+}
+
+#endif
+
+/**************************************************************************/
+/* Internal translation functions.
+ * These functions require a pointer to the shadow entry that will be updated.
+ */
+
+/* These functions take a new guest entry, translate it to shadow and write
+ * the shadow entry.
+ *
+ * They return the same bitmaps as the shadow_set_lXe() functions.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
+{
+ shadow_l4e_t new_sl4e;
+ guest_l4e_t *new_gl4e = new_ge;
+ shadow_l4e_t *sl4p = se;
+ mfn_t sl3mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl4e_calls);
+
+ if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
+ {
+ gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
+ mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
+ if ( valid_mfn(gl3mfn) )
+ sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH2_l3_shadow);
+ else
+ result |= SHADOW2_SET_ERROR;
+ }
+ l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
+ sl3mfn, &new_sl4e, ft_prefetch);
+ result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
+{
+ shadow_l3e_t new_sl3e;
+ guest_l3e_t *new_gl3e = new_ge;
+ shadow_l3e_t *sl3p = se;
+ mfn_t sl2mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl3e_calls);
+
+ if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
+ {
+ gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
+ mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
+ if ( valid_mfn(gl2mfn) )
+ sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH2_l2_shadow);
+ else
+ result |= SHADOW2_SET_ERROR;
+ }
+ l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
+ sl2mfn, &new_sl3e, ft_prefetch);
+ result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
+
+#if GUEST_PAGING_LEVELS == 3
+ /* We have changed a PAE l3 entry: need to sync up the possible copies
+ * of it */
+ if ( result & SHADOW2_SET_L3PAE_RECOPY )
+ sh2_pae_recopy(v->domain);
+#endif
+
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
+{
+ shadow_l2e_t new_sl2e;
+ guest_l2e_t *new_gl2e = new_ge;
+ shadow_l2e_t *sl2p = se;
+ mfn_t sl1mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl2e_calls);
+
+ if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
+ {
+ gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
+ if ( guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
+ {
+ // superpage -- need to look up the shadow L1 which holds the
+ // splitters...
+ sl1mfn = get_fl1_shadow_status(v, gl1gfn);
+#if 0
+ // XXX - it's possible that we want to do some kind of prefetch
+ // for superpage fl1's here, but this is *not* on the demand path,
+ // so we'll hold off trying that for now...
+ //
+ if ( !valid_mfn(sl1mfn) )
+ sl1mfn = make_fl1_shadow(v, gl1gfn);
+#endif
+ }
+ else
+ {
+ mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
+ if ( valid_mfn(gl1mfn) )
+ sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH2_l1_shadow);
+ else
+ result |= SHADOW2_SET_ERROR;
+ }
+ }
+ l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
+ sl1mfn, &new_sl2e, ft_prefetch);
+ result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
+
+ return result;
+}
+
+static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
+{
+ shadow_l1e_t new_sl1e;
+ guest_l1e_t *new_gl1e = new_ge;
+ shadow_l1e_t *sl1p = se;
+ gfn_t gfn;
+ mfn_t mfn;
+ int result = 0;
+
+ perfc_incrc(shadow2_validate_gl1e_calls);
+
+ gfn = guest_l1e_get_gfn(*new_gl1e);
+ mfn = vcpu_gfn_to_mfn(v, gfn);
+
+ l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e,
+ /* mmio? */ !valid_mfn(mfn));
+
+ result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+ return result;
+}
+
+
+/**************************************************************************/
+/* Functions which translate and install the shadows of arbitrary guest
+ * entries that we have just seen the guest write. */
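+/* sh2_map_and_validate() below is the generic workhorse: it aligns the
+ * written range to guest-entry boundaries, maps the relevant page of the
+ * (possibly multi-page) shadow, and calls the per-level validate_glNe()
+ * callback once per guest entry, remapping whenever shadow_index() says
+ * the next entry lives in a different page of the shadow. */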
+
+
+static inline int
+sh2_map_and_validate(struct vcpu *v, mfn_t gmfn,
+ void *new_gp, u32 size, u32 sh_type,
+ u32 (*shadow_index)(mfn_t *smfn, u32 idx),
+ int (*validate_ge)(struct vcpu *v, void *ge,
+ mfn_t smfn, void *se))
+/* Generic function for mapping and validating. */
+{
+ mfn_t smfn, smfn2, map_mfn;
+ shadow_l1e_t *sl1p;
+ u32 shadow_idx, guest_idx;
+ int result = 0;
+
+ /* Align address and size to guest entry boundaries */
+ size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
+ new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
+ size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
+ ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
+
+ /* Map the shadow page */
+ smfn = get_shadow_status(v, gmfn, sh_type);
+ ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
+ guest_idx = guest_index(new_gp);
+ map_mfn = smfn;
+ shadow_idx = shadow_index(&map_mfn, guest_idx);
+ sl1p = map_shadow_page(map_mfn);
+
+ /* Validate one entry at a time */
+ while ( size )
+ {
+ smfn2 = smfn;
+ guest_idx = guest_index(new_gp);
+ shadow_idx = shadow_index(&smfn2, guest_idx);
+ if ( mfn_x(smfn2) != mfn_x(map_mfn) )
+ {
+ /* We have moved to another page of the shadow */
+ map_mfn = smfn2;
+ unmap_shadow_page(sl1p);
+ sl1p = map_shadow_page(map_mfn);
+ }
+ result |= validate_ge(v,
+ new_gp,
+ map_mfn,
+ &sl1p[shadow_idx]);
+ size -= sizeof(guest_l1e_t);
+ new_gp += sizeof(guest_l1e_t);
+ }
+ unmap_shadow_page(sl1p);
+ return result;
+}
+
+
+int
+sh2_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
+ void *new_gl4p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 4
+ return sh2_map_and_validate(v, gl4mfn, new_gl4p, size,
+ PGC_SH2_l4_shadow,
+ shadow_l4_index,
+ validate_gl4e);
+#else // ! GUEST_PAGING_LEVELS >= 4
+ SHADOW2_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
+ void *new_gl3p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 3
+ return sh2_map_and_validate(v, gl3mfn, new_gl3p, size,
+ PGC_SH2_l3_shadow,
+ shadow_l3_index,
+ validate_gl3e);
+#else // ! GUEST_PAGING_LEVELS >= 3
+ SHADOW2_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+ return sh2_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH2_l2_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+}
+
+int
+sh2_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+#if GUEST_PAGING_LEVELS == 3
+ return sh2_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH2_l2h_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+#else /* Non-PAE guests don't have different kinds of l2 table */
+ SHADOW2_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
+ void *new_gl1p, u32 size)
+{
+ return sh2_map_and_validate(v, gl1mfn, new_gl1p, size,
+ PGC_SH2_l1_shadow,
+ shadow_l1_index,
+ validate_gl1e);
+}
+
+
+/**************************************************************************/
+/* Optimization: If we see two emulated writes of zeros to the same
+ * page-table without another kind of page fault in between, we guess
+ * that this is a batch of changes (for process destruction) and
+ * unshadow the page so we don't take a pagefault on every entry. This
+ * should also make finding writeable mappings of pagetables much
+ * easier. */
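+/* For example, a guest tearing down an exiting process will typically
+ * zero whole pagetable pages with consecutive stores; after the second
+ * emulated zero write we stop shadowing the page rather than trapping
+ * and emulating every remaining write. */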
+
+/* Look to see if this is the second emulated write in a row to this
+ * page, and unshadow/unhook if it is */
+static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
+{
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+ if ( v->arch.last_emulated_mfn == mfn_x(gmfn) &&
+ sh2_mfn_is_a_page_table(gmfn) )
+ {
+ u32 flags = mfn_to_page(gmfn)->shadow2_flags;
+ mfn_t smfn;
+ if ( !(flags & (SH2F_L2_32|SH2F_L3_PAE|SH2F_L4_64)) )
+ {
+ perfc_incrc(shadow2_early_unshadow);
+ sh2_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
+ return;
+ }
+ /* SH2F_unhooked_mappings is set to make sure we only unhook
+ * once in a single batch of updates. It is reset when this
+ * top-level page is loaded into CR3 again */
+ if ( !(flags & SH2F_unhooked_mappings) )
+ {
+ perfc_incrc(shadow2_early_unshadow_top);
+ mfn_to_page(gmfn)->shadow2_flags |= SH2F_unhooked_mappings;
+ if ( flags & SH2F_L2_32 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_l2_32_shadow);
+ shadow2_unhook_mappings(v, smfn);
+ }
+ if ( flags & SH2F_L3_PAE )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_l3_pae_shadow);
+ shadow2_unhook_mappings(v, smfn);
+ }
+ if ( flags & SH2F_L4_64 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_l4_64_shadow);
+ shadow2_unhook_mappings(v, smfn);
+ }
+ }
+ }
+ v->arch.last_emulated_mfn = mfn_x(gmfn);
+#endif
+}
+
+/* Stop counting towards early unshadows, as we've seen a real page fault */
+static inline void reset_early_unshadow(struct vcpu *v)
+{
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+ v->arch.last_emulated_mfn = INVALID_MFN;
+#endif
+}
+
+
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults. Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+
+static int sh2_page_fault(struct vcpu *v,
+ unsigned long va,
+ struct cpu_user_regs *regs)
+{
+ struct domain *d = v->domain;
+ walk_t gw;
+ u32 accumulated_gflags;
+ gfn_t gfn;
+ mfn_t gmfn, sl1mfn=_mfn(0);
+ shadow_l1e_t sl1e, *ptr_sl1e;
+ paddr_t gpa;
+ struct cpu_user_regs emul_regs;
+ struct x86_emulate_ctxt emul_ctxt;
+ int r, mmio;
+ fetch_type_t ft = 0;
+
+ //
+ // XXX: Need to think about eventually mapping superpages directly in the
+ // shadow (when possible), as opposed to splintering them into a
+ // bunch of 4K maps.
+ //
+
+ SHADOW2_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
+ v->domain->domain_id, v->vcpu_id, va, regs->error_code);
+
+ shadow2_lock(d);
+
+ shadow2_audit_tables(v);
+
+ if ( guest_walk_tables(v, va, &gw, 1) != 0 )
+ {
+        SHADOW2_PRINTK("malformed guest pagetable!\n");
+ print_gw(&gw);
+ }
+
+ sh2_audit_gw(v, &gw);
+
+ // We do not look at the gw->l1e, as that will not exist for superpages.
+ // Instead, we use the gw->eff_l1e...
+ //
+ // We need not check all the levels of the guest page table entries for
+ // present vs not-present, as the eff_l1e will always be not present if
+ // one of the higher level entries is not present.
+ //
+ if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
+ {
+ if ( hvm_guest(v) && !shadow2_vcpu_mode_translate(v) )
+ {
+ /* Not present in p2m map, means this is mmio */
+ gpa = va;
+ goto mmio;
+ }
+
+ perfc_incrc(shadow2_fault_bail_not_present);
+ goto not_a_shadow_fault;
+ }
+
+ // All levels of the guest page table are now known to be present.
+ accumulated_gflags = accumulate_guest_flags(&gw);
+
+ // Check for attempts to access supervisor-only pages from user mode,
+ // i.e. ring 3. Such errors are not caused or dealt with by the shadow
+ // code.
+ //
+ if ( (regs->error_code & X86_PFEC_SUPERVISOR_FAULT) &&
+ !(accumulated_gflags & _PAGE_USER) )
+ {
+ /* illegal user-mode access to supervisor-only page */
+ perfc_incrc(shadow2_fault_bail_user_supervisor);
+ goto not_a_shadow_fault;
+ }
+
+ // Was it a write fault?
+ //
+ if ( regs->error_code & X86_PFEC_WRITE_FAULT )
+ {
+ if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
+ {
+ perfc_incrc(shadow2_fault_bail_ro_mapping);
+ goto not_a_shadow_fault;
+ }
+ }
+ else // must have been either an insn fetch or read fault
+ {
+ // Check for NX bit violations: attempts to execute code that is
+ // marked "do not execute". Such errors are not caused or dealt with
+ // by the shadow code.
+ //
+ if ( regs->error_code & X86_PFEC_INSN_FETCH_FAULT )
+ {
+ if ( accumulated_gflags & _PAGE_NX_BIT )
+ {
+ /* NX prevented this code fetch */
+ perfc_incrc(shadow2_fault_bail_nx);
+ goto not_a_shadow_fault;
+ }
+ }
+ }
+
+ /* Is this an MMIO access? */
+ gfn = guest_l1e_get_gfn(gw.eff_l1e);
+ mmio = ( hvm_guest(v)
+ && shadow2_vcpu_mode_translate(v)
+ && mmio_space(gfn_to_paddr(gfn)) );
+
+    /* For MMIO, the shadow holds the *gfn*; for normal accesses, it holds
+ * the equivalent mfn. */
+ if ( mmio )
+ gmfn = _mfn(gfn_x(gfn));
+ else
+ {
+ gmfn = vcpu_gfn_to_mfn(v, gfn);
+ if ( !valid_mfn(gmfn) )
+ {
+ perfc_incrc(shadow2_fault_bail_bad_gfn);
+ SHADOW2_PRINTK("BAD gfn=%"SH2_PRI_gfn" gmfn=%"SH2_PRI_mfn"\n",
+ gfn_x(gfn), mfn_x(gmfn));
+ goto not_a_shadow_fault;
+ }
+ }
+
+ /* Make sure there is enough free shadow memory to build a chain of
+ * shadow tables: one SHADOW2_MAX_ORDER chunk will always be enough
+ * to allocate all we need. (We never allocate a top-level shadow
+ * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
+ shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+
+ /* Acquire the shadow. This must happen before we figure out the rights
+ * for the shadow entry, since we might promote a page here. */
+ // XXX -- this code will need to change somewhat if/when the shadow code
+ // can directly map superpages...
+ ft = ((regs->error_code & X86_PFEC_WRITE_FAULT)
+ ? ft_demand_write : ft_demand_read);
+ ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
+ ASSERT(ptr_sl1e);
+
+ /* Calculate the shadow entry */
+ if ( ft == ft_demand_write )
+ {
+ if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow2_fault_emulate_write);
+ goto emulate;
+ }
+ }
+ else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow2_fault_emulate_read);
+ goto emulate;
+ }
+
+ /* Quick sanity check: we never make an MMIO entry that's got the
+ * _PAGE_PRESENT flag set in it. */
+ ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
+
+ r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+
+ if ( mmio )
+ {
+ gpa = guest_walk_to_gpa(&gw);
+ goto mmio;
+ }
+
+#if 0
+ if ( !(r & SHADOW2_SET_CHANGED) )
+ debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH2_PRI_pte
+ ") did not change anything\n",
+ __func__, gw.va, l1e_get_intpte(sl1e));
+#endif
+
+ perfc_incrc(shadow2_fault_fixed);
+ d->arch.shadow_fault_count++;
+ reset_early_unshadow(v);
+
+ done:
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW2_PRINTK("fixed\n");
+ shadow2_audit_tables(v);
+ shadow2_unlock(d);
+ return EXCRET_fault_fixed;
+
+ emulate:
+
+ /* Take the register set we were called with */
+ emul_regs = *regs;
+ if ( hvm_guest(v) )
+ {
+        /* Add the guest's segment selectors, rip, rsp, rflags */
+ hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
+ }
+ emul_ctxt.regs = &emul_regs;
+ emul_ctxt.cr2 = va;
+ emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
+
+ SHADOW2_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
+
+ v->arch.shadow2_propagate_fault = 0;
+ if ( x86_emulate_memop(&emul_ctxt, &shadow2_emulator_ops) )
+ {
+ SHADOW2_PRINTK("emulator failure, unshadowing mfn %#lx\n",
+ mfn_x(gmfn));
+ perfc_incrc(shadow2_fault_emulate_failed);
+ /* If this is actually a page table, then we have a bug, and need
+ * to support more operations in the emulator. More likely,
+ * though, this is a hint that this page should not be shadowed. */
+ shadow2_remove_all_shadows(v, gmfn);
+ /* This means that actual missing operations will cause the
+ * guest to loop on the same page fault. */
+ goto done;
+ }
+ if ( v->arch.shadow2_propagate_fault )
+ {
+ /* Emulation triggered another page fault */
+ goto not_a_shadow_fault;
+ }
+
+ /* Emulator has changed the user registers: write back */
+ if ( hvm_guest(v) )
+ {
+        /* Write back the guest's segment selectors, rip, rsp, rflags */
+ hvm_load_cpu_guest_regs(v, &emul_regs);
+ /* And don't overwrite those in the caller's regs. */
+ emul_regs.eip = regs->eip;
+ emul_regs.cs = regs->cs;
+ emul_regs.eflags = regs->eflags;
+ emul_regs.esp = regs->esp;
+ emul_regs.ss = regs->ss;
+ emul_regs.es = regs->es;
+ emul_regs.ds = regs->ds;
+ emul_regs.fs = regs->fs;
+ emul_regs.gs = regs->gs;
+ }
+ *regs = emul_regs;
+
+ goto done;
+
+ mmio:
+ perfc_incrc(shadow2_fault_mmio);
+ if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
+ {
+ /* Need to deal with these disabled-APIC accesses, as
+ * handle_mmio() apparently does not currently do that. */
+ /* TJD: What about it, then? For now, I'm turning this BUG()
+ * into a domain_crash() since we don't want to kill Xen. */
+        SHADOW2_ERROR("disabled-APIC access: not supported.\n");
+ domain_crash(d);
+ }
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW2_PRINTK("mmio\n");
+ shadow2_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow2_unlock(d);
+ sh2_log_mmio(v, gpa);
+ handle_mmio(va, gpa);
+ return EXCRET_fault_fixed;
+
+ not_a_shadow_fault:
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW2_PRINTK("not a shadow fault\n");
+ shadow2_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow2_unlock(d);
+ return 0;
+}
+
+
+static int
+sh2_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg. Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+ shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va);
+
+ // XXX -- might be a good thing to prefetch the va into the shadow
+
+ // no need to flush anything if there's no SL2...
+ //
+ if ( !ptr_sl2e )
+ return 0;
+
+ // If there's nothing shadowed for this particular sl2e, then
+ // there is no need to do an invlpg, either...
+ //
+ if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) )
+ return 0;
+
+ // Check to see if the SL2 is a splintered superpage...
+ // If so, then we'll need to flush the entire TLB (because that's
+ // easier than invalidating all of the individual 4K pages).
+ //
+ if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info &
+ PGC_SH2_type_mask) == PGC_SH2_fl1_shadow )
+ {
+ local_flush_tlb();
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned long
+sh2_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+{
+ walk_t gw;
+ gfn_t gfn;
+
+ guest_walk_tables(v, va, &gw, 0);
+ gfn = guest_walk_to_gfn(&gw);
+ unmap_walk(v, &gw);
+
+ return gfn_x(gfn);
+}
+
+
+static unsigned long
+sh2_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to the guest physical
+ * address that the *guest* pagetables would map it to. */
+{
+ unsigned long gfn = sh2_gva_to_gfn(v, va);
+ if ( gfn == INVALID_GFN )
+ return 0;
+ else
+ return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
+}
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow2-common.c?
+//
+/* returns a lowmem machine address of the copied HVM L3 root table
+ * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
+ * otherwise blank out any entries with reserved bits in them. */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long
+hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
+{
+ int i, f;
+ int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
+ l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
+ for ( i = 0; i < 4; i++ )
+ {
+ f = l3e_get_flags(l3tab[i]);
+ if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) )
+ new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res);
+ else
+ new_l3e = l3e_empty();
+ safe_write_entry(&copy[i], &new_l3e);
+ }
+ return __pa(copy);
+}
+#endif
+
+
+static inline void
+sh2_update_linear_entries(struct vcpu *v)
+/* Sync up all the linear mappings for this vcpu's pagetables */
+{
+ struct domain *d = v->domain;
+
+ /* Linear pagetables in PV guests
+ * ------------------------------
+ *
+ * Guest linear pagetables, which map the guest pages, are at
+ * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
+ * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
+ * are set up at shadow creation time, but (of course!) the PAE case
+ * is subtler. Normal linear mappings are made by having an entry
+ * in the top-level table that points to itself (shadow linear) or
+ * to the guest top-level table (guest linear). For PAE, to set up
+ * a linear map requires us to copy the four top-level entries into
+ * level-2 entries. That means that every time we change a PAE l3e,
+ * we need to reflect the change into the copy.
+ *
+ * Linear pagetables in HVM guests
+ * -------------------------------
+ *
+ * For HVM guests, the linear pagetables are installed in the monitor
+ * tables (since we can't put them in the shadow). Shadow linear
+ * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
+ * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
+ * a linear pagetable of the monitor tables themselves. We have
+     * the same issue of having to re-copy PAE l3 entries whenever we use
+ * PAE shadows.
+ *
+ * Because HVM guests run on the same monitor tables regardless of the
+ * shadow tables in use, the linear mapping of the shadow tables has to
+ * be updated every time v->arch.shadow_table changes.
+ */
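+    /* As a concrete illustration (not used by the code below): once these
+     * mappings are in place, the shadow l1e covering a virtual address va
+     * can be read straight out of the linear map as
+     *
+     *     sh2_linear_l1_table(v) + shadow_l1_linear_offset(va)
+     *
+     * (see shadow_get_and_create_l1e() above), i.e. a full pagetable walk
+     * becomes a single array index.  The price is the re-copying of PAE
+     * l3 entries described above whenever they change. */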
+
+ /* Don't try to update the monitor table if it doesn't exist */
+ if ( shadow2_mode_external(d)
+ && pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ return;
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
+
+ /* For PV, one l4e points at the guest l4, one points at the shadow
+ * l4. No maintenance required.
+ * For HVM, just need to update the l4e that points to the shadow l4. */
+
+ if ( shadow2_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh2_unmap_domain_page(ml4e);
+ }
+ }
+
+#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
+
+ /* This case only exists in HVM. To give ourselves a linear map of the
+ * shadows, we need to extend a PAE shadow to 4 levels. We do this by
+ * having a monitor l3 in slot 0 of the monitor l4 table, and
+ * copying the PAE l3 entries into it. Then, by having the monitor l4e
+ * for shadow pagetables also point to the monitor l4, we can use it
+ * to access the shadows. */
+
+ if ( shadow2_mode_external(d) )
+ {
+ /* Install copies of the shadow l3es into the monitor l3 table.
+ * The monitor l3 table is hooked into slot 0 of the monitor
+ * l4 table, so we use l3 linear indices 0 to 3 */
+ shadow_l3e_t *sl3e;
+ l3_pgentry_t *ml3e;
+ mfn_t l3mfn;
+ int i;
+
+ /* Use linear mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ ml3e = __linear_l3_table;
+ l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = v->arch.shadow_vtable;
+#endif
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
+ l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
+ ml3e = sh2_map_domain_page(l3mfn);
+ sh2_unmap_domain_page(ml4e);
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
+#endif
+ }
+
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ ml3e[i] =
+ (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
+ ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
+ __PAGE_HYPERVISOR)
+ : l3e_empty();
+ }
+
+ if ( v != current )
+ {
+ sh2_unmap_domain_page(ml3e);
+#if GUEST_PAGING_LEVELS != 2
+ sh2_unmap_domain_page(sl3e);
+#endif
+ }
+ }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+ /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
+ * entries in the shadow, and the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the shadow. This is safe to do
+ * because Xen does not let guests share high-slot l2 tables between l3s,
+ * so we know we're not treading on anyone's toes.
+ *
+ * HVM: need to copy the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the monitor table. This is safe
+ * because we have one monitor table for each vcpu. The monitor's
+ * own l3es don't need to be copied because they never change.
+ * XXX That might change if we start stuffing things into the rest
+ * of the monitor's virtual address space.
+ */
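+    /* Concretely: a PAE l3 has only 4 entries, so each copy below is just
+     * four l2 slots -- l2_table_offset(LINEAR_PT_VIRT_START) + i for the
+     * guest-linear map and l2_table_offset(SH_LINEAR_PT_VIRT_START) + i
+     * for the shadow-linear map, 0 <= i < 4. */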
+ {
+ l2_pgentry_t *l2e, new_l2e;
+ shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
+ int i;
+
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables were built by update_cr3 */
+ if ( shadow2_mode_external(d) )
+ shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ else
+ BUG(); /* PV 2-on-3 is not supported yet */
+
+#else /* GUEST_PAGING_LEVELS == 3 */
+
+ /* Use local vcpu's mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ shadow_l3e = v->arch.shadow_vtable;
+ if ( !shadow2_mode_external(d) )
+ guest_l3e = v->arch.guest_vtable;
+ }
+ else
+ {
+ mfn_t smfn;
+ int idx;
+
+ /* Map the shadow l3 */
+ smfn = pagetable_get_mfn(v->arch.shadow_table);
+ idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
+ shadow_l3e = sh2_map_domain_page(smfn);
+ shadow_l3e += idx;
+ if ( !shadow2_mode_external(d) )
+ {
+ /* Also the guest l3 */
+ mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table);
+ guest_l3e = sh2_map_domain_page(gmfn);
+ guest_l3e += guest_index(v->arch.guest_vtable);
+ }
+ }
+#endif /* GUEST_PAGING_LEVELS */
+
+ /* Choose where to write the entries, using linear maps if possible */
+ if ( v == current && shadow2_mode_external(d) )
+ {
+ /* From the monitor tables, it's safe to use linear maps to update
+ * monitor l2s */
+ l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
+ }
+ else if ( shadow2_mode_external(d) )
+ {
+ /* Map the monitor table's high l2 */
+ l3_pgentry_t *l3e;
+ l3e = sh2_map_domain_page(
+ pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
+ sh2_unmap_domain_page(l3e);
+ }
+ else
+ {
+ /* Map the shadow table's high l2 */
+ ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
+ l2e = sh2_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
+ }
+
+
+ if ( !shadow2_mode_external(d) )
+ {
+ /* Write linear mapping of guest. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+ }
+
+ /* Write linear mapping of shadow. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+
+ if ( v != current || !shadow2_mode_external(d) )
+ sh2_unmap_domain_page(l2e);
+
+#if GUEST_PAGING_LEVELS == 3
+        if ( v != current )
+ {
+ sh2_unmap_domain_page(shadow_l3e);
+ if ( !shadow2_mode_external(d) )
+ sh2_unmap_domain_page(guest_l3e);
+ }
+#endif
+ }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+ /* For PV, one l2e points at the guest l2, one points at the shadow
+ * l2. No maintenance required.
+ * For HVM, just need to update the l2e that points to the shadow l2. */
+
+ if ( shadow2_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l2_pgentry_t *ml2e;
+ ml2e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh2_unmap_domain_page(ml2e);
+ }
+ }
+
+#else
+#error this should not happen
+#endif
+}
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow2-common.c?
+//
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh2_pae_recopy(struct domain *d)
+/* Called whenever we write to the l3 entries of a PAE pagetable which
+ * is currently in use. Each vcpu that is using the table needs to
+ * resync its copies of the l3s in linear maps and any low-memory
+ * copies it might have made for fitting into 32bit CR3.
+ * Since linear maps are also resynced when we change CR3, we don't
+ * need to worry about changes to PAE l3es that are not currently in use.*/
+{
+ struct vcpu *v;
+ cpumask_t flush_mask = CPU_MASK_NONE;
+ ASSERT(shadow2_lock_is_acquired(d));
+
+ for_each_vcpu(d, v)
+ {
+ if ( !v->arch.shadow2_pae_flip_pending )
+ continue;
+
+ cpu_set(v->processor, flush_mask);
+
+ SHADOW2_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
+
+ /* This vcpu has a copy in its linear maps */
+ sh2_update_linear_entries(v);
+ if ( hvm_guest(v) )
+ {
+ /* This vcpu has a copy in its HVM PAE l3 */
+ v->arch.hvm_vcpu.hw_cr3 =
+ hvm_pae_copy_root(v, v->arch.shadow_vtable,
+ !shadow2_vcpu_mode_translate(v));
+ }
+#if CONFIG_PAGING_LEVELS == 3
+ else
+ {
+ /* This vcpu might have copied the l3 to below 4GB */
+ if ( v->arch.cr3 >> PAGE_SHIFT
+ != pagetable_get_pfn(v->arch.shadow_table) )
+ {
+ /* Recopy to where that copy is. */
+ int i;
+ l3_pgentry_t *dst, *src;
+ dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
+ src = v->arch.shadow_vtable;
+ for ( i = 0 ; i < 4 ; i++ )
+ safe_write_entry(dst + i, src + i);
+ }
+ }
+#endif
+ v->arch.shadow2_pae_flip_pending = 0;
+ }
+
+ flush_tlb_mask(flush_mask);
+}
+#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
+
+
+/* removes:
+ * vcpu->arch.guest_vtable
+ * vcpu->arch.shadow_table
+ * vcpu->arch.shadow_vtable
+ * Does all appropriate management/bookkeeping/refcounting/etc...
+ */
+static void
+sh2_detach_old_tables(struct vcpu *v)
+{
+ mfn_t smfn;
+
+ ////
+ //// vcpu->arch.guest_vtable
+ ////
+ if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+ v->arch.guest_vtable )
+ {
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ sh2_unmap_domain_page_global(v->arch.guest_vtable);
+ v->arch.guest_vtable = NULL;
+ }
+
+ ////
+ //// vcpu->arch.shadow_table
+ ////
+ smfn = pagetable_get_mfn(v->arch.shadow_table);
+ if ( mfn_x(smfn) )
+ {
+ ASSERT(v->arch.shadow_vtable);
+
+#if GUEST_PAGING_LEVELS == 3
+ // PAE guests do not (necessarily) use an entire page for their
+ // 4-entry L3s, so we have to deal with them specially.
+ //
+ sh2_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
+#else
+ sh2_put_ref(v, smfn, 0);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ {
+ struct pae_l3_bookkeeping *info =
+ sl3p_to_info(v->arch.shadow_vtable);
+ ASSERT(test_bit(v->vcpu_id, &info->vcpus));
+ clear_bit(v->vcpu_id, &info->vcpus);
+ }
+#endif
+ v->arch.shadow_table = pagetable_null();
+ }
+
+ ////
+ //// vcpu->arch.shadow_vtable
+ ////
+ if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+ v->arch.shadow_vtable )
+ {
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ //
+ sh2_unmap_domain_page_global(v->arch.shadow_vtable);
+ v->arch.shadow_vtable = NULL;
+ }
+}
+
+static void
+sh2_update_cr3(struct vcpu *v)
+/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
+ * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
+ * if appropriate).
+ * HVM guests should also set hvm_get_guest_ctrl_reg(v, 3)...
+ */
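+/* Rough shape of the code below: sanity-check the guest_table we were
+ * given, detach the old tables, (re)map the guest top level into
+ * guest_vtable, find or create the root shadow and install it as
+ * shadow_table, map that into shadow_vtable, and finally take a ref to
+ * the new top-level shadow and pin it. */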
+{
+ struct domain *d = v->domain;
+ mfn_t gmfn, smfn;
+#if GUEST_PAGING_LEVELS == 3
+ u32 guest_idx=0;
+#endif
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ ASSERT(v->arch.shadow2);
+
+ ////
+ //// vcpu->arch.guest_table is already set
+ ////
+
+#ifndef NDEBUG
+ /* Double-check that the HVM code has sent us a sane guest_table */
+ if ( hvm_guest(v) )
+ {
+ gfn_t gfn;
+
+ ASSERT(shadow2_mode_external(d));
+
+ // Is paging enabled on this vcpu?
+ if ( shadow2_vcpu_mode_translate(v) )
+ {
+ gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
+ gmfn = vcpu_gfn_to_mfn(v, gfn);
+ ASSERT(valid_mfn(gmfn));
+ ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
+ }
+ else
+ {
+ /* Paging disabled: guest_table points at (part of) p2m */
+#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
+            /* For everything else, they should be the same */
+ ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
+#endif
+ }
+ }
+#endif
+
+ SHADOW2_PRINTK("d=%u v=%u guest_table=%05lx\n",
+ d->domain_id, v->vcpu_id,
+ (unsigned long)pagetable_get_pfn(v->arch.guest_table));
+
+#if GUEST_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ gmfn = pagetable_get_mfn(v->arch.guest_table_user);
+ else
+#endif
+ gmfn = pagetable_get_mfn(v->arch.guest_table);
+
+ sh2_detach_old_tables(v);
+
+ if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+ {
+ ASSERT(v->arch.cr3 == 0);
+ return;
+ }
+
+ ////
+ //// vcpu->arch.guest_vtable
+ ////
+ if ( shadow2_mode_external(d) )
+ {
+#if GUEST_PAGING_LEVELS == 3
+ if ( shadow2_vcpu_mode_translate(v) )
+ /* Paging enabled: find where in the page the l3 table is */
+ guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
+ else
+ /* Paging disabled: l3 is at the start of a page (in the p2m) */
+ guest_idx = 0;
+
+ // Ignore the low 2 bits of guest_idx -- they come from CR3's PWT/PCD
+ // cache-control flags, not from the PDPT address.
+ guest_idx &= ~3;
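+ // (A worked example, assuming guest_index() returns the l3e index
+ // within the page: CR3 = 0x00402018 has PWT|PCD set, so guest_idx
+ // starts out as 0x18/8 = 3; masking with ~3 gives 0, matching the
+ // PDPT that actually starts at offset 0 of the page.)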
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable =
+ (guest_l3e_t *)sh2_map_domain_page_global(gmfn) + guest_idx;
+#else
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
+#endif
+ }
+ else
+ {
+#ifdef __x86_64__
+ v->arch.guest_vtable = __linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
+#else
+ v->arch.guest_vtable = __linear_l2_table;
+#endif
+ }
+
+#if 0
+ printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
+ __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
+#endif
+
+ ////
+ //// vcpu->arch.shadow_table
+ ////
+ smfn = get_shadow_status(v, gmfn, PGC_SH2_guest_root_type);
+ if ( valid_mfn(smfn) )
+ {
+ /* Pull this root shadow to the front of the list of roots. */
+ list_del(&mfn_to_page(smfn)->list);
+ list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows);
+ }
+ else
+ {
+ /* This guest MFN is a pagetable. Must revoke write access. */
+ if ( shadow2_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0)
+ != 0 )
+ flush_tlb_mask(d->domain_dirty_cpumask);
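+ /* (The write access has to be gone before we shadow the page: once
+ * it is shadowed, every guest write to it must trap so the shadow
+ * can be brought up to date by the emulated-write paths below.) */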
+ /* Make sure there's enough free shadow memory. */
+ shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+ /* Shadow the page. */
+ smfn = sh2_make_shadow(v, gmfn, PGC_SH2_guest_root_type);
+ list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows);
+ }
+ ASSERT(valid_mfn(smfn));
+ v->arch.shadow_table = pagetable_from_mfn(smfn);
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+ /* Once again OK to unhook entries from this table if we see fork/exit */
+ ASSERT(sh2_mfn_is_a_page_table(gmfn));
+ mfn_to_page(gmfn)->shadow2_flags &= ~SH2F_unhooked_mappings;
+#endif
+
+ ////
+ //// vcpu->arch.shadow_vtable
+ ////
+ if ( shadow2_mode_external(d) )
+ {
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ mfn_t adjusted_smfn = smfn;
+ u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ v->arch.shadow_vtable =
+ (shadow_l3e_t *)sh2_map_domain_page_global(adjusted_smfn) +
+ shadow_idx;
+#else
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
+#endif
+ }
+ else
+ {
+#if SHADOW_PAGING_LEVELS == 4
+ v->arch.shadow_vtable = __sh2_linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+ // XXX - why does this need a global map?
+ v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
+#else
+ v->arch.shadow_vtable = __sh2_linear_l2_table;
+#endif
+ }
+
+ ////
+ //// Take a ref to the new shadow table, and pin it.
+ ////
+ //
+ // This ref is logically "held" by v->arch.shadow_table entry itself.
+ // Release the old ref.
+ //
+#if GUEST_PAGING_LEVELS == 3
+ // PAE guests do not (necessarily) use an entire page for their
+ // 4-entry L3s, so we have to deal with them specially.
+ //
+ // XXX - might want to revisit this if/when we do multiple compilation for
+ // HVM-vs-PV guests, as PAE PV guests could get away without doing
+ // subshadows.
+ //
+ sh2_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
+ sh2_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
+#else
+ sh2_get_ref(smfn, 0);
+ sh2_pin(smfn);
+#endif
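+ // (Pinning keeps this top-level shadow allocated even while no vcpu is
+ // running on it, so a later switch back to the same guest pagetable can
+ // reuse it instead of rebuilding it from scratch.)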
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ // PAE 3-on-3 shadows have to keep track of which vcpus are using
+ // which l3 subshadow, in order to handle the SHADOW2_SET_L3PAE_RECOPY
+ // case from validate_gl3e(). Search for SHADOW2_SET_L3PAE_RECOPY
+ // in the code for more info.
+ //
+ {
+ struct pae_l3_bookkeeping *info =
+ sl3p_to_info(v->arch.shadow_vtable);
+ ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
+ set_bit(v->vcpu_id, &info->vcpus);
+ }
+#endif
+
+ debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
+ __func__, mfn_x(gmfn), mfn_x(smfn));
+
+ ///
+ /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
+ ///
+ if ( shadow2_mode_external(d) )
+ {
+ ASSERT(hvm_guest(v));
+ make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
+
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+#if SHADOW_PAGING_LEVELS != 3
+#error unexpected combination of GUEST and SHADOW paging levels
+#endif
+ /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
+ int i;
+
+ ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
+ virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
+ for (i = 0; i < 4; i++)
+ {
+ v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
+ shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
+ }
+ }
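+ /* (This relies on the 4-page l2 shadow being physically contiguous --
+ * hence the mfn_x(smfn)+i arithmetic above -- and on hvm_lowmem_l3tab
+ * living below 4GB, since in PAE mode the hardware reads the PDPT
+ * through a 32-bit-wide CR3.) */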
+#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+ /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
+ * If paging is disabled, clear l3e reserved bits; otherwise
+ * remove entries that have reserved bits set. */
+ v->arch.hvm_vcpu.hw_cr3 =
+ hvm_pae_copy_root(v, v->arch.shadow_vtable,
+ !shadow2_vcpu_mode_translate(v));
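+ /* (A copy is needed, rather than pointing hw_cr3 at the shadow l3
+ * itself, because the processor loads and checks the four PDPTEs when
+ * CR3 is written: they must be reachable through a 32-bit CR3 and must
+ * not have reserved bits set, neither of which the shadow l3 can
+ * guarantee on its own.) */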
+#else
+ /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
+ v->arch.hvm_vcpu.hw_cr3 =
+ pagetable_get_paddr(v->arch.shadow_table);
+#endif
+ }
+ else // not shadow2_mode_external...
+ {
+ /* We don't support PV except guest == shadow == config levels */
+ BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
+ make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
+ }
+
+ /* Fix up the linear pagetable mappings */
+ sh2_update_linear_entries(v);
+}
+
+
+/**************************************************************************/
+/* Functions to revoke guest rights */
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+static int sh2_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
+/* Look up this vaddr in the current shadow and see if it's a writeable
+ * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
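+/* (This is the SH2OPT_WRITABLE_HEURISTIC fast path: the caller guesses a
+ * virtual address at which the guest probably has gmfn mapped writably,
+ * so one probe of the shadow linear map can often remove the mapping
+ * without scanning every l1 shadow; a wrong guess just returns 0 and the
+ * caller can fall back to the full search.) */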
+{
+ shadow_l1e_t sl1e, *sl1p;
+ shadow_l2e_t *sl2p;
+#if GUEST_PAGING_LEVELS >= 3
+ shadow_l3e_t *sl3p;
+#if GUEST_PAGING_LEVELS >= 4
+ shadow_l4e_t *sl4p;
+#endif
+#endif
+ mfn_t sl1mfn;
+
+ /* Carefully look in the shadow linear map for the l1e we expect */
+ if ( v->arch.shadow_vtable == NULL ) return 0;
+#if GUEST_PAGING_LEVELS >= 4
+ sl4p = sh2_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
+ if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
+ return 0;
+ sl3p = sh2_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
+ if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+ return 0;
+#elif GUEST_PAGING_LEVELS == 3
+ sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable)
+ + shadow_l3_linear_offset(vaddr);
+ if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+ return 0;
+#endif
+ sl2p = sh2_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
+ if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
+ return 0;
+ sl1p = sh2_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
+ sl1e = *sl1p;
+ if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
+ != (_PAGE_PRESENT|_PAGE_RW))
+ || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
+ return 0;
+
+ /* Found it! Need to remove its write permissions. */
+ sl1mfn = shadow_l2e_get_mfn(*sl2p);
+ sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
+ shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
+ return 1;
+}
+#endif
+
+int sh2_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
+/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
+{
+ shadow_l1e_t *sl1e;
+ int done = 0;
+ int flags;
+
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+ {
+ flags = shadow_l1e_get_flags(*sl1e);
+ if ( (flags & _PAGE_PRESENT)
+ && (flags & _PAGE_RW)
+ && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
+ {
+ shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+ if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
+ & PGT_count_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+
+int sh2_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
+/* Excises all mappings of this guest frame from this shadow l1 table */
+{
+ shadow_l1e_t *sl1e;
+ int done = 0;
+ int flags;
+
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+ {
+ flags = shadow_l1e_get_flags(*sl1e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
+ {
+ shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+ if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+/**************************************************************************/
+/* Functions to excise all pointers to shadows from higher-level shadows. */
+
+void sh2_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
+/* Blank out a single shadow entry */
+{
+ switch (mfn_to_page(smfn)->count_info & PGC_SH2_type_mask)
+ {
+ case PGC_SH2_l1_shadow:
+ shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
+ case PGC_SH2_l2_shadow:
+#if GUEST_PAGING_LEVELS == 3
+ case PGC_SH2_l2h_shadow:
+#endif
+ shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 3
+ case PGC_SH2_l3_shadow:
+ shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 4
+ case PGC_SH2_l4_shadow:
+ shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
+#endif
+#endif
+ default: BUG(); /* Called with the wrong kind of shadow. */
+ }
+}
+
+int sh2_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
+/* Remove all mappings of this l1 shadow from this l2 shadow */
+{
+ shadow_l2e_t *sl2e;
+ int done = 0;
+ int flags;
+#if GUEST_PAGING_LEVELS != 4
+ int xen_mappings = !shadow2_mode_external(v->domain);
+#endif
+
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings,
+ {
+ flags = shadow_l2e_get_flags(*sl2e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
+ {
+ shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH2_type_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh2_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
+/* Remove all mappings of this l2 shadow from this l3 shadow */
+{
+ shadow_l3e_t *sl3e;
+ int done = 0;
+ int flags;
+
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, done,
+ {
+ flags = shadow_l3e_get_flags(*sl3e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
+ {
+ shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+ if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh2_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
+/* Remove all mappings of this l3 shadow from this l4 shadow */
+{
+ shadow_l4e_t *sl4e;
+ int done = 0;
+ int flags, xen_mappings = !shadow2_mode_external(v->domain);
+
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
+ {
+ flags = shadow_l4e_get_flags(*sl4e);
+ if ( (flags & _PAGE_PRESENT)
+ && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
+ {
+ shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH2_type_mask) == 0 )
+ /* This breaks us cleanly out of the FOREACH macro */
+ done = 1;
+ }
+ });
+ return done;
+}
+#endif /* 64bit guest */
+#endif /* PAE guest */
+
+/**************************************************************************/
+/* Handling HVM guest writes to pagetables */
+
+/* Check that the guest is allowed to perform this write (walking its
+ * pagetables as the MMU would). Returns a mapped pointer to write to,
+ * and the mfn it's on, or NULL if the write should fault instead. */
+static inline void * emulate_map_dest(struct vcpu *v,
+ unsigned long vaddr,
+ struct x86_emulate_ctxt *ctxt,
+ mfn_t *mfnp)
+{
+ walk_t gw;
+ u32 flags;
+ gfn_t gfn;
+ mfn_t mfn;
+
+ guest_walk_tables(v, vaddr, &gw, 1);
+ flags = accumulate_guest_flags(&gw);
+ gfn = guest_l1e_get_gfn(gw.eff_l1e);
+ mfn = vcpu_gfn_to_mfn(v, gfn);
+ sh2_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+
+ if ( !(flags & _PAGE_PRESENT)
+ || !(flags & _PAGE_RW)
+ || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
+ {
+ /* This write would have faulted even on bare metal */
+ v->arch.shadow2_propagate_fault = 1;
+ return NULL;
+ }
+
+ if ( !valid_mfn(mfn) )
+ {
+ /* Attempted a write to a bad gfn. This should never happen:
+ * after all, we're here because this write is to a page table. */
+ BUG();
+ }
+
+ ASSERT(sh2_mfn_is_a_page_table(mfn));
+ *mfnp = mfn;
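+ /* N.B. the pointer returned below is offset into the page mapping;
+ * callers hand it straight back to sh2_unmap_domain_page(), which is
+ * fine since unmapping only depends on which page the address is in. */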
+ return sh2_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
+}
+
+int
+sh2_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
+ u32 bytes, struct x86_emulate_ctxt *ctxt)
+{
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ while ( bytes > 0 )
+ {
+ mfn_t mfn;
+ int bytes_on_page;
+ void *addr;
+
+ bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
+ if ( bytes_on_page > bytes )
+ bytes_on_page = bytes;
+
+ if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+ return X86EMUL_PROPAGATE_FAULT;
+ memcpy(addr, src, bytes_on_page);
+ shadow2_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
+ bytes -= bytes_on_page;
+ /* If we are writing zeros to this page, might want to unshadow */
+ if ( *(u8 *)addr == 0 )
+ check_for_early_unshadow(v, mfn);
+ sh2_unmap_domain_page(addr);
+ /* Move on to the next page of a write that crosses a page boundary */
+ vaddr += bytes_on_page;
+ src = (u8 *)src + bytes_on_page;
+ }
+ shadow2_audit_tables(v);
+ return X86EMUL_CONTINUE;
+}
+
+int
+sh2_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
+ unsigned long old, unsigned long new,
+ unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+{
+ mfn_t mfn;
+ void *addr;
+ unsigned long prev;
+ int rv = X86EMUL_CONTINUE;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+ ASSERT(bytes <= sizeof (unsigned long));
+
+ if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+ return X86EMUL_PROPAGATE_FAULT;
+
+ switch (bytes)
+ {
+ case 1: prev = cmpxchg(((u8 *)addr), old, new); break;
+ case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
+ case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
+ case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
+ default:
+ SHADOW2_PRINTK("cmpxchg of size %i is not supported\n", bytes);
+ prev = ~old;
+ }
+
+ if ( (prev == old) )
+ shadow2_validate_guest_pt_write(v, mfn, addr, bytes);
+ else
+ rv = X86EMUL_CMPXCHG_FAILED;
+
+ SHADOW2_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
+ " wanted %#lx now %#lx bytes %u\n",
+ vaddr, prev, old, new, *(unsigned long *)addr, bytes);
+
+ /* If we are writing zeros to this page, might want to unshadow */
+ if ( *(u8 *)addr == 0 )
+ check_for_early_unshadow(v, mfn);
+
+ sh2_unmap_domain_page(addr);
+ shadow2_audit_tables(v);
+ check_for_early_unshadow(v, mfn);
+ return rv;
+}
+
+int
+sh2_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr,
+ unsigned long old_lo, unsigned long old_hi,
+ unsigned long new_lo, unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt)
+{
+ mfn_t mfn;
+ void *addr;
+ u64 old, new, prev;
+ int rv = X86EMUL_CONTINUE;
+
+ ASSERT(shadow2_lock_is_acquired(v->domain));
+
+ if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+ return X86EMUL_PROPAGATE_FAULT;
+
+ old = (((u64) old_hi) << 32) | (u64) old_lo;
+ new = (((u64) new_hi) << 32) | (u64) new_lo;
+ prev = cmpxchg(((u64 *)addr), old, new);
+
+ if ( (prev == old) )
+ shadow2_validate_guest_pt_write(v, mfn, addr, 8);
+ else
+ rv = X86EMUL_CMPXCHG_FAILED;
+
+ /* If we are writing zeros to this page, might want to unshadow */
+ if ( *(u8 *)addr == 0 )
+ check_for_early_unshadow(v, mfn);
+
+ sh2_unmap_domain_page(addr);
+ shadow2_audit_tables(v);
+ check_for_early_unshadow(v, mfn);
+ return rv;
+}
+
+
+/**************************************************************************/
+/* Audit tools */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+
+#define AUDIT_FAIL(_level, _fmt, _a...) do { \
+ printk("Shadow2 %u-on-%u audit failed at level %i, index %i\n" \
+ "gl" #_level "mfn = %" SH2_PRI_mfn \
+ " sl" #_level "mfn = %" SH2_PRI_mfn \
+ " &gl" #_level "e = %p &sl" #_level "e = %p" \
+ " gl" #_level "e = %" SH2_PRI_gpte \
+ " sl" #_level "e = %" SH2_PRI_pte "\nError: " _fmt "\n", \
+ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \
+ _level, guest_index(gl ## _level ## e), \
+ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \
+ gl ## _level ## e, sl ## _level ## e, \
+ gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+ ##_a); \
+ BUG(); \
+ done = 1; \
+} while (0)
+
+
+static char * sh2_audit_flags(struct vcpu *v, int level,
+ int gflags, int sflags)
+/* Common code for auditing flag bits */
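+/* (The checks below are deliberately asymmetric: the shadow may grant
+ * fewer rights than the guest -- e.g. write-protected pagetable mappings,
+ * or entries withheld until the guest's accessed/dirty bits are set --
+ * but it must never grant more, and any accessed/dirty bits the shadow
+ * does have set must already have been propagated to the guest entry.
+ * The USER and NX bits, by contrast, must match exactly.) */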
+{
+ if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
+ return "shadow is present but guest is not present";
+ if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) )
+ return "global bit set in PV shadow";
+ if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
+ && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) )
+ return "dirty bit not propagated";
+ if ( level == 2 && (sflags & _PAGE_PSE) )
+ return "PS bit set in shadow";
+#if SHADOW_PAGING_LEVELS == 3
+ if ( level == 3 ) return NULL; /* All the other bits are blank in PAE l3 */
+#endif
+ if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) )
+ return "user/supervisor bit does not match";
+ if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) )
+ return "NX bit does not match";
+ if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) )
+ return "shadow grants write access but guest does not";
+ if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) )
+ return "accessed bit not propagated";
+ return NULL;
+}
+
+static inline mfn_t
+audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
+/* Convert this gfn to an mfn in the manner appropriate for the
+ * guest pagetable it's used in (gmfn) */
+{
+ if ( !shadow2_mode_translate(v->domain) )
+ return _mfn(gfn_x(gfn));
+
+ if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
+ != PGT_writable_page )
+ return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
+ else
+ return sh2_gfn_to_mfn(v->domain, gfn_x(gfn));
+}
+
+
+int sh2_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+{
+ guest_l1e_t *gl1e, *gp;
+ shadow_l1e_t *sl1e;
+ mfn_t mfn, gmfn, gl1mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+
+ /* Follow the backpointer */
+ gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
+ gl1e = gp = sh2_map_domain_page(gl1mfn);
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
+
+ s = sh2_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
+ shadow_l1e_get_flags(*sl1e));
+ if ( s ) AUDIT_FAIL(1, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l1e_get_gfn(*gl1e);
+ mfn = shadow_l1e_get_mfn(*sl1e);
+ gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(1, "bad translation: gfn %" SH2_PRI_gfn
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return done;
+}
+
+int sh2_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+{
+ guest_l1e_t *gl1e, e;
+ shadow_l1e_t *sl1e;
+ mfn_t gl1mfn = _mfn(INVALID_MFN);
+ int f;
+ int done = 0;
+
+ /* fl1 has no useful backpointer: all we can check are flags */
+ e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
+ SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
+ f = shadow_l1e_get_flags(*sl1e);
+ f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
+ if ( !(f == 0
+ || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+ _PAGE_ACCESSED|_PAGE_DIRTY)
+ || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
+ AUDIT_FAIL(1, "fl1e has bad flags");
+ });
+ return 0;
+}
+
+int sh2_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
+{
+ guest_l2e_t *gl2e, *gp;
+ shadow_l2e_t *sl2e;
+ mfn_t mfn, gmfn, gl2mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+#if GUEST_PAGING_LEVELS != 4
+ int xen_mappings = !shadow2_mode_external(v->domain);
+#endif
+
+ /* Follow the backpointer */
+ gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
+ gl2e = gp = sh2_map_domain_page(gl2mfn);
+ SHADOW2_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
+
+ s = sh2_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
+ shadow_l2e_get_flags(*sl2e));
+ if ( s ) AUDIT_FAIL(2, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l2e_get_gfn(*gl2e);
+ mfn = shadow_l2e_get_mfn(*sl2e);
+ gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)
+ ? get_fl1_shadow_status(v, gfn)
+ : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn),
+ PGC_SH2_l1_shadow);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(2, "bad translation: gfn %" SH2_PRI_gfn
+ " (--> %" SH2_PRI_mfn ")"
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn),
+ (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
+ : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
+ mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return 0;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh2_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
+{
+ guest_l3e_t *gl3e, *gp;
+ shadow_l3e_t *sl3e;
+ mfn_t mfn, gmfn, gl3mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+
+ /* Follow the backpointer */
+ gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
+ gl3e = gp = sh2_map_domain_page(gl3mfn);
+ SHADOW2_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
+
+ s = sh2_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
+ shadow_l3e_get_flags(*sl3e));
+ if ( s ) AUDIT_FAIL(3, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l3e_get_gfn(*gl3e);
+ mfn = shadow_l3e_get_mfn(*sl3e);
+ gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn),
+ (GUEST_PAGING_LEVELS == 3
+ && !shadow2_mode_external(v->domain)
+ && (guest_index(gl3e) % 4) == 3)
+ ? PGC_SH2_l2h_pae_shadow
+ : PGC_SH2_l2_shadow);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(3, "bad translation: gfn %" SH2_PRI_gfn
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh2_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
+{
+ guest_l4e_t *gl4e, *gp;
+ shadow_l4e_t *sl4e;
+ mfn_t mfn, gmfn, gl4mfn;
+ gfn_t gfn;
+ char *s;
+ int done = 0;
+ int xen_mappings = !shadow2_mode_external(v->domain);
+
+ /* Follow the backpointer */
+ gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
+ gl4e = gp = sh2_map_domain_page(gl4mfn);
+ SHADOW2_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
+ {
+ s = sh2_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
+ shadow_l4e_get_flags(*sl4e));
+ if ( s ) AUDIT_FAIL(4, "%s", s);
+
+ if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+ {
+ gfn = guest_l4e_get_gfn(*gl4e);
+ mfn = shadow_l4e_get_mfn(*sl4e);
+ gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn),
+ PGC_SH2_l3_shadow);
+ if ( mfn_x(gmfn) != mfn_x(mfn) )
+ AUDIT_FAIL(4, "bad translation: gfn %" SH2_PRI_gfn
+ " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+ }
+ });
+ sh2_unmap_domain_page(gp);
+ return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#undef AUDIT_FAIL
+
+#endif /* Audit code */
+
+/**************************************************************************/
+/* Entry points into this mode of the shadow code.
+ * This will all be mangled by the preprocessor to uniquify everything. */
+struct shadow2_entry_points shadow2_entry = {
+ .page_fault = sh2_page_fault,
+ .invlpg = sh2_invlpg,
+ .gva_to_gpa = sh2_gva_to_gpa,
+ .gva_to_gfn = sh2_gva_to_gfn,
+ .update_cr3 = sh2_update_cr3,
+ .map_and_validate_gl1e = sh2_map_and_validate_gl1e,
+ .map_and_validate_gl2e = sh2_map_and_validate_gl2e,
+ .map_and_validate_gl2he = sh2_map_and_validate_gl2he,
+ .map_and_validate_gl3e = sh2_map_and_validate_gl3e,
+ .map_and_validate_gl4e = sh2_map_and_validate_gl4e,
+ .detach_old_tables = sh2_detach_old_tables,
+ .x86_emulate_write = sh2_x86_emulate_write,
+ .x86_emulate_cmpxchg = sh2_x86_emulate_cmpxchg,
+ .x86_emulate_cmpxchg8b = sh2_x86_emulate_cmpxchg8b,
+ .make_monitor_table = sh2_make_monitor_table,
+ .destroy_monitor_table = sh2_destroy_monitor_table,
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+ .guess_wrmap = sh2_guess_wrmap,
+#endif
+ .guest_levels = GUEST_PAGING_LEVELS,
+ .shadow_levels = SHADOW_PAGING_LEVELS,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */