diff options
Diffstat (limited to 'netbsd-2.0-xen-sparse')
27 files changed, 20351 insertions, 0 deletions
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN new file mode 100644 index 0000000000..2fbb9998ac --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN @@ -0,0 +1,176 @@ +# $NetBSD: XEN,v 1.1.2.2 2004/07/15 20:19:34 he Exp $ + +include "arch/xen/conf/std.xen" + +options INCLUDE_CONFIG_FILE # embed config file in kernel binary + +#options UVMHIST +#options UVMHIST_PRINT +#options SYSCALL_DEBUG + +maxusers 32 # estimated number of users + +# +options XEN +#options DOM0OPS +options HZ=50 + +#options I586_CPU +options I686_CPU + +#options VM86 # virtual 8086 emulation +#options USER_LDT # user-settable LDT; used by WINE + +#options MTRR # memory-type range register syscall support + +#options CONSDEVNAME="\"xencons\"" +#options CONS_OVERRIDE + +options INSECURE # disable kernel security levels - X needs this + +options RTC_OFFSET=0 # hardware clock is this many mins. west of GMT +#options NTP # NTP phase/frequency locked loop + +options KTRACE # system call tracing via ktrace(1) +#options SYSTRACE # system call vetting via systrace(1) + +options SYSVMSG # System V-like message queues +options SYSVSEM # System V-like semaphores +#options SEMMNI=10 # number of semaphore identifiers +#options SEMMNS=60 # number of semaphores in system +#options SEMUME=10 # max number of undo entries per process +#options SEMMNU=30 # number of undo structures in system +options SYSVSHM # System V-like memory sharing +#options SHMMAXPGS=2048 # 2048 pages is the default +options P1003_1B_SEMAPHORE # p1003.1b semaphore support + +options LKM # loadable kernel modules + +options USERCONF # userconf(4) support +options SYSCTL_INCLUDE_DESCR # Include sysctl descriptions in kernel + +# Diagnostic/debugging support options +options DIAGNOSTIC # expensive kernel consistency checks +options DEBUG # expensive debugging checks/support +options KMEMSTATS # kernel memory statistics (vmstat -m) +options DDB # in-kernel debugger +options 
DDB_ONPANIC=1 # see also sysctl(8): `ddb.onpanic' +options DDB_HISTORY_SIZE=512 # enable history editing in DDB +#options KGDB # remote debugger +#options KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600 +makeoptions DEBUG="-g" # compile full symbol table + +#options COMPAT_14 # NetBSD 1.4 +#options COMPAT_15 # NetBSD 1.5 +options COMPAT_16 # NetBSD 1.6 + +##options COMPAT_LINUX # binary compatibility with Linux +#options COMPAT_FREEBSD # binary compatibility with FreeBSD +#options COMPAT_MACH # binary compatibility with Mach binaries +#options COMPAT_DARWIN # binary compatibility with Darwin binaries +#options EXEC_MACHO # exec MACH-O binaries +#options COMPAT_PECOFF # kernel support to run Win32 apps + +file-system FFS # UFS +file-system EXT2FS # second extended file system (linux) +#file-system LFS # log-structured file system +#file-system MFS # memory file system +file-system NFS # Network File System client +#file-system NTFS # Windows/NT file system (experimental) +#file-system CD9660 # ISO 9660 + Rock Ridge file system +#file-system MSDOSFS # MS-DOS file system +file-system FDESC # /dev/fd +file-system KERNFS # /kern +file-system NULLFS # loopback file system +#file-system OVERLAY # overlay file system +#file-system PORTAL # portal filesystem (still experimental) +file-system PROCFS # /proc +#file-system UMAPFS # NULLFS + uid and gid remapping +#file-system UNION # union file system +#file-system SMBFS # experimental - CIFS; also needs nsmb (below) + +#options QUOTA # UFS quotas +#options SOFTDEP # FFS soft updates support. 
+#options NFSSERVER # Network File System server + +options GATEWAY # packet forwarding +options INET # IP + ICMP + TCP + UDP +options INET6 # IPV6 +options IPSEC # IP security +options IPSEC_ESP # IP security (encryption part; define w/IPSEC) +options MROUTING # IP multicast routing +options PFIL_HOOKS # pfil(9) packet filter hooks +options IPFILTER_LOG # ipmon(8) log support + +options NFS_BOOT_DHCP,NFS_BOOT_BOOTPARAM,NFS_BOOT_BOOTSTATIC +#options NFS_BOOTSTATIC_MYIP="\"169.254.1.2\"" +#options NFS_BOOTSTATIC_GWIP="\"169.254.1.1\"" +#options NFS_BOOTSTATIC_MASK="\"255.255.255.0\"" +#options NFS_BOOTSTATIC_SERVADDR="\"169.254.1.1\"" +#options NFS_BOOTSTATIC_SERVER="\"server:/path/to/root\"" + +options WSEMUL_VT100 # VT100 / VT220 emulation +options WS_KERNEL_FG=WSCOL_GREEN +options WSDISPLAY_COMPAT_PCVT # emulate some ioctls +options WSDISPLAY_COMPAT_SYSCONS # emulate some ioctls +options WSDISPLAY_COMPAT_USL # VT handling +options WSDISPLAY_COMPAT_RAWKBD # can get raw scancodes +options WSDISPLAY_DEFAULTSCREENS=4 +options PCDISPLAY_SOFTCURSOR + +config netbsd root on ? type ? +#config netbsd root on wd0a type ffs +#config netbsd root on xennet0 type nfs + +mainbus0 at root + +cpu* at mainbus? + +hypervisor* at mainbus? # Xen hypervisor + +npx0 at hypervisor? # x86 math coprocessor + +xencons* at hypervisor? # Xen virtual console +xennet* at hypervisor? # Xen virtual network interface + +#xbd* at hypervisor? # Xen virtual block device +#wd* at hypervisor? # Xen vbd (wd identity) +#sd* at hypervisor? # Xen vbd (sd identity) +#cd* at hypervisor? # Xen vbd (cd identity) + +#xenkbc* at hypervisor? # Xen Keyboard/Mouse Interface +#pckbd* at xenkbc? # Keyboard +#vga* at hypervisor? # Xen VGA display +#pms* at xenkbc? # PS/2 Mouse for wsmouse + +#wskbd* at pckbd? console ? +#wsdisplay* at vga? console ? +#wsmouse* at pms? 
mux 0 + + +include "arch/xen/conf/GENERIC.local" + + +pseudo-device ccd 4 # concatenated/striped disk devices +#pseudo-device cgd 4 # cryptographic disk devices +#pseudo-device md 1 # memory disk device (ramdisk) +#pseudo-device vnd 4 # disk-like interface to files + +pseudo-device bpfilter 8 # Berkeley packet filter +pseudo-device ipfilter # IP filter (firewall) and NAT +pseudo-device loop # network loopback +#pseudo-device tun 2 # network tunneling over tty +#pseudo-device gre 2 # generic L3 over IP tunnel +#pseudo-device gif 4 # IPv[46] over IPv[46] tunnel (RFC1933) +#pseudo-device faith 1 # IPv[46] tcp relay translation i/f +#pseudo-device stf 1 # 6to4 IPv6 over IPv4 encapsulation +#pseudo-device vlan # IEEE 802.1q encapsulation +#pseudo-device bridge # simple inter-network bridging + +pseudo-device pty # pseudo-terminals +pseudo-device rnd # /dev/random and in-kernel generator +pseudo-device clockctl # user control of clock subsystem + +pseudo-device wsmux # mouse & keyboard multiplexor +pseudo-device wsfont +pseudo-device ksyms # /dev/ksyms diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen new file mode 100644 index 0000000000..12f6bfa1d5 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen @@ -0,0 +1,232 @@ +# $NetBSD: files.xen,v 1.3.2.1 2004/05/22 15:59:02 he Exp $ +# NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp +# NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp + +maxpartitions 8 + +maxusers 2 16 128 + +# Processor type options. +defflag opt_cputype.h I686_CPU + +# delay before cpu_reset() for reboot. 
+defparam CPURESET_DELAY + +# No unmapped page below kernel stack +defflag NOREDZONE + +# Beep on halt +defflag opt_beep.h BEEP_ONHALT +defparam opt_beep.h BEEP_ONHALT_COUNT +defparam opt_beep.h BEEP_ONHALT_PITCH BEEP_ONHALT_PERIOD + +file arch/xen/i386/autoconf.c +file arch/i386/i386/db_dbgreg.S ddb | kstack_check_dr0 +file arch/i386/i386/db_disasm.c ddb +file arch/i386/i386/db_interface.c ddb +file arch/i386/i386/db_memrw.c ddb | kgdb +file arch/i386/i386/db_trace.c ddb +file kern/subr_disk_mbr.c disk +file arch/xen/i386/gdt.c +file arch/xen/i386/hypervisor_machdep.c +file arch/i386/i386/in_cksum.S inet | inet6 +file arch/i386/i386/ipkdb_glue.c ipkdb +file arch/i386/i386/kgdb_machdep.c kgdb +file arch/xen/i386/machdep.c +file arch/xen/i386/identcpu.c +file arch/i386/i386/math_emulate.c math_emulate +file arch/i386/i386/mem.c +file kern/kern_microtime.c i586_cpu | i686_cpu +file arch/i386/i386/mtrr_k6.c mtrr +file netns/ns_cksum.c ns +file arch/xen/i386/pmap.c +file arch/i386/i386/process_machdep.c +file arch/i386/i386/procfs_machdep.c procfs +file arch/xen/i386/sys_machdep.c +file arch/i386/i386/syscall.c +file arch/xen/i386/trap.c +file arch/i386/i386/vm_machdep.c +file arch/xen/i386/xen_machdep.c + +file arch/xen/xen/xen_debug.c + +file arch/xen/xen/clock.c +file arch/xen/xen/evtchn.c +file arch/xen/xen/ctrl_if.c + +file dev/cons.c + +file arch/i386/i386/mptramp.S multiprocessor +file arch/i386/i386/ipifuncs.c multiprocessor + +file arch/i386/i386/pmc.c perfctrs + +file crypto/des/arch/i386/des_enc.S des +file crypto/des/arch/i386/des_cbc.S des + +file crypto/blowfish/arch/i386/bf_enc.S blowfish +file crypto/blowfish/arch/i386/bf_cbc.S blowfish & !i386_cpu + +# +# Machine-independent SCSI drivers +# + +#xxx include "dev/scsipi/files.scsipi" + +# +# Machine-independent ATA drivers +# + +#xxx include "dev/ata/files.ata" + +# Memory Disk for install floppy +file dev/md_root.c memory_disk_hooks + +# +define mainbus { [apid = -1] } + +file arch/x86/x86/bus_dma.c 
+file arch/xen/x86/bus_space.c +file arch/x86/x86/cacheinfo.c +file arch/xen/x86/consinit.c +file arch/xen/x86/intr.c +file arch/x86/x86/ipi.c multiprocessor +file arch/x86/x86/lock_machdep.c lockdebug +file arch/x86/x86/softintr.c + +include "arch/xen/conf/files.compat" + +# +# System bus types +# + +device mainbus: mainbus +attach mainbus at root +file arch/xen/i386/mainbus.c mainbus + +# Xen hypervisor +device hypervisor { } +attach hypervisor at mainbus +file arch/xen/xen/hypervisor.c hypervisor needs-flag + +# Numeric Processing Extension; Math Co-processor +device npx +file arch/xen/i386/npx.c npx needs-flag + +attach npx at hypervisor with npx_hv +file arch/xen/i386/npx_hv.c npx_hv + +# Xen console support +device xencons: tty +attach xencons at hypervisor +file arch/xen/xen/xencons.c xencons needs-flag + +include "dev/wscons/files.wscons" +include "dev/wsfont/files.wsfont" + +include "dev/pckbport/files.pckbport" + +# CPUS + +define cpu { [apid = -1] } +device cpu +attach cpu at mainbus +file arch/xen/i386/cpu.c cpu + +# +# Compatibility modules +# + +# VM86 mode +file arch/i386/i386/vm86.c vm86 + +# VM86 in kernel +file arch/i386/i386/kvm86.c kvm86 +file arch/i386/i386/kvm86call.S kvm86 + +# Binary compatibility with previous NetBSD releases (COMPAT_XX) +file arch/i386/i386/compat_13_machdep.c compat_13 | compat_aout +file arch/i386/i386/compat_16_machdep.c compat_16 | compat_ibcs2 + +# SVR4 binary compatibility (COMPAT_SVR4) +include "compat/svr4/files.svr4" +file arch/i386/i386/svr4_machdep.c compat_svr4 +file arch/i386/i386/svr4_sigcode.S compat_svr4 +file arch/i386/i386/svr4_syscall.c compat_svr4 + +# MACH binary compatibility (COMPAT_MACH) +include "compat/mach/files.mach" +file arch/i386/i386/mach_machdep.c compat_mach | compat_darwin +file arch/i386/i386/mach_sigcode.S compat_mach | compat_darwin +file arch/i386/i386/mach_syscall.c compat_mach | compat_darwin +file arch/i386/i386/macho_machdep.c exec_macho + +# DARWIN binary compatibility 
(COMPAT_DARWIN) +include "compat/darwin/files.darwin" +file arch/i386/i386/darwin_machdep.c compat_darwin + +# iBCS-2 binary compatibility (COMPAT_IBCS2) +include "compat/ibcs2/files.ibcs2" +file arch/i386/i386/ibcs2_machdep.c compat_ibcs2 +file arch/i386/i386/ibcs2_sigcode.S compat_ibcs2 +file arch/i386/i386/ibcs2_syscall.c compat_ibcs2 + +# Linux binary compatibility (COMPAT_LINUX) +include "compat/linux/files.linux" +include "compat/linux/arch/i386/files.linux_i386" +file arch/i386/i386/linux_sigcode.S compat_linux +file arch/i386/i386/linux_syscall.c compat_linux +file arch/i386/i386/linux_trap.c compat_linux + +# FreeBSD binary compatibility (COMPAT_FREEBSD) +include "compat/freebsd/files.freebsd" +file arch/i386/i386/freebsd_machdep.c compat_freebsd +file arch/i386/i386/freebsd_sigcode.S compat_freebsd +file arch/i386/i386/freebsd_syscall.c compat_freebsd + +# a.out binary compatibility (COMPAT_AOUT) +include "compat/aout/files.aout" + +# Win32 binary compatibility (COMPAT_PECOFF) +include "compat/pecoff/files.pecoff" + +# OSS audio driver compatibility +include "compat/ossaudio/files.ossaudio" + +# Xen devices + +# Network driver +device xennet: arp, ether, ifnet +attach xennet at hypervisor +file arch/xen/xen/if_xennet.c xennet needs-flag + +# Block device driver and wd/sd/cd identities +device xbd: disk +attach xbd at hypervisor +file arch/xen/xen/xbd.c xbd | wd | sd | cd needs-flag + +device wd: disk +attach wd at hypervisor + +device sd: disk +attach sd at hypervisor + +device cd: disk +attach cd at hypervisor + +# Keyboard +device xenkbc: pckbport +attach xenkbc at hypervisor +file arch/xen/xen/xenkbc.c xenkbc needs-flag + +# Generic VGA +attach vga at hypervisor with vga_xen +file arch/xen/xen/vga_xen.c vga_xen needs-flag + +# Domain-0 operations +defflag opt_xen.h DOM0OPS +file arch/xen/xen/machmem.c dom0ops +file arch/xen/xen/privcmd.c dom0ops +file arch/xen/xen/vfr.c dom0ops + +include "arch/xen/conf/majors.i386" diff --git 
a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c new file mode 100644 index 0000000000..766b7aaee2 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c @@ -0,0 +1,630 @@ +/* $NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $ */ +/* NetBSD: autoconf.c,v 1.75 2003/12/30 12:33:22 pk Exp */ + +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)autoconf.c 7.1 (Berkeley) 5/9/91 + */ + +/* + * Setup the system to run on the current machine. + * + * Configure() is called at boot time and initializes the vba + * device tables and the memory controller monitoring. Available + * devices are determined (from possibilities mentioned in ioconf.c), + * and the drivers are initialized. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $"); + +#include "opt_compat_oldboot.h" +#include "opt_multiprocessor.h" +#include "opt_nfs_boot.h" +#include "xennet.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/disklabel.h> +#include <sys/conf.h> +#ifdef COMPAT_OLDBOOT +#include <sys/reboot.h> +#endif +#include <sys/device.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/dkio.h> +#include <sys/proc.h> +#include <sys/user.h> + +#ifdef NFS_BOOT_BOOTSTATIC +#include <net/if.h> +#include <net/if_ether.h> +#include <netinet/in.h> +#include <nfs/rpcv2.h> +#include <nfs/nfsproto.h> +#include <nfs/nfs.h> +#include <nfs/nfsmount.h> +#include <nfs/nfsdiskless.h> +#include <machine/if_xennetvar.h> +#endif + +#include <machine/pte.h> +#include <machine/cpu.h> +#include <machine/gdt.h> +#include <machine/pcb.h> +#include <machine/bootinfo.h> + +#include "ioapic.h" +#include "lapic.h" + +#if NIOAPIC > 0 +#include <machine/i82093var.h> +#endif + +#if NLAPIC > 0 
+#include <machine/i82489var.h> +#endif + +static int match_harddisk(struct device *, struct btinfo_bootdisk *); +static void matchbiosdisks(void); +static void findroot(void); +static int is_valid_disk(struct device *); + +extern struct disklist *i386_alldisks; +extern int i386_ndisks; + +#include "bios32.h" +#if NBIOS32 > 0 +#include <machine/bios32.h> +#endif + +#include "opt_pcibios.h" +#ifdef PCIBIOS +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> +#include <i386/pci/pcibios.h> +#endif + +#include "opt_kvm86.h" +#ifdef KVM86 +#include <machine/kvm86.h> +#endif + +#include "opt_xen.h" + +struct device *booted_device; +int booted_partition; + +/* + * Determine i/o configuration for a machine. + */ +void +cpu_configure(void) +{ + + startrtclock(); + +#if NBIOS32 > 0 + bios32_init(); +#endif +#ifdef PCIBIOS + pcibios_init(); +#endif + + /* kvm86 needs a TSS */ + i386_proc0_tss_ldt_init(); +#ifdef KVM86 + kvm86_init(); +#endif + + if (config_rootfound("mainbus", NULL) == NULL) + panic("configure: mainbus not configured"); + +#ifdef INTRDEBUG + intr_printconfig(); +#endif + +#if NIOAPIC > 0 + lapic_set_lvt(); + ioapic_enable(); +#endif + /* resync cr0 after FPU configuration */ + lwp0.l_addr->u_pcb.pcb_cr0 = rcr0(); +#ifdef MULTIPROCESSOR + /* propagate this to the idle pcb's. */ + cpu_init_idle_pcbs(); +#endif + + spl0(); +#if NLAPIC > 0 + lapic_tpr = 0; +#endif +} + +void +cpu_rootconf(void) +{ + findroot(); + matchbiosdisks(); + + printf("boot device: %s\n", + booted_device ? booted_device->dv_xname : "<unknown>"); + + setroot(booted_device, booted_partition); +} + +/* + * XXX ugly bit of code. But, this is the only safe time that the + * match between BIOS disks and native disks can be done. 
+ */ +static void +matchbiosdisks(void) +{ + struct btinfo_biosgeom *big; + struct bi_biosgeom_entry *be; + struct device *dv; + int i, ck, error, m, n; + struct vnode *tv; + char mbr[DEV_BSIZE]; + int dklist_size; + int bmajor; + + big = lookup_bootinfo(BTINFO_BIOSGEOM); + + if (big == NULL) + return; + + /* + * First, count all native disks + */ + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) + if (is_valid_disk(dv)) + i386_ndisks++; + + if (i386_ndisks == 0) + return; + + dklist_size = sizeof (struct disklist) + (i386_ndisks - 1) * + sizeof (struct nativedisk_info); + + /* XXX M_TEMP is wrong */ + i386_alldisks = malloc(dklist_size, M_TEMP, M_NOWAIT); + if (i386_alldisks == NULL) + return; + + memset(i386_alldisks, 0, dklist_size); + + i386_alldisks->dl_nnativedisks = i386_ndisks; + i386_alldisks->dl_nbiosdisks = big->num; + for (i = 0; i < big->num; i++) { + i386_alldisks->dl_biosdisks[i].bi_dev = big->disk[i].dev; + i386_alldisks->dl_biosdisks[i].bi_sec = big->disk[i].sec; + i386_alldisks->dl_biosdisks[i].bi_head = big->disk[i].head; + i386_alldisks->dl_biosdisks[i].bi_cyl = big->disk[i].cyl; + i386_alldisks->dl_biosdisks[i].bi_lbasecs = big->disk[i].totsec; + i386_alldisks->dl_biosdisks[i].bi_flags = big->disk[i].flags; +#ifdef GEOM_DEBUG +#ifdef NOTYET + printf("disk %x: flags %x, interface %x, device %llx\n", + big->disk[i].dev, big->disk[i].flags, + big->disk[i].interface_path, big->disk[i].device_path); +#endif +#endif + } + + /* + * XXX code duplication from findroot() + */ + n = -1; + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (dv->dv_class != DV_DISK) + continue; +#ifdef GEOM_DEBUG + printf("matchbiosdisks: trying to match (%s) %s\n", + dv->dv_xname, dv->dv_cfdata->cf_name); +#endif + if (is_valid_disk(dv)) { + n++; + sprintf(i386_alldisks->dl_nativedisks[n].ni_devname, + "%s%d", dv->dv_cfdata->cf_name, + dv->dv_unit); + + bmajor = devsw_name2blk(dv->dv_xname, NULL, 0); + if (bmajor == -1) + return; 
+ + if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, RAW_PART), + &tv)) + panic("matchbiosdisks: can't alloc vnode"); + + error = VOP_OPEN(tv, FREAD, NOCRED, 0); + if (error) { + vput(tv); + continue; + } + error = vn_rdwr(UIO_READ, tv, mbr, DEV_BSIZE, 0, + UIO_SYSSPACE, 0, NOCRED, NULL, 0); + VOP_CLOSE(tv, FREAD, NOCRED, 0); + if (error) { +#ifdef GEOM_DEBUG + printf("matchbiosdisks: %s: MBR read failure\n", + dv->dv_xname); +#endif + continue; + } + + for (ck = i = 0; i < DEV_BSIZE; i++) + ck += mbr[i]; + for (m = i = 0; i < big->num; i++) { + be = &big->disk[i]; +#ifdef GEOM_DEBUG + printf("match %s with %d ", dv->dv_xname, i); + printf("dev ck %x bios ck %x\n", ck, be->cksum); +#endif + if (be->flags & BI_GEOM_INVALID) + continue; + if (be->cksum == ck && + !memcmp(&mbr[MBR_PART_OFFSET], be->dosparts, + MBR_PART_COUNT * + sizeof (struct mbr_partition))) { +#ifdef GEOM_DEBUG + printf("matched bios disk %x with %s\n", + be->dev, dv->dv_xname); +#endif + i386_alldisks->dl_nativedisks[n]. + ni_biosmatches[m++] = i; + } + } + i386_alldisks->dl_nativedisks[n].ni_nmatches = m; + vput(tv); + } + } +} + +#ifdef COMPAT_OLDBOOT +u_long bootdev = 0; /* should be dev_t, but not until 32 bits */ +#endif + +/* + * helper function for "findroot()": + * return nonzero if disk device matches bootinfo + */ +static int +match_harddisk(struct device *dv, struct btinfo_bootdisk *bid) +{ + struct vnode *tmpvn; + int error; + struct disklabel label; + int found = 0; + int bmajor; + + /* + * A disklabel is required here. The + * bootblocks don't refuse to boot from + * a disk without a label, but this is + * normally not wanted. + */ + if (bid->labelsector == -1) + return(0); + + /* + * lookup major number for disk block device + */ + bmajor = devsw_name2blk(dv->dv_xname, NULL, 0); + if (bmajor == -1) + return(0); /* XXX panic() ??? */ + + /* + * Fake a temporary vnode for the disk, open + * it, and read the disklabel for comparison. 
+ */ + if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, bid->partition), &tmpvn)) + panic("findroot can't alloc vnode"); + error = VOP_OPEN(tmpvn, FREAD, NOCRED, 0); + if (error) { +#ifndef DEBUG + /* + * Ignore errors caused by missing + * device, partition or medium. + */ + if (error != ENXIO && error != ENODEV) +#endif + printf("findroot: can't open dev %s%c (%d)\n", + dv->dv_xname, 'a' + bid->partition, error); + vput(tmpvn); + return(0); + } + error = VOP_IOCTL(tmpvn, DIOCGDINFO, &label, FREAD, NOCRED, 0); + if (error) { + /* + * XXX can't happen - open() would + * have errored out (or faked up one) + */ + printf("can't get label for dev %s%c (%d)\n", + dv->dv_xname, 'a' + bid->partition, error); + goto closeout; + } + + /* compare with our data */ + if (label.d_type == bid->label.type && + label.d_checksum == bid->label.checksum && + !strncmp(label.d_packname, bid->label.packname, 16)) + found = 1; + +closeout: + VOP_CLOSE(tmpvn, FREAD, NOCRED, 0); + vput(tmpvn); + return(found); +} + +/* + * Attempt to find the device from which we were booted. + * If we can do so, and not instructed not to do so, + * change rootdev to correspond to the load device. + */ +void +findroot(void) +{ + struct btinfo_bootdisk *bid; + struct device *dv; + union xen_cmdline_parseinfo xcp; +#ifdef COMPAT_OLDBOOT + int i, majdev, unit, part; + char buf[32]; +#endif + + if (booted_device) + return; + + if (lookup_bootinfo(BTINFO_NETIF)) { + /* + * We got netboot interface information, but + * "device_register()" couldn't match it to a configured + * device. Bootdisk information cannot be present at the + * same time, so give up. + */ + printf("findroot: netboot interface not found\n"); + return; + } + + bid = lookup_bootinfo(BTINFO_BOOTDISK); + if (bid) { + /* + * Scan all disk devices for ones that match the passed data. + * Don't break if one is found, to get possible multiple + * matches - for problem tracking. 
Use the first match anyway + * because lower device numbers are more likely to be the + * boot device. + */ + for (dv = alldevs.tqh_first; dv != NULL; + dv = dv->dv_list.tqe_next) { + if (dv->dv_class != DV_DISK) + continue; + + if (!strcmp(dv->dv_cfdata->cf_name, "fd")) { + /* + * Assume the configured unit number matches + * the BIOS device number. (This is the old + * behaviour.) Needs some ideas how to handle + * BIOS's "swap floppy drive" options. + */ + if ((bid->biosdev & 0x80) || + dv->dv_unit != bid->biosdev) + continue; + + goto found; + } + + if (is_valid_disk(dv)) { + /* + * Don't trust BIOS device numbers, try + * to match the information passed by the + * bootloader instead. + */ + if ((bid->biosdev & 0x80) == 0 || + !match_harddisk(dv, bid)) + continue; + + goto found; + } + + /* no "fd", "wd", "sd", "ld", "ed" */ + continue; + +found: + if (booted_device) { + printf("warning: double match for boot " + "device (%s, %s)\n", + booted_device->dv_xname, dv->dv_xname); + continue; + } + booted_device = dv; + booted_partition = bid->partition; + } + + if (booted_device) + return; + } + + xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp); + + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (is_valid_disk(dv) == 0) + continue; + + if (xcp.xcp_bootdev[0] == 0) { + booted_device = dv; + break; + } + + if (strncmp(xcp.xcp_bootdev, dv->dv_xname, + strlen(dv->dv_xname))) + continue; + + if (strlen(xcp.xcp_bootdev) > strlen(dv->dv_xname)) { + booted_partition = toupper( + xcp.xcp_bootdev[strlen(dv->dv_xname)]) - 'A'; + } + + booted_device = dv; + break; + } + + if (booted_device) + return; + +#ifdef COMPAT_OLDBOOT +#if 0 + printf("howto %x bootdev %x ", boothowto, bootdev); +#endif + + if ((bootdev & B_MAGICMASK) != (u_long)B_DEVMAGIC) + return; + + majdev = (bootdev >> B_TYPESHIFT) & B_TYPEMASK; + name = devsw_blk2name(majdev); + if (name == NULL) + return; + + part = (bootdev >> B_PARTITIONSHIFT) & B_PARTITIONMASK; + unit = (bootdev >> 
B_UNITSHIFT) & B_UNITMASK; + + sprintf(buf, "%s%d", name, unit); + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (strcmp(buf, dv->dv_xname) == 0) { + booted_device = dv; + booted_partition = part; + return; + } + } +#endif +} + +#include "pci.h" + +#include <dev/isa/isavar.h> +#if NPCI > 0 +#include <dev/pci/pcivar.h> +#endif + +void +device_register(struct device *dev, void *aux) +{ + /* + * Handle network interfaces here, the attachment information is + * not available driver independantly later. + * For disks, there is nothing useful available at attach time. + */ +#if NXENNET > 0 + if (dev->dv_class == DV_IFNET) { + union xen_cmdline_parseinfo xcp; + + xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp); + if (strncmp(xcp.xcp_bootdev, dev->dv_xname, 16) == 0) { +#ifdef NFS_BOOT_BOOTSTATIC + nfs_bootstatic_callback = xennet_bootstatic_callback; +#endif + goto found; + } + } +#endif + if (dev->dv_class == DV_IFNET) { + struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF); + if (bin == NULL) + return; + + /* + * We don't check the driver name against the device name + * passed by the boot ROM. The ROM should stay usable + * if the driver gets obsoleted. + * The physical attachment information (checked below) + * must be sufficient to identify the device. + */ + + if (bin->bus == BI_BUS_ISA && + !strcmp(dev->dv_parent->dv_cfdata->cf_name, "isa")) { + struct isa_attach_args *iaa = aux; + + /* compare IO base address */ + /* XXXJRT what about multiple I/O addrs? */ + if (iaa->ia_nio > 0 && + bin->addr.iobase == iaa->ia_io[0].ir_addr) + goto found; + } +#if NPCI > 0 + if (bin->bus == BI_BUS_PCI && + !strcmp(dev->dv_parent->dv_cfdata->cf_name, "pci")) { + struct pci_attach_args *paa = aux; + int b, d, f; + + /* + * Calculate BIOS representation of: + * + * <bus,device,function> + * + * and compare. 
+ */ + pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f); + if (bin->addr.tag == ((b << 8) | (d << 3) | f)) + goto found; + } +#endif + } + return; + +found: + if (booted_device) { + /* XXX should be a "panic()" */ + printf("warning: double match for boot device (%s, %s)\n", + booted_device->dv_xname, dev->dv_xname); + return; + } + booted_device = dev; +} + +static int +is_valid_disk(struct device *dv) +{ + const char *name; + + if (dv->dv_class != DV_DISK) + return (0); + + name = dv->dv_cfdata->cf_name; + + return (strcmp(name, "sd") == 0 || strcmp(name, "wd") == 0 || + strcmp(name, "ld") == 0 || strcmp(name, "ed") == 0 || + strcmp(name, "xbd") == 0); +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c new file mode 100644 index 0000000000..23dd52f1d3 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c @@ -0,0 +1,408 @@ +/* $NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $ */ +/* NetBSD: gdt.c,v 1.32 2004/02/13 11:36:13 wiz Exp */ + +/*- + * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by John T. Kohl and Charles M. Hannum. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. 
Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $"); + +#include "opt_multiprocessor.h" +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/user.h> + +#include <uvm/uvm.h> + +#include <machine/gdt.h> + +int gdt_size[2]; /* total number of GDT entries */ +int gdt_count[2]; /* number of GDT entries in use */ +int gdt_next[2]; /* next available slot for sweeping */ +int gdt_free[2]; /* next free slot; terminated with GNULL_SEL */ + +struct lock gdt_lock_store; + +static __inline void gdt_lock(void); +static __inline void gdt_unlock(void); +void gdt_init(void); +void gdt_grow(int); +int gdt_get_slot(void); +int gdt_get_slot1(int); +void gdt_put_slot(int); +void gdt_put_slot1(int, int); + +/* + * Lock and unlock the GDT, to avoid races in case gdt_{ge,pu}t_slot() sleep + * waiting for memory. 
 *
 * Note that the locking done here is not sufficient for multiprocessor
 * systems.  A freshly allocated slot will still be of type SDT_SYSNULL for
 * some time after the GDT is unlocked, so gdt_compact() could attempt to
 * reclaim it.
 */
/* Take the global GDT lock (may sleep; see comment above). */
static __inline void
gdt_lock()
{

	(void) lockmgr(&gdt_lock_store, LK_EXCLUSIVE, NULL);
}

/* Release the global GDT lock. */
static __inline void
gdt_unlock()
{

	(void) lockmgr(&gdt_lock_store, LK_RELEASE, NULL);
}

/*
 * Install one segment descriptor into slot 'sel' of every CPU's GDT.
 * On Xen the descriptor must be written via the hypervisor, since the
 * GDT pages are mapped read-only for the guest.
 * TSS descriptors are rejected here: under Xen the TSS does not live in
 * the guest GDT (see the disabled printk below).
 */
void
setgdt(int sel, void *base, size_t limit,
    int type, int dpl, int def32, int gran)
{
	struct segment_descriptor sd;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	if (type == SDT_SYS386TSS) {
		/* printk("XXX TSS descriptor not supported in GDT\n"); */
		return;
	}

	setsegment(&sd, base, limit, type, dpl, def32, gran);
	/* Propagate to every CPU that already has a shadow GDT. */
	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_gdt != NULL) {
#ifndef XEN
			ci->ci_gdt[sel].sd = sd;
#else
			xen_update_descriptor(&ci->ci_gdt[sel],
			    (union descriptor *)&sd);
#endif
		}
	}
}

/*
 * Initialize the GDT subsystem.  Called from autoconf().
 */
void
gdt_init()
{
	size_t max_len, min_len;
	union descriptor *old_gdt;
	struct vm_page *pg;
	vaddr_t va;
	struct cpu_info *ci = &cpu_info_primary;

	lockinit(&gdt_lock_store, PZERO, "gdtlck", 0, 0);

	max_len = MAXGDTSIZ * sizeof(gdt[0]);
	min_len = MINGDTSIZ * sizeof(gdt[0]);

	/* Region 0: ordinary descriptors, starts populated with NGDT slots. */
	gdt_size[0] = MINGDTSIZ;
	gdt_count[0] = NGDT;
	gdt_next[0] = NGDT;
	gdt_free[0] = GNULL_SEL;

	/*
	 * Region 1: second descriptor area (used by ldt_alloc() via
	 * gdt_get_slot1(1)); lives at VA offset max_len from the start of
	 * the GDT mapping — see gdt_grow().
	 */
	gdt_size[1] = 0;
	gdt_count[1] = MAXGDTSIZ;
	gdt_next[1] = MAXGDTSIZ;
	gdt_free[1] = GNULL_SEL;

	/*
	 * Reserve VA for both regions (max_len + max_len), but only back
	 * the first min_len bytes with pages now; gdt_grow() maps more
	 * on demand.
	 */
	old_gdt = gdt;
	gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len + max_len);
	for (va = (vaddr_t)gdt; va < (vaddr_t)gdt + min_len; va += PAGE_SIZE) {
		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
		if (pg == NULL) {
			panic("gdt_init: no pages");
		}
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ | VM_PROT_WRITE);
	}
	memcpy(gdt, old_gdt, NGDT * sizeof(gdt[0]));
	ci->ci_gdt = gdt;
	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
	    SDT_MEMRWA, SEL_KPL, 1, 1);

	gdt_init_cpu(ci);
}

/*
 * Allocate shadow GDT for a slave CPU.
 */
void
gdt_alloc_cpu(struct cpu_info *ci)
{
	int max_len = MAXGDTSIZ * sizeof(gdt[0]);
	int min_len = MINGDTSIZ * sizeof(gdt[0]);
	struct vm_page *pg;
	vaddr_t va;

	ci->ci_gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len);
	/* Back the initial min_len bytes; sleep for pages if needed. */
	for (va = (vaddr_t)ci->ci_gdt; va < (vaddr_t)ci->ci_gdt + min_len;
	    va += PAGE_SIZE) {
		while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO))
		       == NULL) {
			uvm_wait("gdt_alloc_cpu");
		}
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ | VM_PROT_WRITE);
	}
	/* Start from the primary CPU's current descriptors. */
	memset(ci->ci_gdt, 0, min_len);
	memcpy(ci->ci_gdt, gdt, gdt_count[0] * sizeof(gdt[0]));
	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
	    SDT_MEMRWA, SEL_KPL, 1, 1);
}


/*
 * Load appropriate gdt descriptor; we better be running on *ci
 * (for the most part, this is how a CPU knows who it is).
+ */ +void +gdt_init_cpu(struct cpu_info *ci) +{ +#ifndef XEN + struct region_descriptor region; + size_t max_len; + + max_len = MAXGDTSIZ * sizeof(gdt[0]); + setregion(®ion, ci->ci_gdt, max_len - 1); + lgdt(®ion); +#else + size_t len = gdt_size[0] * sizeof(gdt[0]); + unsigned long frames[len >> PAGE_SHIFT]; + vaddr_t va; + pt_entry_t *ptp; + pt_entry_t *maptp; + int f; + + for (va = (vaddr_t)ci->ci_gdt, f = 0; + va < (vaddr_t)ci->ci_gdt + len; + va += PAGE_SIZE, f++) { + KASSERT(va >= VM_MIN_KERNEL_ADDRESS); + ptp = kvtopte(va); + frames[f] = *ptp >> PAGE_SHIFT; + maptp = (pt_entry_t *)vtomach((vaddr_t)ptp); + PTE_CLEARBITS(ptp, maptp, PG_RW); + } + PTE_UPDATES_FLUSH(); + /* printk("loading gdt %x, %d entries, %d pages", */ + /* frames[0] << PAGE_SHIFT, gdt_size[0], len >> PAGE_SHIFT); */ + if (HYPERVISOR_set_gdt(frames, gdt_size[0])) + panic("HYPERVISOR_set_gdt failed!\n"); + lgdt_finish(); +#endif +} + +#ifdef MULTIPROCESSOR + +void +gdt_reload_cpu(struct cpu_info *ci) +{ + struct region_descriptor region; + size_t max_len; + + max_len = MAXGDTSIZ * sizeof(gdt[0]); + setregion(®ion, ci->ci_gdt, max_len - 1); + lgdt(®ion); +} +#endif + + +/* + * Grow the GDT. 
 */
void
gdt_grow(int which)
{
	size_t old_len, new_len, max_len;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct vm_page *pg;
	vaddr_t va;

	/* Double the region size; new_len/old_len are byte lengths. */
	old_len = gdt_size[which] * sizeof(gdt[0]);
	gdt_size[which] <<= 1;
	new_len = old_len << 1;

	if (which != 0) {
		/*
		 * Region 1 lives at offset max_len in the primary CPU's
		 * GDT mapping (see gdt_init()); on first growth (old_len
		 * == 0) start it at MINGDTSIZ entries.
		 */
		max_len = MAXGDTSIZ * sizeof(gdt[0]);
		if (old_len == 0) {
			gdt_size[which] = MINGDTSIZ;
			new_len = gdt_size[which] * sizeof(gdt[0]);
		}
		for (va = (vaddr_t)(cpu_info_primary.ci_gdt) + old_len + max_len;
		     va < (vaddr_t)(cpu_info_primary.ci_gdt) + new_len + max_len;
		     va += PAGE_SIZE) {
			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
			    NULL) {
				uvm_wait("gdt_grow");
			}
			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
			    VM_PROT_READ | VM_PROT_WRITE);
		}
		return;
	}

	/* Region 0 is replicated on every CPU's shadow GDT: grow them all. */
	for (CPU_INFO_FOREACH(cii, ci)) {
		for (va = (vaddr_t)(ci->ci_gdt) + old_len;
		     va < (vaddr_t)(ci->ci_gdt) + new_len;
		     va += PAGE_SIZE) {
			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
			    NULL) {
				uvm_wait("gdt_grow");
			}
			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
			    VM_PROT_READ | VM_PROT_WRITE);
		}
	}
}

/*
 * Allocate a GDT slot as follows:
 * 1) If there are entries on the free list, use those.
 * 2) If there are fewer than gdt_size entries in use, there are free slots
 *    near the end that we can sweep through.
 * 3) As a last resort, we increase the size of the GDT, and sweep through
 *    the new slots.
 */
int
gdt_get_slot()
{
	/* Convenience wrapper: allocate from region 0. */
	return gdt_get_slot1(0);
}

int
gdt_get_slot1(int which)
{
	size_t offset;
	int slot;

	gdt_lock();

	if (gdt_free[which] != GNULL_SEL) {
		/* Reuse a previously freed slot; the free list is chained
		 * through the gd_selector field of dead descriptors. */
		slot = gdt_free[which];
		gdt_free[which] = gdt[slot].gd.gd_selector;
	} else {
		/*
		 * NOTE(review): 'offset' is computed in BYTES
		 * (which * MAXGDTSIZ * sizeof(gdt[0])) but compared
		 * against gdt_next/gdt_count, which elsewhere hold slot
		 * indices — for which != 0 this looks like a unit
		 * mismatch; confirm against the stock NetBSD gdt.c.
		 */
		offset = which * MAXGDTSIZ * sizeof(gdt[0]);
		if (gdt_next[which] != gdt_count[which] + offset)
			panic("gdt_get_slot botch 1");
		if (gdt_next[which] - offset >= gdt_size[which]) {
			if (gdt_size[which] >= MAXGDTSIZ)
				panic("gdt_get_slot botch 2");
			gdt_grow(which);
		}
		slot = gdt_next[which]++;
	}

	gdt_count[which]++;
	gdt_unlock();
	return (slot);
}

/*
 * Deallocate a GDT slot, putting it on the free list.
 */
void
gdt_put_slot(int slot)
{
	gdt_put_slot1(slot, 0);
}

void
gdt_put_slot1(int slot, int which)
{

	gdt_lock();
	gdt_count[which]--;

	/* Mark dead and thread onto the region's free list. */
	gdt[slot].gd.gd_type = SDT_SYSNULL;
	gdt[slot].gd.gd_selector = gdt_free[which];
	gdt_free[which] = slot;

	gdt_unlock();
}

/*
 * Allocate a GDT slot holding a TSS descriptor for 'pcb' and return its
 * kernel selector.  NOTE(review): under Xen, setgdt() silently refuses
 * SDT_SYS386TSS, so the returned selector's slot stays null — confirm
 * callers tolerate this.
 */
int
tss_alloc(struct pcb *pcb)
{
	int slot;

	slot = gdt_get_slot();
	setgdt(slot, &pcb->pcb_tss, sizeof(struct pcb) - 1,
	    SDT_SYS386TSS, SEL_KPL, 0, 0);
	return GSEL(slot, SEL_KPL);
}

void
tss_free(int sel)
{

	gdt_put_slot(IDXSEL(sel));
}

/*
 * Caller must have pmap locked for both of these functions.
 */
void
ldt_alloc(struct pmap *pmap, union descriptor *ldt, size_t len)
{
	int slot;

	/* LDT descriptors are allocated from region 1 of the GDT. */
	slot = gdt_get_slot1(1);
#ifndef XEN
	setgdt(slot, ldt, len - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
#else
	/*
	 * Xen keeps LDT base/size in its own descriptor format rather
	 * than a native SDT_SYSLDT segment descriptor.
	 */
	cpu_info_primary.ci_gdt[slot].ld.ld_base = (uint32_t)ldt;
	cpu_info_primary.ci_gdt[slot].ld.ld_entries =
	    len / sizeof(union descriptor);
#endif
	pmap->pm_ldt_sel = GSEL(slot, SEL_KPL);
}

void
ldt_free(struct pmap *pmap)
{
	int slot;

	slot = IDXSEL(pmap->pm_ldt_sel);

	gdt_put_slot1(slot, 1);
}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
new file mode 100644
index 0000000000..e08b5a64bd
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
@@ -0,0 +1,230 @@
/* $NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $ */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/****************************************************************************** + * hypervisor.c + * + * Communication to/from hypervisor. + * + * Copyright (c) 2002-2004, K A Fraser + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $");

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>

#include <machine/xen.h>
#include <machine/hypervisor.h>
#include <machine/evtchn.h>

/*
 * Force a proper event-channel callback from Xen after clearing the
 * callback mask. We do this in a very simple manner, by making a call
 * down into Xen. The pending flag will be checked by Xen on return.
 */
void
hypervisor_force_callback(void)
{

	(void)HYPERVISOR_xen_version(0);
}

/*
 * Scan the shared-info event-channel bitmaps for pending events, mark
 * the corresponding IRQs pending on this CPU, and return nonzero if any
 * pending IRQ's handler level exceeds the current interrupt level (i.e.
 * an interrupt should be processed now).
 */
int stipending(void);
int
stipending()
{
	uint32_t l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	int irq;
	shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	int ret;

	ret = 0;
	ci = curcpu();

#if 0
	if (HYPERVISOR_shared_info->events)
		printf("stipending events %08lx mask %08lx ilevel %d\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
#endif

	/*
	 * we're only called after STIC, so we know that we'll have to
	 * STI at the end
	 */
	cli();
	while (s->vcpu_data[0].evtchn_upcall_pending) {
		s->vcpu_data[0].evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
		/* Two-level scan: selector word, then 32-bit pending words. */
		while ((l1i = ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1 << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			while ((l2i = ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1 << l2i);

				port = (l1i << 5) + l2i;
				if ((irq = evtchn_to_irq[port]) != -1) {
					hypervisor_acknowledge_irq(irq);
					ci->ci_ipending |= (1 << irq);
					if (ret == 0 && ci->ci_ilevel <
					    ci->ci_isources[irq]->is_handlers
					    ->ih_level)
						ret = 1;
				}
#if 0 /* XXXcl dev/evtchn */
				else
					evtchn_device_upcall(port);
#endif
			}
		}
	}
	sti();

#if 0
	if (ci->ci_ipending & 0x1)
		printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
		    ci->ci_ipending);
#endif

	return (ret);
}

/*
 * Upcall entry from Xen: dispatch every pending, unmasked event channel
 * to its IRQ handler via do_event().  Same two-level bitmap scan as
 * stipending() above.
 */
void do_hypervisor_callback(struct trapframe *regs)
{
	uint32_t l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	int irq;
	shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	int level;

	ci = curcpu();
	level = ci->ci_ilevel;

	while (s->vcpu_data[0].evtchn_upcall_pending) {
		s->vcpu_data[0].evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
		while ((l1i = ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1 << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			while ((l2i = ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1 << l2i);

				port = (l1i << 5) + l2i;
				if ((irq = evtchn_to_irq[port]) != -1)
					do_event(irq, regs);
#if 0 /* XXXcl dev/evtchn */
				else
					evtchn_device_upcall(port);
#endif
			}
		}
	}

#ifdef DIAGNOSTIC
	if (level != ci->ci_ilevel)
		printf("hypervisor done %08x level %d/%d ipending %08x\n",
		    HYPERVISOR_shared_info->evtchn_pending_sel, level,
		    ci->ci_ilevel, ci->ci_ipending);
#endif
}

/* Unmask an event channel, re-raising it if it was pending while masked. */
void hypervisor_unmask_event(unsigned int ev)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	x86_atomic_clear_bit(&s->evtchn_mask[0], ev);
	/*
	 * The following is basically the equivalent of
	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
	 * interrupt edge' if the channel is masked.
	 */
	if (x86_atomic_test_bit(&s->evtchn_pending[0], ev) &&
	    !x86_atomic_test_and_set_bit(&s->evtchn_pending_sel, ev>>5)) {
		s->vcpu_data[0].evtchn_upcall_pending = 1;
		if (!s->vcpu_data[0].evtchn_upcall_mask)
			hypervisor_force_callback();
	}
}

/* Mask an event channel: pending bits accumulate but no upcall fires. */
void hypervisor_mask_event(unsigned int ev)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	x86_atomic_set_bit(&s->evtchn_mask[0], ev);
}

/* Clear a channel's pending bit (acknowledge). */
void hypervisor_clear_event(unsigned int ev)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	x86_atomic_clear_bit(&s->evtchn_pending[0], ev);
}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
new file mode 100644
index 0000000000..45af67272f
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
@@ -0,0 +1,2000 @@
/* $NetBSD: locore.S,v 1.2.2.1 2004/05/22 15:59:48 he Exp $ */
/* NetBSD: locore.S,v 1.26 2004/04/12 13:17:46 yamt Exp */

/*-
 * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)locore.s 7.3 (Berkeley) 5/13/91 + */ + +#include "opt_compat_netbsd.h" +#include "opt_compat_oldboot.h" +#include "opt_cputype.h" +#include "opt_ddb.h" +#include "opt_ipkdb.h" +#include "opt_lockdebug.h" +#include "opt_multiprocessor.h" +#include "opt_realmem.h" +#include "opt_user_ldt.h" +#include "opt_vm86.h" +#include "opt_xen.h" + +#include "npx.h" +#include "assym.h" +#include "apm.h" +#include "lapic.h" +#include "ioapic.h" +#include "ksyms.h" + +#include <sys/errno.h> +#include <sys/syscall.h> + +#include <machine/cputypes.h> +#include <machine/param.h> +#include <machine/pte.h> +#include <machine/segments.h> +#include <machine/specialreg.h> +#include <machine/trap.h> +#include <machine/bootinfo.h> + +#if NLAPIC > 0 +#include <machine/i82489reg.h> +#endif + +/* LINTSTUB: include <sys/types.h> */ +/* LINTSTUB: include <machine/cpu.h> */ +/* LINTSTUB: include <sys/systm.h> */ + +#include <machine/asm.h> + +#if defined(MULTIPROCESSOR) + +#define SET_CURLWP(lwp,cpu) \ + movl CPUVAR(SELF),cpu ; \ + movl lwp,CPUVAR(CURLWP) ; \ + movl cpu,L_CPU(lwp) + +#else + +#define SET_CURLWP(lwp,tcpu) movl lwp,CPUVAR(CURLWP) +#define GET_CURLWP(reg) movl CPUVAR(CURLWP),reg + +#endif + +#define GET_CURPCB(reg) movl CPUVAR(CURPCB),reg +#define SET_CURPCB(reg) movl reg,CPUVAR(CURPCB) + +#define CLEAR_RESCHED(reg) movl reg,CPUVAR(RESCHED) + +/* XXX temporary kluge; these should not be here */ +/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */ +#include <dev/isa/isareg.h> + + +/* Disallow old names for REALBASEMEM */ +#ifdef BIOSBASEMEM +#error BIOSBASEMEM option deprecated; use REALBASEMEM only if memory size reported by latest boot block is incorrect +#endif + +/* Disallow old names for REALEXTMEM */ +#ifdef EXTMEM_SIZE +#error EXTMEM_SIZE option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect +#endif +#ifdef BIOSEXTMEM +#error BIOSEXTMEM option deprecated; use REALEXTMEM only if memory size reported by latest boot 
block is incorrect +#endif + +#include <machine/frameasm.h> + + +#ifdef MULTIPROCESSOR +#include <machine/i82489reg.h> +#endif + +/* + * PTmap is recursive pagemap at top of virtual address space. + * Within PTmap, the page directory can be found (third indirection). + * + * XXX 4 == sizeof pde + */ + .set _C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT) + .set _C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE) + .set _C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4) + +/* + * APTmap, APTD is the alternate recursive pagemap. + * It's used when modifying another process's page tables. + * + * XXX 4 == sizeof pde + */ + .set _C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT) + .set _C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE) + .set _C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4) + + +/* + * Xen guest identifier and loader selection + */ +.section __xen_guest + .asciz "GUEST_OS=netbsd,GUEST_VER=2.0,XEN_VER=2.0,LOADER=generic" + + +/* + * Initialization + */ + .data + + .globl _C_LABEL(cpu) + .globl _C_LABEL(esym),_C_LABEL(boothowto) + .globl _C_LABEL(bootinfo),_C_LABEL(atdevbase) +#ifdef COMPAT_OLDBOOT + .globl _C_LABEL(bootdev) +#endif + .globl _C_LABEL(proc0paddr),_C_LABEL(PTDpaddr) + .globl _C_LABEL(biosbasemem),_C_LABEL(biosextmem) + .globl _C_LABEL(gdt) +#ifdef I586_CPU + .globl _C_LABEL(idt) +#endif + .globl _C_LABEL(lapic_tpr) + +#if NLAPIC > 0 +#ifdef __ELF__ + .align PAGE_SIZE +#else + .align 12 +#endif + .globl _C_LABEL(local_apic), _C_LABEL(lapic_id) +_C_LABEL(local_apic): + .space LAPIC_ID +_C_LABEL(lapic_id): + .long 0x00000000 + .space LAPIC_TPRI-(LAPIC_ID+4) +_C_LABEL(lapic_tpr): + .space LAPIC_PPRI-LAPIC_TPRI +_C_LABEL(lapic_ppr): + .space LAPIC_ISR-LAPIC_PPRI +_C_LABEL(lapic_isr): + .space PAGE_SIZE-LAPIC_ISR +#else +_C_LABEL(lapic_tpr): + .long 0 +#endif + + +_C_LABEL(cpu): .long 0 # are we 386, 386sx, or 486, + # or Pentium, or.. 
+_C_LABEL(esym): .long 0 # ptr to end of syms +_C_LABEL(atdevbase): .long 0 # location of start of iomem in virtual +_C_LABEL(proc0paddr): .long 0 +_C_LABEL(PTDpaddr): .long 0 # paddr of PTD, for libkvm +#ifndef REALBASEMEM +_C_LABEL(biosbasemem): .long 0 # base memory reported by BIOS +#else +_C_LABEL(biosbasemem): .long REALBASEMEM +#endif +#ifndef REALEXTMEM +_C_LABEL(biosextmem): .long 0 # extended memory reported by BIOS +#else +_C_LABEL(biosextmem): .long REALEXTMEM +#endif + +#include <machine/xen.h> +#define __HYPERVISOR_yield 8 + + .space 512 +tmpstk: + .long tmpstk, __KERNEL_DS + + +#define _RELOC(x) ((x)) +#define RELOC(x) _RELOC(_C_LABEL(x)) + +/* XXX assym.h */ +#define MOD_START 48 +#define MOD_LEN 56 +/* XXX assym.h */ + + .text + .globl _C_LABEL(kernel_text) + .set _C_LABEL(kernel_text),KERNTEXTOFF + + .globl start +start: + cld + + lss tmpstk,%esp # bootstrap stack end location + + movl %esi,%ebx # save start_info pointer + +#if (NKSYMS || defined(DDB) || defined(LKM)) && !defined(SYMTAB_SPACE) + /* Save the symbol locations. */ + movl MOD_START(%ebx),%esi + addl MOD_LEN(%ebx),%esi + movl %esi,RELOC(esym) +#endif + + /* Clear BSS first so that there are no surprises... */ + xorl %eax,%eax + movl $RELOC(__bss_start),%edi + movl $RELOC(_end),%ecx + subl %edi,%ecx + rep stosb + + movl %ebx,RELOC(avail_start) + + /* Copy the necessary stuff from start_info structure. */ + /* We need to copy shared_info early, so that sti/cli work */ + movl %ebx,%esi + movl $RELOC(start_info_union),%edi + movl $128,%ecx + rep movsl + + /* (howto, [bootdev], bootinfo, basemem, extmem). */ + xorl %eax,%eax + movl %eax,RELOC(boothowto) +#ifdef COMPAT_OLDBOOT + movl %eax,RELOC(bootdev) +#endif + movl $0x20000,%eax + movl %eax,RELOC(boothowto) + + /* First, reset the PSL. */ + pushl $PSL_MBO + popfl + + /* Clear segment registers; always null in proc0. 
*/ + xorl %eax,%eax + movw %ax,%fs + movw %ax,%gs + decl %eax + movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL + + xorl %eax,%eax + cpuid + movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL + +/* + * Virtual address space of kernel: + * + * text | data | bss | [syms] | page dir | proc0 kstack + * 0 1 2 3 + */ +#define PROC0PDIR ((0) * PAGE_SIZE) +#define PROC0STACK ((1) * PAGE_SIZE) +#define SYSMAP ((1+UPAGES) * PAGE_SIZE) +#define TABLESIZE ((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */ + + /* Find end of kernel image. */ + movl RELOC(avail_start),%edi + /* Calculate where to start the bootstrap tables. */ + movl %edi,%esi + + /* + * Calculate the size of the kernel page table directory, and + * how many entries it will have. + */ + movl RELOC(nkpde),%ecx # get nkpde + cmpl $NKPTP_MIN,%ecx # larger than min? + jge 1f + movl $NKPTP_MIN,%ecx # set at min + jmp 2f +1: cmpl $NKPTP_MAX,%ecx # larger than max? + jle 2f + movl $NKPTP_MAX,%ecx +2: + + /* Clear memory for bootstrap tables. */ + shll $PGSHIFT,%ecx + addl $TABLESIZE,%ecx + addl %esi,%ecx # end of tables + movl %ecx,RELOC(gdt) + addl $PAGE_SIZE,%ecx + movl %ecx,RELOC(avail_start) + subl %edi,%ecx # size of tables + shrl $2,%ecx + xorl %eax,%eax + cld + rep + stosl + +/* + * fillkpt + * eax = pte (page frame | control | status) + * ebx = page table address + * ecx = number of pages to map + */ +#define fillkpt \ +1: movl %eax,(%ebx) ; \ + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ + addl $4,%ebx ; /* next pte */ \ + loop 1b ; + +/* + * Build initial page tables. + */ + /* Calculate end of text segment, rounded to a page. */ + leal (RELOC(etext)+PGOFSET),%edx + andl $~PGOFSET,%edx + + /* Skip over the first 1MB. */ + movl $KERNTEXTOFF,%eax + movl %eax,%ecx + subl $KERNBASE_LOCORE,%ecx + shrl $PGSHIFT,%ecx + leal (SYSMAP)(%esi,%ecx,4),%ebx + + /* Map the kernel text read-only. 
*/ + movl %edx,%ecx + subl %eax,%ecx + shrl $PGSHIFT,%ecx + orl $(PG_V|PG_KR),%eax + fillkpt + + /* Map the data, BSS, and bootstrap tables read-write. */ + movl RELOC(avail_start),%ecx + # end of tables + subl %edx,%ecx # subtract end of text + shrl $PGSHIFT,%ecx + leal (PG_V|PG_KW)(%edx),%eax + fillkpt + + movl $0xffffffff,(%ebx) + addl $4,%ebx + +/* + * Construct a page table directory. + */ + /* Map kernel PDEs. */ + movl RELOC(nkpde),%ecx # for this many pde s, + leal (PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx # kernel pde offset + leal (SYSMAP+PG_V|PG_KW)(%esi),%eax # pte for KPT in proc 0, + fillkpt + + /* Install a PDE recursively mapping page directory as a page table! */ + leal (PROC0PDIR+PG_V/*|PG_KW*/)(%esi),%eax # pte for ptd + movl %eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi) # recursive PD slot + + /* Save phys. addr of PTD, for libkvm. */ + movl %esi,RELOC(PTDpaddr) + + call xpmap_init + + /* cr0 is 0x8005003b */ + + /* Relocate atdevbase. */ + movl _C_LABEL(avail_start),%edx + movl %edx,_C_LABEL(HYPERVISOR_shared_info) + addl $PAGE_SIZE,%edx # shared_inf + movl %edx,_C_LABEL(atdevbase) + + /* Set up bootstrap stack. */ + leal (PROC0STACK)(%esi),%eax + movl %eax,_C_LABEL(proc0paddr) + leal (USPACE-FRAMESIZE)(%eax),%esp + subl $KERNBASE_LOCORE,%esi + movl %esi,PCB_CR3(%eax) # pcb->pcb_cr3 + xorl %ebp,%ebp # mark end of frames + + movl _C_LABEL(atdevbase),%eax + pushl %eax + call _C_LABEL(init386) # wire 386 chip for unix operation + addl $4,%esp + +#ifdef SAFARI_FIFO_HACK + movb $5,%al + movw $0x37b,%dx + outb %al,%dx + movw $0x37f,%dx + inb %dx,%al + movb %al,%cl + + orb $1,%cl + + movb $5,%al + movw $0x37b,%dx + outb %al,%dx + movw $0x37f,%dx + movb %cl,%al + outb %al,%dx +#endif /* SAFARI_FIFO_HACK */ + + call _C_LABEL(main) + +/* + * void proc_trampoline(void); + * This is a trampoline function pushed onto the stack of a newly created + * process in order to do some additional setup. 
The trampoline is entered by
 * cpu_switch()ing to the process, so we abuse the callee-saved registers used
 * by cpu_switch() to store the information about the stub to call.
 * NOTE: This function does not have a normal calling sequence!
 */
/* LINTSTUB: Func: void proc_trampoline(void) */
NENTRY(proc_trampoline)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(proc_trampoline_mp)
#endif
	movl	$IPL_NONE,CPUVAR(ILEVEL)	# drop to IPL_NONE for the new lwp
	pushl	%ebx				# %ebx = argument for the stub
	call	*%esi				# %esi = stub function to invoke
	addl	$4,%esp
	DO_DEFERRED_SWITCH(%eax)
	INTRFASTEXIT				# return to user via trap frame
	/* NOTREACHED */

/*****************************************************************************/
#ifdef COMPAT_16
/*
 * Signal trampoline; copied to top of user stack.
 */
/* LINTSTUB: Var: char sigcode[1], esigcode[1]; */
NENTRY(sigcode)
	/*
	 * Handler has returned here as if we called it.  The sigcontext
	 * is on the stack after the 3 args "we" pushed.
	 */
	leal	12(%esp),%eax		# get pointer to sigcontext
	movl	%eax,4(%esp)		# put it in the argument slot
					# fake return address already there
	movl	$SYS_compat_16___sigreturn14,%eax
	int	$0x80			# enter kernel with args on stack
	movl	$SYS_exit,%eax
	int	$0x80			# exit if sigreturn fails
	.globl	_C_LABEL(esigcode)
_C_LABEL(esigcode):
#endif

/*****************************************************************************/

/*
 * The following primitives are used to fill and copy regions of memory.
 */

/*
 * XXX No section 9 man page for fillw.
 * fillw seems to be very sparsely used (only in pccons it seems.)
 * One wonders if it couldn't be done without.
 * -- Perry Metzger, May 7, 2001
 */
/*
 * void fillw(short pattern, void *addr, size_t len);
 * Write len copies of pattern at addr.
 */
/* LINTSTUB: Func: void fillw(short pattern, void *addr, size_t len) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax		# pattern (low 16 bits)
	movl	12(%esp),%edi		# destination
	movw	%ax,%cx
	rorl	$16,%eax		# replicate pattern into both
	movw	%cx,%ax			#  halves of %eax
	cld
	movl	16(%esp),%ecx
	shrl	%ecx			# do longwords
	rep
	stosl
	movl	16(%esp),%ecx
	andl	$1,%ecx			# do remainder
	rep
	stosw
	popl	%edi
	ret

/*
 * int kcopy(const void *from, void *to, size_t len);
 * Copy len bytes, abort on fault.
 */
/* LINTSTUB: Func: int kcopy(const void *from, void *to, size_t len) */
ENTRY(kcopy)
	pushl	%esi
	pushl	%edi
	GET_CURPCB(%eax)		# load curpcb into eax and set on-fault
	pushl	PCB_ONFAULT(%eax)	# save previous onfault handler
	movl	$_C_LABEL(kcopy_fault), PCB_ONFAULT(%eax)

	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax		# overlapping?
	jb	1f
	cld				# nope, copy forward
	shrl	$2,%ecx			# copy by 32-bit words
	rep
	movsl
	movl	24(%esp),%ecx
	andl	$3,%ecx			# any bytes left?
	rep
	movsb

	GET_CURPCB(%edx)		# XXX save curpcb?
	popl	PCB_ONFAULT(%edx)	# restore previous onfault handler
	popl	%edi
	popl	%esi
	xorl	%eax,%eax		# return 0 (success)
	ret

	ALIGN_TEXT
1:	addl	%ecx,%edi		# copy backward
	addl	%ecx,%esi
	std
	andl	$3,%ecx			# any fractional bytes?
	decl	%edi
	decl	%esi
	rep
	movsb
	movl	24(%esp),%ecx		# copy remainder by 32-bit words
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	cld				# restore direction flag

	GET_CURPCB(%edx)
	popl	PCB_ONFAULT(%edx)
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	ret

/*****************************************************************************/

/*
 * The following primitives are used to copy data in and out of the user's
 * address space.
 */

/*
 * Default to the lowest-common-denominator.  We will improve it
 * later.
 */
#if defined(I386_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i386_copyout)
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
#elif defined(I486_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
#elif defined(I586_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
#elif defined(I686_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
#endif

	.data

	/* Indirect dispatch slots: patched at runtime to the best
	 * copyin/copyout variant for the detected CPU. */
	.globl	_C_LABEL(copyout_func)
_C_LABEL(copyout_func):
	.long	DEFAULT_COPYOUT

	.globl	_C_LABEL(copyin_func)
_C_LABEL(copyin_func):
	.long	DEFAULT_COPYIN

	.text

/*
 * int copyout(const void *from, void *to, size_t len);
 * Copy len bytes into the user's address space.
 * see copyout(9)
 */
/* LINTSTUB: Func: int copyout(const void *kaddr, void *uaddr, size_t len) */
ENTRY(copyout)
	DO_DEFERRED_SWITCH(%eax)
	jmp	*_C_LABEL(copyout_func)

#if defined(I386_CPU)
/* LINTSTUB: Func: int i386_copyout(const void *kaddr, void *uaddr, size_t len) */
ENTRY(i386_copyout)
	pushl	%esi
	pushl	%edi
	pushl	$0

	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%eax

	/*
	 * We check that the end of the destination buffer is not past the end
	 * of the user's address space.  If it's not, then we only need to
	 * check that each page is writable.  The 486 will do this for us; the
	 * 386 will not.  (We assume that pages in user space that are not
	 * writable by the user are not writable by the kernel either.)
	 */
	movl	%edi,%edx
	addl	%eax,%edx
	jc	_C_LABEL(copy_efault)
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	_C_LABEL(copy_efault)

	testl	%eax,%eax		# anything to do?
	jz	3f

	/*
	 * We have to check each PTE for (write) permission, since the CPU
	 * doesn't do it for us.
	 */

	/* Compute number of pages. */
	movl	%edi,%ecx
	andl	$PGOFSET,%ecx
	addl	%eax,%ecx
	decl	%ecx
	shrl	$PGSHIFT,%ecx

	/* Compute PTE offset for start address. */
	shrl	$PGSHIFT,%edi

	GET_CURPCB(%edx)
	movl	$2f,PCB_ONFAULT(%edx)

1:	/* Check PTE for each page. */
	testb	$PG_RW,_C_LABEL(PTmap)(,%edi,4)
	jz	2f

4:	incl	%edi
	decl	%ecx
	jns	1b

	movl	20(%esp),%edi
	movl	24(%esp),%eax
	jmp	3f

2:	/* Simulate a trap. */
	pushl	%ecx
	movl	%edi,%eax
	shll	$PGSHIFT,%eax
	pushl	%eax
	call	_C_LABEL(trapwrite)	# trapwrite(addr)
	addl	$4,%esp			# pop argument
	popl	%ecx
	testl	%eax,%eax		# if not ok, return EFAULT
	jz	4b
	jmp	_C_LABEL(copy_efault)

3:	GET_CURPCB(%edx)
	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)

	/* bcopy(%esi, %edi, %eax); */
	cld
	movl	%eax,%ecx
	shrl	$2,%ecx
	rep
	movsl
	movl	%eax,%ecx
	andl	$3,%ecx
	rep
	movsb

	popl	PCB_ONFAULT(%edx)
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	ret
#endif /* I386_CPU */

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
/* LINTSTUB: Func: int i486_copyout(const void *kaddr, void *uaddr, size_t len) */
ENTRY(i486_copyout)
	pushl	%esi
	pushl	%edi
	pushl	$0

	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%eax

	/*
	 * We check that the end of the destination buffer is not past the end
	 * of the user's address space.
	 */
	movl	%edi,%edx
	addl	%eax,%edx
	jc	_C_LABEL(copy_efault)
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	_C_LABEL(copy_efault)

	GET_CURPCB(%edx)
	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)

	/* bcopy(%esi, %edi, %eax); */
	cld
	movl	%eax,%ecx
	shrl	$2,%ecx
	rep
	movsl
	movl	%eax,%ecx
	andl	$3,%ecx
	rep
	movsb

	popl	PCB_ONFAULT(%edx)
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	ret
#endif /* I486_CPU || I586_CPU || I686_CPU */

/*
 * int copyin(const void *from, void *to, size_t len);
 * Copy len bytes from the user's address space.
+ * see copyin(9) + */ +/* LINTSTUB: Func: int copyin(const void *uaddr, void *kaddr, size_t len) */ +ENTRY(copyin) + DO_DEFERRED_SWITCH(%eax) + jmp *_C_LABEL(copyin_func) + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I586_CPU) || \ + defined(I686_CPU) +/* LINTSTUB: Func: int i386_copyin(const void *uaddr, void *kaddr, size_t len) */ +ENTRY(i386_copyin) + pushl %esi + pushl %edi + GET_CURPCB(%eax) + pushl $0 + movl $_C_LABEL(copy_fault),PCB_ONFAULT(%eax) + + movl 16(%esp),%esi + movl 20(%esp),%edi + movl 24(%esp),%eax + + /* + * We check that the end of the destination buffer is not past the end + * of the user's address space. If it's not, then we only need to + * check that each page is readable, and the CPU will do that for us. + */ + movl %esi,%edx + addl %eax,%edx + jc _C_LABEL(copy_efault) + cmpl $VM_MAXUSER_ADDRESS,%edx + ja _C_LABEL(copy_efault) + + /* bcopy(%esi, %edi, %eax); */ + cld + movl %eax,%ecx + shrl $2,%ecx + rep + movsl + movl %eax,%ecx + andl $3,%ecx + rep + movsb + + GET_CURPCB(%edx) + popl PCB_ONFAULT(%edx) + popl %edi + popl %esi + xorl %eax,%eax + ret +#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */ + +/* LINTSTUB: Ignore */ +NENTRY(copy_efault) + movl $EFAULT,%eax + +/* + * kcopy_fault is used by kcopy and copy_fault is used by copyin/out. + * + * they're distinguished for lazy pmap switching. see trap(). + */ +/* LINTSTUB: Ignore */ +NENTRY(kcopy_fault) + GET_CURPCB(%edx) + popl PCB_ONFAULT(%edx) + popl %edi + popl %esi + ret + +/* LINTSTUB: Ignore */ +NENTRY(copy_fault) + GET_CURPCB(%edx) + popl PCB_ONFAULT(%edx) + popl %edi + popl %esi + ret + +/* + * int copyoutstr(const void *from, void *to, size_t maxlen, size_t *lencopied); + * Copy a NUL-terminated string, at most maxlen characters long, into the + * user's address space. Return the number of characters copied (including the + * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else + * return 0 or EFAULT. 
+ * see copyoutstr(9) + */ +/* LINTSTUB: Func: int copyoutstr(const void *kaddr, void *uaddr, size_t len, size_t *done) */ +ENTRY(copyoutstr) + pushl %esi + pushl %edi + + DO_DEFERRED_SWITCH(%eax) + + movl 12(%esp),%esi # esi = from + movl 16(%esp),%edi # edi = to + movl 20(%esp),%edx # edx = maxlen + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 5f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + /* Compute number of bytes in first page. */ + movl %edi,%eax + andl $PGOFSET,%eax + movl $PAGE_SIZE,%ecx + subl %eax,%ecx # ecx = PAGE_SIZE - (src % PAGE_SIZE) + + GET_CURPCB(%eax) + movl $6f,PCB_ONFAULT(%eax) + +1: /* + * Once per page, check that we are still within the bounds of user + * space, and check for a write fault. + */ + cmpl $VM_MAXUSER_ADDRESS,%edi + jae _C_LABEL(copystr_efault) + + /* Compute PTE offset. */ + movl %edi,%eax + shrl $PGSHIFT,%eax # calculate pte address + + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 2f + +6: /* Simulate a trap. */ + pushl %edx + pushl %edi + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear argument from stack + popl %edx + testl %eax,%eax + jnz _C_LABEL(copystr_efault) + +2: /* Copy up to end of this page. */ + subl %ecx,%edx # predecrement total count + jnc 3f + addl %edx,%ecx # ecx += (edx - ecx) = edx + xorl %edx,%edx + +3: decl %ecx + js 4f + lodsb + stosb + testb %al,%al + jnz 3b + + /* Success -- 0 byte reached. */ + addl %ecx,%edx # add back residual for this page + xorl %eax,%eax + jmp copystr_return + +4: /* Go to next page, if any. */ + movl $PAGE_SIZE,%ecx + testl %edx,%edx + jnz 1b + + /* edx is zero -- return ENAMETOOLONG. */ + movl $ENAMETOOLONG,%eax + jmp copystr_return +#endif /* I386_CPU */ + +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) +5: GET_CURPCB(%eax) + movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%eax) + /* + * Get min(%edx, VM_MAXUSER_ADDRESS-%edi). 
+ */ + movl $VM_MAXUSER_ADDRESS,%eax + subl %edi,%eax + cmpl %edx,%eax + jae 1f + movl %eax,%edx + movl %eax,20(%esp) + +1: incl %edx + cld + +1: decl %edx + jz 2f + lodsb + stosb + testb %al,%al + jnz 1b + + /* Success -- 0 byte reached. */ + decl %edx + xorl %eax,%eax + jmp copystr_return + +2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */ + cmpl $VM_MAXUSER_ADDRESS,%edi + jae _C_LABEL(copystr_efault) + movl $ENAMETOOLONG,%eax + jmp copystr_return +#endif /* I486_CPU || I586_CPU || I686_CPU */ + +/* + * int copyinstr(const void *from, void *to, size_t maxlen, size_t *lencopied); + * Copy a NUL-terminated string, at most maxlen characters long, from the + * user's address space. Return the number of characters copied (including the + * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else + * return 0 or EFAULT. + * see copyinstr(9) + */ +/* LINTSTUB: Func: int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) */ +ENTRY(copyinstr) + pushl %esi + pushl %edi + + DO_DEFERRED_SWITCH(%eax) + + GET_CURPCB(%ecx) + movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%ecx) + + movl 12(%esp),%esi # %esi = from + movl 16(%esp),%edi # %edi = to + movl 20(%esp),%edx # %edx = maxlen + + /* + * Get min(%edx, VM_MAXUSER_ADDRESS-%esi). + */ + movl $VM_MAXUSER_ADDRESS,%eax + subl %esi,%eax + cmpl %edx,%eax + jae 1f + movl %eax,%edx + movl %eax,20(%esp) + +1: incl %edx + cld + +1: decl %edx + jz 2f + lodsb + stosb + testb %al,%al + jnz 1b + + /* Success -- 0 byte reached. */ + decl %edx + xorl %eax,%eax + jmp copystr_return + +2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */ + cmpl $VM_MAXUSER_ADDRESS,%esi + jae _C_LABEL(copystr_efault) + movl $ENAMETOOLONG,%eax + jmp copystr_return + +/* LINTSTUB: Ignore */ +NENTRY(copystr_efault) + movl $EFAULT,%eax + +/* LINTSTUB: Ignore */ +NENTRY(copystr_fault) +copystr_return: + /* Set *lencopied and return %eax. 
*/ + GET_CURPCB(%ecx) + movl $0,PCB_ONFAULT(%ecx) + movl 20(%esp),%ecx + subl %edx,%ecx + movl 24(%esp),%edx + testl %edx,%edx + jz 8f + movl %ecx,(%edx) + +8: popl %edi + popl %esi + ret + +/* + * int copystr(const void *from, void *to, size_t maxlen, size_t *lencopied); + * Copy a NUL-terminated string, at most maxlen characters long. Return the + * number of characters copied (including the NUL) in *lencopied. If the + * string is too long, return ENAMETOOLONG; else return 0. + * see copystr(9) + */ +/* LINTSTUB: Func: int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done) */ +ENTRY(copystr) + pushl %esi + pushl %edi + + movl 12(%esp),%esi # esi = from + movl 16(%esp),%edi # edi = to + movl 20(%esp),%edx # edx = maxlen + incl %edx + cld + +1: decl %edx + jz 4f + lodsb + stosb + testb %al,%al + jnz 1b + + /* Success -- 0 byte reached. */ + decl %edx + xorl %eax,%eax + jmp 6f + +4: /* edx is zero -- return ENAMETOOLONG. */ + movl $ENAMETOOLONG,%eax + +6: /* Set *lencopied and return %eax. */ + movl 20(%esp),%ecx + subl %edx,%ecx + movl 24(%esp),%edx + testl %edx,%edx + jz 7f + movl %ecx,(%edx) + +7: popl %edi + popl %esi + ret + +/* + * long fuword(const void *uaddr); + * Fetch an int from the user's address space. + * see fuword(9) + */ +/* LINTSTUB: Func: long fuword(const void *base) */ +ENTRY(fuword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-4,%edx + ja _C_LABEL(fusuaddrfault) + GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + movl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * int fusword(const void *uaddr); + * Fetch a short from the user's address space. 
+ * see fusword(9) + */ +/* LINTSTUB: Func: int fusword(const void *base) */ +ENTRY(fusword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + movzwl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * int fuswintr(const void *uaddr); + * Fetch a short from the user's address space. Can be called during an + * interrupt. + * see fuswintr(9) + */ +/* LINTSTUB: Func: int fuswintr(const void *base) */ +ENTRY(fuswintr) + cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE) + jnz _C_LABEL(fusuaddrfault) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + movl CPUVAR(CURLWP),%ecx + movl L_ADDR(%ecx),%ecx + movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx) + movzwl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * int fubyte(const void *uaddr); + * Fetch a byte from the user's address space. + * see fubyte(9) + */ +/* LINTSTUB: Func: int fubyte(const void *base) */ +ENTRY(fubyte) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-1,%edx + ja _C_LABEL(fusuaddrfault) + GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + movzbl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * Handle faults from [fs]u*(). Clean up and return -1. + */ +/* LINTSTUB: Ignore */ +NENTRY(fusufault) + movl $0,PCB_ONFAULT(%ecx) + movl $-1,%eax + ret + +/* + * Handle faults from [fs]u*(). Clean up and return -1. This differs from + * fusufault() in that trap() will recognize it and return immediately rather + * than trying to page fault. + */ +/* LINTSTUB: Ignore */ +NENTRY(fusubail) + movl $0,PCB_ONFAULT(%ecx) + movl $-1,%eax + ret + +/* + * Handle earlier faults from [fs]u*(), due to our of range addresses. + */ +/* LINTSTUB: Ignore */ +NENTRY(fusuaddrfault) + movl $-1,%eax + ret + +/* + * int suword(void *uaddr, long x); + * Store an int in the user's address space. 
+ * see suword(9) + */ +/* LINTSTUB: Func: int suword(void *base, long c) */ +ENTRY(suword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-4,%edx + ja _C_LABEL(fusuaddrfault) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + GET_CURPCB(%eax) + movl $3f,PCB_ONFAULT(%eax) + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + +3: /* Simulate a trap. */ + pushl %edx + pushl %edx + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear parameter from the stack + popl %edx + GET_CURPCB(%ecx) + testl %eax,%eax + jnz _C_LABEL(fusufault) + +1: /* XXX also need to check the following 3 bytes for validity! */ +#endif + +2: GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + + movl 8(%esp),%eax + movl %eax,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * int susword(void *uaddr, short x); + * Store a short in the user's address space. + * see susword(9) + */ +/* LINTSTUB: Func: int susword(void *base, short c) */ +ENTRY(susword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + GET_CURPCB(%eax) + movl $3f,PCB_ONFAULT(%eax) + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + +3: /* Simulate a trap. */ + pushl %edx + pushl %edx + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear parameter from the stack + popl %edx + GET_CURPCB(%ecx) + testl %eax,%eax + jnz _C_LABEL(fusufault) + +1: /* XXX also need to check the following byte for validity! 
*/ +#endif + +2: GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + + movl 8(%esp),%eax + movw %ax,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * int suswintr(void *uaddr, short x); + * Store a short in the user's address space. Can be called during an + * interrupt. + * see suswintr(9) + */ +/* LINTSTUB: Func: int suswintr(void *base, short c) */ +ENTRY(suswintr) + cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE) + jnz _C_LABEL(fusuaddrfault) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + movl CPUVAR(CURLWP),%ecx + movl L_ADDR(%ecx),%ecx + movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + + /* Simulate a trap. */ + jmp _C_LABEL(fusubail) + +1: /* XXX also need to check the following byte for validity! */ +#endif + +2: movl 8(%esp),%eax + movw %ax,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * int subyte(void *uaddr, char x); + * Store a byte in the user's address space. + * see subyte(9) + */ +/* LINTSTUB: Func: int subyte(void *base, int c) */ +ENTRY(subyte) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-1,%edx + ja _C_LABEL(fusuaddrfault) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + GET_CURPCB(%eax) + movl $3f,PCB_ONFAULT(%eax) + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + +3: /* Simulate a trap. 
*/ + pushl %edx + pushl %edx + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear parameter from the stack + popl %edx + GET_CURPCB(%ecx) + testl %eax,%eax + jnz _C_LABEL(fusufault) + +1: +#endif + +2: GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + + movb 8(%esp),%al + movb %al,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/*****************************************************************************/ + +/* + * The following is i386-specific nonsense. + */ + +/* + * void lgdt_finish(void); + * Finish load a new GDT pointer (do any necessary cleanup). + * XXX It's somewhat questionable whether reloading all the segment registers + * is necessary, since the actual descriptor data is not changed except by + * process creation and exit, both of which clean up via task switches. OTOH, + * this only happens at run time when the GDT is resized. + */ +/* LINTSTUB: Func: void lgdt_finish(void) */ +NENTRY(lgdt_finish) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%ds + movw %ax,%es + movw %ax,%gs + movw %ax,%ss + movl $GSEL(GCPU_SEL, SEL_KPL),%eax + movw %ax,%fs + /* Reload code selector by doing intersegment return. */ + popl %eax + pushl $GSEL(GCODE_SEL, SEL_KPL) + pushl %eax + lret + +/*****************************************************************************/ + +/* + * These functions are primarily used by DDB. 
+ */ + +/* LINTSTUB: Func: int setjmp (label_t *l) */ +ENTRY(setjmp) + movl 4(%esp),%eax + movl %ebx,(%eax) # save ebx + movl %esp,4(%eax) # save esp + movl %ebp,8(%eax) # save ebp + movl %esi,12(%eax) # save esi + movl %edi,16(%eax) # save edi + movl (%esp),%edx # get rta + movl %edx,20(%eax) # save eip + xorl %eax,%eax # return (0); + ret + +/* LINTSTUB: Func: void longjmp (label_t *l) */ +ENTRY(longjmp) + movl 4(%esp),%eax + movl (%eax),%ebx # restore ebx + movl 4(%eax),%esp # restore esp + movl 8(%eax),%ebp # restore ebp + movl 12(%eax),%esi # restore esi + movl 16(%eax),%edi # restore edi + movl 20(%eax),%edx # get rta + movl %edx,(%esp) # put in return frame + xorl %eax,%eax # return (1); + incl %eax + ret + +/*****************************************************************************/ + + .globl _C_LABEL(sched_whichqs),_C_LABEL(sched_qs) + .globl _C_LABEL(uvmexp),_C_LABEL(panic) + +#ifdef DIAGNOSTIC +NENTRY(switch_error) + pushl $1f +3: call _C_LABEL(panic) + /* NOTREACHED */ +1: .asciz "cpu_switch" +#endif /* DIAGNOSTIC */ + +/* + * void cpu_switch(struct lwp *) + * Find a runnable process and switch to it. Wait if necessary. If the new + * process is the same as the old one, we short-circuit the context save and + * restore. + * + * Note that the stack frame layout is known to "struct switchframe" + * in <machine/frame.h> and to the code in cpu_fork() which initializes + * it for a new lwp. + */ +ENTRY(cpu_switch) + pushl %ebx + pushl %esi + pushl %edi + +#ifdef DEBUG + cmpl $IPL_SCHED,CPUVAR(ILEVEL) + jae 1f + pushl $2f + call _C_LABEL(panic) + /* NOTREACHED */ +2: .asciz "not splsched() in cpu_switch!" +1: +#endif /* DEBUG */ + + movl 16(%esp),%esi # current + + /* + * Clear curlwp so that we don't accumulate system time while idle. + * This also insures that schedcpu() will move the old lwp to + * the correct queue if it happens to get called from the spllower() + * below and changes the priority. (See corresponding comment in + * userret()). 
+ */ + movl $0,CPUVAR(CURLWP) + /* + * First phase: find new lwp. + * + * Registers: + * %eax - queue head, scratch, then zero + * %ebx - queue number + * %ecx - cached value of whichqs + * %edx - next lwp in queue + * %esi - old lwp + * %edi - new lwp + */ + + /* Look for new lwp. */ + CLI(%ecx) # splhigh doesn't do a cli + movl _C_LABEL(sched_whichqs),%ecx + bsfl %ecx,%ebx # find a full q + jnz switch_dequeue + + /* + * idling: save old context. + * + * Registers: + * %eax, %ecx - scratch + * %esi - old lwp, then old pcb + * %edi - idle pcb + */ + + pushl %esi + call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc) + addl $4,%esp + + movl L_ADDR(%esi),%esi + + /* Save stack pointers. */ + movl %esp,PCB_ESP(%esi) + movl %ebp,PCB_EBP(%esi) + + /* Find idle PCB for this CPU */ +#ifndef MULTIPROCESSOR + movl $_C_LABEL(lwp0),%ebx + movl L_ADDR(%ebx),%edi + movl L_MD_TSS_SEL(%ebx),%edx +#else + movl CPUVAR(IDLE_PCB),%edi + movl CPUVAR(IDLE_TSS_SEL),%edx +#endif + movl $0,CPUVAR(CURLWP) /* In case we fault... */ + + /* Restore the idle context (avoid interrupts) */ + CLI(%ecx) + + /* Restore stack pointers. */ + movl PCB_ESP(%edi),%esp + movl PCB_EBP(%edi),%ebp + + pushl %edi + call _C_LABEL(i386_switch_context) + addl $4,%esp + + /* Record new pcb. */ + SET_CURPCB(%edi) + + xorl %esi,%esi + STI(%eax) +idle_unlock: +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) + call _C_LABEL(sched_unlock_idle) +#endif + /* Interrupts are okay again. */ + pushl $IPL_NONE # spl0() + call _C_LABEL(Xspllower) # process pending interrupts + addl $4,%esp + jmp idle_start +idle_zero: + STIC(%eax) + jz 4f + call _C_LABEL(stipending) + testl %eax,%eax + jz 4f + pushl $IPL_NONE + call _C_LABEL(Xspllower) + addl $4,%esp +4: + call _C_LABEL(uvm_pageidlezero) + CLI(%eax) + cmpl $0,_C_LABEL(sched_whichqs) + jnz idle_exit +idle_loop: + /* Try to zero some pages. 
*/ + movl _C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx + testl %ecx,%ecx + jnz idle_zero + STIC(%eax) + jz 4f + call _C_LABEL(stipending) + testl %eax,%eax + jz 4f + pushl $IPL_NONE + call _C_LABEL(Xspllower) + addl $4,%esp + jmp idle_start +4: + movl $__HYPERVISOR_yield,%eax + TRAP_INSTR +NENTRY(mpidle) +idle_start: + CLI(%eax) + cmpl $0,_C_LABEL(sched_whichqs) + jz idle_loop +idle_exit: + movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh + STI(%eax) +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) + call _C_LABEL(sched_lock_idle) +#endif + movl _C_LABEL(sched_whichqs),%ecx + bsfl %ecx,%ebx + jz idle_unlock + +#ifdef XENDEBUG_LOW + pushl %ecx + call _C_LABEL(xen_dbg1) + xorl %ecx,%ecx + movl %ecx,_C_LABEL(xen_once) + popl %ecx +#endif +switch_dequeue: + /* + * we're running at splhigh(), but it's otherwise okay to take + * interrupts here. + */ + STI(%edi) + leal _C_LABEL(sched_qs)(,%ebx,8),%eax # select q + + movl L_FORW(%eax),%edi # unlink from front of process q +#ifdef DIAGNOSTIC + cmpl %edi,%eax # linked to self (i.e. nothing queued)? + je _C_LABEL(switch_error) # not possible +#endif /* DIAGNOSTIC */ + movl L_FORW(%edi),%edx + movl %edx,L_FORW(%eax) + movl %eax,L_BACK(%edx) + + cmpl %edx,%eax # q empty? + jne 3f + + btrl %ebx,%ecx # yes, clear to indicate empty + movl %ecx,_C_LABEL(sched_whichqs) # update q status + +3: /* We just did it. */ + xorl %eax,%eax + CLEAR_RESCHED(%eax) + +switch_resume: +#ifdef DIAGNOSTIC + cmpl %eax,L_WCHAN(%edi) # Waiting for something? + jne _C_LABEL(switch_error) # Yes; shouldn't be queued. + cmpb $LSRUN,L_STAT(%edi) # In run state? + jne _C_LABEL(switch_error) # No; shouldn't be queued. +#endif /* DIAGNOSTIC */ + + /* Isolate lwp. XXX Is this necessary? */ + movl %eax,L_BACK(%edi) + + /* Record new lwp. */ + movb $LSONPROC,L_STAT(%edi) # l->l_stat = LSONPROC + SET_CURLWP(%edi,%ecx) + + /* Skip context switch if same lwp. */ + xorl %ebx,%ebx + cmpl %edi,%esi + je switch_return + + /* If old lwp exited, don't bother. 
*/ + testl %esi,%esi + jz switch_exited + + /* + * Second phase: save old context. + * + * Registers: + * %eax, %ecx - scratch + * %esi - old lwp, then old pcb + * %edi - new lwp + */ + + pushl %esi + call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc) + addl $4,%esp + + movl L_ADDR(%esi),%esi + + /* Save stack pointers. */ + movl %esp,PCB_ESP(%esi) + movl %ebp,PCB_EBP(%esi) + +switch_exited: + /* + * Third phase: restore saved context. + * + * Registers: + * %eax, %ebx, %ecx, %edx - scratch + * %esi - new pcb + * %edi - new lwp + */ + + /* No interrupts while loading new state. */ + CLI(%eax) + movl L_ADDR(%edi),%esi + + /* Restore stack pointers. */ + movl PCB_ESP(%esi),%esp + movl PCB_EBP(%esi),%ebp + +#if 0 + /* Don't bother with the rest if switching to a system process. */ + testl $P_SYSTEM,L_FLAG(%edi); XXX NJWLWP lwp's don't have P_SYSTEM! + jnz switch_restored ; XXX skip stack_switch+pmap_activate +#endif + + pushl %edi + call _C_LABEL(pmap_activate) # pmap_activate(p) + addl $4,%esp + + pushl %esi + call _C_LABEL(i386_switch_context) + addl $4,%esp + + /* Record new pcb. */ + SET_CURPCB(%esi) + + /* Interrupts are okay again. */ + STI(%edi) + +/* + * Check for restartable atomic sequences (RAS) + */ + movl CPUVAR(CURLWP),%edi + movl L_PROC(%edi),%esi + cmpl $0,P_RASLIST(%esi) + jne 2f +1: + movl $1,%ebx + +switch_return: +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) + call _C_LABEL(sched_unlock_idle) +#endif + pushl $IPL_NONE # spl0() + call _C_LABEL(Xspllower) # process pending interrupts + addl $4,%esp + movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh() + + movl %ebx,%eax + + popl %edi + popl %esi + popl %ebx + ret + +2: # check RAS list + movl L_MD_REGS(%edi),%ebx + movl TF_EIP(%ebx),%eax + pushl %eax + pushl %esi + call _C_LABEL(ras_lookup) + addl $8,%esp + cmpl $-1,%eax + je 1b + movl %eax,TF_EIP(%ebx) + jmp 1b + +/* + * void cpu_switchto(struct lwp *current, struct lwp *next) + * Switch to the specified next LWP. 
+ */ +ENTRY(cpu_switchto) + pushl %ebx + pushl %esi + pushl %edi + +#ifdef DEBUG + cmpl $IPL_SCHED,CPUVAR(ILEVEL) + jae 1f + pushl $2f + call _C_LABEL(panic) + /* NOTREACHED */ +2: .asciz "not splsched() in cpu_switchto!" +1: +#endif /* DEBUG */ + + movl 16(%esp),%esi # current + movl 20(%esp),%edi # next + + /* + * Clear curlwp so that we don't accumulate system time while idle. + * This also insures that schedcpu() will move the old process to + * the correct queue if it happens to get called from the spllower() + * below and changes the priority. (See corresponding comment in + * usrret()). + * + * XXX Is this necessary? We know we won't go idle. + */ + movl $0,CPUVAR(CURLWP) + + /* + * We're running at splhigh(), but it's otherwise okay to take + * interrupts here. + */ + STI(%eax) + + /* Jump into the middle of cpu_switch */ + xorl %eax,%eax + jmp switch_resume + +/* + * void cpu_exit(struct lwp *l) + * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's + * if multiprocessor) and deallocate the address space and kernel stack for p. + * Then jump into cpu_switch(), as if we were in the idle proc all along. + */ +#ifndef MULTIPROCESSOR + .globl _C_LABEL(lwp0) +#endif + .globl _C_LABEL(uvmspace_free),_C_LABEL(kernel_map) + .globl _C_LABEL(uvm_km_free),_C_LABEL(tss_free) +/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */ +ENTRY(cpu_exit) + movl 4(%esp),%edi # old process +#ifndef MULTIPROCESSOR + movl $_C_LABEL(lwp0),%ebx + movl L_ADDR(%ebx),%esi + movl L_MD_TSS_SEL(%ebx),%edx +#else + movl CPUVAR(IDLE_PCB),%esi + movl CPUVAR(IDLE_TSS_SEL),%edx +#endif + /* In case we fault... */ + movl $0,CPUVAR(CURLWP) + + /* Restore the idle context. */ + CLI(%eax) + + /* Restore stack pointers. */ + movl PCB_ESP(%esi),%esp + movl PCB_EBP(%esi),%ebp + + pushl %esi + call _C_LABEL(i386_switch_context) + addl $4,%esp + + /* Record new pcb. */ + SET_CURPCB(%esi) + + /* Interrupts are okay again. 
*/ + STI(%eax) + + /* + * Schedule the dead LWP's stack to be freed. + */ + pushl %edi + call _C_LABEL(lwp_exit2) + addl $4,%esp + + /* Jump into cpu_switch() with the right state. */ + xorl %esi,%esi + movl %esi,CPUVAR(CURLWP) + jmp idle_start + +/* + * void savectx(struct pcb *pcb); + * Update pcb, saving current processor state. + */ +/* LINTSTUB: Func: void savectx(struct pcb *pcb) */ +ENTRY(savectx) + movl 4(%esp),%edx # edx = p->p_addr + + /* Save stack pointers. */ + movl %esp,PCB_ESP(%edx) + movl %ebp,PCB_EBP(%edx) + + ret + +/* + * Old call gate entry for syscall + */ +/* LINTSTUB: Var: char Xosyscall[1]; */ +IDTVEC(osyscall) + /* Set eflags in trap frame. */ + pushfl + popl 8(%esp) + pushl $7 # size of instruction for restart + jmp syscall1 + +/* + * Trap gate entry for syscall + */ +/* LINTSTUB: Var: char Xsyscall[1]; */ +IDTVEC(syscall) + pushl $2 # size of instruction for restart +syscall1: + pushl $T_ASTFLT # trap # for doing ASTs + INTRENTRY + +#ifdef DIAGNOSTIC + cmpl $0, CPUVAR(WANT_PMAPLOAD) + jz 1f + pushl $6f + call _C_LABEL(printf) + addl $4, %esp +1: + movl CPUVAR(ILEVEL),%ebx + testl %ebx,%ebx + jz 1f + pushl $5f + call _C_LABEL(printf) + addl $4,%esp +#ifdef DDB + int $3 +#endif +1: +#endif /* DIAGNOSTIC */ + movl CPUVAR(CURLWP),%edx + movl %esp,L_MD_REGS(%edx) # save pointer to frame + movl L_PROC(%edx),%edx + pushl %esp + call *P_MD_SYSCALL(%edx) # get pointer to syscall() function + addl $4,%esp +syscall_checkast: + /* Check for ASTs on exit to user mode. */ + CLI(%eax) + CHECK_ASTPENDING(%eax) + je 1f + /* Always returning to user mode here. */ + CLEAR_ASTPENDING(%eax) + STI(%eax) + /* Pushed T_ASTFLT into tf_trapno on entry. 
*/ + pushl %esp + call _C_LABEL(trap) + addl $4,%esp + jmp syscall_checkast +1: STI(%eax) + CHECK_DEFERRED_SWITCH(%eax) + jnz 9f +#ifndef DIAGNOSTIC + INTRFASTEXIT +#else /* DIAGNOSTIC */ + cmpl $IPL_NONE,CPUVAR(ILEVEL) + jne 3f + INTRFASTEXIT +3: pushl $4f + call _C_LABEL(printf) + addl $4,%esp +#ifdef DDB + int $3 +#endif /* DDB */ + movl $IPL_NONE,CPUVAR(ILEVEL) + jmp 2b +4: .asciz "WARNING: SPL NOT LOWERED ON SYSCALL EXIT\n" +5: .asciz "WARNING: SPL NOT ZERO ON SYSCALL ENTRY\n" +6: .asciz "WARNING: WANT PMAPLOAD ON SYSCALL ENTRY\n" +#endif /* DIAGNOSTIC */ +9: call _C_LABEL(pmap_load) + jmp syscall_checkast /* re-check ASTs */ + +#if NNPX > 0 +/* + * Special interrupt handlers. Someday intr0-intr15 will be used to count + * interrupts. We'll still need a special exception 16 handler. The busy + * latch stuff in probintr() can be moved to npxprobe(). + */ + +/* LINTSTUB: Func: void probeintr(void) */ +NENTRY(probeintr) + ss + incl _C_LABEL(npx_intrs_while_probing) + pushl %eax + movb $0x20,%al # EOI (asm in strings loses cpp features) + outb %al,$0xa0 # IO_ICU2 + outb %al,$0x20 # IO_ICU1 + movb $0,%al + outb %al,$0xf0 # clear BUSY# latch + popl %eax + iret + +/* LINTSTUB: Func: void probetrap(void) */ +NENTRY(probetrap) + ss + incl _C_LABEL(npx_traps_while_probing) + fnclex + iret + +/* LINTSTUB: Func: int npx586bug1(int a, int b) */ +NENTRY(npx586bug1) + fildl 4(%esp) # x + fildl 8(%esp) # y + fld %st(1) + fdiv %st(1),%st # x/y + fmulp %st,%st(1) # (x/y)*y + fsubrp %st,%st(1) # x-(x/y)*y + pushl $0 + fistpl (%esp) + popl %eax + ret +#endif /* NNPX > 0 */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c new file mode 100644 index 0000000000..61d2898096 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c @@ -0,0 +1,2561 @@ +/* $NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $ */ +/* NetBSD: machdep.c,v 1.552 2004/03/24 15:34:49 atatat Exp */ + +/*- + * Copyright (c) 1996, 
1997, 1998, 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace + * Simulation Facility, NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)machdep.c 7.4 (Berkeley) 6/3/91 + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $"); + +#include "opt_beep.h" +#include "opt_compat_ibcs2.h" +#include "opt_compat_mach.h" /* need to get the right segment def */ +#include "opt_compat_netbsd.h" +#include "opt_compat_svr4.h" +#include "opt_cpureset_delay.h" +#include "opt_cputype.h" +#include "opt_ddb.h" +#include "opt_ipkdb.h" +#include "opt_kgdb.h" +#include "opt_mtrr.h" +#include "opt_multiprocessor.h" +#include "opt_realmem.h" +#include "opt_user_ldt.h" +#include "opt_vm86.h" +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/signal.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/user.h> +#include <sys/exec.h> +#include <sys/buf.h> +#include <sys/reboot.h> +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/msgbuf.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/extent.h> +#include <sys/syscallargs.h> +#include <sys/core.h> +#include <sys/kcore.h> +#include <sys/ucontext.h> +#include <machine/kcore.h> +#include <sys/ras.h> +#include <sys/sa.h> +#include <sys/savar.h> +#include <sys/ksyms.h> + +#ifdef IPKDB +#include <ipkdb/ipkdb.h> +#endif + +#ifdef KGDB +#include <sys/kgdb.h> +#endif + +#include <dev/cons.h> + +#include <uvm/uvm_extern.h> +#include <uvm/uvm_page.h> + 
+#include <sys/sysctl.h> + +#include <machine/cpu.h> +#include <machine/cpufunc.h> +#include <machine/cpuvar.h> +#include <machine/gdt.h> +#include <machine/pio.h> +#include <machine/psl.h> +#include <machine/reg.h> +#include <machine/specialreg.h> +#include <machine/bootinfo.h> +#include <machine/mtrr.h> +#include <machine/evtchn.h> + +#include <dev/isa/isareg.h> +#include <machine/isa_machdep.h> +#include <dev/ic/i8042reg.h> + +#ifdef DDB +#include <machine/db_machdep.h> +#include <ddb/db_extern.h> +#endif + +#ifdef VM86 +#include <machine/vm86.h> +#endif + +#include "acpi.h" +#include "apm.h" +#include "bioscall.h" + +#if NBIOSCALL > 0 +#include <machine/bioscall.h> +#endif + +#if NACPI > 0 +#include <dev/acpi/acpivar.h> +#define ACPI_MACHDEP_PRIVATE +#include <machine/acpi_machdep.h> +#endif + +#if NAPM > 0 +#include <machine/apmvar.h> +#endif + +#include "isa.h" +#include "isadma.h" +#include "npx.h" +#include "ksyms.h" + +#include "mca.h" +#if NMCA > 0 +#include <machine/mca_machdep.h> /* for mca_busprobe() */ +#endif + +#ifdef MULTIPROCESSOR /* XXX */ +#include <machine/mpbiosvar.h> /* XXX */ +#endif /* XXX */ + +#include <machine/xen.h> +#include <machine/hypervisor.h> + +#if defined(DDB) || defined(KGDB) +#include <ddb/db_interface.h> +#include <ddb/db_output.h> + +void ddb_trap_hook(int); +#endif + +/* #define XENDEBUG */ +/* #define XENDEBUG_LOW */ + +#ifdef XENDEBUG +extern void printk(char *, ...); +#define XENPRINTF(x) printf x +#define XENPRINTK(x) printk x +#else +#define XENPRINTF(x) +#define XENPRINTK(x) +#endif +#define PRINTK(x) printf x + +#ifdef XENDEBUG_LOW +void xen_dbglow_init(void); +#endif + +#ifndef BEEP_ONHALT_COUNT +#define BEEP_ONHALT_COUNT 3 +#endif +#ifndef BEEP_ONHALT_PITCH +#define BEEP_ONHALT_PITCH 1500 +#endif +#ifndef BEEP_ONHALT_PERIOD +#define BEEP_ONHALT_PERIOD 250 +#endif + +/* the following is used externally (sysctl_hw) */ +char machine[] = "i386"; /* CPU "architecture" */ +char machine_arch[] = "i386"; /* machine == 
machine_arch */ + +char bootinfo[BOOTINFO_MAXSIZE]; + +struct bi_devmatch *i386_alldisks = NULL; +int i386_ndisks = 0; + +#ifdef CPURESET_DELAY +int cpureset_delay = CPURESET_DELAY; +#else +int cpureset_delay = 2000; /* default to 2s */ +#endif + +#ifdef MTRR +struct mtrr_funcs *mtrr_funcs; +#endif + +#ifdef COMPAT_NOMID +static int exec_nomid(struct proc *, struct exec_package *); +#endif + +int physmem; +int dumpmem_low; +int dumpmem_high; +unsigned int cpu_feature; +int cpu_class; +int i386_fpu_present; +int i386_fpu_exception; +int i386_fpu_fdivbug; + +int i386_use_fxsave; +int i386_has_sse; +int i386_has_sse2; + +int tmx86_has_longrun; + +vaddr_t msgbuf_vaddr; +paddr_t msgbuf_paddr; + +vaddr_t idt_vaddr; +paddr_t idt_paddr; + +#ifdef I586_CPU +vaddr_t pentium_idt_vaddr; +#endif + +struct vm_map *exec_map = NULL; +struct vm_map *mb_map = NULL; +struct vm_map *phys_map = NULL; + +extern paddr_t avail_start, avail_end; +extern paddr_t pmap_pa_start, pmap_pa_end; + +#ifdef ISA_CLOCK +void (*delay_func)(int) = i8254_delay; +void (*microtime_func)(struct timeval *) = i8254_microtime; +void (*initclock_func)(void) = i8254_initclocks; +#else +void (*delay_func)(int) = xen_delay; +void (*microtime_func)(struct timeval *) = xen_microtime; +void (*initclock_func)(void) = xen_initclocks; +#endif + +void hypervisor_callback(void); +void failsafe_callback(void); + +/* + * Size of memory segments, before any memory is stolen. 
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];	/* physical RAM segments */
int mem_cluster_cnt;				/* number of valid entries above */

int cpu_dump(void);
int cpu_dumpsize(void);
u_long cpu_dump_mempagecnt(void);
void dumpsys(void);
void init386(paddr_t);
void initgdt(void);

#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
void add_mem_cluster(u_int64_t, u_int64_t, u_int32_t);
#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */

extern int time_adjusted;

/*
 * Machine-dependent startup code: map the kernel message buffer,
 * print the memory configuration, and carve the exec/physio/mbuf
 * submaps out of kernel_map.
 */
void
cpu_startup()
{
	int x;
	vaddr_t minaddr, maxaddr;
	char pbuf[9];

	/*
	 * Initialize error message buffer (at end of core).
	 */
	msgbuf_vaddr = uvm_km_valloc(kernel_map, x86_round_page(MSGBUFSIZE));
	if (msgbuf_vaddr == 0)
		panic("failed to valloc msgbuf_vaddr");

	/* msgbuf_paddr was init'd in pmap; map it page by page */
	for (x = 0; x < btoc(MSGBUFSIZE); x++)
		pmap_kenter_pa((vaddr_t)msgbuf_vaddr + x * PAGE_SIZE,
		    msgbuf_paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE);
	pmap_update(pmap_kernel());

	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));

	printf("%s", version);

#ifdef TRAPLOG
	/*
	 * Enable recording of branch from/to in MSR's
	 */
	wrmsr(MSR_DEBUGCTLMSR, 0x1);
#endif

	format_bytes(pbuf, sizeof(pbuf), ptoa(physmem));
	printf("total memory = %s\n", pbuf);

	minaddr = 0;

	/*
	 * Allocate a submap for exec arguments.  This map effectively
	 * limits the number of processes exec'ing at any time.
	 */
	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);

	/*
	 * Allocate a submap for physio
	 */
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    VM_PHYS_SIZE, 0, FALSE, NULL);

	/*
	 * Finally, allocate mbuf cluster submap.
	 */
	mb_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    nmbclusters * mclbytes, VM_MAP_INTRSAFE, FALSE, NULL);

	format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free));
	printf("avail memory = %s\n", pbuf);

	/* Safe for i/o port / memory space allocation to use malloc now. */
	x86_bus_space_mallocok();
}

/*
 * Set up proc0's TSS and LDT.  On Xen the TSS is not loaded with ltr;
 * instead the kernel stack is registered with the hypervisor via
 * HYPERVISOR_stack_switch.
 */
void
i386_proc0_tss_ldt_init()
{
	struct pcb *pcb;
	int x;

	gdt_init();

	cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb;

	pcb->pcb_tss.tss_ioopt =
	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
	    | SEL_KPL;		/* i/o pl */

	/* deny all I/O ports: every bit set in the bitmap */
	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
		pcb->pcb_iomap[x] = 0xffffffff;

	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
	pcb->pcb_cr0 = rcr0();
	pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	pcb->pcb_tss.tss_esp0 = (int)lwp0.l_addr + USPACE - 16;
	/* trapframe sits just below the kernel stack top */
	lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
	lwp0.l_md.md_tss_sel = tss_alloc(pcb);

#ifndef XEN
	ltr(lwp0.l_md.md_tss_sel);
	lldt(pcb->pcb_ldt_sel);
#else
	HYPERVISOR_fpu_taskswitch();
	XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
	    (void *)pcb->pcb_tss.tss_esp0,
	    pcb->pcb_tss.tss_ss0, IDXSEL(pcb->pcb_tss.tss_ss0)));
	HYPERVISOR_stack_switch(pcb->pcb_tss.tss_ss0, pcb->pcb_tss.tss_esp0);
#endif
}

/*
 * Set up TSS and LDT for a new PCB (per-CPU idle pcb).
 */

void
i386_init_pcb_tss_ldt(struct cpu_info *ci)
{
	int x;
	struct pcb *pcb = ci->ci_idle_pcb;

	pcb->pcb_tss.tss_ioopt =
	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
	    | SEL_KPL;		/* i/o pl */
	/* deny all I/O ports */
	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
		pcb->pcb_iomap[x] = 0xffffffff;

	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
	pcb->pcb_cr0 = rcr0();

	ci->ci_idle_tss_sel = tss_alloc(pcb);
}

/*
 * Switch context:
 * - honor CR0_TS in saved CR0 and request DNA exception on FPU use
 * - switch stack pointer for user->kernel transition
 * On Xen this is done with hypercalls rather than by reloading TR;
 * a privileged (dom0) guest also re-requests its I/O privilege level.
 */
void
i386_switch_context(struct pcb *new)
{
	dom0_op_t op;
	struct cpu_info *ci;

	ci = curcpu();
	if (ci->ci_fpused) {
		/* ask hypervisor to raise DNA on next FPU use */
		HYPERVISOR_fpu_taskswitch();
		ci->ci_fpused = 0;
	}

	HYPERVISOR_stack_switch(new->pcb_tss.tss_ss0, new->pcb_tss.tss_esp0);

	if (xen_start_info.flags & SIF_PRIVILEGED) {
		op.cmd = DOM0_IOPL;
		op.u.iopl.domain = DOMID_SELF;
		op.u.iopl.iopl = new->pcb_tss.tss_ioopt & SEL_RPL;	/* i/o pl */
		HYPERVISOR_dom0_op(&op);
	}
}

/*
 * sysctl helper routine for machdep.tm* nodes.
 */
static int
sysctl_machdep_tm_longrun(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int io, error;

	/* Transmeta Crusoe LongRun only; reject on other CPUs */
	if (!tmx86_has_longrun)
		return (EOPNOTSUPP);

	node = *rnode;
	node.sysctl_data = &io;

	/* fetch the current value for the node being queried */
	switch (rnode->sysctl_num) {
	case CPU_TMLR_MODE:
		io = (int)(crusoe_longrun = tmx86_get_longrun_mode());
		break;
	case CPU_TMLR_FREQUENCY:
		tmx86_get_longrun_status_all();
		io = crusoe_frequency;
		break;
	case CPU_TMLR_VOLTAGE:
		tmx86_get_longrun_status_all();
		io = crusoe_voltage;
		break;
	case CPU_TMLR_PERCENTAGE:
		tmx86_get_longrun_status_all();
		io = crusoe_percentage;
		break;
	default:
		return (EOPNOTSUPP);
	}

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	/* only the mode node is writable */
	if (rnode->sysctl_num == CPU_TMLR_MODE) {
		if (tmx86_set_longrun_mode(io))
			crusoe_longrun = (u_int)io;
		else
			return (EINVAL);
	}

	return (0);
}

/*
 * sysctl helper routine for machdep.booted_kernel
 */
static int
sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
{
	struct btinfo_bootpath *bibp;
	struct sysctlnode node;

	/* bootloader may not have passed a boot path */
	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
	if(!bibp)
		return(ENOENT); /* ??? */

	node = *rnode;
	node.sysctl_data = bibp->bootpath;
	node.sysctl_size = sizeof(bibp->bootpath);
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}

/*
 * sysctl helper routine for machdep.diskinfo
 */
static int
sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
{
	struct sysctlnode node;

	node = *rnode;
	node.sysctl_data = i386_alldisks;
	/* struct disklist has one nativedisk_info built in; add the rest */
	node.sysctl_size = sizeof(struct disklist) +
	    (i386_ndisks - 1) * sizeof(struct nativedisk_info);
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}

/*
 * machine dependent system variables.
 */
SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "machdep", NULL,
		       NULL, 0, NULL, 0,
		       CTL_MACHDEP, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "console_device", NULL,
		       sysctl_consdev, 0, NULL, sizeof(dev_t),
		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "biosbasemem", NULL,
		       NULL, 0, &biosbasemem, 0,
		       CTL_MACHDEP, CPU_BIOSBASEMEM, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "biosextmem", NULL,
		       NULL, 0, &biosextmem, 0,
		       CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "nkpde", NULL,
		       NULL, 0, &nkpde, 0,
		       CTL_MACHDEP, CPU_NKPDE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "booted_kernel", NULL,
		       sysctl_machdep_booted_kernel, 0, NULL, 0,
		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "diskinfo", NULL,
		       sysctl_machdep_diskinfo, 0, NULL, 0,
		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "fpu_present", NULL,
		       NULL, 0, &i386_fpu_present, 0,
		       CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "osfxsr", NULL,
		       NULL, 0, &i386_use_fxsave, 0,
		       CTL_MACHDEP, CPU_OSFXSR, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "sse", NULL,
		       NULL, 0, &i386_has_sse, 0,
		       CTL_MACHDEP, CPU_SSE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "sse2", NULL,
		       NULL, 0, &i386_has_sse2, 0,
		       CTL_MACHDEP, CPU_SSE2, CTL_EOL);
	/* the four tm_longrun_* nodes share one helper; only mode is RW */
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "tm_longrun_mode", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_MODE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "tm_longrun_frequency", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_FREQUENCY, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "tm_longrun_voltage", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_VOLTAGE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "tm_longrun_percentage", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_PERCENTAGE, CTL_EOL);
}

/*
 * Compute the user stack address a signal frame should be built at,
 * honoring any registered sigaltstack.  Sets *onstack when the frame
 * goes onto the alternate signal stack.
 */
void *
getframe(struct lwp *l, int sig, int *onstack)
{
	struct proc *p = l->l_proc;
	struct sigctx *ctx = &p->p_sigctx;
	struct trapframe *tf = l->l_md.md_regs;

	/* Do we need to jump onto the signal stack? */
	*onstack = (ctx->ps_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
	    && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
	if (*onstack)
		return (char *)ctx->ps_sigstk.ss_sp + ctx->ps_sigstk.ss_size;
#ifdef VM86
	/* vm86 mode: linear stack address is ss<<4 + sp */
	if (tf->tf_eflags & PSL_VM)
		return (void *)(tf->tf_esp + (tf->tf_ss << 4));
	else
#endif
		return (void *)tf->tf_esp;
}

/*
 * Build context to run handler in.  We invoke the handler
 * directly, only returning via the trampoline.  Note the
 * trampoline version numbers are coordinated with machine-
 * dependent code in libc.
 */
void
buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
{
	struct trapframe *tf = l->l_md.md_regs;

	/* point the trapframe at the handler with flat user segments */
	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_eip = (int)catcher;
	tf->tf_cs = GSEL(sel, SEL_UPL);
	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
	tf->tf_esp = (int)fp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
}

/*
 * Deliver a signal using the siginfo (version 2) trampoline ABI:
 * copy a struct sigframe_siginfo onto the user stack and redirect
 * the lwp to the handler.
 */
static void
sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
{
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    GUCODEBIG_SEL : GUCODE_SEL;
	struct sigacts *ps = p->p_sigacts;
	int onstack;
	int sig = ksi->ksi_signo;
	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
	sig_t catcher = SIGACTION(p, sig).sa_handler;
	struct trapframe *tf = l->l_md.md_regs;

	fp--;

	/* Build stack frame for signal trampoline. */
	switch (ps->sa_sigdesc[sig].sd_vers) {
	case 0:		/* handled by sendsig_sigcontext */
	case 1:		/* handled by sendsig_sigcontext */
	default:	/* unknown version */
		printf("nsendsig: bad version %d\n",
		    ps->sa_sigdesc[sig].sd_vers);
		sigexit(l, SIGILL);
	case 2:
		break;
	}

	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
	frame.sf_signum = sig;
	frame.sf_sip = &fp->sf_si;
	frame.sf_ucp = &fp->sf_uc;
	frame.sf_si._info = ksi->ksi_info;
	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
	frame.sf_uc.uc_sigmask = *mask;
	frame.sf_uc.uc_link = NULL;
	frame.sf_uc.uc_flags |= (p->p_sigctx.ps_sigstk.ss_flags & SS_ONSTACK)
	    ? _UC_SETSTACK : _UC_CLRSTACK;
	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);

	if (tf->tf_eflags & PSL_VM)
		(*p->p_emul->e_syscall_intern)(p);

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	buildcontext(l, sel, catcher, fp);

	/* Remember that we're now on the signal stack. */
	if (onstack)
		p->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
}

/*
 * Signal delivery entry point: dispatch to the old sigcontext ABI
 * (trampoline versions < 2, COMPAT_16) or the siginfo ABI.
 */
void
sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
{
#ifdef COMPAT_16
	if (curproc->p_sigacts->sa_sigdesc[ksi->ksi_signo].sd_vers < 2)
		sendsig_sigcontext(ksi, mask);
	else
#endif
		sendsig_siginfo(ksi, mask);
}

/*
 * Build a scheduler-activations upcall frame on the given stack and
 * redirect the lwp to the upcall handler.
 */
void
cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas,
    void *ap, void *sp, sa_upcall_t upcall)
{
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	struct saframe *sf, frame;
	struct trapframe *tf;

	tf = l->l_md.md_regs;

	/* Finally, copy out the rest of the frame. */
	frame.sa_type = type;
	frame.sa_sas = sas;
	frame.sa_events = nevents;
	frame.sa_interrupted = ninterrupted;
	frame.sa_arg = ap;
	frame.sa_ra = 0;

	sf = (struct saframe *)sp - 1;
	if (copyout(&frame, sf, sizeof(frame)) != 0) {
		/* Copying onto the stack didn't work. Die. */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	tf->tf_eip = (int) upcall;
	tf->tf_esp = (int) sf;
	tf->tf_ebp = 0; /* indicate call-frame-top to debuggers */
	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    GSEL(GUCODEBIG_SEL, SEL_UPL) : GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
}

int	waittime = -1;
struct pcb dumppcb;

/*
 * Halt, reboot, or power down the machine.  On Xen, power-down ends
 * in HYPERVISOR_shutdown() rather than a hardware poweroff.
 */
void
cpu_reboot(int howto, char *bootstr)
{

	if (cold) {
		howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;
	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
		waittime = 0;
		vfs_shutdown();
		/*
		 * If we've been adjusting the clock, the todr
		 * will be out of synch; adjust it now.
		 */
		if (time_adjusted != 0)
			resettodr();
	}

	/* Disable interrupts. */
	splhigh();

	/* Do a dump if requested. */
	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
		dumpsys();

haltsys:
	doshutdownhooks();

#ifdef MULTIPROCESSOR
	x86_broadcast_ipi(X86_IPI_HALT);
#endif

	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
#if NACPI > 0
		if (acpi_softc != NULL) {
			delay(500000);
			acpi_enter_sleep_state(acpi_softc, ACPI_STATE_S5);
			printf("WARNING: ACPI powerdown failed!\n");
		}
#endif
#if NAPM > 0 && !defined(APM_NO_POWEROFF)
		/* turn off, if we can.  But try to turn disk off and
		 * wait a bit first--some disk drives are slow to clean up
		 * and users have reported disk corruption.
		 */
		delay(500000);
		apm_set_powstate(APM_DEV_DISK(0xff), APM_SYS_OFF);
		delay(500000);
		apm_set_powstate(APM_DEV_ALLDEVS, APM_SYS_OFF);
		printf("WARNING: APM powerdown failed!\n");
		/*
		 * RB_POWERDOWN implies RB_HALT... fall into it...
		 */
#endif
		HYPERVISOR_shutdown();
	}

	if (howto & RB_HALT) {
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");

#ifdef BEEP_ONHALT
		{
			int c;
			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
				sysbeep(BEEP_ONHALT_PITCH,
				    BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
			}
		}
#endif

		cnpollc(1);	/* for proper keyboard command handling */
		if (cngetc() == 0) {
			/* no console attached, so just hlt */
			for(;;) {
				__asm __volatile("hlt");
			}
		}
		cnpollc(0);
	}

	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for(;;) ;
	/*NOTREACHED*/
}

/*
 * These variables are needed by /sbin/savecore
 */
u_int32_t dumpmag = 0x8fca0101;	/* magic number */
int 	dumpsize = 0;		/* pages */
long	dumplo = 0; 		/* blocks */

/*
 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
 * Returns the header size in disk blocks, or -1 if it won't fit in one block.
 */
int
cpu_dumpsize()
{
	int size;

	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
	if (roundup(size, dbtob(1)) != dbtob(1))
		return (-1);

	return (1);
}

/*
 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
 */
u_long
cpu_dump_mempagecnt()
{
	u_long i, n;

	n = 0;
	for (i = 0; i < mem_cluster_cnt; i++)
		n += atop(mem_clusters[i].size);
	return (n);
}

/*
 * cpu_dump: dump the machine-dependent kernel core dump headers.
 */
int
cpu_dump()
{
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	char buf[dbtob(1)];
	kcore_seg_t *segp;
	cpu_kcore_hdr_t *cpuhdrp;
	phys_ram_seg_t *memsegp;
	const struct bdevsw *bdev;
	int i;

	bdev = bdevsw_lookup(dumpdev);
	if (bdev == NULL)
		return (ENXIO);
	dump = bdev->d_dump;

	/* headers are packed into one disk block: seg, cpu hdr, segments */
	memset(buf, 0, sizeof buf);
	segp = (kcore_seg_t *)buf;
	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
	memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) +
	    ALIGN(sizeof(*cpuhdrp))];

	/*
	 * Generate a segment header.
	 */
	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));

	/*
	 * Add the machine-dependent header info.
	 */
	cpuhdrp->ptdpaddr = PTDpaddr;
	cpuhdrp->nmemsegs = mem_cluster_cnt;

	/*
	 * Fill in the memory segment descriptors.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		memsegp[i].start = mem_clusters[i].start;
		memsegp[i].size = mem_clusters[i].size;
	}

	return (dump(dumpdev, dumplo, (caddr_t)buf, dbtob(1)));
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 */
void
cpu_dumpconf()
{
	const struct bdevsw *bdev;
	int nblks, dumpblks;	/* size of dump area */

	if (dumpdev == NODEV)
		goto bad;
	bdev = bdevsw_lookup(dumpdev);
	if (bdev == NULL)
		panic("dumpconf: bad dumpdev=0x%x", dumpdev);
	if (bdev->d_psize == NULL)
		goto bad;
	nblks = (*bdev->d_psize)(dumpdev);
	if (nblks <= ctod(1))
		goto bad;

	dumpblks = cpu_dumpsize();
	if (dumpblks < 0)
		goto bad;
	dumpblks += ctod(cpu_dump_mempagecnt());

	/* If dump won't fit (incl. room for possible label), punt. */
	if (dumpblks > (nblks - ctod(1)))
		goto bad;

	/* Put dump at end of partition */
	dumplo = nblks - dumpblks;

	/* dumpsize is in page units, and doesn't include headers. */
	dumpsize = cpu_dump_mempagecnt();
	return;

 bad:
	dumpsize = 0;
}

/*
 * Doadump comes here after turning off memory management and
 * getting on the dump stack, either when called above, or by
 * the auto-restart code.
 */
#define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
static vaddr_t dumpspace;

/* Reserve one page of VA for dumpsys() to map physical pages through. */
vaddr_t
reserve_dumppages(vaddr_t p)
{

	dumpspace = p;
	return (p + BYTES_PER_DUMP);
}

/*
 * Write a crash dump of all physical memory clusters to dumpdev,
 * headers first, printing progress in MB as it goes.
 */
void
dumpsys()
{
	u_long totalbytesleft, bytes, i, n, memseg;
	u_long maddr;
	int psize;
	daddr_t blkno;
	const struct bdevsw *bdev;
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	int error;

	/* Save registers. */
	savectx(&dumppcb);

	if (dumpdev == NODEV)
		return;

	bdev = bdevsw_lookup(dumpdev);
	if (bdev == NULL || bdev->d_psize == NULL)
		return;

	/*
	 * For dumps during autoconfiguration,
	 * if dump device has already configured...
	 */
	if (dumpsize == 0)
		cpu_dumpconf();
	if (dumplo <= 0 || dumpsize == 0) {
		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
		    minor(dumpdev));
		return;
	}
	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
	    minor(dumpdev), dumplo);

	psize = (*bdev->d_psize)(dumpdev);
	printf("dump ");
	if (psize == -1) {
		printf("area unavailable\n");
		return;
	}

#if 0	/* XXX this doesn't work.  grr. */
	/* toss any characters present prior to dump */
	while (sget() != NULL); /*syscons and pccons differ */
#endif

	if ((error = cpu_dump()) != 0)
		goto err;

	totalbytesleft = ptoa(cpu_dump_mempagecnt());
	blkno = dumplo + cpu_dumpsize();
	dump = bdev->d_dump;
	error = 0;

	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
		maddr = mem_clusters[memseg].start;
		bytes = mem_clusters[memseg].size;

		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
			/* Print out how many MBs we have left to go. */
			if ((totalbytesleft % (1024*1024)) == 0)
				printf("%ld ", totalbytesleft / (1024 * 1024));

			/* Limit size for next transfer. */
			n = bytes - i;
			if (n > BYTES_PER_DUMP)
				n = BYTES_PER_DUMP;

			/* map the physical page at dumpspace, then write it */
			(void) pmap_map(dumpspace, maddr, maddr + n,
			    VM_PROT_READ);

			error = (*dump)(dumpdev, blkno, (caddr_t)dumpspace, n);
			if (error)
				goto err;
			maddr += n;
			blkno += btodb(n);		/* XXX? */

#if 0	/* XXX this doesn't work.  grr. */
			/* operator aborting dump? */
			if (sget() != NULL) {
				error = EINTR;
				break;
			}
#endif
		}
	}

 err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * Clear registers on exec
 */
void
setregs(struct lwp *l, struct exec_package *pack, u_long stack)
{
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	struct pcb *pcb = &l->l_addr->u_pcb;
	struct trapframe *tf;

#if NNPX > 0
	/* If we were using the FPU, forget about it. */
	if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
		npxsave_lwp(l, 0);
#endif

#ifdef USER_LDT
	pmap_ldt_cleanup(l);
#endif

	l->l_md.md_flags &= ~MDL_USEDFPU;
	/* reset FPU control word to the default for the save format in use */
	if (i386_use_fxsave) {
		pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __NetBSD_NPXCW__;
		pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
	} else
		pcb->pcb_savefpu.sv_87.sv_env.en_cw = __NetBSD_NPXCW__;

	tf = l->l_md.md_regs;
	tf->tf_gs = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_fs = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_edi = 0;
	tf->tf_esi = 0;
	tf->tf_ebp = 0;
	tf->tf_ebx = (int)l->l_proc->p_psstr;
	tf->tf_edx = 0;
	tf->tf_ecx = 0;
	tf->tf_eax = 0;
	tf->tf_eip = pack->ep_entry;
	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
	tf->tf_eflags = PSL_USERSET;
	tf->tf_esp = stack;
	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
}

/*
 * Initialize segments and descriptor tables
 */

union	descriptor *gdt, *ldt;
struct gate_descriptor *idt;
char idt_allocmap[NIDT];
struct simplelock idt_lock = SIMPLELOCK_INITIALIZER;
#ifdef I586_CPU
union	descriptor *pentium_idt;
#endif
extern  struct user *proc0paddr;

/* Fill in a gate descriptor (interrupt/trap/call gate). */
void
setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
    int sel)
{

	gd->gd_looffset = (int)func;
	gd->gd_selector = sel;
	gd->gd_stkcpy = args;
	gd->gd_xx = 0;
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (int)func >> 16;
}

/* Mark a gate descriptor not-present and zero its fields. */
void
unsetgate(struct gate_descriptor *gd)
{
	gd->gd_p = 0;
	gd->gd_hioffset = 0;
	gd->gd_looffset = 0;
	gd->gd_selector = 0;
	gd->gd_xx = 0;
	gd->gd_stkcpy = 0;
	gd->gd_type = 0;
	gd->gd_dpl = 0;
}


/* Fill in a region descriptor for lgdt/lidt. */
void
setregion(struct region_descriptor *rd, void *base, size_t limit)
{

	rd->rd_limit = (int)limit;
	rd->rd_base = (int)base;
}

/* Fill in a memory segment descriptor. */
void
setsegment(struct segment_descriptor *sd, void *base, size_t limit, int type,
    int dpl, int def32, int gran)
{

	sd->sd_lolimit = (int)limit;
	sd->sd_lobase = (int)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (int)limit >> 16;
	sd->sd_xx = 0;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (int)base >> 24;
}

#define	IDTVEC(name)	__CONCAT(X, name)
typedef void (vector)(void);
extern vector IDTVEC(syscall);
extern vector IDTVEC(osyscall);
extern vector *IDTVEC(exceptions)[];
#ifdef COMPAT_SVR4
extern vector IDTVEC(svr4_fasttrap);
#endif /* COMPAT_SVR4 */
#ifdef COMPAT_MACH
extern vector IDTVEC(mach_trap);
#endif
/* Xen expects traps as a trap_info_t table, not a hardware IDT. */
#define MAX_XEN_IDT 128
trap_info_t xen_idt[MAX_XEN_IDT];
int xen_idt_idx;

#define	KBTOB(x)	((size_t)(x) * 1024UL)

/* NOTE(review): panics unconditionally — lidt must not be used under Xen. */
void cpu_init_idt()
{
	struct region_descriptor region;

	panic("cpu_init_idt");
#ifdef I586_CPU
	setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
#else
	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
#endif
	lidt(&region);
}

#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
/*
 * Record one BIOS memory-map entry as a physical RAM cluster,
 * reserving its range in the iomem extent map.  Entries above 4GB
 * are skipped (i386 without PAE cannot address them).
 */
void
add_mem_cluster(u_int64_t seg_start, u_int64_t seg_end, u_int32_t type)
{
	extern struct extent *iomem_ex;
	int i;

	if (seg_end > 0x100000000ULL) {
		printf("WARNING: skipping large "
		    "memory map entry: "
		    "0x%qx/0x%qx/0x%x\n",
		    seg_start,
		    (seg_end - seg_start),
		    type);
		return;
	}

	/*
	 * XXX Chop the last page off the size so that
	 * XXX it can fit in avail_end.
	 */
	if (seg_end == 0x100000000ULL)
		seg_end -= PAGE_SIZE;

	if (seg_end <= seg_start)
		return;

	for (i = 0; i < mem_cluster_cnt; i++) {
		if ((mem_clusters[i].start == round_page(seg_start))
		    && (mem_clusters[i].size
			== trunc_page(seg_end) - mem_clusters[i].start)) {
#ifdef DEBUG_MEMLOAD
			printf("WARNING: skipping duplicate segment entry\n");
#endif
			return;
		}
	}

	/*
	 * Allocate the physical addresses used by RAM
	 * from the iomem extent map.  This is done before
	 * the addresses are page rounded just to make
	 * sure we get them all.
	 */
	if (extent_alloc_region(iomem_ex, seg_start,
	    seg_end - seg_start, EX_NOWAIT)) {
		/* XXX What should we do? */
		printf("WARNING: CAN'T ALLOCATE "
		    "MEMORY SEGMENT "
		    "(0x%qx/0x%qx/0x%x) FROM "
		    "IOMEM EXTENT MAP!\n",
		    seg_start, seg_end - seg_start, type);
		return;
	}

	/*
	 * If it's not free memory, skip it.
	 */
	if (type != BIM_Memory)
		return;

	/* XXX XXX XXX */
	if (mem_cluster_cnt >= VM_PHYSSEG_MAX)
		panic("init386: too many memory segments");

	seg_start = round_page(seg_start);
	seg_end = trunc_page(seg_end);

	if (seg_start == seg_end)
		return;

	mem_clusters[mem_cluster_cnt].start = seg_start;
	mem_clusters[mem_cluster_cnt].size =
	    seg_end - seg_start;

	if (avail_end < seg_end)
		avail_end = seg_end;
	physmem += atop(mem_clusters[mem_cluster_cnt].size);
	mem_cluster_cnt++;
}
#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */

/*
 * Build the initial GDT.  Natively this ends with lgdt; under Xen the
 * GDT page is remapped read-only (the hypervisor validates descriptor
 * writes) and registered with HYPERVISOR_set_gdt.
 */
void
initgdt()
{
#if !defined(XEN)
	struct region_descriptor region;
#else
	paddr_t frames[16];
#endif

#if !defined(XEN)
	gdt = tgdt;
	memset(gdt, 0, NGDT*sizeof(*gdt));
#endif
	/* make gdt gates and memory segments */
	setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 1, 1);
	setsegment(&gdt[GDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 1, 1);
	setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUCODEBIG_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUDATA_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
	    SDT_MEMRWA, SEL_UPL, 1, 1);
#ifdef COMPAT_MACH
	setgate(&gdt[GMACHCALLS_SEL].gd, &IDTVEC(mach_trap), 1,
	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
#if NBIOSCALL > 0
	/* bios trampoline GDT entries */
	setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 0,
	    0);
	setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 0,
	    0);
#endif
	setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary,
	    sizeof(struct cpu_info)-1, SDT_MEMRWA, SEL_KPL, 1, 1);

#if !defined(XEN)
	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
	lgdt(&region);
#else
	/* machine frame number of the GDT page */
	frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
	/* pmap_kremove((vaddr_t)gdt, PAGE_SIZE); */
	/* remap read-only: Xen must mediate all GDT writes */
	pmap_kenter_pa((vaddr_t)gdt, (uint32_t)gdt - KERNBASE,
	    VM_PROT_READ);
	XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
	    LAST_RESERVED_GDT_ENTRY + 1));
	if (HYPERVISOR_set_gdt(frames, LAST_RESERVED_GDT_ENTRY + 1))
		panic("HYPERVISOR_set_gdt failed!\n");
	lgdt_finish();
#endif
}

/*
 * init386: early machine-dependent initialization — probe CPU
 * features, set up the memory map, console, and descriptor tables.
 * (Continues past this view.)
 */
void
init386(paddr_t first_avail)
{
#if !defined(XEN)
	union descriptor *tgdt;
#endif
	extern void consinit(void);
#if !defined(XEN)
	extern struct extent *iomem_ex;
#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
	struct btinfo_memmap *bim;
#endif
	struct region_descriptor region;
#endif
	int x;
#if !defined(XEN)
	int first16q;
	u_int64_t seg_start, seg_end;
	u_int64_t seg_start1, seg_end1;
#endif
	paddr_t realmode_reserved_start;
	psize_t realmode_reserved_size;
	int needs_earlier_install_pte0;
#if NBIOSCALL > 0
	extern int biostramp_image_size;
	extern u_char biostramp_image[];
#endif

	XENPRINTK(("HYPERVISOR_shared_info %p\n", HYPERVISOR_shared_info));
#ifdef XENDEBUG_LOW
	xen_dbglow_init();
#endif

	cpu_probe_features(&cpu_info_primary);
	cpu_feature = cpu_info_primary.ci_feature_flags;

	/* not on Xen... these features are unusable or hypervisor-managed */
	cpu_feature &= ~(CPUID_PGE|CPUID_PSE|CPUID_MTRR|CPUID_FXSR);

	lwp0.l_addr = proc0paddr;
	cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb;

	XENPRINTK(("proc0paddr %p pcb %p first_avail %p\n",
	    proc0paddr, cpu_info_primary.ci_curpcb, (void *)first_avail));
	XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PTDpaddr,
	    (void *)atdevbase));

	x86_bus_space_init();
	consinit();	/* XXX SHOULD NOT BE DONE HERE */
	/*
	 * Initialize PAGE_SIZE-dependent variables.
+ */ + uvm_setpagesize(); + + /* + * Saving SSE registers won't work if the save area isn't + * 16-byte aligned. + */ + if (offsetof(struct user, u_pcb.pcb_savefpu) & 0xf) + panic("init386: pcb_savefpu not 16-byte aligned"); + + /* + * Start with 2 color bins -- this is just a guess to get us + * started. We'll recolor when we determine the largest cache + * sizes on the system. + */ + uvmexp.ncolors = 2; + +#if !defined(XEN) + /* + * BIOS leaves data in physical page 0 + * Even if it didn't, our VM system doesn't like using zero as a + * physical page number. + * We may also need pages in low memory (one each) for secondary CPU + * startup, for BIOS calls, and for ACPI, plus a page table page to map + * them into the first few pages of the kernel's pmap. + */ + avail_start = PAGE_SIZE; +#else + /* Make sure the end of the space used by the kernel is rounded. */ + first_avail = round_page(first_avail); + avail_start = first_avail - KERNBASE; + avail_end = ptoa(xen_start_info.nr_pages) + + (KERNTEXTOFF - KERNBASE_LOCORE); + pmap_pa_start = (KERNTEXTOFF - KERNBASE_LOCORE); + pmap_pa_end = avail_end; + mem_clusters[0].start = avail_start; + mem_clusters[0].size = avail_end - avail_start; + mem_cluster_cnt++; + physmem += atop(mem_clusters[0].size); +#endif + + /* + * reserve memory for real-mode call + */ + needs_earlier_install_pte0 = 0; + realmode_reserved_start = 0; + realmode_reserved_size = 0; +#if NBIOSCALL > 0 + /* save us a page for trampoline code */ + realmode_reserved_size += PAGE_SIZE; + needs_earlier_install_pte0 = 1; +#endif +#ifdef MULTIPROCESSOR /* XXX */ +#if !defined(XEN) + KASSERT(avail_start == PAGE_SIZE); /* XXX */ +#endif + if (realmode_reserved_size < MP_TRAMPOLINE) /* XXX */ + realmode_reserved_size = MP_TRAMPOLINE; /* XXX */ + needs_earlier_install_pte0 = 1; /* XXX */ +#endif /* XXX */ +#if NACPI > 0 + /* trampoline code for wake handler */ + realmode_reserved_size += ptoa(acpi_md_get_npages_of_wakecode()+1); + needs_earlier_install_pte0 = 1; 
+#endif + if (needs_earlier_install_pte0) { + /* page table for directory entry 0 */ + realmode_reserved_size += PAGE_SIZE; + } + if (realmode_reserved_size>0) { + realmode_reserved_start = avail_start; + avail_start += realmode_reserved_size; + } + +#ifdef DEBUG_MEMLOAD + printf("mem_cluster_count: %d\n", mem_cluster_cnt); +#endif + + /* + * Call pmap initialization to make new kernel address space. + * We must do this before loading pages into the VM system. + */ + pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE); + +#if !defined(XEN) +#if !defined(REALBASEMEM) && !defined(REALEXTMEM) + /* + * Check to see if we have a memory map from the BIOS (passed + * to us by the boot program. + */ + bim = lookup_bootinfo(BTINFO_MEMMAP); + if (bim != NULL && bim->num > 0) { +#ifdef DEBUG_MEMLOAD + printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num); +#endif + for (x = 0; x < bim->num; x++) { +#ifdef DEBUG_MEMLOAD + printf(" addr 0x%qx size 0x%qx type 0x%x\n", + bim->entry[x].addr, + bim->entry[x].size, + bim->entry[x].type); +#endif + + /* + * If the segment is not memory, skip it. + */ + switch (bim->entry[x].type) { + case BIM_Memory: + case BIM_ACPI: + case BIM_NVS: + break; + default: + continue; + } + + /* + * Sanity check the entry. + * XXX Need to handle uint64_t in extent code + * XXX and 64-bit physical addresses in i386 + * XXX port. + */ + seg_start = bim->entry[x].addr; + seg_end = bim->entry[x].addr + bim->entry[x].size; + + /* + * Avoid Compatibility Holes. + * XXX Holes within memory space that allow access + * XXX to be directed to the PC-compatible frame buffer + * XXX (0xa0000-0xbffff),to adapter ROM space + * XXX (0xc0000-0xdffff), and to system BIOS space + * XXX (0xe0000-0xfffff). + * XXX Some laptop(for example,Toshiba Satellite2550X) + * XXX report this area and occurred problems, + * XXX so we avoid this area. 
+ */ + if (seg_start < 0x100000 && seg_end > 0xa0000) { + printf("WARNING: memory map entry overlaps " + "with ``Compatibility Holes'': " + "0x%qx/0x%qx/0x%x\n", seg_start, + seg_end - seg_start, bim->entry[x].type); + add_mem_cluster(seg_start, 0xa0000, + bim->entry[x].type); + add_mem_cluster(0x100000, seg_end, + bim->entry[x].type); + } else + add_mem_cluster(seg_start, seg_end, + bim->entry[x].type); + } + } +#endif /* ! REALBASEMEM && ! REALEXTMEM */ + /* + * If the loop above didn't find any valid segment, fall back to + * former code. + */ + if (mem_cluster_cnt == 0) { + /* + * Allocate the physical addresses used by RAM from the iomem + * extent map. This is done before the addresses are + * page rounded just to make sure we get them all. + */ + if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), + EX_NOWAIT)) { + /* XXX What should we do? */ + printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM " + "IOMEM EXTENT MAP!\n"); + } + mem_clusters[0].start = 0; + mem_clusters[0].size = trunc_page(KBTOB(biosbasemem)); + physmem += atop(mem_clusters[0].size); + if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem), + EX_NOWAIT)) { + /* XXX What should we do? */ + printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM " + "IOMEM EXTENT MAP!\n"); + } +#if NISADMA > 0 + /* + * Some motherboards/BIOSes remap the 384K of RAM that would + * normally be covered by the ISA hole to the end of memory + * so that it can be used. However, on a 16M system, this + * would cause bounce buffers to be allocated and used. + * This is not desirable behaviour, as more than 384K of + * bounce buffers might be allocated. As a work-around, + * we round memory down to the nearest 1M boundary if + * we're using any isadma devices and the remapped memory + * is what puts us over 16M. 
+ */ + if (biosextmem > (15*1024) && biosextmem < (16*1024)) { + char pbuf[9]; + + format_bytes(pbuf, sizeof(pbuf), + biosextmem - (15*1024)); + printf("Warning: ignoring %s of remapped memory\n", + pbuf); + biosextmem = (15*1024); + } +#endif + mem_clusters[1].start = IOM_END; + mem_clusters[1].size = trunc_page(KBTOB(biosextmem)); + physmem += atop(mem_clusters[1].size); + + mem_cluster_cnt = 2; + + avail_end = IOM_END + trunc_page(KBTOB(biosextmem)); + } + /* + * If we have 16M of RAM or less, just put it all on + * the default free list. Otherwise, put the first + * 16M of RAM on a lower priority free list (so that + * all of the ISA DMA'able memory won't be eaten up + * first-off). + */ + if (avail_end <= (16 * 1024 * 1024)) + first16q = VM_FREELIST_DEFAULT; + else + first16q = VM_FREELIST_FIRST16; + + /* Make sure the end of the space used by the kernel is rounded. */ + first_avail = round_page(first_avail); +#endif + + XENPRINTK(("load the memory cluster %p(%d) - %p(%ld)\n", + (void *)avail_start, (int)atop(avail_start), + (void *)avail_end, (int)atop(avail_end))); + uvm_page_physload(atop(avail_start), atop(avail_end), + atop(avail_start), atop(avail_end), + VM_FREELIST_DEFAULT); + +#if !defined(XEN) + + /* + * Now, load the memory clusters (which have already been + * rounded and truncated) into the VM system. + * + * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL + * IS LOADED AT IOM_END (1M). + */ + for (x = 0; x < mem_cluster_cnt; x++) { + seg_start = mem_clusters[x].start; + seg_end = mem_clusters[x].start + mem_clusters[x].size; + seg_start1 = 0; + seg_end1 = 0; + + /* + * Skip memory before our available starting point. 
+ */ + if (seg_end <= avail_start) + continue; + + if (avail_start >= seg_start && avail_start < seg_end) { + if (seg_start != 0) + panic("init386: memory doesn't start at 0"); + seg_start = avail_start; + if (seg_start == seg_end) + continue; + } + + /* + * If this segment contains the kernel, split it + * in two, around the kernel. + */ + if (seg_start <= IOM_END && first_avail <= seg_end) { + seg_start1 = first_avail; + seg_end1 = seg_end; + seg_end = IOM_END; + } + + /* First hunk */ + if (seg_start != seg_end) { + if (seg_start < (16 * 1024 * 1024) && + first16q != VM_FREELIST_DEFAULT) { + u_int64_t tmp; + + if (seg_end > (16 * 1024 * 1024)) + tmp = (16 * 1024 * 1024); + else + tmp = seg_end; + + if (tmp != seg_start) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx " + "(0x%lx-0x%lx)\n", + seg_start, tmp, + atop(seg_start), atop(tmp)); +#endif + uvm_page_physload(atop(seg_start), + atop(tmp), atop(seg_start), + atop(tmp), first16q); + } + seg_start = tmp; + } + + if (seg_start != seg_end) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n", + seg_start, seg_end, + atop(seg_start), atop(seg_end)); +#endif + uvm_page_physload(atop(seg_start), + atop(seg_end), atop(seg_start), + atop(seg_end), VM_FREELIST_DEFAULT); + } + } + + /* Second hunk */ + if (seg_start1 != seg_end1) { + if (seg_start1 < (16 * 1024 * 1024) && + first16q != VM_FREELIST_DEFAULT) { + u_int64_t tmp; + + if (seg_end1 > (16 * 1024 * 1024)) + tmp = (16 * 1024 * 1024); + else + tmp = seg_end1; + + if (tmp != seg_start1) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx " + "(0x%lx-0x%lx)\n", + seg_start1, tmp, + atop(seg_start1), atop(tmp)); +#endif + uvm_page_physload(atop(seg_start1), + atop(tmp), atop(seg_start1), + atop(tmp), first16q); + } + seg_start1 = tmp; + } + + if (seg_start1 != seg_end1) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n", + seg_start1, seg_end1, + atop(seg_start1), atop(seg_end1)); +#endif + 
uvm_page_physload(atop(seg_start1), + atop(seg_end1), atop(seg_start1), + atop(seg_end1), VM_FREELIST_DEFAULT); + } + } + } +#endif + + /* + * Steal memory for the message buffer (at end of core). + */ + { + struct vm_physseg *vps; + psize_t sz = round_page(MSGBUFSIZE); + psize_t reqsz = sz; + + for (x = 0; x < vm_nphysseg; x++) { + vps = &vm_physmem[x]; + if (ptoa(vps->avail_end) == avail_end) + goto found; + } + panic("init386: can't find end of memory"); + + found: + /* Shrink so it'll fit in the last segment. */ + if ((vps->avail_end - vps->avail_start) < atop(sz)) + sz = ptoa(vps->avail_end - vps->avail_start); + + vps->avail_end -= atop(sz); + vps->end -= atop(sz); + msgbuf_paddr = ptoa(vps->avail_end); + + /* Remove the last segment if it now has no pages. */ + if (vps->start == vps->end) { + for (vm_nphysseg--; x < vm_nphysseg; x++) + vm_physmem[x] = vm_physmem[x + 1]; + } + + /* Now find where the new avail_end is. */ + for (avail_end = 0, x = 0; x < vm_nphysseg; x++) + if (vm_physmem[x].avail_end > avail_end) + avail_end = vm_physmem[x].avail_end; + avail_end = ptoa(avail_end); + + /* Warn if the message buffer had to be shrunk. */ + if (sz != reqsz) + printf("WARNING: %ld bytes not available for msgbuf " + "in last cluster (%ld used)\n", reqsz, sz); + } + + /* + * install PT page for the first 4M if needed. 
+ */ + if (needs_earlier_install_pte0) { + paddr_t paddr; +#ifdef DIAGNOSTIC + if (realmode_reserved_size < PAGE_SIZE) { + panic("cannot steal memory for first 4M PT page."); + } +#endif + paddr=realmode_reserved_start+realmode_reserved_size-PAGE_SIZE; + pmap_enter(pmap_kernel(), (vaddr_t)vtopte(0), paddr, + VM_PROT_READ|VM_PROT_WRITE, + PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE); + pmap_update(pmap_kernel()); + /* make sure it is clean before using */ + memset(vtopte(0), 0, PAGE_SIZE); + realmode_reserved_size -= PAGE_SIZE; + } + +#if NBIOSCALL > 0 + /* + * this should be caught at kernel build time, but put it here + * in case someone tries to fake it out... + */ +#ifdef DIAGNOSTIC + if (realmode_reserved_start > BIOSTRAMP_BASE || + (realmode_reserved_start+realmode_reserved_size) < (BIOSTRAMP_BASE+ + PAGE_SIZE)) { + panic("cannot steal memory for PT page of bioscall."); + } + if (biostramp_image_size > PAGE_SIZE) + panic("biostramp_image_size too big: %x vs. %x", + biostramp_image_size, PAGE_SIZE); +#endif + pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, /* virtual */ + (paddr_t)BIOSTRAMP_BASE, /* physical */ + VM_PROT_ALL); /* protection */ + pmap_update(pmap_kernel()); + memcpy((caddr_t)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size); +#ifdef DEBUG_BIOSCALL + printf("biostramp installed @ %x\n", BIOSTRAMP_BASE); +#endif + realmode_reserved_size -= PAGE_SIZE; + realmode_reserved_start += PAGE_SIZE; +#endif + +#if NACPI > 0 + /* + * Steal memory for the acpi wake code + */ + { + paddr_t paddr, p; + psize_t sz; + int npg; + + paddr = realmode_reserved_start; + npg = acpi_md_get_npages_of_wakecode(); + sz = ptoa(npg); +#ifdef DIAGNOSTIC + if (realmode_reserved_size < sz) { + panic("cannot steal memory for ACPI wake code."); + } +#endif + + /* identical mapping */ + p = paddr; + for (x=0; x<npg; x++) { + printf("kenter: 0x%08X\n", (unsigned)p); + pmap_kenter_pa((vaddr_t)p, p, VM_PROT_ALL); + p += PAGE_SIZE; + } + pmap_update(pmap_kernel()); + + 
acpi_md_install_wakecode(paddr); + + realmode_reserved_size -= sz; + realmode_reserved_start += sz; + } +#endif + + pmap_enter(pmap_kernel(), idt_vaddr, idt_paddr, + VM_PROT_READ|VM_PROT_WRITE, PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE); + pmap_update(pmap_kernel()); + memset((void *)idt_vaddr, 0, PAGE_SIZE); + +#if !defined(XEN) + idt = (struct gate_descriptor *)idt_vaddr; +#ifdef I586_CPU + pmap_enter(pmap_kernel(), pentium_idt_vaddr, idt_paddr, + VM_PROT_READ, PMAP_WIRED|VM_PROT_READ); + pentium_idt = (union descriptor *)pentium_idt_vaddr; +#endif +#endif + pmap_update(pmap_kernel()); + + initgdt(); + + HYPERVISOR_set_callbacks( + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback, + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); + +#if !defined(XEN) + tgdt = gdt; + gdt = (union descriptor *) + ((char *)idt + NIDT * sizeof (struct gate_descriptor)); + ldt = gdt + NGDT; + + memcpy(gdt, tgdt, NGDT*sizeof(*gdt)); + + setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1, + SDT_SYSLDT, SEL_KPL, 0, 0); +#else + ldt = (union descriptor *)idt_vaddr; +#endif + + /* make ldt gates and memory segments */ + setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1, + SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); + + ldt[LUCODE_SEL] = gdt[GUCODE_SEL]; + ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL]; + ldt[LUDATA_SEL] = gdt[GUDATA_SEL]; + ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; + +#if !defined(XEN) + /* exceptions */ + for (x = 0; x < 32; x++) { + setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386TGT, + (x == 3 || x == 4) ? 
SEL_UPL : SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + idt_allocmap[x] = 1; + } + + /* new-style interrupt gate for syscalls */ + setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386TGT, SEL_UPL, + GSEL(GCODE_SEL, SEL_KPL)); + idt_allocmap[128] = 1; +#ifdef COMPAT_SVR4 + setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386TGT, + SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); + idt_allocmap[0xd2] = 1; +#endif /* COMPAT_SVR4 */ +#endif + + memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT); + xen_idt_idx = 0; + for (x = 0; x < 32; x++) { + KASSERT(xen_idt_idx < MAX_XEN_IDT); + xen_idt[xen_idt_idx].vector = x; + xen_idt[xen_idt_idx].flags = + (x == 3 || x == 4) ? SEL_UPL : SEL_XEN; + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); + xen_idt[xen_idt_idx].address = + (uint32_t)IDTVEC(exceptions)[x]; + xen_idt_idx++; + } + KASSERT(xen_idt_idx < MAX_XEN_IDT); + xen_idt[xen_idt_idx].vector = 128; + xen_idt[xen_idt_idx].flags = SEL_UPL; + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); + xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall); + xen_idt_idx++; +#ifdef COMPAT_SVR4 + KASSERT(xen_idt_idx < MAX_XEN_IDT); + xen_idt[xen_idt_idx].vector = 0xd2; + xen_idt[xen_idt_idx].flags = SEL_UPL; + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); + xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap); + xen_idt_idx++; +#endif /* COMPAT_SVR4 */ + +#if !defined(XEN) + setregion(®ion, gdt, NGDT * sizeof(gdt[0]) - 1); + lgdt(®ion); +#else + lldt(GSEL(GLDT_SEL, SEL_KPL)); +#endif + +#if !defined(XEN) + cpu_init_idt(); +#else + db_trap_callback = ddb_trap_hook; + + XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt)); + if (HYPERVISOR_set_trap_table(xen_idt)) + panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt); +#endif + +#if NKSYMS || defined(DDB) || defined(LKM) + { + extern int end; + extern int *esym; + struct btinfo_symtab *symtab; + +#ifdef DDB + db_machine_init(); +#endif + + symtab = lookup_bootinfo(BTINFO_SYMTAB); + + if (symtab) { + symtab->ssym += 
KERNBASE; + symtab->esym += KERNBASE; + ksyms_init(symtab->nsym, (int *)symtab->ssym, + (int *)symtab->esym); + } + else + ksyms_init(*(int *)&end, ((int *)&end) + 1, esym); + } +#endif +#ifdef DDB + if (boothowto & RB_KDB) + Debugger(); +#endif +#ifdef IPKDB + ipkdb_init(); + if (boothowto & RB_KDB) + ipkdb_connect(0); +#endif +#ifdef KGDB + kgdb_port_init(); + if (boothowto & RB_KDB) { + kgdb_debug_init = 1; + kgdb_connect(1); + } +#endif + +#if NMCA > 0 + /* check for MCA bus, needed to be done before ISA stuff - if + * MCA is detected, ISA needs to use level triggered interrupts + * by default */ + mca_busprobe(); +#endif + +#if defined(XEN) + events_default_setup(); +#else + intr_default_setup(); +#endif + + /* Initialize software interrupts. */ + softintr_init(); + + splraise(IPL_IPI); + enable_intr(); + + if (physmem < btoc(2 * 1024 * 1024)) { + printf("warning: too little memory available; " + "have %lu bytes, want %lu bytes\n" + "running in degraded mode\n" + "press a key to confirm\n\n", + ptoa(physmem), 2*1024*1024UL); + cngetc(); + } + +#ifdef __HAVE_CPU_MAXPROC + /* Make sure maxproc is sane */ + if (maxproc > cpu_maxproc()) + maxproc = cpu_maxproc(); +#endif +} + +#ifdef COMPAT_NOMID +static int +exec_nomid(struct proc *p, struct exec_package *epp) +{ + int error; + u_long midmag, magic; + u_short mid; + struct exec *execp = epp->ep_hdr; + + /* check on validity of epp->ep_hdr performed by exec_out_makecmds */ + + midmag = ntohl(execp->a_midmag); + mid = (midmag >> 16) & 0xffff; + magic = midmag & 0xffff; + + if (magic == 0) { + magic = (execp->a_midmag & 0xffff); + mid = MID_ZERO; + } + + midmag = mid << 16 | magic; + + switch (midmag) { + case (MID_ZERO << 16) | ZMAGIC: + /* + * 386BSD's ZMAGIC format: + */ + error = exec_aout_prep_oldzmagic(p, epp); + break; + + case (MID_ZERO << 16) | QMAGIC: + /* + * BSDI's QMAGIC format: + * same as new ZMAGIC format, but with different magic number + */ + error = exec_aout_prep_zmagic(p, epp); + break; + + case 
(MID_ZERO << 16) | NMAGIC: + /* + * BSDI's NMAGIC format: + * same as NMAGIC format, but with different magic number + * and with text starting at 0. + */ + error = exec_aout_prep_oldnmagic(p, epp); + break; + + case (MID_ZERO << 16) | OMAGIC: + /* + * BSDI's OMAGIC format: + * same as OMAGIC format, but with different magic number + * and with text starting at 0. + */ + error = exec_aout_prep_oldomagic(p, epp); + break; + + default: + error = ENOEXEC; + } + + return error; +} +#endif + +/* + * cpu_exec_aout_makecmds(): + * CPU-dependent a.out format hook for execve(). + * + * Determine of the given exec package refers to something which we + * understand and, if so, set up the vmcmds for it. + * + * On the i386, old (386bsd) ZMAGIC binaries and BSDI QMAGIC binaries + * if COMPAT_NOMID is given as a kernel option. + */ +int +cpu_exec_aout_makecmds(struct proc *p, struct exec_package *epp) +{ + int error = ENOEXEC; + +#ifdef COMPAT_NOMID + if ((error = exec_nomid(p, epp)) == 0) + return error; +#endif /* ! COMPAT_NOMID */ + + return error; +} + +void * +lookup_bootinfo(int type) +{ + struct btinfo_common *help; + int n = *(int*)bootinfo; + help = (struct btinfo_common *)(bootinfo + sizeof(int)); + while(n--) { + if(help->type == type) + return(help); + help = (struct btinfo_common *)((char*)help + help->len); + } + return(0); +} + +#include <dev/ic/mc146818reg.h> /* for NVRAM POST */ +#include <i386/isa/nvram.h> /* for NVRAM POST */ + +void +cpu_reset() +{ + + disable_intr(); + +#if 0 + /* + * Ensure the NVRAM reset byte contains something vaguely sane. + */ + + outb(IO_RTC, NVRAM_RESET); + outb(IO_RTC+1, NVRAM_RESET_RST); + + /* + * The keyboard controller has 4 random output pins, one of which is + * connected to the RESET pin on the CPU in many PCs. We tell the + * keyboard controller to pulse this line a couple of times. 
+ */ + outb(IO_KBD + KBCMDP, KBC_PULSE0); + delay(100000); + outb(IO_KBD + KBCMDP, KBC_PULSE0); + delay(100000); +#endif + + HYPERVISOR_reboot(); + + for (;;); +} + +void +cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags) +{ + const struct trapframe *tf = l->l_md.md_regs; + __greg_t *gr = mcp->__gregs; + __greg_t ras_eip; + + /* Save register context. */ +#ifdef VM86 + if (tf->tf_eflags & PSL_VM) { + gr[_REG_GS] = tf->tf_vm86_gs; + gr[_REG_FS] = tf->tf_vm86_fs; + gr[_REG_ES] = tf->tf_vm86_es; + gr[_REG_DS] = tf->tf_vm86_ds; + gr[_REG_EFL] = get_vflags(l); + } else +#endif + { + gr[_REG_GS] = tf->tf_gs; + gr[_REG_FS] = tf->tf_fs; + gr[_REG_ES] = tf->tf_es; + gr[_REG_DS] = tf->tf_ds; + gr[_REG_EFL] = tf->tf_eflags; + } + gr[_REG_EDI] = tf->tf_edi; + gr[_REG_ESI] = tf->tf_esi; + gr[_REG_EBP] = tf->tf_ebp; + gr[_REG_EBX] = tf->tf_ebx; + gr[_REG_EDX] = tf->tf_edx; + gr[_REG_ECX] = tf->tf_ecx; + gr[_REG_EAX] = tf->tf_eax; + gr[_REG_EIP] = tf->tf_eip; + gr[_REG_CS] = tf->tf_cs; + gr[_REG_ESP] = tf->tf_esp; + gr[_REG_UESP] = tf->tf_esp; + gr[_REG_SS] = tf->tf_ss; + gr[_REG_TRAPNO] = tf->tf_trapno; + gr[_REG_ERR] = tf->tf_err; + + if ((ras_eip = (__greg_t)ras_lookup(l->l_proc, + (caddr_t) gr[_REG_EIP])) != -1) + gr[_REG_EIP] = ras_eip; + + *flags |= _UC_CPU; + + /* Save floating point register context, if any. */ + if ((l->l_md.md_flags & MDL_USEDFPU) != 0) { +#if NNPX > 0 + /* + * If this process is the current FP owner, dump its + * context to the PCB first. + * XXX npxsave() also clears the FPU state; depending on the + * XXX application this might be a penalty. 
+ */ + if (l->l_addr->u_pcb.pcb_fpcpu) { + npxsave_lwp(l, 1); + } +#endif + if (i386_use_fxsave) { + memcpy(&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm, + sizeof (mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm)); + *flags |= _UC_FXSAVE; + } else { + memcpy(&mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, + &l->l_addr->u_pcb.pcb_savefpu.sv_87, + sizeof (mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state)); + } +#if 0 + /* Apparently nothing ever touches this. */ + ucp->mcp.mc_fp.fp_emcsts = l->l_addr->u_pcb.pcb_saveemc; +#endif + *flags |= _UC_FPU; + } +} + +int +cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags) +{ + struct trapframe *tf = l->l_md.md_regs; + __greg_t *gr = mcp->__gregs; + + /* Restore register context, if any. */ + if ((flags & _UC_CPU) != 0) { +#ifdef VM86 + if (gr[_REG_EFL] & PSL_VM) { + tf->tf_vm86_gs = gr[_REG_GS]; + tf->tf_vm86_fs = gr[_REG_FS]; + tf->tf_vm86_es = gr[_REG_ES]; + tf->tf_vm86_ds = gr[_REG_DS]; + set_vflags(l, gr[_REG_EFL]); + if (flags & _UC_VM) { + void syscall_vm86(struct trapframe *); + l->l_proc->p_md.md_syscall = syscall_vm86; + } + } else +#endif + { + /* + * Check for security violations. If we're returning + * to protected mode, the CPU will validate the segment + * registers automatically and generate a trap on + * violations. We handle the trap, rather than doing + * all of the checking here. 
+ */ + if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) || + !USERMODE(gr[_REG_CS], gr[_REG_EFL])) { + printf("cpu_setmcontext error: uc EFL: 0x%08x" + " tf EFL: 0x%08x uc CS: 0x%x\n", + gr[_REG_EFL], tf->tf_eflags, gr[_REG_CS]); + return (EINVAL); + } + tf->tf_gs = gr[_REG_GS]; + tf->tf_fs = gr[_REG_FS]; + tf->tf_es = gr[_REG_ES]; + tf->tf_ds = gr[_REG_DS]; + /* Only change the user-alterable part of eflags */ + tf->tf_eflags &= ~PSL_USER; + tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER); + } + tf->tf_edi = gr[_REG_EDI]; + tf->tf_esi = gr[_REG_ESI]; + tf->tf_ebp = gr[_REG_EBP]; + tf->tf_ebx = gr[_REG_EBX]; + tf->tf_edx = gr[_REG_EDX]; + tf->tf_ecx = gr[_REG_ECX]; + tf->tf_eax = gr[_REG_EAX]; + tf->tf_eip = gr[_REG_EIP]; + tf->tf_cs = gr[_REG_CS]; + tf->tf_esp = gr[_REG_UESP]; + tf->tf_ss = gr[_REG_SS]; + } + + /* Restore floating point register context, if any. */ + if ((flags & _UC_FPU) != 0) { +#if NNPX > 0 + /* + * If we were using the FPU, forget that we were. + */ + if (l->l_addr->u_pcb.pcb_fpcpu != NULL) + npxsave_lwp(l, 0); +#endif + if (flags & _UC_FXSAVE) { + if (i386_use_fxsave) { + memcpy( + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm, + &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, + sizeof (&l->l_addr->u_pcb.pcb_savefpu.sv_xmm)); + } else { + /* This is a weird corner case */ + process_xmm_to_s87((struct savexmm *) + &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, + &l->l_addr->u_pcb.pcb_savefpu.sv_87); + } + } else { + if (i386_use_fxsave) { + process_s87_to_xmm((struct save87 *) + &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm); + } else { + memcpy(&l->l_addr->u_pcb.pcb_savefpu.sv_87, + &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, + sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_87)); + } + } + /* If not set already. */ + l->l_md.md_flags |= MDL_USEDFPU; +#if 0 + /* Apparently unused. 
*/ + l->l_addr->u_pcb.pcb_saveemc = mcp->mc_fp.fp_emcsts; +#endif + } + if (flags & _UC_SETSTACK) + l->l_proc->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK; + if (flags & _UC_CLRSTACK) + l->l_proc->p_sigctx.ps_sigstk.ss_flags &= ~SS_ONSTACK; + return (0); +} + +void +cpu_initclocks() +{ + (*initclock_func)(); +} + +#ifdef MULTIPROCESSOR +void +need_resched(struct cpu_info *ci) +{ + + if (ci->ci_want_resched) + return; + + ci->ci_want_resched = 1; + if ((ci)->ci_curlwp != NULL) + aston((ci)->ci_curlwp->l_proc); + else if (ci != curcpu()) + x86_send_ipi(ci, 0); +} +#endif + +/* + * Allocate an IDT vector slot within the given range. + * XXX needs locking to avoid MP allocation races. + */ + +int +idt_vec_alloc(int low, int high) +{ + int vec; + + simple_lock(&idt_lock); + for (vec = low; vec <= high; vec++) { + if (idt_allocmap[vec] == 0) { + idt_allocmap[vec] = 1; + simple_unlock(&idt_lock); + return vec; + } + } + simple_unlock(&idt_lock); + return 0; +} + +void +idt_vec_set(int vec, void (*function)(void)) +{ + /* + * Vector should be allocated, so no locking needed. + */ + KASSERT(idt_allocmap[vec] == 1); + setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); +} + +void +idt_vec_free(int vec) +{ + simple_lock(&idt_lock); + unsetgate(&idt[vec]); + idt_allocmap[vec] = 0; + simple_unlock(&idt_lock); +} + +/* + * Number of processes is limited by number of available GDT slots. + */ +int +cpu_maxproc(void) +{ +#ifdef USER_LDT + return ((MAXGDTSIZ - NGDT) / 2); +#else + return (MAXGDTSIZ - NGDT); +#endif +} + +#if defined(DDB) || defined(KGDB) + +/* + * Callback to output a backtrace when entering ddb. 
+ */ +void +ddb_trap_hook(int where) +{ + static int once = 0; + db_addr_t db_dot; + + if (once != 0 || where != 1) + return; + once = 1; + + if (curlwp != NULL) { + db_printf("Stopped"); + if (curproc == NULL) + db_printf("; curlwp = %p," + " curproc is NULL at\t", curlwp); + else + db_printf(" in pid %d.%d (%s) at\t", + curproc->p_pid, curlwp->l_lid, + curproc->p_comm); + } else + db_printf("Stopped at\t"); + db_dot = PC_REGS(DDB_REGS); + db_print_loc_and_inst(db_dot); + + db_stack_trace_print((db_expr_t) db_dot, FALSE, 65535, + "", db_printf); +#ifdef DEBUG + db_show_regs((db_expr_t) db_dot, FALSE, 65535, ""); +#endif +} + +#endif /* DDB || KGDB */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c new file mode 100644 index 0000000000..8e031eb242 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c @@ -0,0 +1,4522 @@ +/* $NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $ */ +/* NetBSD: pmap.c,v 1.172 2004/04/12 13:17:46 yamt Exp */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * pmap.c: i386 pmap module rewrite + * Chuck Cranor <chuck@ccrc.wustl.edu> + * 11-Aug-97 + * + * history of this pmap module: in addition to my own input, i used + * the following references for this rewrite of the i386 pmap: + * + * [1] the NetBSD i386 pmap. this pmap appears to be based on the + * BSD hp300 pmap done by Mike Hibler at University of Utah. + * it was then ported to the i386 by William Jolitz of UUNET + * Technologies, Inc. Then Charles M. Hannum of the NetBSD + * project fixed some bugs and provided some speed ups. + * + * [2] the FreeBSD i386 pmap. this pmap seems to be the + * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson + * and David Greenman. + * + * [3] the Mach pmap. this pmap, from CMU, seems to have migrated + * between several processors. the VAX version was done by + * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 + * version was done by Lance Berc, Mike Kupfer, Bob Baron, + * David Golub, and Richard Draves. the alpha version was + * done by Alessandro Forin (CMU/Mach) and Chris Demetriou + * (NetBSD/alpha). 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $"); + +#include "opt_cputype.h" +#include "opt_user_ldt.h" +#include "opt_largepages.h" +#include "opt_lockdebug.h" +#include "opt_multiprocessor.h" +#include "opt_kstack_dr0.h" +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/pool.h> +#include <sys/user.h> +#include <sys/kernel.h> + +#include <uvm/uvm.h> + +#include <machine/atomic.h> +#include <machine/cpu.h> +#include <machine/specialreg.h> +#include <machine/gdt.h> + +#include <dev/isa/isareg.h> +#include <machine/isa_machdep.h> + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/xenpmap.h> + +void xpmap_find_pte(paddr_t); + +/* #define XENDEBUG */ + +#ifdef XENDEBUG +#define XENPRINTF(x) printf x +#define XENPRINTK(x) printf x +#else +#define XENPRINTF(x) +#define XENPRINTK(x) +#endif +#define PRINTF(x) printf x +#define PRINTK(x) printf x + + +/* + * general info: + * + * - for an explanation of how the i386 MMU hardware works see + * the comments in <machine/pte.h>. + * + * - for an explanation of the general memory structure used by + * this pmap (including the recursive mapping), see the comments + * in <machine/pmap.h>. + * + * this file contains the code for the "pmap module." the module's + * job is to manage the hardware's virtual to physical address mappings. + * note that there are two levels of mapping in the VM system: + * + * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's + * to map ranges of virtual address space to objects/files. for + * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only + * to the file /bin/ls starting at offset zero." note that + * the upper layer mapping is not concerned with how individual + * vm_pages are mapped. + * + * [2] the lower layer of the VM system (the pmap) maintains the mappings + * from virtual addresses. 
it is concerned with which vm_page is + * mapped where. for example, when you run /bin/ls and start + * at page 0x1000 the fault routine may lookup the correct page + * of the /bin/ls file and then ask the pmap layer to establish + * a mapping for it. + * + * note that information in the lower layer of the VM system can be + * thrown away since it can easily be reconstructed from the info + * in the upper layer. + * + * data structures we use include: + * + * - struct pmap: describes the address space of one thread + * - struct pv_entry: describes one <PMAP,VA> mapping of a PA + * - struct pv_head: there is one pv_head per managed page of + * physical memory. the pv_head points to a list of pv_entry + * structures which describe all the <PMAP,VA> pairs that this + * page is mapped in. this is critical for page based operations + * such as pmap_page_protect() [change protection on _all_ mappings + * of a page] + * - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's. + * if we run out of pv_entry's we allocate a new pv_page and free + * its pv_entrys. + * - pmap_remove_record: a list of virtual addresses whose mappings + * have been changed. used for TLB flushing. + */ + +/* + * memory allocation + * + * - there are three data structures that we must dynamically allocate: + * + * [A] new process' page directory page (PDP) + * - plan 1: done at pmap_create() we use + * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this + * allocation. + * + * if we are low in free physical memory then we sleep in + * uvm_km_alloc -- in this case this is ok since we are creating + * a new pmap and should not be holding any locks. + * + * if the kernel is totally out of virtual space + * (i.e. uvm_km_alloc returns NULL), then we panic. + * + * XXX: the fork code currently has no way to return an "out of + * memory, try again" error code since uvm_fork [fka vm_fork] + * is a void function. 
+ * + * [B] new page tables pages (PTP) + * - call uvm_pagealloc() + * => success: zero page, add to pm_pdir + * => failure: we are out of free vm_pages, let pmap_enter() + * tell UVM about it. + * + * note: for kernel PTPs, we start with NKPTP of them. as we map + * kernel memory (at uvm_map time) we check to see if we've grown + * the kernel pmap. if so, we call the optional function + * pmap_growkernel() to grow the kernel PTPs in advance. + * + * [C] pv_entry structures + * - plan 1: try to allocate one off the free list + * => success: done! + * => failure: no more free pv_entrys on the list + * - plan 2: try to allocate a new pv_page to add a chunk of + * pv_entrys to the free list + * [a] obtain a free, unmapped, VA in kmem_map. either + * we have one saved from a previous call, or we allocate + * one now using a "vm_map_lock_try" in uvm_map + * => success: we have an unmapped VA, continue to [b] + * => failure: unable to lock kmem_map or out of VA in it. + * move on to plan 3. + * [b] allocate a page in kmem_object for the VA + * => success: map it in, free the pv_entry's, DONE! + * => failure: kmem_object locked, no free vm_pages, etc. + * save VA for later call to [a], go to plan 3. + * If we fail, we simply let pmap_enter() tell UVM about it. + */ + +/* + * locking + * + * we have the following locks that we must contend with: + * + * "normal" locks: + * + * - pmap_main_lock + * this lock is used to prevent deadlock and/or provide mutex + * access to the pmap system. most operations lock the pmap + * structure first, then they lock the pv_lists (if needed). + * however, some operations such as pmap_page_protect lock + * the pv_lists and then lock pmaps. in order to prevent a + * cycle, we require a mutex lock when locking the pv_lists + * first. thus, the "pmap = >pv_list" lockers must gain a + * read-lock on pmap_main_lock before locking the pmap. and + * the "pv_list => pmap" lockers must gain a write-lock on + * pmap_main_lock before locking. 
since only one thread + * can write-lock a lock at a time, this provides mutex. + * + * "simple" locks: + * + * - pmap lock (per pmap, part of uvm_object) + * this lock protects the fields in the pmap structure including + * the non-kernel PDEs in the PDP, and the PTEs. it also locks + * in the alternate PTE space (since that is determined by the + * entry in the PDP). + * + * - pvh_lock (per pv_head) + * this lock protects the pv_entry list which is chained off the + * pv_head structure for a specific managed PA. it is locked + * when traversing the list (e.g. adding/removing mappings, + * syncing R/M bits, etc.) + * + * - pvalloc_lock + * this lock protects the data structures which are used to manage + * the free list of pv_entry structures. + * + * - pmaps_lock + * this lock protects the list of active pmaps (headed by "pmaps"). + * we lock it when adding or removing pmaps from this list. + * + */ + +/* + * locking data structures + */ + +static struct simplelock pvalloc_lock; +static struct simplelock pmaps_lock; + +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) +static struct lock pmap_main_lock; + +#define PMAP_MAP_TO_HEAD_LOCK() \ + (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL) +#define PMAP_MAP_TO_HEAD_UNLOCK() \ + (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL) + +#define PMAP_HEAD_TO_MAP_LOCK() \ + (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL) +#define PMAP_HEAD_TO_MAP_UNLOCK() \ + spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0) + +#else + +#define PMAP_MAP_TO_HEAD_LOCK() /* null */ +#define PMAP_MAP_TO_HEAD_UNLOCK() /* null */ + +#define PMAP_HEAD_TO_MAP_LOCK() /* null */ +#define PMAP_HEAD_TO_MAP_UNLOCK() /* null */ + +#endif + +#define COUNT(x) /* nothing */ + +/* + * TLB Shootdown: + * + * When a mapping is changed in a pmap, the TLB entry corresponding to + * the virtual address must be invalidated on all processors. 
In order + * to accomplish this on systems with multiple processors, messages are + * sent from the processor which performs the mapping change to all + * processors on which the pmap is active. For other processors, the + * ASN generation numbers for that processor is invalidated, so that + * the next time the pmap is activated on that processor, a new ASN + * will be allocated (which implicitly invalidates all TLB entries). + * + * Shootdown job queue entries are allocated using a simple special- + * purpose allocator for speed. + */ +struct pmap_tlb_shootdown_job { + TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list; + vaddr_t pj_va; /* virtual address */ + pmap_t pj_pmap; /* the pmap which maps the address */ + pt_entry_t pj_pte; /* the PTE bits */ + struct pmap_tlb_shootdown_job *pj_nextfree; +}; + +#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 32 +union pmap_tlb_shootdown_job_al { + struct pmap_tlb_shootdown_job pja_job; + char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN]; +}; + +struct pmap_tlb_shootdown_q { + TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head; + int pq_pte; /* aggregate PTE bits */ + int pq_count; /* number of pending requests */ + __cpu_simple_lock_t pq_slock; /* spin lock on queue */ + int pq_flushg; /* pending flush global */ + int pq_flushu; /* pending flush user */ +} pmap_tlb_shootdown_q[X86_MAXPROCS]; + +#define PMAP_TLB_MAXJOBS 16 + +void pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *); +struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get + (struct pmap_tlb_shootdown_q *); +void pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *, + struct pmap_tlb_shootdown_job *); + +__cpu_simple_lock_t pmap_tlb_shootdown_job_lock; +union pmap_tlb_shootdown_job_al *pj_page, *pj_free; + +/* + * global data structures + */ + +struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ + +/* + * nkpde is the number of kernel PTPs allocated for the kernel at + * boot time (NKPTP is a compile time override). 
this number can + * grow dynamically as needed (but once allocated, we never free + * kernel PTPs). + */ + +int nkpde = NKPTP; +#ifdef NKPDE +#error "obsolete NKPDE: use NKPTP" +#endif + +/* + * pmap_pg_g: if our processor supports PG_G in the PTE then we + * set pmap_pg_g to PG_G (otherwise it is zero). + */ + +int pmap_pg_g = 0; + +#ifdef LARGEPAGES +/* + * pmap_largepages: if our processor supports PG_PS and we are + * using it, this is set to TRUE. + */ + +int pmap_largepages; +#endif + +/* + * i386 physical memory comes in a big contig chunk with a small + * hole toward the front of it... the following two paddr_t's + * (shared with machdep.c) describe the physical address space + * of this machine. + */ +paddr_t avail_start; /* PA of first available physical page */ +paddr_t avail_end; /* PA of last available physical page */ + +paddr_t pmap_pa_start; /* PA of first physical page for this domain */ +paddr_t pmap_pa_end; /* PA of last physical page for this domain */ + + /* MA of last physical page of the machine */ +paddr_t pmap_mem_end = HYPERVISOR_VIRT_START; /* updated for domain-0 */ + +/* + * other data structures + */ + +static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ +static boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */ + +/* + * the following two vaddr_t's are used during system startup + * to keep track of how much of the kernel's VM space we have used. + * once the system is started, the management of the remaining kernel + * VM space is turned over to the kernel_map vm_map. 
+ */ + +static vaddr_t virtual_avail; /* VA of first free KVA */ +static vaddr_t virtual_end; /* VA of last free KVA */ + + +/* + * pv_page management structures: locked by pvalloc_lock + */ + +TAILQ_HEAD(pv_pagelist, pv_page); +static struct pv_pagelist pv_freepages; /* list of pv_pages with free entrys */ +static struct pv_pagelist pv_unusedpgs; /* list of unused pv_pages */ +static int pv_nfpvents; /* # of free pv entries */ +static struct pv_page *pv_initpage; /* bootstrap page from kernel_map */ +static vaddr_t pv_cachedva; /* cached VA for later use */ + +#define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */ +#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2)) + /* high water mark */ + +static __inline int +pv_compare(struct pv_entry *a, struct pv_entry *b) +{ + if (a->pv_pmap < b->pv_pmap) + return (-1); + else if (a->pv_pmap > b->pv_pmap) + return (1); + else if (a->pv_va < b->pv_va) + return (-1); + else if (a->pv_va > b->pv_va) + return (1); + else + return (0); +} + +SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare); +SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare); + +/* + * linked list of all non-kernel pmaps + */ + +static struct pmap_head pmaps; + +/* + * pool that pmap structures are allocated from + */ + +struct pool pmap_pmap_pool; + +/* + * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a + * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing + * due to false sharing. 
+ */ + +#ifdef MULTIPROCESSOR +#define PTESLEW(pte, id) ((pte)+(id)*NPTECL) +#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) +#else +#define PTESLEW(pte, id) (pte) +#define VASLEW(va,id) (va) +#endif + +/* + * special VAs and the PTEs that map them + */ +static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte; +static caddr_t csrcp, cdstp, zerop, ptpp; + +/* + * pool and cache that PDPs are allocated from + */ + +struct pool pmap_pdp_pool; +struct pool_cache pmap_pdp_cache; +u_int pmap_pdp_cache_generation; + +int pmap_pdp_ctor(void *, void *, int); +void pmap_pdp_dtor(void *, void *); + +caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ + +extern vaddr_t msgbuf_vaddr; +extern paddr_t msgbuf_paddr; + +extern vaddr_t idt_vaddr; /* we allocate IDT early */ +extern paddr_t idt_paddr; + +#if defined(I586_CPU) +/* stuff to fix the pentium f00f bug */ +extern vaddr_t pentium_idt_vaddr; +#endif + + +/* + * local prototypes + */ + +static struct pv_entry *pmap_add_pvpage(struct pv_page *, boolean_t); +static struct vm_page *pmap_alloc_ptp(struct pmap *, int); +static struct pv_entry *pmap_alloc_pv(struct pmap *, int); /* see codes below */ +#define ALLOCPV_NEED 0 /* need PV now */ +#define ALLOCPV_TRY 1 /* just try to allocate, don't steal */ +#define ALLOCPV_NONEED 2 /* don't need PV, just growing cache */ +static struct pv_entry *pmap_alloc_pvpage(struct pmap *, int); +static void pmap_enter_pv(struct pv_head *, + struct pv_entry *, struct pmap *, + vaddr_t, struct vm_page *); +static void pmap_free_pv(struct pmap *, struct pv_entry *); +static void pmap_free_pvs(struct pmap *, struct pv_entry *); +static void pmap_free_pv_doit(struct pv_entry *); +static void pmap_free_pvpage(void); +static struct vm_page *pmap_get_ptp(struct pmap *, int); +static boolean_t pmap_is_curpmap(struct pmap *); +static boolean_t pmap_is_active(struct pmap *, int); +static pt_entry_t *pmap_map_ptes(struct pmap *); +static struct pv_entry 
*pmap_remove_pv(struct pv_head *, struct pmap *, + vaddr_t); +static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); +static boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, + pt_entry_t *, vaddr_t, int32_t *, int); +static void pmap_remove_ptes(struct pmap *, struct vm_page *, + vaddr_t, vaddr_t, vaddr_t, int32_t *, + int); +#define PMAP_REMOVE_ALL 0 /* remove all mappings */ +#define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ + +static vaddr_t pmap_tmpmap_pa(paddr_t); +static pt_entry_t *pmap_tmpmap_pvepte(struct pv_entry *); +static void pmap_tmpunmap_pa(void); +static void pmap_tmpunmap_pvepte(struct pv_entry *); +static void pmap_unmap_ptes(struct pmap *); + +static boolean_t pmap_reactivate(struct pmap *); + +#ifdef DEBUG +u_int curapdp; +#endif + +/* + * p m a p i n l i n e h e l p e r f u n c t i o n s + */ + +/* + * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? + * of course the kernel is always loaded + */ + +__inline static boolean_t +pmap_is_curpmap(pmap) + struct pmap *pmap; +{ + + return((pmap == pmap_kernel()) || + (pmap == curcpu()->ci_pmap)); +} + +/* + * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 
+ */ + +__inline static boolean_t +pmap_is_active(pmap, cpu_id) + struct pmap *pmap; + int cpu_id; +{ + + return (pmap == pmap_kernel() || + (pmap->pm_cpus & (1U << cpu_id)) != 0); +} + +/* + * pmap_tmpmap_pa: map a page in for tmp usage + */ + +__inline static vaddr_t +pmap_tmpmap_pa(pa) + paddr_t pa; +{ +#ifdef MULTIPROCESSOR + int id = cpu_number(); +#endif + pt_entry_t *ptpte = PTESLEW(ptp_pte, id); + pt_entry_t *maptp; + caddr_t ptpva = VASLEW(ptpp, id); +#if defined(DIAGNOSTIC) + if (*ptpte) + panic("pmap_tmpmap_pa: ptp_pte in use?"); +#endif + maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte); + PTE_SET(ptpte, maptp, PG_V | PG_RW | pa); /* always a new mapping */ + return((vaddr_t)ptpva); +} + +/* + * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa) + */ + +__inline static void +pmap_tmpunmap_pa() +{ +#ifdef MULTIPROCESSOR + int id = cpu_number(); +#endif + pt_entry_t *ptpte = PTESLEW(ptp_pte, id); + pt_entry_t *maptp; + caddr_t ptpva = VASLEW(ptpp, id); +#if defined(DIAGNOSTIC) + if (!pmap_valid_entry(*ptp_pte)) + panic("pmap_tmpunmap_pa: our pte invalid?"); +#endif + maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte); + PTE_CLEAR(ptpte, maptp); /* zap! */ + pmap_update_pg((vaddr_t)ptpva); +#ifdef MULTIPROCESSOR + /* + * No need for tlb shootdown here, since ptp_pte is per-CPU. + */ +#endif +} + +/* + * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry + * + * => do NOT use this on kernel mappings [why? because pv_ptp may be NULL] + */ + +__inline static pt_entry_t * +pmap_tmpmap_pvepte(pve) + struct pv_entry *pve; +{ +#ifdef DIAGNOSTIC + if (pve->pv_pmap == pmap_kernel()) + panic("pmap_tmpmap_pvepte: attempt to map kernel"); +#endif + + /* is it current pmap? use direct mapping... 
*/ + if (pmap_is_curpmap(pve->pv_pmap)) + return(vtopte(pve->pv_va)); + + return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp))) + + ptei((unsigned)pve->pv_va)); +} + +/* + * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte + */ + +__inline static void +pmap_tmpunmap_pvepte(pve) + struct pv_entry *pve; +{ + /* was it current pmap? if so, return */ + if (pmap_is_curpmap(pve->pv_pmap)) + return; + + pmap_tmpunmap_pa(); +} + +__inline static void +pmap_apte_flush(struct pmap *pmap) +{ +#if defined(MULTIPROCESSOR) + struct pmap_tlb_shootdown_q *pq; + struct cpu_info *ci, *self = curcpu(); + CPU_INFO_ITERATOR cii; + int s; +#endif + + tlbflush(); /* flush TLB on current processor */ +#if defined(MULTIPROCESSOR) + /* + * Flush the APTE mapping from all other CPUs that + * are using the pmap we are using (who's APTE space + * is the one we've just modified). + * + * XXXthorpej -- find a way to defer the IPI. + */ + for (CPU_INFO_FOREACH(cii, ci)) { + if (ci == self) + continue; + if (pmap_is_active(pmap, ci->ci_cpuid)) { + pq = &pmap_tlb_shootdown_q[ci->ci_cpuid]; + s = splipi(); + __cpu_simple_lock(&pq->pq_slock); + pq->pq_flushu++; + __cpu_simple_unlock(&pq->pq_slock); + splx(s); + x86_send_ipi(ci, X86_IPI_TLB); + } + } +#endif +} + +/* + * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in + * + * => we lock enough pmaps to keep things locked in + * => must be undone with pmap_unmap_ptes before returning + */ + +__inline static pt_entry_t * +pmap_map_ptes(pmap) + struct pmap *pmap; +{ + pd_entry_t opde; + pd_entry_t *mapdp; + struct pmap *ourpmap; + struct cpu_info *ci; + + /* the kernel's pmap is always accessible */ + if (pmap == pmap_kernel()) { + return(PTE_BASE); + } + + ci = curcpu(); + if (ci->ci_want_pmapload && + vm_map_pmap(&ci->ci_curlwp->l_proc->p_vmspace->vm_map) == pmap) + pmap_load(); + + /* if curpmap then we are always mapped */ + if (pmap_is_curpmap(pmap)) { + simple_lock(&pmap->pm_obj.vmobjlock); + 
return(PTE_BASE); + } + + ourpmap = ci->ci_pmap; + + /* need to lock both curpmap and pmap: use ordered locking */ + if ((unsigned) pmap < (unsigned) ourpmap) { + simple_lock(&pmap->pm_obj.vmobjlock); + simple_lock(&ourpmap->pm_obj.vmobjlock); + } else { + simple_lock(&ourpmap->pm_obj.vmobjlock); + simple_lock(&pmap->pm_obj.vmobjlock); + } + + /* need to load a new alternate pt space into curpmap? */ + COUNT(apdp_pde_map); + opde = PDE_GET(APDP_PDE); + if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) { + XENPRINTF(("APDP_PDE %p %p/%p set %p/%p\n", + pmap, + (void *)vtophys((vaddr_t)APDP_PDE), + (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)), + (void *)pmap->pm_pdirpa, + (void *)xpmap_ptom(pmap->pm_pdirpa))); + mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE); + PDE_SET(APDP_PDE, mapdp, pmap->pm_pdirpa /* | PG_RW */ | PG_V); +#ifdef DEBUG + curapdp = pmap->pm_pdirpa; +#endif + if (pmap_valid_entry(opde)) + pmap_apte_flush(ourpmap); + XENPRINTF(("APDP_PDE set done\n")); + } + return(APTE_BASE); +} + +/* + * pmap_unmap_ptes: unlock the PTE mapping of "pmap" + */ + +__inline static void +pmap_unmap_ptes(pmap) + struct pmap *pmap; +{ +#if defined(MULTIPROCESSOR) + pd_entry_t *mapdp; +#endif + + if (pmap == pmap_kernel()) { + return; + } + if (pmap_is_curpmap(pmap)) { + simple_unlock(&pmap->pm_obj.vmobjlock); + } else { + struct pmap *ourpmap = curcpu()->ci_pmap; + +#if defined(MULTIPROCESSOR) + mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE); + PDE_CLEAR(APDP_PDE, mapdp); + pmap_apte_flush(ourpmap); +#endif +#ifdef DEBUG + curapdp = 0; +#endif + XENPRINTF(("APDP_PDE clear %p/%p set %p/%p\n", + (void *)vtophys((vaddr_t)APDP_PDE), + (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)), + (void *)pmap->pm_pdirpa, + (void *)xpmap_ptom(pmap->pm_pdirpa))); + COUNT(apdp_pde_unmap); + simple_unlock(&pmap->pm_obj.vmobjlock); + simple_unlock(&ourpmap->pm_obj.vmobjlock); + } +} + +__inline static void +pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, 
pt_entry_t npte) +{ + if (curproc == NULL || curproc->p_vmspace == NULL || + pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) + return; + + if ((opte ^ npte) & PG_X) + pmap_update_pg(va); + + /* + * Executability was removed on the last executable change. + * Reset the code segment to something conservative and + * let the trap handler deal with setting the right limit. + * We can't do that because of locking constraints on the vm map. + */ + + if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { + struct trapframe *tf = curlwp->l_md.md_regs; + struct pcb *pcb = &curlwp->l_addr->u_pcb; + + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); + pm->pm_hiexec = I386_MAX_EXE_ADDR; + } +} + +__inline static pt_entry_t +pte_mtop(pt_entry_t pte) +{ + pt_entry_t ppte; + + KDASSERT(pmap_valid_entry(pte)); + ppte = xpmap_mtop(pte); + if ((ppte & PG_FRAME) == XPMAP_OFFSET) { + XENPRINTF(("pte_mtop: null page %08x -> %08x\n", + ppte, pte)); + ppte = pte; + } + + return ppte; +} + +__inline static pt_entry_t +pte_get_ma(pt_entry_t *pte) +{ + + return *pte; +} + +__inline static pt_entry_t +pte_get(pt_entry_t *pte) +{ + + if (pmap_valid_entry(*pte)) + return pte_mtop(*pte); + return *pte; +} + +__inline static pt_entry_t +pte_atomic_update_ma(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte) +{ + pt_entry_t opte; + + XENPRINTK(("pte_atomic_update_ma pte %p mapte %p npte %08x\n", + pte, mapte, npte)); + opte = PTE_GET_MA(pte); + if (opte > pmap_mem_end) { + /* must remove opte unchecked */ + if (npte > pmap_mem_end) + /* must set npte unchecked */ + xpq_queue_unchecked_pte_update(mapte, npte); + else { + /* must set npte checked */ + xpq_queue_unchecked_pte_update(mapte, 0); + xpq_queue_pte_update(mapte, npte); + } + } else { + /* must remove opte checked */ + if (npte > pmap_mem_end) { + /* must set npte unchecked */ + xpq_queue_pte_update(mapte, 0); + xpq_queue_unchecked_pte_update(mapte, npte); + } else + /* must set npte checked */ + 
xpq_queue_pte_update(mapte, npte); + } + xpq_flush_queue(); + + return opte; +} + +__inline static pt_entry_t +pte_atomic_update(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte) +{ + pt_entry_t opte; + + opte = pte_atomic_update_ma(pte, mapte, npte); + + return pte_mtop(opte); +} + +/* + * Fixup the code segment to cover all potential executable mappings. + * returns 0 if no changes to the code segment were made. + */ + +int +pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) +{ + struct vm_map_entry *ent; + struct pmap *pm = vm_map_pmap(map); + vaddr_t va = 0; + + vm_map_lock_read(map); + for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { + + /* + * This entry has greater va than the entries before. + * We need to make it point to the last page, not past it. + */ + + if (ent->protection & VM_PROT_EXECUTE) + va = trunc_page(ent->end) - PAGE_SIZE; + } + vm_map_unlock_read(map); + if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) + return (0); + + pm->pm_hiexec = va; + if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); + } else { + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); + return (0); + } + return (1); +} + +/* + * p m a p k e n t e r f u n c t i o n s + * + * functions to quickly enter/remove pages from the kernel address + * space. pmap_kremove is exported to MI kernel. we make use of + * the recursive PTE mappings. + */ + +/* + * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking + * + * => no need to lock anything, assume va is already allocated + * => should be faster than normal pmap enter function + */ + +void +pmap_kenter_pa(va, pa, prot) + vaddr_t va; + paddr_t pa; + vm_prot_t prot; +{ + pt_entry_t *pte, opte, npte; + pt_entry_t *maptp; + + if (va < VM_MIN_KERNEL_ADDRESS) + pte = vtopte(va); + else + pte = kvtopte(va); + + npte = ((prot & VM_PROT_WRITE) ? 
PG_RW : PG_RO) | + PG_V | pmap_pg_g; + + if (pa >= pmap_pa_start && pa < pmap_pa_end) { + npte |= xpmap_ptom(pa); + } else { + XENPRINTF(("pmap_kenter: va %08lx outside pa range %08lx\n", + va, pa)); + npte |= pa; + } + + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */ + XENPRINTK(("pmap_kenter_pa(%p,%p) %p, was %08x now %08x\n", (void *)va, + (void *)pa, pte, opte, npte)); +#ifdef LARGEPAGES + /* XXX For now... */ + if (opte & PG_PS) + panic("pmap_kenter_pa: PG_PS"); +#endif + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; + + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. */ + pmap_update_pg(va); +#endif + } +} + +/* + * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking + * + * => no need to lock anything, assume va is already allocated + * => should be faster than normal pmap enter function + */ + +void pmap_kenter_ma __P((vaddr_t, paddr_t, vm_prot_t)); + +void +pmap_kenter_ma(va, ma, prot) + vaddr_t va; + paddr_t ma; + vm_prot_t prot; +{ + pt_entry_t *pte, opte, npte; + pt_entry_t *maptp; + + KASSERT (va >= VM_MIN_KERNEL_ADDRESS); + pte = kvtopte(va); + + npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) | + PG_V | pmap_pg_g; + + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */ + XENPRINTK(("pmap_kenter_ma(%p,%p) %p, was %08x\n", (void *)va, + (void *)ma, pte, opte)); +#ifdef LARGEPAGES + /* XXX For now... */ + if (opte & PG_PS) + panic("pmap_kenter_ma: PG_PS"); +#endif + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; + + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. 
*/ + pmap_update_pg(va); +#endif + } +} + +/* + * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking + * + * => no need to lock anything + * => caller must dispose of any vm_page mapped in the va range + * => note: not an inline function + * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE + * => we assume kernel only unmaps valid addresses and thus don't bother + * checking the valid bit before doing TLB flushing + */ + +void +pmap_kremove(va, len) + vaddr_t va; + vsize_t len; +{ + pt_entry_t *pte, opte; + pt_entry_t *maptp; + int32_t cpumask = 0; + + XENPRINTK(("pmap_kremove va %p, len %08lx\n", (void *)va, len)); + len >>= PAGE_SHIFT; + for ( /* null */ ; len ; len--, va += PAGE_SIZE) { + if (va < VM_MIN_KERNEL_ADDRESS) + pte = vtopte(va); + else + pte = kvtopte(va); + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update_ma(pte, maptp, 0); /* zap! */ + XENPRINTK(("pmap_kremove pte %p, was %08x\n", pte, opte)); +#ifdef LARGEPAGES + /* XXX For now... */ + if (opte & PG_PS) + panic("pmap_kremove: PG_PS"); +#endif +#ifdef DIAGNOSTIC + if (opte & PG_PVLIST) + panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", + va); +#endif + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); + } + pmap_tlb_shootnow(cpumask); +} + +/* + * p m a p i n i t f u n c t i o n s + * + * pmap_bootstrap and pmap_init are called during system startup + * to init the pmap module. pmap_bootstrap() does a low level + * init just to get things rolling. pmap_init() finishes the job. + */ + +/* + * pmap_bootstrap: get the system in a state where it can run with VM + * properly enabled (called before main()). the VM system is + * fully init'd later... + * + * => on i386, locore.s has already enabled the MMU by allocating + * a PDP for the kernel, and nkpde PTP's for the kernel. 
 * => kva_start is the first free virtual address in kernel space
 */

void
pmap_bootstrap(kva_start)
	vaddr_t kva_start;
{
	struct pmap *kpm;
	vaddr_t kva;
	pt_entry_t *pte;
	pt_entry_t *maptp;	/* machine address of a PTE, for Xen updates */
	int i;

	/*
	 * set up our local static global vars that keep track of the
	 * usage of KVM before kernel_map is set up
	 */

	virtual_avail = kva_start;		/* first free KVA */
	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */

	/*
	 * find out where physical memory ends on the real hardware.
	 * (only meaningful for a privileged / dom0 guest)
	 */

	if (xen_start_info.flags & SIF_PRIVILEGED)
		pmap_mem_end = find_pmap_mem_end(kva_start);

	/*
	 * set up protection_codes: we need to be able to convert from
	 * a MI protection code (some combo of VM_PROT...) to something
	 * we can jam into a i386 PTE.
	 */

	protection_codes[VM_PROT_NONE] = 0;			/* --- */
	protection_codes[VM_PROT_EXECUTE] = PG_X;		/* --x */
	protection_codes[VM_PROT_READ] = PG_RO;			/* -r- */
	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO|PG_X;/* -rx */
	protection_codes[VM_PROT_WRITE] = PG_RW;		/* w-- */
	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW|PG_X;/* w-x */
	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW;	/* wr- */
	protection_codes[VM_PROT_ALL] = PG_RW|PG_X;		/* wrx */

	/*
	 * now we init the kernel's pmap
	 *
	 * the kernel pmap's pm_obj is not used for much.   however, in
	 * user pmaps the pm_obj contains the list of active PTPs.
	 * the pm_obj currently does not have a pager.   it might be possible
	 * to add a pager that would allow a process to read-only mmap its
	 * own page tables (fast user level vtophys?).   this may or may not
	 * be useful.
	 */

	kpm = pmap_kernel();
	simple_lock_init(&kpm->pm_obj.vmobjlock);
	kpm->pm_obj.pgops = NULL;
	TAILQ_INIT(&kpm->pm_obj.memq);
	kpm->pm_obj.uo_npages = 0;
	kpm->pm_obj.uo_refs = 1;
	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
	/* the boot PDP is the one Xen handed us, recorded in lwp0's pcb */
	kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
	XENPRINTF(("pm_pdirpa %p PTDpaddr %p\n",
	    (void *)lwp0.l_addr->u_pcb.pcb_cr3, (void *)PTDpaddr));
	kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3;
	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
	    x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);

	/*
	 * the above is just a rough estimate and not critical to the proper
	 * operation of the system.
	 */

	/*
	 * Begin to enable global TLB entries if they are supported.
	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
	 * which happens in cpu_init(), which is run on each cpu
	 * (and happens later)
	 */

	if (cpu_feature & CPUID_PGE) {
		pmap_pg_g = PG_G;		/* enable software */

		/* add PG_G attribute to already mapped kernel pages */
		/*
		 * NOTE(review): the preprocessor nesting below looks wrong
		 * for the !XEN case -- the closing brace of the
		 * pmap_valid_entry() block and PTE_UPDATES_FLUSH() live
		 * inside the #else arm only, so a !XEN build would not
		 * compile.  Harmless here (XEN is always defined for this
		 * file) but worth confirming/cleaning up.
		 */
		for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
		     kva += PAGE_SIZE)
			if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) {
#if !defined(XEN)
				PTE_BASE[x86_btop(kva)] |= PG_G;
#else
				/* PTE writes must go through the hypervisor */
				maptp = (pt_entry_t *)vtomach(
				    (vaddr_t)&PTE_BASE[x86_btop(kva)]);
				PTE_SETBITS(&PTE_BASE[x86_btop(kva)], maptp,
				    PG_G);
			}
		PTE_UPDATES_FLUSH();
#endif
	}

#ifdef LARGEPAGES
	/*
	 * enable large pages if they are supported.
	 */

	if (cpu_feature & CPUID_PSE) {
		paddr_t pa;
		vaddr_t kva_end;
		pd_entry_t *pde;
		pd_entry_t *mapdp;	/* machine address of the PDE */
		extern char _etext;

		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
		pmap_largepages = 1;	/* enable software */

		/*
		 * the TLB must be flushed after enabling large pages
		 * on Pentium CPUs, according to section 3.6.2.2 of
		 * "Intel Architecture Software Developer's Manual,
		 * Volume 3: System Programming".
		 */
		tlbflush();

		/*
		 * now, remap the kernel text using large pages.  we
		 * assume that the linker has properly aligned the
		 * .data segment to a 4MB boundary.
		 */
		kva_end = roundup((vaddr_t)&_etext, NBPD);
		for (pa = 0, kva = KERNBASE; kva < kva_end;
		     kva += NBPD, pa += NBPD) {
			pde = &kpm->pm_pdir[pdei(kva)];
			mapdp = (pt_entry_t *)vtomach((vaddr_t)pde);
			PDE_SET(pde, mapdp, pa | pmap_pg_g | PG_PS |
			    PG_KR | PG_V);	/* zap! */
			tlbflush();
		}
	}
#endif /* LARGEPAGES */

	/*
	 * now we allocate the "special" VAs which are used for tmp mappings
	 * by the pmap (and other modules).    we allocate the VAs by advancing
	 * virtual_avail (note that there are no pages mapped at these VAs).
	 * we find the PTE that maps the allocated VA via the linear PTE
	 * mapping.
	 */

	pte = PTE_BASE + x86_btop(virtual_avail);

#ifdef MULTIPROCESSOR
	/*
	 * Waste some VA space to avoid false sharing of cache lines
	 * for page table pages: Give each possible CPU a cache line
	 * of PTE's (8) to play with, though we only need 4.  We could
	 * recycle some of this waste by putting the idle stacks here
	 * as well; we could waste less space if we knew the largest
	 * CPU ID beforehand.
	 */
	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;

	cdstp = (caddr_t) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;

	zerop = (caddr_t) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;

	ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;

	virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL;
	pte += X86_MAXPROCS * NPTECL;
#else
	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;	/* allocate */
	virtual_avail += PAGE_SIZE; pte++;			/* advance */

	cdstp = (caddr_t) virtual_avail;  cdst_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	zerop = (caddr_t) virtual_avail;  zero_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	ptpp = (caddr_t) virtual_avail;  ptp_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;
#endif

	/*
	 * NOTE(review): the format string labels the VAs (csrcp, cdstp,
	 * zerop, ptpp) but the arguments are the PTE pointers -- confirm
	 * which was intended before relying on this debug output.
	 */
	XENPRINTK(("pmap_bootstrap csrcp %p cdstp %p zerop %p ptpp %p\n",
	    csrc_pte, cdst_pte, zero_pte, ptp_pte));
	/*
	 * Nothing after this point actually needs pte;
	 */
	pte = (void *)0xdeadbeef;	/* poison to catch stray use */

	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
	vmmap = (char *)virtual_avail;			/* don't need pte */
	virtual_avail += PAGE_SIZE;

	msgbuf_vaddr = virtual_avail;			/* don't need pte */
	virtual_avail += round_page(MSGBUFSIZE);

	idt_vaddr = virtual_avail;			/* don't need pte */
	virtual_avail += PAGE_SIZE;
	idt_paddr = avail_start;			/* steal a page */
	avail_start += PAGE_SIZE;

#if defined(I586_CPU)
	/* pentium f00f bug stuff */
	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
	virtual_avail += PAGE_SIZE;
#endif

	/*
	 * now we reserve some VM for mapping pages when doing a crash dump
	 */

	virtual_avail = reserve_dumppages(virtual_avail);

	/*
	 * init the static-global locks and global lists.
	 */

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
	spinlockinit(&pmap_main_lock, "pmaplk", 0);
#endif
	simple_lock_init(&pvalloc_lock);
	simple_lock_init(&pmaps_lock);
	LIST_INIT(&pmaps);
	TAILQ_INIT(&pv_freepages);
	TAILQ_INIT(&pv_unusedpgs);

	/*
	 * initialize the pmap pool.
	 */

	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl",
	    &pool_allocator_nointr);

	/*
	 * Initialize the TLB shootdown queues.
	 */

	__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);

	for (i = 0; i < X86_MAXPROCS; i++) {
		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
		__cpu_simple_lock_init(&pmap_tlb_shootdown_q[i].pq_slock);
	}

	/*
	 * initialize the PDE pool and cache.
	 */
	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl",
	    &pool_allocator_nointr);
	pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool,
	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);

	/*
	 * ensure the TLB is sync'd with reality by flushing it...
	 */

	tlbflush();
}

/*
 * pmap_init: called from uvm_init, our job is to get the pmap
 * system ready to manage mappings... this mainly means initing
 * the pv_entry stuff.
 */

void
pmap_init()
{
	int i;

	/*
	 * now we need to free enough pv_entry structures to allow us to get
	 * the kmem_map/kmem_object allocated and inited (done after this
	 * function is finished).  to do this we allocate one bootstrap page out
	 * of kernel_map and use it to provide an initial pool of pv_entry
	 * structures.   we never free this page.
	 */

	pv_initpage = (struct pv_page *) uvm_km_alloc(kernel_map, PAGE_SIZE);
	if (pv_initpage == NULL)
		panic("pmap_init: pv_initpage");
	pv_cachedva = 0;   /* a VA we have allocated but not used yet */
	pv_nfpvents = 0;
	(void) pmap_add_pvpage(pv_initpage, FALSE);

	/* bootstrap page of TLB-shootdown job structures, linked free list */
	pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE);
	if (pj_page == NULL)
		panic("pmap_init: pj_page");

	for (i = 0;
	     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
	     i++)
		pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
	pj_page[i].pja_job.pj_nextfree = NULL;
	pj_free = &pj_page[0];

	/*
	 * done: pmap module is up (and ready for business)
	 */

	pmap_initialized = TRUE;
}

/*
 * p v _ e n t r y   f u n c t i o n s
 */

/*
 * pv_entry allocation functions:
 *   the main pv_entry allocation functions are:
 *     pmap_alloc_pv: allocate a pv_entry structure
 *     pmap_free_pv: free one pv_entry
 *     pmap_free_pvs: free a list of pv_entrys
 *
 * the rest are helper functions
 */

/*
 * pmap_alloc_pv: inline function to allocate a pv_entry structure
 * => we lock pvalloc_lock
 * => if we fail, we call out to pmap_alloc_pvpage
 * => 3 modes:
 *    ALLOCPV_NEED   = we really need a pv_entry, even if we have to steal it
 *    ALLOCPV_TRY    = we want a pv_entry, but not enough to steal
 *    ALLOCPV_NONEED = we are trying to grow our free list, don't really need
 *			one now
 *
 * "try" is for optional functions like pmap_copy().
 */

__inline static struct pv_entry *
pmap_alloc_pv(pmap, mode)
	struct pmap *pmap;
	int mode;
{
	struct pv_page *pvpage;
	struct pv_entry *pv;

	simple_lock(&pvalloc_lock);

	pvpage = TAILQ_FIRST(&pv_freepages);
	if (pvpage != NULL) {
		pvpage->pvinfo.pvpi_nfree--;
		if (pvpage->pvinfo.pvpi_nfree == 0) {
			/* nothing left in this one? drop from free list */
			TAILQ_REMOVE(&pv_freepages, pvpage, pvinfo.pvpi_list);
		}
		/* pop the head of the page's internal free list */
		pv = pvpage->pvinfo.pvpi_pvfree;
		KASSERT(pv);
		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
		pv_nfpvents--;  /* took one from pool */
	} else {
		pv = NULL;		/* need more of them */
	}

	/*
	 * if below low water mark or we didn't get a pv_entry we try and
	 * create more pv_entrys ...
	 */

	if (pv_nfpvents < PVE_LOWAT || pv == NULL) {
		if (pv == NULL)
			pv = pmap_alloc_pvpage(pmap, (mode == ALLOCPV_TRY) ?
			    mode : ALLOCPV_NEED);
		else
			(void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED);
	}
	simple_unlock(&pvalloc_lock);
	return(pv);
}

/*
 * pmap_alloc_pvpage: maybe allocate a new pvpage
 *
 * if need_entry is false: try and allocate a new pv_page
 * if need_entry is true: try and allocate a new pv_page and return a
 * new pv_entry from it.   if we are unable to allocate a pv_page
 * we make a last ditch effort to steal a pv_page from some other
 * mapping.    if that fails, we panic...
 *
 * => we assume that the caller holds pvalloc_lock
 */

static struct pv_entry *
pmap_alloc_pvpage(pmap, mode)
	struct pmap *pmap;
	int mode;
{
	struct vm_page *pg;
	struct pv_page *pvpage;
	struct pv_entry *pv;
	int s;

	/*
	 * if we need_entry and we've got unused pv_pages, allocate from there
	 */

	pvpage = TAILQ_FIRST(&pv_unusedpgs);
	if (mode != ALLOCPV_NONEED && pvpage != NULL) {

		/* move it to pv_freepages list */
		TAILQ_REMOVE(&pv_unusedpgs, pvpage, pvinfo.pvpi_list);
		TAILQ_INSERT_HEAD(&pv_freepages, pvpage, pvinfo.pvpi_list);

		/* allocate a pv_entry */
		pvpage->pvinfo.pvpi_nfree--;  /* can't go to zero */
		pv = pvpage->pvinfo.pvpi_pvfree;
		KASSERT(pv);
		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
		pv_nfpvents--;  /* took one from pool */
		return(pv);
	}

	/*
	 * see if we've got a cached unmapped VA that we can map a page in.
	 * if not, try to allocate one.
	 */

	if (pv_cachedva == 0) {
		s = splvm();   /* must protect kmem_map with splvm! */
		pv_cachedva = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE,
		    UVM_KMF_TRYLOCK|UVM_KMF_VALLOC);
		splx(s);
		if (pv_cachedva == 0) {
			return (NULL);
		}
	}

	pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL,
	    UVM_PGA_USERESERVE);
	if (pg == NULL)
		return (NULL);
	pg->flags &= ~PG_BUSY;	/* never busy */

	/*
	 * add a mapping for our new pv_page and free its entrys (save one!)
	 *
	 * NOTE: If we are allocating a PV page for the kernel pmap, the
	 * pmap is already locked!  (...but entering the mapping is safe...)
	 */

	pmap_kenter_pa(pv_cachedva, VM_PAGE_TO_PHYS(pg),
	    VM_PROT_READ | VM_PROT_WRITE);
	pmap_update(pmap_kernel());
	pvpage = (struct pv_page *) pv_cachedva;
	pv_cachedva = 0;
	return (pmap_add_pvpage(pvpage, mode != ALLOCPV_NONEED));
}

/*
 * pmap_add_pvpage: add a pv_page's pv_entrys to the free list
 *
 * => caller must hold pvalloc_lock
 * => if need_entry is true, we allocate and return one pv_entry
 */

static struct pv_entry *
pmap_add_pvpage(pvp, need_entry)
	struct pv_page *pvp;
	boolean_t need_entry;
{
	int tofree, lcv;

	/* do we need to return one? */
	tofree = (need_entry) ? PVE_PER_PVPAGE - 1 : PVE_PER_PVPAGE;

	/* thread entries [0, tofree) onto the page's internal free list */
	pvp->pvinfo.pvpi_pvfree = NULL;
	pvp->pvinfo.pvpi_nfree = tofree;
	for (lcv = 0 ; lcv < tofree ; lcv++) {
		SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
		    pvp->pvinfo.pvpi_pvfree;
		pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
	}
	if (need_entry)
		TAILQ_INSERT_TAIL(&pv_freepages, pvp, pvinfo.pvpi_list);
	else
		TAILQ_INSERT_TAIL(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
	pv_nfpvents += tofree;
	/* if need_entry, the last (never-threaded) slot is the one returned */
	return((need_entry) ? &pvp->pvents[lcv] : NULL);
}

/*
 * pmap_free_pv_doit: actually free a pv_entry
 *
 * => do not call this directly!  instead use either
 *    1. pmap_free_pv ==> free a single pv_entry
 *    2.
 pmap_free_pvs => free a list of pv_entrys
 * => we must be holding pvalloc_lock
 */

__inline static void
pmap_free_pv_doit(pv)
	struct pv_entry *pv;
{
	struct pv_page *pvp;

	/* the owning pv_page is found by truncating the entry's address */
	pvp = (struct pv_page *) x86_trunc_page(pv);
	pv_nfpvents++;
	pvp->pvinfo.pvpi_nfree++;

	/* nfree == 1 => fully allocated page just became partly allocated */
	if (pvp->pvinfo.pvpi_nfree == 1) {
		TAILQ_INSERT_HEAD(&pv_freepages, pvp, pvinfo.pvpi_list);
	}

	/* free it: push onto the page's internal free list */
	SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
	pvp->pvinfo.pvpi_pvfree = pv;

	/*
	 * are all pv_page's pv_entry's free?  move it to unused queue.
	 */

	if (pvp->pvinfo.pvpi_nfree == PVE_PER_PVPAGE) {
		TAILQ_REMOVE(&pv_freepages, pvp, pvinfo.pvpi_list);
		TAILQ_INSERT_HEAD(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
	}
}

/*
 * pmap_free_pv: free a single pv_entry
 *
 * => we gain the pvalloc_lock
 */

__inline static void
pmap_free_pv(pmap, pv)
	struct pmap *pmap;
	struct pv_entry *pv;
{
	simple_lock(&pvalloc_lock);
	pmap_free_pv_doit(pv);

	/*
	 * Can't free the PV page if the PV entries were associated with
	 * the kernel pmap; the pmap is already locked.
	 */
	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
	    pmap != pmap_kernel())
		pmap_free_pvpage();

	simple_unlock(&pvalloc_lock);
}

/*
 * pmap_free_pvs: free a list of pv_entrys
 *
 * => we gain the pvalloc_lock
 */

__inline static void
pmap_free_pvs(pmap, pvs)
	struct pmap *pmap;
	struct pv_entry *pvs;
{
	struct pv_entry *nextpv;

	simple_lock(&pvalloc_lock);

	/* walk the SPLAY_RIGHT-linked list, freeing each entry */
	for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
		nextpv = SPLAY_RIGHT(pvs, pv_node);
		pmap_free_pv_doit(pvs);
	}

	/*
	 * Can't free the PV page if the PV entries were associated with
	 * the kernel pmap; the pmap is already locked.
	 */
	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
	    pmap != pmap_kernel())
		pmap_free_pvpage();

	simple_unlock(&pvalloc_lock);
}


/*
 * pmap_free_pvpage: try and free an unused pv_page structure
 *
 * => assume caller is holding the pvalloc_lock and that
 *	there is a page on the pv_unusedpgs list
 * => if we can't get a lock on the kmem_map we try again later
 */

static void
pmap_free_pvpage()
{
	int s;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	struct pv_page *pvp;

	s = splvm(); /* protect kmem_map */

	pvp = TAILQ_FIRST(&pv_unusedpgs);

	/*
	 * note: watch out for pv_initpage which is allocated out of
	 * kernel_map rather than kmem_map.
	 */

	if (pvp == pv_initpage)
		map = kernel_map;
	else
		map = kmem_map;
	if (vm_map_lock_try(map)) {

		/* remove pvp from pv_unusedpgs */
		TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list);

		/* unmap the page */
		dead_entries = NULL;
		uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE,
		    &dead_entries);
		vm_map_unlock(map);

		if (dead_entries != NULL)
			uvm_unmap_detach(dead_entries, 0);

		pv_nfpvents -= PVE_PER_PVPAGE;  /* update free count */
	}
	/*
	 * NOTE(review): pv_initpage is cleared even when vm_map_lock_try()
	 * failed above (i.e. the page was NOT actually freed) -- confirm
	 * this is intended; it looks like it belongs inside the success
	 * branch.
	 */
	if (pvp == pv_initpage)
		/* no more initpage, we've freed it */
		pv_initpage = NULL;

	splx(s);
}

/*
 * pmap_lock_pvhs: Lock pvh1 and optional pvh2
 * Observe locking order when locking both pvhs
 * (always lock the lower-addressed pv_head first to avoid deadlock)
 */

__inline static void
pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2)
{

	if (pvh2 == NULL) {
		simple_lock(&pvh1->pvh_lock);
		return;
	}

	if (pvh1 < pvh2) {
		simple_lock(&pvh1->pvh_lock);
		simple_lock(&pvh2->pvh_lock);
	} else {
		simple_lock(&pvh2->pvh_lock);
		simple_lock(&pvh1->pvh_lock);
	}
}


/*
 * main pv_entry manipulation functions:
 *   pmap_enter_pv: enter a mapping onto a pv_head list
 *   pmap_remove_pv: remove a mapping from a pv_head list
 *
 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
 *       the pvh before calling
 */

/*
 * pmap_enter_pv: enter a mapping onto a pv_head list
 *
 * => caller should hold the proper lock on pmap_main_lock
 * => caller should have pmap locked
 * => caller should have the pv_head locked
 * => caller should adjust ptp's wire_count before calling
 */

__inline static void
pmap_enter_pv(pvh, pve, pmap, va, ptp)
	struct pv_head *pvh;
	struct pv_entry *pve;	/* preallocated pve for us to use */
	struct pmap *pmap;
	vaddr_t va;
	struct vm_page *ptp;	/* PTP in pmap that maps this VA */
{
	pve->pv_pmap = pmap;
	pve->pv_va = va;
	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
	SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */
}

/*
 * pmap_remove_pv: try to remove a mapping from a pv_list
 *
 * => caller should hold proper lock on pmap_main_lock
 * => pmap should be locked
 * => caller should hold lock on pv_head [so that attrs can be adjusted]
 * => caller should adjust ptp's wire_count and free PTP if needed
 * => we return the removed pve
 */

__inline static struct pv_entry *
pmap_remove_pv(pvh, pmap, va)
	struct pv_head *pvh;
	struct pmap *pmap;
	vaddr_t va;
{
	struct pv_entry tmp, *pve;

	/* look up by (pmap, va) key using a stack-local probe entry */
	tmp.pv_pmap = pmap;
	tmp.pv_va = va;
	pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
	if (pve == NULL)
		return (NULL);
	SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
	return(pve);				/* return removed pve */
}

/*
 * p t p   f u n c t i o n s
 */

/*
 * pmap_alloc_ptp: allocate a PTP for a PMAP
 *
 * => pmap should already be locked by caller
 * => we use the ptp's wire_count to count the number of active mappings
 *	in the PTP (we start it at one to prevent any chance this PTP
 *	will ever leak onto the active/inactive queues)
 */

__inline static struct vm_page *
pmap_alloc_ptp(pmap, pde_index)
	struct pmap *pmap;
	int pde_index;
{
	struct vm_page *ptp;
	pd_entry_t *mapdp;	/* machine address of the PDE slot */

	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
	    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
	if (ptp == NULL)
		return(NULL);

	/* got one! */
	ptp->flags &= ~PG_BUSY;	/* never busy */
	ptp->wire_count = 1;	/* no mappings yet */
	/* hook the new PTP into the page directory via the hypervisor */
	mapdp = (pt_entry_t *)vtomach((vaddr_t)&pmap->pm_pdir[pde_index]);
	PDE_SET(&pmap->pm_pdir[pde_index], mapdp,
	    (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V));
	pmap->pm_stats.resident_count++;	/* count PTP as resident */
	pmap->pm_ptphint = ptp;
	return(ptp);
}

/*
 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
 *
 * => pmap should NOT be pmap_kernel()
 * => pmap should be locked
 */

static struct vm_page *
pmap_get_ptp(pmap, pde_index)
	struct pmap *pmap;
	int pde_index;
{
	struct vm_page *ptp;

	if (pmap_valid_entry(pmap->pm_pdir[pde_index])) {

		/* valid... check hint (saves us a PA->PG lookup) */
		if (pmap->pm_ptphint &&
		    (PDE_GET(&pmap->pm_pdir[pde_index]) & PG_FRAME) ==
		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
			return(pmap->pm_ptphint);

		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
#ifdef DIAGNOSTIC
		if (ptp == NULL)
			panic("pmap_get_ptp: unmanaged user PTP");
#endif
		pmap->pm_ptphint = ptp;
		return(ptp);
	}

	/* allocate a new PTP (updates ptphint) */
	return(pmap_alloc_ptp(pmap, pde_index));
}

/*
 * p m a p  l i f e c y c l e   f u n c t i o n s
 */

/*
 * pmap_pdp_ctor: constructor for the PDP cache.
 * Builds a fresh page directory page and pins it as an L2 table with Xen.
 */

int
pmap_pdp_ctor(void *arg, void *object, int flags)
{
	pd_entry_t *pdir = object;
	paddr_t pdirpa;

	/*
	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
	 * WE MUST NOT BLOCK!
	 */

	/* fetch the physical address of the page directory. */
	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);

	XENPRINTF(("pmap_pdp_ctor %p %p\n", pdir, (void *)pdirpa));

	/* zero init area (user VA slots) */
	memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t));

	/* put in recursive PDE to map the PTEs */
	pdir[PDSLOT_PTE] = xpmap_ptom(pdirpa | PG_V /* | PG_KW */);

	/* put in kernel VM PDEs */
	memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN],
	    nkpde * sizeof(pd_entry_t));

	/* zero the rest */
	memset(&pdir[PDSLOT_KERN + nkpde], 0,
	    PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));

	/* remap read-only: Xen requires pagetable pages not be writable */
	pmap_enter(pmap_kernel(), (vaddr_t)pdir, pdirpa, VM_PROT_READ,
	    VM_PROT_READ);
	pmap_update(pmap_kernel());

	/* pin page type */
	xpq_queue_pin_table(xpmap_ptom(pdirpa), XPQ_PIN_L2_TABLE);
	xpq_flush_queue();

	return (0);
}

/*
 * pmap_pdp_dtor: destructor for the PDP cache -- unpin the L2 table.
 */

void
pmap_pdp_dtor(void *arg, void *object)
{
	pd_entry_t *pdir = object;
	paddr_t pdirpa;

	/* fetch the physical address of the page directory. */
	pdirpa = PDE_GET(&pdir[PDSLOT_PTE]) & PG_FRAME;

	XENPRINTF(("pmap_pdp_dtor %p %p\n", pdir, (void *)pdirpa));

	/* unpin page type */
	xpq_queue_unpin_table(xpmap_ptom(pdirpa));
	xpq_flush_queue();
}

/*
 * pmap_create: create a pmap
 *
 * => note: old pmap interface took a "size" args which allowed for
 *	the creation of "software only" pmaps (not in bsd).
 */

struct pmap *
pmap_create()
{
	struct pmap *pmap;
	u_int gen;

	XENPRINTF(("pmap_create\n"));
	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);

	/* init uvm_object */
	simple_lock_init(&pmap->pm_obj.vmobjlock);
	pmap->pm_obj.pgops = NULL;	/* currently not a mappable object */
	TAILQ_INIT(&pmap->pm_obj.memq);
	pmap->pm_obj.uo_npages = 0;
	pmap->pm_obj.uo_refs = 1;
	pmap->pm_stats.wired_count = 0;
	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
	pmap->pm_ptphint = NULL;
	pmap->pm_hiexec = 0;
	pmap->pm_flags = 0;
	pmap->pm_cpus = 0;

	/* init the LDT */
	pmap->pm_ldt = NULL;
	pmap->pm_ldt_len = 0;
	pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);

	/* allocate PDP */

	/*
	 * we need to lock pmaps_lock to prevent nkpde from changing on
	 * us.  note that there is no need to splvm to protect us from
	 * malloc since malloc allocates out of a submap and we should
	 * have already allocated kernel PTPs to cover the range...
	 *
	 * NOTE: WE MUST NOT BLOCK WHILE HOLDING THE `pmap_lock', nor
	 * must we call pmap_growkernel() while holding it!
	 */

 try_again:
	/*
	 * the generation number detects a pmap_growkernel() that raced
	 * with our (possibly blocking) pool_cache_get(): if it changed,
	 * the PDP we got may have stale kernel PDEs, so throw it back
	 * and retry.
	 */
	gen = pmap_pdp_cache_generation;
	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);

	simple_lock(&pmaps_lock);

	if (gen != pmap_pdp_cache_generation) {
		simple_unlock(&pmaps_lock);
		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
		goto try_again;
	}

	pmap->pm_pdirpa = PDE_GET(&pmap->pm_pdir[PDSLOT_PTE]) & PG_FRAME;
	XENPRINTF(("pmap_create %p set pm_pdirpa %p/%p slotval %p\n", pmap,
	    (void *)pmap->pm_pdirpa,
	    (void *)xpmap_ptom(pmap->pm_pdirpa),
	    (void *)pmap->pm_pdir[PDSLOT_PTE]));

	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);

	simple_unlock(&pmaps_lock);

	return (pmap);
}

/*
 * pmap_destroy: drop reference count on pmap.   free pmap if
 *	reference count goes to zero.
 */

void
pmap_destroy(pmap)
	struct pmap *pmap;
{
	int refs;
#ifdef DIAGNOSTIC
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
#endif /* DIAGNOSTIC */

	/*
	 * drop reference count
	 */

	simple_lock(&pmap->pm_obj.vmobjlock);
	refs = --pmap->pm_obj.uo_refs;
	simple_unlock(&pmap->pm_obj.vmobjlock);
	if (refs > 0) {
		return;
	}

#ifdef DIAGNOSTIC
	for (CPU_INFO_FOREACH(cii, ci))
		if (ci->ci_pmap == pmap)
			panic("destroying pmap being used");
#endif /* DIAGNOSTIC */

	/*
	 * reference count is zero, free pmap resources and then free pmap.
	 */

	XENPRINTF(("pmap_destroy %p pm_pdirpa %p/%p\n", pmap,
	    (void *)pmap->pm_pdirpa,
	    (void *)xpmap_ptom(pmap->pm_pdirpa)));

	/*
	 * remove it from global list of pmaps
	 */

	simple_lock(&pmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	simple_unlock(&pmaps_lock);

	/*
	 * destroyed pmap shouldn't have remaining PTPs
	 */

	KASSERT(pmap->pm_obj.uo_npages == 0);
	KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq));

	/*
	 * MULTIPROCESSOR -- no need to flush out of other processors'
	 * APTE space because we do that in pmap_unmap_ptes().
	 */
	/* return the PDP to the cache (dtor will unpin it with Xen) */
	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);

#ifdef USER_LDT
	if (pmap->pm_flags & PMF_USER_LDT) {
		/*
		 * no need to switch the LDT; this address space is gone,
		 * nothing is using it.
		 *
		 * No need to lock the pmap for ldt_free (or anything else),
		 * we're the last one to use it.
		 */
		ldt_free(pmap);
		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
		    pmap->pm_ldt_len * sizeof(union descriptor));
	}
#endif

	pool_put(&pmap_pmap_pool, pmap);
}

/*
 * Add a reference to the specified pmap.
 */

void
pmap_reference(pmap)
	struct pmap *pmap;
{
	simple_lock(&pmap->pm_obj.vmobjlock);
	pmap->pm_obj.uo_refs++;
	simple_unlock(&pmap->pm_obj.vmobjlock);
}

#if defined(PMAP_FORK)
/*
 * pmap_fork: perform any necessary data structure manipulation when
 * a VM space is forked.
 */

void
pmap_fork(pmap1, pmap2)
	struct pmap *pmap1, *pmap2;
{
	simple_lock(&pmap1->pm_obj.vmobjlock);
	simple_lock(&pmap2->pm_obj.vmobjlock);

#ifdef USER_LDT
	/* Copy the LDT, if necessary. */
	if (pmap1->pm_flags & PMF_USER_LDT) {
		union descriptor *new_ldt;
		size_t len;

		len = pmap1->pm_ldt_len * sizeof(union descriptor);
		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len);
		memcpy(new_ldt, pmap1->pm_ldt, len);
		pmap2->pm_ldt = new_ldt;
		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
		pmap2->pm_flags |= PMF_USER_LDT;
		ldt_alloc(pmap2, new_ldt, len);
	}
#endif /* USER_LDT */

	simple_unlock(&pmap2->pm_obj.vmobjlock);
	simple_unlock(&pmap1->pm_obj.vmobjlock);
}
#endif /* PMAP_FORK */

#ifdef USER_LDT
/*
 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
 * restore the default.
 */

void
pmap_ldt_cleanup(l)
	struct lwp *l;
{
	struct pcb *pcb = &l->l_addr->u_pcb;
	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
	union descriptor *old_ldt = NULL;
	size_t len = 0;

	simple_lock(&pmap->pm_obj.vmobjlock);

	if (pmap->pm_flags & PMF_USER_LDT) {
		ldt_free(pmap);
		pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
		/* if this lwp is running, reload the LDT register now */
		if (pcb == curpcb)
			lldt(pcb->pcb_ldt_sel);
		old_ldt = pmap->pm_ldt;
		len = pmap->pm_ldt_len * sizeof(union descriptor);
		pmap->pm_ldt = NULL;
		pmap->pm_ldt_len = 0;
		pmap->pm_flags &= ~PMF_USER_LDT;
	}

	simple_unlock(&pmap->pm_obj.vmobjlock);

	/* free outside the lock; uvm_km_free may block */
	if (old_ldt != NULL)
		uvm_km_free(kernel_map, (vaddr_t)old_ldt, len);
}
#endif /* USER_LDT */

/*
 * pmap_activate: activate a process' pmap
 *
 * => called from cpu_switch()
 * => if lwp is the curlwp, then set ci_want_pmapload so that
 *    actual MMU context switch will be done by pmap_load() later
 */

void
pmap_activate(l)
	struct lwp *l;
{
	struct cpu_info *ci = curcpu();
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);

	if (l == ci->ci_curlwp) {
		struct pcb *pcb;

		KASSERT(ci->ci_want_pmapload == 0);
		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
#ifdef KSTACK_CHECK_DR0
		/*
		 * setup breakpoint on the top of stack
		 */
		if (l == &lwp0)
			dr0(0, 0, 0, 0);
		else
			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
#endif

		/*
		 * no need to switch to kernel vmspace because
		 * it's a subset of any vmspace.
		 */

		if (pmap == pmap_kernel()) {
			ci->ci_want_pmapload = 0;
			return;
		}

		pcb = &l->l_addr->u_pcb;
		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;

		/* defer the actual %cr3/LDT switch to pmap_load() */
		ci->ci_want_pmapload = 1;
	}
}

/*
 * pmap_reactivate: try to regain reference to the pmap.
 * Returns TRUE if our lazy mapping of this pmap is still valid
 * (no TLB shootdown happened in between), FALSE if the TLB is stale.
 */

static boolean_t
pmap_reactivate(struct pmap *pmap)
{
	struct cpu_info *ci = curcpu();
	u_int32_t cpumask = 1U << ci->ci_cpuid;
	int s;
	boolean_t result;
	u_int32_t oldcpus;

	/*
	 * if we still have a lazy reference to this pmap,
	 * we can assume that there was no tlb shootdown
	 * for this pmap in the meantime.
	 */

	s = splipi();	/* protect from tlb shootdown ipis. */
	oldcpus = pmap->pm_cpus;
	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
	if (oldcpus & cpumask) {
		KASSERT(ci->ci_tlbstate == TLBSTATE_LAZY);
		/* got it */
		result = TRUE;
	} else {
		KASSERT(ci->ci_tlbstate == TLBSTATE_STALE);
		result = FALSE;
	}
	ci->ci_tlbstate = TLBSTATE_VALID;
	splx(s);

	return result;
}

/*
 * pmap_load: actually switch pmap.
 (fill in %cr3 and LDT info)
 */

void
pmap_load()
{
	struct cpu_info *ci = curcpu();
	u_int32_t cpumask = 1U << ci->ci_cpuid;
	struct pmap *pmap;
	struct pmap *oldpmap;
	struct lwp *l;
	struct pcb *pcb;
	pd_entry_t *mapdp;	/* machine address of the APDP slot */
	int s;

	KASSERT(ci->ci_want_pmapload);

	l = ci->ci_curlwp;
	KASSERT(l != NULL);
	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	KASSERT(pmap != pmap_kernel());
	oldpmap = ci->ci_pmap;

	pcb = ci->ci_curpcb;
	KASSERT(pcb == &l->l_addr->u_pcb);
	/* loaded by pmap_activate */
	KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);

	if (pmap == oldpmap) {
		if (!pmap_reactivate(pmap)) {

			/*
			 * pmap has been changed during deactivated.
			 * our tlb may be stale.
			 */

			tlbflush();
		}

		ci->ci_want_pmapload = 0;
		return;
	}

	/*
	 * actually switch pmap.
	 */

	x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask);

	KASSERT((pmap->pm_cpus & cpumask) == 0);

	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
	pmap_reference(pmap);
	KERNEL_UNLOCK();

	/*
	 * mark the pmap in use by this processor.
	 */

	s = splipi();
	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
	ci->ci_pmap = pmap;
	ci->ci_tlbstate = TLBSTATE_VALID;
	splx(s);

	/*
	 * clear apdp slot before loading %cr3 since Xen only allows
	 * linear pagetable mappings in the current pagetable.
	 */
	KDASSERT(curapdp == 0);
	mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
	PDE_CLEAR(APDP_PDE, mapdp);

	/*
	 * update tss and load corresponding registers.
	 */

	lldt(pcb->pcb_ldt_sel);
	pcb->pcb_cr3 = pmap->pm_pdirpa;
	lcr3(pcb->pcb_cr3);

	ci->ci_want_pmapload = 0;

	/* drop our reference to the pmap we replaced */
	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
	pmap_destroy(oldpmap);
	KERNEL_UNLOCK();
}

/*
 * pmap_deactivate: deactivate a process' pmap
 */

void
pmap_deactivate(l)
	struct lwp *l;
{

	if (l == curlwp)
		pmap_deactivate2(l);
}

/*
 * pmap_deactivate2: context switch version of pmap_deactivate.
 * always treat l as curlwp.
 */

void
pmap_deactivate2(l)
	struct lwp *l;
{
	struct pmap *pmap;
	struct cpu_info *ci = curcpu();

	if (ci->ci_want_pmapload) {
		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
		    != pmap_kernel());
		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);

		/*
		 * userspace has not been touched.
		 * nothing to do here.
		 */

		ci->ci_want_pmapload = 0;
		return;
	}

	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);

	if (pmap == pmap_kernel()) {
		return;
	}

	KASSERT(ci->ci_pmap == pmap);

	/* keep the mapping loaded but mark it reclaimable (lazy) */
	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
	ci->ci_tlbstate = TLBSTATE_LAZY;
	XENPRINTF(("pmap_deactivate %p ebp %p esp %p\n",
	    l, (void *)l->l_addr->u_pcb.pcb_ebp,
	    (void *)l->l_addr->u_pcb.pcb_esp));
}

/*
 * end of lifecycle functions
 */

/*
 * some misc. functions
 */

/*
 * pmap_extract: extract a PA for the given VA
 */

boolean_t
pmap_extract(pmap, va, pap)
	struct pmap *pmap;
	vaddr_t va;
	paddr_t *pap;
{
	pt_entry_t *ptes, pte;
	pd_entry_t pde;

	if (__predict_true((pde = PDE_GET(&pmap->pm_pdir[pdei(va)])) != 0)) {
#ifdef LARGEPAGES
		if (pde & PG_PS) {
			/* 4MB superpage: the PDE itself holds the frame */
			if (pap != NULL)
				*pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME);
			return (TRUE);
		}
#endif

		ptes = pmap_map_ptes(pmap);
		pte = PTE_GET(&ptes[x86_btop(va)]);
		pmap_unmap_ptes(pmap);

		if (__predict_true((pte & PG_V) != 0)) {
			if (pap != NULL)
				*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
			return (TRUE);
		}
	}
	return (FALSE);
}


/*
 * vtophys: virtual address to physical address.  For use by
 * machine-dependent code only.
 */

paddr_t
vtophys(va)
	vaddr_t va;
{
	paddr_t pa;

	if (pmap_extract(pmap_kernel(), va, &pa) == TRUE)
		return (pa);
	return (0);
}


/*
 * pmap_virtual_space: used during bootup [pmap_steal_memory] to
 * determine the bounds of the kernel virtual address space.
 */

void
pmap_virtual_space(startp, endp)
	vaddr_t *startp;
	vaddr_t *endp;
{
	*startp = virtual_avail;
	*endp = virtual_end;
}

/*
 * pmap_map: map a range of PAs into kvm
 *
 * => used during crash dump
 * => XXX: pmap_map() should be phased out?
 */

vaddr_t
pmap_map(va, spa, epa, prot)
	vaddr_t va;
	paddr_t spa, epa;
	vm_prot_t prot;
{
	while (spa < epa) {
		pmap_enter(pmap_kernel(), va, spa, prot, 0);
		va += PAGE_SIZE;
		spa += PAGE_SIZE;
	}
	pmap_update(pmap_kernel());
	return va;
}

/*
 * pmap_zero_page: zero a page
 */

void
pmap_zero_page(pa)
	paddr_t pa;
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *zpte = PTESLEW(zero_pte, id);	/* per-CPU zero PTE */
	pt_entry_t *maptp;
	caddr_t zerova = VASLEW(zerop, id);		/* per-CPU zero VA */

#ifdef DIAGNOSTIC
	if (PTE_GET(zpte))
		panic("pmap_zero_page: lock botch");
#endif

	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */

	memset(zerova, 0, PAGE_SIZE);			/* zero */
	PTE_CLEAR(zpte, maptp);				/* zap! */
}

/*
 * pmap_pageidlezero: the same, for the idle loop page zero'er.
 * Returns TRUE if the page was zero'd, FALSE if we aborted for
 * some reason.
 */

boolean_t
pmap_pageidlezero(pa)
	paddr_t pa;
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *zpte = PTESLEW(zero_pte, id);
	pt_entry_t *maptp;
	caddr_t zerova = VASLEW(zerop, id);
	boolean_t rv = TRUE;
	int i, *ptr;

#ifdef DIAGNOSTIC
	if (PTE_GET(zpte))
		/*
		 * NOTE(review): message names "pmap_zero_page_uncached";
		 * looks like a copy/paste leftover -- confirm before fixing.
		 */
		panic("pmap_zero_page_uncached: lock botch");
#endif
	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
	for (i = 0, ptr = (int *) zerova; i < PAGE_SIZE / sizeof(int); i++) {
		if (sched_whichqs != 0) {

			/*
			 * A process has become ready.  Abort now,
			 * so we don't keep it waiting while we
			 * do slow memory access to finish this
			 * page.
			 */

			rv = FALSE;
			break;
		}
		*ptr++ = 0;
	}

	PTE_CLEAR(zpte, maptp);		/* zap! */
	return (rv);
}

/*
 * pmap_copy_page: copy a page
 */

void
pmap_copy_page(srcpa, dstpa)
	paddr_t srcpa, dstpa;
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *spte = PTESLEW(csrc_pte,id), *maspte;
	pt_entry_t *dpte = PTESLEW(cdst_pte,id), *madpte;
	caddr_t csrcva = VASLEW(csrcp, id);
	caddr_t cdstva = VASLEW(cdstp, id);

#ifdef DIAGNOSTIC
	if (PTE_GET(spte) || PTE_GET(dpte))
		panic("pmap_copy_page: lock botch");
#endif

	/* map both pages into the per-CPU copy windows */
	maspte = (pt_entry_t *)vtomach((vaddr_t)spte);
	madpte = (pt_entry_t *)vtomach((vaddr_t)dpte);
	PTE_SET(spte, maspte, (srcpa & PG_FRAME) | PG_V | PG_RW);
	PTE_SET(dpte, madpte, (dstpa & PG_FRAME) | PG_V | PG_RW);
	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
	memcpy(cdstva, csrcva, PAGE_SIZE);
	PTE_CLEAR(spte, maspte);	/* zap! */
	PTE_CLEAR(dpte, madpte);	/* zap! */
}

/*
 * p m a p   r e m o v e   f u n c t i o n s
 *
 * functions that remove mappings
 */

/*
 * pmap_remove_ptes: remove PTEs from a PTP
 *
 * => must have proper locking on pmap_master_lock
 * => caller must hold pmap's lock
 * => PTP must be mapped into KVA
 * => PTP should be null if pmap == pmap_kernel()
 */

static void
pmap_remove_ptes(pmap, ptp, ptpva, startva, endva, cpumaskp, flags)
	struct pmap *pmap;
	struct vm_page *ptp;
	vaddr_t ptpva;
	vaddr_t startva, endva;
	int32_t *cpumaskp;
	int flags;
{
	struct pv_entry *pv_tofree = NULL;	/* list of pv_entrys to free */
	struct pv_entry *pve;
	pt_entry_t *pte = (pt_entry_t *) ptpva;
	pt_entry_t opte;
	pt_entry_t *maptp;

	/*
	 * note that ptpva points to the PTE that maps startva.   this may
	 * or may not be the first PTE in the PTP.
	 *
	 * we loop through the PTP while there are still PTEs to look at
	 * and the wire_count is greater than 1 (because we use the wire_count
	 * to keep track of the number of real PTEs in the PTP).
	 */

	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
			     ; pte++, startva += PAGE_SIZE) {
		struct vm_page *pg;
		struct vm_page_md *mdpg;

		if (!pmap_valid_entry(*pte))
			continue;			/* VA not mapped */
		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
			continue;
		}

		/* atomically save the old PTE and zap! it */
		maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
		opte = pte_atomic_update(pte, maptp, 0);
		pmap_exec_account(pmap, startva, opte, 0);

		if (opte & PG_W)
			pmap->pm_stats.wired_count--;
		pmap->pm_stats.resident_count--;

		/* referenced entries may be cached in a TLB somewhere */
		if (opte & PG_U)
			pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);

		if (ptp) {
			ptp->wire_count--;		/* dropping a PTE */
			/* Make sure that the PDE is flushed */
			if ((ptp->wire_count <= 1) && !(opte & PG_U))
				pmap_tlb_shootdown(pmap, startva, opte,
				    cpumaskp);
		}

		/*
		 * if we are not on a pv_head list we are done.
		 */

		if ((opte & PG_PVLIST) == 0) {
#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
			if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
				panic("pmap_remove_ptes: managed page without "
				      "PG_PVLIST for 0x%lx", startva);
#endif
			continue;
		}

		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
#ifdef DIAGNOSTIC
		if (pg == NULL)
			panic("pmap_remove_ptes: unmanaged page marked "
			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
			      startva, (u_long)(opte & PG_FRAME));
#endif
		mdpg = &pg->mdpage;

		/* sync R/M bits */
		simple_lock(&mdpg->mp_pvhead.pvh_lock);
		mdpg->mp_attrs |= (opte & (PG_U|PG_M));
		pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva);
		simple_unlock(&mdpg->mp_pvhead.pvh_lock);

		if (pve) {
			/* defer freeing until after the loop */
			SPLAY_RIGHT(pve, pv_node) = pv_tofree;
			pv_tofree = pve;
		}

		/* end of "for" loop: time for next pte */
	}
	if (pv_tofree)
		pmap_free_pvs(pmap, pv_tofree);
}


/*
 * pmap_remove_pte: remove a single PTE from a PTP
 *
 * => must have proper locking on pmap_master_lock
 * => caller must hold pmap's lock
 * => PTP must be mapped into KVA
 * => PTP should be null if pmap == pmap_kernel()
 * => returns true if we removed a mapping
 */

static boolean_t
pmap_remove_pte(pmap, ptp, pte, va, cpumaskp, flags)
	struct pmap *pmap;
	struct vm_page *ptp;
	pt_entry_t *pte;
	vaddr_t va;
	int32_t *cpumaskp;
	int flags;
{
	pt_entry_t opte;
	pt_entry_t *maptp;
	struct pv_entry *pve;
	struct vm_page *pg;
	struct vm_page_md *mdpg;

	if (!pmap_valid_entry(*pte))
		return(FALSE);		/* VA not mapped */
	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
		return(FALSE);
	}

	/* atomically save the old PTE and zap!
it */ + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update(pte, maptp, 0); + + XENPRINTK(("pmap_remove_pte %p, was %08x\n", pte, opte)); + pmap_exec_account(pmap, va, opte, 0); + + if (opte & PG_W) + pmap->pm_stats.wired_count--; + pmap->pm_stats.resident_count--; + + if (opte & PG_U) + pmap_tlb_shootdown(pmap, va, opte, cpumaskp); + + if (ptp) { + ptp->wire_count--; /* dropping a PTE */ + /* Make sure that the PDE is flushed */ + if ((ptp->wire_count <= 1) && !(opte & PG_U)) + pmap_tlb_shootdown(pmap, va, opte, cpumaskp); + + } + /* + * if we are not on a pv_head list we are done. + */ + + if ((opte & PG_PVLIST) == 0) { +#if defined(DIAGNOSTIC) && !defined(DOM0OPS) + if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL) + panic("pmap_remove_pte: managed page without " + "PG_PVLIST for 0x%lx", va); +#endif + return(TRUE); + } + + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_remove_pte: unmanaged page marked " + "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va, + (u_long)(opte & PG_FRAME)); +#endif + mdpg = &pg->mdpage; + + /* sync R/M bits */ + simple_lock(&mdpg->mp_pvhead.pvh_lock); + mdpg->mp_attrs |= (opte & (PG_U|PG_M)); + pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va); + simple_unlock(&mdpg->mp_pvhead.pvh_lock); + + if (pve) + pmap_free_pv(pmap, pve); + return(TRUE); +} + +/* + * pmap_remove: top level mapping removal function + * + * => caller should not be holding any pmap locks + */ + +void +pmap_remove(pmap, sva, eva) + struct pmap *pmap; + vaddr_t sva, eva; +{ + pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); +} + +/* + * pmap_do_remove: mapping removal guts + * + * => caller should not be holding any pmap locks + */ + +static void +pmap_do_remove(pmap, sva, eva, flags) + struct pmap *pmap; + vaddr_t sva, eva; + int flags; +{ + pt_entry_t *ptes, opte; + pt_entry_t *maptp; + boolean_t result; + paddr_t ptppa; + vaddr_t blkendva; + struct vm_page *ptp; + int32_t cpumask = 0; + TAILQ_HEAD(, vm_page) 
empty_ptps; + struct cpu_info *ci; + struct pmap *curpmap; + + /* + * we lock in the pmap => pv_head direction + */ + + TAILQ_INIT(&empty_ptps); + + PMAP_MAP_TO_HEAD_LOCK(); + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + + ci = curcpu(); + curpmap = ci->ci_pmap; + + /* + * removing one page? take shortcut function. + */ + + if (sva + PAGE_SIZE == eva) { + if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) { + + /* PA of the PTP */ + ptppa = PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME; + + /* get PTP if non-kernel mapping */ + if (pmap == pmap_kernel()) { + /* we never free kernel PTPs */ + ptp = NULL; + } else { + if (pmap->pm_ptphint && + VM_PAGE_TO_PHYS(pmap->pm_ptphint) == + ptppa) { + ptp = pmap->pm_ptphint; + } else { + ptp = PHYS_TO_VM_PAGE(ptppa); +#ifdef DIAGNOSTIC + if (ptp == NULL) + panic("pmap_remove: unmanaged " + "PTP detected"); +#endif + } + } + + /* do it! */ + result = pmap_remove_pte(pmap, ptp, + &ptes[x86_btop(sva)], sva, &cpumask, flags); + + /* + * if mapping removed and the PTP is no longer + * being used, free it! + */ + + if (result && ptp && ptp->wire_count <= 1) { + /* zap! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&pmap->pm_pdir[pdei(sva)]); + PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)], + maptp, opte); +#if defined(MULTIPROCESSOR) + /* + * XXXthorpej Redundant shootdown can happen + * here if we're using APTE space. + */ +#endif + pmap_tlb_shootdown(curpmap, + ((vaddr_t)ptes) + ptp->offset, opte, + &cpumask); +#if defined(MULTIPROCESSOR) + /* + * Always shoot down the pmap's self-mapping + * of the PTP. + * XXXthorpej Redundant shootdown can happen + * here if pmap == curpmap (not APTE space). 
+ */ + pmap_tlb_shootdown(pmap, + ((vaddr_t)PTE_BASE) + ptp->offset, opte, + &cpumask); +#endif + pmap->pm_stats.resident_count--; + if (pmap->pm_ptphint == ptp) + pmap->pm_ptphint = + TAILQ_FIRST(&pmap->pm_obj.memq); + ptp->wire_count = 0; + ptp->flags |= PG_ZERO; + uvm_pagerealloc(ptp, NULL, 0); + TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + } + } + pmap_tlb_shootnow(cpumask); + pmap_unmap_ptes(pmap); /* unlock pmap */ + PMAP_MAP_TO_HEAD_UNLOCK(); + /* Now we can free unused ptps */ + TAILQ_FOREACH(ptp, &empty_ptps, listq) + uvm_pagefree(ptp); + return; + } + + cpumask = 0; + + for (/* null */ ; sva < eva ; sva = blkendva) { + + /* determine range of block */ + blkendva = x86_round_pdr(sva+1); + if (blkendva > eva) + blkendva = eva; + + /* + * XXXCDC: our PTE mappings should never be removed + * with pmap_remove! if we allow this (and why would + * we?) then we end up freeing the pmap's page + * directory page (PDP) before we are finished using + * it when we hit in in the recursive mapping. this + * is BAD. + * + * long term solution is to move the PTEs out of user + * address space. and into kernel address space (up + * with APTE). then we can set VM_MAXUSER_ADDRESS to + * be VM_MAX_ADDRESS. + */ + + if (pdei(sva) == PDSLOT_PTE) + /* XXXCDC: ugly hack to avoid freeing PDP here */ + continue; + + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) + /* valid block? 
*/ + continue; + + /* PA of the PTP */ + ptppa = (PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME); + + /* get PTP if non-kernel mapping */ + if (pmap == pmap_kernel()) { + /* we never free kernel PTPs */ + ptp = NULL; + } else { + if (pmap->pm_ptphint && + VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) { + ptp = pmap->pm_ptphint; + } else { + ptp = PHYS_TO_VM_PAGE(ptppa); +#ifdef DIAGNOSTIC + if (ptp == NULL) + panic("pmap_remove: unmanaged PTP " + "detected"); +#endif + } + } + pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[x86_btop(sva)], + sva, blkendva, &cpumask, flags); + + /* if PTP is no longer being used, free it! */ + if (ptp && ptp->wire_count <= 1) { + /* zap! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&pmap->pm_pdir[pdei(sva)]); + PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)], + maptp, opte); +#if defined(MULTIPROCESSOR) + /* + * XXXthorpej Redundant shootdown can happen here + * if we're using APTE space. + */ +#endif + pmap_tlb_shootdown(curpmap, + ((vaddr_t)ptes) + ptp->offset, opte, &cpumask); +#if defined(MULTIPROCESSOR) + /* + * Always shoot down the pmap's self-mapping + * of the PTP. + * XXXthorpej Redundant shootdown can happen here + * if pmap == curpmap (not APTE space). + */ + pmap_tlb_shootdown(pmap, + ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask); +#endif + pmap->pm_stats.resident_count--; + if (pmap->pm_ptphint == ptp) /* update hint? 
*/ + pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first; + ptp->wire_count = 0; + ptp->flags |= PG_ZERO; + /* Postpone free to shootdown */ + uvm_pagerealloc(ptp, NULL, 0); + TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + } + } + + pmap_tlb_shootnow(cpumask); + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); + /* Now we can free unused ptps */ + TAILQ_FOREACH(ptp, &empty_ptps, listq) + uvm_pagefree(ptp); +} + +/* + * pmap_page_remove: remove a managed vm_page from all pmaps that map it + * + * => we set pv_head => pmap locking + * => R/M bits are sync'd back to attrs + */ + +void +pmap_page_remove(pg) + struct vm_page *pg; +{ + struct pv_head *pvh; + struct pv_entry *pve, *npve, *killlist = NULL; + pt_entry_t *ptes, opte; + pt_entry_t *maptp; + int32_t cpumask = 0; + TAILQ_HEAD(, vm_page) empty_ptps; + struct vm_page *ptp; + struct cpu_info *ci; + struct pmap *curpmap; + +#ifdef DIAGNOSTIC + int bank, off; + + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); + if (bank == -1) + panic("pmap_page_remove: unmanaged page?"); +#endif + + pvh = &pg->mdpage.mp_pvhead; + if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { + return; + } + + TAILQ_INIT(&empty_ptps); + + /* set pv_head => pmap locking */ + PMAP_HEAD_TO_MAP_LOCK(); + + ci = curcpu(); + curpmap = ci->ci_pmap; + + /* XXX: needed if we hold head->map lock? 
*/ + simple_lock(&pvh->pvh_lock); + + for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) { + npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve); + ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ + +#ifdef DIAGNOSTIC + if (pve->pv_ptp && + (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) & + PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { + printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n", + pg, pve->pv_va, pve->pv_ptp); + printf("pmap_page_remove: PTP's phys addr: " + "actual=%lx, recorded=%lx\n", + (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) + & PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp)); + panic("pmap_page_remove: mapped managed page has " + "invalid pv_ptp field"); + } +#endif + + /* atomically save the old PTE and zap! it */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); + opte = pte_atomic_update(&ptes[x86_btop(pve->pv_va)], + maptp, 0); + + if (opte & PG_W) + pve->pv_pmap->pm_stats.wired_count--; + pve->pv_pmap->pm_stats.resident_count--; + + /* Shootdown only if referenced */ + if (opte & PG_U) + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, + &cpumask); + + /* sync R/M bits */ + pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M)); + + /* update the PTP reference count. free if last reference. */ + if (pve->pv_ptp) { + pve->pv_ptp->wire_count--; + if (pve->pv_ptp->wire_count <= 1) { + /* + * Do we have to shootdown the page just to + * get the pte out of the TLB ? + */ + if(!(opte & PG_U)) + pmap_tlb_shootdown(pve->pv_pmap, + pve->pv_va, opte, &cpumask); + + /* zap! */ + maptp = (pt_entry_t *)vtomach((vaddr_t) + &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]); + PTE_ATOMIC_CLEAR(&pve->pv_pmap->pm_pdir + [pdei(pve->pv_va)], maptp, opte); + pmap_tlb_shootdown(curpmap, + ((vaddr_t)ptes) + pve->pv_ptp->offset, + opte, &cpumask); +#if defined(MULTIPROCESSOR) + /* + * Always shoot down the other pmap's + * self-mapping of the PTP. 
+ */ + pmap_tlb_shootdown(pve->pv_pmap, + ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset, + opte, &cpumask); +#endif + pve->pv_pmap->pm_stats.resident_count--; + /* update hint? */ + if (pve->pv_pmap->pm_ptphint == pve->pv_ptp) + pve->pv_pmap->pm_ptphint = + pve->pv_pmap->pm_obj.memq.tqh_first; + pve->pv_ptp->wire_count = 0; + pve->pv_ptp->flags |= PG_ZERO; + /* Free only after the shootdown */ + uvm_pagerealloc(pve->pv_ptp, NULL, 0); + TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, + listq); + } + } + pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */ + SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */ + SPLAY_RIGHT(pve, pv_node) = killlist; /* mark it for death */ + killlist = pve; + } + pmap_free_pvs(NULL, killlist); + simple_unlock(&pvh->pvh_lock); + PMAP_HEAD_TO_MAP_UNLOCK(); + pmap_tlb_shootnow(cpumask); + + /* Now we can free unused ptps */ + TAILQ_FOREACH(ptp, &empty_ptps, listq) + uvm_pagefree(ptp); +} + +/* + * p m a p a t t r i b u t e f u n c t i o n s + * functions that test/change managed page's attributes + * since a page can be mapped multiple times we must check each PTE that + * maps it by going down the pv lists. + */ + +/* + * pmap_test_attrs: test a page's attributes + * + * => we set pv_head => pmap locking + */ + +boolean_t +pmap_test_attrs(pg, testbits) + struct vm_page *pg; + int testbits; +{ + struct vm_page_md *mdpg; + int *myattrs; + struct pv_head *pvh; + struct pv_entry *pve; + volatile pt_entry_t *ptes; + pt_entry_t pte; + +#if DIAGNOSTIC + int bank, off; + + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); + if (bank == -1) + panic("pmap_test_attrs: unmanaged page?"); +#endif + mdpg = &pg->mdpage; + + /* + * before locking: see if attributes are already set and if so, + * return! 
+ */ + + myattrs = &mdpg->mp_attrs; + if (*myattrs & testbits) + return(TRUE); + + /* test to see if there is a list before bothering to lock */ + pvh = &mdpg->mp_pvhead; + if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { + return(FALSE); + } + + /* nope, gonna have to do it the hard way */ + PMAP_HEAD_TO_MAP_LOCK(); + /* XXX: needed if we hold head->map lock? */ + simple_lock(&pvh->pvh_lock); + + for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); + pve != NULL && (*myattrs & testbits) == 0; + pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) { + ptes = pmap_map_ptes(pve->pv_pmap); + pte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); /* XXX flags only? */ + pmap_unmap_ptes(pve->pv_pmap); + *myattrs |= pte; + } + + /* + * note that we will exit the for loop with a non-null pve if + * we have found the bits we are testing for. + */ + + simple_unlock(&pvh->pvh_lock); + PMAP_HEAD_TO_MAP_UNLOCK(); + return((*myattrs & testbits) != 0); +} + +/* + * pmap_clear_attrs: clear the specified attribute for a page. + * + * => we set pv_head => pmap locking + * => we return TRUE if we cleared one of the bits we were asked to + */ + +boolean_t +pmap_clear_attrs(pg, clearbits) + struct vm_page *pg; + int clearbits; +{ + struct vm_page_md *mdpg; + u_int32_t result; + struct pv_head *pvh; + struct pv_entry *pve; + pt_entry_t *ptes, opte; + pt_entry_t *maptp; + int *myattrs; + int32_t cpumask = 0; + +#ifdef DIAGNOSTIC + int bank, off; + + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); + if (bank == -1) + panic("pmap_change_attrs: unmanaged page?"); +#endif + mdpg = &pg->mdpage; + + PMAP_HEAD_TO_MAP_LOCK(); + pvh = &mdpg->mp_pvhead; + /* XXX: needed if we hold head->map lock? 
*/ + simple_lock(&pvh->pvh_lock); + + myattrs = &mdpg->mp_attrs; + result = *myattrs & clearbits; + *myattrs &= ~clearbits; + + SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) { +#ifdef DIAGNOSTIC + if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])) + panic("pmap_change_attrs: mapping without PTP " + "detected"); +#endif + + ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ + opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); + if (opte & clearbits) { + /* We need to do something */ + if (clearbits == PG_RW) { + result |= PG_RW; + + /* + * On write protect we might not need to flush + * the TLB + */ + + /* First zap the RW bit! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); + PTE_ATOMIC_CLEARBITS( + &ptes[x86_btop(pve->pv_va)], + maptp, PG_RW); + opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); + + /* + * Then test if it is not cached as RW the TLB + */ + if (!(opte & PG_M)) + goto no_tlb_shootdown; + } + + /* + * Since we need a shootdown me might as well + * always clear PG_U AND PG_M. + */ + + /* zap! 
*/ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); + PTE_ATOMIC_SET(&ptes[x86_btop(pve->pv_va)], maptp, + (opte & ~(PG_U | PG_M)), opte); + + result |= (opte & clearbits); + *myattrs |= (opte & ~(clearbits)); + + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, + &cpumask); + } +no_tlb_shootdown: + pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */ + } + + simple_unlock(&pvh->pvh_lock); + PMAP_HEAD_TO_MAP_UNLOCK(); + + pmap_tlb_shootnow(cpumask); + return(result != 0); +} + + +/* + * p m a p p r o t e c t i o n f u n c t i o n s + */ + +/* + * pmap_page_protect: change the protection of all recorded mappings + * of a managed page + * + * => NOTE: this is an inline function in pmap.h + */ + +/* see pmap.h */ + +/* + * pmap_protect: set the protection in of the pages in a pmap + * + * => NOTE: this is an inline function in pmap.h + */ + +/* see pmap.h */ + +/* + * pmap_write_protect: write-protect pages in a pmap + */ + +void +pmap_write_protect(pmap, sva, eva, prot) + struct pmap *pmap; + vaddr_t sva, eva; + vm_prot_t prot; +{ + pt_entry_t *ptes, *epte; + pt_entry_t *maptp; +#ifndef XEN + volatile +#endif + pt_entry_t *spte; + vaddr_t blockend; + int32_t cpumask = 0; + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + + /* should be ok, but just in case ... */ + sva &= PG_FRAME; + eva &= PG_FRAME; + + for (/* null */ ; sva < eva ; sva = blockend) { + + blockend = (sva & PD_MASK) + NBPD; + if (blockend > eva) + blockend = eva; + + /* + * XXXCDC: our PTE mappings should never be write-protected! + * + * long term solution is to move the PTEs out of user + * address space. and into kernel address space (up + * with APTE). then we can set VM_MAXUSER_ADDRESS to + * be VM_MAX_ADDRESS. + */ + + /* XXXCDC: ugly hack to avoid freeing PDP here */ + if (pdei(sva) == PDSLOT_PTE) + continue; + + /* empty block? 
*/ + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) + continue; + +#ifdef DIAGNOSTIC + if (sva >= VM_MAXUSER_ADDRESS && + sva < VM_MAX_ADDRESS) + panic("pmap_write_protect: PTE space"); +#endif + + spte = &ptes[x86_btop(sva)]; + epte = &ptes[x86_btop(blockend)]; + + for (/*null */; spte < epte ; spte++) { + if ((PTE_GET(spte) & (PG_RW|PG_V)) == (PG_RW|PG_V)) { + maptp = (pt_entry_t *)vtomach((vaddr_t)spte); + PTE_ATOMIC_CLEARBITS(spte, maptp, PG_RW); + if (PTE_GET(spte) & PG_M) + pmap_tlb_shootdown(pmap, + x86_ptob(spte - ptes), + PTE_GET(spte), &cpumask); + } + } + } + + /* + * if we kept a removal record and removed some pages update the TLB + */ + + pmap_tlb_shootnow(cpumask); + pmap_unmap_ptes(pmap); /* unlocks pmap */ +} + +/* + * end of protection functions + */ + +/* + * pmap_unwire: clear the wired bit in the PTE + * + * => mapping should already be in map + */ + +void +pmap_unwire(pmap, va) + struct pmap *pmap; + vaddr_t va; +{ + pt_entry_t *ptes; + pt_entry_t *maptp; + + if (pmap_valid_entry(pmap->pm_pdir[pdei(va)])) { + ptes = pmap_map_ptes(pmap); /* locks pmap */ + +#ifdef DIAGNOSTIC + if (!pmap_valid_entry(ptes[x86_btop(va)])) + panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); +#endif + if ((ptes[x86_btop(va)] & PG_W) != 0) { + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(va)]); + PTE_ATOMIC_CLEARBITS(&ptes[x86_btop(va)], maptp, PG_W); + pmap->pm_stats.wired_count--; + } +#ifdef DIAGNOSTIC + else { + printf("pmap_unwire: wiring for pmap %p va 0x%lx " + "didn't change!\n", pmap, va); + } +#endif + pmap_unmap_ptes(pmap); /* unlocks map */ + } +#ifdef DIAGNOSTIC + else { + panic("pmap_unwire: invalid PDE"); + } +#endif +} + +/* + * pmap_collect: free resources held by a pmap + * + * => optional function. + * => called when a process is swapped out to free memory. + */ + +void +pmap_collect(pmap) + struct pmap *pmap; +{ + /* + * free all of the pt pages by removing the physical mappings + * for its entire address space. 
+ */ + + pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS, + PMAP_REMOVE_SKIPWIRED); +} + +/* + * pmap_copy: copy mappings from one pmap to another + * + * => optional function + * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) + */ + +/* + * defined as macro in pmap.h + */ + +/* + * pmap_enter: enter a mapping into a pmap + * + * => must be done "now" ... no lazy-evaluation + * => we set pmap => pv_head locking + */ + +int +pmap_enter(pmap, va, pa, prot, flags) + struct pmap *pmap; + vaddr_t va; + paddr_t pa; + vm_prot_t prot; + int flags; +{ + pt_entry_t *ptes, opte, npte; + struct vm_page *ptp, *pg; + struct vm_page_md *mdpg; + struct pv_head *old_pvh, *new_pvh; + struct pv_entry *pve = NULL; /* XXX gcc */ + int error; + boolean_t wired = (flags & PMAP_WIRED) != 0; + pt_entry_t *maptp; + + XENPRINTK(("pmap_enter(%p, %p, %p, %08x, %08x)\n", + pmap, (void *)va, (void *)pa, prot, flags)); + +#ifdef DIAGNOSTIC + /* sanity check: totally out of range? */ + if (va >= VM_MAX_KERNEL_ADDRESS) + panic("pmap_enter: too big"); + + if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) + panic("pmap_enter: trying to map over PDP/APDP!"); + + /* sanity check: kernel PTPs should already have been pre-allocated */ + if (va >= VM_MIN_KERNEL_ADDRESS && + !pmap_valid_entry(pmap->pm_pdir[pdei(va)])) + panic("pmap_enter: missing kernel PTP!"); +#endif + + npte = protection_codes[prot] | PG_V; + + if (pa >= pmap_pa_start && pa < pmap_pa_end) + npte |= xpmap_ptom(pa); + else { + XENPRINTF(("pmap_enter: va %08lx outside pa range %08lx\n", + va, pa)); + npte |= pa; + } + + /* XENPRINTK(("npte %p\n", npte)); */ + + if (wired) + npte |= PG_W; + + if (va < VM_MAXUSER_ADDRESS) + npte |= PG_u; + else if (va < VM_MAX_ADDRESS) + npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? 
*/ + if (pmap == pmap_kernel()) + npte |= pmap_pg_g; + + /* get lock */ + PMAP_MAP_TO_HEAD_LOCK(); + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + if (pmap == pmap_kernel()) { + ptp = NULL; + } else { + ptp = pmap_get_ptp(pmap, pdei(va)); + if (ptp == NULL) { + if (flags & PMAP_CANFAIL) { + error = ENOMEM; + goto out; + } + panic("pmap_enter: get ptp failed"); + } + } + + /* + * Get first view on old PTE + * on SMP the PTE might gain PG_U and PG_M flags + * before we zap it later + */ + opte = pte_get(&ptes[x86_btop(va)]); /* old PTE */ + XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", + (void *)npte, (void *)opte, ptes, x86_btop(va))); + + /* + * is there currently a valid mapping at our VA and does it + * map to the same PA as the one we want to map ? + */ + + if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) { + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + npte |= (opte & PG_PVLIST); + + XENPRINTK(("pmap update opte == pa")); + /* zap! */ + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte); + + /* + * Any change in the protection level that the CPU + * should know about ? + */ + if ((npte & PG_RW) + || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) { + XENPRINTK(("pmap update opte == pa, prot change")); + /* + * No need to flush the TLB. + * Just add old PG_M, ... flags in new entry. 
+ */ + PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp, + opte & (PG_M | PG_U)); + goto out_ok; + } + + /* + * Might be cached in the TLB as being writable + * if this is on the PVLIST, sync R/M bit + */ + if (opte & PG_PVLIST) { + pg = PHYS_TO_VM_PAGE(pa); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_enter: same pa PG_PVLIST " + "mapping with unmanaged page " + "pa = 0x%lx (0x%lx)", pa, + atop(pa)); +#endif + mdpg = &pg->mdpage; + old_pvh = &mdpg->mp_pvhead; + simple_lock(&old_pvh->pvh_lock); + mdpg->mp_attrs |= opte; + simple_unlock(&old_pvh->pvh_lock); + } + goto shootdown_now; + } + + pg = PHYS_TO_VM_PAGE(pa); + XENPRINTK(("pg %p from %p, init %d\n", pg, (void *)pa, + pmap_initialized)); + if (pmap_initialized && pg != NULL) { + /* This is a managed page */ + npte |= PG_PVLIST; + mdpg = &pg->mdpage; + new_pvh = &mdpg->mp_pvhead; + if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) { + /* We can not steal a pve - allocate one */ + pve = pmap_alloc_pv(pmap, ALLOCPV_NEED); + if (pve == NULL) { + if (!(flags & PMAP_CANFAIL)) + panic("pmap_enter: " + "no pv entries available"); + error = ENOMEM; + goto out; + } + } + } else { + new_pvh = NULL; + } + + /* + * is there currently a valid mapping at our VA? + */ + + if (pmap_valid_entry(opte)) { + + /* + * changing PAs: we must remove the old one first + */ + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + if (opte & PG_PVLIST) { + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_enter: PG_PVLIST mapping with " + "unmanaged page " + "pa = 0x%lx (0x%lx)", pa, atop(pa)); +#endif + mdpg = &pg->mdpage; + old_pvh = &mdpg->mp_pvhead; + + /* new_pvh is NULL if page will not be managed */ + pmap_lock_pvhs(old_pvh, new_pvh); + + XENPRINTK(("pmap change pa")); + /* zap! 
*/ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, + npte); + + pve = pmap_remove_pv(old_pvh, pmap, va); + KASSERT(pve != 0); + mdpg->mp_attrs |= opte; + + if (new_pvh) { + pmap_enter_pv(new_pvh, pve, pmap, va, ptp); + simple_unlock(&new_pvh->pvh_lock); + } else + pmap_free_pv(pmap, pve); + simple_unlock(&old_pvh->pvh_lock); + + goto shootdown_test; + } + } else { /* opte not valid */ + pmap->pm_stats.resident_count++; + if (wired) + pmap->pm_stats.wired_count++; + if (ptp) + ptp->wire_count++; + } + + if (new_pvh) { + simple_lock(&new_pvh->pvh_lock); + pmap_enter_pv(new_pvh, pve, pmap, va, ptp); + simple_unlock(&new_pvh->pvh_lock); + } + + XENPRINTK(("pmap initial setup\n")); + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], + maptp, npte); /* zap! */ + +shootdown_test: + /* Update page attributes if needed */ + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; +#endif +shootdown_now: +#if defined(MULTIPROCESSOR) + pmap_tlb_shootdown(pmap, va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. */ + if (pmap_is_curpmap(pmap)) + pmap_update_pg(va); +#endif + } + +out_ok: + error = 0; + +out: + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); + + XENPRINTK(("pmap_enter: %d\n", error)); + return error; +} + +/* + * pmap_enter_ma: enter a mapping into a pmap + * + * => must be done "now" ... 
no lazy-evaluation + * => we set pmap => pv_head locking + */ + +int +pmap_enter_ma(pmap, va, pa, prot, flags) + struct pmap *pmap; + vaddr_t va; + paddr_t pa; + vm_prot_t prot; + int flags; +{ + pt_entry_t *ptes, opte, npte; + pt_entry_t *maptp; + struct vm_page *ptp, *pg; + struct vm_page_md *mdpg; + struct pv_head *old_pvh; + struct pv_entry *pve = NULL; /* XXX gcc */ + int error; + boolean_t wired = (flags & PMAP_WIRED) != 0; + + XENPRINTK(("pmap_enter_ma(%p, %p, %p, %08x, %08x)\n", + pmap, (void *)va, (void *)pa, prot, flags)); + +#ifdef DIAGNOSTIC + /* sanity check: totally out of range? */ + if (va >= VM_MAX_KERNEL_ADDRESS) + panic("pmap_enter: too big"); + + if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) + panic("pmap_enter: trying to map over PDP/APDP!"); + + /* sanity check: kernel PTPs should already have been pre-allocated */ + if (va >= VM_MIN_KERNEL_ADDRESS && + !pmap_valid_entry(pmap->pm_pdir[pdei(va)])) + panic("pmap_enter: missing kernel PTP!"); +#endif + + npte = pa | protection_codes[prot] | PG_V; + /* XENPRINTK(("npte %p\n", npte)); */ + + if (wired) + npte |= PG_W; + + if (va < VM_MAXUSER_ADDRESS) + npte |= PG_u; + else if (va < VM_MAX_ADDRESS) + npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? 
*/ + if (pmap == pmap_kernel()) + npte |= pmap_pg_g; + + /* get lock */ + PMAP_MAP_TO_HEAD_LOCK(); + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + if (pmap == pmap_kernel()) { + ptp = NULL; + } else { + ptp = pmap_get_ptp(pmap, pdei(va)); + if (ptp == NULL) { + if (flags & PMAP_CANFAIL) { + error = ENOMEM; + goto out; + } + panic("pmap_enter: get ptp failed"); + } + } + + /* + * Get first view on old PTE + * on SMP the PTE might gain PG_U and PG_M flags + * before we zap it later + */ + opte = pte_get_ma(&ptes[x86_btop(va)]); /* old PTE */ + XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", + (void *)npte, (void *)opte, ptes, x86_btop(va))); + XENPRINTF(("pmap_enter_ma pa %08lx va %08lx opte %08x npte %08x " + "wired %d count %ld\n", pa, va, opte, npte, wired, + pmap->pm_stats.wired_count)); + + /* + * is there currently a valid mapping at our VA and does it + * map to the same MA as the one we want to map ? + */ + + if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) { + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + XENPRINTK(("pmap update opte == pa")); + /* zap! */ + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte); + + /* + * Any change in the protection level that the CPU + * should know about ? + */ + if ((npte & PG_RW) + || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) { + XENPRINTK(("pmap update opte == pa, prot change")); + /* + * No need to flush the TLB. + * Just add old PG_M, ... flags in new entry. 
+ */ + PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp, + opte & (PG_M | PG_U)); + goto out_ok; + } + + /* + * Might be cached in the TLB as being writable + * if this is on the PVLIST, sync R/M bit + */ + KDASSERT((opte & PG_PVLIST) == 0); + goto shootdown_now; + } + + /* + * no managed mapping for pages mapped through pmap_enter_ma. + */ + + /* + * is there currently a valid mapping at our VA? + */ + + if (pmap_valid_entry(opte)) { + + /* + * changing PAs: we must remove the old one first + */ + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + if (opte & PG_PVLIST) { + opte = xpmap_mtop(opte); + KDASSERT((opte & PG_FRAME) != + (KERNTEXTOFF - KERNBASE_LOCORE)); + + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_enter: PG_PVLIST mapping with " + "unmanaged page " + "pa = 0x%lx (0x%lx)", pa, atop(pa)); +#endif + mdpg = &pg->mdpage; + old_pvh = &mdpg->mp_pvhead; + + /* NULL new_pvh since page will not be managed */ + pmap_lock_pvhs(old_pvh, NULL); + + XENPRINTK(("pmap change pa")); + /* zap! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, + npte); + + pve = pmap_remove_pv(old_pvh, pmap, va); + KASSERT(pve != 0); + mdpg->mp_attrs |= opte; + + pmap_free_pv(pmap, pve); + simple_unlock(&old_pvh->pvh_lock); + + goto shootdown_test; + } + } else { /* opte not valid */ + pmap->pm_stats.resident_count++; + if (wired) + pmap->pm_stats.wired_count++; + if (ptp) + ptp->wire_count++; + } + + XENPRINTK(("pmap initial setup")); + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], + maptp, npte); /* zap! 
*/ + +shootdown_test: + /* Update page attributes if needed */ + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; +#endif +shootdown_now: +#if defined(MULTIPROCESSOR) + pmap_tlb_shootdown(pmap, va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. */ + if (pmap_is_curpmap(pmap)) + pmap_update_pg(va); +#endif + } + +out_ok: + error = 0; + +out: + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); + + XENPRINTK(("pmap_enter: %d\n", error)); + return error; +} + +/* + * pmap_growkernel: increase usage of KVM space + * + * => we allocate new PTPs for the kernel and install them in all + * the pmaps on the system. + */ + +vaddr_t +pmap_growkernel(maxkvaddr) + vaddr_t maxkvaddr; +{ + struct pmap *kpm = pmap_kernel(), *pm; + pd_entry_t *mapdp; + pt_entry_t *maptp; + int needed_kpde; /* needed number of kernel PTPs */ + int s; + paddr_t ptaddr; + + needed_kpde = (u_int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1)) + / NBPD; + XENPRINTF(("pmap_growkernel %p: %d -> %d\n", (void *)maxkvaddr, + nkpde, needed_kpde)); + if (needed_kpde <= nkpde) + goto out; /* we are OK */ + + /* + * whoops! we need to add kernel PTPs + */ + + s = splhigh(); /* to be safe */ + simple_lock(&kpm->pm_obj.vmobjlock); + + for (/*null*/ ; nkpde < needed_kpde ; nkpde++) { + + mapdp = (pt_entry_t *)vtomach((vaddr_t)&kpm->pm_pdir[PDSLOT_KERN + nkpde]); + if (uvm.page_init_done == FALSE) { + + /* + * we're growing the kernel pmap early (from + * uvm_pageboot_alloc()). this case must be + * handled a little differently. 
+ */ + + if (uvm_page_physget(&ptaddr) == FALSE) + panic("pmap_growkernel: out of memory"); + pmap_zero_page(ptaddr); + + XENPRINTF(("xxxx maybe not PG_RW\n")); + PDE_SET(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, ptaddr | PG_RW | PG_V); + + /* count PTP as resident */ + kpm->pm_stats.resident_count++; + continue; + } + + /* + * THIS *MUST* BE CODED SO AS TO WORK IN THE + * pmap_initialized == FALSE CASE! WE MAY BE + * INVOKED WHILE pmap_init() IS RUNNING! + */ + + if (pmap_alloc_ptp(kpm, PDSLOT_KERN + nkpde) == NULL) { + panic("pmap_growkernel: alloc ptp failed"); + } + + /* PG_u not for kernel */ + PDE_CLEARBITS(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, PG_u); + + /* distribute new kernel PTP to all active pmaps */ + simple_lock(&pmaps_lock); + for (pm = pmaps.lh_first; pm != NULL; + pm = pm->pm_list.le_next) { + XENPRINTF(("update\n")); + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&pm->pm_pdir[PDSLOT_KERN + nkpde]); + PDE_COPY(&pm->pm_pdir[PDSLOT_KERN + nkpde], maptp, + &kpm->pm_pdir[PDSLOT_KERN + nkpde]); + } + + /* Invalidate the PDP cache. */ + pool_cache_invalidate(&pmap_pdp_cache); + pmap_pdp_cache_generation++; + + simple_unlock(&pmaps_lock); + } + + simple_unlock(&kpm->pm_obj.vmobjlock); + splx(s); + +out: + XENPRINTF(("pmap_growkernel return %d %p\n", nkpde, + (void *)(VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD)))); + return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD)); +} + +#ifdef DEBUG +void pmap_dump(struct pmap *, vaddr_t, vaddr_t); + +/* + * pmap_dump: dump all the mappings from a pmap + * + * => caller should not be holding any pmap locks + */ + +void +pmap_dump(pmap, sva, eva) + struct pmap *pmap; + vaddr_t sva, eva; +{ + pt_entry_t *ptes, *pte; + vaddr_t blkendva; + + /* + * if end is out of range truncate. + * if (end == start) update to max. 
+ */ + + if (eva > VM_MAXUSER_ADDRESS || eva <= sva) + eva = VM_MAXUSER_ADDRESS; + + /* + * we lock in the pmap => pv_head direction + */ + + PMAP_MAP_TO_HEAD_LOCK(); + ptes = pmap_map_ptes(pmap); /* locks pmap */ + + /* + * dumping a range of pages: we dump in PTP sized blocks (4MB) + */ + + for (/* null */ ; sva < eva ; sva = blkendva) { + + /* determine range of block */ + blkendva = x86_round_pdr(sva+1); + if (blkendva > eva) + blkendva = eva; + + /* valid block? */ + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) + continue; + + pte = &ptes[x86_btop(sva)]; + for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { + if (!pmap_valid_entry(*pte)) + continue; + XENPRINTF(("va %#lx -> pa %#lx (pte=%#lx)\n", + sva, PTE_GET(pte), PTE_GET(pte) & PG_FRAME)); + } + } + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); +} +#endif + +/******************** TLB shootdown code ********************/ + + +void +pmap_tlb_shootnow(int32_t cpumask) +{ + struct cpu_info *self; +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + int s; +#ifdef DIAGNOSTIC + int count = 0; +#endif +#endif + + if (cpumask == 0) + return; + + self = curcpu(); +#ifdef MULTIPROCESSOR + s = splipi(); + self->ci_tlb_ipi_mask = cpumask; +#endif + + pmap_do_tlb_shootdown(self); /* do *our* work. */ + +#ifdef MULTIPROCESSOR + splx(s); + + /* + * Send the TLB IPI to other CPUs pending shootdowns. + */ + for (CPU_INFO_FOREACH(cii, ci)) { + if (ci == self) + continue; + if (cpumask & (1U << ci->ci_cpuid)) + if (x86_send_ipi(ci, X86_IPI_TLB) != 0) + x86_atomic_clearbits_l(&self->ci_tlb_ipi_mask, + (1U << ci->ci_cpuid)); + } + + while (self->ci_tlb_ipi_mask != 0) { +#ifdef DIAGNOSTIC + if (count++ > 10000000) + panic("TLB IPI rendezvous failed (mask %x)", + self->ci_tlb_ipi_mask); +#endif + x86_pause(); + } +#endif +} + +/* + * pmap_tlb_shootdown: + * + * Cause the TLB entry for pmap/va to be shot down. 
+ */ +void +pmap_tlb_shootdown(pmap, va, pte, cpumaskp) + pmap_t pmap; + vaddr_t va; + pt_entry_t pte; + int32_t *cpumaskp; +{ + struct cpu_info *ci, *self; + struct pmap_tlb_shootdown_q *pq; + struct pmap_tlb_shootdown_job *pj; + CPU_INFO_ITERATOR cii; + int s; + +#ifdef LARGEPAGES + if (pte & PG_PS) + va &= PG_LGFRAME; +#endif + + if (pmap_initialized == FALSE || cpus_attached == 0) { + pmap_update_pg(va); + return; + } + + self = curcpu(); + + s = splipi(); +#if 0 + printf("dshootdown %lx\n", va); +#endif + + for (CPU_INFO_FOREACH(cii, ci)) { + /* Note: we queue shootdown events for ourselves here! */ + if (pmap_is_active(pmap, ci->ci_cpuid) == 0) + continue; + if (ci != self && !(ci->ci_flags & CPUF_RUNNING)) + continue; + pq = &pmap_tlb_shootdown_q[ci->ci_cpuid]; + __cpu_simple_lock(&pq->pq_slock); + + /* + * If there's a global flush already queued, or a + * non-global flush, and this pte doesn't have the G + * bit set, don't bother. + */ + if (pq->pq_flushg > 0 || + (pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) { + __cpu_simple_unlock(&pq->pq_slock); + continue; + } + +#ifdef I386_CPU + /* + * i386 CPUs can't invalidate a single VA, only + * flush the entire TLB, so don't bother allocating + * jobs for them -- just queue a `flushu'. + * + * XXX note that this can be executed for non-i386 + * when called * early (before identifycpu() has set + * cpu_class) + */ + if (cpu_class == CPUCLASS_386) { + pq->pq_flushu++; + *cpumaskp |= 1U << ci->ci_cpuid; + __cpu_simple_unlock(&pq->pq_slock); + continue; + } +#endif + + pj = pmap_tlb_shootdown_job_get(pq); + pq->pq_pte |= pte; + if (pj == NULL) { + /* + * Couldn't allocate a job entry. + * Kill it now for this CPU, unless the failure + * was due to too many pending flushes; otherwise, + * tell other cpus to kill everything.. 
+ */ + if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) { + pmap_update_pg(va); + __cpu_simple_unlock(&pq->pq_slock); + continue; + } else { + if (pq->pq_pte & pmap_pg_g) + pq->pq_flushg++; + else + pq->pq_flushu++; + /* + * Since we've nailed the whole thing, + * drain the job entries pending for that + * processor. + */ + pmap_tlb_shootdown_q_drain(pq); + *cpumaskp |= 1U << ci->ci_cpuid; + } + } else { + pj->pj_pmap = pmap; + pj->pj_va = va; + pj->pj_pte = pte; + TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list); + *cpumaskp |= 1U << ci->ci_cpuid; + } + __cpu_simple_unlock(&pq->pq_slock); + } + splx(s); +} + +/* + * pmap_do_tlb_shootdown_checktlbstate: check and update ci_tlbstate. + * + * => called at splipi. + * => return TRUE if we need to maintain user tlbs. + */ +static __inline boolean_t +pmap_do_tlb_shootdown_checktlbstate(struct cpu_info *ci) +{ + + KASSERT(ci == curcpu()); + + if (ci->ci_tlbstate == TLBSTATE_LAZY) { + KASSERT(ci->ci_pmap != pmap_kernel()); + /* + * mostly KASSERT(ci->ci_pmap->pm_cpus & (1U << ci->ci_cpuid)); + */ + + /* + * we no longer want tlb shootdown ipis for this pmap. + * mark the pmap no longer in use by this processor. + */ + + x86_atomic_clearbits_l(&ci->ci_pmap->pm_cpus, + 1U << ci->ci_cpuid); + ci->ci_tlbstate = TLBSTATE_STALE; + } + + if (ci->ci_tlbstate == TLBSTATE_STALE) + return FALSE; + + return TRUE; +} + +/* + * pmap_do_tlb_shootdown: + * + * Process pending TLB shootdown operations for this processor. 
+ */ +void +pmap_do_tlb_shootdown(struct cpu_info *self) +{ + u_long cpu_id = self->ci_cpuid; + struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[cpu_id]; + struct pmap_tlb_shootdown_job *pj; + int s; +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; +#endif + KASSERT(self == curcpu()); + + s = splipi(); + + __cpu_simple_lock(&pq->pq_slock); + + if (pq->pq_flushg) { + COUNT(flushg); + pmap_do_tlb_shootdown_checktlbstate(self); + tlbflushg(); + pq->pq_flushg = 0; + pq->pq_flushu = 0; + pmap_tlb_shootdown_q_drain(pq); + } else { + /* + * TLB flushes for PTEs with PG_G set may be in the queue + * after a flushu, they need to be dealt with. + */ + if (pq->pq_flushu) { + COUNT(flushu); + pmap_do_tlb_shootdown_checktlbstate(self); + tlbflush(); + } + while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) { + TAILQ_REMOVE(&pq->pq_head, pj, pj_list); + + if ((pj->pj_pte & pmap_pg_g) || + pj->pj_pmap == pmap_kernel()) { + pmap_update_pg(pj->pj_va); + } else if (!pq->pq_flushu && + pj->pj_pmap == self->ci_pmap) { + if (pmap_do_tlb_shootdown_checktlbstate(self)) + pmap_update_pg(pj->pj_va); + } + + pmap_tlb_shootdown_job_put(pq, pj); + } + + pq->pq_flushu = pq->pq_pte = 0; + } + +#ifdef MULTIPROCESSOR + for (CPU_INFO_FOREACH(cii, ci)) + x86_atomic_clearbits_l(&ci->ci_tlb_ipi_mask, + (1U << cpu_id)); +#endif + __cpu_simple_unlock(&pq->pq_slock); + + splx(s); +} + + +/* + * pmap_tlb_shootdown_q_drain: + * + * Drain a processor's TLB shootdown queue. We do not perform + * the shootdown operations. This is merely a convenience + * function. + * + * Note: We expect the queue to be locked. + */ +void +pmap_tlb_shootdown_q_drain(pq) + struct pmap_tlb_shootdown_q *pq; +{ + struct pmap_tlb_shootdown_job *pj; + + while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) { + TAILQ_REMOVE(&pq->pq_head, pj, pj_list); + pmap_tlb_shootdown_job_put(pq, pj); + } + pq->pq_pte = 0; +} + +/* + * pmap_tlb_shootdown_job_get: + * + * Get a TLB shootdown job queue entry. 
This places a limit on + * the number of outstanding jobs a processor may have. + * + * Note: We expect the queue to be locked. + */ +struct pmap_tlb_shootdown_job * +pmap_tlb_shootdown_job_get(pq) + struct pmap_tlb_shootdown_q *pq; +{ + struct pmap_tlb_shootdown_job *pj; + + if (pq->pq_count >= PMAP_TLB_MAXJOBS) + return (NULL); + + __cpu_simple_lock(&pmap_tlb_shootdown_job_lock); + if (pj_free == NULL) { + __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock); + return NULL; + } + pj = &pj_free->pja_job; + pj_free = + (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree; + __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock); + + pq->pq_count++; + return (pj); +} + +/* + * pmap_tlb_shootdown_job_put: + * + * Put a TLB shootdown job queue entry onto the free list. + * + * Note: We expect the queue to be locked. + */ +void +pmap_tlb_shootdown_job_put(pq, pj) + struct pmap_tlb_shootdown_q *pq; + struct pmap_tlb_shootdown_job *pj; +{ + +#ifdef DIAGNOSTIC + if (pq->pq_count == 0) + panic("pmap_tlb_shootdown_job_put: queue length inconsistency"); +#endif + __cpu_simple_lock(&pmap_tlb_shootdown_job_lock); + pj->pj_nextfree = &pj_free->pja_job; + pj_free = (union pmap_tlb_shootdown_job_al *)pj; + __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock); + + pq->pq_count--; +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c new file mode 100644 index 0000000000..d65741fbf2 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c @@ -0,0 +1,550 @@ +/* $NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $ */ +/* NetBSD: sys_machdep.c,v 1.70 2003/10/27 14:11:47 junyoung Exp */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $"); + +#include "opt_compat_netbsd.h" +#include "opt_mtrr.h" +#include "opt_perfctrs.h" +#include "opt_user_ldt.h" +#include "opt_vm86.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl.h> +#include <sys/file.h> +#include <sys/time.h> +#include <sys/proc.h> +#include <sys/user.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/buf.h> +#include <sys/signal.h> +#include <sys/malloc.h> + +#include <sys/mount.h> +#include <sys/sa.h> +#include <sys/syscallargs.h> + +#include <uvm/uvm_extern.h> + +#include <machine/cpu.h> +#include <machine/cpufunc.h> +#include <machine/gdt.h> +#include <machine/psl.h> +#include <machine/reg.h> +#include <machine/sysarch.h> +#include <machine/mtrr.h> + +#ifdef VM86 +#include <machine/vm86.h> +#endif + +#ifdef PERFCTRS +#include <machine/pmc.h> +#endif + +extern struct vm_map *kernel_map; + +int i386_iopl(struct lwp *, void *, register_t *); +int i386_get_ioperm(struct lwp *, void *, register_t *); +int i386_set_ioperm(struct lwp *, void *, register_t *); +int i386_get_mtrr(struct lwp *, void *, register_t *); +int i386_set_mtrr(struct lwp *, void *, register_t *); + +#ifdef USER_LDT + +#ifdef LDT_DEBUG +static void i386_print_ldt(int, const struct segment_descriptor *); + +static void +i386_print_ldt(i, d) + int i; + const struct segment_descriptor *d; +{ + printf("[%d] lolimit=0x%x, lobase=0x%x, type=%u, dpl=%u, p=%u, " + "hilimit=0x%x, xx=%x, def32=%u, gran=%u, hibase=0x%x\n", + i, d->sd_lolimit, d->sd_lobase, d->sd_type, d->sd_dpl, d->sd_p, + d->sd_hilimit, d->sd_xx, d->sd_def32, d->sd_gran, d->sd_hibase); +} +#endif + +int +i386_get_ldt(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct proc *p = l->l_proc; + pmap_t pmap = p->p_vmspace->vm_map.pmap; + int nldt, num; + union descriptor *lp, *cp; + struct i386_get_ldt_args ua; + + if ((error = 
copyin(args, &ua, sizeof(ua))) != 0) + return (error); + +#ifdef LDT_DEBUG + printf("i386_get_ldt: start=%d num=%d descs=%p\n", ua.start, + ua.num, ua.desc); +#endif + + if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 || + ua.start + ua.num > 8192) + return (EINVAL); + + cp = malloc(ua.num * sizeof(union descriptor), M_TEMP, M_WAITOK); + if (cp == NULL) + return ENOMEM; + + simple_lock(&pmap->pm_lock); + + if (pmap->pm_flags & PMF_USER_LDT) { + nldt = pmap->pm_ldt_len; + lp = pmap->pm_ldt; + } else { + nldt = NLDT; + lp = ldt; + } + + if (ua.start > nldt) { + simple_unlock(&pmap->pm_lock); + free(cp, M_TEMP); + return (EINVAL); + } + + lp += ua.start; + num = min(ua.num, nldt - ua.start); +#ifdef LDT_DEBUG + { + int i; + for (i = 0; i < num; i++) + i386_print_ldt(i, &lp[i].sd); + } +#endif + + memcpy(cp, lp, num * sizeof(union descriptor)); + simple_unlock(&pmap->pm_lock); + + error = copyout(cp, ua.desc, num * sizeof(union descriptor)); + if (error == 0) + *retval = num; + + free(cp, M_TEMP); + return (error); +} + +int +i386_set_ldt(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error, i, n; + struct proc *p = l->l_proc; + struct pcb *pcb = &l->l_addr->u_pcb; + pmap_t pmap = p->p_vmspace->vm_map.pmap; + struct i386_set_ldt_args ua; + union descriptor *descv; + size_t old_len, new_len, ldt_len; + union descriptor *old_ldt, *new_ldt; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return (error); + + if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 || + ua.start + ua.num > 8192) + return (EINVAL); + + descv = malloc(sizeof (*descv) * ua.num, M_TEMP, M_NOWAIT); + if (descv == NULL) + return (ENOMEM); + + if ((error = copyin(ua.desc, descv, sizeof (*descv) * ua.num)) != 0) + goto out; + + /* Check descriptors for access violations. 
*/ + for (i = 0; i < ua.num; i++) { + union descriptor *desc = &descv[i]; + + switch (desc->sd.sd_type) { + case SDT_SYSNULL: + desc->sd.sd_p = 0; + break; + case SDT_SYS286CGT: + case SDT_SYS386CGT: + /* + * Only allow call gates targeting a segment + * in the LDT or a user segment in the fixed + * part of the gdt. Segments in the LDT are + * constrained (below) to be user segments. + */ + if (desc->gd.gd_p != 0 && + !ISLDT(desc->gd.gd_selector) && + ((IDXSEL(desc->gd.gd_selector) >= NGDT) || + (gdt[IDXSEL(desc->gd.gd_selector)].sd.sd_dpl != + SEL_UPL))) { + error = EACCES; + goto out; + } + break; + case SDT_MEMEC: + case SDT_MEMEAC: + case SDT_MEMERC: + case SDT_MEMERAC: + /* Must be "present" if executable and conforming. */ + if (desc->sd.sd_p == 0) { + error = EACCES; + goto out; + } + break; + case SDT_MEMRO: + case SDT_MEMROA: + case SDT_MEMRW: + case SDT_MEMRWA: + case SDT_MEMROD: + case SDT_MEMRODA: + case SDT_MEMRWD: + case SDT_MEMRWDA: + case SDT_MEME: + case SDT_MEMEA: + case SDT_MEMER: + case SDT_MEMERA: + break; + default: + /* + * Make sure that unknown descriptor types are + * not marked present. + */ + if (desc->sd.sd_p != 0) { + error = EACCES; + goto out; + } + break; + } + + if (desc->sd.sd_p != 0) { + /* Only user (ring-3) descriptors may be present. 
*/ + if (desc->sd.sd_dpl != SEL_UPL) { + error = EACCES; + goto out; + } + } + } + + /* allocate user ldt */ + simple_lock(&pmap->pm_lock); + if (pmap->pm_ldt == 0 || (ua.start + ua.num) > pmap->pm_ldt_len) { + if (pmap->pm_flags & PMF_USER_LDT) + ldt_len = pmap->pm_ldt_len; + else + ldt_len = 512; + while ((ua.start + ua.num) > ldt_len) + ldt_len *= 2; + new_len = ldt_len * sizeof(union descriptor); + + simple_unlock(&pmap->pm_lock); + new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, + new_len); + simple_lock(&pmap->pm_lock); + + if (pmap->pm_ldt != NULL && ldt_len <= pmap->pm_ldt_len) { + /* + * Another thread (re)allocated the LDT to + * sufficient size while we were blocked in + * uvm_km_alloc. Oh well. The new entries + * will quite probably not be right, but + * hey.. not our problem if user applications + * have race conditions like that. + */ + uvm_km_free(kernel_map, (vaddr_t)new_ldt, new_len); + goto copy; + } + + old_ldt = pmap->pm_ldt; + + if (old_ldt != NULL) { + old_len = pmap->pm_ldt_len * sizeof(union descriptor); + } else { + old_len = NLDT * sizeof(union descriptor); + old_ldt = ldt; + } + + memcpy(new_ldt, old_ldt, old_len); + memset((caddr_t)new_ldt + old_len, 0, new_len - old_len); + + if (old_ldt != ldt) + uvm_km_free(kernel_map, (vaddr_t)old_ldt, old_len); + + pmap->pm_ldt = new_ldt; + pmap->pm_ldt_len = ldt_len; + + if (pmap->pm_flags & PMF_USER_LDT) + ldt_free(pmap); + else + pmap->pm_flags |= PMF_USER_LDT; + ldt_alloc(pmap, new_ldt, new_len); + pcb->pcb_ldt_sel = pmap->pm_ldt_sel; + if (pcb == curpcb) + lldt(pcb->pcb_ldt_sel); + + } +copy: + /* Now actually replace the descriptors. 
*/ + for (i = 0, n = ua.start; i < ua.num; i++, n++) + pmap->pm_ldt[n] = descv[i]; + + simple_unlock(&pmap->pm_lock); + + *retval = ua.start; + +out: + free(descv, M_TEMP); + return (error); +} +#endif /* USER_LDT */ + +int +i386_iopl(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct proc *p = l->l_proc; + struct pcb *pcb = &l->l_addr->u_pcb; + struct i386_iopl_args ua; + dom0_op_t op; + + if ((xen_start_info.flags & SIF_PRIVILEGED) == 0) + return EPERM; + + if (securelevel > 1) + return EPERM; + + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return error; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return error; + + pcb->pcb_tss.tss_ioopt &= ~SEL_RPL; + if (ua.iopl) + pcb->pcb_tss.tss_ioopt |= SEL_UPL; /* i/o pl */ + else + pcb->pcb_tss.tss_ioopt |= SEL_KPL; /* i/o pl */ + + /* Force the change at ring 0. */ + op.cmd = DOM0_IOPL; + op.u.iopl.domain = DOMID_SELF; + op.u.iopl.iopl = pcb->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */ + HYPERVISOR_dom0_op(&op); + + return 0; +} + +int +i386_get_ioperm(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct pcb *pcb = &l->l_addr->u_pcb; + struct i386_get_ioperm_args ua; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return (error); + + return copyout(pcb->pcb_iomap, ua.iomap, sizeof(pcb->pcb_iomap)); +} + +int +i386_set_ioperm(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct proc *p = l->l_proc; + struct pcb *pcb = &l->l_addr->u_pcb; + struct i386_set_ioperm_args ua; + + if (securelevel > 1) + return EPERM; + + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return error; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return (error); + + return copyin(ua.iomap, pcb->pcb_iomap, sizeof(pcb->pcb_iomap)); +} + +#ifdef MTRR +int +i386_get_mtrr(struct lwp *l, void *args, register_t *retval) +{ + struct i386_get_mtrr_args ua; + int error, n; + struct proc *p 
= l->l_proc; + + if (mtrr_funcs == NULL) + return ENOSYS; + + error = copyin(args, &ua, sizeof ua); + if (error != 0) + return error; + + error = copyin(ua.n, &n, sizeof n); + if (error != 0) + return error; + + error = mtrr_get(ua.mtrrp, &n, p, MTRR_GETSET_USER); + + copyout(&n, ua.n, sizeof (int)); + + return error; +} + +int +i386_set_mtrr(struct lwp *l, void *args, register_t *retval) +{ + int error, n; + struct i386_set_mtrr_args ua; + struct proc *p = l->l_proc; + + if (mtrr_funcs == NULL) + return ENOSYS; + + error = suser(p->p_ucred, &p->p_acflag); + if (error != 0) + return error; + + error = copyin(args, &ua, sizeof ua); + if (error != 0) + return error; + + error = copyin(ua.n, &n, sizeof n); + if (error != 0) + return error; + + error = mtrr_set(ua.mtrrp, &n, p, MTRR_GETSET_USER); + if (n != 0) + mtrr_commit(); + + copyout(&n, ua.n, sizeof n); + + return error; +} +#endif + +int +sys_sysarch(struct lwp *l, void *v, register_t *retval) +{ + struct sys_sysarch_args /* { + syscallarg(int) op; + syscallarg(void *) parms; + } */ *uap = v; + int error = 0; + + switch(SCARG(uap, op)) { +#ifdef USER_LDT + case I386_GET_LDT: + error = i386_get_ldt(l, SCARG(uap, parms), retval); + break; + + case I386_SET_LDT: + error = i386_set_ldt(l, SCARG(uap, parms), retval); + break; +#endif + + case I386_IOPL: + error = i386_iopl(l, SCARG(uap, parms), retval); + break; + + case I386_GET_IOPERM: + error = i386_get_ioperm(l, SCARG(uap, parms), retval); + break; + + case I386_SET_IOPERM: + error = i386_set_ioperm(l, SCARG(uap, parms), retval); + break; + +#ifdef VM86 + case I386_VM86: + error = i386_vm86(l, SCARG(uap, parms), retval); + break; +#ifdef COMPAT_16 + case I386_OLD_VM86: + error = compat_16_i386_vm86(l, SCARG(uap, parms), retval); + break; +#endif +#endif +#ifdef MTRR + case I386_GET_MTRR: + error = i386_get_mtrr(l, SCARG(uap, parms), retval); + break; + case I386_SET_MTRR: + error = i386_set_mtrr(l, SCARG(uap, parms), retval); + break; +#endif +#ifdef PERFCTRS + 
case I386_PMC_INFO: + error = pmc_info(l, SCARG(uap, parms), retval); + break; + + case I386_PMC_STARTSTOP: + error = pmc_startstop(l, SCARG(uap, parms), retval); + break; + + case I386_PMC_READ: + error = pmc_read(l, SCARG(uap, parms), retval); + break; +#endif + + default: + error = EINVAL; + break; + } + return (error); +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S new file mode 100644 index 0000000000..165b5f06be --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S @@ -0,0 +1,1587 @@ +/* $NetBSD: vector.S,v 1.1.2.1 2004/05/22 15:57:16 he Exp $ */ +/* NetBSD: 1.13 2004/03/11 11:39:26 yamt Exp */ + +/* + * Copyright 2002 (c) Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Frank van der Linden for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_ddb.h" +#include "opt_multiprocessor.h" +#include "opt_ipkdb.h" +#include "opt_vm86.h" +#include "opt_xen.h" + +#ifndef XEN +#include <machine/i8259.h> +#endif +#include <machine/i82093reg.h> +#include <machine/i82489reg.h> +#include <machine/asm.h> +#include <machine/frameasm.h> +#include <machine/segments.h> +#include <machine/trap.h> +#include <machine/intr.h> +#include <machine/psl.h> +#ifdef XEN +#include <machine/xen.h> +#endif + +#include <net/netisr.h> + +#include "ioapic.h" +#include "lapic.h" + +#include "npx.h" +#include "assym.h" + +#define __HAVE_GENERIC_SOFT_INTERRUPTS /* XXX */ + + +/* + * Macros for interrupt entry, call to handler, and exit. + * + * XXX + * The interrupt frame is set up to look like a trap frame. This may be a + * waste. The only handler which needs a frame is the clock handler, and it + * only needs a few bits. Xdoreti() needs a trap frame for handling ASTs, but + * it could easily convert the frame on demand. + * + * The direct costs of setting up a trap frame are two pushl's (error code and + * trap number), an addl to get rid of these, and pushing and popping the + * callee-saved registers %esi, %edi, %ebx, and %ebp twice. + * + * If the interrupt frame is made more flexible, INTR can push %eax first and + * decide the ipending case with less overhead, e.g., by avoiding loading the + * segment registers. 
+ * + */ + +#define MY_COUNT _C_LABEL(uvmexp) + +/* XXX See comment in locore.s */ +#ifdef __ELF__ +#define XINTR(name,num) Xintr_/**/name/**/num +#define XSTRAY(name,num) Xstray_/**/name/**/num +#define XINTR_TSS(irq_num) Xintr_tss_ ## irq_num +#else +#define XINTR(name,num) _Xintr_/**/name/**/num +#define XSTRAY(name,num) _Xstray_/**/name/**/num +#define XINTR_TSS(irq_num) Xintr_tss_/**/irq_num +#endif + +/* + * Store address of TSS in %eax, given a selector in %eax. + * Clobbers %eax, %ecx, %edx, but that's ok for its usage. + * This is a bit complicated, but it's done to make as few + * assumptions as possible about the validity of the environment. + * The GDT and the current and previous TSS are known to be OK, + * otherwise we would not be here. The only other thing that needs + * to be OK is the cpu_info structure for the current CPU. + */ +#define GET_TSS \ + andl $0xfff8,%eax ;\ + addl CPUVAR(GDT),%eax ;\ + movl 2(%eax),%edx ;\ + andl $0xffffff,%edx ;\ + movzbl 7(%eax),%eax ;\ + shl $24,%eax ;\ + orl %edx,%eax + +#if NLAPIC > 0 +#ifdef MULTIPROCESSOR +IDTVEC(recurse_lapic_ipi) + pushfl + pushl %cs + pushl %esi + pushl $0 + pushl $T_ASTFLT + INTRENTRY +IDTVEC(resume_lapic_ipi) + cli + jmp 1f +IDTVEC(intr_lapic_ipi) + pushl $0 + pushl $T_ASTFLT + INTRENTRY + movl $0,_C_LABEL(local_apic)+LAPIC_EOI + movl CPUVAR(ILEVEL),%ebx + cmpl $IPL_IPI,%ebx + jae 2f +1: + incl CPUVAR(IDEPTH) + movl $IPL_IPI,CPUVAR(ILEVEL) + sti + pushl %ebx + call _C_LABEL(x86_ipi_handler) + jmp _C_LABEL(Xdoreti) +2: + orl $(1 << LIR_IPI),CPUVAR(IPENDING) + sti + INTRFASTEXIT + +#if defined(DDB) +IDTVEC(intrddbipi) +1: + str %ax + GET_TSS + movzwl (%eax),%eax + GET_TSS + pushl %eax + movl $0xff,_C_LABEL(lapic_tpr) + movl $0,_C_LABEL(local_apic)+LAPIC_EOI + sti + call _C_LABEL(ddb_ipi_tss) + addl $4,%esp + movl $0,_C_LABEL(lapic_tpr) + iret + jmp 1b +#endif /* DDB */ +#endif /* MULTIPROCESSOR */ + + /* + * Interrupt from the local APIC timer. 
	 */
IDTVEC(recurse_lapic_ltimer)
	pushfl
	pushl	%cs
	pushl	%esi
	pushl	$0
	pushl	$T_ASTFLT
	INTRENTRY
IDTVEC(resume_lapic_ltimer)
	cli
	jmp	1f
IDTVEC(intr_lapic_ltimer)
	pushl	$0
	pushl	$T_ASTFLT
	INTRENTRY
	movl	$0,_C_LABEL(local_apic)+LAPIC_EOI	/* ack the lapic */
	movl	CPUVAR(ILEVEL),%ebx
	cmpl	$IPL_CLOCK,%ebx
	jae	2f			/* masked at current IPL: defer */
1:
	incl	CPUVAR(IDEPTH)
	movl	$IPL_CLOCK,CPUVAR(ILEVEL)
	sti
	pushl	%ebx			/* saved IPL, consumed by Xdoreti */
	pushl	$0			/* dummy frame arg */
	call	_C_LABEL(lapic_clockintr)
	addl	$4,%esp
	jmp	_C_LABEL(Xdoreti)
2:
	orl	$(1 << LIR_TIMER),CPUVAR(IPENDING)
	sti
	INTRFASTEXIT
#endif /* NLAPIC > 0 */

/* Big-lock helpers: interrupts take the kernel lock on MP. */
#ifdef MULTIPROCESSOR
#define LOCK_KERNEL	pushl %esp ; call _C_LABEL(x86_intlock) ; addl $4,%esp
#define UNLOCK_KERNEL	pushl %esp ; call _C_LABEL(x86_intunlock) ; addl $4,%esp
#else
#define LOCK_KERNEL
#define UNLOCK_KERNEL
#endif

/* Placeholder argument for the *_ack/mask/unmask macro slots below. */
#define voidop(num)

/*
 * XENINTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask)
 *
 * Generate the recurse/resume/intr entry stubs for Xen event channel
 * <num>.  Same shape as the generic INTRSTUB below, but uses STI/CLI/
 * STIC (hypervisor event-mask primitives) instead of hardware sti/cli,
 * and re-runs stipending() on exit to pick up events that arrived while
 * masked.  Note the local label `6' is defined twice: once for the
 * handler loop and once on the hold-pending exit path; the stray path
 * at `9' deliberately joins the *second* one via `jmp 6b'.
 */
#define	XENINTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
IDTVEC(recurse_/**/name/**/num)					;\
	pushfl							;\
	pushl	%cs						;\
	pushl	%esi						;\
	subl	$4,%esp						;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
IDTVEC(resume_/**/name/**/num)					\
	/*movl	%esp,%ecx*/					;\
	movl	$IREENT_MAGIC,TF_ERR(%esp)			;\
	movl	%ebx,%esi					;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	jmp	1f						;\
IDTVEC(intr_/**/name/**/num)					;\
	pushl	$0		/* dummy error code */		;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
	/*movl	%esp,%ecx*/					;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	mask(num)		/* mask it in hardware */	;\
	early_ack(num)		/* and allow other intrs */	;\
	testl	%ebp,%ebp					;\
	jz	9f		/* stray */			;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	movl	CPUVAR(ILEVEL),%esi				;\
	cmpl	%ebx,%esi					;\
	jae	10f		/* currently masked; hold it */	;\
	incl	MY_COUNT+V_INTR	/* statistical info */		;\
	addl	$1,IS_EVCNTLO(%ebp)	/* inc event counter */	;\
	adcl	$0,IS_EVCNTHI(%ebp)				;\
1:								\
	pushl	%esi						;\
	movl	%ebx,CPUVAR(ILEVEL)				;\
	STI(%eax)						;\
	incl	CPUVAR(IDEPTH)					;\
	movl	IS_HANDLERS(%ebp),%ebx				;\
	LOCK_KERNEL						;\
6:								\
	movl	IH_LEVEL(%ebx),%edi				;\
	cmpl	%esi,%edi					;\
	jle	7f						;\
	pushl	%esp						;\
	pushl	IH_ARG(%ebx)					;\
	movl	%edi,CPUVAR(ILEVEL)				;\
	call	*IH_FUN(%ebx)	/* call it */			;\
	addl	$8,%esp		/* toss the arg */		;\
	movl	IH_NEXT(%ebx),%ebx	/* next handler in chain */	;\
	testl	%ebx,%ebx					;\
	jnz	6b						;\
5:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	unmask(num)		/* unmask it in hardware */	;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
7:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
10:								\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
6: ;								\
	late_ack(num)						;\
	STIC(%eax)						;\
	jz	4f						;\
	call	_C_LABEL(stipending)				;\
	testl	%eax,%eax					;\
	jnz	1b						;\
4:	INTRFASTEXIT						;\
9:								\
	unmask(num)						;\
	jmp	6b

/*
 * Clear the event-channel mask bit for <num> in the shared-info page,
 * translating the IRQ number through irq_to_evtchn[] first.
 */
#define hypervisor_asm_unmask(num)			\
	movl	irq_to_evtchn + (num) * 4,%ecx		;\
	movl	HYPERVISOR_shared_info,%eax		;\
	lock						;\
	btrl	%ecx,EVENTS_MASK(%eax)

XENINTRSTUB(xenev,0,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,1,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,2,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,3,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,4,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,5,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,6,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,7,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,8,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,9,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,10,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,11,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,12,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,13,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,14,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,15,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,16,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,17,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,18,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,19,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,20,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,21,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,22,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,23,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,24,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,25,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,26,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,27,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,28,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,29,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,30,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,31,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)

/*
 * Dispatch table consumed by the C interrupt code: three words per
 * event channel (intr, recurse, resume entry points), in channel order.
 */
.globl _C_LABEL(xenev_stubs)
_C_LABEL(xenev_stubs):
	.long _C_LABEL(Xintr_xenev0), _C_LABEL(Xrecurse_xenev0)
	.long _C_LABEL(Xresume_xenev0)
	.long _C_LABEL(Xintr_xenev1), _C_LABEL(Xrecurse_xenev1)
	.long _C_LABEL(Xresume_xenev1)
	.long _C_LABEL(Xintr_xenev2), _C_LABEL(Xrecurse_xenev2)
	.long _C_LABEL(Xresume_xenev2)
	.long _C_LABEL(Xintr_xenev3), _C_LABEL(Xrecurse_xenev3)
	.long _C_LABEL(Xresume_xenev3)
	.long _C_LABEL(Xintr_xenev4), _C_LABEL(Xrecurse_xenev4)
	.long _C_LABEL(Xresume_xenev4)
	.long _C_LABEL(Xintr_xenev5), _C_LABEL(Xrecurse_xenev5)
	.long _C_LABEL(Xresume_xenev5)
	.long _C_LABEL(Xintr_xenev6), _C_LABEL(Xrecurse_xenev6)
	.long _C_LABEL(Xresume_xenev6)
	.long _C_LABEL(Xintr_xenev7), _C_LABEL(Xrecurse_xenev7)
	.long _C_LABEL(Xresume_xenev7)
	.long _C_LABEL(Xintr_xenev8), _C_LABEL(Xrecurse_xenev8)
	.long _C_LABEL(Xresume_xenev8)
	.long _C_LABEL(Xintr_xenev9), _C_LABEL(Xrecurse_xenev9)
	.long _C_LABEL(Xresume_xenev9)
	.long _C_LABEL(Xintr_xenev10), _C_LABEL(Xrecurse_xenev10)
	.long _C_LABEL(Xresume_xenev10)
	.long _C_LABEL(Xintr_xenev11), _C_LABEL(Xrecurse_xenev11)
	.long _C_LABEL(Xresume_xenev11)
	.long _C_LABEL(Xintr_xenev12), _C_LABEL(Xrecurse_xenev12)
	.long _C_LABEL(Xresume_xenev12)
	.long _C_LABEL(Xintr_xenev13), _C_LABEL(Xrecurse_xenev13)
	.long _C_LABEL(Xresume_xenev13)
	.long _C_LABEL(Xintr_xenev14), _C_LABEL(Xrecurse_xenev14)
	.long _C_LABEL(Xresume_xenev14)
	.long _C_LABEL(Xintr_xenev15), _C_LABEL(Xrecurse_xenev15)
	.long _C_LABEL(Xresume_xenev15)
	.long _C_LABEL(Xintr_xenev16), _C_LABEL(Xrecurse_xenev16)
	.long _C_LABEL(Xresume_xenev16)
	.long _C_LABEL(Xintr_xenev17), _C_LABEL(Xrecurse_xenev17)
	.long _C_LABEL(Xresume_xenev17)
	.long _C_LABEL(Xintr_xenev18), _C_LABEL(Xrecurse_xenev18)
	.long _C_LABEL(Xresume_xenev18)
	.long _C_LABEL(Xintr_xenev19), _C_LABEL(Xrecurse_xenev19)
	.long _C_LABEL(Xresume_xenev19)
	.long _C_LABEL(Xintr_xenev20), _C_LABEL(Xrecurse_xenev20)
	.long _C_LABEL(Xresume_xenev20)
	.long _C_LABEL(Xintr_xenev21), _C_LABEL(Xrecurse_xenev21)
	.long _C_LABEL(Xresume_xenev21)
	.long _C_LABEL(Xintr_xenev22), _C_LABEL(Xrecurse_xenev22)
	.long _C_LABEL(Xresume_xenev22)
	.long _C_LABEL(Xintr_xenev23), _C_LABEL(Xrecurse_xenev23)
	.long _C_LABEL(Xresume_xenev23)
	.long _C_LABEL(Xintr_xenev24), _C_LABEL(Xrecurse_xenev24)
	.long _C_LABEL(Xresume_xenev24)
	.long _C_LABEL(Xintr_xenev25), _C_LABEL(Xrecurse_xenev25)
	.long _C_LABEL(Xresume_xenev25)
	.long _C_LABEL(Xintr_xenev26), _C_LABEL(Xrecurse_xenev26)
	.long _C_LABEL(Xresume_xenev26)
	.long _C_LABEL(Xintr_xenev27), _C_LABEL(Xrecurse_xenev27)
	.long _C_LABEL(Xresume_xenev27)
	.long _C_LABEL(Xintr_xenev28), _C_LABEL(Xrecurse_xenev28)
	.long _C_LABEL(Xresume_xenev28)
	.long _C_LABEL(Xintr_xenev29), _C_LABEL(Xrecurse_xenev29)
	.long _C_LABEL(Xresume_xenev29)
	.long _C_LABEL(Xintr_xenev30), _C_LABEL(Xrecurse_xenev30)
	.long _C_LABEL(Xresume_xenev30)
	.long _C_LABEL(Xintr_xenev31), _C_LABEL(Xrecurse_xenev31)
	.long _C_LABEL(Xresume_xenev31)

#ifndef XEN
/*
 * This macro defines the generic stub code. Its arguments modify it
 * for specific PICs (i8259, I/O APIC): the *_ack/mask/unmask/level_mask
 * slots are macros expanded at the marked points.
 */

#define	INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
IDTVEC(recurse_/**/name/**/num)					;\
	pushfl							;\
	pushl	%cs						;\
	pushl	%esi						;\
	subl	$4,%esp						;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
IDTVEC(resume_/**/name/**/num)					\
	movl	$IREENT_MAGIC,TF_ERR(%esp)			;\
	movl	%ebx,%esi					;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	jmp	1f						;\
IDTVEC(intr_/**/name/**/num)					;\
	pushl	$0		/* dummy error code */		;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	mask(num)		/* mask it in hardware */	;\
	early_ack(num)		/* and allow other intrs */	;\
	testl	%ebp,%ebp					;\
	jz	9f		/* stray */			;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	movl	CPUVAR(ILEVEL),%esi				;\
	cmpl	%ebx,%esi					;\
	jae	10f		/* currently masked; hold it */	;\
	incl	MY_COUNT+V_INTR	/* statistical info */		;\
	addl	$1,IS_EVCNTLO(%ebp)	/* inc event counter */	;\
	adcl	$0,IS_EVCNTHI(%ebp)				;\
1:								\
	pushl	%esi						;\
	movl	%ebx,CPUVAR(ILEVEL)				;\
	STI(%eax)						;\
	incl	CPUVAR(IDEPTH)					;\
	movl	IS_HANDLERS(%ebp),%ebx				;\
	LOCK_KERNEL						;\
6:								\
	movl	IH_LEVEL(%ebx),%edi				;\
	cmpl	%esi,%edi					;\
	jle	7f						;\
	pushl	IH_ARG(%ebx)					;\
	movl	%edi,CPUVAR(ILEVEL)				;\
	call	*IH_FUN(%ebx)	/* call it */			;\
	addl	$4,%esp		/* toss the arg */		;\
	movl	IH_NEXT(%ebx),%ebx	/* next handler in chain */	;\
	testl	%ebx,%ebx					;\
	jnz	6b						;\
5:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	unmask(num)		/* unmask it in hardware */	;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
7:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
10:								\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
	late_ack(num)						;\
	STIC(%eax)						;\
	jz	4f						;\
	call	_C_LABEL(stipending)				;\
	testl	%eax,%eax					;\
	jnz	1b						;\
4:	INTRFASTEXIT						;\
9:								\
	unmask(num)						;\
	late_ack(num)						;\
	STIC(%eax)						;\
	jz	4f						;\
	call	_C_LABEL(stipending)				;\
	testl	%eax,%eax					;\
	jnz	1b						;\
4:	INTRFASTEXIT

/* ICUADDR selects which i8259 the i8259_asm_* helper macros poke. */
#define ICUADDR IO_ICU1

INTRSTUB(legacy,0,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,1,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,2,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,3,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,4,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,5,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,6,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,7,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
#undef ICUADDR
#define ICUADDR IO_ICU2

INTRSTUB(legacy,8,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,9,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,10,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,11,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,12,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,13,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,14,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,15,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
#endif

#if NIOAPIC > 0

/* Edge-triggered I/O APIC pins: late ack only, no mask/unmask. */
INTRSTUB(ioapic_edge,0,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,1,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,2,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,3,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,4,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,5,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,6,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,7,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,8,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,9,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,10,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,11,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,12,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,13,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,14,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,15,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,16,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,17,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,18,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,19,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,20,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,21,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,22,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,23,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,24,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,25,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,26,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,27,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,28,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,29,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,30,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,31,voidop,ioapic_asm_ack,voidop,voidop,voidop)

/* Level-triggered I/O APIC pins: mask while held pending, unmask on exit. */
INTRSTUB(ioapic_level,0,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,1,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,2,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,3,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,4,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,5,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,6,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,7,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,8,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,9,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,10,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,11,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,12,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,13,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,14,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,15,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,16,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,17,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,18,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,19,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,20,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,21,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,22,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,23,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,24,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,25,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,26,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,27,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,28,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,29,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,30,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,31,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)

#endif

#ifndef XEN
/*
 * i8259 dispatch table: three words per legacy IRQ
 * (intr, recurse, resume entry points).
 */
.globl _C_LABEL(i8259_stubs)
_C_LABEL(i8259_stubs):
	.long _C_LABEL(Xintr_legacy0), _C_LABEL(Xrecurse_legacy0)
	.long _C_LABEL(Xresume_legacy0)
	.long _C_LABEL(Xintr_legacy1), _C_LABEL(Xrecurse_legacy1)
	.long _C_LABEL(Xresume_legacy1)
	.long _C_LABEL(Xintr_legacy2), _C_LABEL(Xrecurse_legacy2)
	.long _C_LABEL(Xresume_legacy2)
	.long _C_LABEL(Xintr_legacy3), _C_LABEL(Xrecurse_legacy3)
	.long _C_LABEL(Xresume_legacy3)
	.long _C_LABEL(Xintr_legacy4), _C_LABEL(Xrecurse_legacy4)
	.long _C_LABEL(Xresume_legacy4)
	.long _C_LABEL(Xintr_legacy5), _C_LABEL(Xrecurse_legacy5)
	.long _C_LABEL(Xresume_legacy5)
	.long _C_LABEL(Xintr_legacy6), _C_LABEL(Xrecurse_legacy6)
	.long _C_LABEL(Xresume_legacy6)
	.long _C_LABEL(Xintr_legacy7), _C_LABEL(Xrecurse_legacy7)
	.long _C_LABEL(Xresume_legacy7)
	.long _C_LABEL(Xintr_legacy8), _C_LABEL(Xrecurse_legacy8)
	.long _C_LABEL(Xresume_legacy8)
	.long _C_LABEL(Xintr_legacy9), _C_LABEL(Xrecurse_legacy9)
	.long _C_LABEL(Xresume_legacy9)
	.long _C_LABEL(Xintr_legacy10), _C_LABEL(Xrecurse_legacy10)
	.long _C_LABEL(Xresume_legacy10)
	.long _C_LABEL(Xintr_legacy11), _C_LABEL(Xrecurse_legacy11)
	.long _C_LABEL(Xresume_legacy11)
	.long _C_LABEL(Xintr_legacy12), _C_LABEL(Xrecurse_legacy12)
	.long _C_LABEL(Xresume_legacy12)
	.long _C_LABEL(Xintr_legacy13), _C_LABEL(Xrecurse_legacy13)
	.long _C_LABEL(Xresume_legacy13)
	.long _C_LABEL(Xintr_legacy14), _C_LABEL(Xrecurse_legacy14)
	.long _C_LABEL(Xresume_legacy14)
	.long _C_LABEL(Xintr_legacy15), _C_LABEL(Xrecurse_legacy15)
	.long _C_LABEL(Xresume_legacy15)
#endif

#if NIOAPIC > 0
/* I/O APIC edge-triggered dispatch table, same layout as above. */
.globl _C_LABEL(ioapic_edge_stubs)
_C_LABEL(ioapic_edge_stubs):
	.long _C_LABEL(Xintr_ioapic_edge0), _C_LABEL(Xrecurse_ioapic_edge0)
	.long _C_LABEL(Xresume_ioapic_edge0)
	.long _C_LABEL(Xintr_ioapic_edge1), _C_LABEL(Xrecurse_ioapic_edge1)
	.long _C_LABEL(Xresume_ioapic_edge1)
	.long _C_LABEL(Xintr_ioapic_edge2), _C_LABEL(Xrecurse_ioapic_edge2)
	.long _C_LABEL(Xresume_ioapic_edge2)
	.long _C_LABEL(Xintr_ioapic_edge3), _C_LABEL(Xrecurse_ioapic_edge3)
	.long _C_LABEL(Xresume_ioapic_edge3)
	.long _C_LABEL(Xintr_ioapic_edge4), _C_LABEL(Xrecurse_ioapic_edge4)
	.long _C_LABEL(Xresume_ioapic_edge4)
	.long _C_LABEL(Xintr_ioapic_edge5), _C_LABEL(Xrecurse_ioapic_edge5)
	.long _C_LABEL(Xresume_ioapic_edge5)
	.long _C_LABEL(Xintr_ioapic_edge6), _C_LABEL(Xrecurse_ioapic_edge6)
	.long _C_LABEL(Xresume_ioapic_edge6)
	.long _C_LABEL(Xintr_ioapic_edge7), _C_LABEL(Xrecurse_ioapic_edge7)
	.long _C_LABEL(Xresume_ioapic_edge7)
	.long _C_LABEL(Xintr_ioapic_edge8), _C_LABEL(Xrecurse_ioapic_edge8)
	.long _C_LABEL(Xresume_ioapic_edge8)
	.long _C_LABEL(Xintr_ioapic_edge9), _C_LABEL(Xrecurse_ioapic_edge9)
	.long _C_LABEL(Xresume_ioapic_edge9)
	.long _C_LABEL(Xintr_ioapic_edge10), _C_LABEL(Xrecurse_ioapic_edge10)
	.long _C_LABEL(Xresume_ioapic_edge10)
	.long _C_LABEL(Xintr_ioapic_edge11), _C_LABEL(Xrecurse_ioapic_edge11)
	.long _C_LABEL(Xresume_ioapic_edge11)
	.long _C_LABEL(Xintr_ioapic_edge12), _C_LABEL(Xrecurse_ioapic_edge12)
	.long _C_LABEL(Xresume_ioapic_edge12)
	.long _C_LABEL(Xintr_ioapic_edge13), _C_LABEL(Xrecurse_ioapic_edge13)
	.long _C_LABEL(Xresume_ioapic_edge13)
	.long _C_LABEL(Xintr_ioapic_edge14), _C_LABEL(Xrecurse_ioapic_edge14)
	.long _C_LABEL(Xresume_ioapic_edge14)
	.long _C_LABEL(Xintr_ioapic_edge15), _C_LABEL(Xrecurse_ioapic_edge15)
	.long _C_LABEL(Xresume_ioapic_edge15)
	.long _C_LABEL(Xintr_ioapic_edge16), _C_LABEL(Xrecurse_ioapic_edge16)
	.long _C_LABEL(Xresume_ioapic_edge16)
	.long _C_LABEL(Xintr_ioapic_edge17), _C_LABEL(Xrecurse_ioapic_edge17)
	.long _C_LABEL(Xresume_ioapic_edge17)
	.long _C_LABEL(Xintr_ioapic_edge18), _C_LABEL(Xrecurse_ioapic_edge18)
	.long _C_LABEL(Xresume_ioapic_edge18)
	.long _C_LABEL(Xintr_ioapic_edge19), _C_LABEL(Xrecurse_ioapic_edge19)
	.long _C_LABEL(Xresume_ioapic_edge19)
	.long _C_LABEL(Xintr_ioapic_edge20), _C_LABEL(Xrecurse_ioapic_edge20)
	.long _C_LABEL(Xresume_ioapic_edge20)
	.long _C_LABEL(Xintr_ioapic_edge21), _C_LABEL(Xrecurse_ioapic_edge21)
	.long _C_LABEL(Xresume_ioapic_edge21)
	.long _C_LABEL(Xintr_ioapic_edge22), _C_LABEL(Xrecurse_ioapic_edge22)
	.long _C_LABEL(Xresume_ioapic_edge22)
	.long _C_LABEL(Xintr_ioapic_edge23), _C_LABEL(Xrecurse_ioapic_edge23)
	.long _C_LABEL(Xresume_ioapic_edge23)
	.long _C_LABEL(Xintr_ioapic_edge24), _C_LABEL(Xrecurse_ioapic_edge24)
	.long _C_LABEL(Xresume_ioapic_edge24)
	.long _C_LABEL(Xintr_ioapic_edge25), _C_LABEL(Xrecurse_ioapic_edge25)
	.long _C_LABEL(Xresume_ioapic_edge25)
	.long _C_LABEL(Xintr_ioapic_edge26), _C_LABEL(Xrecurse_ioapic_edge26)
	.long _C_LABEL(Xresume_ioapic_edge26)
	.long _C_LABEL(Xintr_ioapic_edge27), _C_LABEL(Xrecurse_ioapic_edge27)
	.long _C_LABEL(Xresume_ioapic_edge27)
	.long _C_LABEL(Xintr_ioapic_edge28), _C_LABEL(Xrecurse_ioapic_edge28)
	.long _C_LABEL(Xresume_ioapic_edge28)
	.long _C_LABEL(Xintr_ioapic_edge29), _C_LABEL(Xrecurse_ioapic_edge29)
	.long _C_LABEL(Xresume_ioapic_edge29)
	.long _C_LABEL(Xintr_ioapic_edge30), _C_LABEL(Xrecurse_ioapic_edge30)
	.long _C_LABEL(Xresume_ioapic_edge30)
	.long _C_LABEL(Xintr_ioapic_edge31), _C_LABEL(Xrecurse_ioapic_edge31)
	.long _C_LABEL(Xresume_ioapic_edge31)

/* I/O APIC level-triggered dispatch table, same layout as above. */
.globl _C_LABEL(ioapic_level_stubs)
_C_LABEL(ioapic_level_stubs):
	.long _C_LABEL(Xintr_ioapic_level0), _C_LABEL(Xrecurse_ioapic_level0)
	.long _C_LABEL(Xresume_ioapic_level0)
	.long _C_LABEL(Xintr_ioapic_level1), _C_LABEL(Xrecurse_ioapic_level1)
	.long _C_LABEL(Xresume_ioapic_level1)
	.long _C_LABEL(Xintr_ioapic_level2), _C_LABEL(Xrecurse_ioapic_level2)
	.long _C_LABEL(Xresume_ioapic_level2)
	.long _C_LABEL(Xintr_ioapic_level3), _C_LABEL(Xrecurse_ioapic_level3)
	.long _C_LABEL(Xresume_ioapic_level3)
	.long _C_LABEL(Xintr_ioapic_level4), _C_LABEL(Xrecurse_ioapic_level4)
	.long _C_LABEL(Xresume_ioapic_level4)
	.long _C_LABEL(Xintr_ioapic_level5), _C_LABEL(Xrecurse_ioapic_level5)
	.long _C_LABEL(Xresume_ioapic_level5)
	.long _C_LABEL(Xintr_ioapic_level6), _C_LABEL(Xrecurse_ioapic_level6)
	.long _C_LABEL(Xresume_ioapic_level6)
	.long _C_LABEL(Xintr_ioapic_level7), _C_LABEL(Xrecurse_ioapic_level7)
	.long _C_LABEL(Xresume_ioapic_level7)
	.long _C_LABEL(Xintr_ioapic_level8), _C_LABEL(Xrecurse_ioapic_level8)
	.long _C_LABEL(Xresume_ioapic_level8)
	.long _C_LABEL(Xintr_ioapic_level9), _C_LABEL(Xrecurse_ioapic_level9)
	.long _C_LABEL(Xresume_ioapic_level9)
	.long _C_LABEL(Xintr_ioapic_level10), _C_LABEL(Xrecurse_ioapic_level10)
	.long _C_LABEL(Xresume_ioapic_level10)
	.long _C_LABEL(Xintr_ioapic_level11), _C_LABEL(Xrecurse_ioapic_level11)
	.long _C_LABEL(Xresume_ioapic_level11)
	.long _C_LABEL(Xintr_ioapic_level12), _C_LABEL(Xrecurse_ioapic_level12)
	.long _C_LABEL(Xresume_ioapic_level12)
	.long _C_LABEL(Xintr_ioapic_level13), _C_LABEL(Xrecurse_ioapic_level13)
	.long _C_LABEL(Xresume_ioapic_level13)
	.long _C_LABEL(Xintr_ioapic_level14), _C_LABEL(Xrecurse_ioapic_level14)
	.long _C_LABEL(Xresume_ioapic_level14)
	.long _C_LABEL(Xintr_ioapic_level15), _C_LABEL(Xrecurse_ioapic_level15)
	.long _C_LABEL(Xresume_ioapic_level15)
	.long _C_LABEL(Xintr_ioapic_level16), _C_LABEL(Xrecurse_ioapic_level16)
	.long _C_LABEL(Xresume_ioapic_level16)
	.long _C_LABEL(Xintr_ioapic_level17), _C_LABEL(Xrecurse_ioapic_level17)
	.long _C_LABEL(Xresume_ioapic_level17)
	.long _C_LABEL(Xintr_ioapic_level18), _C_LABEL(Xrecurse_ioapic_level18)
	.long _C_LABEL(Xresume_ioapic_level18)
	.long _C_LABEL(Xintr_ioapic_level19), _C_LABEL(Xrecurse_ioapic_level19)
	.long _C_LABEL(Xresume_ioapic_level19)
	.long _C_LABEL(Xintr_ioapic_level20), _C_LABEL(Xrecurse_ioapic_level20)
	.long _C_LABEL(Xresume_ioapic_level20)
	.long _C_LABEL(Xintr_ioapic_level21), _C_LABEL(Xrecurse_ioapic_level21)
	.long _C_LABEL(Xresume_ioapic_level21)
	.long _C_LABEL(Xintr_ioapic_level22), _C_LABEL(Xrecurse_ioapic_level22)
	.long _C_LABEL(Xresume_ioapic_level22)
	.long _C_LABEL(Xintr_ioapic_level23), _C_LABEL(Xrecurse_ioapic_level23)
	.long _C_LABEL(Xresume_ioapic_level23)
	.long _C_LABEL(Xintr_ioapic_level24), _C_LABEL(Xrecurse_ioapic_level24)
	.long _C_LABEL(Xresume_ioapic_level24)
	.long _C_LABEL(Xintr_ioapic_level25), _C_LABEL(Xrecurse_ioapic_level25)
	.long _C_LABEL(Xresume_ioapic_level25)
	.long _C_LABEL(Xintr_ioapic_level26), _C_LABEL(Xrecurse_ioapic_level26)
	.long _C_LABEL(Xresume_ioapic_level26)
	.long _C_LABEL(Xintr_ioapic_level27), _C_LABEL(Xrecurse_ioapic_level27)
	.long _C_LABEL(Xresume_ioapic_level27)
	.long _C_LABEL(Xintr_ioapic_level28), _C_LABEL(Xrecurse_ioapic_level28)
	.long _C_LABEL(Xresume_ioapic_level28)
	.long _C_LABEL(Xintr_ioapic_level29), _C_LABEL(Xrecurse_ioapic_level29)
	.long _C_LABEL(Xresume_ioapic_level29)
	.long _C_LABEL(Xintr_ioapic_level30), _C_LABEL(Xrecurse_ioapic_level30)
	.long _C_LABEL(Xresume_ioapic_level30)
	.long _C_LABEL(Xintr_ioapic_level31), _C_LABEL(Xrecurse_ioapic_level31)
	.long _C_LABEL(Xresume_ioapic_level31)
#endif

/*
 * Symbols that vmstat -i wants, even though they're not used.
 */
.globl _C_LABEL(intrnames)
_C_LABEL(intrnames):
.globl _C_LABEL(eintrnames)
_C_LABEL(eintrnames):

.globl _C_LABEL(intrcnt)
_C_LABEL(intrcnt):
.globl _C_LABEL(eintrcnt)
_C_LABEL(eintrcnt):

/*
 * Soft interrupt handlers.  Each raises ILEVEL, bumps its event
 * counter, dispatches via softintr_dispatch(), then returns through
 * the address in %esi (presumably placed there by the spl/Xdoreti
 * code that invoked the stub -- confirm against Xdoreti).
 */

IDTVEC(softserial)
	movl	$IPL_SOFTSERIAL, CPUVAR(ILEVEL)
	incl	CPUVAR(IDEPTH)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintlock)
#endif
	movl	CPUVAR(ISOURCES) + SIR_SERIAL * 4, %edi
	addl	$1,IS_EVCNTLO(%edi)	/* 64-bit event counter bump */
	adcl	$0,IS_EVCNTHI(%edi)
	pushl	$X86_SOFTINTR_SOFTSERIAL
	call	_C_LABEL(softintr_dispatch)
	addl	$4,%esp
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintunlock)
#endif
	decl	CPUVAR(IDEPTH)
	jmp	*%esi

IDTVEC(softnet)
	movl	$IPL_SOFTNET, CPUVAR(ILEVEL)
	incl	CPUVAR(IDEPTH)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintlock)
#endif
	movl	CPUVAR(ISOURCES) + SIR_NET * 4, %edi
	addl	$1,IS_EVCNTLO(%edi)
	adcl	$0,IS_EVCNTHI(%edi)

	xorl	%edi,%edi
	xchgl	_C_LABEL(netisr),%edi	/* atomically grab+clear netisr bits */

	/* XXX Do the legacy netisrs here for now.
	 */
/* Call handler c when bit s is set in the netisr word held in %edi. */
#define DONETISR(s, c) \
	.globl	_C_LABEL(c)	;\
	testl	$(1 << s),%edi	;\
	jz	1f		;\
	call	_C_LABEL(c)	;\
1:
#include <net/netisr_dispatch.h>

	pushl	$X86_SOFTINTR_SOFTNET
	call	_C_LABEL(softintr_dispatch)
	addl	$4,%esp
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintunlock)
#endif
	decl	CPUVAR(IDEPTH)
	jmp	*%esi

IDTVEC(softclock)
	movl	$IPL_SOFTCLOCK, CPUVAR(ILEVEL)
	incl	CPUVAR(IDEPTH)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintlock)
#endif
	movl	CPUVAR(ISOURCES) + SIR_CLOCK * 4, %edi
	addl	$1,IS_EVCNTLO(%edi)	/* 64-bit event counter bump */
	adcl	$0,IS_EVCNTHI(%edi)

	pushl	$X86_SOFTINTR_SOFTCLOCK
	call	_C_LABEL(softintr_dispatch)
	addl	$4,%esp
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintunlock)
#endif
	decl	CPUVAR(IDEPTH)
	jmp	*%esi

/*
 * Trap and fault vector routines
 *
 * On exit from the kernel to user mode, we always need to check for ASTs. In
 * addition, we need to do this atomically; otherwise an interrupt may occur
 * which causes an AST, but it won't get processed until the next kernel entry
 * (possibly the next clock tick). Thus, we disable interrupt before checking,
 * and only enable them again on the final `iret' or before calling the AST
 * handler.
 */

/* Build a trap-frame entry: TRAP pushes the trap number (error code
 * already pushed by the CPU); ZTRAP pushes a zero error code first. */
#define	TRAP(a)		pushl $(a) ; jmp _C_LABEL(alltraps)
#define	ZTRAP(a)	pushl $0 ; TRAP(a)

#ifdef IPKDB
#define	BPTTRAP(a)	pushl $0; pushl $(a); jmp _C_LABEL(bpttraps)
#else
#define	BPTTRAP(a)	ZTRAP(a)
#endif


	.text
IDTVEC(trap00)
	ZTRAP(T_DIVIDE)
IDTVEC(trap01)
	BPTTRAP(T_TRCTRAP)
IDTVEC(trap02)
	ZTRAP(T_NMI)
IDTVEC(trap03)
	BPTTRAP(T_BPTFLT)
IDTVEC(trap04)
	ZTRAP(T_OFLOW)
IDTVEC(trap05)
	ZTRAP(T_BOUND)
IDTVEC(trap06)
	ZTRAP(T_PRIVINFLT)
IDTVEC(trap07)
#if NNPX > 0
	/* Device-not-available: hand off to the FPU switch hook. */
	pushl	$0			# dummy error code
	pushl	$T_DNA
	INTRENTRY
#ifdef XENDEBUG_LOW
	pushl	%esp
#endif
	pushl	CPUVAR(SELF)
	call	*_C_LABEL(npxdna_func)
	addl	$4,%esp
#ifdef XENDEBUG_LOW
	addl	$4,%esp
#endif
	testl	%eax,%eax
	jz	calltrap		/* hook declined: treat as trap */
	INTRFASTEXIT
#else
	ZTRAP(T_DNA)
#endif
IDTVEC(trap08)
	TRAP(T_DOUBLEFLT)
IDTVEC(trap09)
	ZTRAP(T_FPOPFLT)
IDTVEC(trap0a)
	TRAP(T_TSSFLT)
IDTVEC(trap0b)
	TRAP(T_SEGNPFLT)
IDTVEC(trap0c)
	TRAP(T_STKFLT)
IDTVEC(trap0d)
	TRAP(T_PROTFLT)
#ifndef XEN
IDTVEC(trap0e)
#ifndef I586_CPU
	TRAP(T_PAGEFLT)
#else
	/*
	 * Pentium "F00F" workaround: a kernel page fault on IDT entry 6
	 * (offset 6*8 from pentium_idt) is converted into a
	 * privileged-instruction fault before calling trap().
	 */
	pushl	$T_PAGEFLT
	INTRENTRY
	testb	$PGEX_U,TF_ERR(%esp)
	jnz	calltrap
	movl	%cr2,%eax
	subl	_C_LABEL(pentium_idt),%eax
	cmpl	$(6*8),%eax
	jne	calltrap
	movb	$T_PRIVINFLT,TF_TRAPNO(%esp)
	jmp	calltrap
#endif
#endif

IDTVEC(intrspurious)
IDTVEC(trap0f)
	/*
	 * The Pentium Pro local APIC may erroneously call this vector for a
	 * default IR7.  Just ignore it.
	 *
	 * (The local APIC does this when CPL is raised while it's on the
	 * way to delivering an interrupt.. presumably enough has been set
	 * up that it's inconvenient to abort delivery completely..)
	 */
	iret

IDTVEC(trap10)
#if NNPX > 0
	/*
	 * Handle like an interrupt so that we can call npxintr to clear the
	 * error. It would be better to handle npx interrupts as traps but
	 * this is difficult for nested interrupts.
	 */
	pushl	$0			# dummy error code
	pushl	$T_ASTFLT
	INTRENTRY
	pushl	CPUVAR(ILEVEL)
	pushl	%esp
	incl	_C_LABEL(uvmexp)+V_TRAP
	call	_C_LABEL(npxintr)
	addl	$8,%esp
	INTRFASTEXIT
#else
	ZTRAP(T_ARITHTRAP)
#endif
IDTVEC(trap11)
	TRAP(T_ALIGNFLT)
/* Vectors 0x12-0x1f share one T_RESERVED entry. */
IDTVEC(trap12)
IDTVEC(trap13)
IDTVEC(trap14)
IDTVEC(trap15)
IDTVEC(trap16)
IDTVEC(trap17)
IDTVEC(trap18)
IDTVEC(trap19)
IDTVEC(trap1a)
IDTVEC(trap1b)
IDTVEC(trap1c)
IDTVEC(trap1d)
IDTVEC(trap1e)
IDTVEC(trap1f)
	/* 18 - 31 reserved for future exp */
	ZTRAP(T_RESERVED)

/* Exception entry-point table handed to the hypervisor/IDT setup code. */
IDTVEC(exceptions)
#ifndef XENDEBUG_LOW
	.long	_C_LABEL(Xtrap00), _C_LABEL(Xtrap01)
	.long	_C_LABEL(Xtrap02), _C_LABEL(Xtrap03)
	.long	_C_LABEL(Xtrap04), _C_LABEL(Xtrap05)
	.long	_C_LABEL(Xtrap06), _C_LABEL(Xtrap07)
	.long	_C_LABEL(Xtrap08), _C_LABEL(Xtrap09)
	.long	_C_LABEL(Xtrap0a), _C_LABEL(Xtrap0b)
	.long	_C_LABEL(Xtrap0c), _C_LABEL(Xtrap0d)
	.long	_C_LABEL(Xtrap0e), _C_LABEL(Xtrap0f)
	.long	_C_LABEL(Xtrap10), _C_LABEL(Xtrap11)
	.long	_C_LABEL(Xtrap12), _C_LABEL(Xtrap13)
	.long	_C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
	.long	_C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
	.long	_C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
	.long	_C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
	.long	_C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
	.long	_C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
#else
	/* Debug build: mostly Linux-style low-level handlers instead. */
	.long	_C_LABEL(divide_error), _C_LABEL(debug)
	.long	_C_LABEL(Xtrap02), _C_LABEL(Xtrap03) #int3)
	.long	_C_LABEL(overflow), _C_LABEL(bounds)
	.long	_C_LABEL(invalid_op), _C_LABEL(device_not_available)
	.long	_C_LABEL(double_fault), _C_LABEL(coprocessor_segment_overrun)
	.long	_C_LABEL(invalid_TSS), _C_LABEL(segment_not_present)
	.long	_C_LABEL(stack_segment)
	#.long	_C_LABEL(general_protection)
	.long	_C_LABEL(Xtrap0d)
	#.long	_C_LABEL(page_fault)
	.long	_C_LABEL(Xtrap0e)
	.long	_C_LABEL(spurious_interrupt_bug)
	.long	_C_LABEL(coprocessor_error), _C_LABEL(alignment_check)
	.long	_C_LABEL(machine_check), _C_LABEL(simd_coprocessor_error)
	.long	_C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
	.long	_C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
	.long	_C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
	.long	_C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
	.long	_C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
	.long	_C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
#endif


/*
 * Double fault via a task gate: walk the previous-task link (two
 * GET_TSS hops) and pass that TSS to trap_tss().
 */
IDTVEC(tss_trap08)
1:
	str	%ax
	GET_TSS
	movzwl	(%eax),%eax
	GET_TSS
	pushl	$T_DOUBLEFLT
	pushl	%eax
	call	_C_LABEL(trap_tss)
	addl	$12,%esp
	iret
	jmp	1b

/*
 * Common trap entry: build the frame, call trap(), then loop on the
 * AST / deferred-pmap-switch checks until the return to user mode is
 * clean.  Under DIAGNOSTIC the saved IPL is compared on exit and a
 * warning printed if a handler forgot to lower spl.
 */
/* LINTSTUB: Ignore */
NENTRY(alltraps)
	INTRENTRY
calltrap:
#ifdef DIAGNOSTIC
	movl	CPUVAR(ILEVEL),%ebx
#endif /* DIAGNOSTIC */
	pushl	%esp
	call	_C_LABEL(trap)
	addl	$4,%esp
	testb	$CHK_UPL,TF_CS(%esp)
	jnz	alltraps_checkast
#ifdef VM86
	testl	$PSL_VM,TF_EFLAGS(%esp)
	jz	6f
#else
	jmp	6f
#endif
alltraps_checkast:
	/* Check for ASTs on exit to user mode. */
	CLI(%eax)
	CHECK_ASTPENDING(%eax)
	jz	3f
5:	CLEAR_ASTPENDING(%eax)
	STI(%eax)
	movl	$T_ASTFLT,TF_TRAPNO(%esp)
	pushl	%esp
	call	_C_LABEL(trap)
	addl	$4,%esp
	jmp	alltraps_checkast	/* re-check ASTs */
3:	CHECK_DEFERRED_SWITCH(%eax)
	jnz	9f
6:	STIC(%eax)
	jz	4f
	call	_C_LABEL(stipending)
	#testl	%eax,%eax	/* XXXcl */
	#jnz	1b
4:
#ifndef DIAGNOSTIC
	INTRFASTEXIT
#else
	cmpl	CPUVAR(ILEVEL),%ebx
	jne	3f
	INTRFASTEXIT
3:	pushl	$4f
	call	_C_LABEL(printf)
	addl	$4,%esp
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	%ebx,CPUVAR(ILEVEL)
	jmp	alltraps_checkast	/* re-check ASTs */
4:	.asciz	"WARNING: SPL NOT LOWERED ON TRAP EXIT\n"
#endif /* DIAGNOSTIC */
9:	STI(%eax)
	call	_C_LABEL(pmap_load)
	jmp	alltraps_checkast	/* re-check ASTs */

/*
 * Xen page-fault entry: the trap number slot initially carries the
 * faulting address (stored there by the hypervisor entry path); it is
 * moved to %eax, replaced with T_PAGEFLT, and pushed as an extra
 * argument ahead of the frame pointer for trap().
 * NOTE(review): continues past the end of this chunk.
 */
/* LINTSTUB: Ignore */
IDTVEC(trap0e)
	INTRENTRY
	movl	TF_TRAPNO(%esp),%eax
	movl	$T_PAGEFLT,TF_TRAPNO(%esp)
#ifdef DIAGNOSTIC
	movl	CPUVAR(ILEVEL),%ebx
#endif /* DIAGNOSTIC */
	#pushl	%esp
	pushl	%eax
	movl	%esp,%eax
	addl	$4,%eax
	pushl	%eax
	call	_C_LABEL(trap)
	addl	$4,%esp
	addl	$4,%esp
	testb	$CHK_UPL,TF_CS(%esp)
	jnz	trap0e_checkast
#ifdef VM86
	testl	$PSL_VM,TF_EFLAGS(%esp)
	jz	6f
#else
	jmp
6f +#endif +trap0e_checkast: + /* Check for ASTs on exit to user mode. */ + CLI(%eax) + CHECK_ASTPENDING(%eax) + jz 3f +5: CLEAR_ASTPENDING(%eax) + STI(%eax) + movl $T_ASTFLT,TF_TRAPNO(%esp) + pushl %esp + call _C_LABEL(trap) + addl $4,%esp + jmp trap0e_checkast /* re-check ASTs */ +3: CHECK_DEFERRED_SWITCH(%eax) + jnz 9f +6: STIC(%eax) + jz 4f + call _C_LABEL(stipending) + #testl %eax,%eax /* XXXcl */ + #jnz 1b +4: +#ifndef DIAGNOSTIC + INTRFASTEXIT +#else + cmpl CPUVAR(ILEVEL),%ebx + jne 3f + INTRFASTEXIT +3: pushl $4f + call _C_LABEL(printf) + addl $4,%esp +#ifdef DDB + int $3 +#endif /* DDB */ + movl %ebx,CPUVAR(ILEVEL) + jmp trap0e_checkast /* re-check ASTs */ +4: .asciz "WARNING: SPL NOT LOWERED ON TRAP EXIT\n" +#endif /* DIAGNOSTIC */ +9: STI(%eax) + call _C_LABEL(pmap_load) + jmp trap0e_checkast /* re-check ASTs */ + +#ifdef IPKDB +/* LINTSTUB: Ignore */ +NENTRY(bpttraps) + INTRENTRY + call _C_LABEL(ipkdb_trap_glue) + testl %eax,%eax + jz calltrap + INTRFASTEXIT + +ipkdbsetup: + popl %ecx + + /* Disable write protection: */ + movl %cr0,%eax + pushl %eax + andl $~CR0_WP,%eax + movl %eax,%cr0 + + /* Substitute Protection & Page Fault handlers: */ + movl _C_LABEL(idt),%edx + pushl 13*8(%edx) + pushl 13*8+4(%edx) + pushl 14*8(%edx) + pushl 14*8+4(%edx) + movl $fault,%eax + movw %ax,13*8(%edx) + movw %ax,14*8(%edx) + shrl $16,%eax + movw %ax,13*8+6(%edx) + movw %ax,14*8+6(%edx) + + pushl %ecx + ret + +ipkdbrestore: + popl %ecx + + /* Restore Protection & Page Fault handlers: */ + movl _C_LABEL(idt),%edx + popl 14*8+4(%edx) + popl 14*8(%edx) + popl 13*8+4(%edx) + popl 13*8(%edx) + + /* Restore write protection: */ + popl %edx + movl %edx,%cr0 + + pushl %ecx + ret +#endif /* IPKDB */ + + +/* + * If an error is detected during trap, syscall, or interrupt exit, trap() will + * change %eip to point to one of these labels. We clean up the stack, if + * necessary, and resume as if we were handling a general protection fault. 
+ * This will cause the process to get a SIGBUS. + */ +/* LINTSTUB: Var: char resume_iret[1]; */ +NENTRY(resume_iret) + ZTRAP(T_PROTFLT) +/* LINTSTUB: Var: char resume_pop_ds[1]; */ +NENTRY(resume_pop_ds) + movl %es,TF_ES(%esp) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%es +/* LINTSTUB: Var: char resume_pop_es[1]; */ +NENTRY(resume_pop_es) + movl %fs,TF_FS(%esp) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%fs +/* LINTSTUB: Var: char resume_pop_fs[1]; */ +NENTRY(resume_pop_fs) + movl %gs,TF_GS(%esp) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%gs +/* LINTSTUB: Var: char resume_pop_gs[1]; */ +NENTRY(resume_pop_gs) + movl $T_PROTFLT,TF_TRAPNO(%esp) + jmp calltrap + +#ifdef IPKDB +/* LINTSTUB: Func: int ipkdbfbyte(u_char *c) */ +NENTRY(ipkdbfbyte) + pushl %ebp + movl %esp,%ebp + call ipkdbsetup + movl 8(%ebp),%edx + movzbl (%edx),%eax +faultexit: + call ipkdbrestore + popl %ebp + ret + +/* LINTSTUB: Func: int ipkdbsbyte(u_char *c, int i) */ +NENTRY(ipkdbsbyte) + pushl %ebp + movl %esp,%ebp + call ipkdbsetup + movl 8(%ebp),%edx + movl 12(%ebp),%eax + movb %al,(%edx) + call ipkdbrestore + popl %ebp + ret + +fault: + popl %eax /* error code */ + movl $faultexit,%eax + movl %eax,(%esp) + movl $-1,%eax + iret +#endif /* IPKDB */ + + + +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until weve done all processing. HOWEVER, we must enable events before +# popping the stack frame (cant be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so wed +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. 
+ENTRY(hypervisor_callback) + pushl $0 # dummy error code + pushl $T_ASTFLT + INTRENTRY + movl TF_EIP(%esp),%eax + cmpl $scrit,%eax + jb 11f + cmpl $ecrit,%eax + jb critical_region_fixup +11: push %esp + call do_hypervisor_callback + add $4,%esp + movl HYPERVISOR_shared_info,%esi + xorl %eax,%eax + movb TF_CS(%esp),%cl + test $CHK_UPL,%cl # slow return to ring 2 or 3 + je safesti + movl CPUVAR(ILEVEL),%ebx + jmp doreti_checkast +safesti:XEN_UNBLOCK_EVENTS(%esi) # reenable event callbacks +scrit: /**** START OF CRITICAL REGION ****/ + testb $1,evtchn_upcall_pending(%esi) + jnz 14f # process more events if necessary... + INTRFASTEXIT +critiret: +14: XEN_BLOCK_EVENTS(%esi) + jmp 11b +ecrit: /**** END OF CRITICAL REGION ****/ +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + cmpl $(critiret-1),%eax # eip points to iret? + jne 1f + movl $(TF_PUSHSIZE+0x8),%eax + jmp 2f +1: xorl %eax,%eax +2: + # %eax contains num bytes popped + mov %esp,%esi + add %eax,%esi # %esi points at end of src region + mov %esp,%edi + add $(TF_PUSHSIZE+0x8+0xC),%edi # %edi points at end of dst region + mov %eax,%ecx + shr $2,%ecx # convert words to bytes + je 16f # skip loop if nothing to copy +15: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 15b +16: movl %edi,%esp # final %edi is top of merged stack + jmp 11b + + +# Hypervisor uses this for application faults while it executes. 
+ENTRY(failsafe_callback) + pop %ds + pop %es + pop %fs + pop %gs + call _C_LABEL(xen_failsafe_handler) + iret + +#ifdef XENDEBUG_LOW + +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + pushl %ds; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + movl $GSEL(GDATA_SEL, SEL_KPL),%edx; \ + movl %edx,%ds; \ + movl %edx,%es; + +#define RESTORE_ALL \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax; \ + popl %ds; \ + popl %es; \ + addl $4,%esp; \ + iret; \ + +ret_from_exception: + movb CS(%esp),%cl + test $2,%cl # slow return to ring 2 or 3 + jne safesti + RESTORE_ALL + + +ENTRY(divide_error) + pushl $0 # no error code + pushl $do_divide_error +do_exception: + pushl %ds + pushl %eax + xorl %eax,%eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + movl %es,%ecx + movl ORIG_EAX(%esp), %esi # get the error code + movl ES(%esp), %edi # get the function address + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl %esp,%edx + pushl %esi # push the error code + pushl %edx # push the pt_regs pointer + movl $(__KERNEL_DS),%edx + movl %edx,%ds + movl %edx,%es + call *%edi + addl $8,%esp + jmp ret_from_exception + +ENTRY(coprocessor_error) + pushl $0 + pushl $do_coprocessor_error + jmp do_exception + +ENTRY(simd_coprocessor_error) + pushl $0 + pushl $do_simd_coprocessor_error + jmp do_exception + +ENTRY(device_not_available) + iret + +ENTRY(debug) + pushl $0 + pushl $do_debug + jmp do_exception + +ENTRY(int3) + pushl $0 + pushl $do_int3 + jmp do_exception + +ENTRY(overflow) + pushl $0 + pushl $do_overflow + jmp do_exception + +ENTRY(bounds) + pushl $0 + pushl $do_bounds + jmp do_exception + +ENTRY(invalid_op) + pushl $0 + pushl $do_invalid_op + jmp do_exception + +ENTRY(coprocessor_segment_overrun) + pushl $0 + pushl $do_coprocessor_segment_overrun + 
jmp do_exception + +ENTRY(double_fault) + pushl $do_double_fault + jmp do_exception + +ENTRY(invalid_TSS) + pushl $do_invalid_TSS + jmp do_exception + +ENTRY(segment_not_present) + pushl $do_segment_not_present + jmp do_exception + +ENTRY(stack_segment) + pushl $do_stack_segment + jmp do_exception + +ENTRY(general_protection) + pushl $do_general_protection + jmp do_exception + +ENTRY(alignment_check) + pushl $do_alignment_check + jmp do_exception + +# This handler is special, because it gets an extra value on its stack, +# which is the linear faulting address. +ENTRY(page_fault) + pushl %ds + pushl %eax + xorl %eax,%eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + movl %es,%ecx + movl ORIG_EAX(%esp), %esi # get the error code + movl ES(%esp), %edi # get the faulting address + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl %esp,%edx + pushl %edi # push the faulting address + pushl %esi # push the error code + pushl %edx # push the pt_regs pointer + movl $(__KERNEL_DS),%edx + movl %edx,%ds + movl %edx,%es + call do_page_fault + addl $12,%esp + jmp ret_from_exception + +ENTRY(machine_check) + pushl $0 + pushl $do_machine_check + jmp do_exception + +ENTRY(spurious_interrupt_bug) + pushl $0 + pushl $do_spurious_interrupt_bug + jmp do_exception +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c new file mode 100644 index 0000000000..d51baba078 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c @@ -0,0 +1,680 @@ +/* $NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $"); + +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mount.h> + +#include <uvm/uvm.h> + +#include <machine/gdt.h> +#include <machine/xenfunc.h> +#include <machine/xenpmap.h> + +/* #define XENDEBUG */ +/* #define XENDEBUG_LOW */ + +#ifdef XENDEBUG +#define XENPRINTF(x) printf x +#define XENPRINTK(x) printk x +#define XENPRINTK2(x) /* printk x */ + +static char XBUF[256]; +#else +#define XENPRINTF(x) +#define XENPRINTK(x) +#define XENPRINTK2(x) +#endif +void printk(char *, ...); +#define PRINTF(x) printf x +#define PRINTK(x) printk x + +shared_info_t *HYPERVISOR_shared_info; +union start_info_union start_info_union; + +void xen_failsafe_handler(void); + +void +xen_failsafe_handler(void) +{ + + panic("xen_failsafe_handler called!\n"); +} + + +void +xen_update_descriptor(union descriptor *table, union descriptor *entry) +{ + paddr_t pa; + pt_entry_t *ptp; + + ptp = kvtopte((vaddr_t)table); + pa = (*ptp & PG_FRAME) | ((vaddr_t)table & ~PG_FRAME); + if (HYPERVISOR_update_descriptor(pa, entry->raw[0], entry->raw[1])) + panic("HYPERVISOR_update_descriptor failed\n"); +} + +void +xen_set_ldt(vaddr_t base, uint32_t entries) +{ + vaddr_t va; + pt_entry_t *ptp, *maptp; + + for (va = base; va < base + entries * sizeof(union descriptor); + va += PAGE_SIZE) { + KASSERT(va >= VM_MIN_KERNEL_ADDRESS); + ptp = kvtopte(va); + maptp = (pt_entry_t *)vtomach((vaddr_t)ptp); + XENPRINTF(("xen_set_ldt %p %d %p %p\n", (void *)base, + entries, ptp, maptp)); + PTE_CLEARBITS(ptp, maptp, PG_RW); + } + PTE_UPDATES_FLUSH(); + + xpq_queue_set_ldt(base, entries); + xpq_flush_queue(); +} + +void +lgdt(struct region_descriptor *rdp) +{ + + panic("lgdt %p %08x\n", (void *)rdp->rd_base, rdp->rd_limit); +} + +void +xen_parse_cmdline(int what, union xen_cmdline_parseinfo *xcp) +{ + char *cmd_line, *opt, *s; + int b, i, ipidx = 0; + uint32_t xi_ip[5]; + + 
cmd_line = xen_start_info.cmd_line; + + switch (what) { + case XEN_PARSE_BOOTDEV: + xcp->xcp_bootdev[0] = 0; + break; + case XEN_PARSE_CONSOLE: + xcp->xcp_console[0] = 0; + break; + } + + while (cmd_line && *cmd_line) { + opt = cmd_line; + cmd_line = strchr(opt, ' '); + if (cmd_line) + *cmd_line = 0; + + switch (what) { + case XEN_PARSE_BOOTDEV: + if (strncasecmp(opt, "bootdev=", 8) == 0) + strncpy(xcp->xcp_bootdev, opt + 8, + sizeof(xcp->xcp_console)); + break; + + case XEN_PARSE_NETINFO: + if (xcp->xcp_netinfo.xi_root && + strncasecmp(opt, "nfsroot=", 8) == 0) + strncpy(xcp->xcp_netinfo.xi_root, opt + 8, + MNAMELEN); + + if (strncasecmp(opt, "ip=", 3) == 0) { + memset(xi_ip, 0, sizeof(xi_ip)); + opt += 3; + ipidx = 0; + while (opt && *opt) { + s = opt; + opt = strchr(opt, ':'); + if (opt) + *opt = 0; + + switch (ipidx) { + case 0: /* ip */ + case 1: /* nfs server */ + case 2: /* gw */ + case 3: /* mask */ + case 4: /* host */ + if (*s == 0) + break; + for (i = 0; i < 4; i++) { + b = strtoul(s, &s, 10); + xi_ip[ipidx] = b + 256 + * xi_ip[ipidx]; + if (*s != '.') + break; + s++; + } + if (i < 3) + xi_ip[ipidx] = 0; + break; + case 5: /* interface */ + if (!strncmp(s, "xennet", 6)) + s += 6; + else if (!strncmp(s, "eth", 3)) + s += 3; + else + break; + if (xcp->xcp_netinfo.xi_ifno + == strtoul(s, NULL, 10)) + memcpy(xcp-> + xcp_netinfo.xi_ip, + xi_ip, + sizeof(xi_ip)); + break; + } + ipidx++; + + if (opt) + *opt++ = ':'; + } + } + break; + + case XEN_PARSE_CONSOLE: + if (strncasecmp(opt, "console=", 8) == 0) + strncpy(xcp->xcp_console, opt + 8, + sizeof(xcp->xcp_console)); + break; + + } + + if (cmd_line) + *cmd_line++ = ' '; + } +} + + + + + +#define XEN_PAGE_OFFSET 0xC0100000 + +static pd_entry_t +xpmap_get_bootpde(paddr_t va) +{ + + return ((pd_entry_t *)xen_start_info.pt_base)[va >> PDSHIFT]; +} + +static pd_entry_t +xpmap_get_vbootpde(paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_bootpde(va); + if ((pde & PG_V) == 0) + return (pde & ~PG_FRAME); + return 
(pde & ~PG_FRAME) | + (xpmap_mtop(pde & PG_FRAME) + KERNBASE); +} + +static pt_entry_t * +xpmap_get_bootptep(paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_vbootpde(va); + if ((pde & PG_V) == 0) + return (void *)-1; + return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); +} + +static pt_entry_t +xpmap_get_bootpte(paddr_t va) +{ + + return xpmap_get_bootptep(va)[0]; +} + +#if defined(XENDEBUG) +static void +xpmap_dump_pt(pt_entry_t *ptp, int p) +{ + pt_entry_t pte; + int j; + int bufpos; + + pte = xpmap_ptom((uint32_t)ptp - KERNBASE); + PRINTK(("%03x: %p(%p) %08x\n", p, ptp, (void *)pte, p << PDSHIFT)); + + bufpos = 0; + for (j = 0; j < PTES_PER_PTP; j++) { + if ((ptp[j] & PG_V) == 0) + continue; + pte = ptp[j] /* & PG_FRAME */; + bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ", + p, j, pte); + if (bufpos > 70) { + int k; + sprintf(XBUF + bufpos, "\n"); + PRINTK((XBUF)); + bufpos = 0; + for (k = 0; k < 1000000; k++); + } + } + if (bufpos) { + PRINTK((XBUF)); + PRINTK(("\n")); + bufpos = 0; + } +} +#endif + +void +xpmap_init(void) +{ + pd_entry_t *xen_pdp; + pt_entry_t *ptp, *sysptp; + pt_entry_t pte; + uint32_t i, j; + int bufpos; +#if defined(XENDEBUG_LOW) + extern char kernel_text, _etext, __bss_start, end, *esym; +#endif + + xpmap_phys_to_machine_mapping = (void *)xen_start_info.mfn_list; + + xen_pdp = (pd_entry_t *)xen_start_info.pt_base; + + XENPRINTK(("text %p data %p bss %p end %p esym %p\n", &kernel_text, + &_etext, &__bss_start, &end, esym)); + XENPRINTK(("xpmap_init PTD %p nkpde %d upages %d xen_PTD %p p2m-map %p\n", + (void *)PTDpaddr, nkpde, UPAGES, xen_pdp, + xpmap_phys_to_machine_mapping)); + + bufpos = 0; + + XENPRINTK(("shared_inf %08x\n", (paddr_t)xen_start_info.shared_info)); + XENPRINTK(("c0100000: %08x\n", + xpmap_get_bootpte(0xc0100000))); + + /* Map kernel. */ + + /* Map kernel data/bss/tables. */ + + /* Map ISA I/O memory. */ + + /* Map kernel PDEs. 
*/ + + /* Install a PDE recursively mapping page directory as a page table! */ + + sysptp = (pt_entry_t *)(PTDpaddr + ((1 + UPAGES) << PAGE_SHIFT)); + + /* make xen's PDE and PTE pages read-only in our pagetable */ + for (i = 0; i < xen_start_info.nr_pt_frames; i++) { + /* mark PTE page read-only in our table */ + sysptp[((xen_start_info.pt_base + + (i << PAGE_SHIFT) - KERNBASE_LOCORE) & + (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW; + } + + xpq_flush_queue(); + + for (i = 0; i < 1 + UPAGES + nkpde; i++) { + /* mark PTE page read-only in xen's table */ + ptp = xpmap_get_bootptep(PTDpaddr + (i << PAGE_SHIFT)); + xpq_queue_pte_update( + (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp & ~PG_RW); + XENPRINTK(("%03x: %p(%p) -> %08x\n", i, ptp, + (unsigned long)ptp - KERNTEXTOFF, *ptp)); + + /* mark PTE page read-only in our table */ + sysptp[((PTDpaddr + (i << PAGE_SHIFT) - KERNBASE_LOCORE) & + (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW; + + /* update our pte's */ + ptp = (pt_entry_t *)(PTDpaddr + (i << PAGE_SHIFT)); +#if 0 + pte = xpmap_ptom((uint32_t)ptp - KERNBASE); + XENPRINTK(("%03x: %p(%p) %08x\n", i, ptp, pte, i << PDSHIFT)); +#endif + for (j = 0; j < PTES_PER_PTP; j++) { + if ((ptp[j] & PG_V) == 0) + continue; + if (ptp[j] == 0xffffffff) + ptp[j] = xen_start_info.shared_info | + (PG_V|PG_RW); + if (ptp[j] >= KERNTEXTOFF) { + pte = ptp[j]; + ptp[j] = (pte & ~PG_FRAME) | + (xpmap_get_bootpte(pte & PG_FRAME) & + PG_FRAME); + } +#if defined(XENDEBUG) && 0 + pte = ptp[j] /* & PG_FRAME */; + bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ", + i, j, pte); + if (bufpos > 70) { + int k; + sprintf(XBUF + bufpos, "\n"); + XENPRINTK((XBUF)); + bufpos = 0; + for (k = 0; k < 1000000; k++); + } + } + if (bufpos) { + XENPRINTK((XBUF)); + bufpos = 0; +#endif + } + if (i == 0) + i = 1 + UPAGES - 1; + } + +#if 0 + for (i = 0x300; i < 0x305; i++) + if (((pt_entry_t *)xen_start_info.pt_base)[i] & PG_V) + xpmap_dump_pt((pt_entry_t *) + (xpmap_mtop(((pt_entry_t 
*)xen_start_info.pt_base)[i] & + PG_FRAME) + KERNBASE), i); + xpmap_dump_pt((pt_entry_t *)xen_start_info.pt_base, 0); +#endif + + XENPRINTK(("switching pdp: %p, %08lx, %p, %p, %p\n", (void *)PTDpaddr, + PTDpaddr - KERNBASE, + (void *)xpmap_ptom(PTDpaddr - KERNBASE), + (void *)xpmap_get_bootpte(PTDpaddr), + (void *)xpmap_mtop(xpmap_ptom(PTDpaddr - KERNBASE)))); + +#if defined(XENDEBUG) + xpmap_dump_pt((pt_entry_t *)PTDpaddr, 0); +#endif + + xpq_flush_queue(); + + xpq_queue_pin_table(xpmap_get_bootpte(PTDpaddr) & PG_FRAME, + XPQ_PIN_L2_TABLE); + xpq_queue_pt_switch(xpmap_get_bootpte(PTDpaddr) & PG_FRAME); + xpq_queue_unpin_table( + xpmap_get_bootpte(xen_start_info.pt_base) & PG_FRAME); + + /* make xen's PDE and PTE pages writable in our pagetable */ + for (i = 0; i < xen_start_info.nr_pt_frames; i++) { + /* mark PTE page writable in our table */ + ptp = &sysptp[((xen_start_info.pt_base + + (i << PAGE_SHIFT) - KERNBASE_LOCORE) & + (PD_MASK | PT_MASK)) >> PAGE_SHIFT]; + xpq_queue_pte_update( + (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp | + PG_RW); + } + + xpq_flush_queue(); + XENPRINTK(("pt_switch done!\n")); +} + +/* + * Do a binary search to find out where physical memory ends on the + * real hardware. Xen will fail our updates if they are beyond the + * last available page (max_page in xen/common/memory.c). 
+ */ +paddr_t +find_pmap_mem_end(vaddr_t va) +{ + mmu_update_t r; + int start, end, ok; + pt_entry_t old; + + start = xen_start_info.nr_pages; + end = HYPERVISOR_VIRT_START >> PAGE_SHIFT; + + r.ptr = (unsigned long)&PTE_BASE[x86_btop(va)]; + old = PTE_BASE[x86_btop(va)]; + + while (start + 1 < end) { + r.val = (((start + end) / 2) << PAGE_SHIFT) | PG_V; + + if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0) + end = (start + end) / 2; + else + start = (start + end) / 2; + } + r.val = old; + if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0) + printf("pmap_mem_end find: old update failed %08x\n", + old); + + return end << PAGE_SHIFT; +} + + +#if 0 +void xpmap_find_memory(paddr_t); +void +xpmap_find_memory(paddr_t first_avail) +{ + char buf[256]; + uint32_t i; + int bufpos; + paddr_t p; + + bufpos = 0; + for (i = ((first_avail - KERNTEXTOFF) >> PAGE_SHIFT); + i < xen_start_info.nr_pages; i++) { + /* if (xpmap_phys_to_machine_mapping[i] */ + bufpos += sprintf(buf + bufpos, "%03x:%08x:%08x ", + i, (uint32_t)xpmap_phys_to_machine_mapping[i], + (uint32_t)xpmap_mtop(xpmap_phys_to_machine_mapping[i] << + PAGE_SHIFT)); + p = xpmap_phys_to_machine_mapping[i]; + uvm_page_physload(p, p + 1, p, p + 1, VM_FREELIST_DEFAULT); + + if (bufpos > 70) { + int k; + sprintf(buf + bufpos, "\n"); + XENPRINTK((buf)); + bufpos = 0; + for (k = 0; k < 1000000; k++); + } + } + if (bufpos) { + XENPRINTK((buf)); + bufpos = 0; + } +} +#endif + + +#ifdef XENDEBUG +void xpq_debug_dump(void); +#endif + +#define XPQUEUE_SIZE 2048 +typedef union xpq_queue { + struct { + pd_entry_t *ptr; + pd_entry_t val; + } pde; + struct { + pt_entry_t *ptr; + pt_entry_t val; + } pte; + struct { + paddr_t ptr; + uint32_t val; + } pa; +} xpq_queue_t; +static xpq_queue_t xpq_queue[XPQUEUE_SIZE]; +static int xpq_idx = 0; + +void +xpq_flush_queue() +{ + int i, ok; + + XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx)); + for (i = 0; i < xpq_idx; i++) + XENPRINTK2(("%d: %p %08x\n", i, xpq_queue[i].pde.ptr, + 
xpq_queue[i].pde.val)); + if (xpq_idx != 0 && + HYPERVISOR_mmu_update((mmu_update_t *)xpq_queue, xpq_idx, &ok) < 0) + panic("HYPERVISOR_mmu_update failed\n"); + xpq_idx = 0; +} + +static inline void +xpq_increment_idx(void) +{ + + xpq_idx++; + if (__predict_false(xpq_idx == XPQUEUE_SIZE)) + xpq_flush_queue(); +} + +void +xpq_queue_invlpg(vaddr_t va) +{ + + XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va)); + xpq_queue[xpq_idx].pa.ptr = (va & PG_FRAME) | MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = MMUEXT_INVLPG; + xpq_increment_idx(); +} + +void +xpq_queue_pde_update(pd_entry_t *ptr, pd_entry_t val) +{ + + xpq_queue[xpq_idx].pde.ptr = ptr; + xpq_queue[xpq_idx].pde.val = val; + xpq_increment_idx(); +} + +void +xpq_queue_pte_update(pt_entry_t *ptr, pt_entry_t val) +{ + + xpq_queue[xpq_idx].pte.ptr = ptr; + xpq_queue[xpq_idx].pte.val = val; + xpq_increment_idx(); +} + +void +xpq_queue_unchecked_pte_update(pt_entry_t *ptr, pt_entry_t val) +{ + + xpq_queue[xpq_idx].pa.ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE; + /* XXXcl UNCHECKED_PT_UPDATE */ + xpq_queue[xpq_idx].pa.val = val; + xpq_increment_idx(); +} + +void +xpq_queue_pt_switch(paddr_t pa) +{ + + XENPRINTK2(("xpq_queue_pt_switch: %p %p\n", (void *)pa, (void *)pa)); + xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = MMUEXT_NEW_BASEPTR; + xpq_increment_idx(); +} + +void +xpq_queue_pin_table(paddr_t pa, int type) +{ + + XENPRINTK2(("xpq_queue_pin_table: %p %p\n", (void *)pa, (void *)pa)); + xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND; + switch (type) { + case XPQ_PIN_L1_TABLE: + xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L1_TABLE; + break; + case XPQ_PIN_L2_TABLE: + xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L2_TABLE; + break; + } + xpq_increment_idx(); +} + +void +xpq_queue_unpin_table(paddr_t pa) +{ + + XENPRINTK2(("xpq_queue_unpin_table: %p %p\n", (void *)pa, (void *)pa)); + xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = 
MMUEXT_UNPIN_TABLE; + xpq_increment_idx(); +} + +void +xpq_queue_set_ldt(vaddr_t va, uint32_t entries) +{ + + XENPRINTK2(("xpq_queue_set_ldt\n")); + KASSERT(va == (va & PG_FRAME)); + xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND | va; + xpq_queue[xpq_idx].pa.val = MMUEXT_SET_LDT | + (entries << MMUEXT_CMD_SHIFT); + xpq_increment_idx(); +} + +void +xpq_queue_tlb_flush() +{ + + XENPRINTK2(("xpq_queue_tlb_flush\n")); + xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = MMUEXT_TLB_FLUSH; + xpq_increment_idx(); +} + +#ifdef XENDEBUG +void +xpq_debug_dump() +{ + int i; + + XENPRINTK2(("idx: %d\n", xpq_idx)); + for (i = 0; i < xpq_idx; i++) { + sprintf(XBUF, "%p %08x ", xpq_queue[i].pte.ptr, + xpq_queue[i].pte.val); + if (++i < xpq_idx) + sprintf(XBUF + strlen(XBUF), "%p %08x ", + xpq_queue[i].pte.ptr, xpq_queue[i].pte.val); + if (++i < xpq_idx) + sprintf(XBUF + strlen(XBUF), "%p %08x ", + xpq_queue[i].pte.ptr, xpq_queue[i].pte.val); + if (++i < xpq_idx) + sprintf(XBUF + strlen(XBUF), "%p %08x ", + xpq_queue[i].pte.ptr, xpq_queue[i].pte.val); + XENPRINTK2(("%d: %s\n", xpq_idx, XBUF)); + } +} +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h new file mode 100644 index 0000000000..cad97f21e1 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h @@ -0,0 +1,130 @@ +/* $NetBSD: frameasm.h,v 1.1 2004/03/11 21:44:08 cl Exp $ */ +/* NetBSD: frameasm.h,v 1.4 2004/02/20 17:35:01 yamt Exp */ + +#ifndef _I386_FRAMEASM_H_ +#define _I386_FRAMEASM_H_ + +#ifdef _KERNEL_OPT +#include "opt_multiprocessor.h" +#endif + +/* XXX assym.h */ +#define TRAP_INSTR int $0x82 +#define __HYPERVISOR_stack_switch 4 +#define __HYPERVISOR_fpu_taskswitch 7 + +#ifndef TRAPLOG +#define TLOG /**/ +#else +/* + * Fill in trap record + */ +#define TLOG \ +9: \ + movl %fs:CPU_TLOG_OFFSET, %eax; \ + movl %fs:CPU_TLOG_BASE, %ebx; \ + addl $SIZEOF_TREC,%eax; \ + andl 
$SIZEOF_TLOG-1,%eax; \ + addl %eax,%ebx; \ + movl %eax,%fs:CPU_TLOG_OFFSET; \ + movl %esp,TREC_SP(%ebx); \ + movl $9b,TREC_HPC(%ebx); \ + movl TF_EIP(%esp),%eax; \ + movl %eax,TREC_IPC(%ebx); \ + rdtsc ; \ + movl %eax,TREC_TSC(%ebx); \ + movl $MSR_LASTBRANCHFROMIP,%ecx; \ + rdmsr ; \ + movl %eax,TREC_LBF(%ebx); \ + incl %ecx ; \ + rdmsr ; \ + movl %eax,TREC_LBT(%ebx); \ + incl %ecx ; \ + rdmsr ; \ + movl %eax,TREC_IBF(%ebx); \ + incl %ecx ; \ + rdmsr ; \ + movl %eax,TREC_IBT(%ebx) +#endif + +/* + * These are used on interrupt or trap entry or exit. + */ +#define INTRENTRY \ + cld; \ + subl $TF_PUSHSIZE,%esp ; \ + movl %gs,TF_GS(%esp) ; \ + movl %fs,TF_FS(%esp) ; \ + movl %eax,TF_EAX(%esp) ; \ + movl %es,TF_ES(%esp) ; \ + movl %ds,TF_DS(%esp) ; \ + movl $GSEL(GDATA_SEL, SEL_KPL),%eax ; \ + movl %edi,TF_EDI(%esp) ; \ + movl %esi,TF_ESI(%esp) ; \ + movl %eax,%ds ; \ + movl %ebp,TF_EBP(%esp) ; \ + movl %eax,%es ; \ + movl %ebx,TF_EBX(%esp) ; \ + movl %eax,%gs ; \ + movl %edx,TF_EDX(%esp) ; \ + movl $GSEL(GCPU_SEL, SEL_KPL),%eax ; \ + movl %ecx,TF_ECX(%esp) ; \ + movl %eax,%fs ; \ + TLOG + +#define INTRFASTEXIT \ + movl TF_GS(%esp),%gs ; \ + movl TF_FS(%esp),%fs ; \ + movl TF_ES(%esp),%es ; \ + movl TF_DS(%esp),%ds ; \ + movl TF_EDI(%esp),%edi ; \ + movl TF_ESI(%esp),%esi ; \ + movl TF_EBP(%esp),%ebp ; \ + movl TF_EBX(%esp),%ebx ; \ + movl TF_EDX(%esp),%edx ; \ + movl TF_ECX(%esp),%ecx ; \ + movl TF_EAX(%esp),%eax ; \ + addl $(TF_PUSHSIZE+8),%esp ; \ + iret + +#define DO_DEFERRED_SWITCH(reg) \ + cmpl $0, CPUVAR(WANT_PMAPLOAD) ; \ + jz 1f ; \ + call _C_LABEL(pmap_load) ; \ + 1: + +#define CHECK_DEFERRED_SWITCH(reg) \ + cmpl $0, CPUVAR(WANT_PMAPLOAD) + +#define CHECK_ASTPENDING(reg) movl CPUVAR(CURLWP),reg ; \ + cmpl $0, reg ; \ + je 1f ; \ + movl L_PROC(reg),reg ; \ + cmpl $0, P_MD_ASTPENDING(reg); \ + 1: +#define CLEAR_ASTPENDING(reg) movl $0, P_MD_ASTPENDING(reg) + +#if !defined(XEN) +#define CLI(reg) cli +#define STI(reg) sti +#else +/* XXX assym.h */ +#define 
EVENTS_MASK 136 +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define XEN_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) +#define XEN_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(%reg) + +#define CLI(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \ + XEN_BLOCK_EVENTS(reg) +#define STI(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \ + XEN_UNBLOCK_EVENTS(reg) +#define STIC(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \ + XEN_UNBLOCK_EVENTS(reg) ; \ + testb $1,evtchn_upcall_pending(reg) +#endif + +#endif /* _I386_FRAMEASM_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h new file mode 100644 index 0000000000..13442d22eb --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h @@ -0,0 +1,423 @@ +/* $NetBSD: hypervisor.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $ */ + +/* + * + * Communication to/from hypervisor. + * + * Copyright (c) 2002-2003, K A Fraser + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +#ifndef _XEN_HYPERVISOR_H_ +#define _XEN_HYPERVISOR_H_ + + +struct hypervisor_attach_args { + const char *haa_busname; +}; + +struct xencons_attach_args { + const char *xa_device; +}; + +struct xen_npx_attach_args { + const char *xa_device; +}; + + +#define u8 uint8_t +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t +#define s8 int8_t +#define s16 int16_t +#define s32 int32_t +#define s64 int64_t + +/* include the hypervisor interface */ +#include <sys/systm.h> +#include <machine/hypervisor-ifs/hypervisor-if.h> +#include <machine/hypervisor-ifs/dom0_ops.h> +#include <machine/hypervisor-ifs/event_channel.h> +#include <machine/hypervisor-ifs/io/domain_controller.h> +#include <machine/hypervisor-ifs/io/netif.h> + +#undef u8 +#undef u16 +#undef u32 +#undef u64 +#undef s8 +#undef s16 +#undef s32 +#undef s64 + + +/* + * a placeholder for the start of day information passed up from the hypervisor + */ +union start_info_union +{ + start_info_t start_info; + char padding[512]; +}; +extern union start_info_union start_info_union; +#define xen_start_info (start_info_union.start_info) + + +/* hypervisor.c */ +void do_hypervisor_callback(struct trapframe *regs); +void hypervisor_notify_via_evtchn(unsigned int); +void hypervisor_enable_irq(unsigned int); +void hypervisor_disable_irq(unsigned int); +void hypervisor_acknowledge_irq(unsigned int); + +/* hypervisor_machdep.c */ +void hypervisor_unmask_event(unsigned int); +void hypervisor_mask_event(unsigned int); +void hypervisor_clear_event(unsigned int); +void hypervisor_force_callback(void); + +/* + * Assembler stubs for hyper-calls. 
+ */ + +static inline int HYPERVISOR_set_trap_table(trap_info_t *table) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_trap_table), + "b" (table) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count, + int *success_count) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_mmu_update), + "b" (req), "c" (count), "d" (success_count) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_gdt), + "b" (frame_list), "c" (entries) : "memory" ); + + + return ret; +} + +static inline int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_stack_switch), + "b" (ss), "c" (esp) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_callbacks), + "b" (event_selector), "c" (event_address), + "d" (failsafe_selector), "S" (failsafe_address) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_fpu_taskswitch(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_fpu_taskswitch) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_yield(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_yield) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_block(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_block) : "memory" ); + + return ret; +} + +static inline int 
HYPERVISOR_shutdown(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_reboot(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_suspend(unsigned long srec) +{ + int ret; + /* NB. On suspend, control software expects a suspend record in %esi. */ + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), + "S" (srec) : "memory" ); + + return ret; +} + +static inline long HYPERVISOR_set_timer_op(uint64_t timeout) +{ + int ret; + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_timer_op), + "b" (timeout_hi), "c" (timeout_lo) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op) +{ + int ret; + dom0_op->interface_version = DOM0_INTERFACE_VERSION; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_dom0_op), + "b" (dom0_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_debugreg), + "b" (reg), "c" (value) : "memory" ); + + return ret; +} + +static inline unsigned long HYPERVISOR_get_debugreg(int reg) +{ + unsigned long ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_get_debugreg), + "b" (reg) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_descriptor( + unsigned long pa, unsigned long word1, unsigned long 
word2) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_descriptor), + "b" (pa), "c" (word1), "d" (word2) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_fast_trap(int idx) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_fast_trap), + "b" (idx) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_dom_mem_op(unsigned int op, + unsigned long *extent_list, + unsigned long nr_extents, + unsigned int extent_order) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_dom_mem_op), + "b" (op), "c" (extent_list), "d" (nr_extents), "S" (extent_order), + "D" (DOMID_SELF) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_multicall(void *call_list, int nr_calls) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_multicall), + "b" (call_list), "c" (nr_calls) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_va_mapping( + unsigned long page_nr, unsigned long new_val, unsigned long flags) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping), + "b" (page_nr), "c" (new_val), "d" (flags) : "memory" ); + + if (__predict_false(ret < 0)) + panic("Failed update VA mapping: %08lx, %08lx, %08lx", + page_nr, new_val, flags); + + return ret; +} + +static inline int HYPERVISOR_event_channel_op(void *op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_event_channel_op), + "b" (op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_xen_version(int cmd) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_xen_version), + "b" (cmd) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_console_io(int cmd, int count, char *str) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" 
(__HYPERVISOR_console_io), + "b" (cmd), "c" (count), "d" (str) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_physdev_op(void *physdev_op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_physdev_op), + "b" (physdev_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_grant_table_op(void *gnttab_op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_grant_table_op), + "b" (gnttab_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_va_mapping_otherdomain( + unsigned long page_nr, unsigned long new_val, unsigned long flags, domid_t domid) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping_otherdomain), + "b" (page_nr), "c" (new_val), "d" (flags), "S" (domid) : + "memory" ); + + return ret; +} + +static inline int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_vm_assist), + "b" (cmd), "c" (type) : "memory" ); + + return ret; +} + +#endif /* _XEN_HYPERVISOR_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h new file mode 100644 index 0000000000..32a774b1b6 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h @@ -0,0 +1,110 @@ +/* $NetBSD: if_xennetvar.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_IF_XENNETVAR_H_ +#define _XEN_IF_XENNETVAR_H_ + +#include <machine/xen.h> + +union xennet_bufarray { + struct { + struct mbuf *xbtx_m; + } xb_tx; + struct { + vaddr_t xbrx_va; + paddr_t xbrx_pa; + struct xennet_softc *xbrx_sc; + } xb_rx; + int xb_next; +}; + +struct xennet_txbuf { + SLIST_ENTRY(xennet_txbuf) xt_next; + struct xennet_softc *xt_sc; + paddr_t xt_pa; + u_char xt_buf[0]; +}; +#define TXBUF_PER_PAGE 2 +#define TXBUF_BUFSIZE (PAGE_SIZE / TXBUF_PER_PAGE) - sizeof(struct xennet_txbuf) + +struct xennet_softc { + struct device sc_dev; /* base device glue */ + struct ethercom sc_ethercom; /* Ethernet common part */ + + int sc_ifno; + + uint8_t sc_enaddr[6]; + +#ifdef mediacode + struct ifmedia sc_media; +#endif + + /* What is the status of our connection to the remote backend? */ +#define BEST_CLOSED 0 +#define BEST_DISCONNECTED 1 +#define BEST_CONNECTED 2 + unsigned int sc_backend_state; + + unsigned int sc_evtchn; + unsigned int sc_irq; + + netif_tx_interface_t *sc_tx; + netif_rx_interface_t *sc_rx; + + uint32_t sc_tx_entries; + uint32_t sc_tx_resp_cons; + + uint32_t sc_rx_resp_cons; + uint32_t sc_rx_bufs_to_notify; + + union xennet_bufarray sc_tx_bufa[NETIF_TX_RING_SIZE]; + union xennet_bufarray sc_rx_bufa[NETIF_TX_RING_SIZE]; + + SLIST_HEAD(, xennet_txbuf) sc_tx_bufs; +}; + +struct xennet_attach_args { + const char *xa_device; + int xa_handle; +}; + +struct nfs_diskless; + +int xennet_scan(struct device *, struct xennet_attach_args *, cfprint_t); +void xennet_start(struct ifnet *); +int xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); +void xennet_watchdog(struct ifnet *ifp); +int xennet_bootstatic_callback(struct nfs_diskless *); + +#endif /* _XEN_IF_XENNETVAR_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h new file mode 100644 index 0000000000..1a482ea287 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h @@ -0,0 +1,533 @@ +/* 
$NetBSD: pmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $ */ +/* NetBSD: pmap.h,v 1.79 2004/02/20 17:35:01 yamt Exp */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgment: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * pmap.h: see pmap.c for the history of this pmap module. 
+ */ + +#ifndef _I386_PMAP_H_ +#define _I386_PMAP_H_ + +#if defined(_KERNEL_OPT) +#include "opt_user_ldt.h" +#include "opt_largepages.h" +#endif + +#include "opt_xen.h" + +#include <machine/cpufunc.h> +#include <machine/pte.h> +#include <machine/xenfunc.h> +#include <machine/xenpmap.h> +#include <machine/segments.h> +#include <uvm/uvm_object.h> + +/* + * see pte.h for a description of i386 MMU terminology and hardware + * interface. + * + * a pmap describes a processes' 4GB virtual address space. this + * virtual address space can be broken up into 1024 4MB regions which + * are described by PDEs in the PDP. the PDEs are defined as follows: + * + * (ranges are inclusive -> exclusive, just like vm_map_entry start/end) + * (the following assumes that KERNBASE is 0xc0000000) + * + * PDE#s VA range usage + * 0->766 0x0 -> 0xbfc00000 user address space + * 767 0xbfc00000-> recursive mapping of PDP (used for + * 0xc0000000 linear mapping of PTPs) + * 768->1023 0xc0000000-> kernel address space (constant + * 0xffc00000 across all pmap's/processes) + * 1023 0xffc00000-> "alternate" recursive PDP mapping + * <end> (for other pmaps) + * + * + * note: a recursive PDP mapping provides a way to map all the PTEs for + * a 4GB address space into a linear chunk of virtual memory. in other + * words, the PTE for page 0 is the first int mapped into the 4MB recursive + * area. the PTE for page 1 is the second int. the very last int in the + * 4MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB + * address). + * + * all pmap's PD's must have the same values in slots 768->1023 so that + * the kernel is always mapped in every process. these values are loaded + * into the PD at pmap creation time. + * + * at any one time only one pmap can be active on a processor. this is + * the pmap whose PDP is pointed to by processor register %cr3. this pmap + * will have all its PTEs mapped into memory at the recursive mapping + * point (slot #767 as show above). 
when the pmap code wants to find the + * PTE for a virtual address, all it has to do is the following: + * + * address of PTE = (767 * 4MB) + (VA / PAGE_SIZE) * sizeof(pt_entry_t) + * = 0xbfc00000 + (VA / 4096) * 4 + * + * what happens if the pmap layer is asked to perform an operation + * on a pmap that is not the one which is currently active? in that + * case we take the PA of the PDP of non-active pmap and put it in + * slot 1023 of the active pmap. this causes the non-active pmap's + * PTEs to get mapped in the final 4MB of the 4GB address space + * (e.g. starting at 0xffc00000). + * + * the following figure shows the effects of the recursive PDP mapping: + * + * PDP (%cr3) + * +----+ + * | 0| -> PTP#0 that maps VA 0x0 -> 0x400000 + * | | + * | | + * | 767| -> points back to PDP (%cr3) mapping VA 0xbfc00000 -> 0xc0000000 + * | 768| -> first kernel PTP (maps 0xc0000000 -> 0xf0400000) + * | | + * |1023| -> points to alternate pmap's PDP (maps 0xffc00000 -> end) + * +----+ + * + * note that the PDE#767 VA (0xbfc00000) is defined as "PTE_BASE" + * note that the PDE#1023 VA (0xffc00000) is defined as "APTE_BASE" + * + * starting at VA 0xbfc00000 the current active PDP (%cr3) acts as a + * PTP: + * + * PTP#767 == PDP(%cr3) => maps VA 0xbfc00000 -> 0xc0000000 + * +----+ + * | 0| -> maps the contents of PTP#0 at VA 0xbfc00000->0xbfc01000 + * | | + * | | + * | 767| -> maps contents of PTP#767 (the PDP) at VA 0xbffbf000 + * | 768| -> maps contents of first kernel PTP + * | | + * |1023| + * +----+ + * + * note that mapping of the PDP at PTP#767's VA (0xbffbf000) is + * defined as "PDP_BASE".... within that mapping there are two + * defines: + * "PDP_PDE" (0xbfeffbfc) is the VA of the PDE in the PDP + * which points back to itself. + * "APDP_PDE" (0xbfeffffc) is the VA of the PDE in the PDP which + * establishes the recursive mapping of the alternate pmap. + * to set the alternate PDP, one just has to put the correct + * PA info in *APDP_PDE. 
+ * + * note that in the APTE_BASE space, the APDP appears at VA + * "APDP_BASE" (0xfffff000). + */ +/* XXX MP should we allocate one APDP_PDE per processor?? */ + +/* + * the following defines identify the slots used as described above. + */ + +#define PDSLOT_PTE ((KERNBASE/NBPD)-1) /* 767: for recursive PDP map */ +#define PDSLOT_KERN (KERNBASE/NBPD) /* 768: start of kernel space */ +#define PDSLOT_APTE ((unsigned)1023-16) /* 1023: alternative recursive slot */ + +/* + * the following defines give the virtual addresses of various MMU + * data structures: + * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings + * PTD_BASE and APTD_BASE: the base VA of the recursive mapping of the PTD + * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP + */ + +#define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD) ) +#define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD) ) +#define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * PAGE_SIZE))) +#define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * PAGE_SIZE))) +#define PDP_PDE (PDP_BASE + PDSLOT_PTE) +#define APDP_PDE (PDP_BASE + PDSLOT_APTE) + +/* + * the follow define determines how many PTPs should be set up for the + * kernel by locore.s at boot time. this should be large enough to + * get the VM system running. once the VM system is running, the + * pmap module can add more PTPs to the kernel area on demand. 
+ */ + +#ifndef NKPTP +#define NKPTP 4 /* 16MB to start */ +#endif +#define NKPTP_MIN 4 /* smallest value we allow */ +#define NKPTP_MAX (1024 - (KERNBASE/NBPD) - 1) + /* largest value (-1 for APTP space) */ + +/* + * pdei/ptei: generate index into PDP/PTP from a VA + */ +#define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT) +#define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT) + +/* + * PTP macros: + * a PTP's index is the PD index of the PDE that points to it + * a PTP's offset is the byte-offset in the PTE space that this PTP is at + * a PTP's VA is the first VA mapped by that PTP + * + * note that PAGE_SIZE == number of bytes in a PTP (4096 bytes == 1024 entries) + * NBPD == number of bytes a PTP can map (4MB) + */ + +#define ptp_i2o(I) ((I) * PAGE_SIZE) /* index => offset */ +#define ptp_o2i(O) ((O) / PAGE_SIZE) /* offset => index */ +#define ptp_i2v(I) ((I) * NBPD) /* index => VA */ +#define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */ + +/* + * PG_AVAIL usage: we make use of the ignored bits of the PTE + */ + +#define PG_W PG_AVAIL1 /* "wired" mapping */ +#define PG_PVLIST PG_AVAIL2 /* mapping has entry on pvlist */ +#define PG_X PG_AVAIL3 /* executable mapping */ + +/* + * Number of PTE's per cache line. 4 byte pte, 32-byte cache line + * Used to avoid false sharing of cache lines. + */ +#define NPTECL 8 + +#ifdef _KERNEL +/* + * pmap data structures: see pmap.c for details of locking. + */ + +struct pmap; +typedef struct pmap *pmap_t; + +/* + * we maintain a list of all non-kernel pmaps + */ + +LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */ + +/* + * the pmap structure + * + * note that the pm_obj contains the simple_lock, the reference count, + * page list, and number of PTPs within the pmap. + * + * XXX If we ever support processor numbers higher than 31, we'll have + * XXX to rethink the CPU mask. 
+ */ + +struct pmap { + struct uvm_object pm_obj; /* object (lck by object lock) */ +#define pm_lock pm_obj.vmobjlock + LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */ + pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */ + u_int32_t pm_pdirpa; /* PA of PD (read-only after create) */ + struct vm_page *pm_ptphint; /* pointer to a PTP in our pmap */ + struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */ + + vaddr_t pm_hiexec; /* highest executable mapping */ + int pm_flags; /* see below */ + + union descriptor *pm_ldt; /* user-set LDT */ + int pm_ldt_len; /* number of LDT entries */ + int pm_ldt_sel; /* LDT selector */ + u_int32_t pm_cpus; /* mask of CPUs using pmap */ +}; + +/* pm_flags */ +#define PMF_USER_LDT 0x01 /* pmap has user-set LDT */ + +/* + * for each managed physical page we maintain a list of <PMAP,VA>'s + * which it is mapped at. the list is headed by a pv_head structure. + * there is one pv_head per managed phys page (allocated at boot time). + * the pv_head structure points to a list of pv_entry structures (each + * describes one mapping). + */ + +struct pv_entry { /* locked by its list's pvh_lock */ + SPLAY_ENTRY(pv_entry) pv_node; /* splay-tree node */ + struct pmap *pv_pmap; /* the pmap */ + vaddr_t pv_va; /* the virtual address */ + struct vm_page *pv_ptp; /* the vm_page of the PTP */ +}; + +/* + * pv_entrys are dynamically allocated in chunks from a single page. + * we keep track of how many pv_entrys are in use for each page and + * we can free pv_entry pages if needed. there is one lock for the + * entire allocation system. 
+ */ + +struct pv_page_info { + TAILQ_ENTRY(pv_page) pvpi_list; + struct pv_entry *pvpi_pvfree; + int pvpi_nfree; +}; + +/* + * number of pv_entry's in a pv_page + * (note: won't work on systems where NPBG isn't a constant) + */ + +#define PVE_PER_PVPAGE ((PAGE_SIZE - sizeof(struct pv_page_info)) / \ + sizeof(struct pv_entry)) + +/* + * a pv_page: where pv_entrys are allocated from + */ + +struct pv_page { + struct pv_page_info pvinfo; + struct pv_entry pvents[PVE_PER_PVPAGE]; +}; + +/* + * global kernel variables + */ + +/* PTDpaddr: is the physical address of the kernel's PDP */ +extern u_long PTDpaddr; + +extern struct pmap kernel_pmap_store; /* kernel pmap */ +extern int nkpde; /* current # of PDEs for kernel */ +extern int pmap_pg_g; /* do we support PG_G? */ + +/* + * macros + */ + +#define pmap_kernel() (&kernel_pmap_store) +#define pmap_resident_count(pmap) ((pmap)->pm_stats.resident_count) +#define pmap_wired_count(pmap) ((pmap)->pm_stats.wired_count) +#define pmap_update(pmap) /* nothing (yet) */ + +#define pmap_clear_modify(pg) pmap_clear_attrs(pg, PG_M) +#define pmap_clear_reference(pg) pmap_clear_attrs(pg, PG_U) +#define pmap_copy(DP,SP,D,L,S) +#define pmap_is_modified(pg) pmap_test_attrs(pg, PG_M) +#define pmap_is_referenced(pg) pmap_test_attrs(pg, PG_U) +#define pmap_move(DP,SP,D,L,S) +#define pmap_phys_address(ppn) x86_ptob(ppn) +#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? 
*/ + + +/* + * prototypes + */ + +void pmap_activate(struct lwp *); +void pmap_bootstrap(vaddr_t); +boolean_t pmap_clear_attrs(struct vm_page *, int); +void pmap_deactivate(struct lwp *); +void pmap_deactivate2(struct lwp *); +void pmap_page_remove (struct vm_page *); +void pmap_remove(struct pmap *, vaddr_t, vaddr_t); +boolean_t pmap_test_attrs(struct vm_page *, int); +void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t); +int pmap_exec_fixup(struct vm_map *, struct trapframe *, + struct pcb *); +void pmap_load(void); +int pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, vm_prot_t, + int); + +vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ + +void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, int32_t *); +void pmap_tlb_shootnow(int32_t); +void pmap_do_tlb_shootdown(struct cpu_info *); + +#define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */ + +/* + * Do idle page zero'ing uncached to avoid polluting the cache. + */ +boolean_t pmap_pageidlezero(paddr_t); +#define PMAP_PAGEIDLEZERO(pa) pmap_pageidlezero((pa)) + +/* + * inline functions + */ + +/*ARGSUSED*/ +static __inline void +pmap_remove_all(struct pmap *pmap) +{ + /* Nothing. 
*/ +} + +/* + * pmap_update_pg: flush one page from the TLB (or flush the whole thing + * if hardware doesn't support one-page flushing) + */ + +__inline static void __attribute__((__unused__)) +pmap_update_pg(vaddr_t va) +{ +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) + tlbflush(); + else +#endif + invlpg((u_int) va); +} + +/* + * pmap_update_2pg: flush two pages from the TLB + */ + +__inline static void __attribute__((__unused__)) +pmap_update_2pg(vaddr_t va, vaddr_t vb) +{ +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) + tlbflush(); + else +#endif + { + invlpg((u_int) va); + invlpg((u_int) vb); + } +} + +/* + * pmap_page_protect: change the protection of all recorded mappings + * of a managed page + * + * => this function is a frontend for pmap_page_remove/pmap_clear_attrs + * => we only have to worry about making the page more protected. + * unprotecting a page is done on-demand at fault time. + */ + +__inline static void __attribute__((__unused__)) +pmap_page_protect(struct vm_page *pg, vm_prot_t prot) +{ + if ((prot & VM_PROT_WRITE) == 0) { + if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) { + (void) pmap_clear_attrs(pg, PG_RW); + } else { + pmap_page_remove(pg); + } + } +} + +/* + * pmap_protect: change the protection of pages in a pmap + * + * => this function is a frontend for pmap_remove/pmap_write_protect + * => we only have to worry about making the page more protected. + * unprotecting a page is done on-demand at fault time. 
+ */ + +__inline static void __attribute__((__unused__)) +pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) +{ + if ((prot & VM_PROT_WRITE) == 0) { + if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) { + pmap_write_protect(pmap, sva, eva, prot); + } else { + pmap_remove(pmap, sva, eva); + } + } +} + +/* + * various address inlines + * + * vtopte: return a pointer to the PTE mapping a VA, works only for + * user and PT addresses + * + * kvtopte: return a pointer to the PTE mapping a kernel VA + */ + +#include <lib/libkern/libkern.h> + +static __inline pt_entry_t * __attribute__((__unused__)) +vtopte(vaddr_t va) +{ + + KASSERT(va < (PDSLOT_KERN << PDSHIFT)); + + return (PTE_BASE + x86_btop(va)); +} + +static __inline pt_entry_t * __attribute__((__unused__)) +kvtopte(vaddr_t va) +{ + + KASSERT(va >= (PDSLOT_KERN << PDSHIFT)); + +#ifdef LARGEPAGES + { + pd_entry_t *pde; + + pde = PDP_BASE + pdei(va); + if (*pde & PG_PS) + return ((pt_entry_t *)pde); + } +#endif + + return (PTE_BASE + x86_btop(va)); +} + +/* + * vtomach: virtual address to machine address. For use by + * machine-dependent code only. + */ + +static inline paddr_t __attribute__((__unused__)) +vtomach(vaddr_t va) +{ + pt_entry_t pte; + + pte = PTE_GET(&PTE_BASE[x86_btop(va)]); + return xpmap_ptom((pte & PG_FRAME) | (va & ~PG_FRAME)); +} + +#define pmap_cpu_has_pg_n() (cpu_class != CPUCLASS_386) +#define pmap_cpu_has_invlpg() (cpu_class != CPUCLASS_386) + +paddr_t vtophys(vaddr_t); +vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t); + +void pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t); + +#if defined(USER_LDT) +void pmap_ldt_cleanup(struct lwp *); +#define PMAP_FORK +#endif /* USER_LDT */ + +/* + * Hooks for the pool allocator. 
+ */ +#define POOL_VTOPHYS(va) vtophys((vaddr_t) (va)) + +#endif /* _KERNEL */ +#endif /* _I386_PMAP_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h new file mode 100644 index 0000000000..48bff484b9 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h @@ -0,0 +1,247 @@ +/* $NetBSD: xen.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $ */ + +/* + * + * Copyright (c) 2003, 2004 Keir Fraser (on behalf of the Xen team) + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + + +#ifndef _XEN_H +#define _XEN_H + +#ifndef _LOCORE + +struct xen_netinfo { + uint32_t xi_ifno; + char *xi_root; + uint32_t xi_ip[5]; +}; + +union xen_cmdline_parseinfo { + char xcp_bootdev[16]; /* sizeof(dv_xname) */ + struct xen_netinfo xcp_netinfo; + char xcp_console[16]; +}; + +#define XEN_PARSE_BOOTDEV 0 +#define XEN_PARSE_NETINFO 1 +#define XEN_PARSE_CONSOLE 2 + +void xen_parse_cmdline(int, union xen_cmdline_parseinfo *); + +void xenconscn_attach(void); + +void xenmachmem_init(void); +void xenprivcmd_init(void); +void xenvfr_init(void); + +#ifdef XENDEBUG +void printk(const char *, ...); +void vprintk(const char *, va_list); +#endif + +#endif + +#endif /* _XEN_H */ + +/****************************************************************************** + * os.h + * + * random collection of macros and definition + */ + +#ifndef _OS_H_ +#define _OS_H_ + +/* + * These are the segment descriptors provided for us by the hypervisor. + * For now, these are hardwired -- guest OSes cannot update the GDT + * or LDT. + * + * It shouldn't be hard to support descriptor-table frobbing -- let me + * know if the BSD or XP ports require flexibility here. + */ + + +/* + * these are also defined in hypervisor-if.h but can't be pulled in as + * they are used in start of day assembly. Need to clean up the .h files + * a bit more... + */ + +#ifndef FLAT_RING1_CS +#define FLAT_RING1_CS 0x0819 +#define FLAT_RING1_DS 0x0821 +#define FLAT_RING3_CS 0x082b +#define FLAT_RING3_DS 0x0833 +#endif + +#define __KERNEL_CS FLAT_RING1_CS +#define __KERNEL_DS FLAT_RING1_DS + +/* Everything below this point is not included by assembler (.S) files. */ +#ifndef _LOCORE + +/* some function prototypes */ +void trap_init(void); + + +/* + * STI/CLI equivalents. These basically set and clear the virtual + * event_enable flag in the shared_info structure. Note that when + * the enable bit is set, there may be pending events to be handled. 
+ * We may therefore call into do_hypervisor_callback() directly. + */ + +#define __save_flags(x) \ +do { \ + (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask; \ +} while (0) + +#define __restore_flags(x) \ +do { \ + shared_info_t *_shared = HYPERVISOR_shared_info; \ + __insn_barrier(); \ + if ((_shared->vcpu_data[0].evtchn_upcall_mask = (x)) == 0) { \ + __insn_barrier(); \ + if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \ + hypervisor_force_callback(); \ + } \ +} while (0) + +#define __cli() \ +do { \ + HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1; \ + __insn_barrier(); \ +} while (0) + +#define __sti() \ +do { \ + shared_info_t *_shared = HYPERVISOR_shared_info; \ + __insn_barrier(); \ + _shared->vcpu_data[0].evtchn_upcall_mask = 0; \ + __insn_barrier(); /* unmask then check (avoid races) */ \ + if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \ + hypervisor_force_callback(); \ +} while (0) + +#define cli() __cli() +#define sti() __sti() +#define save_flags(x) __save_flags(x) +#define restore_flags(x) __restore_flags(x) +#define save_and_cli(x) do { \ + __save_flags(x); \ + __cli(); \ +} while (/* CONSTCOND */ 0) +#define save_and_sti(x) __save_and_sti(x) + +#ifdef MULTIPROCESSOR +#define __LOCK_PREFIX "lock; " +#else +#define __LOCK_PREFIX "" +#endif + +static __inline__ uint32_t +x86_atomic_xchg(uint32_t *ptr, unsigned long val) +{ + unsigned long result; + + __asm __volatile("xchgl %0,%1" + :"=r" (result) + :"m" (*ptr), "0" (val) + :"memory"); + + return result; +} + +static __inline__ int +x86_atomic_test_and_clear_bit(volatile void *ptr, int bitno) +{ + int result; + + __asm __volatile(__LOCK_PREFIX + "btrl %2,%1 ;" + "sbbl %0,%0" + :"=r" (result), "=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno) : "memory"); + return result; +} + +static __inline__ int +x86_atomic_test_and_set_bit(volatile void *ptr, int bitno) +{ + int result; + + __asm __volatile(__LOCK_PREFIX + "btsl %2,%1 ;" + "sbbl 
%0,%0" + :"=r" (result), "=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno) : "memory"); + return result; +} + +static __inline int +x86_constant_test_bit(const volatile void *ptr, int bitno) +{ + return ((1UL << (bitno & 31)) & + (((const volatile uint32_t *) ptr)[bitno >> 5])) != 0; +} + +static __inline int +x86_variable_test_bit(const volatile void *ptr, int bitno) +{ + int result; + + __asm __volatile( + "btl %2,%1 ;" + "sbbl %0,%0" + :"=r" (result) + :"m" (*(volatile uint32_t *)(ptr)), "Ir" (bitno)); + return result; +} + +#define x86_atomic_test_bit(ptr, bitno) \ + (__builtin_constant_p(bitno) ? \ + x86_constant_test_bit((ptr),(bitno)) : \ + x86_variable_test_bit((ptr),(bitno))) + +static __inline void +x86_atomic_set_bit(volatile void *ptr, int bitno) +{ + __asm __volatile(__LOCK_PREFIX + "btsl %1,%0" + :"=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno)); +} + +static __inline void +x86_atomic_clear_bit(volatile void *ptr, int bitno) +{ + __asm __volatile(__LOCK_PREFIX + "btrl %1,%0" + :"=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno)); +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* _OS_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h new file mode 100644 index 0000000000..2df026a922 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h @@ -0,0 +1,135 @@ +/* $NetBSD: xenfunc.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENFUNC_H_ +#define _XEN_XENFUNC_H_ + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> +#include <machine/xenpmap.h> +#include <machine/pte.h> + +#ifdef XENDEBUG_LOW +#define __PRINTK(x) printk x +#else +#define __PRINTK(x) +#endif + +void xen_set_ldt(vaddr_t, uint32_t); +void xen_update_descriptor(union descriptor *, union descriptor *); + +static __inline void +invlpg(u_int addr) +{ + xpq_queue_invlpg(addr); + xpq_flush_queue(); +} + +static __inline void +lldt(u_short sel) +{ + + /* __PRINTK(("ldt %x\n", IDXSELN(sel))); */ + if (sel == GSEL(GLDT_SEL, SEL_KPL)) + xen_set_ldt((vaddr_t)ldt, NLDT); + else + xen_set_ldt(cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_base, + cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_entries); +} + +static __inline void +ltr(u_short sel) +{ + __PRINTK(("XXX ltr not supported\n")); +} + +static __inline void +lcr0(u_int val) +{ + __PRINTK(("XXX lcr0 not supported\n")); +} + +static __inline u_int +rcr0(void) +{ + __PRINTK(("XXX rcr0 not supported\n")); + return 0; +} + +#define lcr3(_v) _lcr3((_v), __FILE__, __LINE__) +static __inline void +_lcr3(u_int val, char *file, int line) +{ +/* __PRINTK(("lcr3 %08x at %s:%d\n", val, file, line)); */ + xpq_queue_pt_switch(xpmap_ptom(val) & PG_FRAME); + xpq_flush_queue(); +} + +static __inline void +tlbflush(void) +{ + xpq_queue_tlb_flush(); + xpq_flush_queue(); +} + +static __inline u_int +rdr6(void) +{ + u_int val; + + val = HYPERVISOR_get_debugreg(6); + return val; +} + +static __inline void +ldr6(u_int val) +{ + + HYPERVISOR_set_debugreg(6, val); +} + +static __inline void +disable_intr(void) +{ + __cli(); +} + +static __inline void +enable_intr(void) +{ + __sti(); +} + +#endif /* _XEN_XENFUNC_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h new file mode 100644 index 0000000000..f3c8c7f2d8 --- /dev/null +++ 
b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h @@ -0,0 +1,193 @@ +/* $NetBSD: xenpmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENPMAP_H_ +#define _XEN_XENPMAP_H_ + +#define INVALID_P2M_ENTRY (~0UL) + +void xpq_queue_invlpg(vaddr_t); +void xpq_queue_pde_update(pd_entry_t *, pd_entry_t); +void xpq_queue_pte_update(pt_entry_t *, pt_entry_t); +void xpq_queue_unchecked_pte_update(pt_entry_t *, pt_entry_t); +void xpq_queue_pt_switch(paddr_t); +void xpq_flush_queue(void); +void xpq_queue_set_ldt(vaddr_t, uint32_t); +void xpq_queue_tlb_flush(void); +void xpq_queue_pin_table(paddr_t, int); +void xpq_queue_unpin_table(paddr_t); + +extern paddr_t *xpmap_phys_to_machine_mapping; + +#define XPQ_PIN_L1_TABLE 1 +#define XPQ_PIN_L2_TABLE 2 + +#ifndef XEN +#define PDE_GET(_pdp) \ + *(_pdp) +#define PDE_SET(_pdp,_mapdp,_npde) \ + *(_mapdp) = (_npde) +#define PDE_CLEAR(_pdp,_mapdp) \ + *(_mapdp) = 0 +#define PTE_SET(_ptp,_maptp,_npte) \ + *(_maptp) = (_npte) +#define PTE_CLEAR(_ptp,_maptp) \ + *(_maptp) = 0 +#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte) \ + (_opte) = x86_atomic_testset_ul((_maptp), (_npte)) +#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte) \ + (_opte) = x86_atomic_testset_ul((_maptp), 0) +#define PDE_CLEARBITS(_pdp,_mapdp,_bits) \ + *(_mapdp) &= ~(_bits) +#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits) \ + x86_atomic_clearbits_l((_maptp), (_bits)) +#define PTE_SETBITS(_ptp,_maptp,_bits) \ + *(_maptp) |= (_bits) +#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits) \ + x86_atomic_setbits_l((_maptp), (_bits)) +#else +paddr_t *xpmap_phys_to_machine_mapping; + +#define PDE_GET(_pdp) \ + (pmap_valid_entry(*(_pdp)) ? xpmap_mtop(*(_pdp)) : *(_pdp)) +#define PDE_SET(_pdp,_mapdp,_npde) do { \ + xpq_queue_pde_update((_mapdp), xpmap_ptom((_npde))); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_CLEAR(_pdp,_mapdp) do { \ + xpq_queue_pde_update((_mapdp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_GET(_ptp) \ + (pmap_valid_entry(*(_ptp)) ? 
xpmap_mtop(*(_ptp)) : *(_ptp)) +#define PTE_GET_MA(_ptp) \ + *(_ptp) +#define PTE_SET(_ptp,_maptp,_npte) do { \ + xpq_queue_pte_update((_maptp), xpmap_ptom((_npte))); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_SET_MA(_ptp,_maptp,_npte) do { \ + xpq_queue_pte_update((_maptp), (_npte)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_SET_MA_UNCHECKED(_ptp,_maptp,_npte) do { \ + xpq_queue_unchecked_pte_update((_maptp), (_npte)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_CLEAR(_ptp,_maptp) do { \ + xpq_queue_pte_update((_maptp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte) do { \ + (_opte) = PTE_GET(_ptp); \ + xpq_queue_pte_update((_maptp), xpmap_ptom((_npte))); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_SET_MA(_ptp,_maptp,_npte,_opte) do { \ + (_opte) = *(_ptp); \ + xpq_queue_pte_update((_maptp), (_npte)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte) do { \ + (_opte) = PTE_GET(_ptp); \ + xpq_queue_pte_update((_maptp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_CLEAR_MA(_ptp,_maptp,_opte) do { \ + (_opte) = *(_ptp); \ + xpq_queue_pte_update((_maptp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_CLEARBITS(_pdp,_mapdp,_bits) do { \ + xpq_queue_pte_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_CLEARBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_ATOMIC_CLEARBITS(_pdp,_mapdp,_bits) do { \ + xpq_queue_pde_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \ + 
xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_SETBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_ATOMIC_SETBITS(_pdp,_mapdp,_bits) do { \ + xpq_queue_pde_update((_mapdp), *(_pdp) | ((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_COPY(_dpdp,_madpdp,_spdp) do { \ + xpq_queue_pde_update((_madpdp), *(_spdp)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_UPDATES_FLUSH() do { \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#endif + +#define XPMAP_OFFSET (KERNTEXTOFF - KERNBASE_LOCORE) +static __inline paddr_t +xpmap_mtop(paddr_t mpa) +{ + return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) + + XPMAP_OFFSET) | (mpa & ~PG_FRAME); +} + +static __inline paddr_t +xpmap_ptom(paddr_t ppa) +{ + return (xpmap_phys_to_machine_mapping[(ppa - + XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT) + | (ppa & ~PG_FRAME); +} + +static __inline paddr_t +xpmap_ptom_masked(paddr_t ppa) +{ + return (xpmap_phys_to_machine_mapping[(ppa - + XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT); +} + +#endif /* _XEN_XENPMAP_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c b/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c new file mode 100644 index 0000000000..dda715fa54 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c @@ -0,0 +1,505 @@ +/* $NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $ */ +/* NetBSD: bus_space.c,v 1.2 2003/03/14 18:47:53 christos Exp */ + +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum and by Jason R. 
Thorpe of the Numerical Aerospace + * Simulation Facility, NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $"); + +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/extent.h> + +#include <uvm/uvm_extern.h> + +#include <machine/bus.h> + +#include <dev/isa/isareg.h> +#include <machine/isa_machdep.h> + +#include <machine/hypervisor.h> +#include <machine/xenpmap.h> + +/* + * Extent maps to manage I/O and memory space. Allocate + * storage for 8 regions in each, initially. Later, ioport_malloc_safe + * will indicate that it's safe to use malloc() to dynamically allocate + * region descriptors. + * + * N.B. At least two regions are _always_ allocated from the iomem + * extent map; (0 -> ISA hole) and (end of ISA hole -> end of RAM). + * + * The extent maps are not static! Machine-dependent ISA and EISA + * routines need access to them for bus address space allocation. + */ +static long ioport_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)]; +static long iomem_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)]; +struct extent *ioport_ex; +struct extent *iomem_ex; +static int ioport_malloc_safe; + +int x86_mem_add_mapping __P((bus_addr_t, bus_size_t, + int, bus_space_handle_t *)); + +void +x86_bus_space_init() +{ + /* + * Initialize the I/O port and I/O mem extent maps. + * Note: we don't have to check the return value since + * creation of a fixed extent map will never fail (since + * descriptor storage has already been allocated). + * + * N.B. The iomem extent manages _all_ physical addresses + * on the machine. When the amount of RAM is found, the two + * extents of RAM are allocated from the map (0 -> ISA hole + * and end of ISA hole -> end of RAM). 
+ */ + ioport_ex = extent_create("ioport", 0x0, 0xffff, M_DEVBUF, + (caddr_t)ioport_ex_storage, sizeof(ioport_ex_storage), + EX_NOCOALESCE|EX_NOWAIT); + iomem_ex = extent_create("iomem", 0x0, 0xffffffff, M_DEVBUF, + (caddr_t)iomem_ex_storage, sizeof(iomem_ex_storage), + EX_NOCOALESCE|EX_NOWAIT); + + /* We are privileged guest os - should have IO privileges. */ + if (xen_start_info.flags & SIF_PRIVILEGED) { + dom0_op_t op; + op.cmd = DOM0_IOPL; + op.u.iopl.domain = DOMID_SELF; + op.u.iopl.iopl = 1; + if (HYPERVISOR_dom0_op(&op) != 0) + panic("Unable to obtain IOPL, " + "despite being SIF_PRIVILEGED"); + } +} + +void +x86_bus_space_mallocok() +{ + + ioport_malloc_safe = 1; +} + +int +x86_memio_map(t, bpa, size, flags, bshp) + bus_space_tag_t t; + bus_addr_t bpa; + bus_size_t size; + int flags; + bus_space_handle_t *bshp; +{ + int error; + struct extent *ex; + + /* + * Pick the appropriate extent map. + */ + if (t == X86_BUS_SPACE_IO) { + if (flags & BUS_SPACE_MAP_LINEAR) + return (EOPNOTSUPP); + ex = ioport_ex; + } else if (t == X86_BUS_SPACE_MEM) + ex = iomem_ex; + else + panic("x86_memio_map: bad bus space tag"); + + /* + * Before we go any further, let's make sure that this + * region is available. + */ + error = extent_alloc_region(ex, bpa, size, + EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0)); + if (error) + return (error); + + /* + * For I/O space, that's all she wrote. + */ + if (t == X86_BUS_SPACE_IO) { + *bshp = bpa; + return (0); + } + + /* + * For memory space, map the bus physical address to + * a kernel virtual address. + */ + error = x86_mem_add_mapping(bpa, size, + (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp); + if (error) { + if (extent_free(ex, bpa, size, EX_NOWAIT | + (ioport_malloc_safe ? 
EX_MALLOCOK : 0))) { + printf("x86_memio_map: pa 0x%lx, size 0x%lx\n", + bpa, size); + printf("x86_memio_map: can't free region\n"); + } + } + + return (error); +} + +int +_x86_memio_map(t, bpa, size, flags, bshp) + bus_space_tag_t t; + bus_addr_t bpa; + bus_size_t size; + int flags; + bus_space_handle_t *bshp; +{ + + /* + * For I/O space, just fill in the handle. + */ + if (t == X86_BUS_SPACE_IO) { + if (flags & BUS_SPACE_MAP_LINEAR) + return (EOPNOTSUPP); + *bshp = bpa; + return (0); + } + + /* + * For memory space, map the bus physical address to + * a kernel virtual address. + */ + return (x86_mem_add_mapping(bpa, size, + (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp)); +} + +int +x86_memio_alloc(t, rstart, rend, size, alignment, boundary, flags, + bpap, bshp) + bus_space_tag_t t; + bus_addr_t rstart, rend; + bus_size_t size, alignment, boundary; + int flags; + bus_addr_t *bpap; + bus_space_handle_t *bshp; +{ + struct extent *ex; + u_long bpa; + int error; + + /* + * Pick the appropriate extent map. + */ + if (t == X86_BUS_SPACE_IO) { + if (flags & BUS_SPACE_MAP_LINEAR) + return (EOPNOTSUPP); + ex = ioport_ex; + } else if (t == X86_BUS_SPACE_MEM) + ex = iomem_ex; + else + panic("x86_memio_alloc: bad bus space tag"); + + /* + * Sanity check the allocation against the extent's boundaries. + */ + if (rstart < ex->ex_start || rend > ex->ex_end) + panic("x86_memio_alloc: bad region start/end"); + + /* + * Do the requested allocation. + */ + error = extent_alloc_subregion(ex, rstart, rend, size, alignment, + boundary, + EX_FAST | EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0), + &bpa); + + if (error) + return (error); + + /* + * For I/O space, that's all she wrote. + */ + if (t == X86_BUS_SPACE_IO) { + *bshp = *bpap = bpa; + return (0); + } + + /* + * For memory space, map the bus physical address to + * a kernel virtual address. 
+ */ + error = x86_mem_add_mapping(bpa, size, + (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp); + if (error) { + if (extent_free(iomem_ex, bpa, size, EX_NOWAIT | + (ioport_malloc_safe ? EX_MALLOCOK : 0))) { + printf("x86_memio_alloc: pa 0x%lx, size 0x%lx\n", + bpa, size); + printf("x86_memio_alloc: can't free region\n"); + } + } + + *bpap = bpa; + + return (error); +} + +int +x86_mem_add_mapping(bpa, size, cacheable, bshp) + bus_addr_t bpa; + bus_size_t size; + int cacheable; + bus_space_handle_t *bshp; +{ + u_long pa, endpa; + vaddr_t va; + pt_entry_t *pte; + pt_entry_t *maptp; + int32_t cpumask = 0; + + pa = x86_trunc_page(bpa); + endpa = x86_round_page(bpa + size); + +#ifdef DIAGNOSTIC + if (endpa <= pa) + panic("x86_mem_add_mapping: overflow"); +#endif + + if (bpa >= IOM_BEGIN && (bpa + size) <= IOM_END) { + va = (vaddr_t)ISA_HOLE_VADDR(pa); + } else { + va = uvm_km_valloc(kernel_map, endpa - pa); + if (va == 0) + return (ENOMEM); + } + + *bshp = (bus_space_handle_t)(va + (bpa & PGOFSET)); + + for (; pa < endpa; pa += PAGE_SIZE, va += PAGE_SIZE) { + pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE); + + /* + * PG_N doesn't exist on 386's, so we assume that + * the mainboard has wired up device space non-cacheable + * on those machines. + * + * Note that it's not necessary to use atomic ops to + * fiddle with the PTE here, because we don't care + * about mod/ref information. + * + * XXX should hand this bit to pmap_kenter_pa to + * save the extra invalidate! + * + * XXX extreme paranoia suggests tlb shootdown belongs here. 
+ */ + if (pmap_cpu_has_pg_n()) { + pte = kvtopte(va); + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + if (cacheable) + PTE_CLEARBITS(pte, maptp, PG_N); + else + PTE_SETBITS(pte, maptp, PG_N); + pmap_tlb_shootdown(pmap_kernel(), va, *pte, + &cpumask); + } + } + + pmap_tlb_shootnow(cpumask); + pmap_update(pmap_kernel()); + + return 0; +} + +/* + * void _x86_memio_unmap(bus_space_tag bst, bus_space_handle bsh, + * bus_size_t size, bus_addr_t *adrp) + * + * This function unmaps memory- or io-space mapped by the function + * _x86_memio_map(). This function works nearly as same as + * x86_memio_unmap(), but this function does not ask kernel + * built-in extents and returns physical address of the bus space, + * for the convenience of the extra extent manager. + */ +void +_x86_memio_unmap(t, bsh, size, adrp) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t size; + bus_addr_t *adrp; +{ + u_long va, endva; + bus_addr_t bpa; + + /* + * Find the correct extent and bus physical address. + */ + if (t == X86_BUS_SPACE_IO) { + bpa = bsh; + } else if (t == X86_BUS_SPACE_MEM) { + if (bsh >= atdevbase && (bsh + size) <= (atdevbase + IOM_SIZE)) { + bpa = (bus_addr_t)ISA_PHYSADDR(bsh); + } else { + + va = x86_trunc_page(bsh); + endva = x86_round_page(bsh + size); + +#ifdef DIAGNOSTIC + if (endva <= va) { + panic("_x86_memio_unmap: overflow"); + } +#endif + +#if __NetBSD_Version__ > 104050000 + if (pmap_extract(pmap_kernel(), va, &bpa) == FALSE) { + panic("_x86_memio_unmap:" + " wrong virtual address"); + } + bpa += (bsh & PGOFSET); +#else + bpa = pmap_extract(pmap_kernel(), va) + (bsh & PGOFSET); +#endif + + pmap_kremove(va, endva - va); + /* + * Free the kernel virtual mapping. 
+ */ + uvm_km_free(kernel_map, va, endva - va); + } + } else { + panic("_x86_memio_unmap: bad bus space tag"); + } + + if (adrp != NULL) { + *adrp = bpa; + } +} + +void +x86_memio_unmap(t, bsh, size) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t size; +{ + struct extent *ex; + u_long va, endva; + bus_addr_t bpa; + + /* + * Find the correct extent and bus physical address. + */ + if (t == X86_BUS_SPACE_IO) { + ex = ioport_ex; + bpa = bsh; + } else if (t == X86_BUS_SPACE_MEM) { + ex = iomem_ex; + + if (bsh >= atdevbase && + (bsh + size) <= (atdevbase + IOM_SIZE)) { + bpa = (bus_addr_t)ISA_PHYSADDR(bsh); + goto ok; + } + + va = x86_trunc_page(bsh); + endva = x86_round_page(bsh + size); + +#ifdef DIAGNOSTIC + if (endva <= va) + panic("x86_memio_unmap: overflow"); +#endif + + (void) pmap_extract(pmap_kernel(), va, &bpa); + bpa += (bsh & PGOFSET); + + pmap_kremove(va, endva - va); + /* + * Free the kernel virtual mapping. + */ + uvm_km_free(kernel_map, va, endva - va); + } else + panic("x86_memio_unmap: bad bus space tag"); + +ok: + if (extent_free(ex, bpa, size, + EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0))) { + printf("x86_memio_unmap: %s 0x%lx, size 0x%lx\n", + (t == X86_BUS_SPACE_IO) ? "port" : "pa", bpa, size); + printf("x86_memio_unmap: can't free region\n"); + } +} + +void +x86_memio_free(t, bsh, size) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t size; +{ + + /* x86_memio_unmap() does all that we need to do. */ + x86_memio_unmap(t, bsh, size); +} + +int +x86_memio_subregion(t, bsh, offset, size, nbshp) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t offset, size; + bus_space_handle_t *nbshp; +{ + + *nbshp = bsh + offset; + return (0); +} + +paddr_t +x86_memio_mmap(t, addr, off, prot, flags) + bus_space_tag_t t; + bus_addr_t addr; + off_t off; + int prot; + int flags; +{ + + /* Can't mmap I/O space. */ + if (t == X86_BUS_SPACE_IO) + return (-1); + + /* + * "addr" is the base address of the device we're mapping. 
+ * "off" is the offset into that device. + * + * Note we are called for each "page" in the device that + * the upper layers want to map. + */ + return (x86_btop(addr + off)); +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c new file mode 100644 index 0000000000..6783f69363 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c @@ -0,0 +1,234 @@ +/* $NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_xen.h" + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/device.h> + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> +#include <machine/cpu_counter.h> + +#include <dev/clock_subr.h> + +#include "config_time.h" /* for CONFIG_TIME */ + +static int xen_timer_handler(void *, struct trapframe *); + +/* These are peridically updated in shared_info, and then copied here. */ +static unsigned long shadow_tsc_stamp; +static u_int64_t shadow_system_time; +static unsigned long shadow_time_version; +static struct timeval shadow_tv; + +static int timeset; + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called at splclock. 
+ */ +static void +get_time_values_from_xen(void) +{ + do { + shadow_time_version = HYPERVISOR_shared_info->time_version2; + __insn_barrier(); + shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec; + shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec; + shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp; + shadow_system_time = HYPERVISOR_shared_info->system_time; + __insn_barrier(); + } while (shadow_time_version != HYPERVISOR_shared_info->time_version1); +} + +void +inittodr(time_t base) +{ + int s; + + /* + * if the file system time is more than a year older than the + * kernel, warn and then set the base time to the CONFIG_TIME. + */ + if (base && base < (CONFIG_TIME-SECYR)) { + printf("WARNING: preposterous time in file system\n"); + base = CONFIG_TIME; + } + + s = splclock(); + get_time_values_from_xen(); + splx(s); + + time.tv_usec = shadow_tv.tv_usec; + time.tv_sec = shadow_tv.tv_sec + rtc_offset * 60; +#ifdef DEBUG_CLOCK + printf("readclock: %ld (%ld)\n", time.tv_sec, base); +#endif + if (base != 0 && base < time.tv_sec - 5*SECYR) + printf("WARNING: file system time much less than clock time\n"); + else if (base > time.tv_sec + 5*SECYR) { + printf("WARNING: clock time much less than file system time\n"); + printf("WARNING: using file system time\n"); + goto fstime; + } + + timeset = 1; + return; + +fstime: + timeset = 1; + time.tv_sec = base; + printf("WARNING: CHECK AND RESET THE DATE!\n"); +} + +void +resettodr() +{ +#ifdef DOM0OPS + dom0_op_t op; + int s; +#endif +#ifdef DEBUG_CLOCK + struct clock_ymdhms dt; +#endif + + /* + * We might have been called by boot() due to a crash early + * on. Don't reset the clock chip in this case. 
+ */ + if (!timeset) + return; + +#ifdef DEBUG_CLOCK + clock_secs_to_ymdhms(time.tv_sec - rtc_offset * 60, &dt); + + printf("setclock: %d/%d/%d %02d:%02d:%02d\n", dt.dt_year, + dt.dt_mon, dt.dt_day, dt.dt_hour, dt.dt_min, dt.dt_sec); +#endif +#ifdef DOM0OPS + if (xen_start_info.dom_id == 0) { + s = splclock(); + + op.cmd = DOM0_SETTIME; + op.u.settime.secs = time.tv_sec - rtc_offset * 60; + op.u.settime.usecs = time.tv_usec; + op.u.settime.system_time = shadow_system_time; + HYPERVISOR_dom0_op(&op); + + splx(s); + } +#endif +} + +void +startrtclock() +{ + +} + +/* + * Wait approximately `n' microseconds. + */ +void +xen_delay(int n) +{ + u_int64_t when; + + get_time_values_from_xen(); + when = shadow_system_time + n * 1000; + while (shadow_system_time < when) + get_time_values_from_xen(); +} + +void +xen_microtime(struct timeval *tv) +{ + + *tv = time; +} + +void +xen_initclocks() +{ + int irq = bind_virq_to_irq(VIRQ_TIMER); + + event_set_handler(irq, (int (*)(void *))xen_timer_handler, + NULL, IPL_CLOCK); + hypervisor_enable_irq(irq); +} + +static int +xen_timer_handler(void *arg, struct trapframe *regs) +{ +#if defined(I586_CPU) || defined(I686_CPU) + static int microset_iter; /* call cc_microset once/sec */ + struct cpu_info *ci = curcpu(); + + /* + * If we have a cycle counter, do the microset thing. 
+ */ + if (ci->ci_feature_flags & CPUID_TSC) { + if ( +#if defined(MULTIPROCESSOR) + CPU_IS_PRIMARY(ci) && +#endif + (microset_iter--) == 0) { + microset_iter = hz - 1; +#if defined(MULTIPROCESSOR) + x86_broadcast_ipi(X86_IPI_MICROSET); +#endif + cc_microset_time = time; + cc_microset(ci); + } + } +#endif + + get_time_values_from_xen(); + + hardclock((struct clockframe *)regs); + + return 0; +} + +void +setstatclockrate(int arg) +{ +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c new file mode 100644 index 0000000000..0f5a9fe788 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c @@ -0,0 +1,226 @@ +/* $NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/device.h> + +#include "xencons.h" +#include "xennet.h" +#include "xbd.h" +#include "xenkbc.h" +#include "vga_xen.h" +#include "npx.h" + +#include "opt_xen.h" + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> + +#ifdef DOM0OPS +#include <sys/dirent.h> +#include <sys/stat.h> +#include <sys/tree.h> +#include <sys/vnode.h> +#include <miscfs/specfs/specdev.h> +#include <miscfs/kernfs/kernfs.h> +#include <machine/kernfs_machdep.h> +#endif + +#if NXENNET > 0 +#include <net/if.h> +#include <net/if_ether.h> +#include <net/if_media.h> +#include <machine/if_xennetvar.h> +#endif + +#if NXBD > 0 +#include <sys/buf.h> +#include <sys/disk.h> +#include <dev/dkvar.h> +#include <machine/xbdvar.h> +#endif + +#if NXENKBC > 0 +#include <dev/pckbport/pckbportvar.h> +#include <machine/xenkbcvar.h> +#endif + +#if NVGA_XEN > 0 +#include <machine/bus.h> +#include <machine/vga_xenvar.h> +#endif + +int hypervisor_match(struct device *, struct cfdata *, void *); +void hypervisor_attach(struct device *, struct device *, void *); + +CFATTACH_DECL(hypervisor, sizeof(struct device), + hypervisor_match, hypervisor_attach, NULL, NULL); + +int hypervisor_print(void *, const char *); + +union hypervisor_attach_cookie { + const char *hac_device; /* first elem of all */ +#if 
NXENKBC > 0
	struct xenkbc_attach_args hac_xenkbc;
#endif
#if NVGA_XEN > 0
	struct xen_vga_attach_args hac_vga_xen;
#endif
#if NXENCONS > 0
	struct xencons_attach_args hac_xencons;
#endif
#if NXENNET > 0
	struct xennet_attach_args hac_xennet;
#endif
#if NXBD > 0
	struct xbd_attach_args hac_xbd;
#endif
#if NNPX > 0
	struct xen_npx_attach_args hac_xennpx;
#endif
};


/*
 * Probe for the hypervisor; always succeeds.
 */
int
hypervisor_match(parent, match, aux)
	struct device *parent;
	struct cfdata *match;
	void *aux;
{
	struct hypervisor_attach_args *haa = aux;

	/* Match only on the "hypervisor" pseudo-bus name. */
	if (strcmp(haa->haa_busname, "hypervisor") == 0)
		return 1;
	return 0;
}

/*
 * Attach the hypervisor.  Initializes event-channel delivery, then
 * attaches each configured child front-end (keyboard controller, VGA,
 * console, network, block, FPU); in a privileged domain it also sets
 * up the dom0 kernfs/control interfaces.
 */
void
hypervisor_attach(parent, self, aux)
	struct device *parent, *self;
	void *aux;
{
	union hypervisor_attach_cookie hac;

	printf("\n");

	/* Event channels must work before any child device can attach. */
	init_events();

#if NXENKBC > 0
	hac.hac_xenkbc.xa_device = "xenkbc";
	config_found(self, &hac.hac_xenkbc, hypervisor_print);
#endif

#if NVGA_XEN > 0
	hac.hac_vga_xen.xa_device = "vga_xen";
	hac.hac_vga_xen.xa_iot = X86_BUS_SPACE_IO;
	hac.hac_vga_xen.xa_memt = X86_BUS_SPACE_MEM;
	config_found(self, &hac.hac_vga_xen, hypervisor_print);
#endif

#if NXENCONS > 0
	hac.hac_xencons.xa_device = "xencons";
	config_found(self, &hac.hac_xencons, hypervisor_print);
#endif
#if NXENNET > 0
	/* Network and block use their own scan routines, not config_found. */
	hac.hac_xennet.xa_device = "xennet";
	xennet_scan(self, &hac.hac_xennet, hypervisor_print);
#endif
#if NXBD > 0
	hac.hac_xbd.xa_device = "xbd";
	xbd_scan(self, &hac.hac_xbd, hypervisor_print);
#endif
#if NNPX > 0
	hac.hac_xennpx.xa_device = "npx";
	config_found(self, &hac.hac_xennpx, hypervisor_print);
#endif
#ifdef DOM0OPS
	/* Privileged (domain-0) only: kernfs nodes and control devices. */
	if (xen_start_info.flags & SIF_PRIVILEGED) {
		xenkernfs_init();
		xenprivcmd_init();
		xenmachmem_init();
		xenvfr_init();
	}
#endif
}

/*
 * Autoconf cfprint routine for children found on the hypervisor bus.
 */
int
hypervisor_print(aux, parent)
	void *aux;
	const char *parent;
{
	union hypervisor_attach_cookie *hac = aux;

	if (parent)
aprint_normal("%s at %s", hac->hac_device, parent); + return (UNCONF); +} + +void +hypervisor_notify_via_evtchn(unsigned int port) +{ + evtchn_op_t op; + + op.cmd = EVTCHNOP_send; + op.u.send.local_port = port; + (void)HYPERVISOR_event_channel_op(&op); +} + +#ifdef DOM0OPS + +#define DIR_MODE (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) + +kernfs_parentdir_t *kernxen_pkt; + +void +xenkernfs_init() +{ + kernfs_entry_t *dkt; + + KERNFS_ALLOCENTRY(dkt, M_TEMP, M_WAITOK); + KERNFS_INITENTRY(dkt, DT_DIR, "xen", NULL, KFSsubdir, VDIR, DIR_MODE); + kernfs_addentry(NULL, dkt); + kernxen_pkt = KERNFS_ENTOPARENTDIR(dkt); +} +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c new file mode 100644 index 0000000000..51219a980f --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c @@ -0,0 +1,1241 @@ +/* $NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $"); + +#include "opt_inet.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/syslog.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/device.h> +#include <sys/ioctl.h> +#include <sys/errno.h> +#if NRND > 0 +#include <sys/rnd.h> +#endif + +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_dl.h> +#include <net/if_ether.h> + +#ifdef mediacode +#include <net/if_media.h> +#endif + +#ifdef INET +#include <netinet/in.h> +#include <netinet/if_inarp.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#endif + +#include <nfs/rpcv2.h> + +#include <nfs/nfsproto.h> +#include <nfs/nfs.h> +#include <nfs/nfsmount.h> +#include <nfs/nfsdiskless.h> + +#include "bpfilter.h" +#if NBPFILTER > 0 +#include <net/bpf.h> +#include <net/bpfdesc.h> +#endif + +#include <uvm/uvm_extern.h> +#include <uvm/uvm_page.h> + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> +#include <machine/ctrl_if.h> + +#include <machine/if_xennetvar.h> + +#ifdef DEBUG +#define XENNET_DEBUG +#endif 
+#if defined(XENNET_DEBUG) && !defined(DEBUG) +#define DEBUG +#endif +/* #define XENNET_DEBUG_DUMP */ + +#ifdef XENNET_DEBUG +#define XEDB_FOLLOW 0x01 +#define XEDB_INIT 0x02 +#define XEDB_EVENT 0x04 +#define XEDB_MBUF 0x08 +#define XEDB_MEM 0x10 +int xennet_debug = 0x0; +#define DPRINTF(x) if (xennet_debug) printf x; +#define DPRINTFN(n,x) if (xennet_debug & (n)) printf x; +#else +#define DPRINTF(x) +#define DPRINTFN(n,x) +#endif +#define PRINTF(x) printf x; + +#ifdef XENNET_DEBUG_DUMP +static void xennet_hex_dump(unsigned char *, size_t, char *, int); +#endif + +int xennet_match (struct device *, struct cfdata *, void *); +void xennet_attach (struct device *, struct device *, void *); +static void xennet_ctrlif_rx(ctrl_msg_t *, unsigned long); +static void xennet_driver_status_change(netif_fe_driver_status_changed_t *); +static void xennet_status_change(netif_fe_interface_status_changed_t *); +static void xennet_tx_mbuf_free(struct mbuf *, caddr_t, size_t, void *); +static void xennet_rx_mbuf_free(struct mbuf *, caddr_t, size_t, void *); +static int xen_network_handler(void *); +static void network_tx_buf_gc(struct xennet_softc *); +static void network_alloc_rx_buffers(struct xennet_softc *); +static void network_alloc_tx_buffers(struct xennet_softc *); +void xennet_init(struct xennet_softc *); +void xennet_reset(struct xennet_softc *); +#ifdef mediacode +static int xennet_mediachange (struct ifnet *); +static void xennet_mediastatus(struct ifnet *, struct ifmediareq *); +#endif + +CFATTACH_DECL(xennet, sizeof(struct xennet_softc), + xennet_match, xennet_attach, NULL, NULL); + +#define TX_MAX_ENTRIES (NETIF_TX_RING_SIZE - 2) +#define RX_MAX_ENTRIES (NETIF_RX_RING_SIZE - 2) +#define TX_ENTRIES 128 +#define RX_ENTRIES 128 + +static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE]; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; + +/** Network interface info. */ +struct xennet_ctrl { + /** Number of interfaces. 
 */
	int xc_interfaces;
	/** Number of connected interfaces. */
	int xc_connected;
	/** Error code. */
	int xc_err;

	cfprint_t xc_cfprint;		/* autoconf print function for children */
	struct device *xc_parent;	/* hypervisor device children attach to */
};

/* xc_interfaces == -1: count not yet reported by the backend. */
static struct xennet_ctrl netctrl = { -1, 0, 0 };

#ifdef mediacode
static int xennet_media[] = {
	IFM_ETHER|IFM_AUTO,
};
static int nxennet_media = (sizeof(xennet_media)/sizeof(xennet_media[0]));
#endif


/*
 * Register the netif frontend with the domain controller and announce
 * driver-UP.  Actual interfaces attach later, when the controller
 * replies with a DRIVER_STATUS_CHANGED message.  Does nothing in
 * domains that host the backend themselves.
 */
int
xennet_scan(struct device *self, struct xennet_attach_args *xneta,
    cfprint_t print)
{
	ctrl_msg_t cmsg;
	netif_fe_driver_status_changed_t st;
	int err = 0;

	/* The init/backend domain does not run the frontend driver. */
	if ((xen_start_info.flags & SIF_INITDOMAIN) ||
	    (xen_start_info.flags & SIF_NET_BE_DOMAIN))
		return 0;

	netctrl.xc_parent = self;
	netctrl.xc_cfprint = print;

	printf("Initialising Xen virtual ethernet frontend driver.\n");

	(void)ctrl_if_register_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx,
	    CALLBACK_IN_BLOCKING_CONTEXT);

	/* Send a driver-UP notification to the domain controller. */
	cmsg.type = CMSG_NETIF_FE;
	cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
	cmsg.length = sizeof(netif_fe_driver_status_changed_t);
	st.status = NETIF_DRIVER_STATUS_UP;
	st.max_handle = 0;
	memcpy(cmsg.msg, &st, sizeof(st));
	ctrl_if_send_message_block(&cmsg, NULL, 0, 0);

#if 0
	err = xennet_wait_for_interfaces();
	if (err)
		ctrl_if_unregister_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx);
#endif

	return err;
}

/*
 * Probe: accept any attach args naming the "xennet" device.
 */
int
xennet_match(struct device *parent, struct cfdata *match, void *aux)
{
	struct xennet_attach_args *xa = (struct xennet_attach_args *)aux;

	if (strcmp(xa->xa_device, "xennet") == 0)
		return 1;
	return 0;
}

/*
 * Attach one virtual network interface: record its handle, fill in
 * the ifnet callbacks and chain the tx/rx buffer free lists.
 */
void
xennet_attach(struct device *parent, struct device *self, void *aux)
{
	struct xennet_attach_args *xneta = (struct xennet_attach_args *)aux;
	struct xennet_softc *sc = (struct xennet_softc *)self;
	struct ifnet *ifp = &sc->sc_ethercom.ec_if;
	int idx;

	aprint_normal(": Xen Virtual Network Interface\n");

	sc->sc_ifno = xneta->xa_handle;

	/* 
Initialize ifnet structure. */ + memcpy(ifp->if_xname, sc->sc_dev.dv_xname, IFNAMSIZ); + ifp->if_softc = sc; + ifp->if_start = xennet_start; + ifp->if_ioctl = xennet_ioctl; + ifp->if_watchdog = xennet_watchdog; + ifp->if_flags = IFF_BROADCAST | IFF_NOTRAILERS; + +#ifdef mediacode + ifmedia_init(&sc->sc_media, 0, xennet_mediachange, + xennet_mediastatus); + for (idx = 0; idx < nxennet_media; idx++) + ifmedia_add(&sc->sc_media, xennet_media[idx], 0, NULL); + ifmedia_set(&sc->sc_media, xennet_media[0]); +#endif + + for (idx = 0; idx < NETIF_TX_RING_SIZE; idx++) + sc->sc_tx_bufa[idx].xb_next = idx + 1; + for (idx = 0; idx < NETIF_RX_RING_SIZE; idx++) + sc->sc_rx_bufa[idx].xb_next = idx + 1; +} + +static struct xennet_softc * +find_device(int handle) +{ + struct device *dv; + struct xennet_softc *xs = NULL; + + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (dv->dv_cfattach == NULL || + dv->dv_cfattach->ca_attach != xennet_attach) + continue; + xs = (struct xennet_softc *)dv; + if (xs->sc_ifno == handle) + break; + } + return xs; +} + +static void +xennet_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + int respond = 1; + + switch (msg->subtype) { + case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED: + if (msg->length != sizeof(netif_fe_interface_status_changed_t)) + goto error; + xennet_status_change( + (netif_fe_interface_status_changed_t *)&msg->msg[0]); + break; + + case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED: + if (msg->length != sizeof(netif_fe_driver_status_changed_t)) + goto error; + xennet_driver_status_change( + (netif_fe_driver_status_changed_t *)&msg->msg[0]); + break; + + error: + default: + msg->length = 0; + break; + } + + if (respond) + ctrl_if_send_response(msg); +} + +static void +xennet_driver_status_change(netif_fe_driver_status_changed_t *status) +{ + struct xennet_attach_args xneta; + int i; + + DPRINTFN(XEDB_EVENT, ("> max_handle=%d\n", status->max_handle)); + + /* XXX FIXME: Abuse of 'max_handle' as interface count. 
*/ + netctrl.xc_interfaces = status->max_handle; + netctrl.xc_connected = 0; + + xneta.xa_device = "xennet"; + + for (i = 0; i < netctrl.xc_interfaces; i++) { + xneta.xa_handle = i; + config_found(netctrl.xc_parent, &xneta, netctrl.xc_cfprint); + } +} + +static void +xennet_status_change(netif_fe_interface_status_changed_t *status) +{ + ctrl_msg_t cmsg; + netif_fe_interface_connect_t up; + struct xennet_softc *sc; + struct ifnet *ifp; + struct vm_page *pg_tx, *pg_rx; + + DPRINTFN(XEDB_EVENT, (">\n")); + DPRINTFN(XEDB_EVENT, ("> status=%d handle=%d mac=%02x:%02x:%02x:%02x:%02x:%02x\n", + status->status, + status->handle, + status->mac[0], status->mac[1], status->mac[2], + status->mac[3], status->mac[4], status->mac[5])); + + if (netctrl.xc_interfaces <= 0) { + printf("Status change: no interfaces\n"); + return; + } + + sc = find_device(status->handle); + if (sc == NULL) { + printf("Status change: invalid netif handle %u\n", + status->handle); + return; + } + ifp = &sc->sc_ethercom.ec_if; + + switch (status->status) { + case NETIF_INTERFACE_STATUS_DESTROYED: + printf("Unexpected netif-DESTROYED message in state %d\n", + sc->sc_backend_state); + break; + + case NETIF_INTERFACE_STATUS_DISCONNECTED: +#if 0 + if (sc->sc_backend_state != BEST_CLOSED) { + printk("Unexpected netif-DISCONNECTED message" + " in state %d\n", sc->sc_backend_state); + printk("Attempting to reconnect network interface\n"); + + /* Begin interface recovery. + * + * NB. Whilst we're recovering, we turn the + * carrier state off. We take measures to + * ensure that this device isn't used for + * anything. We also stop the queue for this + * device. Various different approaches + * (e.g. continuing to buffer packets) have + * been tested but don't appear to improve the + * overall impact on TCP connections. 
+ * + * TODO: (MAW) Change the Xend<->Guest + * protocol so that a recovery is initiated by + * a special "RESET" message - disconnect + * could just mean we're not allowed to use + * this interface any more. + */ + + /* Stop old i/f to prevent errors whilst we + * rebuild the state. */ + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + netif_stop_queue(dev); + np->backend_state = BEST_DISCONNECTED; + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); + + /* Free resources. */ + free_irq(np->irq, dev); + unbind_evtchn_from_irq(np->evtchn); + free_page((unsigned long)np->tx); + free_page((unsigned long)np->rx); + } +#endif + + /* Move from CLOSED to DISCONNECTED state. */ + sc->sc_tx = (netif_tx_interface_t *) + uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE); + if (sc->sc_tx == NULL) + panic("netif: no tx va"); + sc->sc_rx = (netif_rx_interface_t *) + uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE); + if (sc->sc_rx == NULL) + panic("netif: no rx va"); + pg_tx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); + if (pg_tx == NULL) { + panic("netif: no tx pages"); + } + pmap_kenter_pa((vaddr_t)sc->sc_tx, VM_PAGE_TO_PHYS(pg_tx), + VM_PROT_READ | VM_PROT_WRITE); + pg_rx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); + if (pg_rx == NULL) { + panic("netif: no rx pages"); + } + pmap_kenter_pa((vaddr_t)sc->sc_rx, VM_PAGE_TO_PHYS(pg_rx), + VM_PROT_READ | VM_PROT_WRITE); + sc->sc_backend_state = BEST_DISCONNECTED; + + /* Construct an interface-CONNECT message for the + * domain controller. */ + cmsg.type = CMSG_NETIF_FE; + cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(netif_fe_interface_connect_t); + up.handle = status->handle; + up.tx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_tx)) >> PAGE_SHIFT; + up.rx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_rx)) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. 
*/ + ctrl_if_send_message_block(&cmsg, NULL, 0, 0); + break; + + case NETIF_INTERFACE_STATUS_CONNECTED: + if (sc->sc_backend_state == BEST_CLOSED) { + printf("Unexpected netif-CONNECTED message" + " in state %d\n", sc->sc_backend_state); + break; + } + + memcpy(sc->sc_enaddr, status->mac, ETHER_ADDR_LEN); +#if 0 + if (xen_start_info.flags & SIF_PRIVILEGED) { + /* XXX for domain-0 change out ethernet address to be + * different than the physical address since arp + * replies from other domains will report the physical + * address. + */ + if (sc->sc_enaddr[0] != 0xaa) + sc->sc_enaddr[0] = 0xaa; + else + sc->sc_enaddr[0] = 0xab; + } +#endif + + /* Recovery procedure: */ + + /* Step 1: Reinitialise variables. */ + sc->sc_rx_resp_cons = sc->sc_tx_resp_cons = /* sc->sc_tx_full = */ 0; + sc->sc_rx->event = sc->sc_tx->event = 1; + + /* Step 2: Rebuild the RX and TX ring contents. */ + network_alloc_rx_buffers(sc); + SLIST_INIT(&sc->sc_tx_bufs); + network_alloc_tx_buffers(sc); + + /* Step 3: All public and private state should now be + * sane. Get ready to start sending and receiving + * packets and give the driver domain a kick because + * we've probably just requeued some packets. 
+ */ + sc->sc_backend_state = BEST_CONNECTED; + __insn_barrier(); + hypervisor_notify_via_evtchn(status->evtchn); + network_tx_buf_gc(sc); + + if_attach(ifp); + ether_ifattach(ifp, sc->sc_enaddr); + + sc->sc_evtchn = status->evtchn; + sc->sc_irq = bind_evtchn_to_irq(sc->sc_evtchn); + event_set_handler(sc->sc_irq, &xen_network_handler, sc, IPL_NET); + hypervisor_enable_irq(sc->sc_irq); + netctrl.xc_connected++; + + aprint_normal("%s: MAC address %s\n", sc->sc_dev.dv_xname, + ether_sprintf(sc->sc_enaddr)); + +#if NRND > 0 + rnd_attach_source(&sc->rnd_source, sc->sc_dev.dv_xname, + RND_TYPE_NET, 0); +#endif + break; + + default: + printf("Status change to unknown value %d\n", + status->status); + break; + } +} + +static void +xennet_tx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg) +{ + struct xennet_txbuf *txbuf = (struct xennet_txbuf *)arg; + + DPRINTFN(XEDB_MBUF, ("xennet_tx_mbuf_free %p pa %p\n", txbuf, + (void *)txbuf->xt_pa)); + SLIST_INSERT_HEAD(&txbuf->xt_sc->sc_tx_bufs, txbuf, xt_next); + pool_cache_put(&mbpool_cache, m); +} + +static void +xennet_rx_push_buffer(struct xennet_softc *sc, int id) +{ + NETIF_RING_IDX ringidx; + int nr_pfns; + + ringidx = sc->sc_rx->req_prod; + nr_pfns = 0; + + DPRINTFN(XEDB_MEM, ("readding page va %p pa %p ma %p/%p to rx_ring " + "at %d with id %d\n", + (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_va, + (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_pa, + (void *)(PTE_BASE[x86_btop + (sc->sc_rx_bufa[id].xb_rx.xbrx_va)] & + PG_FRAME), + (void *)xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa), + ringidx, id)); + + sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id; + + rx_pfn_array[nr_pfns] = xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa) + >> PAGE_SHIFT; + + /* Remove this page from pseudo phys map before + * passing back to Xen. 
*/ + xpmap_phys_to_machine_mapping[(sc->sc_rx_bufa[id].xb_rx.xbrx_pa - XPMAP_OFFSET) >> PAGE_SHIFT] = + INVALID_P2M_ENTRY; + + rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; + rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[1] = 0; + rx_mcl[nr_pfns].args[2] = 0; + + nr_pfns++; + + sc->sc_rx_bufs_to_notify++; + + ringidx++; + + /* + * We may have allocated buffers which have entries + * outstanding in the page update queue -- make sure we flush + * those first! + */ + xpq_flush_queue(); + + /* After all PTEs have been zapped we blow away stale TLB entries. */ + rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages. */ + rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op; + rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation; + rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array; + rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns; + rx_mcl[nr_pfns].args[3] = 0; + rx_mcl[nr_pfns].args[4] = DOMID_SELF; + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). */ + if ( rx_mcl[nr_pfns].args[5] != nr_pfns ) + panic("Unable to reduce memory reservation\n"); + + /* Above is a suitable barrier to ensure backend will see requests. 
*/ + sc->sc_rx->req_prod = ringidx; +} + +static void +xennet_rx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg) +{ + union xennet_bufarray *xb = (union xennet_bufarray *)arg; + struct xennet_softc *sc = xb->xb_rx.xbrx_sc; + int id = (xb - sc->sc_rx_bufa); + + DPRINTFN(XEDB_MBUF, ("xennet_rx_mbuf_free id %d, mbuf %p, buf %p, " + "size %d\n", id, m, buf, size)); + + xennet_rx_push_buffer(sc, id); + + pool_cache_put(&mbpool_cache, m); +} + +static int +xen_network_handler(void *arg) +{ + struct xennet_softc *sc = arg; + struct ifnet *ifp = &sc->sc_ethercom.ec_if; + netif_rx_response_t *rx; + paddr_t pa; + NETIF_RING_IDX ringidx; + mmu_update_t *mmu = rx_mmu; + multicall_entry_t *mcl = rx_mcl; + struct mbuf *m; + + network_tx_buf_gc(sc); + + again: + for (ringidx = sc->sc_rx_resp_cons; + ringidx != sc->sc_rx->resp_prod; + ringidx++) { + rx = &sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].resp; + + if (rx->status < 0) + panic("rx->status < 0"); + /* XXXcl check rx->status for error */ + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + printf("xennet: rx no mbuf\n"); + break; + } + + pa = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_pa; + + DPRINTFN(XEDB_EVENT, ("rx event %d for id %d, size %d, " + "status %d, ma %08lx, pa %08lx\n", ringidx, + rx->id, rx->status, rx->status, rx->addr, pa)); + + /* Remap the page. */ + mmu->ptr = (rx->addr & PG_FRAME) | MMU_MACHPHYS_UPDATE; + mmu->val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT; + mmu++; + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT; + mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW; + mcl->args[2] = UVMF_FLUSH_TLB; // 0; + mcl++; + + xpmap_phys_to_machine_mapping + [(pa - XPMAP_OFFSET) >> PAGE_SHIFT] = + rx->addr >> PAGE_SHIFT; + + /* Do all the remapping work, and M->P updates, in one + * big hypercall. 
*/ + if ((mcl - rx_mcl) != 0) { + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl++; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + } + if (0) + printf("page mapped at va %08lx -> %08x/%08lx\n", + sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va, + PTE_BASE[x86_btop(sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)], + rx->addr); + mmu = rx_mmu; + mcl = rx_mcl; + + DPRINTFN(XEDB_MBUF, ("rx packet mbuf %p va %p pa %p/%p " + "ma %p\n", m, + (void *)sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va, + (void *)(xpmap_mtop(PTE_BASE[x86_btop + (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME)), (void *)pa, + (void *)(PTE_BASE[x86_btop + (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME))); + + m->m_len = m->m_pkthdr.len = rx->status; + m->m_pkthdr.rcvif = ifp; + if (sc->sc_rx->req_prod != sc->sc_rx->resp_prod) { + MEXTADD(m, (void *)(sc->sc_rx_bufa[rx->id].xb_rx. + xbrx_va + (rx->addr & PAGE_MASK)), rx->status, M_DEVBUF, + xennet_rx_mbuf_free, + &sc->sc_rx_bufa[rx->id]); + } else { + /* + * This was our last receive buffer, allocate + * memory, copy data and push the receive + * buffer back to the hypervisor. + */ + MEXTMALLOC(m, rx->status, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + printf("xennet: rx no mbuf 2\n"); + m_free(m); + break; + } + memcpy(m->m_data, (void *)(sc->sc_rx_bufa[rx->id]. + xb_rx.xbrx_va + (rx->addr & PAGE_MASK)), rx->status); + xennet_rx_push_buffer(sc, rx->id); + } + +#ifdef XENNET_DEBUG_DUMP + xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "r", rx->id); +#endif + +#if NBPFILTER > 0 + /* + * Pass packet to bpf if there is a listener. + */ + if (ifp->if_bpf) + bpf_mtap(ifp->if_bpf, m); +#endif + + ifp->if_ipackets++; + + /* Pass the packet up. 
*/ + (*ifp->if_input)(ifp, m); + } + + sc->sc_rx_resp_cons = ringidx; + sc->sc_rx->event = sc->sc_rx_resp_cons + 1; + + if (sc->sc_rx->resp_prod != ringidx) + goto again; + + return 0; +} + +static inline int +get_bufarray_entry(union xennet_bufarray *a) +{ + int idx; + + idx = a[0].xb_next; + a[0].xb_next = a[idx].xb_next; + return idx; +} + +static inline void +put_bufarray_entry(union xennet_bufarray *a, int idx) +{ + + a[idx].xb_next = a[0].xb_next; + a[0].xb_next = idx; +} + +static void +network_tx_buf_gc(struct xennet_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ethercom.ec_if; + NETIF_RING_IDX idx, prod; + + do { + prod = sc->sc_tx->resp_prod; + + for (idx = sc->sc_tx_resp_cons; idx != prod; idx++) { + DPRINTFN(XEDB_EVENT, ("tx event at pos %d, status: " + "%d, id: %d, mbuf %p, buf %p\n", idx, + sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.status, + sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id, + sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m, + mtod(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m, void *))); + m_freem(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m); + put_bufarray_entry(sc->sc_tx_bufa, + sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id); + sc->sc_tx_entries--; /* atomic */ + } + + sc->sc_tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of + * tx_cons. 
+ */ + sc->sc_tx->event = /* atomic */ + prod + (sc->sc_tx_entries >> 1) + 1; + __insn_barrier(); + } while (prod != sc->sc_tx->resp_prod); + + if (sc->sc_tx->resp_prod == sc->sc_tx->req_prod) + ifp->if_timer = 0; + /* KDASSERT(sc->sc_net_idx->tx_req_prod == */ + /* TX_RING_ADD(sc->sc_net_idx->tx_resp_prod, sc->sc_tx_entries)); */ +} + +static void +network_alloc_rx_buffers(struct xennet_softc *sc) +{ + vaddr_t rxpages, va; + paddr_t pa; + struct vm_page *pg; + int id, nr_pfns; + NETIF_RING_IDX ringidx; + int s; + + ringidx = sc->sc_rx->req_prod; + if (0) printf("network_alloc_rx_buffers prod %d cons %d\n", ringidx, + sc->sc_rx_resp_cons); + if ((ringidx - sc->sc_rx_resp_cons) > (RX_MAX_ENTRIES / 2)) + return; + + nr_pfns = 0; + + rxpages = uvm_km_valloc_align(kernel_map, RX_ENTRIES * PAGE_SIZE, + PAGE_SIZE); + + s = splnet(); + for (va = rxpages; va < rxpages + RX_ENTRIES * PAGE_SIZE; + va += PAGE_SIZE) { + pg = uvm_pagealloc(NULL, 0, NULL, 0); + if (pg == NULL) + panic("network_alloc_rx_buffers: no pages"); + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), + VM_PROT_READ | VM_PROT_WRITE); + + id = get_bufarray_entry(sc->sc_rx_bufa); + sc->sc_rx_bufa[id].xb_rx.xbrx_va = va; + sc->sc_rx_bufa[id].xb_rx.xbrx_sc = sc; + + pa = VM_PAGE_TO_PHYS(pg); + DPRINTFN(XEDB_MEM, ("adding page va %p pa %p/%p " + "ma %p/%p to rx_ring at %d with id %d\n", (void *)va, + (void *)(VM_PAGE_TO_PHYS(pg) & PG_FRAME), (void *)xpmap_mtop(PTE_BASE[x86_btop(va)]), + (void *)(PTE_BASE[x86_btop(va)] & PG_FRAME), + (void *)xpmap_ptom(VM_PAGE_TO_PHYS(pg)), + ringidx, id)); + sc->sc_rx_bufa[id].xb_rx.xbrx_pa = pa; + sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id; + + rx_pfn_array[nr_pfns] = xpmap_ptom(pa) >> PAGE_SHIFT; + + /* Remove this page from pseudo phys map before + * passing back to Xen. 
*/ + xpmap_phys_to_machine_mapping[(pa - XPMAP_OFFSET) >> PAGE_SHIFT] = + INVALID_P2M_ENTRY; + + rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; + rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[1] = 0; + rx_mcl[nr_pfns].args[2] = 0; + + nr_pfns++; + + sc->sc_rx_bufs_to_notify++; + + ringidx++; + if ((ringidx - sc->sc_rx_resp_cons) == RX_MAX_ENTRIES) + break; + } + + if (nr_pfns == 0) { + splx(s); + return; + } + + /* + * We may have allocated buffers which have entries + * outstanding in the page update queue -- make sure we flush + * those first! + */ + xpq_flush_queue(); + + /* After all PTEs have been zapped we blow away stale TLB entries. */ + rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages. */ + rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op; + rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation; + rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array; + rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns; + rx_mcl[nr_pfns].args[3] = 0; + rx_mcl[nr_pfns].args[4] = DOMID_SELF; + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). */ + if (rx_mcl[nr_pfns].args[5] != nr_pfns) + panic("Unable to reduce memory reservation\n"); + + /* Above is a suitable barrier to ensure backend will see requests. 
*/ + sc->sc_rx->req_prod = ringidx; + + splx(s); + +} + +static void +network_alloc_tx_buffers(struct xennet_softc *sc) +{ + vaddr_t txpages, va; + struct vm_page *pg; + struct xennet_txbuf *txbuf; + int i; + + txpages = uvm_km_valloc_align(kernel_map, + (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE, PAGE_SIZE); + for (va = txpages; + va < txpages + (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE; + va += PAGE_SIZE) { + pg = uvm_pagealloc(NULL, 0, NULL, 0); + if (pg == NULL) + panic("network_alloc_tx_buffers: no pages"); + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), + VM_PROT_READ | VM_PROT_WRITE); + + for (i = 0; i < TXBUF_PER_PAGE; i++) { + txbuf = (struct xennet_txbuf *) + (va + i * (PAGE_SIZE / TXBUF_PER_PAGE)); + txbuf->xt_sc = sc; + txbuf->xt_pa = VM_PAGE_TO_PHYS(pg) + + i * (PAGE_SIZE / TXBUF_PER_PAGE) + + sizeof(struct xennet_txbuf); + SLIST_INSERT_HEAD(&sc->sc_tx_bufs, txbuf, xt_next); + } + } +} + +/* + * Called at splnet. + */ +void +xennet_start(struct ifnet *ifp) +{ + struct xennet_softc *sc = ifp->if_softc; + struct mbuf *m, *new_m; + struct xennet_txbuf *txbuf; + netif_tx_request_t *txreq; + NETIF_RING_IDX idx; + paddr_t pa; + int bufid; + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start()\n", sc->sc_dev.dv_xname)); + +#ifdef DIAGNOSTIC + IFQ_POLL(&ifp->if_snd, m); + if (m == 0) + panic("%s: No packet to start", sc->sc_dev.dv_xname); +#endif + + if ((ifp->if_flags & (IFF_RUNNING | IFF_OACTIVE)) != IFF_RUNNING) + return; + + idx = sc->sc_tx->req_prod; + while (/*CONSTCOND*/1) { + + IFQ_POLL(&ifp->if_snd, m); + if (m == NULL) + break; + + switch (m->m_flags & (M_EXT|M_EXT_CLUSTER)) { + case M_EXT|M_EXT_CLUSTER: + pa = m->m_ext.ext_paddr + + (m->m_data - m->m_ext.ext_buf); + break; + default: + case 0: + pa = m->m_paddr + M_BUFOFFSET(m) + + (m->m_data - M_BUFADDR(m)); + break; + } + + if (m->m_pkthdr.len != m->m_len || + (pa ^ (pa + m->m_pkthdr.len)) & PG_FRAME) { + txbuf = SLIST_FIRST(&sc->sc_tx_bufs); + if (txbuf == NULL) { + // printf("xennet: no tx bufs\n"); + break; + 
} + + MGETHDR(new_m, M_DONTWAIT, MT_DATA); + if (new_m == NULL) { + printf("xennet: no mbuf\n"); + break; + } + + SLIST_REMOVE_HEAD(&sc->sc_tx_bufs, xt_next); + IFQ_DEQUEUE(&ifp->if_snd, m); + + KASSERT(m->m_flags & M_PKTHDR); + M_COPY_PKTHDR(new_m, m); + m_copydata(m, 0, m->m_pkthdr.len, txbuf->xt_buf); + MEXTADD(new_m, txbuf->xt_buf, m->m_pkthdr.len, + M_DEVBUF, xennet_tx_mbuf_free, txbuf); + new_m->m_ext.ext_paddr = txbuf->xt_pa; + new_m->m_len = new_m->m_pkthdr.len = m->m_pkthdr.len; + + m_freem(m); + m = new_m; + + pa = m->m_ext.ext_paddr + + (m->m_data - m->m_ext.ext_buf); + } else + IFQ_DEQUEUE(&ifp->if_snd, m); + + bufid = get_bufarray_entry(sc->sc_tx_bufa); + sc->sc_tx_bufa[bufid].xb_tx.xbtx_m = m; + + DPRINTFN(XEDB_MBUF, ("xennet_start id %d, mbuf %p, buf %p/%p, " + "size %d\n", bufid, m, mtod(m, void *), + (void *)pa, m->m_pkthdr.len)); +#ifdef XENNET_DEBUG_DUMP + xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "s", bufid); +#endif + + txreq = &sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].req; + txreq->id = bufid; + txreq->addr = xpmap_ptom(pa); + txreq->size = m->m_pkthdr.len; + + __insn_barrier(); + idx++; + sc->sc_tx->req_prod = idx; + + sc->sc_tx_entries++; /* XXX atomic */ + +#ifdef XENNET_DEBUG + DPRINTFN(XEDB_MEM, ("packet addr %p/%p, physical %p/%p, " + "m_paddr %p, len %d/%d\n", M_BUFADDR(m), mtod(m, void *), + (void *)*kvtopte(mtod(m, vaddr_t)), + (void *)xpmap_mtop(*kvtopte(mtod(m, vaddr_t))), + (void *)m->m_paddr, m->m_pkthdr.len, m->m_len)); +#endif + +#if NBPFILTER > 0 + /* + * Pass packet to bpf if there is a listener. 
+ */ + if (ifp->if_bpf) + bpf_mtap(ifp->if_bpf, m); +#endif + } + + ifp->if_flags &= ~IFF_OACTIVE; + + network_tx_buf_gc(sc); + + __insn_barrier(); + if (sc->sc_tx->resp_prod != idx) + hypervisor_notify_via_evtchn(sc->sc_evtchn); + + ifp->if_timer = 5; + + ifp->if_opackets++; + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start() done\n", + sc->sc_dev.dv_xname)); +} + +int +xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct xennet_softc *sc = ifp->if_softc; + struct ifaddr *ifa = (struct ifaddr *)data; +#ifdef mediacode + struct ifreq *ifr = (struct ifreq *)data; +#endif + int s, error = 0; + + s = splnet(); + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl()\n", sc->sc_dev.dv_xname)); + + switch(cmd) { + case SIOCSIFADDR: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFADDR\n", + sc->sc_dev.dv_xname)); + ifp->if_flags |= IFF_UP; + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + xennet_init(sc); + arp_ifinit(ifp, ifa); + break; +#endif + default: + xennet_init(sc); + break; + } + break; + + case SIOCSIFFLAGS: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFFLAGS\n", + sc->sc_dev.dv_xname)); + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*MULTI\n", + sc->sc_dev.dv_xname)); + break; + +#ifdef mediacode + case SIOCGIFMEDIA: + case SIOCSIFMEDIA: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*IFMEDIA\n", + sc->sc_dev.dv_xname)); + error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); + break; +#endif + + default: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl(0x%lx) unknown cmd\n", + sc->sc_dev.dv_xname, cmd)); + error = EINVAL; + break; + } + + splx(s); + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() returning %d\n", + sc->sc_dev.dv_xname, error)); + + return error; +} + +void +xennet_watchdog(struct ifnet *ifp) +{ + + panic("xennet_watchdog\n"); +} + +void +xennet_init(struct xennet_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ethercom.ec_if; + + DPRINTFN(XEDB_FOLLOW, ("%s: 
xennet_init()\n", sc->sc_dev.dv_xname)); + + if (ifp->if_flags & IFF_UP) { + if ((ifp->if_flags & IFF_RUNNING) == 0) + xennet_reset(sc); + + ifp->if_flags |= IFF_RUNNING; + ifp->if_flags &= ~IFF_OACTIVE; + ifp->if_timer = 0; + } else { + ifp->if_flags &= ~IFF_RUNNING; + xennet_reset(sc); + } +} + +void +xennet_reset(struct xennet_softc *sc) +{ + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_reset()\n", sc->sc_dev.dv_xname)); +} + +#ifdef mediacode +/* + * Media change callback. + */ +static int +xennet_mediachange(struct ifnet *ifp) +{ + struct xennet_softc *sc = ifp->if_softc; + + switch IFM_SUBTYPE(sc->sc_media.ifm_media) { + case IFM_AUTO: + break; + default: + return (1); + break; + } + + return (0); +} + +/* + * Media status callback. + */ +static void +xennet_mediastatus(struct ifnet *ifp, struct ifmediareq *ifmr) +{ + struct xennet_softc *sc = ifp->if_softc; + + if (IFM_SUBTYPE(ifmr->ifm_active) == IFM_AUTO) + ifmr->ifm_active = sc->sc_media.ifm_cur->ifm_data; + + ifmr->ifm_status &= ~IFM_AVALID; +} +#endif + +int +xennet_bootstatic_callback(struct nfs_diskless *nd) +{ + struct ifnet *ifp = nd->nd_ifp; + struct xennet_softc *sc = (struct xennet_softc *)ifp->if_softc; + union xen_cmdline_parseinfo xcp; + struct sockaddr_in *sin; + + memset(&xcp, 0, sizeof(xcp.xcp_netinfo)); + xcp.xcp_netinfo.xi_ifno = sc->sc_ifno; + xcp.xcp_netinfo.xi_root = nd->nd_root.ndm_host; + xen_parse_cmdline(XEN_PARSE_NETINFO, &xcp); + + nd->nd_myip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[0]); + nd->nd_gwip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[2]); + nd->nd_mask.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[3]); + + sin = (struct sockaddr_in *) &nd->nd_root.ndm_saddr; + memset((caddr_t)sin, 0, sizeof(*sin)); + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[1]); + + return (NFS_BOOTSTATIC_HAS_MYIP|NFS_BOOTSTATIC_HAS_GWIP| + NFS_BOOTSTATIC_HAS_MASK|NFS_BOOTSTATIC_HAS_SERVADDR| + NFS_BOOTSTATIC_HAS_SERVER); +} + + +#ifdef XENNET_DEBUG_DUMP 
+#define XCHR(x) "0123456789abcdef"[(x) & 0xf] +static void +xennet_hex_dump(unsigned char *pkt, size_t len, char *type, int id) +{ + size_t i, j; + + printf("pkt %p len %d/%x type %s id %d\n", pkt, len, len, type, id); + printf("00000000 "); + for(i=0; i<len; i++) { + printf("%c%c ", XCHR(pkt[i]>>4), XCHR(pkt[i])); + if ((i+1) % 16 == 8) + printf(" "); + if ((i+1) % 16 == 0) { + printf(" %c", '|'); + for(j=0; j<16; j++) + printf("%c", pkt[i-15+j]>=32 && + pkt[i-15+j]<127?pkt[i-15+j]:'.'); + printf("%c\n%c%c%c%c%c%c%c%c ", '|', + XCHR((i+1)>>28), XCHR((i+1)>>24), + XCHR((i+1)>>20), XCHR((i+1)>>16), + XCHR((i+1)>>12), XCHR((i+1)>>8), + XCHR((i+1)>>4), XCHR(i+1)); + } + } + printf("\n"); +} +#undef XCHR +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c new file mode 100644 index 0000000000..b72ffc95a1 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c @@ -0,0 +1,1368 @@ +/* $NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $"); + +#include "xbd.h" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/pool.h> +#include <sys/ioctl.h> +#include <sys/device.h> +#include <sys/disk.h> +#include <sys/disklabel.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/conf.h> +#include <sys/queue.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/kernel.h> +#include <sys/kthread.h> + +#include <uvm/uvm.h> + +#include <dev/dkvar.h> +#include <machine/xbdvar.h> +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs/hypervisor-if.h> +#include <machine/hypervisor-ifs/vbd.h> +#include <machine/evtchn.h> + + +static void xbd_attach(struct device *, struct device *, void *); +static int xbd_detach(struct device *, int); + +#if NXBD > 0 +int xbd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(xbd, sizeof(struct xbd_softc), + xbd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver xbd_cd; +#endif + +#if NWD > 0 +int 
xbd_wd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(wd, sizeof(struct xbd_softc), + xbd_wd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver wd_cd; +#endif + +#if NSD > 0 +int xbd_sd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(sd, sizeof(struct xbd_softc), + xbd_sd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver sd_cd; +#endif + +#if NCD > 0 +int xbd_cd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(cd, sizeof(struct xbd_softc), + xbd_cd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver cd_cd; +#endif + + +dev_type_open(xbdopen); +dev_type_close(xbdclose); +dev_type_read(xbdread); +dev_type_write(xbdwrite); +dev_type_ioctl(xbdioctl); +dev_type_ioctl(xbdioctl_cdev); +dev_type_strategy(xbdstrategy); +dev_type_dump(xbddump); +dev_type_size(xbdsize); + +#if NXBD > 0 +const struct bdevsw xbd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct cdevsw xbd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_major; +#endif + +#if NWD > 0 +const struct bdevsw wd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct cdevsw wd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_wd_major; +static dev_t xbd_wd_cdev_major; +#endif + +#if NSD > 0 +const struct bdevsw sd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct cdevsw sd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_sd_major; +static dev_t xbd_sd_cdev_major; +#endif + +#if NCD > 0 +const struct bdevsw cd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct 
cdevsw cd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_cd_major; +static dev_t xbd_cd_cdev_major; +#endif + + +static int xbdstart(struct dk_softc *, struct buf *); +static int xbd_response_handler(void *); +static void xbd_update_create_kthread(void *); +static void xbd_update_kthread(void *); +static int xbd_update_handler(void *); + +static int xbdinit(struct xbd_softc *, xen_disk_t *, struct dk_intf *); + +/* Pseudo-disk Interface */ +static struct dk_intf dkintf_esdi = { + DTYPE_ESDI, + "Xen Virtual ESDI", + xbdopen, + xbdclose, + xbdstrategy, + xbdstart, +}; +#if NSD > 0 +static struct dk_intf dkintf_scsi = { + DTYPE_SCSI, + "Xen Virtual SCSI", + xbdopen, + xbdclose, + xbdstrategy, + xbdstart, +}; +#endif + +#if NXBD > 0 +static struct xbd_attach_args xbd_ata = { + .xa_device = "xbd", + .xa_dkintf = &dkintf_esdi, +}; +#endif + +#if NWD > 0 +static struct xbd_attach_args wd_ata = { + .xa_device = "wd", + .xa_dkintf = &dkintf_esdi, +}; +#endif + +#if NSD > 0 +static struct xbd_attach_args sd_ata = { + .xa_device = "sd", + .xa_dkintf = &dkintf_scsi, +}; +#endif + +#if NCD > 0 +static struct xbd_attach_args cd_ata = { + .xa_device = "cd", + .xa_dkintf = &dkintf_esdi, +}; +#endif + +static struct sysctlnode *diskcookies; + + +#if defined(XBDDEBUG) && !defined(DEBUG) +#define DEBUG +#endif + +#ifdef DEBUG +int xbddebug = 0; + +#define XBDB_FOLLOW 0x1 +#define XBDB_IO 0x2 +#define XBDB_SETUP 0x4 +#define XBDB_HOTPLUG 0x8 + +#define IFDEBUG(x,y) if (xbddebug & (x)) y +#define DPRINTF(x,y) IFDEBUG(x, printf y) +#define DPRINTF_FOLLOW(y) DPRINTF(XBDB_FOLLOW, y) +#define DEBUG_MARK_UNUSED(_xr) (_xr)->xr_sc = (void *)0xdeadbeef + +struct xbdreq *xbd_allxr; +#else +#define IFDEBUG(x,y) +#define DPRINTF(x,y) +#define DPRINTF_FOLLOW(y) +#define DEBUG_MARK_UNUSED(_xr) +#endif + +#ifdef DIAGNOSTIC +#define DIAGPANIC(x) panic x +#define DIAGCONDPANIC(x,y) if (x) panic y +#else 
+#define DIAGPANIC(x) +#define DIAGCONDPANIC(x,y) +#endif + + +struct xbdreq { + union { + SLIST_ENTRY(xbdreq) _unused; /* ptr. to next free xbdreq */ + SIMPLEQ_ENTRY(xbdreq) _suspended; + /* link when on suspended queue. */ + } _link; + struct xbdreq *xr_parent; /* ptr. to parent xbdreq */ + struct buf *xr_bp; /* ptr. to original I/O buf */ + daddr_t xr_bn; /* block no. to process */ + long xr_bqueue; /* bytes left to queue */ + long xr_bdone; /* bytes left */ + vaddr_t xr_data; /* ptr. to data to be proc. */ + vaddr_t xr_aligned; /* ptr. to aligned data */ + long xr_breq; /* bytes in this req. */ + struct xbd_softc *xr_sc; /* ptr. to xbd softc */ +}; +#define xr_unused _link._unused +#define xr_suspended _link._suspended + +SLIST_HEAD(,xbdreq) xbdreqs = + SLIST_HEAD_INITIALIZER(xbdreqs); +static SIMPLEQ_HEAD(, xbdreq) xbdr_suspended = + SIMPLEQ_HEAD_INITIALIZER(xbdr_suspended); + +#define CANGET_XBDREQ() (!SLIST_EMPTY(&xbdreqs)) + +#define GET_XBDREQ(_xr) do { \ + (_xr) = SLIST_FIRST(&xbdreqs); \ + if (__predict_true(_xr)) \ + SLIST_REMOVE_HEAD(&xbdreqs, xr_unused); \ +} while (/*CONSTCOND*/0) + +#define PUT_XBDREQ(_xr) do { \ + DEBUG_MARK_UNUSED(_xr); \ + SLIST_INSERT_HEAD(&xbdreqs, _xr, xr_unused); \ +} while (/*CONSTCOND*/0) + +static struct bufq_state bufq; +static int bufq_users = 0; + +#define XEN_MAJOR(_dev) ((_dev) >> 8) +#define XEN_MINOR(_dev) ((_dev) & 0xff) + +#define XEN_SCSI_DISK0_MAJOR 8 +#define XEN_SCSI_DISK1_MAJOR 65 +#define XEN_SCSI_DISK2_MAJOR 66 +#define XEN_SCSI_DISK3_MAJOR 67 +#define XEN_SCSI_DISK4_MAJOR 68 +#define XEN_SCSI_DISK5_MAJOR 69 +#define XEN_SCSI_DISK6_MAJOR 70 +#define XEN_SCSI_DISK7_MAJOR 71 +#define XEN_SCSI_DISK8_MAJOR 128 +#define XEN_SCSI_DISK9_MAJOR 129 +#define XEN_SCSI_DISK10_MAJOR 130 +#define XEN_SCSI_DISK11_MAJOR 131 +#define XEN_SCSI_DISK12_MAJOR 132 +#define XEN_SCSI_DISK13_MAJOR 133 +#define XEN_SCSI_DISK14_MAJOR 134 +#define XEN_SCSI_DISK15_MAJOR 135 +#define XEN_SCSI_CDROM_MAJOR 11 + +#define XEN_IDE0_MAJOR 3 
+#define XEN_IDE1_MAJOR 22 +#define XEN_IDE2_MAJOR 33 +#define XEN_IDE3_MAJOR 34 +#define XEN_IDE4_MAJOR 56 +#define XEN_IDE5_MAJOR 57 +#define XEN_IDE6_MAJOR 88 +#define XEN_IDE7_MAJOR 89 +#define XEN_IDE8_MAJOR 90 +#define XEN_IDE9_MAJOR 91 + +#define XEN_BSHIFT 9 /* log2(XEN_BSIZE) */ +#define XEN_BSIZE (1 << XEN_BSHIFT) + +#define MAX_VBDS 64 +static int nr_vbds; +static xen_disk_t *vbd_info; + +static blk_ring_t *blk_ring = NULL; +static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */ +static BLK_RING_IDX req_prod; /* Private request producer. */ +static BLK_RING_IDX last_req_prod; /* Request producer at last trap. */ + +#define STATE_ACTIVE 0 +#define STATE_SUSPENDED 1 +#define STATE_CLOSED 2 +static unsigned int state = STATE_SUSPENDED; + + +#define XBDUNIT(x) DISKUNIT(x) +#define GETXBD_SOFTC(_xs, x) if (!((_xs) = getxbd_softc(x))) return ENXIO +#define GETXBD_SOFTC_CDEV(_xs, x) do { \ + dev_t bx = devsw_chr2blk((x)); \ + if (bx == NODEV) \ + return ENXIO; \ + if (!((_xs) = getxbd_softc(bx))) \ + return ENXIO; \ +} while (/*CONSTCOND*/0) + +static struct xbd_softc * +getxbd_softc(dev_t dev) +{ + int unit = XBDUNIT(dev); + + DPRINTF_FOLLOW(("getxbd_softc(0x%x): major = %d unit = %d\n", dev, + major(dev), unit)); +#if NXBD > 0 + if (major(dev) == xbd_major) + return device_lookup(&xbd_cd, unit); +#endif +#if NWD > 0 + if (major(dev) == xbd_wd_major || major(dev) == xbd_wd_cdev_major) + return device_lookup(&wd_cd, unit); +#endif +#if NSD > 0 + if (major(dev) == xbd_sd_major || major(dev) == xbd_sd_cdev_major) + return device_lookup(&sd_cd, unit); +#endif +#if NCD > 0 + if (major(dev) == xbd_cd_major || major(dev) == xbd_cd_cdev_major) + return device_lookup(&cd_cd, unit); +#endif + return NULL; +} + +static int +get_vbd_info(xen_disk_t *disk_info) +{ + int err; + block_io_op_t op; + + /* Probe for disk information. 
*/ + memset(&op, 0, sizeof(op)); + op.cmd = BLOCK_IO_OP_VBD_PROBE; + op.u.probe_params.domain = 0; + op.u.probe_params.xdi.max = MAX_VBDS; + op.u.probe_params.xdi.disks = disk_info; + op.u.probe_params.xdi.count = 0; + + err = HYPERVISOR_block_io_op(&op); + if (err) { + printf("WARNING: Could not probe disks (%d)\n", err); + DIAGPANIC(("get_vbd_info: Could not probe disks (%d)", err)); + return -1; + } + + return op.u.probe_params.xdi.count; +} + +static void +reset_interface(void) +{ + block_io_op_t op; + + op.cmd = BLOCK_IO_OP_RESET; + if (HYPERVISOR_block_io_op(&op) != 0) + printf("xbd: Possible blkdev trouble: couldn't reset ring\n"); +} + +static void +init_interface(void) +{ + block_io_op_t op; + + reset_interface(); + + if (blk_ring == NULL) { + op.cmd = BLOCK_IO_OP_RING_ADDRESS; + (void)HYPERVISOR_block_io_op(&op); + + blk_ring = (blk_ring_t *)uvm_km_valloc_align(kernel_map, + PAGE_SIZE, PAGE_SIZE); + pmap_kenter_ma((vaddr_t)blk_ring, op.u.ring_mfn << PAGE_SHIFT, + VM_PROT_READ|VM_PROT_WRITE); + DPRINTF(XBDB_SETUP, ("init_interface: " + "ring va %p and wired to %p\n", + blk_ring, (void *)(op.u.ring_mfn << PAGE_SHIFT))); + + blk_ring->req_prod = blk_ring->resp_prod = + resp_cons = req_prod = last_req_prod = 0; + + event_set_handler(_EVENT_BLKDEV, &xbd_response_handler, + NULL, IPL_BIO); + hypervisor_enable_event(_EVENT_BLKDEV); + } + + __insn_barrier(); + state = STATE_ACTIVE; +} + +static void +enable_update_events(struct device *self) +{ + + kthread_create(xbd_update_create_kthread, self); + event_set_handler(_EVENT_VBD_UPD, &xbd_update_handler, self, IPL_BIO); + hypervisor_enable_event(_EVENT_VBD_UPD); +} + +static void +signal_requests_to_xen(void) +{ + block_io_op_t op; + + DPRINTF(XBDB_IO, ("signal_requests_to_xen: %d -> %d\n", + blk_ring->req_prod, MASK_BLK_IDX(req_prod))); + blk_ring->req_prod = MASK_BLK_IDX(req_prod); + last_req_prod = req_prod; + + op.cmd = BLOCK_IO_OP_SIGNAL; + HYPERVISOR_block_io_op(&op); + return; +} + +static void 
+setup_sysctl(void) +{ + struct sysctlnode *pnode; + + sysctl_createv(NULL, 0, NULL, NULL, + 0, + CTLTYPE_NODE, "machdep", NULL, + NULL, 0, NULL, 0, + CTL_MACHDEP, CTL_EOL); + + sysctl_createv(NULL, 0, NULL, &pnode, + 0, + CTLTYPE_NODE, "domain0", NULL, + NULL, 0, NULL, 0, + CTL_MACHDEP, CTL_CREATE, CTL_EOL); + + if (pnode == NULL) + return; + + sysctl_createv(NULL, 0, &pnode, &pnode, + 0, + CTLTYPE_NODE, "diskcookie", NULL, + NULL, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + + if (pnode) + diskcookies = pnode; +} + +static struct xbd_attach_args * +get_xbda(xen_disk_t *xd) +{ + + switch (XEN_MAJOR(xd->device)) { +#if NSD > 0 + case XEN_SCSI_DISK0_MAJOR: + case XEN_SCSI_DISK1_MAJOR ... XEN_SCSI_DISK7_MAJOR: + case XEN_SCSI_DISK8_MAJOR ... XEN_SCSI_DISK15_MAJOR: + if (xd->capacity == 0) + return NULL; + return &sd_ata; + case XEN_SCSI_CDROM_MAJOR: + return &cd_ata; +#endif +#if NWD > 0 + case XEN_IDE0_MAJOR: + case XEN_IDE1_MAJOR: + case XEN_IDE2_MAJOR: + case XEN_IDE3_MAJOR: + case XEN_IDE4_MAJOR: + case XEN_IDE5_MAJOR: + case XEN_IDE6_MAJOR: + case XEN_IDE7_MAJOR: + case XEN_IDE8_MAJOR: + case XEN_IDE9_MAJOR: + switch (XD_TYPE(xd->info)) { + case XD_TYPE_CDROM: + return &cd_ata; + case XD_TYPE_DISK: + if (xd->capacity == 0) + return NULL; + return &wd_ata; + default: + return NULL; + } + break; +#endif + default: + if (xd->capacity == 0) + return NULL; + return &xbd_ata; + } + return NULL; +} + +int +xbd_scan(struct device *self, struct xbd_attach_args *mainbus_xbda, + cfprint_t print) +{ + struct xbdreq *xr; + struct xbd_attach_args *xbda; + xen_disk_t *xd; + int i; + + init_interface(); + if (xen_start_info.flags & SIF_PRIVILEGED) + setup_sysctl(); + +#if NXBD > 0 + xbd_major = devsw_name2blk("xbd", NULL, 0); +#endif +#if NWD > 0 + xbd_wd_major = devsw_name2blk("wd", NULL, 0); + /* XXX Also handle the cdev majors since stuff like + * read_sector calls strategy on the cdev. This only works if + * all the majors we care about are different. 
+ */ + xbd_wd_cdev_major = major(devsw_blk2chr(makedev(xbd_wd_major, 0))); +#endif +#if NSD > 0 + xbd_sd_major = devsw_name2blk("sd", NULL, 0); + xbd_sd_cdev_major = major(devsw_blk2chr(makedev(xbd_sd_major, 0))); +#endif +#if NCD > 0 + xbd_cd_major = devsw_name2blk("cd", NULL, 0); + xbd_cd_cdev_major = major(devsw_blk2chr(makedev(xbd_cd_major, 0))); +#endif + + MALLOC(xr, struct xbdreq *, BLK_RING_SIZE * sizeof(struct xbdreq), + M_DEVBUF, M_WAITOK | M_ZERO); +#ifdef DEBUG + xbd_allxr = xr; +#endif + + /* XXX Xen1.2: We cannot use BLK_RING_SIZE many slots, since + * Xen 1.2 keeps indexes masked in the ring and the case where + * we queue all slots at once is handled wrong. + */ + for (i = 0; i < BLK_RING_SIZE - 1; i++) + PUT_XBDREQ(&xr[i]); + + MALLOC(vbd_info, xen_disk_t *, MAX_VBDS * sizeof(xen_disk_t), + M_DEVBUF, M_WAITOK); + memset(vbd_info, 0, MAX_VBDS * sizeof(xen_disk_t)); + nr_vbds = get_vbd_info(vbd_info); + if (nr_vbds <= 0) + goto out; + + for (i = 0; i < nr_vbds; i++) { + xd = &vbd_info[i]; + xbda = get_xbda(xd); + if (xbda) { + xbda->xa_xd = xd; + config_found(self, xbda, print); + } + } + + enable_update_events(self); + + return 0; + + out: + FREE(vbd_info, M_DEVBUF); + vbd_info = NULL; + FREE(xr, M_DEVBUF); +#ifdef DEBUG + xbd_allxr = NULL; +#endif + SLIST_INIT(&xbdreqs); + return 0; +} + +#if NXBD > 0 +int +xbd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "xbd") == 0) + return 1; + return 0; +} +#endif + +#if NWD > 0 +int +xbd_wd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "wd") == 0) + return 1; + return 0; +} +#endif + +#if NSD > 0 +int +xbd_sd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "sd") == 0) + return 1; + 
return 0; +} +#endif + +#if NCD > 0 +int +xbd_cd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "cd") == 0) + return 1; + return 0; +} +#endif + +static void +xbd_attach(struct device *parent, struct device *self, void *aux) +{ + struct xbd_attach_args *xbda = (struct xbd_attach_args *)aux; + struct xbd_softc *xs = (struct xbd_softc *)self; + + aprint_normal(": Xen Virtual Block Device"); + + simple_lock_init(&xs->sc_slock); + dk_sc_init(&xs->sc_dksc, xs, xs->sc_dev.dv_xname); + xbdinit(xs, xbda->xa_xd, xbda->xa_dkintf); + if (diskcookies) { + /* XXX beware that xs->sc_xd_device is a long */ + sysctl_createv(NULL, 0, &diskcookies, NULL, + 0, + CTLTYPE_INT, xs->sc_dev.dv_xname, NULL, + NULL, 0, &xs->sc_xd_device, 0, + CTL_CREATE, CTL_EOL); + } +} + +static int +xbd_detach(struct device *dv, int flags) +{ + struct xbd_softc *xs = (struct xbd_softc *)dv; + + /* + * Mark disk about to be removed (between now and when the xs + * will be freed). + */ + xs->sc_shutdown = 1; + + /* And give it some time to settle if it's busy. */ + if (xs->sc_dksc.sc_dkdev.dk_busy > 0) + tsleep(&xs, PWAIT, "xbdetach", hz); + + /* Detach the disk. */ + disk_detach(&xs->sc_dksc.sc_dkdev); + + /* XXX decrement bufq_users and free? */ + + /* XXX no need to remove sysctl nodes since they only exist + * in domain0 and domain0's devices are never removed. 
+ */ + + return 0; +} + +int +xbdopen(dev_t dev, int flags, int fmt, struct proc *p) +{ + struct xbd_softc *xs; + + DPRINTF_FOLLOW(("xbdopen(0x%04x, %d)\n", dev, flags)); + switch (fmt) { + case S_IFCHR: + GETXBD_SOFTC_CDEV(xs, dev); + break; + case S_IFBLK: + GETXBD_SOFTC(xs, dev); + break; + default: + return ENXIO; + } + return dk_open(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p); +} + +int +xbdclose(dev_t dev, int flags, int fmt, struct proc *p) +{ + struct xbd_softc *xs; + + DPRINTF_FOLLOW(("xbdclose(%d, %d)\n", dev, flags)); + switch (fmt) { + case S_IFCHR: + GETXBD_SOFTC_CDEV(xs, dev); + break; + case S_IFBLK: + GETXBD_SOFTC(xs, dev); + break; + default: + return ENXIO; + } + return dk_close(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p); +} + +void +xbdstrategy(struct buf *bp) +{ + struct xbd_softc *xs = getxbd_softc(bp->b_dev); + + DPRINTF_FOLLOW(("xbdstrategy(%p): b_bcount = %ld\n", bp, + (long)bp->b_bcount)); + + if (xs == NULL || xs->sc_shutdown) { + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + biodone(bp); + return; + } + + dk_strategy(xs->sc_di, &xs->sc_dksc, bp); + return; +} + +int +xbdsize(dev_t dev) +{ + struct xbd_softc *xs = getxbd_softc(dev); + + DPRINTF_FOLLOW(("xbdsize(%d)\n", dev)); + if (xs == NULL || xs->sc_shutdown) + return -1; + return dk_size(xs->sc_di, &xs->sc_dksc, dev); +} + +static void +map_align(struct xbdreq *xr) +{ + int s; + + s = splvm(); + xr->xr_aligned = uvm_km_kmemalloc1(kmem_map, NULL, + xr->xr_bqueue, XEN_BSIZE, UVM_UNKNOWN_OFFSET, + 0/* UVM_KMF_NOWAIT */); + splx(s); + DPRINTF(XBDB_IO, ("map_align(%p): bp %p addr %p align 0x%08lx " + "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data, + xr->xr_aligned, xr->xr_bqueue)); + xr->xr_data = xr->xr_aligned; + if ((xr->xr_bp->b_flags & B_READ) == 0) + memcpy((void *)xr->xr_aligned, xr->xr_bp->b_data, + xr->xr_bqueue); +} + +static void +unmap_align(struct xbdreq *xr) +{ + int s; + + if (xr->xr_bp->b_flags & B_READ) + memcpy(xr->xr_bp->b_data, (void *)xr->xr_aligned, + 
xr->xr_bp->b_bcount); + DPRINTF(XBDB_IO, ("unmap_align(%p): bp %p addr %p align 0x%08lx " + "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data, + xr->xr_aligned, xr->xr_bp->b_bcount)); + s = splvm(); + uvm_km_free(kmem_map, xr->xr_aligned, xr->xr_bp->b_bcount); + splx(s); + xr->xr_aligned = (vaddr_t)0; +} + +static void +fill_ring(struct xbdreq *xr) +{ + struct xbdreq *pxr = xr->xr_parent; + paddr_t pa; + unsigned long ma; + vaddr_t addr, off; + blk_ring_req_entry_t *ring_req; + int breq, nr_sectors; + + /* Fill out a communications ring structure. */ + ring_req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req; + ring_req->id = (unsigned long)xr; + ring_req->operation = pxr->xr_bp->b_flags & B_READ ? XEN_BLOCK_READ : + XEN_BLOCK_WRITE; + ring_req->sector_number = (xen_sector_t)pxr->xr_bn; + ring_req->device = pxr->xr_sc->sc_xd_device; + + DPRINTF(XBDB_IO, ("fill_ring(%d): bp %p sector %llu pxr %p xr %p\n", + MASK_BLK_IDX(req_prod), pxr->xr_bp, (unsigned long long)pxr->xr_bn, + pxr, xr)); + + xr->xr_breq = 0; + ring_req->nr_segments = 0; + addr = trunc_page(pxr->xr_data); + off = pxr->xr_data - addr; + while (pxr->xr_bqueue > 0) { +#if 0 + pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map), + addr, &pa); +#else + pmap_extract(pmap_kernel(), addr, &pa); +#endif + ma = xpmap_ptom_masked(pa) + off; + DIAGCONDPANIC((ma & (XEN_BSIZE - 1)) != 0, + ("xbd request ma not sector aligned")); + + if (pxr->xr_bqueue > PAGE_SIZE - off) + breq = PAGE_SIZE - off; + else + breq = pxr->xr_bqueue; + nr_sectors = breq >> XEN_BSHIFT; + DIAGCONDPANIC(nr_sectors >= XEN_BSIZE, + ("xbd request nr_sectors >= XEN_BSIZE")); + + DPRINTF(XBDB_IO, ("fill_ring(%d): va 0x%08lx pa 0x%08lx " + "ma 0x%08lx, sectors %d, left %ld/%ld\n", + MASK_BLK_IDX(req_prod), addr, pa, ma, nr_sectors, + pxr->xr_bqueue >> XEN_BSHIFT, pxr->xr_bqueue)); + + ring_req->buffer_and_sects[ring_req->nr_segments++] = + ma | nr_sectors; + addr += PAGE_SIZE; + pxr->xr_bqueue -= breq; + pxr->xr_bn += nr_sectors; + xr->xr_breq 
+= breq; + off = 0; + if (ring_req->nr_segments == MAX_BLK_SEGS) + break; + } + pxr->xr_data = addr; + + req_prod++; +} + +static void +xbdresume(void) +{ + struct xbdreq *pxr, *xr; + struct xbd_softc *xs; + struct buf *bp; + + while ((pxr = SIMPLEQ_FIRST(&xbdr_suspended)) != NULL) { + DPRINTF(XBDB_IO, ("xbdstart: resuming xbdreq %p for bp %p\n", + pxr, pxr->xr_bp)); + bp = pxr->xr_bp; + xs = getxbd_softc(bp->b_dev); + if (xs == NULL || xs->sc_shutdown) { + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + } + if (bp->b_flags & B_ERROR) { + pxr->xr_bdone -= pxr->xr_bqueue; + pxr->xr_bqueue = 0; + if (pxr->xr_bdone == 0) { + bp->b_resid = bp->b_bcount; + if (pxr->xr_aligned) + unmap_align(pxr); + PUT_XBDREQ(pxr); + if (xs) + disk_unbusy(&xs->sc_dksc.sc_dkdev, + (bp->b_bcount - bp->b_resid), + (bp->b_flags & B_READ)); + biodone(bp); + } + continue; + } + while (__predict_true(pxr->xr_bqueue > 0)) { + GET_XBDREQ(xr); + if (__predict_false(xr == NULL)) + goto out; + xr->xr_parent = pxr; + fill_ring(xr); + } + DPRINTF(XBDB_IO, ("xbdstart: resumed xbdreq %p for bp %p\n", + pxr, bp)); + SIMPLEQ_REMOVE_HEAD(&xbdr_suspended, xr_suspended); + } + + out: + return; +} + +static int +xbdstart(struct dk_softc *dksc, struct buf *bp) +{ + struct xbd_softc *xs; + struct xbdreq *pxr, *xr; + struct partition *pp; + daddr_t bn; + int ret, runqueue; + + DPRINTF_FOLLOW(("xbdstart(%p, %p)\n", dksc, bp)); + + runqueue = 1; + ret = -1; + + xs = getxbd_softc(bp->b_dev); + if (xs == NULL || xs->sc_shutdown) { + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + biodone(bp); + return 0; + } + dksc = &xs->sc_dksc; + + /* XXXrcd: + * Translate partition relative blocks to absolute blocks, + * this probably belongs (somehow) in dksubr.c, since it + * is independant of the underlying code... This will require + * that the interface be expanded slightly, though. 
+ */ + bn = bp->b_blkno; + if (DISKPART(bp->b_dev) != RAW_PART) { + pp = &xs->sc_dksc.sc_dkdev.dk_label-> + d_partitions[DISKPART(bp->b_dev)]; + bn += pp->p_offset; + } + + DPRINTF(XBDB_IO, ("xbdstart: addr %p, sector %llu, " + "count %ld [%s]\n", bp->b_data, (unsigned long long)bn, + bp->b_bcount, bp->b_flags & B_READ ? "read" : "write")); + + GET_XBDREQ(pxr); + if (__predict_false(pxr == NULL)) + goto out; + + disk_busy(&dksc->sc_dkdev); /* XXX: put in dksubr.c */ + /* + * We have a request slot, return 0 to make dk_start remove + * the bp from the work queue. + */ + ret = 0; + + pxr->xr_bp = bp; + pxr->xr_parent = pxr; + pxr->xr_bn = bn; + pxr->xr_bqueue = bp->b_bcount; + pxr->xr_bdone = bp->b_bcount; + pxr->xr_data = (vaddr_t)bp->b_data; + pxr->xr_sc = xs; + + if (pxr->xr_data & (XEN_BSIZE - 1)) + map_align(pxr); + + fill_ring(pxr); + + while (__predict_false(pxr->xr_bqueue > 0)) { + GET_XBDREQ(xr); + if (__predict_false(xr == NULL)) + break; + xr->xr_parent = pxr; + fill_ring(xr); + } + + if (__predict_false(pxr->xr_bqueue > 0)) { + SIMPLEQ_INSERT_TAIL(&xbdr_suspended, pxr, + xr_suspended); + DPRINTF(XBDB_IO, ("xbdstart: suspended xbdreq %p " + "for bp %p\n", pxr, bp)); + } else if (CANGET_XBDREQ() && BUFQ_PEEK(&bufq) != NULL) { + /* + * We have enough resources to start another bp and + * there are additional bps on the queue, dk_start + * will call us again and we'll run the queue then. 
+ */ + runqueue = 0; + } + + out: + if (runqueue && last_req_prod != req_prod) + signal_requests_to_xen(); + + return ret; +} + +static int +xbd_response_handler(void *arg) +{ + struct buf *bp; + struct xbd_softc *xs; + blk_ring_resp_entry_t *ring_resp; + struct xbdreq *pxr, *xr; + int i; + + for (i = resp_cons; i != blk_ring->resp_prod; i = BLK_RING_INC(i)) { + ring_resp = &blk_ring->ring[MASK_BLK_IDX(i)].resp; + xr = (struct xbdreq *)ring_resp->id; + pxr = xr->xr_parent; + + DPRINTF(XBDB_IO, ("xbd_response_handler(%d): pxr %p xr %p " + "bdone %04lx breq %04lx\n", i, pxr, xr, pxr->xr_bdone, + xr->xr_breq)); + pxr->xr_bdone -= xr->xr_breq; + DIAGCONDPANIC(pxr->xr_bdone < 0, + ("xbd_response_handler: pxr->xr_bdone < 0")); + + if (__predict_false(ring_resp->status)) { + pxr->xr_bp->b_flags |= B_ERROR; + pxr->xr_bp->b_error = EIO; + } + + if (xr != pxr) { + PUT_XBDREQ(xr); + if (!SIMPLEQ_EMPTY(&xbdr_suspended)) + xbdresume(); + } + + if (pxr->xr_bdone == 0) { + bp = pxr->xr_bp; + xs = getxbd_softc(bp->b_dev); + if (xs == NULL) { /* don't fail bp if we're shutdown */ + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + } + DPRINTF(XBDB_IO, ("xbd_response_handler(%d): " + "completed bp %p\n", i, bp)); + if (bp->b_flags & B_ERROR) + bp->b_resid = bp->b_bcount; + else + bp->b_resid = 0; + + if (pxr->xr_aligned) + unmap_align(pxr); + + PUT_XBDREQ(pxr); + if (xs) + disk_unbusy(&xs->sc_dksc.sc_dkdev, + (bp->b_bcount - bp->b_resid), + (bp->b_flags & B_READ)); + biodone(bp); + if (!SIMPLEQ_EMPTY(&xbdr_suspended)) + xbdresume(); + /* XXX possible lockup if this was the only + * active device and requests were held back in + * the queue. 
+ */ + if (xs) + dk_iodone(xs->sc_di, &xs->sc_dksc); + } + } + resp_cons = i; + /* check if xbdresume queued any requests */ + if (last_req_prod != req_prod) + signal_requests_to_xen(); + return 0; +} + +static struct device * +find_device(xen_disk_t *xd) +{ + struct device *dv; + struct xbd_softc *xs; + + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (dv->dv_cfattach == NULL || + dv->dv_cfattach->ca_attach != xbd_attach) + continue; + xs = (struct xbd_softc *)dv; + if (xs->sc_xd_device == xd->device) + break; + } + return dv; +} + +static void +xbd_update_create_kthread(void *arg) +{ + + kthread_create1(xbd_update_kthread, arg, NULL, "xbdupdate"); +} + +static void +xbd_update_kthread(void *arg) +{ + struct device *parent = arg; + struct xbd_attach_args *xbda; + struct device *dev; + xen_disk_t *xd; + xen_disk_t *vbd_info_update, *vbd_info_old; + int i, j, new_nr_vbds; + extern int hypervisor_print(void *, const char *); + + MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS * + sizeof(xen_disk_t), M_DEVBUF, M_WAITOK); + + for (;;) { + memset(vbd_info_update, 0, MAX_VBDS * sizeof(xen_disk_t)); + new_nr_vbds = get_vbd_info(vbd_info_update); + + if (memcmp(vbd_info, vbd_info_update, MAX_VBDS * + sizeof(xen_disk_t)) == 0) { + FREE(vbd_info_update, M_DEVBUF); + tsleep(parent, PWAIT, "xbdupd", 0); + MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS * + sizeof(xen_disk_t), M_DEVBUF, M_WAITOK); + continue; + } + + j = 0; + for (i = 0; i < new_nr_vbds; i++) { + while (j < nr_vbds && + vbd_info[j].device < vbd_info_update[i].device) { + DPRINTF(XBDB_HOTPLUG, + ("delete device %x size %lx\n", + vbd_info[j].device, + vbd_info[j].capacity)); + xd = &vbd_info[j]; + dev = find_device(xd); + if (dev) + config_detach(dev, DETACH_FORCE); + j++; + } + if (j < nr_vbds && + vbd_info[j].device == vbd_info_update[i].device) { + DPRINTF(XBDB_HOTPLUG, + ("update device %x size %lx size %lx\n", + vbd_info_update[i].device, + vbd_info[j].capacity, + 
vbd_info_update[i].capacity)); + j++; + } else { + DPRINTF(XBDB_HOTPLUG, + ("add device %x size %lx\n", + vbd_info_update[i].device, + vbd_info_update[i].capacity)); + xd = &vbd_info_update[i]; + xbda = get_xbda(xd); + if (xbda) { + xbda->xa_xd = xd; + config_found(parent, xbda, hypervisor_print); + } + } + } + + while (j < nr_vbds) { + DPRINTF(XBDB_HOTPLUG, ("delete device %x\n", + vbd_info[j].device)); + xd = &vbd_info[j]; + dev = find_device(xd); + if (dev) + config_detach(dev, DETACH_FORCE); + j++; + } + + nr_vbds = new_nr_vbds; + + vbd_info_old = vbd_info; + vbd_info = vbd_info_update; + vbd_info_update = vbd_info_old; + } +} + +static int +xbd_update_handler(void *arg) +{ + + wakeup(arg); + + return 0; +} + +/* XXX: we should probably put these into dksubr.c, mostly */ +int +xbdread(dev_t dev, struct uio *uio, int flags) +{ + struct xbd_softc *xs; + struct dk_softc *dksc; + + DPRINTF_FOLLOW(("xbdread(%d, %p, %d)\n", dev, uio, flags)); + GETXBD_SOFTC_CDEV(xs, dev); + dksc = &xs->sc_dksc; + if ((dksc->sc_flags & DKF_INITED) == 0) + return ENXIO; + /* XXX see the comments about minphys in ccd.c */ + return physio(xbdstrategy, NULL, dev, B_READ, minphys, uio); +} + +/* XXX: we should probably put these into dksubr.c, mostly */ +int +xbdwrite(dev_t dev, struct uio *uio, int flags) +{ + struct xbd_softc *xs; + struct dk_softc *dksc; + + DPRINTF_FOLLOW(("xbdwrite(%d, %p, %d)\n", dev, uio, flags)); + GETXBD_SOFTC_CDEV(xs, dev); + dksc = &xs->sc_dksc; + if ((dksc->sc_flags & DKF_INITED) == 0) + return ENXIO; + /* XXX see the comments about minphys in ccd.c */ + return physio(xbdstrategy, NULL, dev, B_WRITE, minphys, uio); +} + +int +xbdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct xbd_softc *xs; + struct dk_softc *dksc; + int ret; + + DPRINTF_FOLLOW(("xbdioctl(%d, %08lx, %p, %d, %p)\n", + dev, cmd, data, flag, p)); + GETXBD_SOFTC(xs, dev); + dksc = &xs->sc_dksc; + + if ((ret = lockmgr(&dksc->sc_lock, LK_EXCLUSIVE, NULL)) != 0) + 
return ret; + + switch (cmd) { + default: + ret = dk_ioctl(xs->sc_di, dksc, dev, cmd, data, flag, p); + break; + } + + lockmgr(&dksc->sc_lock, LK_RELEASE, NULL); + return ret; +} + +int +xbdioctl_cdev(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + dev_t bdev; + + bdev = devsw_chr2blk(dev); + if (bdev == NODEV) + return ENXIO; + return xbdioctl(bdev, cmd, data, flag, p); +} + +int +xbddump(dev_t dev, daddr_t blkno, caddr_t va, size_t size) +{ + struct xbd_softc *xs; + + DPRINTF_FOLLOW(("xbddump(%d, %" PRId64 ", %p, %lu)\n", dev, blkno, va, + (unsigned long)size)); + GETXBD_SOFTC(xs, dev); + return dk_dump(xs->sc_di, &xs->sc_dksc, dev, blkno, va, size); +} + +static int +xbdinit(struct xbd_softc *xs, xen_disk_t *xd, struct dk_intf *dkintf) +{ + struct dk_geom *pdg; + char buf[9]; + int ret; + + ret = 0; + + xs->sc_dksc.sc_size = xd->capacity; + xs->sc_xd_device = xd->device; + xs->sc_di = dkintf; + xs->sc_shutdown = 0; + + /* + * XXX here we should probe the underlying device. If we + * are accessing a partition of type RAW_PART, then + * we should populate our initial geometry with the + * geometry that we discover from the device. + */ + pdg = &xs->sc_dksc.sc_geom; + pdg->pdg_secsize = DEV_BSIZE; + pdg->pdg_ntracks = 1; + pdg->pdg_nsectors = 1024 * (1024 / pdg->pdg_secsize); + pdg->pdg_ncylinders = xs->sc_dksc.sc_size / pdg->pdg_nsectors; + + /* + * We have one shared bufq for all devices because otherwise + * requests can stall if there were no free request slots + * available in xbdstart and this device had no requests + * in-flight which would trigger a dk_start from the interrupt + * handler. + * XXX this assumes that we can just memcpy struct bufq_state + * to share it between devices. + * XXX we reference count the usage in case so we can de-alloc + * the bufq if all devices are deconfigured. 
+ */ + if (bufq_users == 0) { + bufq_alloc(&bufq, BUFQ_FCFS); + bufq_users = 1; + } + memcpy(&xs->sc_dksc.sc_bufq, &bufq, sizeof(struct bufq_state)); + + xs->sc_dksc.sc_flags |= DKF_INITED; + + /* Attach the disk. */ + disk_attach(&xs->sc_dksc.sc_dkdev); + + /* Try and read the disklabel. */ + dk_getdisklabel(xs->sc_di, &xs->sc_dksc, 0 /* XXX ? */); + + format_bytes(buf, sizeof(buf), (uint64_t)xs->sc_dksc.sc_size * + pdg->pdg_secsize); + printf(" %s\n", buf); + +/* out: */ + return ret; +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c new file mode 100644 index 0000000000..8181f2b9b3 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c @@ -0,0 +1,444 @@ +/* $NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * + * Copyright (c) 2002-2003, K A Fraser & R Neugebauer + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $");
+
+#define XENDEBUG
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#ifdef XENDEBUG
+
+#define PRINTK_BUFSIZE 1024
+void
+printk(const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	static char buf[PRINTK_BUFSIZE];	/* static: not reentrant, kernel console only */
+
+	va_start(ap, fmt);
+	ret = vsnprintf(buf, sizeof(buf), fmt, ap);	/* NUL-terminates within sizeof(buf) */
+	va_end(ap);
+	if (ret < 0) ret = 0; else if (ret >= (int)sizeof(buf)) ret = sizeof(buf) - 1;	/* C99: ret is would-be length; clamp so we never index/emit past buf */
+	(void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+void
+vprintk(const char *fmt, va_list ap)
+{
+	int ret;
+	static char buf[PRINTK_BUFSIZE];	/* static: not reentrant, kernel console only */
+
+	ret = vsnprintf(buf, sizeof(buf), fmt, ap);	/* NUL-terminates within sizeof(buf) */
+	if (ret < 0) ret = 0; else if (ret >= (int)sizeof(buf)) ret = sizeof(buf) - 1;	/* clamp truncated/would-be length before passing to hypervisor */
+	(void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+#endif
+
+#ifdef XENDEBUG_LOW
+
+int xen_once = 0;
+
+void hypervisor_callback(void);
+void failsafe_callback(void);
+
+void xen_dbglow_init(void);
+void
+xen_dbglow_init()
+{
+	start_info_t *si;
+#if 0
+	int i;
+#endif
+
+	si = &xen_start_info;
+
+	HYPERVISOR_set_callbacks(
+	    __KERNEL_CS, (unsigned long)hypervisor_callback,
+	    __KERNEL_CS, (unsigned long)failsafe_callback);
+
+	trap_init();
+
+	/* __sti(); */
+
+	/* print out some useful information */
+	printk("%s", version);	/* never pass a non-literal as the format string */
+	printk("start_info: %p\n", si);
+	printk(" nr_pages: %lu", si->nr_pages);
+	printk(" shared_inf: %p (was %p)\n", HYPERVISOR_shared_info,
+	    si->shared_info);
+	printk(" pt_base: %p", (void *)si->pt_base);
+	printk(" mod_start: 0x%lx\n", si->mod_start);
+	printk(" mod_len: %lu\n", si->mod_len);
+#if 0
+	printk(" net_rings: ");
+	for (i = 0; i < MAX_DOMAIN_VIFS; i++) {
+		if (si->net_rings[i] == 0)
+			break;
+		printk(" %lx", si->net_rings[i]);
+	};
+	printk("\n");
+	printk(" blk_ring: 0x%lx\n", si->blk_ring);
+#endif
+	printk(" dom_id: %d\n", si->dom_id);
+	printk(" flags: 0x%lx\n", si->flags);
+	printk(" cmd_line: %s\n", si->cmd_line ?
+ (const char *)si->cmd_line : "NULL"); +} + + +void xen_dbg0(char *); +void +xen_dbg0(char *end) +{ + struct cpu_info *ci; + + ci = &cpu_info_primary; + if (xen_once) + printk("xencpu level %d ipending %08x master %08x\n", + ci->ci_ilevel, ci->ci_ipending, + HYPERVISOR_shared_info->events_mask); + /* ipending %08x imask %08x iunmask %08x */ + /* ci->ci_imask[IPL_NET], ci->ci_iunmask[IPL_NET]); */ +} + +void xen_dbg1(void *esp, int ss); +void +xen_dbg1(void *esp, int ss) +{ +#if 1 + struct cpu_info *ci; + + ci = &cpu_info_primary; + if (xen_once) + printk("xenhighlevel %d ipending %08x master %08x events %08x\n", + ci->ci_ilevel, ci->ci_ipending, + HYPERVISOR_shared_info->events_mask, HYPERVISOR_shared_info->events); +#else + printk("stack switch %p %d/%d, sp %p\n", esp, ss, IDXSEL(ss), &ss); +#endif +} + +void xen_dbg2(void); +void +xen_dbg2(void) +{ + if (xen_once) + printk("xen_dbg2\n"); +} + +void xen_dbg3(void *, void *); +void +xen_dbg3(void *ss, void *esp) +{ + if (xen_once) + printk("xen_dbg3 %p %p\n", ss, esp); +} + +void xen_dbg4(void *); +void +xen_dbg4(void *esi) +{ + + printk("xen_dbg4 %p\n", esi); + for(;;); +} + + + + +static void do_exit(void); + +/* + * These are assembler stubs in vector.S. + * They are the actual entry points for virtual exceptions. 
+ */ +void divide_error(void); +void debug(void); +void int3(void); +void overflow(void); +void bounds(void); +void invalid_op(void); +void device_not_available(void); +void double_fault(void); +void coprocessor_segment_overrun(void); +void invalid_TSS(void); +void segment_not_present(void); +void stack_segment(void); +void general_protection(void); +void page_fault(void); +void coprocessor_error(void); +void simd_coprocessor_error(void); +void alignment_check(void); +void spurious_interrupt_bug(void); +void machine_check(void); + +static void +dump_regs(struct pt_regs *regs) +{ + int in_kernel = 1; + unsigned long esp; + unsigned short ss; + + esp = (unsigned long) (®s->esp); + ss = __KERNEL_DS; + if (regs->xcs & 2) { + in_kernel = 0; + esp = regs->esp; + ss = regs->xss & 0xffff; + } + printf("EIP: %04x:[<%08lx>]\n", + 0xffff & regs->xcs, regs->eip); + printf("EFLAGS: %08lx\n",regs->eflags); + printf("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->eax, regs->ebx, regs->ecx, regs->edx); + printf("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printf("ds: %04x es: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss); + printf("\n"); +} + + +static inline void +dump_code(unsigned eip) +{ + unsigned *ptr = (unsigned *)eip; + int x; + + printk("Bytes at eip:\n"); + for (x = -4; x < 5; x++) + printf("%x", ptr[x]); +} + + +/* + * C handlers here have their parameter-list constructed by the + * assembler stubs above. Each one gets a pointer to a list + * of register values (to be restored at end of exception). + * Some will also receive an error code -- this is the code that + * was generated by the processor for the underlying real exception. + * + * Note that the page-fault exception is special. It also receives + * the faulting linear address. Normally this would be found in + * register CR2, but that is not accessible in a virtualised OS. 
+ */ + +static void inline +do_trap(int trapnr, char *str, struct pt_regs *regs, long error_code) +{ + + printk("FATAL: Unhandled Trap (see mini-os:traps.c)"); + printf("%d %s", trapnr, str); + dump_regs(regs); + dump_code(regs->eip); + + do_exit(); +} + +#define DO_ERROR(trapnr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code); \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + do_trap(trapnr, str, regs, error_code); \ +} + +#define DO_ERROR_INFO(trapnr, str, name, sicode, siaddr) \ +void do_##name(struct pt_regs *regs, long error_code); \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + do_trap(trapnr, str, regs, error_code); \ +} + +DO_ERROR_INFO( 0, "divide error", divide_error, FPE_INTDIV, regs->eip) +DO_ERROR( 3, "int3", int3) +DO_ERROR( 4, "overflow", overflow) +DO_ERROR( 5, "bounds", bounds) +DO_ERROR_INFO( 6, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR( 7, "device not available", device_not_available) +DO_ERROR( 8, "double fault", double_fault) +DO_ERROR( 9, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, "invalid TSS", invalid_TSS) +DO_ERROR(11, "segment not present", segment_not_present) +DO_ERROR(12, "stack segment", stack_segment) +DO_ERROR_INFO(17, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR(18, "machine check", machine_check) + +void do_page_fault(struct pt_regs *, long, unsigned long); +void +do_page_fault(struct pt_regs *regs, long error_code, unsigned long address) +{ + + printk("Page fault\n"); + printk("Address: 0x%lx", address); + printk("Error Code: 0x%lx", error_code); + printk("eip: \t 0x%lx", regs->eip); + do_exit(); +} + +void do_general_protection(struct pt_regs *, long); +void +do_general_protection(struct pt_regs *regs, long error_code) +{ + + HYPERVISOR_shared_info->events_mask = 0; + printk("GPF\n"); + printk("Error Code: 0x%lx", error_code); + dump_regs(regs); + dump_code(regs->eip); + do_exit(); +} + + +void 
do_debug(struct pt_regs *, long); +void +do_debug(struct pt_regs *regs, long error_code) +{ + + printk("Debug exception\n"); +#define TF_MASK 0x100 + regs->eflags &= ~TF_MASK; + dump_regs(regs); + do_exit(); +} + + + +void do_coprocessor_error(struct pt_regs *, long); +void +do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + + printk("Copro error\n"); + dump_regs(regs); + dump_code(regs->eip); + do_exit(); +} + +void simd_math_error(void *); +void +simd_math_error(void *eip) +{ + + printk("SIMD error\n"); +} + +void do_simd_coprocessor_error(struct pt_regs *, long); +void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + + printk("SIMD copro error\n"); +} + +void do_spurious_interrupt_bug(struct pt_regs *, long); +void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +{ +} + +static void +do_exit(void) +{ + + HYPERVISOR_exit(); +} + +/* + * Submit a virtual IDT to teh hypervisor. This consists of tuples + * (interrupt vector, privilege ring, CS:EIP of handler). + * The 'privilege ring' field specifies the least-privileged ring that + * can trap to that vector using a software-interrupt instruction (INT). 
+ */ +static trap_info_t trap_table[] = { + { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, + { 1, 0, __KERNEL_CS, (unsigned long)debug }, + { 3, 3, __KERNEL_CS, (unsigned long)int3 }, + { 4, 3, __KERNEL_CS, (unsigned long)overflow }, + { 5, 3, __KERNEL_CS, (unsigned long)bounds }, + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, + { 7, 0, __KERNEL_CS, (unsigned long)device_not_available }, + { 8, 0, __KERNEL_CS, (unsigned long)double_fault }, + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0, __KERNEL_CS, (unsigned long)page_fault }, + { 15, 0, __KERNEL_CS, (unsigned long)spurious_interrupt_bug }, + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, + { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, + { 0, 0, 0, 0 } +}; + +void +trap_init(void) +{ + + HYPERVISOR_set_trap_table(trap_table); +} +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c new file mode 100644 index 0000000000..a151e3dd83 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c @@ -0,0 +1,352 @@ +/* $NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $"); + +#include <sys/param.h> +#include <sys/ioctl.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/systm.h> +#include <sys/device.h> +#include <sys/conf.h> + +#include <machine/stdarg.h> +#include <machine/xen.h> +#include <machine/hypervisor.h> + +#include <dev/cons.h> + +#include <ddb/db_output.h> /* XXX for db_max_line */ + +static int xencons_isconsole = 0; + +#define XENCONS_UNIT(x) (minor(x)) +#define XENCONS_BURST 128 + +int xencons_match (struct device *, struct cfdata *, void *); +void xencons_attach (struct device *, struct device *, void *); +/* int xencons_intr (void *); */ +void xencons_init (void); + +struct xencons_softc { + struct device sc_dev; + struct tty *sc_tty; +}; + +CFATTACH_DECL(xencons, sizeof(struct xencons_softc), + xencons_match, xencons_attach, NULL, NULL); + +extern struct cfdriver xencons_cd; + +dev_type_open(xencons_open); +dev_type_close(xencons_close); +dev_type_read(xencons_read); +dev_type_write(xencons_write); +dev_type_ioctl(xencons_ioctl); +dev_type_stop(xencons_stop); +dev_type_tty(xencons_tty); +dev_type_poll(xencons_poll); + +const struct cdevsw xencons_cdevsw = { + xencons_open, xencons_close, xencons_read, xencons_write, + xencons_ioctl, xencons_stop, xencons_tty, xencons_poll, + NULL, ttykqfilter, D_TTY +}; + + +void xenconscn_attach(void); +int xenconscn_getc(dev_t); +void xenconscn_putc(dev_t, int); +void xenconscn_pollc(dev_t, int); + +static struct consdev xencons = { + NULL, NULL, xenconscn_getc, xenconscn_putc, xenconscn_pollc, + NULL, NULL, NULL, NODEV, CN_NORMAL +}; + +void xencons_start (struct tty *); +int xencons_param (struct tty *, struct termios *); + +int +xencons_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xencons_attach_args *xa = (struct xencons_attach_args *)aux; + + if (strcmp(xa->xa_device, "xencons") == 0) + return 1; + return 0; +} + +void 
+xencons_attach(struct device *parent, struct device *self, void *aux) +{ + struct xencons_softc *sc = (void *)self; + + aprint_normal(": Xen Virtual Console Driver\n"); + + if (xencons_isconsole) { + int maj; + + /* Locate the major number. */ + maj = cdevsw_lookup_major(&xencons_cdevsw); + + /* There can be only one, but it can have any unit number. */ + cn_tab->cn_dev = makedev(maj, sc->sc_dev.dv_unit); + + aprint_verbose("%s: console major %d, unit %d\n", + sc->sc_dev.dv_xname, maj, sc->sc_dev.dv_unit); + + /* Set db_max_line to avoid paging. */ + db_max_line = 0x7fffffff; + } +} + +int +xencons_open(dev_t dev, int flag, int mode, struct proc *p) +{ + struct xencons_softc *sc; + int unit = XENCONS_UNIT(dev); + struct tty *tp; + + sc = device_lookup(&xencons_cd, unit); + if (sc == NULL) + return (ENXIO); + + if (!sc->sc_tty) { + tp = sc->sc_tty = ttymalloc(); + tty_attach(tp); + } else + tp = sc->sc_tty; + + tp->t_oproc = xencons_start; + tp->t_param = xencons_param; + tp->t_dev = dev; + if ((tp->t_state & TS_ISOPEN) == 0) { + ttychars(tp); + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + xencons_param(tp, &tp->t_termios); + ttsetwater(tp); + } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) + return (EBUSY); + tp->t_state |= TS_CARR_ON; + + return ((*tp->t_linesw->l_open)(dev, tp)); +} + +int +xencons_close(dev_t dev, int flag, int mode, struct proc *p) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + if (tp == NULL) + return (0); + (*tp->t_linesw->l_close)(tp, flag); + ttyclose(tp); +#ifdef notyet /* XXX */ + ttyfree(tp); +#endif + return (0); +} + +int +xencons_read(dev_t dev, struct uio *uio, int flag) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return ((*tp->t_linesw->l_read)(tp, uio, flag)); 
+} + +int +xencons_write(dev_t dev, struct uio *uio, int flag) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return ((*tp->t_linesw->l_write)(tp, uio, flag)); +} + +int +xencons_poll(dev_t dev, int events, struct proc *p) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return ((*tp->t_linesw->l_poll)(tp, events, p)); +} + +struct tty * +xencons_tty(dev_t dev) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return (tp); +} + +int +xencons_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + int error; + + error = (*tp->t_linesw->l_ioctl)(tp, cmd, data, flag, p); + if (error != EPASSTHROUGH) + return (error); + + error = ttioctl(tp, cmd, data, flag, p); + if (error != EPASSTHROUGH) + return (error); + + switch (cmd) { + default: + return (EPASSTHROUGH); + } + +#ifdef DIAGNOSTIC + panic("xencons_ioctl: impossible"); +#endif +} + +void +xencons_start(struct tty *tp) +{ + struct clist *cl; + int s, len; + u_char buf[XENCONS_BURST+1]; + + s = spltty(); + if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP)) + goto out; + tp->t_state |= TS_BUSY; + splx(s); + + /* + * We need to do this outside spl since it could be fairly + * expensive and we don't want our serial ports to overflow. 
+	 */
+	cl = &tp->t_outq;
+	len = q_to_b(cl, buf, XENCONS_BURST);
+	(void)HYPERVISOR_console_io(CONSOLEIO_write, len, buf);
+
+	s = spltty();
+	tp->t_state &= ~TS_BUSY;
+	if (cl->c_cc) {
+		tp->t_state |= TS_TIMEOUT;
+		callout_reset(&tp->t_rstrt_ch, 1, ttrstrt, tp);
+	}
+	if (cl->c_cc <= tp->t_lowat) {
+		if (tp->t_state & TS_ASLEEP) {
+			tp->t_state &= ~TS_ASLEEP;
+			wakeup(cl);
+		}
+		selwakeup(&tp->t_wsel);
+	}
+out:
+	splx(s);
+}
+
+void
+xencons_stop(struct tty *tp, int flag)
+{
+
+}
+
+
+
+void
+xenconscn_attach()
+{
+
+	cn_tab = &xencons;
+
+	xencons_isconsole = 1;
+}
+
+int
+xenconscn_getc(dev_t dev)
+{
+
+	printf("\n");
+	for (;;);	/* XXX no console input support; spin forever */
+}
+
+#define MAXLINELEN 1024
+void
+xenconscn_putc(dev_t dev, int c)
+{
+	static char buf[MAXLINELEN+1];	/* +1 for NUL terminator */
+	static int bufpos = 0;
+
+	buf[bufpos++] = c;
+	if (c == '\n' || bufpos == MAXLINELEN) {	/* flush on newline OR full buffer, else long lines overflow buf */
+		buf[bufpos] = 0;
+		(void)HYPERVISOR_console_io(CONSOLEIO_write, bufpos, buf);
+		bufpos = 0;
+	}
+}
+
+void
+xenconscn_pollc(dev_t dev, int on)
+{
+
+}
+
+/*
+ * Set line parameters.
+ */
+int
+xencons_param(struct tty *tp, struct termios *t)
+{
+
+	tp->t_ispeed = t->c_ispeed;
+	tp->t_ospeed = t->c_ospeed;
+	tp->t_cflag = t->c_cflag;
+	return (0);
+}
+
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
new file mode 100644
index 0000000000..e54615567b
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
@@ -0,0 +1,600 @@
+/* $NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2004 Ben Harris.
+ * Copyright (c) 1998
+ *	Matthias Drochner.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $"); + +#include <sys/param.h> +#include <sys/device.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <dev/pckbport/pckbportvar.h> +#include <dev/ic/i8042reg.h> + +#include <machine/intr.h> + +#include <machine/xenkbcvar.h> +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs/kbd.h> +#include <machine/evtchn.h> + +#define KBC_DELAY DELAY(1000) +#define KBC_TIMEOUT 250 + +#define XENKBC_NSLOTS 2 + +/* data per slave device */ +struct xenkbc_slotdata { + int xsd_polling; /* don't process data in interrupt handler */ + int xsd_poll_data; /* data read from inr handler if polling */ + int xsd_poll_stat; /* status read from inr handler if polling */ +#if NRND > 0 + rndsource_element_t xsd_rnd_source; +#endif +}; + +struct xenkbc_internal { + struct xenkbc_softc *xi_sc; + struct pckbport_tag *xi_pt; + struct xenkbc_slotdata *xi_slotdata[XENKBC_NSLOTS]; + int xi_flags; + int xi_data; + int xi_8042cmdbyte; +}; + +#define XI_CONSOLE_FLAG 0x01 +#define XI_HASAUX_FLAG 0x02 + +#define XI_CONSOLE(xi) ((xi)->xi_flags & XI_CONSOLE_FLAG) +#define XI_HASAUX(xi) ((xi)->xi_flags & 
XI_HASAUX_FLAG) + +#define XI_SETCONSOLE(xi,on) \ + ((on) ? ((xi)->xi_flags |= XI_CONSOLE_FLAG) : \ + ((xi)->xi_flags &= ~XI_CONSOLE_FLAG)) +#define XI_SETHASAUX(xi,on) \ + ((on) ? ((xi)->xi_flags |= XI_HASAUX_FLAG) : \ + ((xi)->xi_flags &= ~XI_HASAUX_FLAG)) + +static int xenkbc_match(struct device *, struct cfdata *, void *); +static void xenkbc_attach(struct device *, struct device *, void *); + +static int xenkbc_xt_translation(void *, pckbport_slot_t, int); +static void xenkbc_init_slotdata(struct xenkbc_slotdata *); + +static int xenkbc_get8042cmd (struct xenkbc_internal *); +static int xenkbc_put8042cmd (struct xenkbc_internal *); +static int xenkbc_send_devcmd(void *, pckbport_slot_t, u_char); +static int xenkbc_send_cmd(void *, u_char); +static int xenkbc_send_data(void *, u_char); +static int xenkbc_poll_data1(void *, pckbport_slot_t); + +static void xenkbc_slot_enable(void *, pckbport_slot_t, int); +static void xenkbc_intr_establish(void *, pckbport_slot_t); +static void xenkbc_set_poll(void *, pckbport_slot_t, int); + +static int xenkbc_intr(void *); + +CFATTACH_DECL(xenkbc, sizeof(struct xenkbc_softc), + xenkbc_match, xenkbc_attach, NULL, NULL); + +static struct pckbport_accessops const xenkbc_ops = { + xenkbc_xt_translation, + xenkbc_send_devcmd, + xenkbc_poll_data1, + xenkbc_slot_enable, + xenkbc_intr_establish, + xenkbc_set_poll +}; + +static struct xenkbc_internal xenkbc_consdata; +static struct xenkbc_slotdata xenkbc_cons_slotdata; + +/* #define XENKBCDEBUG */ +#ifdef XENKBCDEBUG +#define DPRINTF(x) printf x +#else +#define DPRINTF(x) +#endif + + +static int +xenkbc_getstatus(struct xenkbc_internal *xi) +{ + long res; + + res = HYPERVISOR_kbd_op(KBD_OP_READ, 0); + if (res < 0) { + xi->xi_data = 0; + return 0; + } + xi->xi_data = KBD_CODE_SCANCODE(res); + return KBD_CODE_STATUS(res); +} + +static int +xenkbc_wait_output(struct xenkbc_internal *xi) +{ + u_int i; + + for (i = KBC_TIMEOUT; i; i--) { + if ((xenkbc_getstatus(xi) & KBS_IBF) == 0) + return 
(1); + KBC_DELAY; + } + return (0); +} + +static int +xenkbc_match(struct device *parent, struct cfdata *cf, void *aux) +{ + struct xenkbc_attach_args *xa = aux; + + if ((xen_start_info.flags & SIF_PRIVILEGED) == 0) + return 0; + + if (strcmp(xa->xa_device, "xenkbc")) + return 0; + + return 1; +} + +static int +xenkbc_attach_slot(struct xenkbc_softc *xs, pckbport_slot_t slot) +{ + struct xenkbc_internal *xi = xs->sc_xi; + struct device *child; + int alloced = 0; + + if (xi->xi_slotdata[slot] == NULL) { + xi->xi_slotdata[slot] = malloc(sizeof(struct xenkbc_slotdata), + M_DEVBUF, M_NOWAIT); + if (xi->xi_slotdata[slot] == NULL) { + printf("%s: no memory\n", xs->sc_dev.dv_xname); + return 0; + } + xenkbc_init_slotdata(xi->xi_slotdata[slot]); + alloced++; + } + + child = pckbport_attach_slot(&xs->sc_dev, xi->xi_pt, slot); + + if (child == NULL && alloced) { + free(xi->xi_slotdata[slot], M_DEVBUF); + xi->xi_slotdata[slot] = NULL; + } + +#if NRND > 0 + if (child != NULL && xi->xi_slotdata[slot] != NULL) + rnd_attach_source(&xi->xi_slotdata[slot]->xsd_rnd_source, + child->dv_xname, RND_TYPE_TTY, 0); +#endif + + return child != NULL; +} + +static void +xenkbc_attach(struct device *parent, struct device *self, void *aux) +{ + /* struct xenkbc_attach_args *xa = aux; */ + struct xenkbc_softc *xs = (struct xenkbc_softc *)self; + struct xenkbc_internal *xi; + int res; + u_char cmdbits = 0; + + if (XI_CONSOLE(&xenkbc_consdata)) + xi = &xenkbc_consdata; + else { + xi = malloc(sizeof(struct xenkbc_internal), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (xi == NULL) { + aprint_error(": no memory\n"); + return; + } + xi->xi_8042cmdbyte = KC8_CPU; + } + + aprint_normal(": Xen Keyboard/Mouse Device\n"); + + xs->sc_xi = xi; + xi->xi_sc = xs; + + event_set_handler(_EVENT_PS2, &xenkbc_intr, xi, IPL_TTY); + hypervisor_enable_event(_EVENT_PS2); + + xi->xi_pt = pckbport_attach(xi, &xenkbc_ops); + + /* flush */ + xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT); + + /* set initial cmd byte */ + if 
(!xenkbc_put8042cmd(xi)) { + printf("kbc: cmd word write error\n"); + return; + } + + if (xenkbc_attach_slot(xs, PCKBPORT_KBD_SLOT)) + cmdbits |= KC8_KENABLE; + + /* + * Check aux port ok. + */ + if (!xenkbc_send_cmd(xi, KBC_AUXECHO)) { + printf("kbc: aux echo error 1\n"); + goto nomouse; + } + if (!xenkbc_wait_output(xi)) { + printf("kbc: aux echo error 2\n"); + goto nomouse; + } + XI_SETHASAUX(xi, 1); + xenkbc_send_data(xi, 0x5a); /* a random value */ + res = xenkbc_poll_data1(xi, PCKBPORT_AUX_SLOT); + if (res != -1) { + /* + * In most cases, the 0x5a gets echoed. + * Some older controllers (Gateway 2000 circa 1993) + * return 0xfe here. + * We are satisfied if there is anything in the + * aux output buffer. + */ + if (xenkbc_attach_slot(xs, PCKBPORT_AUX_SLOT)) + cmdbits |= KC8_MENABLE; + } else { +#ifdef XENKBCDEBUG + printf("kbc: aux echo test failed\n"); +#endif + XI_SETHASAUX(xi, 0); + } + + nomouse: + /* enable needed interrupts */ + xi->xi_8042cmdbyte |= cmdbits; + if (!xenkbc_put8042cmd(xi)) + printf("kbc: cmd word write error\n"); +} + +static void +xenkbc_init_slotdata(struct xenkbc_slotdata *xsd) +{ + + xsd->xsd_polling = 0; +} + +/* + * Get the current command byte. + */ +static int +xenkbc_get8042cmd(struct xenkbc_internal *xi) +{ + int data; + + if (!xenkbc_send_cmd(xi, K_RDCMDBYTE)) + return 0; + data = xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT); + if (data == -1) + return 0; + xi->xi_8042cmdbyte = data; + return 1; +} + +/* + * Pass command byte to keyboard controller (8042). 
+ */ +static int +xenkbc_put8042cmd(struct xenkbc_internal *xi) +{ + + if (!xenkbc_send_cmd(xi, K_LDCMDBYTE)) + return 0; + if (!xenkbc_wait_output(xi)) + return 0; + return xenkbc_send_data(xi, xi->xi_8042cmdbyte); +} + +static int +xenkbc_send_devcmd(void *cookie, pckbport_slot_t slot, u_char devcmd) +{ + + DPRINTF(("send_devcmd %x\n", devcmd)); + + if (slot == PCKBPORT_AUX_SLOT) { + if (!xenkbc_send_cmd(cookie, KBC_AUXWRITE)) { + DPRINTF(("xenkbc_send_devcmd: KBC_AUXWRITE failed\n")); + return 0; + } + } + if (!xenkbc_wait_output(cookie)) { + DPRINTF(("xenkbc_send_devcmd: wait_output failed\n")); + return 0; + } + return xenkbc_send_data(cookie, devcmd); +} + +static int +xenkbc_send_cmd(void *cookie, u_char cmd) +{ + struct xenkbc_internal *xi = cookie; + + DPRINTF(("send_cmd %x\n", cmd)); + xenkbc_wait_output(xi); + return !HYPERVISOR_kbd_op(KBD_OP_WRITECOMMAND, cmd); +} + +static int +xenkbc_send_data(void *cookie, u_char output) +{ + struct xenkbc_internal *xi = cookie; + + DPRINTF(("send_data %x\n", output)); + xenkbc_wait_output(xi); + return !HYPERVISOR_kbd_op(KBD_OP_WRITEOUTPUT, output); +} + +static int +xenkbc_poll_data1(void *cookie, pckbport_slot_t slot) +{ + struct xenkbc_internal *xi = cookie; + struct xenkbc_slotdata *xsd = xi->xi_slotdata[slot]; + int s; + u_char stat, c; + int i = 1000; + + s = splhigh(); + + if (xsd && xsd->xsd_polling && xsd->xsd_poll_data != -1 && + xsd->xsd_poll_stat != -1) { + stat = xsd->xsd_poll_stat; + c = xsd->xsd_poll_data; + xsd->xsd_poll_data = -1; + xsd->xsd_poll_stat = -1; + goto process; + } + + DELAY(10); + for (; i; i--) { + stat = xenkbc_getstatus(xi); + if (stat & KBS_DIB) { + c = xi->xi_data; + DELAY(10); + process: + if (XI_HASAUX(xi) && (stat & 0x20)) { /* aux data */ + if (slot != PCKBPORT_AUX_SLOT) { +#ifdef XENKBCDEBUG + printf("lost aux 0x%x\n", c); +#endif + continue; + } + } else { + if (slot == PCKBPORT_AUX_SLOT) { +#ifdef XENKBCDEBUG + printf("lost kbd 0x%x\n", c); +#endif + continue; + } + } + 
splx(s); + DPRINTF(("poll -> %x stat %x\n", c, stat)); + return c; + } + } + + DPRINTF(("poll failed -> -1\n")); + splx(s); + return -1; +} + +/* + * switch scancode translation on / off + * return nonzero on success + */ +static int +xenkbc_xt_translation(void *cookie, pckbport_slot_t slot, int on) +{ + struct xenkbc_internal *xi = cookie; + int ison; + + if (slot != PCKBPORT_KBD_SLOT) { + /* translation only for kbd slot */ + if (on) + return 0; + else + return 1; + } + + ison = xi->xi_8042cmdbyte & KC8_TRANS; + if ((on && ison) || (!on && !ison)) + return 1; + + xi->xi_8042cmdbyte ^= KC8_TRANS; + if (!xenkbc_put8042cmd(xi)) + return 0; + + /* read back to be sure */ + if (!xenkbc_get8042cmd(xi)) + return 0; + + ison = xi->xi_8042cmdbyte & KC8_TRANS; + if ((on && ison) || (!on && !ison)) + return 1; + return 0; +} + +static const struct xenkbc_portcmd { + u_char cmd_en, cmd_dis; +} xenkbc_portcmd[2] = { + { + KBC_KBDENABLE, KBC_KBDDISABLE, + }, { + KBC_AUXENABLE, KBC_AUXDISABLE, + } +}; + +static void +xenkbc_slot_enable(void *cookie, pckbport_slot_t slot, int on) +{ + struct xenkbc_internal *xi = cookie; + const struct xenkbc_portcmd *cmd; + + cmd = &xenkbc_portcmd[slot]; + + DPRINTF(("slot enable %d -> %d\n", slot, on)); + xenkbc_send_cmd(xi, on ? cmd->cmd_en : cmd->cmd_dis); +} + + +static void +xenkbc_intr_establish(void *cookie, pckbport_slot_t slot) +{ + +} + +static void +xenkbc_set_poll(void *cookie, pckbport_slot_t slot, int on) +{ + struct xenkbc_internal *xi = cookie; + + DPRINTF(("xenkbc_set_poll %d -> %d\n", slot, on)); + + xi->xi_slotdata[slot]->xsd_polling = on; + + if (on) { + xi->xi_slotdata[slot]->xsd_poll_data = -1; + xi->xi_slotdata[slot]->xsd_poll_stat = -1; + } else { + int s; + + /* + * If disabling polling on a device that's been configured, + * make sure there are no bytes left in the FIFO, holding up + * the interrupt line. Otherwise we won't get any further + * interrupts. 
+ */ + s = spltty(); + xenkbc_intr(xi); + splx(s); + } +} + +static int +xenkbc_intr(void *self) +{ + struct xenkbc_internal *xi = self; + u_char stat; + pckbport_slot_t slot; + struct xenkbc_slotdata *xsd; + int served = 0; + + for (;;) { + stat = xenkbc_getstatus(xi); + if (!(stat & KBS_DIB)) + break; + + served = 1; + + slot = (XI_HASAUX(xi) && (stat & 0x20)) ? + PCKBPORT_AUX_SLOT : PCKBPORT_KBD_SLOT; + xsd = xi->xi_slotdata[slot]; + + if (xsd == NULL) + continue; + +#if NRND > 0 + rnd_add_uint32(&xsd->xsd_rnd_source, + (stat << 8) | xi->xi_data); +#endif + + if (xsd->xsd_polling) { + xsd->xsd_poll_data = xi->xi_data; + xsd->xsd_poll_stat = stat; + break; /* xenkbc_poll_data() will get it */ + } + + pckbportintr(xi->xi_pt, slot, xi->xi_data); + } + + return served; +} + +int +xenkbc_cnattach(pckbport_slot_t slot) +{ + struct xenkbc_internal *xi = &xenkbc_consdata; + int ret; + + /* flush */ + (void) xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT); + + /* init cmd byte, enable ports */ + xenkbc_consdata.xi_8042cmdbyte = KC8_CPU; + if (!xenkbc_put8042cmd(xi)) { + printf("kbc: cmd word write error\n"); + return EIO; + } + + ret = pckbport_cnattach(xi, &xenkbc_ops, slot); + + xi->xi_slotdata[slot] = &xenkbc_cons_slotdata; + xenkbc_init_slotdata(xi->xi_slotdata[slot]); + XI_SETCONSOLE(xi, 1); + + return ret; +} diff --git a/netbsd-2.0-xen-sparse/sys/nfs/files.nfs b/netbsd-2.0-xen-sparse/sys/nfs/files.nfs new file mode 100644 index 0000000000..228c0c890f --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/nfs/files.nfs @@ -0,0 +1,34 @@ +# $NetBSD: files.nfs,v 1.3 2004/03/11 21:48:43 cl Exp $ + +deffs fs_nfs.h NFS + +defflag opt_nfs_boot.h NFS_BOOT_BOOTP NFS_BOOT_BOOTPARAM NFS_BOOT_DHCP + NFS_BOOT_GATEWAY NFS_BOOT_TCP + NFS_BOOT_BOOTSTATIC + +defparam opt_nfs_boot.h NFS_BOOT_BOOTP_REQFILE NFS_BOOT_OPTIONS + NFS_BOOT_RWSIZE + NFS_BOOTSTATIC_MYIP NFS_BOOTSTATIC_GWIP + NFS_BOOTSTATIC_MASK NFS_BOOTSTATIC_SERVADDR + NFS_BOOTSTATIC_SERVER + +defflag opt_nfs.h NFS_V2_ONLY + +defflag 
NFSSERVER + +file nfs/krpc_subr.c nfs +file nfs/nfs_bio.c nfs +file nfs/nfs_boot.c nfs +file nfs/nfs_bootdhcp.c nfs & (nfs_boot_bootp | nfs_boot_dhcp) +file nfs/nfs_bootparam.c nfs & nfs_boot_bootparam +file nfs/nfs_bootstatic.c nfs & nfs_boot_bootstatic +file nfs/nfs_kq.c nfs +file nfs/nfs_node.c nfs +file nfs/nfs_nqlease.c nfsserver | nfs +file nfs/nfs_serv.c nfsserver +file nfs/nfs_socket.c nfsserver | nfs +file nfs/nfs_srvcache.c nfsserver +file nfs/nfs_subs.c nfsserver | nfs +file nfs/nfs_syscalls.c nfsserver | nfs +file nfs/nfs_vfsops.c nfs +file nfs/nfs_vnops.c nfs |