author     cl349@labyrinth.cl.cam.ac.uk <cl349@labyrinth.cl.cam.ac.uk>  2004-09-06 19:04:16 +0000
committer  cl349@labyrinth.cl.cam.ac.uk <cl349@labyrinth.cl.cam.ac.uk>  2004-09-06 19:04:16 +0000
commit     3ebb973d5045b339b449ecd3cd06cde2b00cafee
tree       9d7abaa5310208204f1979a6d20c208496319c84
parent     2a632d88f77c98ce72f4ead7c372f74a3fb840a8
bitkeeper revision 1.1159.72.2 (413cb4b0nYQ7KFQbxIn6g-4lsRAgbQ)
Add sparse tree for NetBSD.
Diffstat (limited to 'netbsd-2.0-xen-sparse')
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN | 176
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen | 232
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c | 630
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c | 408
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c | 230
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S | 2000
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c | 2561
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c | 4522
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c | 550
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S | 1587
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c | 680
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h | 130
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h | 423
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h | 110
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h | 533
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h | 247
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h | 135
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h | 193
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c | 505
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c | 234
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c | 226
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c | 1241
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c | 1368
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c | 444
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c | 352
-rw-r--r--  netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c | 600
-rw-r--r--  netbsd-2.0-xen-sparse/sys/nfs/files.nfs | 34
27 files changed, 20351 insertions(+), 0 deletions(-)
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN
new file mode 100644
index 0000000000..2fbb9998ac
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN
@@ -0,0 +1,176 @@
+# $NetBSD: XEN,v 1.1.2.2 2004/07/15 20:19:34 he Exp $
+
+include "arch/xen/conf/std.xen"
+
+options INCLUDE_CONFIG_FILE # embed config file in kernel binary
+
+#options UVMHIST
+#options UVMHIST_PRINT
+#options SYSCALL_DEBUG
+
+maxusers 32 # estimated number of users
+
+#
+options XEN
+#options DOM0OPS
+options HZ=50
+
+#options I586_CPU
+options I686_CPU
+
+#options VM86 # virtual 8086 emulation
+#options USER_LDT # user-settable LDT; used by WINE
+
+#options MTRR # memory-type range register syscall support
+
+#options CONSDEVNAME="\"xencons\""
+#options CONS_OVERRIDE
+
+options INSECURE # disable kernel security levels - X needs this
+
+options RTC_OFFSET=0 # hardware clock is this many mins. west of GMT
+#options NTP # NTP phase/frequency locked loop
+
+options KTRACE # system call tracing via ktrace(1)
+#options SYSTRACE # system call vetting via systrace(1)
+
+options SYSVMSG # System V-like message queues
+options SYSVSEM # System V-like semaphores
+#options SEMMNI=10 # number of semaphore identifiers
+#options SEMMNS=60 # number of semaphores in system
+#options SEMUME=10 # max number of undo entries per process
+#options SEMMNU=30 # number of undo structures in system
+options SYSVSHM # System V-like memory sharing
+#options SHMMAXPGS=2048 # 2048 pages is the default
+options P1003_1B_SEMAPHORE # p1003.1b semaphore support
+
+options LKM # loadable kernel modules
+
+options USERCONF # userconf(4) support
+options SYSCTL_INCLUDE_DESCR # Include sysctl descriptions in kernel
+
+# Diagnostic/debugging support options
+options DIAGNOSTIC # expensive kernel consistency checks
+options DEBUG # expensive debugging checks/support
+options KMEMSTATS # kernel memory statistics (vmstat -m)
+options DDB # in-kernel debugger
+options DDB_ONPANIC=1 # see also sysctl(8): `ddb.onpanic'
+options DDB_HISTORY_SIZE=512 # enable history editing in DDB
+#options KGDB # remote debugger
+#options KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600
+makeoptions DEBUG="-g" # compile full symbol table
+
+#options COMPAT_14 # NetBSD 1.4
+#options COMPAT_15 # NetBSD 1.5
+options COMPAT_16 # NetBSD 1.6
+
+##options COMPAT_LINUX # binary compatibility with Linux
+#options COMPAT_FREEBSD # binary compatibility with FreeBSD
+#options COMPAT_MACH # binary compatibility with Mach binaries
+#options COMPAT_DARWIN # binary compatibility with Darwin binaries
+#options EXEC_MACHO # exec MACH-O binaries
+#options COMPAT_PECOFF # kernel support to run Win32 apps
+
+file-system FFS # UFS
+file-system EXT2FS # second extended file system (linux)
+#file-system LFS # log-structured file system
+#file-system MFS # memory file system
+file-system NFS # Network File System client
+#file-system NTFS # Windows/NT file system (experimental)
+#file-system CD9660 # ISO 9660 + Rock Ridge file system
+#file-system MSDOSFS # MS-DOS file system
+file-system FDESC # /dev/fd
+file-system KERNFS # /kern
+file-system NULLFS # loopback file system
+#file-system OVERLAY # overlay file system
+#file-system PORTAL # portal filesystem (still experimental)
+file-system PROCFS # /proc
+#file-system UMAPFS # NULLFS + uid and gid remapping
+#file-system UNION # union file system
+#file-system SMBFS # experimental - CIFS; also needs nsmb (below)
+
+#options QUOTA # UFS quotas
+#options SOFTDEP # FFS soft updates support.
+#options NFSSERVER # Network File System server
+
+options GATEWAY # packet forwarding
+options INET # IP + ICMP + TCP + UDP
+options INET6 # IPV6
+options IPSEC # IP security
+options IPSEC_ESP # IP security (encryption part; define w/IPSEC)
+options MROUTING # IP multicast routing
+options PFIL_HOOKS # pfil(9) packet filter hooks
+options IPFILTER_LOG # ipmon(8) log support
+
+options NFS_BOOT_DHCP,NFS_BOOT_BOOTPARAM,NFS_BOOT_BOOTSTATIC
+#options NFS_BOOTSTATIC_MYIP="\"169.254.1.2\""
+#options NFS_BOOTSTATIC_GWIP="\"169.254.1.1\""
+#options NFS_BOOTSTATIC_MASK="\"255.255.255.0\""
+#options NFS_BOOTSTATIC_SERVADDR="\"169.254.1.1\""
+#options NFS_BOOTSTATIC_SERVER="\"server:/path/to/root\""
+
+options WSEMUL_VT100 # VT100 / VT220 emulation
+options WS_KERNEL_FG=WSCOL_GREEN
+options WSDISPLAY_COMPAT_PCVT # emulate some ioctls
+options WSDISPLAY_COMPAT_SYSCONS # emulate some ioctls
+options WSDISPLAY_COMPAT_USL # VT handling
+options WSDISPLAY_COMPAT_RAWKBD # can get raw scancodes
+options WSDISPLAY_DEFAULTSCREENS=4
+options PCDISPLAY_SOFTCURSOR
+
+config netbsd root on ? type ?
+#config netbsd root on wd0a type ffs
+#config netbsd root on xennet0 type nfs
+
+mainbus0 at root
+
+cpu* at mainbus?
+
+hypervisor* at mainbus? # Xen hypervisor
+
+npx0 at hypervisor? # x86 math coprocessor
+
+xencons* at hypervisor? # Xen virtual console
+xennet* at hypervisor? # Xen virtual network interface
+
+#xbd* at hypervisor? # Xen virtual block device
+#wd* at hypervisor? # Xen vbd (wd identity)
+#sd* at hypervisor? # Xen vbd (sd identity)
+#cd* at hypervisor? # Xen vbd (cd identity)
+
+#xenkbc* at hypervisor? # Xen Keyboard/Mouse Interface
+#pckbd* at xenkbc? # Keyboard
+#vga* at hypervisor? # Xen VGA display
+#pms* at xenkbc? # PS/2 Mouse for wsmouse
+
+#wskbd* at pckbd? console ?
+#wsdisplay* at vga? console ?
+#wsmouse* at pms? mux 0
+
+
+include "arch/xen/conf/GENERIC.local"
+
+
+pseudo-device ccd 4 # concatenated/striped disk devices
+#pseudo-device cgd 4 # cryptographic disk devices
+#pseudo-device md 1 # memory disk device (ramdisk)
+#pseudo-device vnd 4 # disk-like interface to files
+
+pseudo-device bpfilter 8 # Berkeley packet filter
+pseudo-device ipfilter # IP filter (firewall) and NAT
+pseudo-device loop # network loopback
+#pseudo-device tun 2 # network tunneling over tty
+#pseudo-device gre 2 # generic L3 over IP tunnel
+#pseudo-device gif 4 # IPv[46] over IPv[46] tunnel (RFC1933)
+#pseudo-device faith 1 # IPv[46] tcp relay translation i/f
+#pseudo-device stf 1 # 6to4 IPv6 over IPv4 encapsulation
+#pseudo-device vlan # IEEE 802.1q encapsulation
+#pseudo-device bridge # simple inter-network bridging
+
+pseudo-device pty # pseudo-terminals
+pseudo-device rnd # /dev/random and in-kernel generator
+pseudo-device clockctl # user control of clock subsystem
+
+pseudo-device wsmux # mouse & keyboard multiplexor
+pseudo-device wsfont
+pseudo-device ksyms # /dev/ksyms
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen
new file mode 100644
index 0000000000..12f6bfa1d5
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen
@@ -0,0 +1,232 @@
+# $NetBSD: files.xen,v 1.3.2.1 2004/05/22 15:59:02 he Exp $
+# NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp
+# NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp
+
+maxpartitions 8
+
+maxusers 2 16 128
+
+# Processor type options.
+defflag opt_cputype.h I686_CPU
+
+# delay before cpu_reset() for reboot.
+defparam CPURESET_DELAY
+
+# No unmapped page below kernel stack
+defflag NOREDZONE
+
+# Beep on halt
+defflag opt_beep.h BEEP_ONHALT
+defparam opt_beep.h BEEP_ONHALT_COUNT
+defparam opt_beep.h BEEP_ONHALT_PITCH BEEP_ONHALT_PERIOD
+
+file arch/xen/i386/autoconf.c
+file arch/i386/i386/db_dbgreg.S ddb | kstack_check_dr0
+file arch/i386/i386/db_disasm.c ddb
+file arch/i386/i386/db_interface.c ddb
+file arch/i386/i386/db_memrw.c ddb | kgdb
+file arch/i386/i386/db_trace.c ddb
+file kern/subr_disk_mbr.c disk
+file arch/xen/i386/gdt.c
+file arch/xen/i386/hypervisor_machdep.c
+file arch/i386/i386/in_cksum.S inet | inet6
+file arch/i386/i386/ipkdb_glue.c ipkdb
+file arch/i386/i386/kgdb_machdep.c kgdb
+file arch/xen/i386/machdep.c
+file arch/xen/i386/identcpu.c
+file arch/i386/i386/math_emulate.c math_emulate
+file arch/i386/i386/mem.c
+file kern/kern_microtime.c i586_cpu | i686_cpu
+file arch/i386/i386/mtrr_k6.c mtrr
+file netns/ns_cksum.c ns
+file arch/xen/i386/pmap.c
+file arch/i386/i386/process_machdep.c
+file arch/i386/i386/procfs_machdep.c procfs
+file arch/xen/i386/sys_machdep.c
+file arch/i386/i386/syscall.c
+file arch/xen/i386/trap.c
+file arch/i386/i386/vm_machdep.c
+file arch/xen/i386/xen_machdep.c
+
+file arch/xen/xen/xen_debug.c
+
+file arch/xen/xen/clock.c
+file arch/xen/xen/evtchn.c
+file arch/xen/xen/ctrl_if.c
+
+file dev/cons.c
+
+file arch/i386/i386/mptramp.S multiprocessor
+file arch/i386/i386/ipifuncs.c multiprocessor
+
+file arch/i386/i386/pmc.c perfctrs
+
+file crypto/des/arch/i386/des_enc.S des
+file crypto/des/arch/i386/des_cbc.S des
+
+file crypto/blowfish/arch/i386/bf_enc.S blowfish
+file crypto/blowfish/arch/i386/bf_cbc.S blowfish & !i386_cpu
+
+#
+# Machine-independent SCSI drivers
+#
+
+#xxx include "dev/scsipi/files.scsipi"
+
+#
+# Machine-independent ATA drivers
+#
+
+#xxx include "dev/ata/files.ata"
+
+# Memory Disk for install floppy
+file dev/md_root.c memory_disk_hooks
+
+#
+define mainbus { [apid = -1] }
+
+file arch/x86/x86/bus_dma.c
+file arch/xen/x86/bus_space.c
+file arch/x86/x86/cacheinfo.c
+file arch/xen/x86/consinit.c
+file arch/xen/x86/intr.c
+file arch/x86/x86/ipi.c multiprocessor
+file arch/x86/x86/lock_machdep.c lockdebug
+file arch/x86/x86/softintr.c
+
+include "arch/xen/conf/files.compat"
+
+#
+# System bus types
+#
+
+device mainbus: mainbus
+attach mainbus at root
+file arch/xen/i386/mainbus.c mainbus
+
+# Xen hypervisor
+device hypervisor { }
+attach hypervisor at mainbus
+file arch/xen/xen/hypervisor.c hypervisor needs-flag
+
+# Numeric Processing Extension; Math Co-processor
+device npx
+file arch/xen/i386/npx.c npx needs-flag
+
+attach npx at hypervisor with npx_hv
+file arch/xen/i386/npx_hv.c npx_hv
+
+# Xen console support
+device xencons: tty
+attach xencons at hypervisor
+file arch/xen/xen/xencons.c xencons needs-flag
+
+include "dev/wscons/files.wscons"
+include "dev/wsfont/files.wsfont"
+
+include "dev/pckbport/files.pckbport"
+
+# CPUS
+
+define cpu { [apid = -1] }
+device cpu
+attach cpu at mainbus
+file arch/xen/i386/cpu.c cpu
+
+#
+# Compatibility modules
+#
+
+# VM86 mode
+file arch/i386/i386/vm86.c vm86
+
+# VM86 in kernel
+file arch/i386/i386/kvm86.c kvm86
+file arch/i386/i386/kvm86call.S kvm86
+
+# Binary compatibility with previous NetBSD releases (COMPAT_XX)
+file arch/i386/i386/compat_13_machdep.c compat_13 | compat_aout
+file arch/i386/i386/compat_16_machdep.c compat_16 | compat_ibcs2
+
+# SVR4 binary compatibility (COMPAT_SVR4)
+include "compat/svr4/files.svr4"
+file arch/i386/i386/svr4_machdep.c compat_svr4
+file arch/i386/i386/svr4_sigcode.S compat_svr4
+file arch/i386/i386/svr4_syscall.c compat_svr4
+
+# MACH binary compatibility (COMPAT_MACH)
+include "compat/mach/files.mach"
+file arch/i386/i386/mach_machdep.c compat_mach | compat_darwin
+file arch/i386/i386/mach_sigcode.S compat_mach | compat_darwin
+file arch/i386/i386/mach_syscall.c compat_mach | compat_darwin
+file arch/i386/i386/macho_machdep.c exec_macho
+
+# DARWIN binary compatibility (COMPAT_DARWIN)
+include "compat/darwin/files.darwin"
+file arch/i386/i386/darwin_machdep.c compat_darwin
+
+# iBCS-2 binary compatibility (COMPAT_IBCS2)
+include "compat/ibcs2/files.ibcs2"
+file arch/i386/i386/ibcs2_machdep.c compat_ibcs2
+file arch/i386/i386/ibcs2_sigcode.S compat_ibcs2
+file arch/i386/i386/ibcs2_syscall.c compat_ibcs2
+
+# Linux binary compatibility (COMPAT_LINUX)
+include "compat/linux/files.linux"
+include "compat/linux/arch/i386/files.linux_i386"
+file arch/i386/i386/linux_sigcode.S compat_linux
+file arch/i386/i386/linux_syscall.c compat_linux
+file arch/i386/i386/linux_trap.c compat_linux
+
+# FreeBSD binary compatibility (COMPAT_FREEBSD)
+include "compat/freebsd/files.freebsd"
+file arch/i386/i386/freebsd_machdep.c compat_freebsd
+file arch/i386/i386/freebsd_sigcode.S compat_freebsd
+file arch/i386/i386/freebsd_syscall.c compat_freebsd
+
+# a.out binary compatibility (COMPAT_AOUT)
+include "compat/aout/files.aout"
+
+# Win32 binary compatibility (COMPAT_PECOFF)
+include "compat/pecoff/files.pecoff"
+
+# OSS audio driver compatibility
+include "compat/ossaudio/files.ossaudio"
+
+# Xen devices
+
+# Network driver
+device xennet: arp, ether, ifnet
+attach xennet at hypervisor
+file arch/xen/xen/if_xennet.c xennet needs-flag
+
+# Block device driver and wd/sd/cd identities
+device xbd: disk
+attach xbd at hypervisor
+file arch/xen/xen/xbd.c xbd | wd | sd | cd needs-flag
+
+device wd: disk
+attach wd at hypervisor
+
+device sd: disk
+attach sd at hypervisor
+
+device cd: disk
+attach cd at hypervisor
+
+# Keyboard
+device xenkbc: pckbport
+attach xenkbc at hypervisor
+file arch/xen/xen/xenkbc.c xenkbc needs-flag
+
+# Generic VGA
+attach vga at hypervisor with vga_xen
+file arch/xen/xen/vga_xen.c vga_xen needs-flag
+
+# Domain-0 operations
+defflag opt_xen.h DOM0OPS
+file arch/xen/xen/machmem.c dom0ops
+file arch/xen/xen/privcmd.c dom0ops
+file arch/xen/xen/vfr.c dom0ops
+
+include "arch/xen/conf/majors.i386"
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c
new file mode 100644
index 0000000000..766b7aaee2
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c
@@ -0,0 +1,630 @@
+/* $NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $ */
+/* NetBSD: autoconf.c,v 1.75 2003/12/30 12:33:22 pk Exp */
+
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)autoconf.c 7.1 (Berkeley) 5/9/91
+ */
+
+/*
+ * Setup the system to run on the current machine.
+ *
+ * Configure() is called at boot time and initializes the vba
+ * device tables and the memory controller monitoring. Available
+ * devices are determined (from possibilities mentioned in ioconf.c),
+ * and the drivers are initialized.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $");
+
+#include "opt_compat_oldboot.h"
+#include "opt_multiprocessor.h"
+#include "opt_nfs_boot.h"
+#include "xennet.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/disklabel.h>
+#include <sys/conf.h>
+#ifdef COMPAT_OLDBOOT
+#include <sys/reboot.h>
+#endif
+#include <sys/device.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/fcntl.h>
+#include <sys/dkio.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+
+#ifdef NFS_BOOT_BOOTSTATIC
+#include <net/if.h>
+#include <net/if_ether.h>
+#include <netinet/in.h>
+#include <nfs/rpcv2.h>
+#include <nfs/nfsproto.h>
+#include <nfs/nfs.h>
+#include <nfs/nfsmount.h>
+#include <nfs/nfsdiskless.h>
+#include <machine/if_xennetvar.h>
+#endif
+
+#include <machine/pte.h>
+#include <machine/cpu.h>
+#include <machine/gdt.h>
+#include <machine/pcb.h>
+#include <machine/bootinfo.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+
+#if NIOAPIC > 0
+#include <machine/i82093var.h>
+#endif
+
+#if NLAPIC > 0
+#include <machine/i82489var.h>
+#endif
+
+static int match_harddisk(struct device *, struct btinfo_bootdisk *);
+static void matchbiosdisks(void);
+static void findroot(void);
+static int is_valid_disk(struct device *);
+
+extern struct disklist *i386_alldisks;
+extern int i386_ndisks;
+
+#include "bios32.h"
+#if NBIOS32 > 0
+#include <machine/bios32.h>
+#endif
+
+#include "opt_pcibios.h"
+#ifdef PCIBIOS
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <i386/pci/pcibios.h>
+#endif
+
+#include "opt_kvm86.h"
+#ifdef KVM86
+#include <machine/kvm86.h>
+#endif
+
+#include "opt_xen.h"
+
+struct device *booted_device;
+int booted_partition;
+
+/*
+ * Determine i/o configuration for a machine.
+ */
+void
+cpu_configure(void)
+{
+
+ startrtclock();
+
+#if NBIOS32 > 0
+ bios32_init();
+#endif
+#ifdef PCIBIOS
+ pcibios_init();
+#endif
+
+ /* kvm86 needs a TSS */
+ i386_proc0_tss_ldt_init();
+#ifdef KVM86
+ kvm86_init();
+#endif
+
+ if (config_rootfound("mainbus", NULL) == NULL)
+ panic("configure: mainbus not configured");
+
+#ifdef INTRDEBUG
+ intr_printconfig();
+#endif
+
+#if NIOAPIC > 0
+ lapic_set_lvt();
+ ioapic_enable();
+#endif
+ /* resync cr0 after FPU configuration */
+ lwp0.l_addr->u_pcb.pcb_cr0 = rcr0();
+#ifdef MULTIPROCESSOR
+ /* propagate this to the idle pcb's. */
+ cpu_init_idle_pcbs();
+#endif
+
+ spl0();
+#if NLAPIC > 0
+ lapic_tpr = 0;
+#endif
+}
+
+void
+cpu_rootconf(void)
+{
+ findroot();
+ matchbiosdisks();
+
+ printf("boot device: %s\n",
+ booted_device ? booted_device->dv_xname : "<unknown>");
+
+ setroot(booted_device, booted_partition);
+}
+
+/*
+ * XXX ugly bit of code. But, this is the only safe time that the
+ * match between BIOS disks and native disks can be done.
+ */
+static void
+matchbiosdisks(void)
+{
+ struct btinfo_biosgeom *big;
+ struct bi_biosgeom_entry *be;
+ struct device *dv;
+ int i, ck, error, m, n;
+ struct vnode *tv;
+ char mbr[DEV_BSIZE];
+ int dklist_size;
+ int bmajor;
+
+ big = lookup_bootinfo(BTINFO_BIOSGEOM);
+
+ if (big == NULL)
+ return;
+
+ /*
+ * First, count all native disks
+ */
+ for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next)
+ if (is_valid_disk(dv))
+ i386_ndisks++;
+
+ if (i386_ndisks == 0)
+ return;
+
+ dklist_size = sizeof (struct disklist) + (i386_ndisks - 1) *
+ sizeof (struct nativedisk_info);
+
+ /* XXX M_TEMP is wrong */
+ i386_alldisks = malloc(dklist_size, M_TEMP, M_NOWAIT);
+ if (i386_alldisks == NULL)
+ return;
+
+ memset(i386_alldisks, 0, dklist_size);
+
+ i386_alldisks->dl_nnativedisks = i386_ndisks;
+ i386_alldisks->dl_nbiosdisks = big->num;
+ for (i = 0; i < big->num; i++) {
+ i386_alldisks->dl_biosdisks[i].bi_dev = big->disk[i].dev;
+ i386_alldisks->dl_biosdisks[i].bi_sec = big->disk[i].sec;
+ i386_alldisks->dl_biosdisks[i].bi_head = big->disk[i].head;
+ i386_alldisks->dl_biosdisks[i].bi_cyl = big->disk[i].cyl;
+ i386_alldisks->dl_biosdisks[i].bi_lbasecs = big->disk[i].totsec;
+ i386_alldisks->dl_biosdisks[i].bi_flags = big->disk[i].flags;
+#ifdef GEOM_DEBUG
+#ifdef NOTYET
+ printf("disk %x: flags %x, interface %x, device %llx\n",
+ big->disk[i].dev, big->disk[i].flags,
+ big->disk[i].interface_path, big->disk[i].device_path);
+#endif
+#endif
+ }
+
+ /*
+ * XXX code duplication from findroot()
+ */
+ n = -1;
+ for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+ if (dv->dv_class != DV_DISK)
+ continue;
+#ifdef GEOM_DEBUG
+ printf("matchbiosdisks: trying to match (%s) %s\n",
+ dv->dv_xname, dv->dv_cfdata->cf_name);
+#endif
+ if (is_valid_disk(dv)) {
+ n++;
+ sprintf(i386_alldisks->dl_nativedisks[n].ni_devname,
+ "%s%d", dv->dv_cfdata->cf_name,
+ dv->dv_unit);
+
+ bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
+ if (bmajor == -1)
+ return;
+
+ if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, RAW_PART),
+ &tv))
+ panic("matchbiosdisks: can't alloc vnode");
+
+ error = VOP_OPEN(tv, FREAD, NOCRED, 0);
+ if (error) {
+ vput(tv);
+ continue;
+ }
+ error = vn_rdwr(UIO_READ, tv, mbr, DEV_BSIZE, 0,
+ UIO_SYSSPACE, 0, NOCRED, NULL, 0);
+ VOP_CLOSE(tv, FREAD, NOCRED, 0);
+ if (error) {
+#ifdef GEOM_DEBUG
+ printf("matchbiosdisks: %s: MBR read failure\n",
+ dv->dv_xname);
+#endif
+ continue;
+ }
+
+ for (ck = i = 0; i < DEV_BSIZE; i++)
+ ck += mbr[i];
+ for (m = i = 0; i < big->num; i++) {
+ be = &big->disk[i];
+#ifdef GEOM_DEBUG
+ printf("match %s with %d ", dv->dv_xname, i);
+ printf("dev ck %x bios ck %x\n", ck, be->cksum);
+#endif
+ if (be->flags & BI_GEOM_INVALID)
+ continue;
+ if (be->cksum == ck &&
+ !memcmp(&mbr[MBR_PART_OFFSET], be->dosparts,
+ MBR_PART_COUNT *
+ sizeof (struct mbr_partition))) {
+#ifdef GEOM_DEBUG
+ printf("matched bios disk %x with %s\n",
+ be->dev, dv->dv_xname);
+#endif
+ i386_alldisks->dl_nativedisks[n].
+ ni_biosmatches[m++] = i;
+ }
+ }
+ i386_alldisks->dl_nativedisks[n].ni_nmatches = m;
+ vput(tv);
+ }
+ }
+}
+
+#ifdef COMPAT_OLDBOOT
+u_long bootdev = 0; /* should be dev_t, but not until 32 bits */
+#endif
+
+/*
+ * helper function for "findroot()":
+ * return nonzero if disk device matches bootinfo
+ */
+static int
+match_harddisk(struct device *dv, struct btinfo_bootdisk *bid)
+{
+ struct vnode *tmpvn;
+ int error;
+ struct disklabel label;
+ int found = 0;
+ int bmajor;
+
+ /*
+ * A disklabel is required here. The
+ * bootblocks don't refuse to boot from
+ * a disk without a label, but this is
+ * normally not wanted.
+ */
+ if (bid->labelsector == -1)
+ return(0);
+
+ /*
+ * lookup major number for disk block device
+ */
+ bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
+ if (bmajor == -1)
+ return(0); /* XXX panic() ??? */
+
+ /*
+ * Fake a temporary vnode for the disk, open
+ * it, and read the disklabel for comparison.
+ */
+ if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, bid->partition), &tmpvn))
+ panic("findroot can't alloc vnode");
+ error = VOP_OPEN(tmpvn, FREAD, NOCRED, 0);
+ if (error) {
+#ifndef DEBUG
+ /*
+ * Ignore errors caused by missing
+ * device, partition or medium.
+ */
+ if (error != ENXIO && error != ENODEV)
+#endif
+ printf("findroot: can't open dev %s%c (%d)\n",
+ dv->dv_xname, 'a' + bid->partition, error);
+ vput(tmpvn);
+ return(0);
+ }
+ error = VOP_IOCTL(tmpvn, DIOCGDINFO, &label, FREAD, NOCRED, 0);
+ if (error) {
+ /*
+ * XXX can't happen - open() would
+ * have errored out (or faked up one)
+ */
+ printf("can't get label for dev %s%c (%d)\n",
+ dv->dv_xname, 'a' + bid->partition, error);
+ goto closeout;
+ }
+
+ /* compare with our data */
+ if (label.d_type == bid->label.type &&
+ label.d_checksum == bid->label.checksum &&
+ !strncmp(label.d_packname, bid->label.packname, 16))
+ found = 1;
+
+closeout:
+ VOP_CLOSE(tmpvn, FREAD, NOCRED, 0);
+ vput(tmpvn);
+ return(found);
+}
+
+/*
+ * Attempt to find the device from which we were booted.
+ * If we can do so, and not instructed not to do so,
+ * change rootdev to correspond to the load device.
+ */
+void
+findroot(void)
+{
+ struct btinfo_bootdisk *bid;
+ struct device *dv;
+ union xen_cmdline_parseinfo xcp;
+#ifdef COMPAT_OLDBOOT
+	int i, majdev, unit, part;
+	const char *name;
+	char buf[32];
+#endif
+
+ if (booted_device)
+ return;
+
+ if (lookup_bootinfo(BTINFO_NETIF)) {
+ /*
+ * We got netboot interface information, but
+ * "device_register()" couldn't match it to a configured
+ * device. Bootdisk information cannot be present at the
+ * same time, so give up.
+ */
+ printf("findroot: netboot interface not found\n");
+ return;
+ }
+
+ bid = lookup_bootinfo(BTINFO_BOOTDISK);
+ if (bid) {
+ /*
+ * Scan all disk devices for ones that match the passed data.
+ * Don't break if one is found, to get possible multiple
+ * matches - for problem tracking. Use the first match anyway
+ * because lower device numbers are more likely to be the
+ * boot device.
+ */
+ for (dv = alldevs.tqh_first; dv != NULL;
+ dv = dv->dv_list.tqe_next) {
+ if (dv->dv_class != DV_DISK)
+ continue;
+
+ if (!strcmp(dv->dv_cfdata->cf_name, "fd")) {
+ /*
+ * Assume the configured unit number matches
+ * the BIOS device number. (This is the old
+ * behaviour.) Needs some ideas how to handle
+ * BIOS's "swap floppy drive" options.
+ */
+ if ((bid->biosdev & 0x80) ||
+ dv->dv_unit != bid->biosdev)
+ continue;
+
+ goto found;
+ }
+
+ if (is_valid_disk(dv)) {
+ /*
+ * Don't trust BIOS device numbers, try
+ * to match the information passed by the
+ * bootloader instead.
+ */
+ if ((bid->biosdev & 0x80) == 0 ||
+ !match_harddisk(dv, bid))
+ continue;
+
+ goto found;
+ }
+
+ /* no "fd", "wd", "sd", "ld", "ed" */
+ continue;
+
+found:
+ if (booted_device) {
+ printf("warning: double match for boot "
+ "device (%s, %s)\n",
+ booted_device->dv_xname, dv->dv_xname);
+ continue;
+ }
+ booted_device = dv;
+ booted_partition = bid->partition;
+ }
+
+ if (booted_device)
+ return;
+ }
+
+ xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
+
+ for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+ if (is_valid_disk(dv) == 0)
+ continue;
+
+ if (xcp.xcp_bootdev[0] == 0) {
+ booted_device = dv;
+ break;
+ }
+
+ if (strncmp(xcp.xcp_bootdev, dv->dv_xname,
+ strlen(dv->dv_xname)))
+ continue;
+
+ if (strlen(xcp.xcp_bootdev) > strlen(dv->dv_xname)) {
+ booted_partition = toupper(
+ xcp.xcp_bootdev[strlen(dv->dv_xname)]) - 'A';
+ }
+
+ booted_device = dv;
+ break;
+ }
+
+ if (booted_device)
+ return;
+
+#ifdef COMPAT_OLDBOOT
+#if 0
+ printf("howto %x bootdev %x ", boothowto, bootdev);
+#endif
+
+ if ((bootdev & B_MAGICMASK) != (u_long)B_DEVMAGIC)
+ return;
+
+ majdev = (bootdev >> B_TYPESHIFT) & B_TYPEMASK;
+ name = devsw_blk2name(majdev);
+ if (name == NULL)
+ return;
+
+ part = (bootdev >> B_PARTITIONSHIFT) & B_PARTITIONMASK;
+ unit = (bootdev >> B_UNITSHIFT) & B_UNITMASK;
+
+ sprintf(buf, "%s%d", name, unit);
+ for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+ if (strcmp(buf, dv->dv_xname) == 0) {
+ booted_device = dv;
+ booted_partition = part;
+ return;
+ }
+ }
+#endif
+}
+
+#include "pci.h"
+
+#include <dev/isa/isavar.h>
+#if NPCI > 0
+#include <dev/pci/pcivar.h>
+#endif
+
+void
+device_register(struct device *dev, void *aux)
+{
+	/*
+	 * Handle network interfaces here; the attachment information is
+	 * not available driver-independently later.
+	 * For disks, there is nothing useful available at attach time.
+	 */
+#if NXENNET > 0
+ if (dev->dv_class == DV_IFNET) {
+ union xen_cmdline_parseinfo xcp;
+
+ xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
+ if (strncmp(xcp.xcp_bootdev, dev->dv_xname, 16) == 0) {
+#ifdef NFS_BOOT_BOOTSTATIC
+ nfs_bootstatic_callback = xennet_bootstatic_callback;
+#endif
+ goto found;
+ }
+ }
+#endif
+ if (dev->dv_class == DV_IFNET) {
+ struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF);
+ if (bin == NULL)
+ return;
+
+ /*
+ * We don't check the driver name against the device name
+ * passed by the boot ROM. The ROM should stay usable
+ * if the driver gets obsoleted.
+ * The physical attachment information (checked below)
+ * must be sufficient to identify the device.
+ */
+
+ if (bin->bus == BI_BUS_ISA &&
+ !strcmp(dev->dv_parent->dv_cfdata->cf_name, "isa")) {
+ struct isa_attach_args *iaa = aux;
+
+ /* compare IO base address */
+ /* XXXJRT what about multiple I/O addrs? */
+ if (iaa->ia_nio > 0 &&
+ bin->addr.iobase == iaa->ia_io[0].ir_addr)
+ goto found;
+ }
+#if NPCI > 0
+ if (bin->bus == BI_BUS_PCI &&
+ !strcmp(dev->dv_parent->dv_cfdata->cf_name, "pci")) {
+ struct pci_attach_args *paa = aux;
+ int b, d, f;
+
+ /*
+ * Calculate BIOS representation of:
+ *
+ * <bus,device,function>
+ *
+ * and compare.
+ */
+ pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f);
+ if (bin->addr.tag == ((b << 8) | (d << 3) | f))
+ goto found;
+ }
+#endif
+ }
+ return;
+
+found:
+ if (booted_device) {
+ /* XXX should be a "panic()" */
+ printf("warning: double match for boot device (%s, %s)\n",
+ booted_device->dv_xname, dev->dv_xname);
+ return;
+ }
+ booted_device = dev;
+}
+
+static int
+is_valid_disk(struct device *dv)
+{
+ const char *name;
+
+ if (dv->dv_class != DV_DISK)
+ return (0);
+
+ name = dv->dv_cfdata->cf_name;
+
+ return (strcmp(name, "sd") == 0 || strcmp(name, "wd") == 0 ||
+ strcmp(name, "ld") == 0 || strcmp(name, "ed") == 0 ||
+ strcmp(name, "xbd") == 0);
+}
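
The tail of findroot() above matches the bootdev string from the Xen command line against each device's dv_xname by prefix and derives the partition from any trailing letter. A small standalone illustration of that arithmetic (the sample strings are made up; the real values come from xen_parse_cmdline()):

/*
 * Standalone illustration of the bootdev parsing used by findroot():
 * "xbd0a" matches device "xbd0" by prefix, and the trailing 'a' selects
 * partition 0.  Sample strings are hypothetical.
 */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *bootdev = "xbd0a";	/* as it would come from the command line */
	const char *dv_xname = "xbd0";	/* candidate device name */
	int booted_partition = 0;	/* defaults to partition 'a' */

	if (strncmp(bootdev, dv_xname, strlen(dv_xname)) == 0) {
		if (strlen(bootdev) > strlen(dv_xname))
			booted_partition = toupper((unsigned char)
			    bootdev[strlen(dv_xname)]) - 'A';
		printf("boot device %s, partition %d\n",
		    dv_xname, booted_partition);
	}
	return 0;
}
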
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c
new file mode 100644
index 0000000000..23dd52f1d3
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c
@@ -0,0 +1,408 @@
+/* $NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $ */
+/* NetBSD: gdt.c,v 1.32 2004/02/13 11:36:13 wiz Exp */
+
+/*-
+ * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by John T. Kohl and Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $");
+
+#include "opt_multiprocessor.h"
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/user.h>
+
+#include <uvm/uvm.h>
+
+#include <machine/gdt.h>
+
+int gdt_size[2]; /* total number of GDT entries */
+int gdt_count[2]; /* number of GDT entries in use */
+int gdt_next[2]; /* next available slot for sweeping */
+int gdt_free[2]; /* next free slot; terminated with GNULL_SEL */
+
+struct lock gdt_lock_store;
+
+static __inline void gdt_lock(void);
+static __inline void gdt_unlock(void);
+void gdt_init(void);
+void gdt_grow(int);
+int gdt_get_slot(void);
+int gdt_get_slot1(int);
+void gdt_put_slot(int);
+void gdt_put_slot1(int, int);
+
+/*
+ * Lock and unlock the GDT, to avoid races in case gdt_{ge,pu}t_slot() sleep
+ * waiting for memory.
+ *
+ * Note that the locking done here is not sufficient for multiprocessor
+ * systems. A freshly allocated slot will still be of type SDT_SYSNULL for
+ * some time after the GDT is unlocked, so gdt_compact() could attempt to
+ * reclaim it.
+ */
+static __inline void
+gdt_lock()
+{
+
+ (void) lockmgr(&gdt_lock_store, LK_EXCLUSIVE, NULL);
+}
+
+static __inline void
+gdt_unlock()
+{
+
+ (void) lockmgr(&gdt_lock_store, LK_RELEASE, NULL);
+}
+
+void
+setgdt(int sel, void *base, size_t limit,
+ int type, int dpl, int def32, int gran)
+{
+ struct segment_descriptor sd;
+ CPU_INFO_ITERATOR cii;
+ struct cpu_info *ci;
+
+ if (type == SDT_SYS386TSS) {
+ /* printk("XXX TSS descriptor not supported in GDT\n"); */
+ return;
+ }
+
+ setsegment(&sd, base, limit, type, dpl, def32, gran);
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ if (ci->ci_gdt != NULL) {
+#ifndef XEN
+ ci->ci_gdt[sel].sd = sd;
+#else
+ xen_update_descriptor(&ci->ci_gdt[sel],
+ (union descriptor *)&sd);
+#endif
+ }
+ }
+}
+
+/*
+ * Initialize the GDT subsystem. Called from autoconf().
+ */
+void
+gdt_init()
+{
+ size_t max_len, min_len;
+ union descriptor *old_gdt;
+ struct vm_page *pg;
+ vaddr_t va;
+ struct cpu_info *ci = &cpu_info_primary;
+
+ lockinit(&gdt_lock_store, PZERO, "gdtlck", 0, 0);
+
+ max_len = MAXGDTSIZ * sizeof(gdt[0]);
+ min_len = MINGDTSIZ * sizeof(gdt[0]);
+
+ gdt_size[0] = MINGDTSIZ;
+ gdt_count[0] = NGDT;
+ gdt_next[0] = NGDT;
+ gdt_free[0] = GNULL_SEL;
+
+ gdt_size[1] = 0;
+ gdt_count[1] = MAXGDTSIZ;
+ gdt_next[1] = MAXGDTSIZ;
+ gdt_free[1] = GNULL_SEL;
+
+ old_gdt = gdt;
+ gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len + max_len);
+ for (va = (vaddr_t)gdt; va < (vaddr_t)gdt + min_len; va += PAGE_SIZE) {
+ pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+ if (pg == NULL) {
+ panic("gdt_init: no pages");
+ }
+ pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ | VM_PROT_WRITE);
+ }
+ memcpy(gdt, old_gdt, NGDT * sizeof(gdt[0]));
+ ci->ci_gdt = gdt;
+ setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
+ SDT_MEMRWA, SEL_KPL, 1, 1);
+
+ gdt_init_cpu(ci);
+}
+
+/*
+ * Allocate shadow GDT for a slave CPU.
+ */
+void
+gdt_alloc_cpu(struct cpu_info *ci)
+{
+ int max_len = MAXGDTSIZ * sizeof(gdt[0]);
+ int min_len = MINGDTSIZ * sizeof(gdt[0]);
+ struct vm_page *pg;
+ vaddr_t va;
+
+ ci->ci_gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len);
+ for (va = (vaddr_t)ci->ci_gdt; va < (vaddr_t)ci->ci_gdt + min_len;
+ va += PAGE_SIZE) {
+ while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO))
+ == NULL) {
+ uvm_wait("gdt_alloc_cpu");
+ }
+ pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ | VM_PROT_WRITE);
+ }
+ memset(ci->ci_gdt, 0, min_len);
+ memcpy(ci->ci_gdt, gdt, gdt_count[0] * sizeof(gdt[0]));
+ setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
+ SDT_MEMRWA, SEL_KPL, 1, 1);
+}
+
+
+/*
+ * Load appropriate gdt descriptor; we better be running on *ci
+ * (for the most part, this is how a CPU knows who it is).
+ */
+void
+gdt_init_cpu(struct cpu_info *ci)
+{
+#ifndef XEN
+ struct region_descriptor region;
+ size_t max_len;
+
+ max_len = MAXGDTSIZ * sizeof(gdt[0]);
+ setregion(&region, ci->ci_gdt, max_len - 1);
+ lgdt(&region);
+#else
+ size_t len = gdt_size[0] * sizeof(gdt[0]);
+ unsigned long frames[len >> PAGE_SHIFT];
+ vaddr_t va;
+ pt_entry_t *ptp;
+ pt_entry_t *maptp;
+ int f;
+
+ for (va = (vaddr_t)ci->ci_gdt, f = 0;
+ va < (vaddr_t)ci->ci_gdt + len;
+ va += PAGE_SIZE, f++) {
+ KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
+ ptp = kvtopte(va);
+ frames[f] = *ptp >> PAGE_SHIFT;
+ maptp = (pt_entry_t *)vtomach((vaddr_t)ptp);
+ PTE_CLEARBITS(ptp, maptp, PG_RW);
+ }
+ PTE_UPDATES_FLUSH();
+ /* printk("loading gdt %x, %d entries, %d pages", */
+ /* frames[0] << PAGE_SHIFT, gdt_size[0], len >> PAGE_SHIFT); */
+ if (HYPERVISOR_set_gdt(frames, gdt_size[0]))
+ panic("HYPERVISOR_set_gdt failed!\n");
+ lgdt_finish();
+#endif
+}
+
+#ifdef MULTIPROCESSOR
+
+void
+gdt_reload_cpu(struct cpu_info *ci)
+{
+ struct region_descriptor region;
+ size_t max_len;
+
+ max_len = MAXGDTSIZ * sizeof(gdt[0]);
+ setregion(&region, ci->ci_gdt, max_len - 1);
+ lgdt(&region);
+}
+#endif
+
+
+/*
+ * Grow the GDT.
+ */
+void
+gdt_grow(int which)
+{
+ size_t old_len, new_len, max_len;
+ CPU_INFO_ITERATOR cii;
+ struct cpu_info *ci;
+ struct vm_page *pg;
+ vaddr_t va;
+
+ old_len = gdt_size[which] * sizeof(gdt[0]);
+ gdt_size[which] <<= 1;
+ new_len = old_len << 1;
+
+ if (which != 0) {
+ max_len = MAXGDTSIZ * sizeof(gdt[0]);
+ if (old_len == 0) {
+ gdt_size[which] = MINGDTSIZ;
+ new_len = gdt_size[which] * sizeof(gdt[0]);
+ }
+ for (va = (vaddr_t)(cpu_info_primary.ci_gdt) + old_len + max_len;
+ va < (vaddr_t)(cpu_info_primary.ci_gdt) + new_len + max_len;
+ va += PAGE_SIZE) {
+ while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
+ NULL) {
+ uvm_wait("gdt_grow");
+ }
+ pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ | VM_PROT_WRITE);
+ }
+ return;
+ }
+
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ for (va = (vaddr_t)(ci->ci_gdt) + old_len;
+ va < (vaddr_t)(ci->ci_gdt) + new_len;
+ va += PAGE_SIZE) {
+ while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
+ NULL) {
+ uvm_wait("gdt_grow");
+ }
+ pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ | VM_PROT_WRITE);
+ }
+ }
+}
+
+/*
+ * Allocate a GDT slot as follows:
+ * 1) If there are entries on the free list, use those.
+ * 2) If there are fewer than gdt_size entries in use, there are free slots
+ * near the end that we can sweep through.
+ * 3) As a last resort, we increase the size of the GDT, and sweep through
+ * the new slots.
+ */
+int
+gdt_get_slot()
+{
+ return gdt_get_slot1(0);
+}
+
+int
+gdt_get_slot1(int which)
+{
+ size_t offset;
+ int slot;
+
+ gdt_lock();
+
+ if (gdt_free[which] != GNULL_SEL) {
+ slot = gdt_free[which];
+ gdt_free[which] = gdt[slot].gd.gd_selector;
+ } else {
+ offset = which * MAXGDTSIZ * sizeof(gdt[0]);
+ if (gdt_next[which] != gdt_count[which] + offset)
+ panic("gdt_get_slot botch 1");
+ if (gdt_next[which] - offset >= gdt_size[which]) {
+ if (gdt_size[which] >= MAXGDTSIZ)
+ panic("gdt_get_slot botch 2");
+ gdt_grow(which);
+ }
+ slot = gdt_next[which]++;
+ }
+
+ gdt_count[which]++;
+ gdt_unlock();
+ return (slot);
+}
+
+/*
+ * Deallocate a GDT slot, putting it on the free list.
+ */
+void
+gdt_put_slot(int slot)
+{
+ gdt_put_slot1(slot, 0);
+}
+
+void
+gdt_put_slot1(int slot, int which)
+{
+
+ gdt_lock();
+ gdt_count[which]--;
+
+ gdt[slot].gd.gd_type = SDT_SYSNULL;
+ gdt[slot].gd.gd_selector = gdt_free[which];
+ gdt_free[which] = slot;
+
+ gdt_unlock();
+}
+
+int
+tss_alloc(struct pcb *pcb)
+{
+ int slot;
+
+ slot = gdt_get_slot();
+ setgdt(slot, &pcb->pcb_tss, sizeof(struct pcb) - 1,
+ SDT_SYS386TSS, SEL_KPL, 0, 0);
+ return GSEL(slot, SEL_KPL);
+}
+
+void
+tss_free(int sel)
+{
+
+ gdt_put_slot(IDXSEL(sel));
+}
+
+/*
+ * Caller must have pmap locked for both of these functions.
+ */
+void
+ldt_alloc(struct pmap *pmap, union descriptor *ldt, size_t len)
+{
+ int slot;
+
+ slot = gdt_get_slot1(1);
+#ifndef XEN
+ setgdt(slot, ldt, len - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
+#else
+ cpu_info_primary.ci_gdt[slot].ld.ld_base = (uint32_t)ldt;
+ cpu_info_primary.ci_gdt[slot].ld.ld_entries =
+ len / sizeof(union descriptor);
+#endif
+ pmap->pm_ldt_sel = GSEL(slot, SEL_KPL);
+}
+
+void
+ldt_free(struct pmap *pmap)
+{
+ int slot;
+
+ slot = IDXSEL(pmap->pm_ldt_sel);
+
+ gdt_put_slot1(slot, 1);
+}
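
tss_alloc()/tss_free() and ldt_alloc()/ldt_free() above convert between GDT slot numbers and segment selectors with GSEL() and IDXSEL(). The following standalone snippet restates that selector arithmetic for illustration; the macro bodies are reproduced here as assumptions (in the kernel they come from machine headers, not from this commit):

/*
 * Standalone illustration of the selector arithmetic behind GSEL()/IDXSEL()
 * as used by tss_alloc()/tss_free() above.  The macro definitions are
 * restated here only for the example.
 */
#include <stdio.h>

#define SEL_KPL		0			/* kernel privilege level (ring 0) */
#define GSEL(s, r)	(((s) << 3) | (r))	/* GDT selector: index, TI=0, RPL */
#define IDXSEL(s)	(((s) >> 3) & 0x1fff)	/* descriptor index of a selector */

int
main(void)
{
	int slot = 10;				/* hypothetical GDT slot number */
	int sel = GSEL(slot, SEL_KPL);

	printf("slot %d -> selector %#x -> slot %d\n", slot, sel, IDXSEL(sel));
	return 0;
}
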
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
new file mode 100644
index 0000000000..e08b5a64bd
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
@@ -0,0 +1,230 @@
+/* $NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/******************************************************************************
+ * hypervisor.c
+ *
+ * Communication to/from hypervisor.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $");
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void
+hypervisor_force_callback(void)
+{
+
+ (void)HYPERVISOR_xen_version(0);
+}
+
+int stipending(void);
+int
+stipending()
+{
+ uint32_t l1;
+ unsigned long l2;
+ unsigned int l1i, l2i, port;
+ int irq;
+ shared_info_t *s = HYPERVISOR_shared_info;
+ struct cpu_info *ci;
+ int ret;
+
+ ret = 0;
+ ci = curcpu();
+
+#if 0
+ if (HYPERVISOR_shared_info->events)
+ printf("stipending events %08lx mask %08lx ilevel %d\n",
+ HYPERVISOR_shared_info->events,
+ HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
+#endif
+
+ /*
+ * we're only called after STIC, so we know that we'll have to
+ * STI at the end
+ */
+ cli();
+ while (s->vcpu_data[0].evtchn_upcall_pending) {
+ s->vcpu_data[0].evtchn_upcall_pending = 0;
+ /* NB. No need for a barrier here -- XCHG is a barrier
+ * on x86. */
+ l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
+ while ((l1i = ffs(l1)) != 0) {
+ l1i--;
+ l1 &= ~(1 << l1i);
+
+ l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
+ while ((l2i = ffs(l2)) != 0) {
+ l2i--;
+ l2 &= ~(1 << l2i);
+
+ port = (l1i << 5) + l2i;
+ if ((irq = evtchn_to_irq[port]) != -1) {
+ hypervisor_acknowledge_irq(irq);
+ ci->ci_ipending |= (1 << irq);
+ if (ret == 0 && ci->ci_ilevel <
+ ci->ci_isources[irq]->is_handlers
+ ->ih_level)
+ ret = 1;
+ }
+#if 0 /* XXXcl dev/evtchn */
+ else
+ evtchn_device_upcall(port);
+#endif
+ }
+ }
+ }
+ sti();
+
+#if 0
+ if (ci->ci_ipending & 0x1)
+ printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
+ HYPERVISOR_shared_info->events,
+ HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
+ ci->ci_ipending);
+#endif
+
+ return (ret);
+}
+
+void do_hypervisor_callback(struct trapframe *regs)
+{
+ uint32_t l1;
+ unsigned long l2;
+ unsigned int l1i, l2i, port;
+ int irq;
+ shared_info_t *s = HYPERVISOR_shared_info;
+ struct cpu_info *ci;
+ int level;
+
+ ci = curcpu();
+ level = ci->ci_ilevel;
+
+ while (s->vcpu_data[0].evtchn_upcall_pending) {
+ s->vcpu_data[0].evtchn_upcall_pending = 0;
+ /* NB. No need for a barrier here -- XCHG is a barrier
+ * on x86. */
+ l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
+ while ((l1i = ffs(l1)) != 0) {
+ l1i--;
+ l1 &= ~(1 << l1i);
+
+ l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
+ while ((l2i = ffs(l2)) != 0) {
+ l2i--;
+ l2 &= ~(1 << l2i);
+
+ port = (l1i << 5) + l2i;
+ if ((irq = evtchn_to_irq[port]) != -1)
+ do_event(irq, regs);
+#if 0 /* XXXcl dev/evtchn */
+ else
+ evtchn_device_upcall(port);
+#endif
+ }
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ if (level != ci->ci_ilevel)
+ printf("hypervisor done %08x level %d/%d ipending %08x\n",
+ HYPERVISOR_shared_info->evtchn_pending_sel, level,
+ ci->ci_ilevel, ci->ci_ipending);
+#endif
+}
+
+void hypervisor_unmask_event(unsigned int ev)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ x86_atomic_clear_bit(&s->evtchn_mask[0], ev);
+ /*
+ * The following is basically the equivalent of
+ * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
+ * interrupt edge' if the channel is masked.
+ */
+ if (x86_atomic_test_bit(&s->evtchn_pending[0], ev) &&
+ !x86_atomic_test_and_set_bit(&s->evtchn_pending_sel, ev>>5)) {
+ s->vcpu_data[0].evtchn_upcall_pending = 1;
+ if (!s->vcpu_data[0].evtchn_upcall_mask)
+ hypervisor_force_callback();
+ }
+}
+
+void hypervisor_mask_event(unsigned int ev)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ x86_atomic_set_bit(&s->evtchn_mask[0], ev);
+}
+
+void hypervisor_clear_event(unsigned int ev)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+
+ x86_atomic_clear_bit(&s->evtchn_pending[0], ev);
+}
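
stipending() and do_hypervisor_callback() above walk pending event channels through a two-level bitmap: evtchn_pending_sel selects 32-channel groups, and each group's evtchn_pending word is scanned bit by bit, skipping masked channels. A standalone sketch of that scan (array sizes and the sample channel numbers are made up):

/*
 * Standalone sketch of the two-level event-channel scan used above:
 * a selector word marks which 32-channel groups may have pending events,
 * and ffs() walks the set bits of each level.  The sample data is made up.
 */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>		/* ffs() */

#define NR_GROUPS	32	/* 32 groups x 32 bits = 1024 channels */

int
main(void)
{
	uint32_t pending_sel = 0;		/* level 1: group selector */
	uint32_t pending[NR_GROUPS] = { 0 };	/* level 2: per-group bitmaps */
	uint32_t mask[NR_GROUPS] = { 0 };	/* masked channels are skipped */
	uint32_t l1, l2;
	int l1i, l2i;

	/* Pretend event channels 3 and 37 fired. */
	pending[0] |= 1u << 3;	pending_sel |= 1u << 0;
	pending[1] |= 1u << 5;	pending_sel |= 1u << 1;

	l1 = pending_sel;	/* the kernel fetches this with an atomic xchg */
	while ((l1i = ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1u << l1i);
		l2 = pending[l1i] & ~mask[l1i];
		while ((l2i = ffs(l2)) != 0) {
			l2i--;
			l2 &= ~(1u << l2i);
			printf("event channel %d pending\n", (l1i << 5) + l2i);
		}
	}
	return 0;
}
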
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
new file mode 100644
index 0000000000..45af67272f
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
@@ -0,0 +1,2000 @@
+/* $NetBSD: locore.S,v 1.2.2.1 2004/05/22 15:59:48 he Exp $ */
+/* NetBSD: locore.S,v 1.26 2004/04/12 13:17:46 yamt Exp */
+
+/*-
+ * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)locore.s 7.3 (Berkeley) 5/13/91
+ */
+
+#include "opt_compat_netbsd.h"
+#include "opt_compat_oldboot.h"
+#include "opt_cputype.h"
+#include "opt_ddb.h"
+#include "opt_ipkdb.h"
+#include "opt_lockdebug.h"
+#include "opt_multiprocessor.h"
+#include "opt_realmem.h"
+#include "opt_user_ldt.h"
+#include "opt_vm86.h"
+#include "opt_xen.h"
+
+#include "npx.h"
+#include "assym.h"
+#include "apm.h"
+#include "lapic.h"
+#include "ioapic.h"
+#include "ksyms.h"
+
+#include <sys/errno.h>
+#include <sys/syscall.h>
+
+#include <machine/cputypes.h>
+#include <machine/param.h>
+#include <machine/pte.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/trap.h>
+#include <machine/bootinfo.h>
+
+#if NLAPIC > 0
+#include <machine/i82489reg.h>
+#endif
+
+/* LINTSTUB: include <sys/types.h> */
+/* LINTSTUB: include <machine/cpu.h> */
+/* LINTSTUB: include <sys/systm.h> */
+
+#include <machine/asm.h>
+
+#if defined(MULTIPROCESSOR)
+
+#define SET_CURLWP(lwp,cpu) \
+ movl CPUVAR(SELF),cpu ; \
+ movl lwp,CPUVAR(CURLWP) ; \
+ movl cpu,L_CPU(lwp)
+
+#else
+
+#define SET_CURLWP(lwp,tcpu) movl lwp,CPUVAR(CURLWP)
+#define GET_CURLWP(reg) movl CPUVAR(CURLWP),reg
+
+#endif
+
+#define GET_CURPCB(reg) movl CPUVAR(CURPCB),reg
+#define SET_CURPCB(reg) movl reg,CPUVAR(CURPCB)
+
+#define CLEAR_RESCHED(reg) movl reg,CPUVAR(RESCHED)
+
+/* XXX temporary kluge; these should not be here */
+/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */
+#include <dev/isa/isareg.h>
+
+
+/* Disallow old names for REALBASEMEM */
+#ifdef BIOSBASEMEM
+#error BIOSBASEMEM option deprecated; use REALBASEMEM only if memory size reported by latest boot block is incorrect
+#endif
+
+/* Disallow old names for REALEXTMEM */
+#ifdef EXTMEM_SIZE
+#error EXTMEM_SIZE option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
+#endif
+#ifdef BIOSEXTMEM
+#error BIOSEXTMEM option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
+#endif
+
+#include <machine/frameasm.h>
+
+
+#ifdef MULTIPROCESSOR
+#include <machine/i82489reg.h>
+#endif
+
+/*
+ * PTmap is recursive pagemap at top of virtual address space.
+ * Within PTmap, the page directory can be found (third indirection).
+ *
+ * XXX 4 == sizeof pde
+ */
+ .set _C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT)
+ .set _C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE)
+ .set _C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4)
+
+/*
+ * APTmap, APTD is the alternate recursive pagemap.
+ * It's used when modifying another process's page tables.
+ *
+ * XXX 4 == sizeof pde
+ */
+ .set _C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT)
+ .set _C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE)
+ .set _C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4)
+
+
+/*
+ * Xen guest identifier and loader selection
+ */
+.section __xen_guest
+ .asciz "GUEST_OS=netbsd,GUEST_VER=2.0,XEN_VER=2.0,LOADER=generic"
+
+
+/*
+ * Initialization
+ */
+ .data
+
+ .globl _C_LABEL(cpu)
+ .globl _C_LABEL(esym),_C_LABEL(boothowto)
+ .globl _C_LABEL(bootinfo),_C_LABEL(atdevbase)
+#ifdef COMPAT_OLDBOOT
+ .globl _C_LABEL(bootdev)
+#endif
+ .globl _C_LABEL(proc0paddr),_C_LABEL(PTDpaddr)
+ .globl _C_LABEL(biosbasemem),_C_LABEL(biosextmem)
+ .globl _C_LABEL(gdt)
+#ifdef I586_CPU
+ .globl _C_LABEL(idt)
+#endif
+ .globl _C_LABEL(lapic_tpr)
+
+#if NLAPIC > 0
+#ifdef __ELF__
+ .align PAGE_SIZE
+#else
+ .align 12
+#endif
+ .globl _C_LABEL(local_apic), _C_LABEL(lapic_id)
+_C_LABEL(local_apic):
+ .space LAPIC_ID
+_C_LABEL(lapic_id):
+ .long 0x00000000
+ .space LAPIC_TPRI-(LAPIC_ID+4)
+_C_LABEL(lapic_tpr):
+ .space LAPIC_PPRI-LAPIC_TPRI
+_C_LABEL(lapic_ppr):
+ .space LAPIC_ISR-LAPIC_PPRI
+_C_LABEL(lapic_isr):
+ .space PAGE_SIZE-LAPIC_ISR
+#else
+_C_LABEL(lapic_tpr):
+ .long 0
+#endif
+
+
+_C_LABEL(cpu): .long 0 # are we 386, 386sx, or 486,
+ # or Pentium, or..
+_C_LABEL(esym): .long 0 # ptr to end of syms
+_C_LABEL(atdevbase): .long 0 # location of start of iomem in virtual
+_C_LABEL(proc0paddr): .long 0
+_C_LABEL(PTDpaddr): .long 0 # paddr of PTD, for libkvm
+#ifndef REALBASEMEM
+_C_LABEL(biosbasemem): .long 0 # base memory reported by BIOS
+#else
+_C_LABEL(biosbasemem): .long REALBASEMEM
+#endif
+#ifndef REALEXTMEM
+_C_LABEL(biosextmem): .long 0 # extended memory reported by BIOS
+#else
+_C_LABEL(biosextmem): .long REALEXTMEM
+#endif
+
+#include <machine/xen.h>
+#define __HYPERVISOR_yield 8
+
+ .space 512
+tmpstk:
+ .long tmpstk, __KERNEL_DS
+
+
+#define _RELOC(x) ((x))
+#define RELOC(x) _RELOC(_C_LABEL(x))
+
+/* XXX assym.h */
+#define MOD_START 48
+#define MOD_LEN 56
+/* XXX assym.h */
+
+ .text
+ .globl _C_LABEL(kernel_text)
+ .set _C_LABEL(kernel_text),KERNTEXTOFF
+
+ .globl start
+start:
+ cld
+
+ lss tmpstk,%esp # bootstrap stack end location
+
+ movl %esi,%ebx # save start_info pointer
+
+#if (NKSYMS || defined(DDB) || defined(LKM)) && !defined(SYMTAB_SPACE)
+ /* Save the symbol locations. */
+ movl MOD_START(%ebx),%esi
+ addl MOD_LEN(%ebx),%esi
+ movl %esi,RELOC(esym)
+#endif
+
+ /* Clear BSS first so that there are no surprises... */
+ xorl %eax,%eax
+ movl $RELOC(__bss_start),%edi
+ movl $RELOC(_end),%ecx
+ subl %edi,%ecx
+ rep stosb
+
+ movl %ebx,RELOC(avail_start)
+
+ /* Copy the necessary stuff from start_info structure. */
+ /* We need to copy shared_info early, so that sti/cli work */
+ movl %ebx,%esi
+ movl $RELOC(start_info_union),%edi
+ movl $128,%ecx
+ rep movsl
+
+ /* (howto, [bootdev], bootinfo, basemem, extmem). */
+ xorl %eax,%eax
+ movl %eax,RELOC(boothowto)
+#ifdef COMPAT_OLDBOOT
+ movl %eax,RELOC(bootdev)
+#endif
+ movl $0x20000,%eax
+ movl %eax,RELOC(boothowto)
+
+ /* First, reset the PSL. */
+ pushl $PSL_MBO
+ popfl
+
+ /* Clear segment registers; always null in proc0. */
+ xorl %eax,%eax
+ movw %ax,%fs
+ movw %ax,%gs
+ decl %eax
+ movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
+
+/*
+ * Virtual address space of kernel:
+ *
+ * text | data | bss | [syms] | page dir | proc0 kstack
+ * 0 1 2 3
+ */
+#define PROC0PDIR ((0) * PAGE_SIZE)
+#define PROC0STACK ((1) * PAGE_SIZE)
+#define SYSMAP ((1+UPAGES) * PAGE_SIZE)
+#define TABLESIZE ((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */
+
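The layout defines above, together with the nkpde clamping done a few instructions further down, fix how much memory the bootstrap tables occupy. A rough C rendering, assuming placeholder values for UPAGES and the NKPTP_* bounds:

    /*
     * Sketch of the bootstrap table sizing; UPAGES, NKPTP_MIN and
     * NKPTP_MAX are placeholders for the real kernel constants.
     */
    #include <stddef.h>

    #define PAGE_SIZE   4096
    #define UPAGES      2           /* placeholder: pages of proc0 kstack */
    #define NKPTP_MIN   4           /* placeholder bounds for nkpde */
    #define NKPTP_MAX   31

    static size_t
    bootstrap_table_bytes(size_t nkpde)
    {
            /* Clamp nkpde exactly as the assembly below does. */
            if (nkpde < NKPTP_MIN)
                    nkpde = NKPTP_MIN;
            else if (nkpde > NKPTP_MAX)
                    nkpde = NKPTP_MAX;

            /* page directory + proc0 kstack + nkpde kernel page tables */
            return (1 + UPAGES) * PAGE_SIZE + nkpde * PAGE_SIZE;
    }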
+ /* Find end of kernel image. */
+ movl RELOC(avail_start),%edi
+ /* Calculate where to start the bootstrap tables. */
+ movl %edi,%esi
+
+ /*
+ * Calculate the size of the kernel page table directory, and
+ * how many entries it will have.
+ */
+ movl RELOC(nkpde),%ecx # get nkpde
+ cmpl $NKPTP_MIN,%ecx # larger than min?
+ jge 1f
+ movl $NKPTP_MIN,%ecx # set at min
+ jmp 2f
+1: cmpl $NKPTP_MAX,%ecx # larger than max?
+ jle 2f
+ movl $NKPTP_MAX,%ecx
+2:
+
+ /* Clear memory for bootstrap tables. */
+ shll $PGSHIFT,%ecx
+ addl $TABLESIZE,%ecx
+ addl %esi,%ecx # end of tables
+ movl %ecx,RELOC(gdt)
+ addl $PAGE_SIZE,%ecx
+ movl %ecx,RELOC(avail_start)
+ subl %edi,%ecx # size of tables
+ shrl $2,%ecx
+ xorl %eax,%eax
+ cld
+ rep
+ stosl
+
+/*
+ * fillkpt
+ * eax = pte (page frame | control | status)
+ * ebx = page table address
+ * ecx = number of pages to map
+ */
+#define fillkpt \
+1: movl %eax,(%ebx) ; \
+ addl $PAGE_SIZE,%eax ; /* increment physical address */ \
+ addl $4,%ebx ; /* next pte */ \
+ loop 1b ;
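The fillkpt macro amounts to the following C, with the register roles turned into parameters (the names here are illustrative only):

    /*
     * C sketch of fillkpt: write "npages" consecutive PTEs, mapping
     * consecutive physical pages starting at pa_with_flags.
     */
    #include <stdint.h>

    #define PAGE_SIZE 4096

    static void
    fillkpt_c(uint32_t *pte /* %ebx */, uint32_t pa_with_flags /* %eax */,
        uint32_t npages /* %ecx */)
    {
            while (npages-- > 0) {
                    *pte++ = pa_with_flags;         /* store pte, advance */
                    pa_with_flags += PAGE_SIZE;     /* next physical page */
            }
    }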
+
+/*
+ * Build initial page tables.
+ */
+ /* Calculate end of text segment, rounded to a page. */
+ leal (RELOC(etext)+PGOFSET),%edx
+ andl $~PGOFSET,%edx
+
+ /* Skip over the first 1MB. */
+ movl $KERNTEXTOFF,%eax
+ movl %eax,%ecx
+ subl $KERNBASE_LOCORE,%ecx
+ shrl $PGSHIFT,%ecx
+ leal (SYSMAP)(%esi,%ecx,4),%ebx
+
+ /* Map the kernel text read-only. */
+ movl %edx,%ecx
+ subl %eax,%ecx
+ shrl $PGSHIFT,%ecx
+ orl $(PG_V|PG_KR),%eax
+ fillkpt
+
+ /* Map the data, BSS, and bootstrap tables read-write. */
+ movl RELOC(avail_start),%ecx
+ # end of tables
+ subl %edx,%ecx # subtract end of text
+ shrl $PGSHIFT,%ecx
+ leal (PG_V|PG_KW)(%edx),%eax
+ fillkpt
+
+ movl $0xffffffff,(%ebx)
+ addl $4,%ebx
+
+/*
+ * Construct a page table directory.
+ */
+ /* Map kernel PDEs. */
+ movl RELOC(nkpde),%ecx # for this many pde s,
+ leal (PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx # kernel pde offset
+ leal (SYSMAP+PG_V|PG_KW)(%esi),%eax # pte for KPT in proc 0,
+ fillkpt
+
+ /* Install a PDE recursively mapping page directory as a page table! */
+ leal (PROC0PDIR+PG_V/*|PG_KW*/)(%esi),%eax # pte for ptd
+ movl %eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi) # recursive PD slot
+
+ /* Save phys. addr of PTD, for libkvm. */
+ movl %esi,RELOC(PTDpaddr)
+
+ call xpmap_init
+
+ /* cr0 is 0x8005003b */
+
+ /* Relocate atdevbase. */
+ movl _C_LABEL(avail_start),%edx
+ movl %edx,_C_LABEL(HYPERVISOR_shared_info)
+ addl $PAGE_SIZE,%edx # shared_inf
+ movl %edx,_C_LABEL(atdevbase)
+
+ /* Set up bootstrap stack. */
+ leal (PROC0STACK)(%esi),%eax
+ movl %eax,_C_LABEL(proc0paddr)
+ leal (USPACE-FRAMESIZE)(%eax),%esp
+ subl $KERNBASE_LOCORE,%esi
+ movl %esi,PCB_CR3(%eax) # pcb->pcb_cr3
+ xorl %ebp,%ebp # mark end of frames
+
+ movl _C_LABEL(atdevbase),%eax
+ pushl %eax
+ call _C_LABEL(init386) # wire 386 chip for unix operation
+ addl $4,%esp
+
+#ifdef SAFARI_FIFO_HACK
+ movb $5,%al
+ movw $0x37b,%dx
+ outb %al,%dx
+ movw $0x37f,%dx
+ inb %dx,%al
+ movb %al,%cl
+
+ orb $1,%cl
+
+ movb $5,%al
+ movw $0x37b,%dx
+ outb %al,%dx
+ movw $0x37f,%dx
+ movb %cl,%al
+ outb %al,%dx
+#endif /* SAFARI_FIFO_HACK */
+
+ call _C_LABEL(main)
+
+/*
+ * void proc_trampoline(void);
+ * This is a trampoline function pushed onto the stack of a newly created
+ * process in order to do some additional setup. The trampoline is entered by
+ * cpu_switch()ing to the process, so we abuse the callee-saved registers used
+ * by cpu_switch() to store the information about the stub to call.
+ * NOTE: This function does not have a normal calling sequence!
+ */
+/* LINTSTUB: Func: void proc_trampoline(void) */
+NENTRY(proc_trampoline)
+#ifdef MULTIPROCESSOR
+ call _C_LABEL(proc_trampoline_mp)
+#endif
+ movl $IPL_NONE,CPUVAR(ILEVEL)
+ pushl %ebx
+ call *%esi
+ addl $4,%esp
+ DO_DEFERRED_SWITCH(%eax)
+ INTRFASTEXIT
+ /* NOTREACHED */
+
+/*****************************************************************************/
+#ifdef COMPAT_16
+/*
+ * Signal trampoline; copied to top of user stack.
+ */
+/* LINTSTUB: Var: char sigcode[1], esigcode[1]; */
+NENTRY(sigcode)
+ /*
+ * Handler has returned here as if we called it. The sigcontext
+ * is on the stack after the 3 args "we" pushed.
+ */
+ leal 12(%esp),%eax # get pointer to sigcontext
+ movl %eax,4(%esp) # put it in the argument slot
+ # fake return address already there
+ movl $SYS_compat_16___sigreturn14,%eax
+ int $0x80 # enter kernel with args on stack
+ movl $SYS_exit,%eax
+ int $0x80 # exit if sigreturn fails
+ .globl _C_LABEL(esigcode)
+_C_LABEL(esigcode):
+#endif
+
+/*****************************************************************************/
+
+/*
+ * The following primitives are used to fill and copy regions of memory.
+ */
+
+/*
+ * XXX No section 9 man page for fillw.
+ * fillw seems to be very sparsely used (only in pccons it seems.)
+ * One wonders if it couldn't be done without.
+ * -- Perry Metzger, May 7, 2001
+ */
+/*
+ * void fillw(short pattern, void *addr, size_t len);
+ * Write len copies of pattern at addr.
+ */
+/* LINTSTUB: Func: void fillw(short pattern, void *addr, size_t len) */
+ENTRY(fillw)
+ pushl %edi
+ movl 8(%esp),%eax
+ movl 12(%esp),%edi
+ movw %ax,%cx
+ rorl $16,%eax
+ movw %cx,%ax
+ cld
+ movl 16(%esp),%ecx
+ shrl %ecx # do longwords
+ rep
+ stosl
+ movl 16(%esp),%ecx
+ andl $1,%ecx # do remainder
+ rep
+ stosw
+ popl %edi
+ ret
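For reference, a portable C equivalent of fillw(); the assembly above differs only by storing two 16-bit words per iteration where it can:

    /*
     * Write len copies of a 16-bit pattern at addr.
     */
    #include <stddef.h>
    #include <stdint.h>

    static void
    fillw_c(uint16_t pattern, void *addr, size_t len)
    {
            uint16_t *p = addr;

            while (len-- > 0)
                    *p++ = pattern;
    }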
+
+/*
+ * int kcopy(const void *from, void *to, size_t len);
+ * Copy len bytes, abort on fault.
+ */
+/* LINTSTUB: Func: int kcopy(const void *from, void *to, size_t len) */
+ENTRY(kcopy)
+ pushl %esi
+ pushl %edi
+ GET_CURPCB(%eax) # load curpcb into eax and set on-fault
+ pushl PCB_ONFAULT(%eax)
+ movl $_C_LABEL(kcopy_fault), PCB_ONFAULT(%eax)
+
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ movl %edi,%eax
+ subl %esi,%eax
+ cmpl %ecx,%eax # overlapping?
+ jb 1f
+ cld # nope, copy forward
+ shrl $2,%ecx # copy by 32-bit words
+ rep
+ movsl
+ movl 24(%esp),%ecx
+ andl $3,%ecx # any bytes left?
+ rep
+ movsb
+
+ GET_CURPCB(%edx) # XXX save curpcb?
+ popl PCB_ONFAULT(%edx)
+ popl %edi
+ popl %esi
+ xorl %eax,%eax
+ ret
+
+ ALIGN_TEXT
+1: addl %ecx,%edi # copy backward
+ addl %ecx,%esi
+ std
+ andl $3,%ecx # any fractional bytes?
+ decl %edi
+ decl %esi
+ rep
+ movsb
+ movl 24(%esp),%ecx # copy remainder by 32-bit words
+ shrl $2,%ecx
+ subl $3,%esi
+ subl $3,%edi
+ rep
+ movsl
+ cld
+
+ GET_CURPCB(%edx)
+ popl PCB_ONFAULT(%edx)
+ popl %edi
+ popl %esi
+ xorl %eax,%eax
+ ret
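Apart from the on-fault bookkeeping, the interesting part of kcopy() is the single unsigned compare that picks a forward or a backward copy. A C sketch of that test, assuming the usual flat address space so that pointer differences are meaningful:

    /*
     * The overlap test kcopy() uses: an unsigned compare of (dst - src)
     * against len is true exactly when a forward copy would overwrite
     * source bytes that have not been copied yet.
     */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    copy_overlap_safe(const void *src, void *dst, size_t len)
    {
            if ((uintptr_t)dst - (uintptr_t)src < len)
                    memmove(dst, src, len);     /* dst overlaps tail of src */
            else
                    memcpy(dst, src, len);      /* disjoint: forward copy */
    }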
+
+/*****************************************************************************/
+
+/*
+ * The following primitives are used to copy data in and out of the user's
+ * address space.
+ */
+
+/*
+ * Default to the lowest-common-denominator. We will improve it
+ * later.
+ */
+#if defined(I386_CPU)
+#define DEFAULT_COPYOUT _C_LABEL(i386_copyout)
+#define DEFAULT_COPYIN _C_LABEL(i386_copyin)
+#elif defined(I486_CPU)
+#define DEFAULT_COPYOUT _C_LABEL(i486_copyout)
+#define DEFAULT_COPYIN _C_LABEL(i386_copyin)
+#elif defined(I586_CPU)
+#define DEFAULT_COPYOUT _C_LABEL(i486_copyout) /* XXX */
+#define DEFAULT_COPYIN _C_LABEL(i386_copyin) /* XXX */
+#elif defined(I686_CPU)
+#define DEFAULT_COPYOUT _C_LABEL(i486_copyout) /* XXX */
+#define DEFAULT_COPYIN _C_LABEL(i386_copyin) /* XXX */
+#endif
+
+ .data
+
+ .globl _C_LABEL(copyout_func)
+_C_LABEL(copyout_func):
+ .long DEFAULT_COPYOUT
+
+ .globl _C_LABEL(copyin_func)
+_C_LABEL(copyin_func):
+ .long DEFAULT_COPYIN
+
+ .text
+
+/*
+ * int copyout(const void *from, void *to, size_t len);
+ * Copy len bytes into the user's address space.
+ * see copyout(9)
+ */
+/* LINTSTUB: Func: int copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(copyout)
+ DO_DEFERRED_SWITCH(%eax)
+ jmp *_C_LABEL(copyout_func)
+
+#if defined(I386_CPU)
+/* LINTSTUB: Func: int i386_copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(i386_copyout)
+ pushl %esi
+ pushl %edi
+ pushl $0
+
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%eax
+
+ /*
+ * We check that the end of the destination buffer is not past the end
+ * of the user's address space. If it's not, then we only need to
+ * check that each page is writable. The 486 will do this for us; the
+ * 386 will not. (We assume that pages in user space that are not
+ * writable by the user are not writable by the kernel either.)
+ */
+ movl %edi,%edx
+ addl %eax,%edx
+ jc _C_LABEL(copy_efault)
+ cmpl $VM_MAXUSER_ADDRESS,%edx
+ ja _C_LABEL(copy_efault)
+
+ testl %eax,%eax # anything to do?
+ jz 3f
+
+ /*
+ * We have to check each PTE for (write) permission, since the CPU
+ * doesn't do it for us.
+ */
+
+ /* Compute number of pages. */
+ movl %edi,%ecx
+ andl $PGOFSET,%ecx
+ addl %eax,%ecx
+ decl %ecx
+ shrl $PGSHIFT,%ecx
+
+ /* Compute PTE offset for start address. */
+ shrl $PGSHIFT,%edi
+
+ GET_CURPCB(%edx)
+ movl $2f,PCB_ONFAULT(%edx)
+
+1: /* Check PTE for each page. */
+ testb $PG_RW,_C_LABEL(PTmap)(,%edi,4)
+ jz 2f
+
+4: incl %edi
+ decl %ecx
+ jns 1b
+
+ movl 20(%esp),%edi
+ movl 24(%esp),%eax
+ jmp 3f
+
+2: /* Simulate a trap. */
+ pushl %ecx
+ movl %edi,%eax
+ shll $PGSHIFT,%eax
+ pushl %eax
+ call _C_LABEL(trapwrite) # trapwrite(addr)
+ addl $4,%esp # pop argument
+ popl %ecx
+ testl %eax,%eax # if not ok, return EFAULT
+ jz 4b
+ jmp _C_LABEL(copy_efault)
+
+3: GET_CURPCB(%edx)
+ movl $_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
+
+ /* bcopy(%esi, %edi, %eax); */
+ cld
+ movl %eax,%ecx
+ shrl $2,%ecx
+ rep
+ movsl
+ movl %eax,%ecx
+ andl $3,%ecx
+ rep
+ movsb
+
+ popl PCB_ONFAULT(%edx)
+ popl %edi
+ popl %esi
+ xorl %eax,%eax
+ ret
+#endif /* I386_CPU */
+
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+/* LINTSTUB: Func: int i486_copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(i486_copyout)
+ pushl %esi
+ pushl %edi
+ pushl $0
+
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%eax
+
+ /*
+ * We check that the end of the destination buffer is not past the end
+ * of the user's address space.
+ */
+ movl %edi,%edx
+ addl %eax,%edx
+ jc _C_LABEL(copy_efault)
+ cmpl $VM_MAXUSER_ADDRESS,%edx
+ ja _C_LABEL(copy_efault)
+
+ GET_CURPCB(%edx)
+ movl $_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
+
+ /* bcopy(%esi, %edi, %eax); */
+ cld
+ movl %eax,%ecx
+ shrl $2,%ecx
+ rep
+ movsl
+ movl %eax,%ecx
+ andl $3,%ecx
+ rep
+ movsb
+
+ popl PCB_ONFAULT(%edx)
+ popl %edi
+ popl %esi
+ xorl %eax,%eax
+ ret
+#endif /* I486_CPU || I586_CPU || I686_CPU */
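Both copyout variants open with the range check described in the comments above. Spelled out in C, with VM_MAXUSER_ADDRESS reduced to a placeholder value rather than the real <machine/vmparam.h> constant:

    /*
     * User-space range check done before any copyin/copyout: the
     * addition may wrap (the asm's "jc" case), and the end of the
     * buffer must not extend past the top of user space.
     */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define VM_MAXUSER_ADDRESS 0xbfc00000UL     /* placeholder value */

    static bool
    uaddr_range_ok(uintptr_t uaddr, size_t len)
    {
            uintptr_t end = uaddr + len;

            if (end < uaddr)                    /* wrapped around: reject */
                    return false;
            return end <= VM_MAXUSER_ADDRESS;
    }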
+
+/*
+ * int copyin(const void *from, void *to, size_t len);
+ * Copy len bytes from the user's address space.
+ * see copyin(9)
+ */
+/* LINTSTUB: Func: int copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(copyin)
+ DO_DEFERRED_SWITCH(%eax)
+ jmp *_C_LABEL(copyin_func)
+
+#if defined(I386_CPU) || defined(I486_CPU) || defined(I586_CPU) || \
+ defined(I686_CPU)
+/* LINTSTUB: Func: int i386_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i386_copyin)
+ pushl %esi
+ pushl %edi
+ GET_CURPCB(%eax)
+ pushl $0
+ movl $_C_LABEL(copy_fault),PCB_ONFAULT(%eax)
+
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%eax
+
+ /*
+ * We check that the end of the source buffer is not past the end
+ * of the user's address space. If it's not, then we only need to
+ * check that each page is readable, and the CPU will do that for us.
+ */
+ movl %esi,%edx
+ addl %eax,%edx
+ jc _C_LABEL(copy_efault)
+ cmpl $VM_MAXUSER_ADDRESS,%edx
+ ja _C_LABEL(copy_efault)
+
+ /* bcopy(%esi, %edi, %eax); */
+ cld
+ movl %eax,%ecx
+ shrl $2,%ecx
+ rep
+ movsl
+ movl %eax,%ecx
+ andl $3,%ecx
+ rep
+ movsb
+
+ GET_CURPCB(%edx)
+ popl PCB_ONFAULT(%edx)
+ popl %edi
+ popl %esi
+ xorl %eax,%eax
+ ret
+#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+/* LINTSTUB: Ignore */
+NENTRY(copy_efault)
+ movl $EFAULT,%eax
+
+/*
+ * kcopy_fault is used by kcopy and copy_fault is used by copyin/out.
+ *
+ * they're distinguished for lazy pmap switching. see trap().
+ */
+/* LINTSTUB: Ignore */
+NENTRY(kcopy_fault)
+ GET_CURPCB(%edx)
+ popl PCB_ONFAULT(%edx)
+ popl %edi
+ popl %esi
+ ret
+
+/* LINTSTUB: Ignore */
+NENTRY(copy_fault)
+ GET_CURPCB(%edx)
+ popl PCB_ONFAULT(%edx)
+ popl %edi
+ popl %esi
+ ret
+
+/*
+ * int copyoutstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
+ * Copy a NUL-terminated string, at most maxlen characters long, into the
+ * user's address space. Return the number of characters copied (including the
+ * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else
+ * return 0 or EFAULT.
+ * see copyoutstr(9)
+ */
+/* LINTSTUB: Func: int copyoutstr(const void *kaddr, void *uaddr, size_t len, size_t *done) */
+ENTRY(copyoutstr)
+ pushl %esi
+ pushl %edi
+
+ DO_DEFERRED_SWITCH(%eax)
+
+ movl 12(%esp),%esi # esi = from
+ movl 16(%esp),%edi # edi = to
+ movl 20(%esp),%edx # edx = maxlen
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+ cmpl $CPUCLASS_386,_C_LABEL(cpu_class)
+ jne 5f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+ /* Compute number of bytes in first page. */
+ movl %edi,%eax
+ andl $PGOFSET,%eax
+ movl $PAGE_SIZE,%ecx
+ subl %eax,%ecx # ecx = PAGE_SIZE - (src % PAGE_SIZE)
+
+ GET_CURPCB(%eax)
+ movl $6f,PCB_ONFAULT(%eax)
+
+1: /*
+ * Once per page, check that we are still within the bounds of user
+ * space, and check for a write fault.
+ */
+ cmpl $VM_MAXUSER_ADDRESS,%edi
+ jae _C_LABEL(copystr_efault)
+
+ /* Compute PTE offset. */
+ movl %edi,%eax
+ shrl $PGSHIFT,%eax # calculate pte address
+
+ testb $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+ jnz 2f
+
+6: /* Simulate a trap. */
+ pushl %edx
+ pushl %edi
+ call _C_LABEL(trapwrite) # trapwrite(addr)
+ addl $4,%esp # clear argument from stack
+ popl %edx
+ testl %eax,%eax
+ jnz _C_LABEL(copystr_efault)
+
+2: /* Copy up to end of this page. */
+ subl %ecx,%edx # predecrement total count
+ jnc 3f
+ addl %edx,%ecx # ecx += (edx - ecx) = edx
+ xorl %edx,%edx
+
+3: decl %ecx
+ js 4f
+ lodsb
+ stosb
+ testb %al,%al
+ jnz 3b
+
+ /* Success -- 0 byte reached. */
+ addl %ecx,%edx # add back residual for this page
+ xorl %eax,%eax
+ jmp copystr_return
+
+4: /* Go to next page, if any. */
+ movl $PAGE_SIZE,%ecx
+ testl %edx,%edx
+ jnz 1b
+
+ /* edx is zero -- return ENAMETOOLONG. */
+ movl $ENAMETOOLONG,%eax
+ jmp copystr_return
+#endif /* I386_CPU */
+
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+5: GET_CURPCB(%eax)
+ movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%eax)
+ /*
+ * Get min(%edx, VM_MAXUSER_ADDRESS-%edi).
+ */
+ movl $VM_MAXUSER_ADDRESS,%eax
+ subl %edi,%eax
+ cmpl %edx,%eax
+ jae 1f
+ movl %eax,%edx
+ movl %eax,20(%esp)
+
+1: incl %edx
+ cld
+
+1: decl %edx
+ jz 2f
+ lodsb
+ stosb
+ testb %al,%al
+ jnz 1b
+
+ /* Success -- 0 byte reached. */
+ decl %edx
+ xorl %eax,%eax
+ jmp copystr_return
+
+2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */
+ cmpl $VM_MAXUSER_ADDRESS,%edi
+ jae _C_LABEL(copystr_efault)
+ movl $ENAMETOOLONG,%eax
+ jmp copystr_return
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+/*
+ * int copyinstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
+ * Copy a NUL-terminated string, at most maxlen characters long, from the
+ * user's address space. Return the number of characters copied (including the
+ * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else
+ * return 0 or EFAULT.
+ * see copyinstr(9)
+ */
+/* LINTSTUB: Func: int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) */
+ENTRY(copyinstr)
+ pushl %esi
+ pushl %edi
+
+ DO_DEFERRED_SWITCH(%eax)
+
+ GET_CURPCB(%ecx)
+ movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%ecx)
+
+ movl 12(%esp),%esi # %esi = from
+ movl 16(%esp),%edi # %edi = to
+ movl 20(%esp),%edx # %edx = maxlen
+
+ /*
+ * Get min(%edx, VM_MAXUSER_ADDRESS-%esi).
+ */
+ movl $VM_MAXUSER_ADDRESS,%eax
+ subl %esi,%eax
+ cmpl %edx,%eax
+ jae 1f
+ movl %eax,%edx
+ movl %eax,20(%esp)
+
+1: incl %edx
+ cld
+
+1: decl %edx
+ jz 2f
+ lodsb
+ stosb
+ testb %al,%al
+ jnz 1b
+
+ /* Success -- 0 byte reached. */
+ decl %edx
+ xorl %eax,%eax
+ jmp copystr_return
+
+2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */
+ cmpl $VM_MAXUSER_ADDRESS,%esi
+ jae _C_LABEL(copystr_efault)
+ movl $ENAMETOOLONG,%eax
+ jmp copystr_return
+
+/* LINTSTUB: Ignore */
+NENTRY(copystr_efault)
+ movl $EFAULT,%eax
+
+/* LINTSTUB: Ignore */
+NENTRY(copystr_fault)
+copystr_return:
+ /* Set *lencopied and return %eax. */
+ GET_CURPCB(%ecx)
+ movl $0,PCB_ONFAULT(%ecx)
+ movl 20(%esp),%ecx
+ subl %edx,%ecx
+ movl 24(%esp),%edx
+ testl %edx,%edx
+ jz 8f
+ movl %ecx,(%edx)
+
+8: popl %edi
+ popl %esi
+ ret
+
+/*
+ * int copystr(const void *from, void *to, size_t maxlen, size_t *lencopied);
+ * Copy a NUL-terminated string, at most maxlen characters long. Return the
+ * number of characters copied (including the NUL) in *lencopied. If the
+ * string is too long, return ENAMETOOLONG; else return 0.
+ * see copystr(9)
+ */
+/* LINTSTUB: Func: int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done) */
+ENTRY(copystr)
+ pushl %esi
+ pushl %edi
+
+ movl 12(%esp),%esi # esi = from
+ movl 16(%esp),%edi # edi = to
+ movl 20(%esp),%edx # edx = maxlen
+ incl %edx
+ cld
+
+1: decl %edx
+ jz 4f
+ lodsb
+ stosb
+ testb %al,%al
+ jnz 1b
+
+ /* Success -- 0 byte reached. */
+ decl %edx
+ xorl %eax,%eax
+ jmp 6f
+
+4: /* edx is zero -- return ENAMETOOLONG. */
+ movl $ENAMETOOLONG,%eax
+
+6: /* Set *lencopied and return %eax. */
+ movl 20(%esp),%ecx
+ subl %edx,%ecx
+ movl 24(%esp),%edx
+ testl %edx,%edx
+ jz 7f
+ movl %ecx,(%edx)
+
+7: popl %edi
+ popl %esi
+ ret
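copystr() is the purely in-kernel member of this family; its contract, which copyinstr() and copyoutstr() share apart from the fault handling, looks like this as a C sketch:

    /*
     * copystr() semantics in plain C: copy at most maxlen bytes
     * including the terminating NUL, report the count copied, and
     * return ENAMETOOLONG if no NUL was seen within maxlen bytes.
     */
    #include <stddef.h>
    #include <errno.h>

    static int
    copystr_c(const char *from, char *to, size_t maxlen, size_t *lencopied)
    {
            size_t n;

            for (n = 0; n < maxlen; n++) {
                    to[n] = from[n];
                    if (from[n] == '\0') {
                            if (lencopied != NULL)
                                    *lencopied = n + 1; /* count incl. NUL */
                            return 0;
                    }
            }
            if (lencopied != NULL)
                    *lencopied = n;                     /* == maxlen */
            return ENAMETOOLONG;
    }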
+
+/*
+ * long fuword(const void *uaddr);
+ * Fetch an int from the user's address space.
+ * see fuword(9)
+ */
+/* LINTSTUB: Func: long fuword(const void *base) */
+ENTRY(fuword)
+ DO_DEFERRED_SWITCH(%eax)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-4,%edx
+ ja _C_LABEL(fusuaddrfault)
+ GET_CURPCB(%ecx)
+ movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+ movl (%edx),%eax
+ movl $0,PCB_ONFAULT(%ecx)
+ ret
+
+/*
+ * int fusword(const void *uaddr);
+ * Fetch a short from the user's address space.
+ * see fusword(9)
+ */
+/* LINTSTUB: Func: int fusword(const void *base) */
+ENTRY(fusword)
+ DO_DEFERRED_SWITCH(%eax)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-2,%edx
+ ja _C_LABEL(fusuaddrfault)
+ GET_CURPCB(%ecx)
+ movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+ movzwl (%edx),%eax
+ movl $0,PCB_ONFAULT(%ecx)
+ ret
+
+/*
+ * int fuswintr(const void *uaddr);
+ * Fetch a short from the user's address space. Can be called during an
+ * interrupt.
+ * see fuswintr(9)
+ */
+/* LINTSTUB: Func: int fuswintr(const void *base) */
+ENTRY(fuswintr)
+ cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE)
+ jnz _C_LABEL(fusuaddrfault)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-2,%edx
+ ja _C_LABEL(fusuaddrfault)
+ movl CPUVAR(CURLWP),%ecx
+ movl L_ADDR(%ecx),%ecx
+ movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
+ movzwl (%edx),%eax
+ movl $0,PCB_ONFAULT(%ecx)
+ ret
+
+/*
+ * int fubyte(const void *uaddr);
+ * Fetch a byte from the user's address space.
+ * see fubyte(9)
+ */
+/* LINTSTUB: Func: int fubyte(const void *base) */
+ENTRY(fubyte)
+ DO_DEFERRED_SWITCH(%eax)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-1,%edx
+ ja _C_LABEL(fusuaddrfault)
+ GET_CURPCB(%ecx)
+ movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+ movzbl (%edx),%eax
+ movl $0,PCB_ONFAULT(%ecx)
+ ret
+
+/*
+ * Handle faults from [fs]u*(). Clean up and return -1.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(fusufault)
+ movl $0,PCB_ONFAULT(%ecx)
+ movl $-1,%eax
+ ret
+
+/*
+ * Handle faults from [fs]u*(). Clean up and return -1. This differs from
+ * fusufault() in that trap() will recognize it and return immediately rather
+ * than trying to page fault.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(fusubail)
+ movl $0,PCB_ONFAULT(%ecx)
+ movl $-1,%eax
+ ret
+
+/*
+ * Handle earlier faults from [fs]u*(), due to out-of-range addresses.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(fusuaddrfault)
+ movl $-1,%eax
+ ret
+
+/*
+ * int suword(void *uaddr, long x);
+ * Store an int in the user's address space.
+ * see suword(9)
+ */
+/* LINTSTUB: Func: int suword(void *base, long c) */
+ENTRY(suword)
+ DO_DEFERRED_SWITCH(%eax)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-4,%edx
+ ja _C_LABEL(fusuaddrfault)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+ cmpl $CPUCLASS_386,_C_LABEL(cpu_class)
+ jne 2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+ GET_CURPCB(%eax)
+ movl $3f,PCB_ONFAULT(%eax)
+
+ movl %edx,%eax
+ shrl $PGSHIFT,%eax # calculate pte address
+ testb $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+ jnz 1f
+
+3: /* Simulate a trap. */
+ pushl %edx
+ pushl %edx
+ call _C_LABEL(trapwrite) # trapwrite(addr)
+ addl $4,%esp # clear parameter from the stack
+ popl %edx
+ GET_CURPCB(%ecx)
+ testl %eax,%eax
+ jnz _C_LABEL(fusufault)
+
+1: /* XXX also need to check the following 3 bytes for validity! */
+#endif
+
+2: GET_CURPCB(%ecx)
+ movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+
+ movl 8(%esp),%eax
+ movl %eax,(%edx)
+ xorl %eax,%eax
+ movl %eax,PCB_ONFAULT(%ecx)
+ ret
+
+/*
+ * int susword(void *uaddr, short x);
+ * Store a short in the user's address space.
+ * see susword(9)
+ */
+/* LINTSTUB: Func: int susword(void *base, short c) */
+ENTRY(susword)
+ DO_DEFERRED_SWITCH(%eax)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-2,%edx
+ ja _C_LABEL(fusuaddrfault)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+ cmpl $CPUCLASS_386,_C_LABEL(cpu_class)
+ jne 2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+ GET_CURPCB(%eax)
+ movl $3f,PCB_ONFAULT(%eax)
+
+ movl %edx,%eax
+ shrl $PGSHIFT,%eax # calculate pte address
+ testb $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+ jnz 1f
+
+3: /* Simulate a trap. */
+ pushl %edx
+ pushl %edx
+ call _C_LABEL(trapwrite) # trapwrite(addr)
+ addl $4,%esp # clear parameter from the stack
+ popl %edx
+ GET_CURPCB(%ecx)
+ testl %eax,%eax
+ jnz _C_LABEL(fusufault)
+
+1: /* XXX also need to check the following byte for validity! */
+#endif
+
+2: GET_CURPCB(%ecx)
+ movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+
+ movl 8(%esp),%eax
+ movw %ax,(%edx)
+ xorl %eax,%eax
+ movl %eax,PCB_ONFAULT(%ecx)
+ ret
+
+/*
+ * int suswintr(void *uaddr, short x);
+ * Store a short in the user's address space. Can be called during an
+ * interrupt.
+ * see suswintr(9)
+ */
+/* LINTSTUB: Func: int suswintr(void *base, short c) */
+ENTRY(suswintr)
+ cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE)
+ jnz _C_LABEL(fusuaddrfault)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-2,%edx
+ ja _C_LABEL(fusuaddrfault)
+ movl CPUVAR(CURLWP),%ecx
+ movl L_ADDR(%ecx),%ecx
+ movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+ cmpl $CPUCLASS_386,_C_LABEL(cpu_class)
+ jne 2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+ movl %edx,%eax
+ shrl $PGSHIFT,%eax # calculate pte address
+ testb $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+ jnz 1f
+
+ /* Simulate a trap. */
+ jmp _C_LABEL(fusubail)
+
+1: /* XXX also need to check the following byte for validity! */
+#endif
+
+2: movl 8(%esp),%eax
+ movw %ax,(%edx)
+ xorl %eax,%eax
+ movl %eax,PCB_ONFAULT(%ecx)
+ ret
+
+/*
+ * int subyte(void *uaddr, char x);
+ * Store a byte in the user's address space.
+ * see subyte(9)
+ */
+/* LINTSTUB: Func: int subyte(void *base, int c) */
+ENTRY(subyte)
+ DO_DEFERRED_SWITCH(%eax)
+ movl 4(%esp),%edx
+ cmpl $VM_MAXUSER_ADDRESS-1,%edx
+ ja _C_LABEL(fusuaddrfault)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+ cmpl $CPUCLASS_386,_C_LABEL(cpu_class)
+ jne 2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+ GET_CURPCB(%eax)
+ movl $3f,PCB_ONFAULT(%eax)
+
+ movl %edx,%eax
+ shrl $PGSHIFT,%eax # calculate pte address
+ testb $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+ jnz 1f
+
+3: /* Simulate a trap. */
+ pushl %edx
+ pushl %edx
+ call _C_LABEL(trapwrite) # trapwrite(addr)
+ addl $4,%esp # clear parameter from the stack
+ popl %edx
+ GET_CURPCB(%ecx)
+ testl %eax,%eax
+ jnz _C_LABEL(fusufault)
+
+1:
+#endif
+
+2: GET_CURPCB(%ecx)
+ movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+
+ movb 8(%esp),%al
+ movb %al,(%edx)
+ xorl %eax,%eax
+ movl %eax,PCB_ONFAULT(%ecx)
+ ret
+
+/*****************************************************************************/
+
+/*
+ * The following is i386-specific nonsense.
+ */
+
+/*
+ * void lgdt_finish(void);
+ * Finish loading a new GDT pointer (do any necessary cleanup).
+ * XXX It's somewhat questionable whether reloading all the segment registers
+ * is necessary, since the actual descriptor data is not changed except by
+ * process creation and exit, both of which clean up via task switches. OTOH,
+ * this only happens at run time when the GDT is resized.
+ */
+/* LINTSTUB: Func: void lgdt_finish(void) */
+NENTRY(lgdt_finish)
+ movl $GSEL(GDATA_SEL, SEL_KPL),%eax
+ movw %ax,%ds
+ movw %ax,%es
+ movw %ax,%gs
+ movw %ax,%ss
+ movl $GSEL(GCPU_SEL, SEL_KPL),%eax
+ movw %ax,%fs
+ /* Reload code selector by doing intersegment return. */
+ popl %eax
+ pushl $GSEL(GCODE_SEL, SEL_KPL)
+ pushl %eax
+ lret
+
+/*****************************************************************************/
+
+/*
+ * These functions are primarily used by DDB.
+ */
+
+/* LINTSTUB: Func: int setjmp (label_t *l) */
+ENTRY(setjmp)
+ movl 4(%esp),%eax
+ movl %ebx,(%eax) # save ebx
+ movl %esp,4(%eax) # save esp
+ movl %ebp,8(%eax) # save ebp
+ movl %esi,12(%eax) # save esi
+ movl %edi,16(%eax) # save edi
+ movl (%esp),%edx # get rta
+ movl %edx,20(%eax) # save eip
+ xorl %eax,%eax # return (0);
+ ret
+
+/* LINTSTUB: Func: void longjmp (label_t *l) */
+ENTRY(longjmp)
+ movl 4(%esp),%eax
+ movl (%eax),%ebx # restore ebx
+ movl 4(%eax),%esp # restore esp
+ movl 8(%eax),%ebp # restore ebp
+ movl 12(%eax),%esi # restore esi
+ movl 16(%eax),%edi # restore edi
+ movl 20(%eax),%edx # get rta
+ movl %edx,(%esp) # put in return frame
+ xorl %eax,%eax # return (1);
+ incl %eax
+ ret
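The stores in setjmp() imply a fixed slot layout for label_t. A struct-shaped sketch of that layout; the field names are invented for readability, and the real label_t in the headers is a plain array of ints:

    /*
     * Layout implied by setjmp()/longjmp() above; names illustrative.
     */
    struct label_sketch {
            int lb_ebx;     /* offset  0 */
            int lb_esp;     /* offset  4 */
            int lb_ebp;     /* offset  8 */
            int lb_esi;     /* offset 12 */
            int lb_edi;     /* offset 16 */
            int lb_eip;     /* offset 20 */
    };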
+
+/*****************************************************************************/
+
+ .globl _C_LABEL(sched_whichqs),_C_LABEL(sched_qs)
+ .globl _C_LABEL(uvmexp),_C_LABEL(panic)
+
+#ifdef DIAGNOSTIC
+NENTRY(switch_error)
+ pushl $1f
+3: call _C_LABEL(panic)
+ /* NOTREACHED */
+1: .asciz "cpu_switch"
+#endif /* DIAGNOSTIC */
+
+/*
+ * void cpu_switch(struct lwp *)
+ * Find a runnable process and switch to it. Wait if necessary. If the new
+ * process is the same as the old one, we short-circuit the context save and
+ * restore.
+ *
+ * Note that the stack frame layout is known to "struct switchframe"
+ * in <machine/frame.h> and to the code in cpu_fork() which initializes
+ * it for a new lwp.
+ */
+ENTRY(cpu_switch)
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+#ifdef DEBUG
+ cmpl $IPL_SCHED,CPUVAR(ILEVEL)
+ jae 1f
+ pushl $2f
+ call _C_LABEL(panic)
+ /* NOTREACHED */
+2: .asciz "not splsched() in cpu_switch!"
+1:
+#endif /* DEBUG */
+
+ movl 16(%esp),%esi # current
+
+ /*
+ * Clear curlwp so that we don't accumulate system time while idle.
+ * This also insures that schedcpu() will move the old lwp to
+ * the correct queue if it happens to get called from the spllower()
+ * below and changes the priority. (See corresponding comment in
+ * userret()).
+ */
+ movl $0,CPUVAR(CURLWP)
+ /*
+ * First phase: find new lwp.
+ *
+ * Registers:
+ * %eax - queue head, scratch, then zero
+ * %ebx - queue number
+ * %ecx - cached value of whichqs
+ * %edx - next lwp in queue
+ * %esi - old lwp
+ * %edi - new lwp
+ */
+
+ /* Look for new lwp. */
+ CLI(%ecx) # splhigh doesn't do a cli
+ movl _C_LABEL(sched_whichqs),%ecx
+ bsfl %ecx,%ebx # find a full q
+ jnz switch_dequeue
+
+ /*
+ * idling: save old context.
+ *
+ * Registers:
+ * %eax, %ecx - scratch
+ * %esi - old lwp, then old pcb
+ * %edi - idle pcb
+ */
+
+ pushl %esi
+ call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc)
+ addl $4,%esp
+
+ movl L_ADDR(%esi),%esi
+
+ /* Save stack pointers. */
+ movl %esp,PCB_ESP(%esi)
+ movl %ebp,PCB_EBP(%esi)
+
+ /* Find idle PCB for this CPU */
+#ifndef MULTIPROCESSOR
+ movl $_C_LABEL(lwp0),%ebx
+ movl L_ADDR(%ebx),%edi
+ movl L_MD_TSS_SEL(%ebx),%edx
+#else
+ movl CPUVAR(IDLE_PCB),%edi
+ movl CPUVAR(IDLE_TSS_SEL),%edx
+#endif
+ movl $0,CPUVAR(CURLWP) /* In case we fault... */
+
+ /* Restore the idle context (avoid interrupts) */
+ CLI(%ecx)
+
+ /* Restore stack pointers. */
+ movl PCB_ESP(%edi),%esp
+ movl PCB_EBP(%edi),%ebp
+
+ pushl %edi
+ call _C_LABEL(i386_switch_context)
+ addl $4,%esp
+
+ /* Record new pcb. */
+ SET_CURPCB(%edi)
+
+ xorl %esi,%esi
+ STI(%eax)
+idle_unlock:
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+ call _C_LABEL(sched_unlock_idle)
+#endif
+ /* Interrupts are okay again. */
+ pushl $IPL_NONE # spl0()
+ call _C_LABEL(Xspllower) # process pending interrupts
+ addl $4,%esp
+ jmp idle_start
+idle_zero:
+ STIC(%eax)
+ jz 4f
+ call _C_LABEL(stipending)
+ testl %eax,%eax
+ jz 4f
+ pushl $IPL_NONE
+ call _C_LABEL(Xspllower)
+ addl $4,%esp
+4:
+ call _C_LABEL(uvm_pageidlezero)
+ CLI(%eax)
+ cmpl $0,_C_LABEL(sched_whichqs)
+ jnz idle_exit
+idle_loop:
+ /* Try to zero some pages. */
+ movl _C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx
+ testl %ecx,%ecx
+ jnz idle_zero
+ STIC(%eax)
+ jz 4f
+ call _C_LABEL(stipending)
+ testl %eax,%eax
+ jz 4f
+ pushl $IPL_NONE
+ call _C_LABEL(Xspllower)
+ addl $4,%esp
+ jmp idle_start
+4:
+ movl $__HYPERVISOR_yield,%eax
+ TRAP_INSTR
+NENTRY(mpidle)
+idle_start:
+ CLI(%eax)
+ cmpl $0,_C_LABEL(sched_whichqs)
+ jz idle_loop
+idle_exit:
+ movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh
+ STI(%eax)
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+ call _C_LABEL(sched_lock_idle)
+#endif
+ movl _C_LABEL(sched_whichqs),%ecx
+ bsfl %ecx,%ebx
+ jz idle_unlock
+
+#ifdef XENDEBUG_LOW
+ pushl %ecx
+ call _C_LABEL(xen_dbg1)
+ xorl %ecx,%ecx
+ movl %ecx,_C_LABEL(xen_once)
+ popl %ecx
+#endif
+switch_dequeue:
+ /*
+ * we're running at splhigh(), but it's otherwise okay to take
+ * interrupts here.
+ */
+ STI(%edi)
+ leal _C_LABEL(sched_qs)(,%ebx,8),%eax # select q
+
+ movl L_FORW(%eax),%edi # unlink from front of process q
+#ifdef DIAGNOSTIC
+ cmpl %edi,%eax # linked to self (i.e. nothing queued)?
+ je _C_LABEL(switch_error) # not possible
+#endif /* DIAGNOSTIC */
+ movl L_FORW(%edi),%edx
+ movl %edx,L_FORW(%eax)
+ movl %eax,L_BACK(%edx)
+
+ cmpl %edx,%eax # q empty?
+ jne 3f
+
+ btrl %ebx,%ecx # yes, clear to indicate empty
+ movl %ecx,_C_LABEL(sched_whichqs) # update q status
+
+3: /* We just did it. */
+ xorl %eax,%eax
+ CLEAR_RESCHED(%eax)
+
+switch_resume:
+#ifdef DIAGNOSTIC
+ cmpl %eax,L_WCHAN(%edi) # Waiting for something?
+ jne _C_LABEL(switch_error) # Yes; shouldn't be queued.
+ cmpb $LSRUN,L_STAT(%edi) # In run state?
+ jne _C_LABEL(switch_error) # No; shouldn't be queued.
+#endif /* DIAGNOSTIC */
+
+ /* Isolate lwp. XXX Is this necessary? */
+ movl %eax,L_BACK(%edi)
+
+ /* Record new lwp. */
+ movb $LSONPROC,L_STAT(%edi) # l->l_stat = LSONPROC
+ SET_CURLWP(%edi,%ecx)
+
+ /* Skip context switch if same lwp. */
+ xorl %ebx,%ebx
+ cmpl %edi,%esi
+ je switch_return
+
+ /* If old lwp exited, don't bother. */
+ testl %esi,%esi
+ jz switch_exited
+
+ /*
+ * Second phase: save old context.
+ *
+ * Registers:
+ * %eax, %ecx - scratch
+ * %esi - old lwp, then old pcb
+ * %edi - new lwp
+ */
+
+ pushl %esi
+ call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc)
+ addl $4,%esp
+
+ movl L_ADDR(%esi),%esi
+
+ /* Save stack pointers. */
+ movl %esp,PCB_ESP(%esi)
+ movl %ebp,PCB_EBP(%esi)
+
+switch_exited:
+ /*
+ * Third phase: restore saved context.
+ *
+ * Registers:
+ * %eax, %ebx, %ecx, %edx - scratch
+ * %esi - new pcb
+ * %edi - new lwp
+ */
+
+ /* No interrupts while loading new state. */
+ CLI(%eax)
+ movl L_ADDR(%edi),%esi
+
+ /* Restore stack pointers. */
+ movl PCB_ESP(%esi),%esp
+ movl PCB_EBP(%esi),%ebp
+
+#if 0
+ /* Don't bother with the rest if switching to a system process. */
+ testl $P_SYSTEM,L_FLAG(%edi); XXX NJWLWP lwp's don't have P_SYSTEM!
+ jnz switch_restored ; XXX skip stack_switch+pmap_activate
+#endif
+
+ pushl %edi
+ call _C_LABEL(pmap_activate) # pmap_activate(p)
+ addl $4,%esp
+
+ pushl %esi
+ call _C_LABEL(i386_switch_context)
+ addl $4,%esp
+
+ /* Record new pcb. */
+ SET_CURPCB(%esi)
+
+ /* Interrupts are okay again. */
+ STI(%edi)
+
+/*
+ * Check for restartable atomic sequences (RAS)
+ */
+ movl CPUVAR(CURLWP),%edi
+ movl L_PROC(%edi),%esi
+ cmpl $0,P_RASLIST(%esi)
+ jne 2f
+1:
+ movl $1,%ebx
+
+switch_return:
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+ call _C_LABEL(sched_unlock_idle)
+#endif
+ pushl $IPL_NONE # spl0()
+ call _C_LABEL(Xspllower) # process pending interrupts
+ addl $4,%esp
+ movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh()
+
+ movl %ebx,%eax
+
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
+2: # check RAS list
+ movl L_MD_REGS(%edi),%ebx
+ movl TF_EIP(%ebx),%eax
+ pushl %eax
+ pushl %esi
+ call _C_LABEL(ras_lookup)
+ addl $8,%esp
+ cmpl $-1,%eax
+ je 1b
+ movl %eax,TF_EIP(%ebx)
+ jmp 1b
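cpu_switch() above selects the next run queue with a single bsfl on sched_whichqs. The equivalent scan in C, for readers without the bit-scan semantics at hand (queue 0 being the highest priority):

    /*
     * C equivalent of the "bsfl sched_whichqs" step: return the index
     * of the lowest set bit, i.e. the highest-priority non-empty run
     * queue, or -1 if nothing is runnable and the idle loop should run.
     */
    #include <stdint.h>

    static int
    lowest_nonempty_queue(uint32_t whichqs)
    {
            int q;

            if (whichqs == 0)
                    return -1;              /* no runnable lwp: go idle */
            for (q = 0; (whichqs & 1) == 0; q++)
                    whichqs >>= 1;
            return q;
    }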
+
+/*
+ * void cpu_switchto(struct lwp *current, struct lwp *next)
+ * Switch to the specified next LWP.
+ */
+ENTRY(cpu_switchto)
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+#ifdef DEBUG
+ cmpl $IPL_SCHED,CPUVAR(ILEVEL)
+ jae 1f
+ pushl $2f
+ call _C_LABEL(panic)
+ /* NOTREACHED */
+2: .asciz "not splsched() in cpu_switchto!"
+1:
+#endif /* DEBUG */
+
+ movl 16(%esp),%esi # current
+ movl 20(%esp),%edi # next
+
+ /*
+ * Clear curlwp so that we don't accumulate system time while idle.
+ * This also insures that schedcpu() will move the old process to
+ * the correct queue if it happens to get called from the spllower()
+ * below and changes the priority. (See corresponding comment in
+ * userret()).
+ *
+ * XXX Is this necessary? We know we won't go idle.
+ */
+ movl $0,CPUVAR(CURLWP)
+
+ /*
+ * We're running at splhigh(), but it's otherwise okay to take
+ * interrupts here.
+ */
+ STI(%eax)
+
+ /* Jump into the middle of cpu_switch */
+ xorl %eax,%eax
+ jmp switch_resume
+
+/*
+ * void cpu_exit(struct lwp *l)
+ * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's
+ * if multiprocessor) and deallocate the address space and kernel stack for p.
+ * Then jump into cpu_switch(), as if we were in the idle proc all along.
+ */
+#ifndef MULTIPROCESSOR
+ .globl _C_LABEL(lwp0)
+#endif
+ .globl _C_LABEL(uvmspace_free),_C_LABEL(kernel_map)
+ .globl _C_LABEL(uvm_km_free),_C_LABEL(tss_free)
+/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */
+ENTRY(cpu_exit)
+ movl 4(%esp),%edi # old process
+#ifndef MULTIPROCESSOR
+ movl $_C_LABEL(lwp0),%ebx
+ movl L_ADDR(%ebx),%esi
+ movl L_MD_TSS_SEL(%ebx),%edx
+#else
+ movl CPUVAR(IDLE_PCB),%esi
+ movl CPUVAR(IDLE_TSS_SEL),%edx
+#endif
+ /* In case we fault... */
+ movl $0,CPUVAR(CURLWP)
+
+ /* Restore the idle context. */
+ CLI(%eax)
+
+ /* Restore stack pointers. */
+ movl PCB_ESP(%esi),%esp
+ movl PCB_EBP(%esi),%ebp
+
+ pushl %esi
+ call _C_LABEL(i386_switch_context)
+ addl $4,%esp
+
+ /* Record new pcb. */
+ SET_CURPCB(%esi)
+
+ /* Interrupts are okay again. */
+ STI(%eax)
+
+ /*
+ * Schedule the dead LWP's stack to be freed.
+ */
+ pushl %edi
+ call _C_LABEL(lwp_exit2)
+ addl $4,%esp
+
+ /* Jump into cpu_switch() with the right state. */
+ xorl %esi,%esi
+ movl %esi,CPUVAR(CURLWP)
+ jmp idle_start
+
+/*
+ * void savectx(struct pcb *pcb);
+ * Update pcb, saving current processor state.
+ */
+/* LINTSTUB: Func: void savectx(struct pcb *pcb) */
+ENTRY(savectx)
+ movl 4(%esp),%edx # edx = p->p_addr
+
+ /* Save stack pointers. */
+ movl %esp,PCB_ESP(%edx)
+ movl %ebp,PCB_EBP(%edx)
+
+ ret
+
+/*
+ * Old call gate entry for syscall
+ */
+/* LINTSTUB: Var: char Xosyscall[1]; */
+IDTVEC(osyscall)
+ /* Set eflags in trap frame. */
+ pushfl
+ popl 8(%esp)
+ pushl $7 # size of instruction for restart
+ jmp syscall1
+
+/*
+ * Trap gate entry for syscall
+ */
+/* LINTSTUB: Var: char Xsyscall[1]; */
+IDTVEC(syscall)
+ pushl $2 # size of instruction for restart
+syscall1:
+ pushl $T_ASTFLT # trap # for doing ASTs
+ INTRENTRY
+
+#ifdef DIAGNOSTIC
+ cmpl $0, CPUVAR(WANT_PMAPLOAD)
+ jz 1f
+ pushl $6f
+ call _C_LABEL(printf)
+ addl $4, %esp
+1:
+ movl CPUVAR(ILEVEL),%ebx
+ testl %ebx,%ebx
+ jz 1f
+ pushl $5f
+ call _C_LABEL(printf)
+ addl $4,%esp
+#ifdef DDB
+ int $3
+#endif
+1:
+#endif /* DIAGNOSTIC */
+ movl CPUVAR(CURLWP),%edx
+ movl %esp,L_MD_REGS(%edx) # save pointer to frame
+ movl L_PROC(%edx),%edx
+ pushl %esp
+ call *P_MD_SYSCALL(%edx) # get pointer to syscall() function
+ addl $4,%esp
+syscall_checkast:
+ /* Check for ASTs on exit to user mode. */
+ CLI(%eax)
+ CHECK_ASTPENDING(%eax)
+ je 1f
+ /* Always returning to user mode here. */
+ CLEAR_ASTPENDING(%eax)
+ STI(%eax)
+ /* Pushed T_ASTFLT into tf_trapno on entry. */
+ pushl %esp
+ call _C_LABEL(trap)
+ addl $4,%esp
+ jmp syscall_checkast
+1: STI(%eax)
+ CHECK_DEFERRED_SWITCH(%eax)
+ jnz 9f
+#ifndef DIAGNOSTIC
+ INTRFASTEXIT
+#else /* DIAGNOSTIC */
+ cmpl $IPL_NONE,CPUVAR(ILEVEL)
+ jne 3f
+ INTRFASTEXIT
+3: pushl $4f
+ call _C_LABEL(printf)
+ addl $4,%esp
+#ifdef DDB
+ int $3
+#endif /* DDB */
+ movl $IPL_NONE,CPUVAR(ILEVEL)
+ jmp 2b
+4: .asciz "WARNING: SPL NOT LOWERED ON SYSCALL EXIT\n"
+5: .asciz "WARNING: SPL NOT ZERO ON SYSCALL ENTRY\n"
+6: .asciz "WARNING: WANT PMAPLOAD ON SYSCALL ENTRY\n"
+#endif /* DIAGNOSTIC */
+9: call _C_LABEL(pmap_load)
+ jmp syscall_checkast /* re-check ASTs */
+
+#if NNPX > 0
+/*
+ * Special interrupt handlers. Someday intr0-intr15 will be used to count
+ * interrupts. We'll still need a special exception 16 handler. The busy
+ * latch stuff in probeintr() can be moved to npxprobe().
+ */
+
+/* LINTSTUB: Func: void probeintr(void) */
+NENTRY(probeintr)
+ ss
+ incl _C_LABEL(npx_intrs_while_probing)
+ pushl %eax
+ movb $0x20,%al # EOI (asm in strings loses cpp features)
+ outb %al,$0xa0 # IO_ICU2
+ outb %al,$0x20 # IO_ICU1
+ movb $0,%al
+ outb %al,$0xf0 # clear BUSY# latch
+ popl %eax
+ iret
+
+/* LINTSTUB: Func: void probetrap(void) */
+NENTRY(probetrap)
+ ss
+ incl _C_LABEL(npx_traps_while_probing)
+ fnclex
+ iret
+
+/* LINTSTUB: Func: int npx586bug1(int a, int b) */
+NENTRY(npx586bug1)
+ fildl 4(%esp) # x
+ fildl 8(%esp) # y
+ fld %st(1)
+ fdiv %st(1),%st # x/y
+ fmulp %st,%st(1) # (x/y)*y
+ fsubrp %st,%st(1) # x-(x/y)*y
+ pushl $0
+ fistpl (%esp)
+ popl %eax
+ ret
+#endif /* NNPX > 0 */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c
new file mode 100644
index 0000000000..61d2898096
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c
@@ -0,0 +1,2561 @@
+/* $NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $ */
+/* NetBSD: machdep.c,v 1.552 2004/03/24 15:34:49 atatat Exp */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
+ * Simulation Facility, NASA Ames Research Center.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)machdep.c 7.4 (Berkeley) 6/3/91
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $");
+
+#include "opt_beep.h"
+#include "opt_compat_ibcs2.h"
+#include "opt_compat_mach.h" /* need to get the right segment def */
+#include "opt_compat_netbsd.h"
+#include "opt_compat_svr4.h"
+#include "opt_cpureset_delay.h"
+#include "opt_cputype.h"
+#include "opt_ddb.h"
+#include "opt_ipkdb.h"
+#include "opt_kgdb.h"
+#include "opt_mtrr.h"
+#include "opt_multiprocessor.h"
+#include "opt_realmem.h"
+#include "opt_user_ldt.h"
+#include "opt_vm86.h"
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/signal.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+#include <sys/exec.h>
+#include <sys/buf.h>
+#include <sys/reboot.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/msgbuf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/extent.h>
+#include <sys/syscallargs.h>
+#include <sys/core.h>
+#include <sys/kcore.h>
+#include <sys/ucontext.h>
+#include <machine/kcore.h>
+#include <sys/ras.h>
+#include <sys/sa.h>
+#include <sys/savar.h>
+#include <sys/ksyms.h>
+
+#ifdef IPKDB
+#include <ipkdb/ipkdb.h>
+#endif
+
+#ifdef KGDB
+#include <sys/kgdb.h>
+#endif
+
+#include <dev/cons.h>
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_page.h>
+
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#include <machine/cpufunc.h>
+#include <machine/cpuvar.h>
+#include <machine/gdt.h>
+#include <machine/pio.h>
+#include <machine/psl.h>
+#include <machine/reg.h>
+#include <machine/specialreg.h>
+#include <machine/bootinfo.h>
+#include <machine/mtrr.h>
+#include <machine/evtchn.h>
+
+#include <dev/isa/isareg.h>
+#include <machine/isa_machdep.h>
+#include <dev/ic/i8042reg.h>
+
+#ifdef DDB
+#include <machine/db_machdep.h>
+#include <ddb/db_extern.h>
+#endif
+
+#ifdef VM86
+#include <machine/vm86.h>
+#endif
+
+#include "acpi.h"
+#include "apm.h"
+#include "bioscall.h"
+
+#if NBIOSCALL > 0
+#include <machine/bioscall.h>
+#endif
+
+#if NACPI > 0
+#include <dev/acpi/acpivar.h>
+#define ACPI_MACHDEP_PRIVATE
+#include <machine/acpi_machdep.h>
+#endif
+
+#if NAPM > 0
+#include <machine/apmvar.h>
+#endif
+
+#include "isa.h"
+#include "isadma.h"
+#include "npx.h"
+#include "ksyms.h"
+
+#include "mca.h"
+#if NMCA > 0
+#include <machine/mca_machdep.h> /* for mca_busprobe() */
+#endif
+
+#ifdef MULTIPROCESSOR /* XXX */
+#include <machine/mpbiosvar.h> /* XXX */
+#endif /* XXX */
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#if defined(DDB) || defined(KGDB)
+#include <ddb/db_interface.h>
+#include <ddb/db_output.h>
+
+void ddb_trap_hook(int);
+#endif
+
+/* #define XENDEBUG */
+/* #define XENDEBUG_LOW */
+
+#ifdef XENDEBUG
+extern void printk(char *, ...);
+#define XENPRINTF(x) printf x
+#define XENPRINTK(x) printk x
+#else
+#define XENPRINTF(x)
+#define XENPRINTK(x)
+#endif
+#define PRINTK(x) printf x
+
+#ifdef XENDEBUG_LOW
+void xen_dbglow_init(void);
+#endif
+
+#ifndef BEEP_ONHALT_COUNT
+#define BEEP_ONHALT_COUNT 3
+#endif
+#ifndef BEEP_ONHALT_PITCH
+#define BEEP_ONHALT_PITCH 1500
+#endif
+#ifndef BEEP_ONHALT_PERIOD
+#define BEEP_ONHALT_PERIOD 250
+#endif
+
+/* the following is used externally (sysctl_hw) */
+char machine[] = "i386"; /* CPU "architecture" */
+char machine_arch[] = "i386"; /* machine == machine_arch */
+
+char bootinfo[BOOTINFO_MAXSIZE];
+
+struct bi_devmatch *i386_alldisks = NULL;
+int i386_ndisks = 0;
+
+#ifdef CPURESET_DELAY
+int cpureset_delay = CPURESET_DELAY;
+#else
+int cpureset_delay = 2000; /* default to 2s */
+#endif
+
+#ifdef MTRR
+struct mtrr_funcs *mtrr_funcs;
+#endif
+
+#ifdef COMPAT_NOMID
+static int exec_nomid(struct proc *, struct exec_package *);
+#endif
+
+int physmem;
+int dumpmem_low;
+int dumpmem_high;
+unsigned int cpu_feature;
+int cpu_class;
+int i386_fpu_present;
+int i386_fpu_exception;
+int i386_fpu_fdivbug;
+
+int i386_use_fxsave;
+int i386_has_sse;
+int i386_has_sse2;
+
+int tmx86_has_longrun;
+
+vaddr_t msgbuf_vaddr;
+paddr_t msgbuf_paddr;
+
+vaddr_t idt_vaddr;
+paddr_t idt_paddr;
+
+#ifdef I586_CPU
+vaddr_t pentium_idt_vaddr;
+#endif
+
+struct vm_map *exec_map = NULL;
+struct vm_map *mb_map = NULL;
+struct vm_map *phys_map = NULL;
+
+extern paddr_t avail_start, avail_end;
+extern paddr_t pmap_pa_start, pmap_pa_end;
+
+#ifdef ISA_CLOCK
+void (*delay_func)(int) = i8254_delay;
+void (*microtime_func)(struct timeval *) = i8254_microtime;
+void (*initclock_func)(void) = i8254_initclocks;
+#else
+void (*delay_func)(int) = xen_delay;
+void (*microtime_func)(struct timeval *) = xen_microtime;
+void (*initclock_func)(void) = xen_initclocks;
+#endif
+
+void hypervisor_callback(void);
+void failsafe_callback(void);
+
+/*
+ * Size of memory segments, before any memory is stolen.
+ */
+phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
+int mem_cluster_cnt;
+
+int cpu_dump(void);
+int cpu_dumpsize(void);
+u_long cpu_dump_mempagecnt(void);
+void dumpsys(void);
+void init386(paddr_t);
+void initgdt(void);
+
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+void add_mem_cluster(u_int64_t, u_int64_t, u_int32_t);
+#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
+
+extern int time_adjusted;
+
+/*
+ * Machine-dependent startup code
+ */
+void
+cpu_startup()
+{
+ int x;
+ vaddr_t minaddr, maxaddr;
+ char pbuf[9];
+
+ /*
+ * Initialize error message buffer (at end of core).
+ */
+ msgbuf_vaddr = uvm_km_valloc(kernel_map, x86_round_page(MSGBUFSIZE));
+ if (msgbuf_vaddr == 0)
+ panic("failed to valloc msgbuf_vaddr");
+
+ /* msgbuf_paddr was init'd in pmap */
+ for (x = 0; x < btoc(MSGBUFSIZE); x++)
+ pmap_kenter_pa((vaddr_t)msgbuf_vaddr + x * PAGE_SIZE,
+ msgbuf_paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+
+ initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
+
+ printf("%s", version);
+
+#ifdef TRAPLOG
+ /*
+ * Enable recording of branch from/to in MSR's
+ */
+ wrmsr(MSR_DEBUGCTLMSR, 0x1);
+#endif
+
+ format_bytes(pbuf, sizeof(pbuf), ptoa(physmem));
+ printf("total memory = %s\n", pbuf);
+
+ minaddr = 0;
+
+ /*
+ * Allocate a submap for exec arguments. This map effectively
+ * limits the number of processes exec'ing at any time.
+ */
+ exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
+ 16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
+
+ /*
+ * Allocate a submap for physio
+ */
+ phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
+ VM_PHYS_SIZE, 0, FALSE, NULL);
+
+ /*
+ * Finally, allocate mbuf cluster submap.
+ */
+ mb_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
+ nmbclusters * mclbytes, VM_MAP_INTRSAFE, FALSE, NULL);
+
+ format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free));
+ printf("avail memory = %s\n", pbuf);
+
+ /* Safe for i/o port / memory space allocation to use malloc now. */
+ x86_bus_space_mallocok();
+}
+
+/*
+ * Set up proc0's TSS and LDT.
+ */
+void
+i386_proc0_tss_ldt_init()
+{
+ struct pcb *pcb;
+ int x;
+
+ gdt_init();
+
+ cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb;
+
+ pcb->pcb_tss.tss_ioopt =
+ ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
+ | SEL_KPL; /* i/o pl */
+
+ for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
+ pcb->pcb_iomap[x] = 0xffffffff;
+
+ pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+ pcb->pcb_cr0 = rcr0();
+ pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
+ pcb->pcb_tss.tss_esp0 = (int)lwp0.l_addr + USPACE - 16;
+ lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
+ lwp0.l_md.md_tss_sel = tss_alloc(pcb);
+
+#ifndef XEN
+ ltr(lwp0.l_md.md_tss_sel);
+ lldt(pcb->pcb_ldt_sel);
+#else
+ HYPERVISOR_fpu_taskswitch();
+ XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
+ (void *)pcb->pcb_tss.tss_esp0,
+ pcb->pcb_tss.tss_ss0, IDXSEL(pcb->pcb_tss.tss_ss0)));
+ HYPERVISOR_stack_switch(pcb->pcb_tss.tss_ss0, pcb->pcb_tss.tss_esp0);
+#endif
+}
+
+/*
+ * Set up TSS and LDT for a new PCB.
+ */
+
+void
+i386_init_pcb_tss_ldt(struct cpu_info *ci)
+{
+ int x;
+ struct pcb *pcb = ci->ci_idle_pcb;
+
+ pcb->pcb_tss.tss_ioopt =
+ ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
+ | SEL_KPL; /* i/o pl */
+ for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
+ pcb->pcb_iomap[x] = 0xffffffff;
+
+ pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+ pcb->pcb_cr0 = rcr0();
+
+ ci->ci_idle_tss_sel = tss_alloc(pcb);
+}
+
+/*
+ * Switch context:
+ * - honor CR0_TS in saved CR0 and request DNA exception on FPU use
+ * - switch stack pointer for user->kernel transition
+ */
+void
+i386_switch_context(struct pcb *new)
+{
+ dom0_op_t op;
+ struct cpu_info *ci;
+
+ ci = curcpu();
+ if (ci->ci_fpused) {
+ HYPERVISOR_fpu_taskswitch();
+ ci->ci_fpused = 0;
+ }
+
+ HYPERVISOR_stack_switch(new->pcb_tss.tss_ss0, new->pcb_tss.tss_esp0);
+
+ if (xen_start_info.flags & SIF_PRIVILEGED) {
+ op.cmd = DOM0_IOPL;
+ op.u.iopl.domain = DOMID_SELF;
+ op.u.iopl.iopl = new->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */
+ HYPERVISOR_dom0_op(&op);
+ }
+}
+
+/*
+ * sysctl helper routine for machdep.tm* nodes.
+ */
+static int
+sysctl_machdep_tm_longrun(SYSCTLFN_ARGS)
+{
+ struct sysctlnode node;
+ int io, error;
+
+ if (!tmx86_has_longrun)
+ return (EOPNOTSUPP);
+
+ node = *rnode;
+ node.sysctl_data = &io;
+
+ switch (rnode->sysctl_num) {
+ case CPU_TMLR_MODE:
+ io = (int)(crusoe_longrun = tmx86_get_longrun_mode());
+ break;
+ case CPU_TMLR_FREQUENCY:
+ tmx86_get_longrun_status_all();
+ io = crusoe_frequency;
+ break;
+ case CPU_TMLR_VOLTAGE:
+ tmx86_get_longrun_status_all();
+ io = crusoe_voltage;
+ break;
+ case CPU_TMLR_PERCENTAGE:
+ tmx86_get_longrun_status_all();
+ io = crusoe_percentage;
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ error = sysctl_lookup(SYSCTLFN_CALL(&node));
+ if (error || newp == NULL)
+ return (error);
+
+ if (rnode->sysctl_num == CPU_TMLR_MODE) {
+ if (tmx86_set_longrun_mode(io))
+ crusoe_longrun = (u_int)io;
+ else
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * sysctl helper routine for machdep.booted_kernel
+ */
+static int
+sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
+{
+ struct btinfo_bootpath *bibp;
+ struct sysctlnode node;
+
+ bibp = lookup_bootinfo(BTINFO_BOOTPATH);
+ if(!bibp)
+ return(ENOENT); /* ??? */
+
+ node = *rnode;
+ node.sysctl_data = bibp->bootpath;
+ node.sysctl_size = sizeof(bibp->bootpath);
+ return (sysctl_lookup(SYSCTLFN_CALL(&node)));
+}
+
+/*
+ * sysctl helper routine for machdep.diskinfo
+ */
+static int
+sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
+{
+ struct sysctlnode node;
+
+ node = *rnode;
+ node.sysctl_data = i386_alldisks;
+ node.sysctl_size = sizeof(struct disklist) +
+ (i386_ndisks - 1) * sizeof(struct nativedisk_info);
+ return (sysctl_lookup(SYSCTLFN_CALL(&node)));
+}
+
+/*
+ * machine dependent system variables.
+ */
+SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
+{
+
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "machdep", NULL,
+ NULL, 0, NULL, 0,
+ CTL_MACHDEP, CTL_EOL);
+
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_STRUCT, "console_device", NULL,
+ sysctl_consdev, 0, NULL, sizeof(dev_t),
+ CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "biosbasemem", NULL,
+ NULL, 0, &biosbasemem, 0,
+ CTL_MACHDEP, CPU_BIOSBASEMEM, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "biosextmem", NULL,
+ NULL, 0, &biosextmem, 0,
+ CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "nkpde", NULL,
+ NULL, 0, &nkpde, 0,
+ CTL_MACHDEP, CPU_NKPDE, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_STRING, "booted_kernel", NULL,
+ sysctl_machdep_booted_kernel, 0, NULL, 0,
+ CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_STRUCT, "diskinfo", NULL,
+ sysctl_machdep_diskinfo, 0, NULL, 0,
+ CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "fpu_present", NULL,
+ NULL, 0, &i386_fpu_present, 0,
+ CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "osfxsr", NULL,
+ NULL, 0, &i386_use_fxsave, 0,
+ CTL_MACHDEP, CPU_OSFXSR, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "sse", NULL,
+ NULL, 0, &i386_has_sse, 0,
+ CTL_MACHDEP, CPU_SSE, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "sse2", NULL,
+ NULL, 0, &i386_has_sse2, 0,
+ CTL_MACHDEP, CPU_SSE2, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "tm_longrun_mode", NULL,
+ sysctl_machdep_tm_longrun, 0, NULL, 0,
+ CTL_MACHDEP, CPU_TMLR_MODE, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "tm_longrun_frequency", NULL,
+ sysctl_machdep_tm_longrun, 0, NULL, 0,
+ CTL_MACHDEP, CPU_TMLR_FREQUENCY, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "tm_longrun_voltage", NULL,
+ sysctl_machdep_tm_longrun, 0, NULL, 0,
+ CTL_MACHDEP, CPU_TMLR_VOLTAGE, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "tm_longrun_percentage", NULL,
+ sysctl_machdep_tm_longrun, 0, NULL, 0,
+ CTL_MACHDEP, CPU_TMLR_PERCENTAGE, CTL_EOL);
+}
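+
+/*
+ * Illustrative userland usage for the machdep nodes created above (a
+ * minimal sketch only; the buffer size and output format are arbitrary).
+ * The nodes can be read through the standard sysctl(3) interface, e.g.:
+ *
+ *	int mib[2] = { CTL_MACHDEP, CPU_BOOTED_KERNEL };
+ *	char buf[256];
+ *	size_t len = sizeof(buf);
+ *
+ *	if (sysctl(mib, 2, buf, &len, NULL, 0) == 0)
+ *		printf("booted kernel: %s\n", buf);
+ */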
+
+void *
+getframe(struct lwp *l, int sig, int *onstack)
+{
+ struct proc *p = l->l_proc;
+ struct sigctx *ctx = &p->p_sigctx;
+ struct trapframe *tf = l->l_md.md_regs;
+
+ /* Do we need to jump onto the signal stack? */
+ *onstack = (ctx->ps_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
+ && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
+ if (*onstack)
+ return (char *)ctx->ps_sigstk.ss_sp + ctx->ps_sigstk.ss_size;
+#ifdef VM86
+ if (tf->tf_eflags & PSL_VM)
+ return (void *)(tf->tf_esp + (tf->tf_ss << 4));
+ else
+#endif
+ return (void *)tf->tf_esp;
+}
+
+/*
+ * Build context to run handler in. We invoke the handler
+ * directly, only returning via the trampoline. Note the
+ * trampoline version numbers are coordinated with machine-
+ * dependent code in libc.
+ */
+void
+buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
+{
+ struct trapframe *tf = l->l_md.md_regs;
+
+ tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_eip = (int)catcher;
+ tf->tf_cs = GSEL(sel, SEL_UPL);
+ tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
+ tf->tf_esp = (int)fp;
+ tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
+}
+
+static void
+sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
+{
+ struct lwp *l = curlwp;
+ struct proc *p = l->l_proc;
+ struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
+ int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
+ GUCODEBIG_SEL : GUCODE_SEL;
+ struct sigacts *ps = p->p_sigacts;
+ int onstack;
+ int sig = ksi->ksi_signo;
+ struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
+ sig_t catcher = SIGACTION(p, sig).sa_handler;
+ struct trapframe *tf = l->l_md.md_regs;
+
+ fp--;
+
+ /* Build stack frame for signal trampoline. */
+ switch (ps->sa_sigdesc[sig].sd_vers) {
+ case 0: /* handled by sendsig_sigcontext */
+ case 1: /* handled by sendsig_sigcontext */
+ default: /* unknown version */
+ printf("nsendsig: bad version %d\n",
+ ps->sa_sigdesc[sig].sd_vers);
+ sigexit(l, SIGILL);
+ case 2:
+ break;
+ }
+
+ frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
+ frame.sf_signum = sig;
+ frame.sf_sip = &fp->sf_si;
+ frame.sf_ucp = &fp->sf_uc;
+ frame.sf_si._info = ksi->ksi_info;
+ frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
+ frame.sf_uc.uc_sigmask = *mask;
+ frame.sf_uc.uc_link = NULL;
+ frame.sf_uc.uc_flags |= (p->p_sigctx.ps_sigstk.ss_flags & SS_ONSTACK)
+ ? _UC_SETSTACK : _UC_CLRSTACK;
+ memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
+ cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
+
+ if (tf->tf_eflags & PSL_VM)
+ (*p->p_emul->e_syscall_intern)(p);
+
+ if (copyout(&frame, fp, sizeof(frame)) != 0) {
+ /*
+ * Process has trashed its stack; give it an illegal
+ * instruction to halt it in its tracks.
+ */
+ sigexit(l, SIGILL);
+ /* NOTREACHED */
+ }
+
+ buildcontext(l, sel, catcher, fp);
+
+ /* Remember that we're now on the signal stack. */
+ if (onstack)
+ p->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
+}
+
+void
+sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
+{
+#ifdef COMPAT_16
+ if (curproc->p_sigacts->sa_sigdesc[ksi->ksi_signo].sd_vers < 2)
+ sendsig_sigcontext(ksi, mask);
+ else
+#endif
+ sendsig_siginfo(ksi, mask);
+}
+
+void
+cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas,
+ void *ap, void *sp, sa_upcall_t upcall)
+{
+ struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+ struct saframe *sf, frame;
+ struct trapframe *tf;
+
+ tf = l->l_md.md_regs;
+
+ /* Finally, copy out the rest of the frame. */
+ frame.sa_type = type;
+ frame.sa_sas = sas;
+ frame.sa_events = nevents;
+ frame.sa_interrupted = ninterrupted;
+ frame.sa_arg = ap;
+ frame.sa_ra = 0;
+
+ sf = (struct saframe *)sp - 1;
+ if (copyout(&frame, sf, sizeof(frame)) != 0) {
+ /* Copying onto the stack didn't work. Die. */
+ sigexit(l, SIGILL);
+ /* NOTREACHED */
+ }
+
+ tf->tf_eip = (int) upcall;
+ tf->tf_esp = (int) sf;
+ tf->tf_ebp = 0; /* indicate call-frame-top to debuggers */
+ tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
+ GSEL(GUCODEBIG_SEL, SEL_UPL) : GSEL(GUCODE_SEL, SEL_UPL);
+ tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
+ tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
+}
+
+int waittime = -1;
+struct pcb dumppcb;
+
+void
+cpu_reboot(int howto, char *bootstr)
+{
+
+ if (cold) {
+ howto |= RB_HALT;
+ goto haltsys;
+ }
+
+ boothowto = howto;
+ if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
+ waittime = 0;
+ vfs_shutdown();
+ /*
+ * If we've been adjusting the clock, the todr
+ * will be out of synch; adjust it now.
+ */
+ if (time_adjusted != 0)
+ resettodr();
+ }
+
+ /* Disable interrupts. */
+ splhigh();
+
+ /* Do a dump if requested. */
+ if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
+ dumpsys();
+
+haltsys:
+ doshutdownhooks();
+
+#ifdef MULTIPROCESSOR
+ x86_broadcast_ipi(X86_IPI_HALT);
+#endif
+
+ if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
+#if NACPI > 0
+ if (acpi_softc != NULL) {
+ delay(500000);
+ acpi_enter_sleep_state(acpi_softc, ACPI_STATE_S5);
+ printf("WARNING: ACPI powerdown failed!\n");
+ }
+#endif
+#if NAPM > 0 && !defined(APM_NO_POWEROFF)
+ /* turn off, if we can. But try to turn disk off and
+ * wait a bit first--some disk drives are slow to clean up
+ * and users have reported disk corruption.
+ */
+ delay(500000);
+ apm_set_powstate(APM_DEV_DISK(0xff), APM_SYS_OFF);
+ delay(500000);
+ apm_set_powstate(APM_DEV_ALLDEVS, APM_SYS_OFF);
+ printf("WARNING: APM powerdown failed!\n");
+ /*
+ * RB_POWERDOWN implies RB_HALT... fall into it...
+ */
+#endif
+ HYPERVISOR_shutdown();
+ }
+
+ if (howto & RB_HALT) {
+ printf("\n");
+ printf("The operating system has halted.\n");
+ printf("Please press any key to reboot.\n\n");
+
+#ifdef BEEP_ONHALT
+ {
+ int c;
+ for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
+ sysbeep(BEEP_ONHALT_PITCH,
+ BEEP_ONHALT_PERIOD * hz / 1000);
+ delay(BEEP_ONHALT_PERIOD * 1000);
+ sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
+ delay(BEEP_ONHALT_PERIOD * 1000);
+ }
+ }
+#endif
+
+ cnpollc(1); /* for proper keyboard command handling */
+ if (cngetc() == 0) {
+ /* no console attached, so just hlt */
+ for(;;) {
+ __asm __volatile("hlt");
+ }
+ }
+ cnpollc(0);
+ }
+
+ printf("rebooting...\n");
+ if (cpureset_delay > 0)
+ delay(cpureset_delay * 1000);
+ cpu_reset();
+ for(;;) ;
+ /*NOTREACHED*/
+}
+
+/*
+ * These variables are needed by /sbin/savecore
+ */
+u_int32_t dumpmag = 0x8fca0101; /* magic number */
+int dumpsize = 0; /* pages */
+long dumplo = 0; /* blocks */
+
+/*
+ * cpu_dumpsize: calculate the size of the machine-dependent kernel core dump
+ * headers, in disk blocks; returns -1 if they will not fit in a single block.
+ */
+int
+cpu_dumpsize()
+{
+ int size;
+
+ size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
+ ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
+ if (roundup(size, dbtob(1)) != dbtob(1))
+ return (-1);
+
+ return (1);
+}
+
+/*
+ * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
+ */
+u_long
+cpu_dump_mempagecnt()
+{
+ u_long i, n;
+
+ n = 0;
+ for (i = 0; i < mem_cluster_cnt; i++)
+ n += atop(mem_clusters[i].size);
+ return (n);
+}
+
+/*
+ * cpu_dump: dump the machine-dependent kernel core dump headers.
+ */
+int
+cpu_dump()
+{
+ int (*dump)(dev_t, daddr_t, caddr_t, size_t);
+ char buf[dbtob(1)];
+ kcore_seg_t *segp;
+ cpu_kcore_hdr_t *cpuhdrp;
+ phys_ram_seg_t *memsegp;
+ const struct bdevsw *bdev;
+ int i;
+
+ bdev = bdevsw_lookup(dumpdev);
+ if (bdev == NULL)
+ return (ENXIO);
+ dump = bdev->d_dump;
+
+ memset(buf, 0, sizeof buf);
+ segp = (kcore_seg_t *)buf;
+ cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
+ memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) +
+ ALIGN(sizeof(*cpuhdrp))];
+
+ /*
+ * Generate a segment header.
+ */
+ CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
+ segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
+
+ /*
+ * Add the machine-dependent header info.
+ */
+ cpuhdrp->ptdpaddr = PTDpaddr;
+ cpuhdrp->nmemsegs = mem_cluster_cnt;
+
+ /*
+ * Fill in the memory segment descriptors.
+ */
+ for (i = 0; i < mem_cluster_cnt; i++) {
+ memsegp[i].start = mem_clusters[i].start;
+ memsegp[i].size = mem_clusters[i].size;
+ }
+
+ return (dump(dumpdev, dumplo, (caddr_t)buf, dbtob(1)));
+}
+
+/*
+ * This is called by main to set dumplo and dumpsize.
+ * Dumps always skip the first PAGE_SIZE of disk space
+ * in case there might be a disk label stored there.
+ * If there is extra space, put dump at the end to
+ * reduce the chance that swapping trashes it.
+ */
+void
+cpu_dumpconf()
+{
+ const struct bdevsw *bdev;
+ int nblks, dumpblks; /* size of dump area */
+
+ if (dumpdev == NODEV)
+ goto bad;
+ bdev = bdevsw_lookup(dumpdev);
+ if (bdev == NULL)
+ panic("dumpconf: bad dumpdev=0x%x", dumpdev);
+ if (bdev->d_psize == NULL)
+ goto bad;
+ nblks = (*bdev->d_psize)(dumpdev);
+ if (nblks <= ctod(1))
+ goto bad;
+
+ dumpblks = cpu_dumpsize();
+ if (dumpblks < 0)
+ goto bad;
+ dumpblks += ctod(cpu_dump_mempagecnt());
+
+ /* If dump won't fit (incl. room for possible label), punt. */
+ if (dumpblks > (nblks - ctod(1)))
+ goto bad;
+
+ /* Put dump at end of partition */
+ dumplo = nblks - dumpblks;
+
+ /* dumpsize is in page units, and doesn't include headers. */
+ dumpsize = cpu_dump_mempagecnt();
+ return;
+
+ bad:
+ dumpsize = 0;
+}
+
+/*
+ * Doadump comes here after turning off memory management and
+ * getting on the dump stack, either when called above, or by
+ * the auto-restart code.
+ */
+#define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */
+static vaddr_t dumpspace;
+
+vaddr_t
+reserve_dumppages(vaddr_t p)
+{
+
+ dumpspace = p;
+ return (p + BYTES_PER_DUMP);
+}
+
+void
+dumpsys()
+{
+ u_long totalbytesleft, bytes, i, n, memseg;
+ u_long maddr;
+ int psize;
+ daddr_t blkno;
+ const struct bdevsw *bdev;
+ int (*dump)(dev_t, daddr_t, caddr_t, size_t);
+ int error;
+
+ /* Save registers. */
+ savectx(&dumppcb);
+
+ if (dumpdev == NODEV)
+ return;
+
+ bdev = bdevsw_lookup(dumpdev);
+ if (bdev == NULL || bdev->d_psize == NULL)
+ return;
+
+ /*
+ * For dumps during autoconfiguration, the dump parameters may
+ * not have been computed yet; do that now if the dump device
+ * has already been configured.
+ */
+ if (dumpsize == 0)
+ cpu_dumpconf();
+ if (dumplo <= 0 || dumpsize == 0) {
+ printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
+ minor(dumpdev));
+ return;
+ }
+ printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
+ minor(dumpdev), dumplo);
+
+ psize = (*bdev->d_psize)(dumpdev);
+ printf("dump ");
+ if (psize == -1) {
+ printf("area unavailable\n");
+ return;
+ }
+
+#if 0 /* XXX this doesn't work. grr. */
+ /* toss any characters present prior to dump */
+ while (sget() != NULL); /*syscons and pccons differ */
+#endif
+
+ if ((error = cpu_dump()) != 0)
+ goto err;
+
+ totalbytesleft = ptoa(cpu_dump_mempagecnt());
+ blkno = dumplo + cpu_dumpsize();
+ dump = bdev->d_dump;
+ error = 0;
+
+ for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
+ maddr = mem_clusters[memseg].start;
+ bytes = mem_clusters[memseg].size;
+
+ for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
+ /* Print out how many MBs we have left to go. */
+ if ((totalbytesleft % (1024*1024)) == 0)
+ printf("%ld ", totalbytesleft / (1024 * 1024));
+
+ /* Limit size for next transfer. */
+ n = bytes - i;
+ if (n > BYTES_PER_DUMP)
+ n = BYTES_PER_DUMP;
+
+ (void) pmap_map(dumpspace, maddr, maddr + n,
+ VM_PROT_READ);
+
+ error = (*dump)(dumpdev, blkno, (caddr_t)dumpspace, n);
+ if (error)
+ goto err;
+ maddr += n;
+ blkno += btodb(n); /* XXX? */
+
+#if 0 /* XXX this doesn't work. grr. */
+ /* operator aborting dump? */
+ if (sget() != NULL) {
+ error = EINTR;
+ break;
+ }
+#endif
+ }
+ }
+
+ err:
+ switch (error) {
+
+ case ENXIO:
+ printf("device bad\n");
+ break;
+
+ case EFAULT:
+ printf("device not ready\n");
+ break;
+
+ case EINVAL:
+ printf("area improper\n");
+ break;
+
+ case EIO:
+ printf("i/o error\n");
+ break;
+
+ case EINTR:
+ printf("aborted from console\n");
+ break;
+
+ case 0:
+ printf("succeeded\n");
+ break;
+
+ default:
+ printf("error %d\n", error);
+ break;
+ }
+ printf("\n\n");
+ delay(5000000); /* 5 seconds */
+}
+
+/*
+ * Clear registers on exec
+ */
+void
+setregs(struct lwp *l, struct exec_package *pack, u_long stack)
+{
+ struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+ struct pcb *pcb = &l->l_addr->u_pcb;
+ struct trapframe *tf;
+
+#if NNPX > 0
+ /* If we were using the FPU, forget about it. */
+ if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
+ npxsave_lwp(l, 0);
+#endif
+
+#ifdef USER_LDT
+ pmap_ldt_cleanup(l);
+#endif
+
+ l->l_md.md_flags &= ~MDL_USEDFPU;
+ if (i386_use_fxsave) {
+ pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __NetBSD_NPXCW__;
+ pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
+ } else
+ pcb->pcb_savefpu.sv_87.sv_env.en_cw = __NetBSD_NPXCW__;
+
+ tf = l->l_md.md_regs;
+ tf->tf_gs = LSEL(LUDATA_SEL, SEL_UPL);
+ tf->tf_fs = LSEL(LUDATA_SEL, SEL_UPL);
+ tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
+ tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
+ tf->tf_edi = 0;
+ tf->tf_esi = 0;
+ tf->tf_ebp = 0;
+ tf->tf_ebx = (int)l->l_proc->p_psstr;
+ tf->tf_edx = 0;
+ tf->tf_ecx = 0;
+ tf->tf_eax = 0;
+ tf->tf_eip = pack->ep_entry;
+ tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
+ LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
+ tf->tf_eflags = PSL_USERSET;
+ tf->tf_esp = stack;
+ tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
+}
+
+/*
+ * Initialize segments and descriptor tables
+ */
+
+union descriptor *gdt, *ldt;
+struct gate_descriptor *idt;
+char idt_allocmap[NIDT];
+struct simplelock idt_lock = SIMPLELOCK_INITIALIZER;
+#ifdef I586_CPU
+union descriptor *pentium_idt;
+#endif
+extern struct user *proc0paddr;
+
+void
+setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
+ int sel)
+{
+
+ gd->gd_looffset = (int)func;
+ gd->gd_selector = sel;
+ gd->gd_stkcpy = args;
+ gd->gd_xx = 0;
+ gd->gd_type = type;
+ gd->gd_dpl = dpl;
+ gd->gd_p = 1;
+ gd->gd_hioffset = (int)func >> 16;
+}
+
+void
+unsetgate(struct gate_descriptor *gd)
+{
+ gd->gd_p = 0;
+ gd->gd_hioffset = 0;
+ gd->gd_looffset = 0;
+ gd->gd_selector = 0;
+ gd->gd_xx = 0;
+ gd->gd_stkcpy = 0;
+ gd->gd_type = 0;
+ gd->gd_dpl = 0;
+}
+
+
+void
+setregion(struct region_descriptor *rd, void *base, size_t limit)
+{
+
+ rd->rd_limit = (int)limit;
+ rd->rd_base = (int)base;
+}
+
+void
+setsegment(struct segment_descriptor *sd, void *base, size_t limit, int type,
+ int dpl, int def32, int gran)
+{
+
+ sd->sd_lolimit = (int)limit;
+ sd->sd_lobase = (int)base;
+ sd->sd_type = type;
+ sd->sd_dpl = dpl;
+ sd->sd_p = 1;
+ sd->sd_hilimit = (int)limit >> 16;
+ sd->sd_xx = 0;
+ sd->sd_def32 = def32;
+ sd->sd_gran = gran;
+ sd->sd_hibase = (int)base >> 24;
+}
+
+#define IDTVEC(name) __CONCAT(X, name)
+typedef void (vector)(void);
+extern vector IDTVEC(syscall);
+extern vector IDTVEC(osyscall);
+extern vector *IDTVEC(exceptions)[];
+#ifdef COMPAT_SVR4
+extern vector IDTVEC(svr4_fasttrap);
+#endif /* COMPAT_SVR4 */
+#ifdef COMPAT_MACH
+extern vector IDTVEC(mach_trap);
+#endif
+#define MAX_XEN_IDT 128
+trap_info_t xen_idt[MAX_XEN_IDT];
+int xen_idt_idx;
+
+#define KBTOB(x) ((size_t)(x) * 1024UL)
+
+void cpu_init_idt()
+{
+ struct region_descriptor region;
+
+ panic("cpu_init_idt");
+#ifdef I586_CPU
+ setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
+#else
+ setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
+#endif
+ lidt(&region);
+}
+
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+void
+add_mem_cluster(u_int64_t seg_start, u_int64_t seg_end, u_int32_t type)
+{
+ extern struct extent *iomem_ex;
+ int i;
+
+ if (seg_end > 0x100000000ULL) {
+ printf("WARNING: skipping large "
+ "memory map entry: "
+ "0x%qx/0x%qx/0x%x\n",
+ seg_start,
+ (seg_end - seg_start),
+ type);
+ return;
+ }
+
+ /*
+ * XXX Chop the last page off the size so that
+ * XXX it can fit in avail_end.
+ */
+ if (seg_end == 0x100000000ULL)
+ seg_end -= PAGE_SIZE;
+
+ if (seg_end <= seg_start)
+ return;
+
+ for (i = 0; i < mem_cluster_cnt; i++) {
+ if ((mem_clusters[i].start == round_page(seg_start))
+ && (mem_clusters[i].size
+ == trunc_page(seg_end) - mem_clusters[i].start)) {
+#ifdef DEBUG_MEMLOAD
+ printf("WARNING: skipping duplicate segment entry\n");
+#endif
+ return;
+ }
+ }
+
+ /*
+ * Allocate the physical addresses used by RAM
+ * from the iomem extent map. This is done before
+ * the addresses are page rounded just to make
+ * sure we get them all.
+ */
+ if (extent_alloc_region(iomem_ex, seg_start,
+ seg_end - seg_start, EX_NOWAIT)) {
+ /* XXX What should we do? */
+ printf("WARNING: CAN'T ALLOCATE "
+ "MEMORY SEGMENT "
+ "(0x%qx/0x%qx/0x%x) FROM "
+ "IOMEM EXTENT MAP!\n",
+ seg_start, seg_end - seg_start, type);
+ return;
+ }
+
+ /*
+ * If it's not free memory, skip it.
+ */
+ if (type != BIM_Memory)
+ return;
+
+ /* XXX XXX XXX */
+ if (mem_cluster_cnt >= VM_PHYSSEG_MAX)
+ panic("init386: too many memory segments");
+
+ seg_start = round_page(seg_start);
+ seg_end = trunc_page(seg_end);
+
+ if (seg_start == seg_end)
+ return;
+
+ mem_clusters[mem_cluster_cnt].start = seg_start;
+ mem_clusters[mem_cluster_cnt].size =
+ seg_end - seg_start;
+
+ if (avail_end < seg_end)
+ avail_end = seg_end;
+ physmem += atop(mem_clusters[mem_cluster_cnt].size);
+ mem_cluster_cnt++;
+}
+#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
+
+void
+initgdt()
+{
+#if !defined(XEN)
+ struct region_descriptor region;
+#else
+ paddr_t frames[16];
+#endif
+
+#if !defined(XEN)
+ gdt = tgdt;
+ memset(gdt, 0, NGDT*sizeof(*gdt));
+#endif
+ /* make gdt gates and memory segments */
+ setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 1, 1);
+ setsegment(&gdt[GDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 1, 1);
+ setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
+ SDT_MEMERA, SEL_UPL, 1, 1);
+ setsegment(&gdt[GUCODEBIG_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
+ SDT_MEMERA, SEL_UPL, 1, 1);
+ setsegment(&gdt[GUDATA_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
+ SDT_MEMRWA, SEL_UPL, 1, 1);
+#ifdef COMPAT_MACH
+ setgate(&gdt[GMACHCALLS_SEL].gd, &IDTVEC(mach_trap), 1,
+ SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+#endif
+#if NBIOSCALL > 0
+ /* bios trampoline GDT entries */
+ setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 0,
+ 0);
+ setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 0,
+ 0);
+#endif
+ setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary,
+ sizeof(struct cpu_info)-1, SDT_MEMRWA, SEL_KPL, 1, 1);
+
+#if !defined(XEN)
+ setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
+ lgdt(&region);
+#else
+ frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
+ /* pmap_kremove((vaddr_t)gdt, PAGE_SIZE); */
+ pmap_kenter_pa((vaddr_t)gdt, (uint32_t)gdt - KERNBASE,
+ VM_PROT_READ);
+ XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
+ LAST_RESERVED_GDT_ENTRY + 1));
+ if (HYPERVISOR_set_gdt(frames, LAST_RESERVED_GDT_ENTRY + 1))
+ panic("HYPERVISOR_set_gdt failed!\n");
+ lgdt_finish();
+#endif
+}
+
+void
+init386(paddr_t first_avail)
+{
+#if !defined(XEN)
+ union descriptor *tgdt;
+#endif
+ extern void consinit(void);
+#if !defined(XEN)
+ extern struct extent *iomem_ex;
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+ struct btinfo_memmap *bim;
+#endif
+ struct region_descriptor region;
+#endif
+ int x;
+#if !defined(XEN)
+ int first16q;
+ u_int64_t seg_start, seg_end;
+ u_int64_t seg_start1, seg_end1;
+#endif
+ paddr_t realmode_reserved_start;
+ psize_t realmode_reserved_size;
+ int needs_earlier_install_pte0;
+#if NBIOSCALL > 0
+ extern int biostramp_image_size;
+ extern u_char biostramp_image[];
+#endif
+
+ XENPRINTK(("HYPERVISOR_shared_info %p\n", HYPERVISOR_shared_info));
+#ifdef XENDEBUG_LOW
+ xen_dbglow_init();
+#endif
+
+ cpu_probe_features(&cpu_info_primary);
+ cpu_feature = cpu_info_primary.ci_feature_flags;
+
+ /* not on Xen... */
+ cpu_feature &= ~(CPUID_PGE|CPUID_PSE|CPUID_MTRR|CPUID_FXSR);
+
+ lwp0.l_addr = proc0paddr;
+ cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb;
+
+ XENPRINTK(("proc0paddr %p pcb %p first_avail %p\n",
+ proc0paddr, cpu_info_primary.ci_curpcb, (void *)first_avail));
+ XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PTDpaddr,
+ (void *)atdevbase));
+
+ x86_bus_space_init();
+ consinit(); /* XXX SHOULD NOT BE DONE HERE */
+ /*
+ * Initialize PAGE_SIZE-dependent variables.
+ */
+ uvm_setpagesize();
+
+ /*
+ * Saving SSE registers won't work if the save area isn't
+ * 16-byte aligned.
+ */
+ if (offsetof(struct user, u_pcb.pcb_savefpu) & 0xf)
+ panic("init386: pcb_savefpu not 16-byte aligned");
+
+ /*
+ * Start with 2 color bins -- this is just a guess to get us
+ * started. We'll recolor when we determine the largest cache
+ * sizes on the system.
+ */
+ uvmexp.ncolors = 2;
+
+#if !defined(XEN)
+ /*
+ * BIOS leaves data in physical page 0.
+ * Even if it didn't, our VM system doesn't like using zero as a
+ * physical page number.
+ * We may also need pages in low memory (one each) for secondary CPU
+ * startup, for BIOS calls, and for ACPI, plus a page table page to map
+ * them into the first few pages of the kernel's pmap.
+ */
+ avail_start = PAGE_SIZE;
+#else
+ /* Make sure the end of the space used by the kernel is rounded. */
+ first_avail = round_page(first_avail);
+ avail_start = first_avail - KERNBASE;
+ avail_end = ptoa(xen_start_info.nr_pages) +
+ (KERNTEXTOFF - KERNBASE_LOCORE);
+ pmap_pa_start = (KERNTEXTOFF - KERNBASE_LOCORE);
+ pmap_pa_end = avail_end;
+ mem_clusters[0].start = avail_start;
+ mem_clusters[0].size = avail_end - avail_start;
+ mem_cluster_cnt++;
+ physmem += atop(mem_clusters[0].size);
+#endif
+
+ /*
+ * reserve memory for real-mode call
+ */
+ needs_earlier_install_pte0 = 0;
+ realmode_reserved_start = 0;
+ realmode_reserved_size = 0;
+#if NBIOSCALL > 0
+ /* save us a page for trampoline code */
+ realmode_reserved_size += PAGE_SIZE;
+ needs_earlier_install_pte0 = 1;
+#endif
+#ifdef MULTIPROCESSOR /* XXX */
+#if !defined(XEN)
+ KASSERT(avail_start == PAGE_SIZE); /* XXX */
+#endif
+ if (realmode_reserved_size < MP_TRAMPOLINE) /* XXX */
+ realmode_reserved_size = MP_TRAMPOLINE; /* XXX */
+ needs_earlier_install_pte0 = 1; /* XXX */
+#endif /* XXX */
+#if NACPI > 0
+ /* trampoline code for wake handler */
+ realmode_reserved_size += ptoa(acpi_md_get_npages_of_wakecode()+1);
+ needs_earlier_install_pte0 = 1;
+#endif
+ if (needs_earlier_install_pte0) {
+ /* page table for directory entry 0 */
+ realmode_reserved_size += PAGE_SIZE;
+ }
+ if (realmode_reserved_size>0) {
+ realmode_reserved_start = avail_start;
+ avail_start += realmode_reserved_size;
+ }
+
+#ifdef DEBUG_MEMLOAD
+ printf("mem_cluster_count: %d\n", mem_cluster_cnt);
+#endif
+
+ /*
+ * Call pmap initialization to make new kernel address space.
+ * We must do this before loading pages into the VM system.
+ */
+ pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);
+
+#if !defined(XEN)
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+ /*
+ * Check to see if we have a memory map from the BIOS (passed
+ * to us by the boot program).
+ */
+ bim = lookup_bootinfo(BTINFO_MEMMAP);
+ if (bim != NULL && bim->num > 0) {
+#ifdef DEBUG_MEMLOAD
+ printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num);
+#endif
+ for (x = 0; x < bim->num; x++) {
+#ifdef DEBUG_MEMLOAD
+ printf(" addr 0x%qx size 0x%qx type 0x%x\n",
+ bim->entry[x].addr,
+ bim->entry[x].size,
+ bim->entry[x].type);
+#endif
+
+ /*
+ * If the segment is not memory, skip it.
+ */
+ switch (bim->entry[x].type) {
+ case BIM_Memory:
+ case BIM_ACPI:
+ case BIM_NVS:
+ break;
+ default:
+ continue;
+ }
+
+ /*
+ * Sanity check the entry.
+ * XXX Need to handle uint64_t in extent code
+ * XXX and 64-bit physical addresses in i386
+ * XXX port.
+ */
+ seg_start = bim->entry[x].addr;
+ seg_end = bim->entry[x].addr + bim->entry[x].size;
+
+ /*
+ * Avoid Compatibility Holes.
+ * XXX Holes within memory space that allow access
+ * XXX to be directed to the PC-compatible frame buffer
+ * XXX (0xa0000-0xbffff), to adapter ROM space
+ * XXX (0xc0000-0xdffff), and to system BIOS space
+ * XXX (0xe0000-0xfffff).
+ * XXX Some laptops (for example, the Toshiba Satellite 2550X)
+ * XXX report this area, which has caused problems,
+ * XXX so we avoid it.
+ */
+ if (seg_start < 0x100000 && seg_end > 0xa0000) {
+ printf("WARNING: memory map entry overlaps "
+ "with ``Compatibility Holes'': "
+ "0x%qx/0x%qx/0x%x\n", seg_start,
+ seg_end - seg_start, bim->entry[x].type);
+ add_mem_cluster(seg_start, 0xa0000,
+ bim->entry[x].type);
+ add_mem_cluster(0x100000, seg_end,
+ bim->entry[x].type);
+ } else
+ add_mem_cluster(seg_start, seg_end,
+ bim->entry[x].type);
+ }
+ }
+#endif /* ! REALBASEMEM && ! REALEXTMEM */
+ /*
+ * If the loop above didn't find any valid segment, fall back to
+ * the old code, which uses the biosbasemem/biosextmem values.
+ */
+ if (mem_cluster_cnt == 0) {
+ /*
+ * Allocate the physical addresses used by RAM from the iomem
+ * extent map. This is done before the addresses are
+ * page rounded just to make sure we get them all.
+ */
+ if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem),
+ EX_NOWAIT)) {
+ /* XXX What should we do? */
+ printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
+ "IOMEM EXTENT MAP!\n");
+ }
+ mem_clusters[0].start = 0;
+ mem_clusters[0].size = trunc_page(KBTOB(biosbasemem));
+ physmem += atop(mem_clusters[0].size);
+ if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
+ EX_NOWAIT)) {
+ /* XXX What should we do? */
+ printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
+ "IOMEM EXTENT MAP!\n");
+ }
+#if NISADMA > 0
+ /*
+ * Some motherboards/BIOSes remap the 384K of RAM that would
+ * normally be covered by the ISA hole to the end of memory
+ * so that it can be used. However, on a 16M system, this
+ * would cause bounce buffers to be allocated and used.
+ * This is not desirable behaviour, as more than 384K of
+ * bounce buffers might be allocated. As a work-around,
+ * we round memory down to the nearest 1M boundary if
+ * we're using any isadma devices and the remapped memory
+ * is what puts us over 16M.
+ */
+ if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
+ char pbuf[9];
+
+ format_bytes(pbuf, sizeof(pbuf),
+ biosextmem - (15*1024));
+ printf("Warning: ignoring %s of remapped memory\n",
+ pbuf);
+ biosextmem = (15*1024);
+ }
+#endif
+ mem_clusters[1].start = IOM_END;
+ mem_clusters[1].size = trunc_page(KBTOB(biosextmem));
+ physmem += atop(mem_clusters[1].size);
+
+ mem_cluster_cnt = 2;
+
+ avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
+ }
+ /*
+ * If we have 16M of RAM or less, just put it all on
+ * the default free list. Otherwise, put the first
+ * 16M of RAM on a lower priority free list (so that
+ * all of the ISA DMA'able memory won't be eaten up
+ * first-off).
+ */
+ if (avail_end <= (16 * 1024 * 1024))
+ first16q = VM_FREELIST_DEFAULT;
+ else
+ first16q = VM_FREELIST_FIRST16;
+
+ /* Make sure the end of the space used by the kernel is rounded. */
+ first_avail = round_page(first_avail);
+#endif
+
+ XENPRINTK(("load the memory cluster %p(%d) - %p(%ld)\n",
+ (void *)avail_start, (int)atop(avail_start),
+ (void *)avail_end, (int)atop(avail_end)));
+ uvm_page_physload(atop(avail_start), atop(avail_end),
+ atop(avail_start), atop(avail_end),
+ VM_FREELIST_DEFAULT);
+
+#if !defined(XEN)
+
+ /*
+ * Now, load the memory clusters (which have already been
+ * rounded and truncated) into the VM system.
+ *
+ * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL
+ * IS LOADED AT IOM_END (1M).
+ */
+ for (x = 0; x < mem_cluster_cnt; x++) {
+ seg_start = mem_clusters[x].start;
+ seg_end = mem_clusters[x].start + mem_clusters[x].size;
+ seg_start1 = 0;
+ seg_end1 = 0;
+
+ /*
+ * Skip memory before our available starting point.
+ */
+ if (seg_end <= avail_start)
+ continue;
+
+ if (avail_start >= seg_start && avail_start < seg_end) {
+ if (seg_start != 0)
+ panic("init386: memory doesn't start at 0");
+ seg_start = avail_start;
+ if (seg_start == seg_end)
+ continue;
+ }
+
+ /*
+ * If this segment contains the kernel, split it
+ * in two, around the kernel.
+ */
+ if (seg_start <= IOM_END && first_avail <= seg_end) {
+ seg_start1 = first_avail;
+ seg_end1 = seg_end;
+ seg_end = IOM_END;
+ }
+
+ /* First hunk */
+ if (seg_start != seg_end) {
+ if (seg_start < (16 * 1024 * 1024) &&
+ first16q != VM_FREELIST_DEFAULT) {
+ u_int64_t tmp;
+
+ if (seg_end > (16 * 1024 * 1024))
+ tmp = (16 * 1024 * 1024);
+ else
+ tmp = seg_end;
+
+ if (tmp != seg_start) {
+#ifdef DEBUG_MEMLOAD
+ printf("loading 0x%qx-0x%qx "
+ "(0x%lx-0x%lx)\n",
+ seg_start, tmp,
+ atop(seg_start), atop(tmp));
+#endif
+ uvm_page_physload(atop(seg_start),
+ atop(tmp), atop(seg_start),
+ atop(tmp), first16q);
+ }
+ seg_start = tmp;
+ }
+
+ if (seg_start != seg_end) {
+#ifdef DEBUG_MEMLOAD
+ printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
+ seg_start, seg_end,
+ atop(seg_start), atop(seg_end));
+#endif
+ uvm_page_physload(atop(seg_start),
+ atop(seg_end), atop(seg_start),
+ atop(seg_end), VM_FREELIST_DEFAULT);
+ }
+ }
+
+ /* Second hunk */
+ if (seg_start1 != seg_end1) {
+ if (seg_start1 < (16 * 1024 * 1024) &&
+ first16q != VM_FREELIST_DEFAULT) {
+ u_int64_t tmp;
+
+ if (seg_end1 > (16 * 1024 * 1024))
+ tmp = (16 * 1024 * 1024);
+ else
+ tmp = seg_end1;
+
+ if (tmp != seg_start1) {
+#ifdef DEBUG_MEMLOAD
+ printf("loading 0x%qx-0x%qx "
+ "(0x%lx-0x%lx)\n",
+ seg_start1, tmp,
+ atop(seg_start1), atop(tmp));
+#endif
+ uvm_page_physload(atop(seg_start1),
+ atop(tmp), atop(seg_start1),
+ atop(tmp), first16q);
+ }
+ seg_start1 = tmp;
+ }
+
+ if (seg_start1 != seg_end1) {
+#ifdef DEBUG_MEMLOAD
+ printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
+ seg_start1, seg_end1,
+ atop(seg_start1), atop(seg_end1));
+#endif
+ uvm_page_physload(atop(seg_start1),
+ atop(seg_end1), atop(seg_start1),
+ atop(seg_end1), VM_FREELIST_DEFAULT);
+ }
+ }
+ }
+#endif
+
+ /*
+ * Steal memory for the message buffer (at end of core).
+ */
+ {
+ struct vm_physseg *vps;
+ psize_t sz = round_page(MSGBUFSIZE);
+ psize_t reqsz = sz;
+
+ for (x = 0; x < vm_nphysseg; x++) {
+ vps = &vm_physmem[x];
+ if (ptoa(vps->avail_end) == avail_end)
+ goto found;
+ }
+ panic("init386: can't find end of memory");
+
+ found:
+ /* Shrink so it'll fit in the last segment. */
+ if ((vps->avail_end - vps->avail_start) < atop(sz))
+ sz = ptoa(vps->avail_end - vps->avail_start);
+
+ vps->avail_end -= atop(sz);
+ vps->end -= atop(sz);
+ msgbuf_paddr = ptoa(vps->avail_end);
+
+ /* Remove the last segment if it now has no pages. */
+ if (vps->start == vps->end) {
+ for (vm_nphysseg--; x < vm_nphysseg; x++)
+ vm_physmem[x] = vm_physmem[x + 1];
+ }
+
+ /* Now find where the new avail_end is. */
+ for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
+ if (vm_physmem[x].avail_end > avail_end)
+ avail_end = vm_physmem[x].avail_end;
+ avail_end = ptoa(avail_end);
+
+ /* Warn if the message buffer had to be shrunk. */
+ if (sz != reqsz)
+ printf("WARNING: %ld bytes not available for msgbuf "
+ "in last cluster (%ld used)\n", reqsz, sz);
+ }
+
+ /*
+ * install PT page for the first 4M if needed.
+ */
+ if (needs_earlier_install_pte0) {
+ paddr_t paddr;
+#ifdef DIAGNOSTIC
+ if (realmode_reserved_size < PAGE_SIZE) {
+ panic("cannot steal memory for first 4M PT page.");
+ }
+#endif
+ paddr=realmode_reserved_start+realmode_reserved_size-PAGE_SIZE;
+ pmap_enter(pmap_kernel(), (vaddr_t)vtopte(0), paddr,
+ VM_PROT_READ|VM_PROT_WRITE,
+ PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+ /* make sure it is clean before using */
+ memset(vtopte(0), 0, PAGE_SIZE);
+ realmode_reserved_size -= PAGE_SIZE;
+ }
+
+#if NBIOSCALL > 0
+ /*
+ * this should be caught at kernel build time, but put it here
+ * in case someone tries to fake it out...
+ */
+#ifdef DIAGNOSTIC
+ if (realmode_reserved_start > BIOSTRAMP_BASE ||
+ (realmode_reserved_start+realmode_reserved_size) < (BIOSTRAMP_BASE+
+ PAGE_SIZE)) {
+ panic("cannot steal memory for PT page of bioscall.");
+ }
+ if (biostramp_image_size > PAGE_SIZE)
+ panic("biostramp_image_size too big: %x vs. %x",
+ biostramp_image_size, PAGE_SIZE);
+#endif
+ pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, /* virtual */
+ (paddr_t)BIOSTRAMP_BASE, /* physical */
+ VM_PROT_ALL); /* protection */
+ pmap_update(pmap_kernel());
+ memcpy((caddr_t)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);
+#ifdef DEBUG_BIOSCALL
+ printf("biostramp installed @ %x\n", BIOSTRAMP_BASE);
+#endif
+ realmode_reserved_size -= PAGE_SIZE;
+ realmode_reserved_start += PAGE_SIZE;
+#endif
+
+#if NACPI > 0
+ /*
+ * Steal memory for the acpi wake code
+ */
+ {
+ paddr_t paddr, p;
+ psize_t sz;
+ int npg;
+
+ paddr = realmode_reserved_start;
+ npg = acpi_md_get_npages_of_wakecode();
+ sz = ptoa(npg);
+#ifdef DIAGNOSTIC
+ if (realmode_reserved_size < sz) {
+ panic("cannot steal memory for ACPI wake code.");
+ }
+#endif
+
+ /* identical mapping */
+ p = paddr;
+ for (x=0; x<npg; x++) {
+ printf("kenter: 0x%08X\n", (unsigned)p);
+ pmap_kenter_pa((vaddr_t)p, p, VM_PROT_ALL);
+ p += PAGE_SIZE;
+ }
+ pmap_update(pmap_kernel());
+
+ acpi_md_install_wakecode(paddr);
+
+ realmode_reserved_size -= sz;
+ realmode_reserved_start += sz;
+ }
+#endif
+
+ pmap_enter(pmap_kernel(), idt_vaddr, idt_paddr,
+ VM_PROT_READ|VM_PROT_WRITE, PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+ memset((void *)idt_vaddr, 0, PAGE_SIZE);
+
+#if !defined(XEN)
+ idt = (struct gate_descriptor *)idt_vaddr;
+#ifdef I586_CPU
+ pmap_enter(pmap_kernel(), pentium_idt_vaddr, idt_paddr,
+ VM_PROT_READ, PMAP_WIRED|VM_PROT_READ);
+ pentium_idt = (union descriptor *)pentium_idt_vaddr;
+#endif
+#endif
+ pmap_update(pmap_kernel());
+
+ initgdt();
+
+ HYPERVISOR_set_callbacks(
+ GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
+ GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
+
+#if !defined(XEN)
+ tgdt = gdt;
+ gdt = (union descriptor *)
+ ((char *)idt + NIDT * sizeof (struct gate_descriptor));
+ ldt = gdt + NGDT;
+
+ memcpy(gdt, tgdt, NGDT*sizeof(*gdt));
+
+ setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1,
+ SDT_SYSLDT, SEL_KPL, 0, 0);
+#else
+ ldt = (union descriptor *)idt_vaddr;
+#endif
+
+ /* make ldt gates and memory segments */
+ setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1,
+ SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+
+ ldt[LUCODE_SEL] = gdt[GUCODE_SEL];
+ ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL];
+ ldt[LUDATA_SEL] = gdt[GUDATA_SEL];
+ ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
+
+#if !defined(XEN)
+ /* exceptions */
+ for (x = 0; x < 32; x++) {
+ setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386TGT,
+ (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ idt_allocmap[x] = 1;
+ }
+
+ /* new-style interrupt gate for syscalls */
+ setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386TGT, SEL_UPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ idt_allocmap[128] = 1;
+#ifdef COMPAT_SVR4
+ setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386TGT,
+ SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+ idt_allocmap[0xd2] = 1;
+#endif /* COMPAT_SVR4 */
+#endif
+
+ memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT);
+ xen_idt_idx = 0;
+ for (x = 0; x < 32; x++) {
+ KASSERT(xen_idt_idx < MAX_XEN_IDT);
+ xen_idt[xen_idt_idx].vector = x;
+ xen_idt[xen_idt_idx].flags =
+ (x == 3 || x == 4) ? SEL_UPL : SEL_XEN;
+ xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
+ xen_idt[xen_idt_idx].address =
+ (uint32_t)IDTVEC(exceptions)[x];
+ xen_idt_idx++;
+ }
+ KASSERT(xen_idt_idx < MAX_XEN_IDT);
+ xen_idt[xen_idt_idx].vector = 128;
+ xen_idt[xen_idt_idx].flags = SEL_UPL;
+ xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
+ xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall);
+ xen_idt_idx++;
+#ifdef COMPAT_SVR4
+ KASSERT(xen_idt_idx < MAX_XEN_IDT);
+ xen_idt[xen_idt_idx].vector = 0xd2;
+ xen_idt[xen_idt_idx].flags = SEL_UPL;
+ xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
+ xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap);
+ xen_idt_idx++;
+#endif /* COMPAT_SVR4 */
+
+#if !defined(XEN)
+ setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
+ lgdt(&region);
+#else
+ lldt(GSEL(GLDT_SEL, SEL_KPL));
+#endif
+
+#if !defined(XEN)
+ cpu_init_idt();
+#else
+ db_trap_callback = ddb_trap_hook;
+
+ XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt));
+ if (HYPERVISOR_set_trap_table(xen_idt))
+ panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt);
+#endif
+
+#if NKSYMS || defined(DDB) || defined(LKM)
+ {
+ extern int end;
+ extern int *esym;
+ struct btinfo_symtab *symtab;
+
+#ifdef DDB
+ db_machine_init();
+#endif
+
+ symtab = lookup_bootinfo(BTINFO_SYMTAB);
+
+ if (symtab) {
+ symtab->ssym += KERNBASE;
+ symtab->esym += KERNBASE;
+ ksyms_init(symtab->nsym, (int *)symtab->ssym,
+ (int *)symtab->esym);
+ }
+ else
+ ksyms_init(*(int *)&end, ((int *)&end) + 1, esym);
+ }
+#endif
+#ifdef DDB
+ if (boothowto & RB_KDB)
+ Debugger();
+#endif
+#ifdef IPKDB
+ ipkdb_init();
+ if (boothowto & RB_KDB)
+ ipkdb_connect(0);
+#endif
+#ifdef KGDB
+ kgdb_port_init();
+ if (boothowto & RB_KDB) {
+ kgdb_debug_init = 1;
+ kgdb_connect(1);
+ }
+#endif
+
+#if NMCA > 0
+ /* check for MCA bus; this needs to be done before ISA stuff - if
+ * MCA is detected, ISA needs to use level triggered interrupts
+ * by default */
+ mca_busprobe();
+#endif
+
+#if defined(XEN)
+ events_default_setup();
+#else
+ intr_default_setup();
+#endif
+
+ /* Initialize software interrupts. */
+ softintr_init();
+
+ splraise(IPL_IPI);
+ enable_intr();
+
+ if (physmem < btoc(2 * 1024 * 1024)) {
+ printf("warning: too little memory available; "
+ "have %lu bytes, want %lu bytes\n"
+ "running in degraded mode\n"
+ "press a key to confirm\n\n",
+ ptoa(physmem), 2*1024*1024UL);
+ cngetc();
+ }
+
+#ifdef __HAVE_CPU_MAXPROC
+ /* Make sure maxproc is sane */
+ if (maxproc > cpu_maxproc())
+ maxproc = cpu_maxproc();
+#endif
+}
+
+#ifdef COMPAT_NOMID
+static int
+exec_nomid(struct proc *p, struct exec_package *epp)
+{
+ int error;
+ u_long midmag, magic;
+ u_short mid;
+ struct exec *execp = epp->ep_hdr;
+
+ /* check on validity of epp->ep_hdr performed by exec_out_makecmds */
+
+ midmag = ntohl(execp->a_midmag);
+ mid = (midmag >> 16) & 0xffff;
+ magic = midmag & 0xffff;
+
+ if (magic == 0) {
+ magic = (execp->a_midmag & 0xffff);
+ mid = MID_ZERO;
+ }
+
+ midmag = mid << 16 | magic;
+
+ switch (midmag) {
+ case (MID_ZERO << 16) | ZMAGIC:
+ /*
+ * 386BSD's ZMAGIC format:
+ */
+ error = exec_aout_prep_oldzmagic(p, epp);
+ break;
+
+ case (MID_ZERO << 16) | QMAGIC:
+ /*
+ * BSDI's QMAGIC format:
+ * same as new ZMAGIC format, but with different magic number
+ */
+ error = exec_aout_prep_zmagic(p, epp);
+ break;
+
+ case (MID_ZERO << 16) | NMAGIC:
+ /*
+ * BSDI's NMAGIC format:
+ * same as the standard NMAGIC format, but with a different
+ * magic number and with text starting at 0.
+ */
+ error = exec_aout_prep_oldnmagic(p, epp);
+ break;
+
+ case (MID_ZERO << 16) | OMAGIC:
+ /*
+ * BSDI's OMAGIC format:
+ * same as the standard OMAGIC format, but with a different
+ * magic number and with text starting at 0.
+ */
+ error = exec_aout_prep_oldomagic(p, epp);
+ break;
+
+ default:
+ error = ENOEXEC;
+ }
+
+ return error;
+}
+#endif
+
+/*
+ * cpu_exec_aout_makecmds():
+ * CPU-dependent a.out format hook for execve().
+ *
+ * Determine if the given exec package refers to something which we
+ * understand and, if so, set up the vmcmds for it.
+ *
+ * On the i386, old (386bsd) ZMAGIC binaries and BSDI QMAGIC binaries
+ * are recognized if COMPAT_NOMID is given as a kernel option.
+ */
+int
+cpu_exec_aout_makecmds(struct proc *p, struct exec_package *epp)
+{
+ int error = ENOEXEC;
+
+#ifdef COMPAT_NOMID
+ if ((error = exec_nomid(p, epp)) == 0)
+ return error;
+#endif /* COMPAT_NOMID */
+
+ return error;
+}
+
+void *
+lookup_bootinfo(int type)
+{
+ struct btinfo_common *help;
+ int n = *(int *)bootinfo;
+
+ help = (struct btinfo_common *)(bootinfo + sizeof(int));
+ while (n--) {
+ if (help->type == type)
+ return (help);
+ help = (struct btinfo_common *)((char *)help + help->len);
+ }
+ return (NULL);
+}
+
+#include <dev/ic/mc146818reg.h> /* for NVRAM POST */
+#include <i386/isa/nvram.h> /* for NVRAM POST */
+
+void
+cpu_reset()
+{
+
+ disable_intr();
+
+#if 0
+ /*
+ * Ensure the NVRAM reset byte contains something vaguely sane.
+ */
+
+ outb(IO_RTC, NVRAM_RESET);
+ outb(IO_RTC+1, NVRAM_RESET_RST);
+
+ /*
+ * The keyboard controller has 4 random output pins, one of which is
+ * connected to the RESET pin on the CPU in many PCs. We tell the
+ * keyboard controller to pulse this line a couple of times.
+ */
+ outb(IO_KBD + KBCMDP, KBC_PULSE0);
+ delay(100000);
+ outb(IO_KBD + KBCMDP, KBC_PULSE0);
+ delay(100000);
+#endif
+
+ HYPERVISOR_reboot();
+
+ for (;;);
+}
+
+void
+cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
+{
+ const struct trapframe *tf = l->l_md.md_regs;
+ __greg_t *gr = mcp->__gregs;
+ __greg_t ras_eip;
+
+ /* Save register context. */
+#ifdef VM86
+ if (tf->tf_eflags & PSL_VM) {
+ gr[_REG_GS] = tf->tf_vm86_gs;
+ gr[_REG_FS] = tf->tf_vm86_fs;
+ gr[_REG_ES] = tf->tf_vm86_es;
+ gr[_REG_DS] = tf->tf_vm86_ds;
+ gr[_REG_EFL] = get_vflags(l);
+ } else
+#endif
+ {
+ gr[_REG_GS] = tf->tf_gs;
+ gr[_REG_FS] = tf->tf_fs;
+ gr[_REG_ES] = tf->tf_es;
+ gr[_REG_DS] = tf->tf_ds;
+ gr[_REG_EFL] = tf->tf_eflags;
+ }
+ gr[_REG_EDI] = tf->tf_edi;
+ gr[_REG_ESI] = tf->tf_esi;
+ gr[_REG_EBP] = tf->tf_ebp;
+ gr[_REG_EBX] = tf->tf_ebx;
+ gr[_REG_EDX] = tf->tf_edx;
+ gr[_REG_ECX] = tf->tf_ecx;
+ gr[_REG_EAX] = tf->tf_eax;
+ gr[_REG_EIP] = tf->tf_eip;
+ gr[_REG_CS] = tf->tf_cs;
+ gr[_REG_ESP] = tf->tf_esp;
+ gr[_REG_UESP] = tf->tf_esp;
+ gr[_REG_SS] = tf->tf_ss;
+ gr[_REG_TRAPNO] = tf->tf_trapno;
+ gr[_REG_ERR] = tf->tf_err;
+
+ if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
+ (caddr_t) gr[_REG_EIP])) != -1)
+ gr[_REG_EIP] = ras_eip;
+
+ *flags |= _UC_CPU;
+
+ /* Save floating point register context, if any. */
+ if ((l->l_md.md_flags & MDL_USEDFPU) != 0) {
+#if NNPX > 0
+ /*
+ * If this process is the current FP owner, dump its
+ * context to the PCB first.
+ * XXX npxsave() also clears the FPU state; depending on the
+ * XXX application this might be a penalty.
+ */
+ if (l->l_addr->u_pcb.pcb_fpcpu) {
+ npxsave_lwp(l, 1);
+ }
+#endif
+ if (i386_use_fxsave) {
+ memcpy(&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
+ &l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
+ sizeof (mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm));
+ *flags |= _UC_FXSAVE;
+ } else {
+ memcpy(&mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
+ &l->l_addr->u_pcb.pcb_savefpu.sv_87,
+ sizeof (mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state));
+ }
+#if 0
+ /* Apparently nothing ever touches this. */
+ ucp->mcp.mc_fp.fp_emcsts = l->l_addr->u_pcb.pcb_saveemc;
+#endif
+ *flags |= _UC_FPU;
+ }
+}
+
+int
+cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
+{
+ struct trapframe *tf = l->l_md.md_regs;
+ __greg_t *gr = mcp->__gregs;
+
+ /* Restore register context, if any. */
+ if ((flags & _UC_CPU) != 0) {
+#ifdef VM86
+ if (gr[_REG_EFL] & PSL_VM) {
+ tf->tf_vm86_gs = gr[_REG_GS];
+ tf->tf_vm86_fs = gr[_REG_FS];
+ tf->tf_vm86_es = gr[_REG_ES];
+ tf->tf_vm86_ds = gr[_REG_DS];
+ set_vflags(l, gr[_REG_EFL]);
+ if (flags & _UC_VM) {
+ void syscall_vm86(struct trapframe *);
+ l->l_proc->p_md.md_syscall = syscall_vm86;
+ }
+ } else
+#endif
+ {
+ /*
+ * Check for security violations. If we're returning
+ * to protected mode, the CPU will validate the segment
+ * registers automatically and generate a trap on
+ * violations. We handle the trap, rather than doing
+ * all of the checking here.
+ */
+ if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
+ !USERMODE(gr[_REG_CS], gr[_REG_EFL])) {
+ printf("cpu_setmcontext error: uc EFL: 0x%08x"
+ " tf EFL: 0x%08x uc CS: 0x%x\n",
+ gr[_REG_EFL], tf->tf_eflags, gr[_REG_CS]);
+ return (EINVAL);
+ }
+ tf->tf_gs = gr[_REG_GS];
+ tf->tf_fs = gr[_REG_FS];
+ tf->tf_es = gr[_REG_ES];
+ tf->tf_ds = gr[_REG_DS];
+ /* Only change the user-alterable part of eflags */
+ tf->tf_eflags &= ~PSL_USER;
+ tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
+ }
+ tf->tf_edi = gr[_REG_EDI];
+ tf->tf_esi = gr[_REG_ESI];
+ tf->tf_ebp = gr[_REG_EBP];
+ tf->tf_ebx = gr[_REG_EBX];
+ tf->tf_edx = gr[_REG_EDX];
+ tf->tf_ecx = gr[_REG_ECX];
+ tf->tf_eax = gr[_REG_EAX];
+ tf->tf_eip = gr[_REG_EIP];
+ tf->tf_cs = gr[_REG_CS];
+ tf->tf_esp = gr[_REG_UESP];
+ tf->tf_ss = gr[_REG_SS];
+ }
+
+ /* Restore floating point register context, if any. */
+ if ((flags & _UC_FPU) != 0) {
+#if NNPX > 0
+ /*
+ * If we were using the FPU, forget that we were.
+ */
+ if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
+ npxsave_lwp(l, 0);
+#endif
+ if (flags & _UC_FXSAVE) {
+ if (i386_use_fxsave) {
+ memcpy(
+ &l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
+ &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
+ sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_xmm));
+ } else {
+ /* This is a weird corner case */
+ process_xmm_to_s87((struct savexmm *)
+ &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
+ &l->l_addr->u_pcb.pcb_savefpu.sv_87);
+ }
+ } else {
+ if (i386_use_fxsave) {
+ process_s87_to_xmm((struct save87 *)
+ &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
+ &l->l_addr->u_pcb.pcb_savefpu.sv_xmm);
+ } else {
+ memcpy(&l->l_addr->u_pcb.pcb_savefpu.sv_87,
+ &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
+ sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_87));
+ }
+ }
+ /* If not set already. */
+ l->l_md.md_flags |= MDL_USEDFPU;
+#if 0
+ /* Apparently unused. */
+ l->l_addr->u_pcb.pcb_saveemc = mcp->mc_fp.fp_emcsts;
+#endif
+ }
+ if (flags & _UC_SETSTACK)
+ l->l_proc->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
+ if (flags & _UC_CLRSTACK)
+ l->l_proc->p_sigctx.ps_sigstk.ss_flags &= ~SS_ONSTACK;
+ return (0);
+}
+
+void
+cpu_initclocks()
+{
+ (*initclock_func)();
+}
+
+#ifdef MULTIPROCESSOR
+void
+need_resched(struct cpu_info *ci)
+{
+
+ if (ci->ci_want_resched)
+ return;
+
+ ci->ci_want_resched = 1;
+ if ((ci)->ci_curlwp != NULL)
+ aston((ci)->ci_curlwp->l_proc);
+ else if (ci != curcpu())
+ x86_send_ipi(ci, 0);
+}
+#endif
+
+/*
+ * Allocate an IDT vector slot within the given range.
+ * The idt_allocmap is protected by idt_lock to avoid MP allocation races.
+ */
+
+int
+idt_vec_alloc(int low, int high)
+{
+ int vec;
+
+ simple_lock(&idt_lock);
+ for (vec = low; vec <= high; vec++) {
+ if (idt_allocmap[vec] == 0) {
+ idt_allocmap[vec] = 1;
+ simple_unlock(&idt_lock);
+ return vec;
+ }
+ }
+ simple_unlock(&idt_lock);
+ return 0;
+}
+
+void
+idt_vec_set(int vec, void (*function)(void))
+{
+ /*
+ * Vector should be allocated, so no locking needed.
+ */
+ KASSERT(idt_allocmap[vec] == 1);
+ setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+}
+
+void
+idt_vec_free(int vec)
+{
+ simple_lock(&idt_lock);
+ unsetgate(&idt[vec]);
+ idt_allocmap[vec] = 0;
+ simple_unlock(&idt_lock);
+}
+
+/*
+ * Number of processes is limited by number of available GDT slots.
+ */
+int
+cpu_maxproc(void)
+{
+#ifdef USER_LDT
+ return ((MAXGDTSIZ - NGDT) / 2);
+#else
+ return (MAXGDTSIZ - NGDT);
+#endif
+}
+
+#if defined(DDB) || defined(KGDB)
+
+/*
+ * Callback to output a backtrace when entering ddb.
+ */
+void
+ddb_trap_hook(int where)
+{
+ static int once = 0;
+ db_addr_t db_dot;
+
+ if (once != 0 || where != 1)
+ return;
+ once = 1;
+
+ if (curlwp != NULL) {
+ db_printf("Stopped");
+ if (curproc == NULL)
+ db_printf("; curlwp = %p,"
+ " curproc is NULL at\t", curlwp);
+ else
+ db_printf(" in pid %d.%d (%s) at\t",
+ curproc->p_pid, curlwp->l_lid,
+ curproc->p_comm);
+ } else
+ db_printf("Stopped at\t");
+ db_dot = PC_REGS(DDB_REGS);
+ db_print_loc_and_inst(db_dot);
+
+ db_stack_trace_print((db_expr_t) db_dot, FALSE, 65535,
+ "", db_printf);
+#ifdef DEBUG
+ db_show_regs((db_expr_t) db_dot, FALSE, 65535, "");
+#endif
+}
+
+#endif /* DDB || KGDB */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c
new file mode 100644
index 0000000000..8e031eb242
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c
@@ -0,0 +1,4522 @@
+/* $NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $ */
+/* NetBSD: pmap.c,v 1.172 2004/04/12 13:17:46 yamt Exp */
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * pmap.c: i386 pmap module rewrite
+ * Chuck Cranor <chuck@ccrc.wustl.edu>
+ * 11-Aug-97
+ *
+ * history of this pmap module: in addition to my own input, i used
+ * the following references for this rewrite of the i386 pmap:
+ *
+ * [1] the NetBSD i386 pmap. this pmap appears to be based on the
+ * BSD hp300 pmap done by Mike Hibler at University of Utah.
+ * it was then ported to the i386 by William Jolitz of UUNET
+ * Technologies, Inc. Then Charles M. Hannum of the NetBSD
+ * project fixed some bugs and provided some speed ups.
+ *
+ * [2] the FreeBSD i386 pmap. this pmap seems to be the
+ * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
+ * and David Greenman.
+ *
+ * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
+ * between several processors. the VAX version was done by
+ * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
+ * version was done by Lance Berc, Mike Kupfer, Bob Baron,
+ * David Golub, and Richard Draves. the alpha version was
+ * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
+ * (NetBSD/alpha).
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $");
+
+#include "opt_cputype.h"
+#include "opt_user_ldt.h"
+#include "opt_largepages.h"
+#include "opt_lockdebug.h"
+#include "opt_multiprocessor.h"
+#include "opt_kstack_dr0.h"
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/user.h>
+#include <sys/kernel.h>
+
+#include <uvm/uvm.h>
+
+#include <machine/atomic.h>
+#include <machine/cpu.h>
+#include <machine/specialreg.h>
+#include <machine/gdt.h>
+
+#include <dev/isa/isareg.h>
+#include <machine/isa_machdep.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/xenpmap.h>
+
+void xpmap_find_pte(paddr_t);
+
+/* #define XENDEBUG */
+
+#ifdef XENDEBUG
+#define XENPRINTF(x) printf x
+#define XENPRINTK(x) printf x
+#else
+#define XENPRINTF(x)
+#define XENPRINTK(x)
+#endif
+#define PRINTF(x) printf x
+#define PRINTK(x) printf x
+
+
+/*
+ * general info:
+ *
+ * - for an explanation of how the i386 MMU hardware works see
+ * the comments in <machine/pte.h>.
+ *
+ * - for an explanation of the general memory structure used by
+ * this pmap (including the recursive mapping), see the comments
+ * in <machine/pmap.h>.
+ *
+ * this file contains the code for the "pmap module." the module's
+ * job is to manage the hardware's virtual to physical address mappings.
+ * note that there are two levels of mapping in the VM system:
+ *
+ * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
+ * to map ranges of virtual address space to objects/files. for
+ * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
+ * to the file /bin/ls starting at offset zero." note that
+ * the upper layer mapping is not concerned with how individual
+ * vm_pages are mapped.
+ *
+ * [2] the lower layer of the VM system (the pmap) maintains the mappings
+ * from virtual addresses to physical pages. it is concerned with
+ * which vm_page is mapped where. for example, when you run /bin/ls
+ * and start at page 0x1000 the fault routine may lookup the correct page
+ * of the /bin/ls file and then ask the pmap layer to establish
+ * a mapping for it.
+ *
+ * note that information in the lower layer of the VM system can be
+ * thrown away since it can easily be reconstructed from the info
+ * in the upper layer.
+ *
+ * data structures we use include:
+ *
+ * - struct pmap: describes the address space of one thread
+ * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
+ * - struct pv_head: there is one pv_head per managed page of
+ * physical memory. the pv_head points to a list of pv_entry
+ * structures which describe all the <PMAP,VA> pairs that this
+ * page is mapped in. this is critical for page based operations
+ * such as pmap_page_protect() [change protection on _all_ mappings
+ * of a page]
+ * - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's.
+ * if we run out of pv_entry's we allocate a new pv_page and free
+ * its pv_entrys.
+ * - pmap_remove_record: a list of virtual addresses whose mappings
+ * have been changed. used for TLB flushing.
+ */
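+
+/*
+ * illustrative sketch of the pv tracking described above (simplified;
+ * the field names here are only indicative, the authoritative
+ * definitions live in <machine/pmap.h>): each managed physical page
+ * has one pv_head whose list links every <PMAP,VA> mapping of it, e.g.
+ *
+ *	pv_head (for one physical page of /bin/ls)
+ *	  pvh_list --> pv_entry { pv_pmap = ls pmap,    pv_va = 0x1000 }
+ *	                 pv_next --> pv_entry { pv_pmap = other pmap, pv_va = 0x2000 }
+ *	                               pv_next --> NULL
+ *
+ * pmap_page_protect() walks this list to reach every mapping of the page.
+ */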
+
+/*
+ * memory allocation
+ *
+ * - there are three data structures that we must dynamically allocate:
+ *
+ * [A] new process' page directory page (PDP)
+ * - plan 1: done at pmap_create() we use
+ * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this
+ * allocation.
+ *
+ * if we are low in free physical memory then we sleep in
+ * uvm_km_alloc -- in this case this is ok since we are creating
+ * a new pmap and should not be holding any locks.
+ *
+ * if the kernel is totally out of virtual space
+ * (i.e. uvm_km_alloc returns NULL), then we panic.
+ *
+ * XXX: the fork code currently has no way to return an "out of
+ * memory, try again" error code since uvm_fork [fka vm_fork]
+ * is a void function.
+ *
+ * [B] new page tables pages (PTP)
+ * - call uvm_pagealloc()
+ * => success: zero page, add to pm_pdir
+ * => failure: we are out of free vm_pages, let pmap_enter()
+ * tell UVM about it.
+ *
+ * note: for kernel PTPs, we start with NKPTP of them. as we map
+ * kernel memory (at uvm_map time) we check to see if we've grown
+ * the kernel pmap. if so, we call the optional function
+ * pmap_growkernel() to grow the kernel PTPs in advance.
+ *
+ * [C] pv_entry structures
+ * - plan 1: try to allocate one off the free list
+ * => success: done!
+ * => failure: no more free pv_entrys on the list
+ * - plan 2: try to allocate a new pv_page to add a chunk of
+ * pv_entrys to the free list
+ * [a] obtain a free, unmapped, VA in kmem_map. either
+ * we have one saved from a previous call, or we allocate
+ * one now using a "vm_map_lock_try" in uvm_map
+ * => success: we have an unmapped VA, continue to [b]
+ * => failure: unable to lock kmem_map or out of VA in it.
+ * move on to plan 3.
+ * [b] allocate a page in kmem_object for the VA
+ * => success: map it in, free the pv_entry's, DONE!
+ * => failure: kmem_object locked, no free vm_pages, etc.
+ * save VA for later call to [a], go to plan 3.
+ * If we fail, we simply let pmap_enter() tell UVM about it.
+ */
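+
+/*
+ * the pv_entry plans above, condensed into a sketch (a restatement
+ * only; the real logic is in pmap_alloc_pv/pmap_alloc_pvpage further
+ * down in this file):
+ *
+ *	pv = take a free pv_entry from a pv_page;	... plan 1
+ *	if (pv == NULL)
+ *		pv = pmap_alloc_pvpage(...);		... plan 2 [a]+[b]
+ *	if (pv == NULL)
+ *		let pmap_enter() tell UVM about it;	... plan 3
+ */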
+
+/*
+ * locking
+ *
+ * we have the following locks that we must contend with:
+ *
+ * "normal" locks:
+ *
+ * - pmap_main_lock
+ * this lock is used to prevent deadlock and/or provide mutex
+ * access to the pmap system. most operations lock the pmap
+ * structure first, then they lock the pv_lists (if needed).
+ * however, some operations such as pmap_page_protect lock
+ * the pv_lists and then lock pmaps. in order to prevent a
+ * cycle, we require a mutex lock when locking the pv_lists
+ * first. thus, the "pmap => pv_list" lockers must gain a
+ * read-lock on pmap_main_lock before locking the pmap. and
+ * the "pv_list => pmap" lockers must gain a write-lock on
+ * pmap_main_lock before locking. since only one thread
+ * can write-lock a lock at a time, this provides mutex.
+ *
+ * "simple" locks:
+ *
+ * - pmap lock (per pmap, part of uvm_object)
+ * this lock protects the fields in the pmap structure including
+ * the non-kernel PDEs in the PDP, and the PTEs. it also locks
+ * in the alternate PTE space (since that is determined by the
+ * entry in the PDP).
+ *
+ * - pvh_lock (per pv_head)
+ * this lock protects the pv_entry list which is chained off the
+ * pv_head structure for a specific managed PA. it is locked
+ * when traversing the list (e.g. adding/removing mappings,
+ * syncing R/M bits, etc.)
+ *
+ * - pvalloc_lock
+ * this lock protects the data structures which are used to manage
+ * the free list of pv_entry structures.
+ *
+ * - pmaps_lock
+ * this lock protects the list of active pmaps (headed by "pmaps").
+ * we lock it when adding or removing pmaps from this list.
+ *
+ */
+
+/*
+ * locking data structures
+ */
+
+static struct simplelock pvalloc_lock;
+static struct simplelock pmaps_lock;
+
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+static struct lock pmap_main_lock;
+
+#define PMAP_MAP_TO_HEAD_LOCK() \
+ (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL)
+#define PMAP_MAP_TO_HEAD_UNLOCK() \
+ (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL)
+
+#define PMAP_HEAD_TO_MAP_LOCK() \
+ (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL)
+#define PMAP_HEAD_TO_MAP_UNLOCK() \
+ spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0)
+
+#else
+
+#define PMAP_MAP_TO_HEAD_LOCK() /* null */
+#define PMAP_MAP_TO_HEAD_UNLOCK() /* null */
+
+#define PMAP_HEAD_TO_MAP_LOCK() /* null */
+#define PMAP_HEAD_TO_MAP_UNLOCK() /* null */
+
+#endif
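+
+/*
+ * a minimal sketch of the two lock orders described above (function
+ * and field names are the ones used in this file; bodies elided):
+ *
+ *	... "pmap => pv_list" callers, e.g. pmap_enter():
+ *	PMAP_MAP_TO_HEAD_LOCK();
+ *	simple_lock(&pmap->pm_obj.vmobjlock);
+ *	simple_lock(&pvh->pvh_lock);
+ *	... update the PTE and the pv list ...
+ *	simple_unlock(&pvh->pvh_lock);
+ *	simple_unlock(&pmap->pm_obj.vmobjlock);
+ *	PMAP_MAP_TO_HEAD_UNLOCK();
+ *
+ *	... "pv_list => pmap" callers, e.g. pmap_page_protect():
+ *	PMAP_HEAD_TO_MAP_LOCK();
+ *	simple_lock(&pvh->pvh_lock);
+ *	simple_lock(&pve->pv_pmap->pm_obj.vmobjlock);
+ *	... and the reverse unlocks ...
+ *	PMAP_HEAD_TO_MAP_UNLOCK();
+ */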
+
+#define COUNT(x) /* nothing */
+
+/*
+ * TLB Shootdown:
+ *
+ * When a mapping is changed in a pmap, the TLB entry corresponding to
+ * the virtual address must be invalidated on all processors. In order
+ * to accomplish this on systems with multiple processors, messages are
+ * sent from the processor which performs the mapping change to all
+ * processors on which the pmap is active. For other processors, the
+ * ASN generation number for that processor is invalidated, so that
+ * the next time the pmap is activated on that processor, a new ASN
+ * will be allocated (which implicitly invalidates all TLB entries).
+ *
+ * Shootdown job queue entries are allocated using a simple special-
+ * purpose allocator for speed.
+ */
+struct pmap_tlb_shootdown_job {
+ TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list;
+ vaddr_t pj_va; /* virtual address */
+ pmap_t pj_pmap; /* the pmap which maps the address */
+ pt_entry_t pj_pte; /* the PTE bits */
+ struct pmap_tlb_shootdown_job *pj_nextfree;
+};
+
+#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 32
+union pmap_tlb_shootdown_job_al {
+ struct pmap_tlb_shootdown_job pja_job;
+ char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN];
+};
+
+struct pmap_tlb_shootdown_q {
+ TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head;
+ int pq_pte; /* aggregate PTE bits */
+ int pq_count; /* number of pending requests */
+ __cpu_simple_lock_t pq_slock; /* spin lock on queue */
+ int pq_flushg; /* pending flush global */
+ int pq_flushu; /* pending flush user */
+} pmap_tlb_shootdown_q[X86_MAXPROCS];
+
+#define PMAP_TLB_MAXJOBS 16
+
+void pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *);
+struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
+ (struct pmap_tlb_shootdown_q *);
+void pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *,
+ struct pmap_tlb_shootdown_job *);
+
+__cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
+union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
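+
+/*
+ * the "simple special-purpose allocator" mentioned above is a singly
+ * linked free list threaded through pj_nextfree, rooted at pj_free and
+ * protected by pmap_tlb_shootdown_job_lock. a sketch of the get side,
+ * assuming the per-queue PMAP_TLB_MAXJOBS limit is handled by the
+ * caller-visible pmap_tlb_shootdown_job_get():
+ *
+ *	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+ *	pj = (struct pmap_tlb_shootdown_job *)pj_free;
+ *	if (pj != NULL)
+ *		pj_free = (union pmap_tlb_shootdown_job_al *)pj->pj_nextfree;
+ *	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+ *
+ * putting a job back simply pushes it onto the same list.
+ */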
+
+/*
+ * global data structures
+ */
+
+struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
+
+/*
+ * nkpde is the number of kernel PTPs allocated for the kernel at
+ * boot time (NKPTP is a compile time override). this number can
+ * grow dynamically as needed (but once allocated, we never free
+ * kernel PTPs).
+ */
+
+int nkpde = NKPTP;
+#ifdef NKPDE
+#error "obsolete NKPDE: use NKPTP"
+#endif
+
+/*
+ * pmap_pg_g: if our processor supports PG_G in the PTE then we
+ * set pmap_pg_g to PG_G (otherwise it is zero).
+ */
+
+int pmap_pg_g = 0;
+
+#ifdef LARGEPAGES
+/*
+ * pmap_largepages: if our processor supports PG_PS and we are
+ * using it, this is set to TRUE.
+ */
+
+int pmap_largepages;
+#endif
+
+/*
+ * i386 physical memory comes in a big contig chunk with a small
+ * hole toward the front of it... the following two paddr_t's
+ * (shared with machdep.c) describe the physical address space
+ * of this machine.
+ */
+paddr_t avail_start; /* PA of first available physical page */
+paddr_t avail_end; /* PA of last available physical page */
+
+paddr_t pmap_pa_start; /* PA of first physical page for this domain */
+paddr_t pmap_pa_end; /* PA of last physical page for this domain */
+
+ /* MA of last physical page of the machine */
+paddr_t pmap_mem_end = HYPERVISOR_VIRT_START; /* updated for domain-0 */
+
+/*
+ * other data structures
+ */
+
+static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
+static boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
+
+/*
+ * the following two vaddr_t's are used during system startup
+ * to keep track of how much of the kernel's VM space we have used.
+ * once the system is started, the management of the remaining kernel
+ * VM space is turned over to the kernel_map vm_map.
+ */
+
+static vaddr_t virtual_avail; /* VA of first free KVA */
+static vaddr_t virtual_end; /* VA of last free KVA */
+
+
+/*
+ * pv_page management structures: locked by pvalloc_lock
+ */
+
+TAILQ_HEAD(pv_pagelist, pv_page);
+static struct pv_pagelist pv_freepages; /* list of pv_pages with free entrys */
+static struct pv_pagelist pv_unusedpgs; /* list of unused pv_pages */
+static int pv_nfpvents; /* # of free pv entries */
+static struct pv_page *pv_initpage; /* bootstrap page from kernel_map */
+static vaddr_t pv_cachedva; /* cached VA for later use */
+
+#define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */
+#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
+ /* high water mark */
+
+static __inline int
+pv_compare(struct pv_entry *a, struct pv_entry *b)
+{
+ if (a->pv_pmap < b->pv_pmap)
+ return (-1);
+ else if (a->pv_pmap > b->pv_pmap)
+ return (1);
+ else if (a->pv_va < b->pv_va)
+ return (-1);
+ else if (a->pv_va > b->pv_va)
+ return (1);
+ else
+ return (0);
+}
+
+SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare);
+SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare);
+
+/*
+ * linked list of all non-kernel pmaps
+ */
+
+static struct pmap_head pmaps;
+
+/*
+ * pool that pmap structures are allocated from
+ */
+
+struct pool pmap_pmap_pool;
+
+/*
+ * MULTIPROCESSOR: special VA's/PTE's are actually allocated inside an
+ * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing
+ * due to false sharing.
+ */
+
+#ifdef MULTIPROCESSOR
+#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
+#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
+#else
+#define PTESLEW(pte, id) (pte)
+#define VASLEW(va,id) (va)
+#endif
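+
+/*
+ * a minimal sketch of how the slew macros are used: each temporary
+ * mapping picks its own cache line of PTEs/VAs by CPU id (this is the
+ * pattern used by pmap_zero_page and friends below):
+ *
+ *	int id = cpu_number();
+ *	pt_entry_t *zpte = PTESLEW(zero_pte, id);	... this CPU's PTE slot
+ *	caddr_t zerova = VASLEW(zerop, id);		... and the matching VA
+ *
+ * on a non-MULTIPROCESSOR kernel both macros collapse to their first
+ * argument and pmap_bootstrap() only sets aside the four single slots.
+ */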
+
+/*
+ * special VAs and the PTEs that map them
+ */
+static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte;
+static caddr_t csrcp, cdstp, zerop, ptpp;
+
+/*
+ * pool and cache that PDPs are allocated from
+ */
+
+struct pool pmap_pdp_pool;
+struct pool_cache pmap_pdp_cache;
+u_int pmap_pdp_cache_generation;
+
+int pmap_pdp_ctor(void *, void *, int);
+void pmap_pdp_dtor(void *, void *);
+
+caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
+
+extern vaddr_t msgbuf_vaddr;
+extern paddr_t msgbuf_paddr;
+
+extern vaddr_t idt_vaddr; /* we allocate IDT early */
+extern paddr_t idt_paddr;
+
+#if defined(I586_CPU)
+/* stuff to fix the pentium f00f bug */
+extern vaddr_t pentium_idt_vaddr;
+#endif
+
+
+/*
+ * local prototypes
+ */
+
+static struct pv_entry *pmap_add_pvpage(struct pv_page *, boolean_t);
+static struct vm_page *pmap_alloc_ptp(struct pmap *, int);
+static struct pv_entry *pmap_alloc_pv(struct pmap *, int); /* see codes below */
+#define ALLOCPV_NEED 0 /* need PV now */
+#define ALLOCPV_TRY 1 /* just try to allocate, don't steal */
+#define ALLOCPV_NONEED 2 /* don't need PV, just growing cache */
+static struct pv_entry *pmap_alloc_pvpage(struct pmap *, int);
+static void pmap_enter_pv(struct pv_head *,
+ struct pv_entry *, struct pmap *,
+ vaddr_t, struct vm_page *);
+static void pmap_free_pv(struct pmap *, struct pv_entry *);
+static void pmap_free_pvs(struct pmap *, struct pv_entry *);
+static void pmap_free_pv_doit(struct pv_entry *);
+static void pmap_free_pvpage(void);
+static struct vm_page *pmap_get_ptp(struct pmap *, int);
+static boolean_t pmap_is_curpmap(struct pmap *);
+static boolean_t pmap_is_active(struct pmap *, int);
+static pt_entry_t *pmap_map_ptes(struct pmap *);
+static struct pv_entry *pmap_remove_pv(struct pv_head *, struct pmap *,
+ vaddr_t);
+static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
+static boolean_t pmap_remove_pte(struct pmap *, struct vm_page *,
+ pt_entry_t *, vaddr_t, int32_t *, int);
+static void pmap_remove_ptes(struct pmap *, struct vm_page *,
+ vaddr_t, vaddr_t, vaddr_t, int32_t *,
+ int);
+#define PMAP_REMOVE_ALL 0 /* remove all mappings */
+#define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */
+
+static vaddr_t pmap_tmpmap_pa(paddr_t);
+static pt_entry_t *pmap_tmpmap_pvepte(struct pv_entry *);
+static void pmap_tmpunmap_pa(void);
+static void pmap_tmpunmap_pvepte(struct pv_entry *);
+static void pmap_unmap_ptes(struct pmap *);
+
+static boolean_t pmap_reactivate(struct pmap *);
+
+#ifdef DEBUG
+u_int curapdp;
+#endif
+
+/*
+ * p m a p i n l i n e h e l p e r f u n c t i o n s
+ */
+
+/*
+ * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
+ * of course the kernel is always loaded
+ */
+
+__inline static boolean_t
+pmap_is_curpmap(pmap)
+ struct pmap *pmap;
+{
+
+ return((pmap == pmap_kernel()) ||
+ (pmap == curcpu()->ci_pmap));
+}
+
+/*
+ * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
+ */
+
+__inline static boolean_t
+pmap_is_active(pmap, cpu_id)
+ struct pmap *pmap;
+ int cpu_id;
+{
+
+ return (pmap == pmap_kernel() ||
+ (pmap->pm_cpus & (1U << cpu_id)) != 0);
+}
+
+/*
+ * pmap_tmpmap_pa: map a page in for tmp usage
+ */
+
+__inline static vaddr_t
+pmap_tmpmap_pa(pa)
+ paddr_t pa;
+{
+#ifdef MULTIPROCESSOR
+ int id = cpu_number();
+#endif
+ pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
+ pt_entry_t *maptp;
+ caddr_t ptpva = VASLEW(ptpp, id);
+#if defined(DIAGNOSTIC)
+ if (*ptpte)
+ panic("pmap_tmpmap_pa: ptp_pte in use?");
+#endif
+ maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
+ PTE_SET(ptpte, maptp, PG_V | PG_RW | pa); /* always a new mapping */
+ return((vaddr_t)ptpva);
+}
+
+/*
+ * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa)
+ */
+
+__inline static void
+pmap_tmpunmap_pa()
+{
+#ifdef MULTIPROCESSOR
+ int id = cpu_number();
+#endif
+ pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
+ pt_entry_t *maptp;
+ caddr_t ptpva = VASLEW(ptpp, id);
+#if defined(DIAGNOSTIC)
+ if (!pmap_valid_entry(*ptpte))
+ panic("pmap_tmpunmap_pa: our pte invalid?");
+#endif
+ maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
+ PTE_CLEAR(ptpte, maptp); /* zap! */
+ pmap_update_pg((vaddr_t)ptpva);
+#ifdef MULTIPROCESSOR
+ /*
+ * No need for tlb shootdown here, since ptp_pte is per-CPU.
+ */
+#endif
+}
+
+/*
+ * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry
+ *
+ * => do NOT use this on kernel mappings [why? because pv_ptp may be NULL]
+ */
+
+__inline static pt_entry_t *
+pmap_tmpmap_pvepte(pve)
+ struct pv_entry *pve;
+{
+#ifdef DIAGNOSTIC
+ if (pve->pv_pmap == pmap_kernel())
+ panic("pmap_tmpmap_pvepte: attempt to map kernel");
+#endif
+
+ /* is it current pmap? use direct mapping... */
+ if (pmap_is_curpmap(pve->pv_pmap))
+ return(vtopte(pve->pv_va));
+
+ return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp)))
+ + ptei((unsigned)pve->pv_va));
+}
+
+/*
+ * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte
+ */
+
+__inline static void
+pmap_tmpunmap_pvepte(pve)
+ struct pv_entry *pve;
+{
+ /* was it current pmap? if so, return */
+ if (pmap_is_curpmap(pve->pv_pmap))
+ return;
+
+ pmap_tmpunmap_pa();
+}
+
+__inline static void
+pmap_apte_flush(struct pmap *pmap)
+{
+#if defined(MULTIPROCESSOR)
+ struct pmap_tlb_shootdown_q *pq;
+ struct cpu_info *ci, *self = curcpu();
+ CPU_INFO_ITERATOR cii;
+ int s;
+#endif
+
+ tlbflush(); /* flush TLB on current processor */
+#if defined(MULTIPROCESSOR)
+ /*
+ * Flush the APTE mapping from all other CPUs that
+ * are using the pmap we are using (whose APTE space
+ * is the one we've just modified).
+ *
+ * XXXthorpej -- find a way to defer the IPI.
+ */
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ if (ci == self)
+ continue;
+ if (pmap_is_active(pmap, ci->ci_cpuid)) {
+ pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
+ s = splipi();
+ __cpu_simple_lock(&pq->pq_slock);
+ pq->pq_flushu++;
+ __cpu_simple_unlock(&pq->pq_slock);
+ splx(s);
+ x86_send_ipi(ci, X86_IPI_TLB);
+ }
+ }
+#endif
+}
+
+/*
+ * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
+ *
+ * => we lock enough pmaps to keep things locked in
+ * => must be undone with pmap_unmap_ptes before returning
+ */
+
+__inline static pt_entry_t *
+pmap_map_ptes(pmap)
+ struct pmap *pmap;
+{
+ pd_entry_t opde;
+ pd_entry_t *mapdp;
+ struct pmap *ourpmap;
+ struct cpu_info *ci;
+
+ /* the kernel's pmap is always accessible */
+ if (pmap == pmap_kernel()) {
+ return(PTE_BASE);
+ }
+
+ ci = curcpu();
+ if (ci->ci_want_pmapload &&
+ vm_map_pmap(&ci->ci_curlwp->l_proc->p_vmspace->vm_map) == pmap)
+ pmap_load();
+
+ /* if curpmap then we are always mapped */
+ if (pmap_is_curpmap(pmap)) {
+ simple_lock(&pmap->pm_obj.vmobjlock);
+ return(PTE_BASE);
+ }
+
+ ourpmap = ci->ci_pmap;
+
+ /* need to lock both curpmap and pmap: use ordered locking */
+ if ((unsigned) pmap < (unsigned) ourpmap) {
+ simple_lock(&pmap->pm_obj.vmobjlock);
+ simple_lock(&ourpmap->pm_obj.vmobjlock);
+ } else {
+ simple_lock(&ourpmap->pm_obj.vmobjlock);
+ simple_lock(&pmap->pm_obj.vmobjlock);
+ }
+
+ /* need to load a new alternate pt space into curpmap? */
+ COUNT(apdp_pde_map);
+ opde = PDE_GET(APDP_PDE);
+ if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
+ XENPRINTF(("APDP_PDE %p %p/%p set %p/%p\n",
+ pmap,
+ (void *)vtophys((vaddr_t)APDP_PDE),
+ (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
+ (void *)pmap->pm_pdirpa,
+ (void *)xpmap_ptom(pmap->pm_pdirpa)));
+ mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
+ PDE_SET(APDP_PDE, mapdp, pmap->pm_pdirpa /* | PG_RW */ | PG_V);
+#ifdef DEBUG
+ curapdp = pmap->pm_pdirpa;
+#endif
+ if (pmap_valid_entry(opde))
+ pmap_apte_flush(ourpmap);
+ XENPRINTF(("APDP_PDE set done\n"));
+ }
+ return(APTE_BASE);
+}
+
+/*
+ * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
+ */
+
+__inline static void
+pmap_unmap_ptes(pmap)
+ struct pmap *pmap;
+{
+#if defined(MULTIPROCESSOR)
+ pd_entry_t *mapdp;
+#endif
+
+ if (pmap == pmap_kernel()) {
+ return;
+ }
+ if (pmap_is_curpmap(pmap)) {
+ simple_unlock(&pmap->pm_obj.vmobjlock);
+ } else {
+ struct pmap *ourpmap = curcpu()->ci_pmap;
+
+#if defined(MULTIPROCESSOR)
+ mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
+ PDE_CLEAR(APDP_PDE, mapdp);
+ pmap_apte_flush(ourpmap);
+#endif
+#ifdef DEBUG
+ curapdp = 0;
+#endif
+ XENPRINTF(("APDP_PDE clear %p/%p set %p/%p\n",
+ (void *)vtophys((vaddr_t)APDP_PDE),
+ (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
+ (void *)pmap->pm_pdirpa,
+ (void *)xpmap_ptom(pmap->pm_pdirpa)));
+ COUNT(apdp_pde_unmap);
+ simple_unlock(&pmap->pm_obj.vmobjlock);
+ simple_unlock(&ourpmap->pm_obj.vmobjlock);
+ }
+}
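+
+/*
+ * the usual calling pattern for the pair above, as used by e.g.
+ * pmap_extract() later in this file (a simplified sketch):
+ *
+ *	pt_entry_t *ptes, pte;
+ *
+ *	ptes = pmap_map_ptes(pmap);		... locks pmap (and curpmap)
+ *	pte = PTE_GET(&ptes[x86_btop(va)]);	... PTE_BASE or APTE_BASE view
+ *	pmap_unmap_ptes(pmap);			... drops the locks again
+ */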
+
+__inline static void
+pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
+{
+ if (curproc == NULL || curproc->p_vmspace == NULL ||
+ pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
+ return;
+
+ if ((opte ^ npte) & PG_X)
+ pmap_update_pg(va);
+
+ /*
+ * Executability was removed from the highest executable mapping.
+ * Reset the code segment to something conservative and
+ * let the trap handler deal with setting the right limit;
+ * we can't do that here because of locking constraints on the vm map.
+ */
+
+ if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
+ struct trapframe *tf = curlwp->l_md.md_regs;
+ struct pcb *pcb = &curlwp->l_addr->u_pcb;
+
+ pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
+ pm->pm_hiexec = I386_MAX_EXE_ADDR;
+ }
+}
+
+__inline static pt_entry_t
+pte_mtop(pt_entry_t pte)
+{
+ pt_entry_t ppte;
+
+ KDASSERT(pmap_valid_entry(pte));
+ ppte = xpmap_mtop(pte);
+ if ((ppte & PG_FRAME) == XPMAP_OFFSET) {
+ XENPRINTF(("pte_mtop: null page %08x -> %08x\n",
+ ppte, pte));
+ ppte = pte;
+ }
+
+ return ppte;
+}
+
+__inline static pt_entry_t
+pte_get_ma(pt_entry_t *pte)
+{
+
+ return *pte;
+}
+
+__inline static pt_entry_t
+pte_get(pt_entry_t *pte)
+{
+
+ if (pmap_valid_entry(*pte))
+ return pte_mtop(*pte);
+ return *pte;
+}
+
+__inline static pt_entry_t
+pte_atomic_update_ma(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
+{
+ pt_entry_t opte;
+
+ XENPRINTK(("pte_atomic_update_ma pte %p mapte %p npte %08x\n",
+ pte, mapte, npte));
+ opte = PTE_GET_MA(pte);
+ if (opte > pmap_mem_end) {
+ /* must remove opte unchecked */
+ if (npte > pmap_mem_end)
+ /* must set npte unchecked */
+ xpq_queue_unchecked_pte_update(mapte, npte);
+ else {
+ /* must set npte checked */
+ xpq_queue_unchecked_pte_update(mapte, 0);
+ xpq_queue_pte_update(mapte, npte);
+ }
+ } else {
+ /* must remove opte checked */
+ if (npte > pmap_mem_end) {
+ /* must set npte unchecked */
+ xpq_queue_pte_update(mapte, 0);
+ xpq_queue_unchecked_pte_update(mapte, npte);
+ } else
+ /* must set npte checked */
+ xpq_queue_pte_update(mapte, npte);
+ }
+ xpq_flush_queue();
+
+ return opte;
+}
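+
+/*
+ * the four cases handled above, summarised. opte/npte are compared
+ * against pmap_mem_end to decide whether the update can go through
+ * Xen's normal (checked) queue or must use the unchecked one:
+ *
+ *	opte > end, npte > end:		unchecked update to npte
+ *	opte > end, npte <= end:	unchecked update to 0, checked update to npte
+ *	opte <= end, npte > end:	checked update to 0, unchecked update to npte
+ *	opte <= end, npte <= end:	checked update to npte
+ */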
+
+__inline static pt_entry_t
+pte_atomic_update(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
+{
+ pt_entry_t opte;
+
+ opte = pte_atomic_update_ma(pte, mapte, npte);
+
+ return pte_mtop(opte);
+}
+
+/*
+ * Fixup the code segment to cover all potential executable mappings.
+ * returns 0 if no changes to the code segment were made.
+ */
+
+int
+pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
+{
+ struct vm_map_entry *ent;
+ struct pmap *pm = vm_map_pmap(map);
+ vaddr_t va = 0;
+
+ vm_map_lock_read(map);
+ for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
+
+ /*
+ * This entry has greater va than the entries before.
+ * We need to make it point to the last page, not past it.
+ */
+
+ if (ent->protection & VM_PROT_EXECUTE)
+ va = trunc_page(ent->end) - PAGE_SIZE;
+ }
+ vm_map_unlock_read(map);
+ if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
+ return (0);
+
+ pm->pm_hiexec = va;
+ if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
+ pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
+ } else {
+ pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
+ return (0);
+ }
+ return (1);
+}
+
+/*
+ * p m a p k e n t e r f u n c t i o n s
+ *
+ * functions to quickly enter/remove pages from the kernel address
+ * space. pmap_kremove is exported to MI kernel. we make use of
+ * the recursive PTE mappings.
+ */
+
+/*
+ * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
+ *
+ * => no need to lock anything, assume va is already allocated
+ * => should be faster than normal pmap enter function
+ */
+
+void
+pmap_kenter_pa(va, pa, prot)
+ vaddr_t va;
+ paddr_t pa;
+ vm_prot_t prot;
+{
+ pt_entry_t *pte, opte, npte;
+ pt_entry_t *maptp;
+
+ if (va < VM_MIN_KERNEL_ADDRESS)
+ pte = vtopte(va);
+ else
+ pte = kvtopte(va);
+
+ npte = ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
+ PG_V | pmap_pg_g;
+
+ if (pa >= pmap_pa_start && pa < pmap_pa_end) {
+ npte |= xpmap_ptom(pa);
+ } else {
+ XENPRINTF(("pmap_kenter_pa: va %08lx: pa %08lx outside managed range\n",
+ va, pa));
+ npte |= pa;
+ }
+
+ maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+ opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
+ XENPRINTK(("pmap_kenter_pa(%p,%p) %p, was %08x now %08x\n", (void *)va,
+ (void *)pa, pte, opte, npte));
+#ifdef LARGEPAGES
+ /* XXX For now... */
+ if (opte & PG_PS)
+ panic("pmap_kenter_pa: PG_PS");
+#endif
+ if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+ int32_t cpumask = 0;
+
+ pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+ pmap_tlb_shootnow(cpumask);
+#else
+ /* Don't bother deferring in the single CPU case. */
+ pmap_update_pg(va);
+#endif
+ }
+}
+
+/*
+ * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
+ *
+ * => no need to lock anything, assume va is already allocated
+ * => should be faster than normal pmap enter function
+ */
+
+void pmap_kenter_ma __P((vaddr_t, paddr_t, vm_prot_t));
+
+void
+pmap_kenter_ma(va, ma, prot)
+ vaddr_t va;
+ paddr_t ma;
+ vm_prot_t prot;
+{
+ pt_entry_t *pte, opte, npte;
+ pt_entry_t *maptp;
+
+ KASSERT (va >= VM_MIN_KERNEL_ADDRESS);
+ pte = kvtopte(va);
+
+ npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
+ PG_V | pmap_pg_g;
+
+ maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+ opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
+ XENPRINTK(("pmap_kenter_ma(%p,%p) %p, was %08x\n", (void *)va,
+ (void *)ma, pte, opte));
+#ifdef LARGEPAGES
+ /* XXX For now... */
+ if (opte & PG_PS)
+ panic("pmap_kenter_ma: PG_PS");
+#endif
+ if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+ int32_t cpumask = 0;
+
+ pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+ pmap_tlb_shootnow(cpumask);
+#else
+ /* Don't bother deferring in the single CPU case. */
+ pmap_update_pg(va);
+#endif
+ }
+}
+
+/*
+ * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
+ *
+ * => no need to lock anything
+ * => caller must dispose of any vm_page mapped in the va range
+ * => note: not an inline function
+ * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
+ * => we assume kernel only unmaps valid addresses and thus don't bother
+ * checking the valid bit before doing TLB flushing
+ */
+
+void
+pmap_kremove(va, len)
+ vaddr_t va;
+ vsize_t len;
+{
+ pt_entry_t *pte, opte;
+ pt_entry_t *maptp;
+ int32_t cpumask = 0;
+
+ XENPRINTK(("pmap_kremove va %p, len %08lx\n", (void *)va, len));
+ len >>= PAGE_SHIFT;
+ for ( /* null */ ; len ; len--, va += PAGE_SIZE) {
+ if (va < VM_MIN_KERNEL_ADDRESS)
+ pte = vtopte(va);
+ else
+ pte = kvtopte(va);
+ maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+ opte = pte_atomic_update_ma(pte, maptp, 0); /* zap! */
+ XENPRINTK(("pmap_kremove pte %p, was %08x\n", pte, opte));
+#ifdef LARGEPAGES
+ /* XXX For now... */
+ if (opte & PG_PS)
+ panic("pmap_kremove: PG_PS");
+#endif
+#ifdef DIAGNOSTIC
+ if (opte & PG_PVLIST)
+ panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
+ va);
+#endif
+ if ((opte & (PG_V | PG_U)) == (PG_V | PG_U))
+ pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+ }
+ pmap_tlb_shootnow(cpumask);
+}
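+
+/*
+ * typical use of the two functions above (a sketch following the
+ * enter-side pattern used by pmap_alloc_pvpage() below):
+ *
+ *	pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), VM_PROT_READ | VM_PROT_WRITE);
+ *	pmap_update(pmap_kernel());
+ *	... use the mapping at va ...
+ *	pmap_kremove(va, PAGE_SIZE);
+ *	pmap_update(pmap_kernel());
+ */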
+
+/*
+ * p m a p i n i t f u n c t i o n s
+ *
+ * pmap_bootstrap and pmap_init are called during system startup
+ * to init the pmap module. pmap_bootstrap() does a low level
+ * init just to get things rolling. pmap_init() finishes the job.
+ */
+
+/*
+ * pmap_bootstrap: get the system in a state where it can run with VM
+ * properly enabled (called before main()). the VM system is
+ * fully init'd later...
+ *
+ * => on i386, locore.s has already enabled the MMU by allocating
+ * a PDP for the kernel, and nkpde PTP's for the kernel.
+ * => kva_start is the first free virtual address in kernel space
+ */
+
+void
+pmap_bootstrap(kva_start)
+ vaddr_t kva_start;
+{
+ struct pmap *kpm;
+ vaddr_t kva;
+ pt_entry_t *pte;
+ pt_entry_t *maptp;
+ int i;
+
+ /*
+ * set up our local static global vars that keep track of the
+ * usage of KVM before kernel_map is set up
+ */
+
+ virtual_avail = kva_start; /* first free KVA */
+ virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
+
+ /*
+ * find out where physical memory ends on the real hardware.
+ */
+
+ if (xen_start_info.flags & SIF_PRIVILEGED)
+ pmap_mem_end = find_pmap_mem_end(kva_start);
+
+ /*
+ * set up protection_codes: we need to be able to convert from
+ * a MI protection code (some combo of VM_PROT...) to something
+ * we can jam into a i386 PTE.
+ */
+
+ protection_codes[VM_PROT_NONE] = 0; /* --- */
+ protection_codes[VM_PROT_EXECUTE] = PG_X; /* --x */
+ protection_codes[VM_PROT_READ] = PG_RO; /* -r- */
+ protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO|PG_X;/* -rx */
+ protection_codes[VM_PROT_WRITE] = PG_RW; /* w-- */
+ protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW|PG_X;/* w-x */
+ protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW; /* wr- */
+ protection_codes[VM_PROT_ALL] = PG_RW|PG_X; /* wrx */
+
+ /*
+ * now we init the kernel's pmap
+ *
+ * the kernel pmap's pm_obj is not used for much. however, in
+ * user pmaps the pm_obj contains the list of active PTPs.
+ * the pm_obj currently does not have a pager. it might be possible
+ * to add a pager that would allow a process to read-only mmap its
+ * own page tables (fast user level vtophys?). this may or may not
+ * be useful.
+ */
+
+ kpm = pmap_kernel();
+ simple_lock_init(&kpm->pm_obj.vmobjlock);
+ kpm->pm_obj.pgops = NULL;
+ TAILQ_INIT(&kpm->pm_obj.memq);
+ kpm->pm_obj.uo_npages = 0;
+ kpm->pm_obj.uo_refs = 1;
+ memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
+ kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
+ XENPRINTF(("pm_pdirpa %p PTDpaddr %p\n",
+ (void *)lwp0.l_addr->u_pcb.pcb_cr3, (void *)PTDpaddr));
+ kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3;
+ kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
+ x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
+
+ /*
+ * the above is just a rough estimate and not critical to the proper
+ * operation of the system.
+ */
+
+ /*
+ * Begin to enable global TLB entries if they are supported.
+ * The G bit has no effect until the CR4_PGE bit is set in CR4,
+ * which happens in cpu_init(), which is run on each cpu
+ * (and happens later)
+ */
+
+ if (cpu_feature & CPUID_PGE) {
+ pmap_pg_g = PG_G; /* enable software */
+
+ /* add PG_G attribute to already mapped kernel pages */
+ for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
+ kva += PAGE_SIZE)
+ if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) {
+#if !defined(XEN)
+ PTE_BASE[x86_btop(kva)] |= PG_G;
+#else
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&PTE_BASE[x86_btop(kva)]);
+ PTE_SETBITS(&PTE_BASE[x86_btop(kva)], maptp,
+ PG_G);
+#endif
+ }
+#if defined(XEN)
+ PTE_UPDATES_FLUSH();
+#endif
+ }
+
+#ifdef LARGEPAGES
+ /*
+ * enable large pages if they are supported.
+ */
+
+ if (cpu_feature & CPUID_PSE) {
+ paddr_t pa;
+ vaddr_t kva_end;
+ pd_entry_t *pde;
+ pd_entry_t *mapdp;
+ extern char _etext;
+
+ lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
+ pmap_largepages = 1; /* enable software */
+
+ /*
+ * the TLB must be flushed after enabling large pages
+ * on Pentium CPUs, according to section 3.6.2.2 of
+ * "Intel Architecture Software Developer's Manual,
+ * Volume 3: System Programming".
+ */
+ tlbflush();
+
+ /*
+ * now, remap the kernel text using large pages. we
+ * assume that the linker has properly aligned the
+ * .data segment to a 4MB boundary.
+ */
+ kva_end = roundup((vaddr_t)&_etext, NBPD);
+ for (pa = 0, kva = KERNBASE; kva < kva_end;
+ kva += NBPD, pa += NBPD) {
+ pde = &kpm->pm_pdir[pdei(kva)];
+ mapdp = (pt_entry_t *)vtomach((vaddr_t)pde);
+ PDE_SET(pde, mapdp, pa | pmap_pg_g | PG_PS |
+ PG_KR | PG_V); /* zap! */
+ tlbflush();
+ }
+ }
+#endif /* LARGEPAGES */
+
+ /*
+ * now we allocate the "special" VAs which are used for tmp mappings
+ * by the pmap (and other modules). we allocate the VAs by advancing
+ * virtual_avail (note that there are no pages mapped at these VAs).
+ * we find the PTE that maps the allocated VA via the linear PTE
+ * mapping.
+ */
+
+ pte = PTE_BASE + x86_btop(virtual_avail);
+
+#ifdef MULTIPROCESSOR
+ /*
+ * Waste some VA space to avoid false sharing of cache lines
+ * for page table pages: Give each possible CPU a cache line
+ * of PTE's (8) to play with, though we only need 4. We could
+ * recycle some of this waste by putting the idle stacks here
+ * as well; we could waste less space if we knew the largest
+ * CPU ID beforehand.
+ */
+ csrcp = (caddr_t) virtual_avail; csrc_pte = pte;
+
+ cdstp = (caddr_t) virtual_avail+PAGE_SIZE; cdst_pte = pte+1;
+
+ zerop = (caddr_t) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2;
+
+ ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3;
+
+ virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL;
+ pte += X86_MAXPROCS * NPTECL;
+#else
+ csrcp = (caddr_t) virtual_avail; csrc_pte = pte; /* allocate */
+ virtual_avail += PAGE_SIZE; pte++; /* advance */
+
+ cdstp = (caddr_t) virtual_avail; cdst_pte = pte;
+ virtual_avail += PAGE_SIZE; pte++;
+
+ zerop = (caddr_t) virtual_avail; zero_pte = pte;
+ virtual_avail += PAGE_SIZE; pte++;
+
+ ptpp = (caddr_t) virtual_avail; ptp_pte = pte;
+ virtual_avail += PAGE_SIZE; pte++;
+#endif
+
+ XENPRINTK(("pmap_bootstrap csrcp %p cdstp %p zerop %p ptpp %p\n",
+ csrc_pte, cdst_pte, zero_pte, ptp_pte));
+ /*
+ * Nothing after this point actually needs pte;
+ */
+ pte = (void *)0xdeadbeef;
+
+ /* XXX: vmmap used by mem.c... should be uvm_map_reserve */
+ vmmap = (char *)virtual_avail; /* don't need pte */
+ virtual_avail += PAGE_SIZE;
+
+ msgbuf_vaddr = virtual_avail; /* don't need pte */
+ virtual_avail += round_page(MSGBUFSIZE);
+
+ idt_vaddr = virtual_avail; /* don't need pte */
+ virtual_avail += PAGE_SIZE;
+ idt_paddr = avail_start; /* steal a page */
+ avail_start += PAGE_SIZE;
+
+#if defined(I586_CPU)
+ /* pentium f00f bug stuff */
+ pentium_idt_vaddr = virtual_avail; /* don't need pte */
+ virtual_avail += PAGE_SIZE;
+#endif
+
+ /*
+ * now we reserve some VM for mapping pages when doing a crash dump
+ */
+
+ virtual_avail = reserve_dumppages(virtual_avail);
+
+ /*
+ * init the static-global locks and global lists.
+ */
+
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+ spinlockinit(&pmap_main_lock, "pmaplk", 0);
+#endif
+ simple_lock_init(&pvalloc_lock);
+ simple_lock_init(&pmaps_lock);
+ LIST_INIT(&pmaps);
+ TAILQ_INIT(&pv_freepages);
+ TAILQ_INIT(&pv_unusedpgs);
+
+ /*
+ * initialize the pmap pool.
+ */
+
+ pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl",
+ &pool_allocator_nointr);
+
+ /*
+ * Initialize the TLB shootdown queues.
+ */
+
+ __cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
+
+ for (i = 0; i < X86_MAXPROCS; i++) {
+ TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
+ __cpu_simple_lock_init(&pmap_tlb_shootdown_q[i].pq_slock);
+ }
+
+ /*
+ * initialize the PDE pool and cache.
+ */
+ pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl",
+ &pool_allocator_nointr);
+ pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool,
+ pmap_pdp_ctor, pmap_pdp_dtor, NULL);
+
+ /*
+ * ensure the TLB is sync'd with reality by flushing it...
+ */
+
+ tlbflush();
+}
+
+/*
+ * pmap_init: called from uvm_init, our job is to get the pmap
+ * system ready to manage mappings... this mainly means initing
+ * the pv_entry stuff.
+ */
+
+void
+pmap_init()
+{
+ int i;
+
+ /*
+ * now we need to free enough pv_entry structures to allow us to get
+ * the kmem_map/kmem_object allocated and inited (done after this
+ * function is finished). to do this we allocate one bootstrap page out
+ * of kernel_map and use it to provide an initial pool of pv_entry
+ * structures. we never free this page.
+ */
+
+ pv_initpage = (struct pv_page *) uvm_km_alloc(kernel_map, PAGE_SIZE);
+ if (pv_initpage == NULL)
+ panic("pmap_init: pv_initpage");
+ pv_cachedva = 0; /* a VA we have allocated but not used yet */
+ pv_nfpvents = 0;
+ (void) pmap_add_pvpage(pv_initpage, FALSE);
+
+ pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE);
+ if (pj_page == NULL)
+ panic("pmap_init: pj_page");
+
+ for (i = 0;
+ i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
+ i++)
+ pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
+ pj_page[i].pja_job.pj_nextfree = NULL;
+ pj_free = &pj_page[0];
+
+ /*
+ * done: pmap module is up (and ready for business)
+ */
+
+ pmap_initialized = TRUE;
+}
+
+/*
+ * p v _ e n t r y f u n c t i o n s
+ */
+
+/*
+ * pv_entry allocation functions:
+ * the main pv_entry allocation functions are:
+ * pmap_alloc_pv: allocate a pv_entry structure
+ * pmap_free_pv: free one pv_entry
+ * pmap_free_pvs: free a list of pv_entrys
+ *
+ * the rest are helper functions
+ */
+
+/*
+ * pmap_alloc_pv: inline function to allocate a pv_entry structure
+ * => we lock pvalloc_lock
+ * => if we fail, we call out to pmap_alloc_pvpage
+ * => 3 modes:
+ * ALLOCPV_NEED = we really need a pv_entry, even if we have to steal it
+ * ALLOCPV_TRY = we want a pv_entry, but not enough to steal
+ * ALLOCPV_NONEED = we are trying to grow our free list, don't really need
+ * one now
+ *
+ * "try" is for optional functions like pmap_copy().
+ */
+
+__inline static struct pv_entry *
+pmap_alloc_pv(pmap, mode)
+ struct pmap *pmap;
+ int mode;
+{
+ struct pv_page *pvpage;
+ struct pv_entry *pv;
+
+ simple_lock(&pvalloc_lock);
+
+ pvpage = TAILQ_FIRST(&pv_freepages);
+ if (pvpage != NULL) {
+ pvpage->pvinfo.pvpi_nfree--;
+ if (pvpage->pvinfo.pvpi_nfree == 0) {
+ /* nothing left in this one? */
+ TAILQ_REMOVE(&pv_freepages, pvpage, pvinfo.pvpi_list);
+ }
+ pv = pvpage->pvinfo.pvpi_pvfree;
+ KASSERT(pv);
+ pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
+ pv_nfpvents--; /* took one from pool */
+ } else {
+ pv = NULL; /* need more of them */
+ }
+
+ /*
+ * if below low water mark or we didn't get a pv_entry we try and
+ * create more pv_entrys ...
+ */
+
+ if (pv_nfpvents < PVE_LOWAT || pv == NULL) {
+ if (pv == NULL)
+ pv = pmap_alloc_pvpage(pmap, (mode == ALLOCPV_TRY) ?
+ mode : ALLOCPV_NEED);
+ else
+ (void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED);
+ }
+ simple_unlock(&pvalloc_lock);
+ return(pv);
+}
+
+/*
+ * pmap_alloc_pvpage: maybe allocate a new pvpage
+ *
+ * if need_entry is false: try and allocate a new pv_page
+ * if need_entry is true: try and allocate a new pv_page and return a
+ * new pv_entry from it. if we are unable to allocate a pv_page
+ * we make a last ditch effort to steal a pv_page from some other
+ * mapping. if that fails, we panic...
+ *
+ * => we assume that the caller holds pvalloc_lock
+ */
+
+static struct pv_entry *
+pmap_alloc_pvpage(pmap, mode)
+ struct pmap *pmap;
+ int mode;
+{
+ struct vm_page *pg;
+ struct pv_page *pvpage;
+ struct pv_entry *pv;
+ int s;
+
+ /*
+ * if we need_entry and we've got unused pv_pages, allocate from there
+ */
+
+ pvpage = TAILQ_FIRST(&pv_unusedpgs);
+ if (mode != ALLOCPV_NONEED && pvpage != NULL) {
+
+ /* move it to pv_freepages list */
+ TAILQ_REMOVE(&pv_unusedpgs, pvpage, pvinfo.pvpi_list);
+ TAILQ_INSERT_HEAD(&pv_freepages, pvpage, pvinfo.pvpi_list);
+
+ /* allocate a pv_entry */
+ pvpage->pvinfo.pvpi_nfree--; /* can't go to zero */
+ pv = pvpage->pvinfo.pvpi_pvfree;
+ KASSERT(pv);
+ pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
+ pv_nfpvents--; /* took one from pool */
+ return(pv);
+ }
+
+ /*
+ * see if we've got a cached unmapped VA that we can map a page in.
+ * if not, try to allocate one.
+ */
+
+ if (pv_cachedva == 0) {
+ s = splvm(); /* must protect kmem_map with splvm! */
+ pv_cachedva = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE,
+ UVM_KMF_TRYLOCK|UVM_KMF_VALLOC);
+ splx(s);
+ if (pv_cachedva == 0) {
+ return (NULL);
+ }
+ }
+
+ pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL,
+ UVM_PGA_USERESERVE);
+ if (pg == NULL)
+ return (NULL);
+ pg->flags &= ~PG_BUSY; /* never busy */
+
+ /*
+ * add a mapping for our new pv_page and free its entrys (save one!)
+ *
+ * NOTE: If we are allocating a PV page for the kernel pmap, the
+ * pmap is already locked! (...but entering the mapping is safe...)
+ */
+
+ pmap_kenter_pa(pv_cachedva, VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ | VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+ pvpage = (struct pv_page *) pv_cachedva;
+ pv_cachedva = 0;
+ return (pmap_add_pvpage(pvpage, mode != ALLOCPV_NONEED));
+}
+
+/*
+ * pmap_add_pvpage: add a pv_page's pv_entrys to the free list
+ *
+ * => caller must hold pvalloc_lock
+ * => if need_entry is true, we allocate and return one pv_entry
+ */
+
+static struct pv_entry *
+pmap_add_pvpage(pvp, need_entry)
+ struct pv_page *pvp;
+ boolean_t need_entry;
+{
+ int tofree, lcv;
+
+ /* do we need to return one? */
+ tofree = (need_entry) ? PVE_PER_PVPAGE - 1 : PVE_PER_PVPAGE;
+
+ pvp->pvinfo.pvpi_pvfree = NULL;
+ pvp->pvinfo.pvpi_nfree = tofree;
+ for (lcv = 0 ; lcv < tofree ; lcv++) {
+ SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
+ pvp->pvinfo.pvpi_pvfree;
+ pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
+ }
+ if (need_entry)
+ TAILQ_INSERT_TAIL(&pv_freepages, pvp, pvinfo.pvpi_list);
+ else
+ TAILQ_INSERT_TAIL(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
+ pv_nfpvents += tofree;
+ return((need_entry) ? &pvp->pvents[lcv] : NULL);
+}
+
+/*
+ * pmap_free_pv_doit: actually free a pv_entry
+ *
+ * => do not call this directly! instead use either
+ * 1. pmap_free_pv ==> free a single pv_entry
+ * 2. pmap_free_pvs => free a list of pv_entrys
+ * => we must be holding pvalloc_lock
+ */
+
+__inline static void
+pmap_free_pv_doit(pv)
+ struct pv_entry *pv;
+{
+ struct pv_page *pvp;
+
+ pvp = (struct pv_page *) x86_trunc_page(pv);
+ pv_nfpvents++;
+ pvp->pvinfo.pvpi_nfree++;
+
+ /* nfree == 1 => fully allocated page just became partly allocated */
+ if (pvp->pvinfo.pvpi_nfree == 1) {
+ TAILQ_INSERT_HEAD(&pv_freepages, pvp, pvinfo.pvpi_list);
+ }
+
+ /* free it */
+ SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
+ pvp->pvinfo.pvpi_pvfree = pv;
+
+ /*
+ * are all pv_page's pv_entry's free? move it to unused queue.
+ */
+
+ if (pvp->pvinfo.pvpi_nfree == PVE_PER_PVPAGE) {
+ TAILQ_REMOVE(&pv_freepages, pvp, pvinfo.pvpi_list);
+ TAILQ_INSERT_HEAD(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
+ }
+}
+
+/*
+ * pmap_free_pv: free a single pv_entry
+ *
+ * => we gain the pvalloc_lock
+ */
+
+__inline static void
+pmap_free_pv(pmap, pv)
+ struct pmap *pmap;
+ struct pv_entry *pv;
+{
+ simple_lock(&pvalloc_lock);
+ pmap_free_pv_doit(pv);
+
+ /*
+ * Can't free the PV page if the PV entries were associated with
+ * the kernel pmap; the pmap is already locked.
+ */
+ if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
+ pmap != pmap_kernel())
+ pmap_free_pvpage();
+
+ simple_unlock(&pvalloc_lock);
+}
+
+/*
+ * pmap_free_pvs: free a list of pv_entrys
+ *
+ * => we gain the pvalloc_lock
+ */
+
+__inline static void
+pmap_free_pvs(pmap, pvs)
+ struct pmap *pmap;
+ struct pv_entry *pvs;
+{
+ struct pv_entry *nextpv;
+
+ simple_lock(&pvalloc_lock);
+
+ for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
+ nextpv = SPLAY_RIGHT(pvs, pv_node);
+ pmap_free_pv_doit(pvs);
+ }
+
+ /*
+ * Can't free the PV page if the PV entries were associated with
+ * the kernel pmap; the pmap is already locked.
+ */
+ if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
+ pmap != pmap_kernel())
+ pmap_free_pvpage();
+
+ simple_unlock(&pvalloc_lock);
+}
+
+
+/*
+ * pmap_free_pvpage: try and free an unused pv_page structure
+ *
+ * => assume caller is holding the pvalloc_lock and that
+ * there is a page on the pv_unusedpgs list
+ * => if we can't get a lock on the kmem_map we try again later
+ */
+
+static void
+pmap_free_pvpage()
+{
+ int s;
+ struct vm_map *map;
+ struct vm_map_entry *dead_entries;
+ struct pv_page *pvp;
+
+ s = splvm(); /* protect kmem_map */
+
+ pvp = TAILQ_FIRST(&pv_unusedpgs);
+
+ /*
+ * note: watch out for pv_initpage which is allocated out of
+ * kernel_map rather than kmem_map.
+ */
+
+ if (pvp == pv_initpage)
+ map = kernel_map;
+ else
+ map = kmem_map;
+ if (vm_map_lock_try(map)) {
+
+ /* remove pvp from pv_unusedpgs */
+ TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
+
+ /* unmap the page */
+ dead_entries = NULL;
+ uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE,
+ &dead_entries);
+ vm_map_unlock(map);
+
+ if (dead_entries != NULL)
+ uvm_unmap_detach(dead_entries, 0);
+
+ pv_nfpvents -= PVE_PER_PVPAGE; /* update free count */
+ }
+ if (pvp == pv_initpage)
+ /* no more initpage, we've freed it */
+ pv_initpage = NULL;
+
+ splx(s);
+}
+
+/*
+ * pmap_lock_pvhs: Lock pvh1 and optionally pvh2.
+ * Observe the locking order when locking both pvhs.
+ */
+
+__inline static void
+pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2)
+{
+
+ if (pvh2 == NULL) {
+ simple_lock(&pvh1->pvh_lock);
+ return;
+ }
+
+ if (pvh1 < pvh2) {
+ simple_lock(&pvh1->pvh_lock);
+ simple_lock(&pvh2->pvh_lock);
+ } else {
+ simple_lock(&pvh2->pvh_lock);
+ simple_lock(&pvh1->pvh_lock);
+ }
+}
+
+
+/*
+ * main pv_entry manipulation functions:
+ * pmap_enter_pv: enter a mapping onto a pv_head list
+ * pmap_remove_pv: remove a mapping from a pv_head list
+ *
+ * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
+ * the pvh before calling
+ */
+
+/*
+ * pmap_enter_pv: enter a mapping onto a pv_head list
+ *
+ * => caller should hold the proper lock on pmap_main_lock
+ * => caller should have pmap locked
+ * => caller should have the pv_head locked
+ * => caller should adjust ptp's wire_count before calling
+ */
+
+__inline static void
+pmap_enter_pv(pvh, pve, pmap, va, ptp)
+ struct pv_head *pvh;
+ struct pv_entry *pve; /* preallocated pve for us to use */
+ struct pmap *pmap;
+ vaddr_t va;
+ struct vm_page *ptp; /* PTP in pmap that maps this VA */
+{
+ pve->pv_pmap = pmap;
+ pve->pv_va = va;
+ pve->pv_ptp = ptp; /* NULL for kernel pmap */
+ SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */
+}
+
+/*
+ * pmap_remove_pv: try to remove a mapping from a pv_list
+ *
+ * => caller should hold proper lock on pmap_main_lock
+ * => pmap should be locked
+ * => caller should hold lock on pv_head [so that attrs can be adjusted]
+ * => caller should adjust ptp's wire_count and free PTP if needed
+ * => we return the removed pve
+ */
+
+__inline static struct pv_entry *
+pmap_remove_pv(pvh, pmap, va)
+ struct pv_head *pvh;
+ struct pmap *pmap;
+ vaddr_t va;
+{
+ struct pv_entry tmp, *pve;
+
+ tmp.pv_pmap = pmap;
+ tmp.pv_va = va;
+ pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
+ if (pve == NULL)
+ return (NULL);
+ SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
+ return(pve); /* return removed pve */
+}
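+
+/*
+ * a sketch of how the pair above is used by the main entry points
+ * (simplified; the caller also holds the pmap lock and the proper
+ * side of pmap_main_lock, as noted in the comments earlier):
+ *
+ *	... entering a managed mapping:
+ *	pve = pmap_alloc_pv(pmap, ALLOCPV_NEED);
+ *	simple_lock(&pvh->pvh_lock);
+ *	pmap_enter_pv(pvh, pve, pmap, va, ptp);
+ *	simple_unlock(&pvh->pvh_lock);
+ *
+ *	... removing it again:
+ *	simple_lock(&pvh->pvh_lock);
+ *	pve = pmap_remove_pv(pvh, pmap, va);
+ *	simple_unlock(&pvh->pvh_lock);
+ *	if (pve != NULL)
+ *		pmap_free_pv(pmap, pve);
+ */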
+
+/*
+ * p t p f u n c t i o n s
+ */
+
+/*
+ * pmap_alloc_ptp: allocate a PTP for a PMAP
+ *
+ * => pmap should already be locked by caller
+ * => we use the ptp's wire_count to count the number of active mappings
+ * in the PTP (we start it at one to prevent any chance this PTP
+ * will ever leak onto the active/inactive queues)
+ */
+
+__inline static struct vm_page *
+pmap_alloc_ptp(pmap, pde_index)
+ struct pmap *pmap;
+ int pde_index;
+{
+ struct vm_page *ptp;
+ pd_entry_t *mapdp;
+
+ ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
+ UVM_PGA_USERESERVE|UVM_PGA_ZERO);
+ if (ptp == NULL)
+ return(NULL);
+
+ /* got one! */
+ ptp->flags &= ~PG_BUSY; /* never busy */
+ ptp->wire_count = 1; /* no mappings yet */
+ mapdp = (pt_entry_t *)vtomach((vaddr_t)&pmap->pm_pdir[pde_index]);
+ PDE_SET(&pmap->pm_pdir[pde_index], mapdp,
+ (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V));
+ pmap->pm_stats.resident_count++; /* count PTP as resident */
+ pmap->pm_ptphint = ptp;
+ return(ptp);
+}
+
+/*
+ * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
+ *
+ * => pmap should NOT be pmap_kernel()
+ * => pmap should be locked
+ */
+
+static struct vm_page *
+pmap_get_ptp(pmap, pde_index)
+ struct pmap *pmap;
+ int pde_index;
+{
+ struct vm_page *ptp;
+
+ if (pmap_valid_entry(pmap->pm_pdir[pde_index])) {
+
+ /* valid... check hint (saves us a PA->PG lookup) */
+ if (pmap->pm_ptphint &&
+ (PDE_GET(&pmap->pm_pdir[pde_index]) & PG_FRAME) ==
+ VM_PAGE_TO_PHYS(pmap->pm_ptphint))
+ return(pmap->pm_ptphint);
+
+ ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
+#ifdef DIAGNOSTIC
+ if (ptp == NULL)
+ panic("pmap_get_ptp: unmanaged user PTP");
+#endif
+ pmap->pm_ptphint = ptp;
+ return(ptp);
+ }
+
+ /* allocate a new PTP (updates ptphint) */
+ return(pmap_alloc_ptp(pmap, pde_index));
+}
+
+/*
+ * p m a p l i f e c y c l e f u n c t i o n s
+ */
+
+/*
+ * pmap_pdp_ctor: constructor for the PDP cache.
+ */
+
+int
+pmap_pdp_ctor(void *arg, void *object, int flags)
+{
+ pd_entry_t *pdir = object;
+ paddr_t pdirpa;
+
+ /*
+ * NOTE: The `pmap_lock' is held when the PDP is allocated.
+ * WE MUST NOT BLOCK!
+ */
+
+ /* fetch the physical address of the page directory. */
+ (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
+
+ XENPRINTF(("pmap_pdp_ctor %p %p\n", pdir, (void *)pdirpa));
+
+ /* zero init area */
+ memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t));
+
+ /* put in recursive PDE to map the PTEs */
+ pdir[PDSLOT_PTE] = xpmap_ptom(pdirpa | PG_V /* | PG_KW */);
+
+ /* put in kernel VM PDEs */
+ memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN],
+ nkpde * sizeof(pd_entry_t));
+
+ /* zero the rest */
+ memset(&pdir[PDSLOT_KERN + nkpde], 0,
+ PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
+
+ pmap_enter(pmap_kernel(), (vaddr_t)pdir, pdirpa, VM_PROT_READ,
+ VM_PROT_READ);
+ pmap_update(pmap_kernel());
+
+ /* pin page type */
+ xpq_queue_pin_table(xpmap_ptom(pdirpa), XPQ_PIN_L2_TABLE);
+ xpq_flush_queue();
+
+ return (0);
+}
+
+void
+pmap_pdp_dtor(void *arg, void *object)
+{
+ pd_entry_t *pdir = object;
+ paddr_t pdirpa;
+
+ /* fetch the physical address of the page directory. */
+ pdirpa = PDE_GET(&pdir[PDSLOT_PTE]) & PG_FRAME;
+
+ XENPRINTF(("pmap_pdp_dtor %p %p\n", pdir, (void *)pdirpa));
+
+ /* unpin page type */
+ xpq_queue_unpin_table(xpmap_ptom(pdirpa));
+ xpq_flush_queue();
+}
+
+/*
+ * pmap_create: create a pmap
+ *
+ * => note: old pmap interface took a "size" arg which allowed for
+ * the creation of "software only" pmaps (not in bsd).
+ */
+
+struct pmap *
+pmap_create()
+{
+ struct pmap *pmap;
+ u_int gen;
+
+ XENPRINTF(("pmap_create\n"));
+ pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
+
+ /* init uvm_object */
+ simple_lock_init(&pmap->pm_obj.vmobjlock);
+ pmap->pm_obj.pgops = NULL; /* currently not a mappable object */
+ TAILQ_INIT(&pmap->pm_obj.memq);
+ pmap->pm_obj.uo_npages = 0;
+ pmap->pm_obj.uo_refs = 1;
+ pmap->pm_stats.wired_count = 0;
+ pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */
+ pmap->pm_ptphint = NULL;
+ pmap->pm_hiexec = 0;
+ pmap->pm_flags = 0;
+ pmap->pm_cpus = 0;
+
+ /* init the LDT */
+ pmap->pm_ldt = NULL;
+ pmap->pm_ldt_len = 0;
+ pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+
+ /* allocate PDP */
+
+ /*
+ * we need to lock pmaps_lock to prevent nkpde from changing on
+ * us. note that there is no need to splvm to protect us from
+ * malloc since malloc allocates out of a submap and we should
+ * have already allocated kernel PTPs to cover the range...
+ *
+ * NOTE: WE MUST NOT BLOCK WHILE HOLDING THE `pmap_lock', nor
+ * must we call pmap_growkernel() while holding it!
+ */
+
+ try_again:
+ gen = pmap_pdp_cache_generation;
+ pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
+
+ simple_lock(&pmaps_lock);
+
+ if (gen != pmap_pdp_cache_generation) {
+ simple_unlock(&pmaps_lock);
+ pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
+ goto try_again;
+ }
+
+ pmap->pm_pdirpa = PDE_GET(&pmap->pm_pdir[PDSLOT_PTE]) & PG_FRAME;
+ XENPRINTF(("pmap_create %p set pm_pdirpa %p/%p slotval %p\n", pmap,
+ (void *)pmap->pm_pdirpa,
+ (void *)xpmap_ptom(pmap->pm_pdirpa),
+ (void *)pmap->pm_pdir[PDSLOT_PTE]));
+
+ LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
+
+ simple_unlock(&pmaps_lock);
+
+ return (pmap);
+}
+
+/*
+ * pmap_destroy: drop reference count on pmap. free pmap if
+ * reference count goes to zero.
+ */
+
+void
+pmap_destroy(pmap)
+ struct pmap *pmap;
+{
+ int refs;
+#ifdef DIAGNOSTIC
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+#endif /* DIAGNOSTIC */
+
+ /*
+ * drop reference count
+ */
+
+ simple_lock(&pmap->pm_obj.vmobjlock);
+ refs = --pmap->pm_obj.uo_refs;
+ simple_unlock(&pmap->pm_obj.vmobjlock);
+ if (refs > 0) {
+ return;
+ }
+
+#ifdef DIAGNOSTIC
+ for (CPU_INFO_FOREACH(cii, ci))
+ if (ci->ci_pmap == pmap)
+ panic("destroying pmap being used");
+#endif /* DIAGNOSTIC */
+
+ /*
+ * reference count is zero, free pmap resources and then free pmap.
+ */
+
+ XENPRINTF(("pmap_destroy %p pm_pdirpa %p/%p\n", pmap,
+ (void *)pmap->pm_pdirpa,
+ (void *)xpmap_ptom(pmap->pm_pdirpa)));
+
+ /*
+ * remove it from global list of pmaps
+ */
+
+ simple_lock(&pmaps_lock);
+ LIST_REMOVE(pmap, pm_list);
+ simple_unlock(&pmaps_lock);
+
+ /*
+ * destroyed pmap shouldn't have remaining PTPs
+ */
+
+ KASSERT(pmap->pm_obj.uo_npages == 0);
+ KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq));
+
+ /*
+ * MULTIPROCESSOR -- no need to flush out of other processors'
+ * APTE space because we do that in pmap_unmap_ptes().
+ */
+ pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
+
+#ifdef USER_LDT
+ if (pmap->pm_flags & PMF_USER_LDT) {
+ /*
+ * no need to switch the LDT; this address space is gone,
+ * nothing is using it.
+ *
+ * No need to lock the pmap for ldt_free (or anything else),
+ * we're the last one to use it.
+ */
+ ldt_free(pmap);
+ uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
+ pmap->pm_ldt_len * sizeof(union descriptor));
+ }
+#endif
+
+ pool_put(&pmap_pmap_pool, pmap);
+}
+
+/*
+ * Add a reference to the specified pmap.
+ */
+
+void
+pmap_reference(pmap)
+ struct pmap *pmap;
+{
+ simple_lock(&pmap->pm_obj.vmobjlock);
+ pmap->pm_obj.uo_refs++;
+ simple_unlock(&pmap->pm_obj.vmobjlock);
+}
+
+#if defined(PMAP_FORK)
+/*
+ * pmap_fork: perform any necessary data structure manipulation when
+ * a VM space is forked.
+ */
+
+void
+pmap_fork(pmap1, pmap2)
+ struct pmap *pmap1, *pmap2;
+{
+ simple_lock(&pmap1->pm_obj.vmobjlock);
+ simple_lock(&pmap2->pm_obj.vmobjlock);
+
+#ifdef USER_LDT
+ /* Copy the LDT, if necessary. */
+ if (pmap1->pm_flags & PMF_USER_LDT) {
+ union descriptor *new_ldt;
+ size_t len;
+
+ len = pmap1->pm_ldt_len * sizeof(union descriptor);
+ new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len);
+ memcpy(new_ldt, pmap1->pm_ldt, len);
+ pmap2->pm_ldt = new_ldt;
+ pmap2->pm_ldt_len = pmap1->pm_ldt_len;
+ pmap2->pm_flags |= PMF_USER_LDT;
+ ldt_alloc(pmap2, new_ldt, len);
+ }
+#endif /* USER_LDT */
+
+ simple_unlock(&pmap2->pm_obj.vmobjlock);
+ simple_unlock(&pmap1->pm_obj.vmobjlock);
+}
+#endif /* PMAP_FORK */
+
+#ifdef USER_LDT
+/*
+ * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
+ * restore the default.
+ */
+
+void
+pmap_ldt_cleanup(l)
+ struct lwp *l;
+{
+ struct pcb *pcb = &l->l_addr->u_pcb;
+ pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
+ union descriptor *old_ldt = NULL;
+ size_t len = 0;
+
+ simple_lock(&pmap->pm_obj.vmobjlock);
+
+ if (pmap->pm_flags & PMF_USER_LDT) {
+ ldt_free(pmap);
+ pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+ pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
+ if (pcb == curpcb)
+ lldt(pcb->pcb_ldt_sel);
+ old_ldt = pmap->pm_ldt;
+ len = pmap->pm_ldt_len * sizeof(union descriptor);
+ pmap->pm_ldt = NULL;
+ pmap->pm_ldt_len = 0;
+ pmap->pm_flags &= ~PMF_USER_LDT;
+ }
+
+ simple_unlock(&pmap->pm_obj.vmobjlock);
+
+ if (old_ldt != NULL)
+ uvm_km_free(kernel_map, (vaddr_t)old_ldt, len);
+}
+#endif /* USER_LDT */
+
+/*
+ * pmap_activate: activate a process' pmap
+ *
+ * => called from cpu_switch()
+ * => if lwp is the curlwp, then set ci_want_pmapload so that
+ * actual MMU context switch will be done by pmap_load() later
+ */
+
+void
+pmap_activate(l)
+ struct lwp *l;
+{
+ struct cpu_info *ci = curcpu();
+ struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+
+ if (l == ci->ci_curlwp) {
+ struct pcb *pcb;
+
+ KASSERT(ci->ci_want_pmapload == 0);
+ KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
+#ifdef KSTACK_CHECK_DR0
+ /*
+ * setup breakpoint on the top of stack
+ */
+ if (l == &lwp0)
+ dr0(0, 0, 0, 0);
+ else
+ dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
+#endif
+
+ /*
+ * no need to switch to kernel vmspace because
+ * it's a subset of any vmspace.
+ */
+
+ if (pmap == pmap_kernel()) {
+ ci->ci_want_pmapload = 0;
+ return;
+ }
+
+ pcb = &l->l_addr->u_pcb;
+ pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
+
+ ci->ci_want_pmapload = 1;
+ }
+}
+
+/*
+ * pmap_reactivate: try to regain reference to the pmap.
+ */
+
+static boolean_t
+pmap_reactivate(struct pmap *pmap)
+{
+ struct cpu_info *ci = curcpu();
+ u_int32_t cpumask = 1U << ci->ci_cpuid;
+ int s;
+ boolean_t result;
+ u_int32_t oldcpus;
+
+ /*
+ * if we still have a lazy reference to this pmap,
+ * we can assume that there was no tlb shootdown
+ * for this pmap in the meantime.
+ */
+
+ s = splipi(); /* protect from tlb shootdown ipis. */
+ oldcpus = pmap->pm_cpus;
+ x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
+ if (oldcpus & cpumask) {
+ KASSERT(ci->ci_tlbstate == TLBSTATE_LAZY);
+ /* got it */
+ result = TRUE;
+ } else {
+ KASSERT(ci->ci_tlbstate == TLBSTATE_STALE);
+ result = FALSE;
+ }
+ ci->ci_tlbstate = TLBSTATE_VALID;
+ splx(s);
+
+ return result;
+}
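+
+/*
+ * the ci_tlbstate values used above and in pmap_load()/pmap_deactivate2()
+ * form a small per-CPU state machine (a summary of the code, not a new
+ * mechanism):
+ *
+ *	TLBSTATE_VALID	pmap loaded on this CPU, TLB assumed in sync
+ *	TLBSTATE_LAZY	deactivated, but our bit is still set in pm_cpus;
+ *			pmap_reactivate() can take it back without a flush
+ *	TLBSTATE_STALE	our pm_cpus bit was cleared while we were lazy;
+ *			pmap_load() must tlbflush() before reusing the pmap
+ */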
+
+/*
+ * pmap_load: actually switch pmap. (fill in %cr3 and LDT info)
+ */
+
+void
+pmap_load()
+{
+ struct cpu_info *ci = curcpu();
+ u_int32_t cpumask = 1U << ci->ci_cpuid;
+ struct pmap *pmap;
+ struct pmap *oldpmap;
+ struct lwp *l;
+ struct pcb *pcb;
+ pd_entry_t *mapdp;
+ int s;
+
+ KASSERT(ci->ci_want_pmapload);
+
+ l = ci->ci_curlwp;
+ KASSERT(l != NULL);
+ pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+ KASSERT(pmap != pmap_kernel());
+ oldpmap = ci->ci_pmap;
+
+ pcb = ci->ci_curpcb;
+ KASSERT(pcb == &l->l_addr->u_pcb);
+ /* loaded by pmap_activate */
+ KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);
+
+ if (pmap == oldpmap) {
+ if (!pmap_reactivate(pmap)) {
+
+ /*
+ * the pmap has been changed while it was deactivated.
+ * our tlb may be stale.
+ */
+
+ tlbflush();
+ }
+
+ ci->ci_want_pmapload = 0;
+ return;
+ }
+
+ /*
+ * actually switch pmap.
+ */
+
+ x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask);
+
+ KASSERT((pmap->pm_cpus & cpumask) == 0);
+
+ KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
+ pmap_reference(pmap);
+ KERNEL_UNLOCK();
+
+ /*
+ * mark the pmap in use by this processor.
+ */
+
+ s = splipi();
+ x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
+ ci->ci_pmap = pmap;
+ ci->ci_tlbstate = TLBSTATE_VALID;
+ splx(s);
+
+ /*
+ * clear apdp slot before loading %cr3 since Xen only allows
+ * linear pagetable mappings in the current pagetable.
+ */
+ KDASSERT(curapdp == 0);
+ mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
+ PDE_CLEAR(APDP_PDE, mapdp);
+
+ /*
+ * update tss and load corresponding registers.
+ */
+
+ lldt(pcb->pcb_ldt_sel);
+ pcb->pcb_cr3 = pmap->pm_pdirpa;
+ lcr3(pcb->pcb_cr3);
+
+ ci->ci_want_pmapload = 0;
+
+ KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
+ pmap_destroy(oldpmap);
+ KERNEL_UNLOCK();
+}
+
+/*
+ * pmap_deactivate: deactivate a process' pmap
+ */
+
+void
+pmap_deactivate(l)
+ struct lwp *l;
+{
+
+ if (l == curlwp)
+ pmap_deactivate2(l);
+}
+
+/*
+ * pmap_deactivate2: context switch version of pmap_deactivate.
+ * always treat l as curlwp.
+ */
+
+void
+pmap_deactivate2(l)
+ struct lwp *l;
+{
+ struct pmap *pmap;
+ struct cpu_info *ci = curcpu();
+
+ if (ci->ci_want_pmapload) {
+ KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
+ != pmap_kernel());
+ KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
+ != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
+
+ /*
+ * userspace has not been touched.
+ * nothing to do here.
+ */
+
+ ci->ci_want_pmapload = 0;
+ return;
+ }
+
+ pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+
+ if (pmap == pmap_kernel()) {
+ return;
+ }
+
+ KASSERT(ci->ci_pmap == pmap);
+
+ KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
+ ci->ci_tlbstate = TLBSTATE_LAZY;
+ XENPRINTF(("pmap_deactivate %p ebp %p esp %p\n",
+ l, (void *)l->l_addr->u_pcb.pcb_ebp,
+ (void *)l->l_addr->u_pcb.pcb_esp));
+}
+
+/*
+ * end of lifecycle functions
+ */
+
+/*
+ * some misc. functions
+ */
+
+/*
+ * pmap_extract: extract a PA for the given VA
+ */
+
+boolean_t
+pmap_extract(pmap, va, pap)
+ struct pmap *pmap;
+ vaddr_t va;
+ paddr_t *pap;
+{
+ pt_entry_t *ptes, pte;
+ pd_entry_t pde;
+
+ if (__predict_true((pde = PDE_GET(&pmap->pm_pdir[pdei(va)])) != 0)) {
+#ifdef LARGEPAGES
+ if (pde & PG_PS) {
+ if (pap != NULL)
+ *pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME);
+ return (TRUE);
+ }
+#endif
+
+ ptes = pmap_map_ptes(pmap);
+ pte = PTE_GET(&ptes[x86_btop(va)]);
+ pmap_unmap_ptes(pmap);
+
+ if (__predict_true((pte & PG_V) != 0)) {
+ if (pap != NULL)
+ *pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
+ return (TRUE);
+ }
+ }
+ return (FALSE);
+}
+
+
+/*
+ * vtophys: virtual address to physical address. For use by
+ * machine-dependent code only.
+ */
+
+paddr_t
+vtophys(va)
+ vaddr_t va;
+{
+ paddr_t pa;
+
+ if (pmap_extract(pmap_kernel(), va, &pa) == TRUE)
+ return (pa);
+ return (0);
+}
+
+
+/*
+ * pmap_virtual_space: used during bootup [pmap_steal_memory] to
+ * determine the bounds of the kernel virtual address space.
+ */
+
+void
+pmap_virtual_space(startp, endp)
+ vaddr_t *startp;
+ vaddr_t *endp;
+{
+ *startp = virtual_avail;
+ *endp = virtual_end;
+}
+
+/*
+ * pmap_map: map a range of PAs into kvm
+ *
+ * => used during crash dump
+ * => XXX: pmap_map() should be phased out?
+ */
+
+vaddr_t
+pmap_map(va, spa, epa, prot)
+ vaddr_t va;
+ paddr_t spa, epa;
+ vm_prot_t prot;
+{
+ while (spa < epa) {
+ pmap_enter(pmap_kernel(), va, spa, prot, 0);
+ va += PAGE_SIZE;
+ spa += PAGE_SIZE;
+ }
+ pmap_update(pmap_kernel());
+ return va;
+}
+
+/*
+ * pmap_zero_page: zero a page
+ */
+
+void
+pmap_zero_page(pa)
+ paddr_t pa;
+{
+#ifdef MULTIPROCESSOR
+ int id = cpu_number();
+#endif
+ pt_entry_t *zpte = PTESLEW(zero_pte, id);
+ pt_entry_t *maptp;
+ caddr_t zerova = VASLEW(zerop, id);
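+	/*
+	 * zpte/zerova select this CPU's private temporary-mapping slot
+	 * (PTESLEW/VASLEW), so concurrent zeroing on other CPUs does
+	 * not collide.
+	 */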
+
+#ifdef DIAGNOSTIC
+ if (PTE_GET(zpte))
+ panic("pmap_zero_page: lock botch");
+#endif
+
+ maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
+ PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW); /* map in */
+ pmap_update_pg((vaddr_t)zerova); /* flush TLB */
+
+ memset(zerova, 0, PAGE_SIZE); /* zero */
+ PTE_CLEAR(zpte, maptp); /* zap! */
+}
+
+/*
+ * pmap_pageidlezero: the same, for the idle loop page zeroer.
+ * Returns TRUE if the page was zero'd, FALSE if we aborted for
+ * some reason.
+ */
+
+boolean_t
+pmap_pageidlezero(pa)
+ paddr_t pa;
+{
+#ifdef MULTIPROCESSOR
+ int id = cpu_number();
+#endif
+ pt_entry_t *zpte = PTESLEW(zero_pte, id);
+ pt_entry_t *maptp;
+ caddr_t zerova = VASLEW(zerop, id);
+ boolean_t rv = TRUE;
+ int i, *ptr;
+
+#ifdef DIAGNOSTIC
+ if (PTE_GET(zpte))
+ panic("pmap_zero_page_uncached: lock botch");
+#endif
+ maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
+ PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW); /* map in */
+ pmap_update_pg((vaddr_t)zerova); /* flush TLB */
+ for (i = 0, ptr = (int *) zerova; i < PAGE_SIZE / sizeof(int); i++) {
+ if (sched_whichqs != 0) {
+
+ /*
+ * A process has become ready. Abort now,
+ * so we don't keep it waiting while we
+ * do slow memory access to finish this
+ * page.
+ */
+
+ rv = FALSE;
+ break;
+ }
+ *ptr++ = 0;
+ }
+
+ PTE_CLEAR(zpte, maptp); /* zap! */
+ return (rv);
+}
+
+/*
+ * pmap_copy_page: copy a page
+ */
+
+void
+pmap_copy_page(srcpa, dstpa)
+ paddr_t srcpa, dstpa;
+{
+#ifdef MULTIPROCESSOR
+ int id = cpu_number();
+#endif
+ pt_entry_t *spte = PTESLEW(csrc_pte,id), *maspte;
+ pt_entry_t *dpte = PTESLEW(cdst_pte,id), *madpte;
+ caddr_t csrcva = VASLEW(csrcp, id);
+ caddr_t cdstva = VASLEW(cdstp, id);
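+	/*
+	 * csrcva/cdstva are this CPU's private source and destination
+	 * windows, analogous to the slot used in pmap_zero_page above.
+	 */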
+
+#ifdef DIAGNOSTIC
+ if (PTE_GET(spte) || PTE_GET(dpte))
+ panic("pmap_copy_page: lock botch");
+#endif
+
+ maspte = (pt_entry_t *)vtomach((vaddr_t)spte);
+ madpte = (pt_entry_t *)vtomach((vaddr_t)dpte);
+ PTE_SET(spte, maspte, (srcpa & PG_FRAME) | PG_V | PG_RW);
+ PTE_SET(dpte, madpte, (dstpa & PG_FRAME) | PG_V | PG_RW);
+ pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
+ memcpy(cdstva, csrcva, PAGE_SIZE);
+ PTE_CLEAR(spte, maspte); /* zap! */
+ PTE_CLEAR(dpte, madpte); /* zap! */
+}
+
+/*
+ * p m a p r e m o v e f u n c t i o n s
+ *
+ * functions that remove mappings
+ */
+
+/*
+ * pmap_remove_ptes: remove PTEs from a PTP
+ *
+ * => must have proper locking on pmap_master_lock
+ * => caller must hold pmap's lock
+ * => PTP must be mapped into KVA
+ * => PTP should be null if pmap == pmap_kernel()
+ */
+
+static void
+pmap_remove_ptes(pmap, ptp, ptpva, startva, endva, cpumaskp, flags)
+ struct pmap *pmap;
+ struct vm_page *ptp;
+ vaddr_t ptpva;
+ vaddr_t startva, endva;
+ int32_t *cpumaskp;
+ int flags;
+{
+ struct pv_entry *pv_tofree = NULL; /* list of pv_entrys to free */
+ struct pv_entry *pve;
+ pt_entry_t *pte = (pt_entry_t *) ptpva;
+ pt_entry_t opte;
+ pt_entry_t *maptp;
+
+ /*
+ * note that ptpva points to the PTE that maps startva. this may
+ * or may not be the first PTE in the PTP.
+ *
+ * we loop through the PTP while there are still PTEs to look at
+ * and the wire_count is greater than 1 (because we use the wire_count
+ * to keep track of the number of real PTEs in the PTP).
+ */
+
+ for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
+ ; pte++, startva += PAGE_SIZE) {
+ struct vm_page *pg;
+ struct vm_page_md *mdpg;
+
+ if (!pmap_valid_entry(*pte))
+ continue; /* VA not mapped */
+ if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
+ continue;
+ }
+
+ /* atomically save the old PTE and zap! it */
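+		/*
+		 * maptp is the machine address of the PTE; the Xen-aware
+		 * PTE update helpers take it alongside the virtual address
+		 * so the update can be applied through the hypervisor.
+		 */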
+ maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+ opte = pte_atomic_update(pte, maptp, 0);
+ pmap_exec_account(pmap, startva, opte, 0);
+
+ if (opte & PG_W)
+ pmap->pm_stats.wired_count--;
+ pmap->pm_stats.resident_count--;
+
+ if (opte & PG_U)
+ pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
+
+ if (ptp) {
+ ptp->wire_count--; /* dropping a PTE */
+ /* Make sure that the PDE is flushed */
+ if ((ptp->wire_count <= 1) && !(opte & PG_U))
+ pmap_tlb_shootdown(pmap, startva, opte,
+ cpumaskp);
+ }
+
+ /*
+ * if we are not on a pv_head list we are done.
+ */
+
+ if ((opte & PG_PVLIST) == 0) {
+#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
+ if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
+ panic("pmap_remove_ptes: managed page without "
+ "PG_PVLIST for 0x%lx", startva);
+#endif
+ continue;
+ }
+
+ pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+ if (pg == NULL)
+ panic("pmap_remove_ptes: unmanaged page marked "
+ "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
+ startva, (u_long)(opte & PG_FRAME));
+#endif
+ mdpg = &pg->mdpage;
+
+ /* sync R/M bits */
+ simple_lock(&mdpg->mp_pvhead.pvh_lock);
+ mdpg->mp_attrs |= (opte & (PG_U|PG_M));
+ pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva);
+ simple_unlock(&mdpg->mp_pvhead.pvh_lock);
+
+ if (pve) {
+ SPLAY_RIGHT(pve, pv_node) = pv_tofree;
+ pv_tofree = pve;
+ }
+
+ /* end of "for" loop: time for next pte */
+ }
+ if (pv_tofree)
+ pmap_free_pvs(pmap, pv_tofree);
+}
+
+
+/*
+ * pmap_remove_pte: remove a single PTE from a PTP
+ *
+ * => must have proper locking on pmap_master_lock
+ * => caller must hold pmap's lock
+ * => PTP must be mapped into KVA
+ * => PTP should be null if pmap == pmap_kernel()
+ * => returns true if we removed a mapping
+ */
+
+static boolean_t
+pmap_remove_pte(pmap, ptp, pte, va, cpumaskp, flags)
+ struct pmap *pmap;
+ struct vm_page *ptp;
+ pt_entry_t *pte;
+ vaddr_t va;
+ int32_t *cpumaskp;
+ int flags;
+{
+ pt_entry_t opte;
+ pt_entry_t *maptp;
+ struct pv_entry *pve;
+ struct vm_page *pg;
+ struct vm_page_md *mdpg;
+
+ if (!pmap_valid_entry(*pte))
+ return(FALSE); /* VA not mapped */
+ if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
+ return(FALSE);
+ }
+
+ /* atomically save the old PTE and zap! it */
+ maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+ opte = pte_atomic_update(pte, maptp, 0);
+
+ XENPRINTK(("pmap_remove_pte %p, was %08x\n", pte, opte));
+ pmap_exec_account(pmap, va, opte, 0);
+
+ if (opte & PG_W)
+ pmap->pm_stats.wired_count--;
+ pmap->pm_stats.resident_count--;
+
+ if (opte & PG_U)
+ pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
+
+ if (ptp) {
+ ptp->wire_count--; /* dropping a PTE */
+ /* Make sure that the PDE is flushed */
+ if ((ptp->wire_count <= 1) && !(opte & PG_U))
+ pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
+
+ }
+ /*
+ * if we are not on a pv_head list we are done.
+ */
+
+ if ((opte & PG_PVLIST) == 0) {
+#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
+ if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
+ panic("pmap_remove_pte: managed page without "
+ "PG_PVLIST for 0x%lx", va);
+#endif
+ return(TRUE);
+ }
+
+ pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+ if (pg == NULL)
+ panic("pmap_remove_pte: unmanaged page marked "
+ "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
+ (u_long)(opte & PG_FRAME));
+#endif
+ mdpg = &pg->mdpage;
+
+ /* sync R/M bits */
+ simple_lock(&mdpg->mp_pvhead.pvh_lock);
+ mdpg->mp_attrs |= (opte & (PG_U|PG_M));
+ pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va);
+ simple_unlock(&mdpg->mp_pvhead.pvh_lock);
+
+ if (pve)
+ pmap_free_pv(pmap, pve);
+ return(TRUE);
+}
+
+/*
+ * pmap_remove: top level mapping removal function
+ *
+ * => caller should not be holding any pmap locks
+ */
+
+void
+pmap_remove(pmap, sva, eva)
+ struct pmap *pmap;
+ vaddr_t sva, eva;
+{
+ pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
+}
+
+/*
+ * pmap_do_remove: mapping removal guts
+ *
+ * => caller should not be holding any pmap locks
+ */
+
+static void
+pmap_do_remove(pmap, sva, eva, flags)
+ struct pmap *pmap;
+ vaddr_t sva, eva;
+ int flags;
+{
+ pt_entry_t *ptes, opte;
+ pt_entry_t *maptp;
+ boolean_t result;
+ paddr_t ptppa;
+ vaddr_t blkendva;
+ struct vm_page *ptp;
+ int32_t cpumask = 0;
+ TAILQ_HEAD(, vm_page) empty_ptps;
+ struct cpu_info *ci;
+ struct pmap *curpmap;
+
+ /*
+ * we lock in the pmap => pv_head direction
+ */
+
+ TAILQ_INIT(&empty_ptps);
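+	/*
+	 * PTPs that become empty are collected on empty_ptps and only
+	 * handed back to UVM after the TLB shootdown below, so no CPU
+	 * can still hold a stale translation for a freed page table page.
+	 */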
+
+ PMAP_MAP_TO_HEAD_LOCK();
+
+ ptes = pmap_map_ptes(pmap); /* locks pmap */
+
+ ci = curcpu();
+ curpmap = ci->ci_pmap;
+
+ /*
+ * removing one page? take shortcut function.
+ */
+
+ if (sva + PAGE_SIZE == eva) {
+ if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) {
+
+ /* PA of the PTP */
+ ptppa = PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME;
+
+ /* get PTP if non-kernel mapping */
+ if (pmap == pmap_kernel()) {
+ /* we never free kernel PTPs */
+ ptp = NULL;
+ } else {
+ if (pmap->pm_ptphint &&
+ VM_PAGE_TO_PHYS(pmap->pm_ptphint) ==
+ ptppa) {
+ ptp = pmap->pm_ptphint;
+ } else {
+ ptp = PHYS_TO_VM_PAGE(ptppa);
+#ifdef DIAGNOSTIC
+ if (ptp == NULL)
+ panic("pmap_remove: unmanaged "
+ "PTP detected");
+#endif
+ }
+ }
+
+ /* do it! */
+ result = pmap_remove_pte(pmap, ptp,
+ &ptes[x86_btop(sva)], sva, &cpumask, flags);
+
+ /*
+ * if mapping removed and the PTP is no longer
+ * being used, free it!
+ */
+
+ if (result && ptp && ptp->wire_count <= 1) {
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&pmap->pm_pdir[pdei(sva)]);
+ PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
+ maptp, opte);
+#if defined(MULTIPROCESSOR)
+ /*
+ * XXXthorpej Redundant shootdown can happen
+ * here if we're using APTE space.
+ */
+#endif
+ pmap_tlb_shootdown(curpmap,
+ ((vaddr_t)ptes) + ptp->offset, opte,
+ &cpumask);
+#if defined(MULTIPROCESSOR)
+ /*
+ * Always shoot down the pmap's self-mapping
+ * of the PTP.
+ * XXXthorpej Redundant shootdown can happen
+ * here if pmap == curpmap (not APTE space).
+ */
+ pmap_tlb_shootdown(pmap,
+ ((vaddr_t)PTE_BASE) + ptp->offset, opte,
+ &cpumask);
+#endif
+ pmap->pm_stats.resident_count--;
+ if (pmap->pm_ptphint == ptp)
+ pmap->pm_ptphint =
+ TAILQ_FIRST(&pmap->pm_obj.memq);
+ ptp->wire_count = 0;
+ ptp->flags |= PG_ZERO;
+ uvm_pagerealloc(ptp, NULL, 0);
+ TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
+ }
+ }
+ pmap_tlb_shootnow(cpumask);
+ pmap_unmap_ptes(pmap); /* unlock pmap */
+ PMAP_MAP_TO_HEAD_UNLOCK();
+ /* Now we can free unused ptps */
+ TAILQ_FOREACH(ptp, &empty_ptps, listq)
+ uvm_pagefree(ptp);
+ return;
+ }
+
+ cpumask = 0;
+
+ for (/* null */ ; sva < eva ; sva = blkendva) {
+
+ /* determine range of block */
+ blkendva = x86_round_pdr(sva+1);
+ if (blkendva > eva)
+ blkendva = eva;
+
+ /*
+ * XXXCDC: our PTE mappings should never be removed
+ * with pmap_remove! if we allow this (and why would
+ * we?) then we end up freeing the pmap's page
+ * directory page (PDP) before we are finished using
+		 * it when we hit it in the recursive mapping. this
+		 * is BAD.
+		 *
+		 * long term solution is to move the PTEs out of user
+		 * address space and into kernel address space (up
+		 * with APTE). then we can set VM_MAXUSER_ADDRESS to
+ * be VM_MAX_ADDRESS.
+ */
+
+ if (pdei(sva) == PDSLOT_PTE)
+ /* XXXCDC: ugly hack to avoid freeing PDP here */
+ continue;
+
+ if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
+ /* valid block? */
+ continue;
+
+ /* PA of the PTP */
+ ptppa = (PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME);
+
+ /* get PTP if non-kernel mapping */
+ if (pmap == pmap_kernel()) {
+ /* we never free kernel PTPs */
+ ptp = NULL;
+ } else {
+ if (pmap->pm_ptphint &&
+ VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
+ ptp = pmap->pm_ptphint;
+ } else {
+ ptp = PHYS_TO_VM_PAGE(ptppa);
+#ifdef DIAGNOSTIC
+ if (ptp == NULL)
+ panic("pmap_remove: unmanaged PTP "
+ "detected");
+#endif
+ }
+ }
+ pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[x86_btop(sva)],
+ sva, blkendva, &cpumask, flags);
+
+ /* if PTP is no longer being used, free it! */
+ if (ptp && ptp->wire_count <= 1) {
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&pmap->pm_pdir[pdei(sva)]);
+ PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
+ maptp, opte);
+#if defined(MULTIPROCESSOR)
+ /*
+ * XXXthorpej Redundant shootdown can happen here
+ * if we're using APTE space.
+ */
+#endif
+ pmap_tlb_shootdown(curpmap,
+ ((vaddr_t)ptes) + ptp->offset, opte, &cpumask);
+#if defined(MULTIPROCESSOR)
+ /*
+ * Always shoot down the pmap's self-mapping
+ * of the PTP.
+ * XXXthorpej Redundant shootdown can happen here
+ * if pmap == curpmap (not APTE space).
+ */
+ pmap_tlb_shootdown(pmap,
+ ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask);
+#endif
+ pmap->pm_stats.resident_count--;
+ if (pmap->pm_ptphint == ptp) /* update hint? */
+ pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first;
+ ptp->wire_count = 0;
+ ptp->flags |= PG_ZERO;
+ /* Postpone free to shootdown */
+ uvm_pagerealloc(ptp, NULL, 0);
+ TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
+ }
+ }
+
+ pmap_tlb_shootnow(cpumask);
+ pmap_unmap_ptes(pmap);
+ PMAP_MAP_TO_HEAD_UNLOCK();
+ /* Now we can free unused ptps */
+ TAILQ_FOREACH(ptp, &empty_ptps, listq)
+ uvm_pagefree(ptp);
+}
+
+/*
+ * pmap_page_remove: remove a managed vm_page from all pmaps that map it
+ *
+ * => we set pv_head => pmap locking
+ * => R/M bits are sync'd back to attrs
+ */
+
+void
+pmap_page_remove(pg)
+ struct vm_page *pg;
+{
+ struct pv_head *pvh;
+ struct pv_entry *pve, *npve, *killlist = NULL;
+ pt_entry_t *ptes, opte;
+ pt_entry_t *maptp;
+ int32_t cpumask = 0;
+ TAILQ_HEAD(, vm_page) empty_ptps;
+ struct vm_page *ptp;
+ struct cpu_info *ci;
+ struct pmap *curpmap;
+
+#ifdef DIAGNOSTIC
+ int bank, off;
+
+ bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
+ if (bank == -1)
+ panic("pmap_page_remove: unmanaged page?");
+#endif
+
+ pvh = &pg->mdpage.mp_pvhead;
+ if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
+ return;
+ }
+
+ TAILQ_INIT(&empty_ptps);
+
+ /* set pv_head => pmap locking */
+ PMAP_HEAD_TO_MAP_LOCK();
+
+ ci = curcpu();
+ curpmap = ci->ci_pmap;
+
+ /* XXX: needed if we hold head->map lock? */
+ simple_lock(&pvh->pvh_lock);
+
+ for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) {
+ npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve);
+ ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */
+
+#ifdef DIAGNOSTIC
+ if (pve->pv_ptp &&
+ (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) &
+ PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
+ printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n",
+ pg, pve->pv_va, pve->pv_ptp);
+ printf("pmap_page_remove: PTP's phys addr: "
+ "actual=%lx, recorded=%lx\n",
+ (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])
+ & PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
+ panic("pmap_page_remove: mapped managed page has "
+ "invalid pv_ptp field");
+ }
+#endif
+
+ /* atomically save the old PTE and zap! it */
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&ptes[x86_btop(pve->pv_va)]);
+ opte = pte_atomic_update(&ptes[x86_btop(pve->pv_va)],
+ maptp, 0);
+
+ if (opte & PG_W)
+ pve->pv_pmap->pm_stats.wired_count--;
+ pve->pv_pmap->pm_stats.resident_count--;
+
+ /* Shootdown only if referenced */
+ if (opte & PG_U)
+ pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
+ &cpumask);
+
+ /* sync R/M bits */
+ pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M));
+
+ /* update the PTP reference count. free if last reference. */
+ if (pve->pv_ptp) {
+ pve->pv_ptp->wire_count--;
+ if (pve->pv_ptp->wire_count <= 1) {
+ /*
+				 * Do we have to shoot the mapping down just to
+				 * get the PTE out of the TLB?
+				 */
+				if (!(opte & PG_U))
+ pmap_tlb_shootdown(pve->pv_pmap,
+ pve->pv_va, opte, &cpumask);
+
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach((vaddr_t)
+ &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]);
+ PTE_ATOMIC_CLEAR(&pve->pv_pmap->pm_pdir
+ [pdei(pve->pv_va)], maptp, opte);
+ pmap_tlb_shootdown(curpmap,
+ ((vaddr_t)ptes) + pve->pv_ptp->offset,
+ opte, &cpumask);
+#if defined(MULTIPROCESSOR)
+ /*
+ * Always shoot down the other pmap's
+ * self-mapping of the PTP.
+ */
+ pmap_tlb_shootdown(pve->pv_pmap,
+ ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset,
+ opte, &cpumask);
+#endif
+ pve->pv_pmap->pm_stats.resident_count--;
+ /* update hint? */
+ if (pve->pv_pmap->pm_ptphint == pve->pv_ptp)
+ pve->pv_pmap->pm_ptphint =
+ pve->pv_pmap->pm_obj.memq.tqh_first;
+ pve->pv_ptp->wire_count = 0;
+ pve->pv_ptp->flags |= PG_ZERO;
+ /* Free only after the shootdown */
+ uvm_pagerealloc(pve->pv_ptp, NULL, 0);
+ TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp,
+ listq);
+ }
+ }
+ pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */
+ SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */
+ SPLAY_RIGHT(pve, pv_node) = killlist; /* mark it for death */
+ killlist = pve;
+ }
+ pmap_free_pvs(NULL, killlist);
+ simple_unlock(&pvh->pvh_lock);
+ PMAP_HEAD_TO_MAP_UNLOCK();
+ pmap_tlb_shootnow(cpumask);
+
+ /* Now we can free unused ptps */
+ TAILQ_FOREACH(ptp, &empty_ptps, listq)
+ uvm_pagefree(ptp);
+}
+
+/*
+ * p m a p a t t r i b u t e f u n c t i o n s
+ * functions that test/change managed page's attributes
+ * since a page can be mapped multiple times we must check each PTE that
+ * maps it by going down the pv lists.
+ */
+
+/*
+ * pmap_test_attrs: test a page's attributes
+ *
+ * => we set pv_head => pmap locking
+ */
+
+boolean_t
+pmap_test_attrs(pg, testbits)
+ struct vm_page *pg;
+ int testbits;
+{
+ struct vm_page_md *mdpg;
+ int *myattrs;
+ struct pv_head *pvh;
+ struct pv_entry *pve;
+ volatile pt_entry_t *ptes;
+ pt_entry_t pte;
+
+#if DIAGNOSTIC
+ int bank, off;
+
+ bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
+ if (bank == -1)
+ panic("pmap_test_attrs: unmanaged page?");
+#endif
+ mdpg = &pg->mdpage;
+
+ /*
+ * before locking: see if attributes are already set and if so,
+ * return!
+ */
+
+ myattrs = &mdpg->mp_attrs;
+ if (*myattrs & testbits)
+ return(TRUE);
+
+ /* test to see if there is a list before bothering to lock */
+ pvh = &mdpg->mp_pvhead;
+ if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
+ return(FALSE);
+ }
+
+ /* nope, gonna have to do it the hard way */
+ PMAP_HEAD_TO_MAP_LOCK();
+ /* XXX: needed if we hold head->map lock? */
+ simple_lock(&pvh->pvh_lock);
+
+ for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root);
+ pve != NULL && (*myattrs & testbits) == 0;
+ pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) {
+ ptes = pmap_map_ptes(pve->pv_pmap);
+ pte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); /* XXX flags only? */
+ pmap_unmap_ptes(pve->pv_pmap);
+ *myattrs |= pte;
+ }
+
+ /*
+ * note that we will exit the for loop with a non-null pve if
+ * we have found the bits we are testing for.
+ */
+
+ simple_unlock(&pvh->pvh_lock);
+ PMAP_HEAD_TO_MAP_UNLOCK();
+ return((*myattrs & testbits) != 0);
+}
+
+/*
+ * pmap_clear_attrs: clear the specified attribute for a page.
+ *
+ * => we set pv_head => pmap locking
+ * => we return TRUE if we cleared one of the bits we were asked to
+ */
+
+boolean_t
+pmap_clear_attrs(pg, clearbits)
+ struct vm_page *pg;
+ int clearbits;
+{
+ struct vm_page_md *mdpg;
+ u_int32_t result;
+ struct pv_head *pvh;
+ struct pv_entry *pve;
+ pt_entry_t *ptes, opte;
+ pt_entry_t *maptp;
+ int *myattrs;
+ int32_t cpumask = 0;
+
+#ifdef DIAGNOSTIC
+ int bank, off;
+
+ bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
+ if (bank == -1)
+ panic("pmap_change_attrs: unmanaged page?");
+#endif
+ mdpg = &pg->mdpage;
+
+ PMAP_HEAD_TO_MAP_LOCK();
+ pvh = &mdpg->mp_pvhead;
+ /* XXX: needed if we hold head->map lock? */
+ simple_lock(&pvh->pvh_lock);
+
+ myattrs = &mdpg->mp_attrs;
+ result = *myattrs & clearbits;
+ *myattrs &= ~clearbits;
+
+ SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) {
+#ifdef DIAGNOSTIC
+ if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]))
+ panic("pmap_change_attrs: mapping without PTP "
+ "detected");
+#endif
+
+ ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */
+ opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
+ if (opte & clearbits) {
+ /* We need to do something */
+ if (clearbits == PG_RW) {
+ result |= PG_RW;
+
+ /*
+ * On write protect we might not need to flush
+ * the TLB
+ */
+
+ /* First zap the RW bit! */
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&ptes[x86_btop(pve->pv_va)]);
+ PTE_ATOMIC_CLEARBITS(
+ &ptes[x86_btop(pve->pv_va)],
+ maptp, PG_RW);
+ opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
+
+ /*
+				 * Then test whether it might still be cached
+				 * as RW in the TLB.
+ */
+ if (!(opte & PG_M))
+ goto no_tlb_shootdown;
+ }
+
+ /*
+			 * Since we need a shootdown we might as well
+			 * always clear PG_U and PG_M.
+ */
+
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&ptes[x86_btop(pve->pv_va)]);
+ PTE_ATOMIC_SET(&ptes[x86_btop(pve->pv_va)], maptp,
+ (opte & ~(PG_U | PG_M)), opte);
+
+ result |= (opte & clearbits);
+ *myattrs |= (opte & ~(clearbits));
+
+ pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
+ &cpumask);
+ }
+no_tlb_shootdown:
+ pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */
+ }
+
+ simple_unlock(&pvh->pvh_lock);
+ PMAP_HEAD_TO_MAP_UNLOCK();
+
+ pmap_tlb_shootnow(cpumask);
+ return(result != 0);
+}
+
+
+/*
+ * p m a p p r o t e c t i o n f u n c t i o n s
+ */
+
+/*
+ * pmap_page_protect: change the protection of all recorded mappings
+ * of a managed page
+ *
+ * => NOTE: this is an inline function in pmap.h
+ */
+
+/* see pmap.h */
+
+/*
+ * pmap_protect: set the protection of the pages in a pmap
+ *
+ * => NOTE: this is an inline function in pmap.h
+ */
+
+/* see pmap.h */
+
+/*
+ * pmap_write_protect: write-protect pages in a pmap
+ */
+
+void
+pmap_write_protect(pmap, sva, eva, prot)
+ struct pmap *pmap;
+ vaddr_t sva, eva;
+ vm_prot_t prot;
+{
+ pt_entry_t *ptes, *epte;
+ pt_entry_t *maptp;
+#ifndef XEN
+ volatile
+#endif
+ pt_entry_t *spte;
+ vaddr_t blockend;
+ int32_t cpumask = 0;
+
+ ptes = pmap_map_ptes(pmap); /* locks pmap */
+
+ /* should be ok, but just in case ... */
+ sva &= PG_FRAME;
+ eva &= PG_FRAME;
+
+ for (/* null */ ; sva < eva ; sva = blockend) {
+
+ blockend = (sva & PD_MASK) + NBPD;
+ if (blockend > eva)
+ blockend = eva;
+
+ /*
+ * XXXCDC: our PTE mappings should never be write-protected!
+ *
+ * long term solution is to move the PTEs out of user
+		 * address space and into kernel address space (up
+ * with APTE). then we can set VM_MAXUSER_ADDRESS to
+ * be VM_MAX_ADDRESS.
+ */
+
+ /* XXXCDC: ugly hack to avoid freeing PDP here */
+ if (pdei(sva) == PDSLOT_PTE)
+ continue;
+
+ /* empty block? */
+ if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
+ continue;
+
+#ifdef DIAGNOSTIC
+ if (sva >= VM_MAXUSER_ADDRESS &&
+ sva < VM_MAX_ADDRESS)
+ panic("pmap_write_protect: PTE space");
+#endif
+
+ spte = &ptes[x86_btop(sva)];
+ epte = &ptes[x86_btop(blockend)];
+
+ for (/*null */; spte < epte ; spte++) {
+ if ((PTE_GET(spte) & (PG_RW|PG_V)) == (PG_RW|PG_V)) {
+ maptp = (pt_entry_t *)vtomach((vaddr_t)spte);
+ PTE_ATOMIC_CLEARBITS(spte, maptp, PG_RW);
+ if (PTE_GET(spte) & PG_M)
+ pmap_tlb_shootdown(pmap,
+ x86_ptob(spte - ptes),
+ PTE_GET(spte), &cpumask);
+ }
+ }
+ }
+
+ /*
+	 * if we write-protected any mappings, flush the stale entries from the TLB now
+ */
+
+ pmap_tlb_shootnow(cpumask);
+ pmap_unmap_ptes(pmap); /* unlocks pmap */
+}
+
+/*
+ * end of protection functions
+ */
+
+/*
+ * pmap_unwire: clear the wired bit in the PTE
+ *
+ * => mapping should already be in map
+ */
+
+void
+pmap_unwire(pmap, va)
+ struct pmap *pmap;
+ vaddr_t va;
+{
+ pt_entry_t *ptes;
+ pt_entry_t *maptp;
+
+ if (pmap_valid_entry(pmap->pm_pdir[pdei(va)])) {
+ ptes = pmap_map_ptes(pmap); /* locks pmap */
+
+#ifdef DIAGNOSTIC
+ if (!pmap_valid_entry(ptes[x86_btop(va)]))
+ panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
+#endif
+ if ((ptes[x86_btop(va)] & PG_W) != 0) {
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&ptes[x86_btop(va)]);
+ PTE_ATOMIC_CLEARBITS(&ptes[x86_btop(va)], maptp, PG_W);
+ pmap->pm_stats.wired_count--;
+ }
+#ifdef DIAGNOSTIC
+ else {
+ printf("pmap_unwire: wiring for pmap %p va 0x%lx "
+ "didn't change!\n", pmap, va);
+ }
+#endif
+ pmap_unmap_ptes(pmap); /* unlocks map */
+ }
+#ifdef DIAGNOSTIC
+ else {
+ panic("pmap_unwire: invalid PDE");
+ }
+#endif
+}
+
+/*
+ * pmap_collect: free resources held by a pmap
+ *
+ * => optional function.
+ * => called when a process is swapped out to free memory.
+ */
+
+void
+pmap_collect(pmap)
+ struct pmap *pmap;
+{
+ /*
+ * free all of the pt pages by removing the physical mappings
+ * for its entire address space.
+ */
+
+ pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
+ PMAP_REMOVE_SKIPWIRED);
+}
+
+/*
+ * pmap_copy: copy mappings from one pmap to another
+ *
+ * => optional function
+ * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
+ */
+
+/*
+ * defined as macro in pmap.h
+ */
+
+/*
+ * pmap_enter: enter a mapping into a pmap
+ *
+ * => must be done "now" ... no lazy-evaluation
+ * => we set pmap => pv_head locking
+ */
+
+int
+pmap_enter(pmap, va, pa, prot, flags)
+ struct pmap *pmap;
+ vaddr_t va;
+ paddr_t pa;
+ vm_prot_t prot;
+ int flags;
+{
+ pt_entry_t *ptes, opte, npte;
+ struct vm_page *ptp, *pg;
+ struct vm_page_md *mdpg;
+ struct pv_head *old_pvh, *new_pvh;
+ struct pv_entry *pve = NULL; /* XXX gcc */
+ int error;
+ boolean_t wired = (flags & PMAP_WIRED) != 0;
+ pt_entry_t *maptp;
+
+ XENPRINTK(("pmap_enter(%p, %p, %p, %08x, %08x)\n",
+ pmap, (void *)va, (void *)pa, prot, flags));
+
+#ifdef DIAGNOSTIC
+ /* sanity check: totally out of range? */
+ if (va >= VM_MAX_KERNEL_ADDRESS)
+ panic("pmap_enter: too big");
+
+ if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
+ panic("pmap_enter: trying to map over PDP/APDP!");
+
+ /* sanity check: kernel PTPs should already have been pre-allocated */
+ if (va >= VM_MIN_KERNEL_ADDRESS &&
+ !pmap_valid_entry(pmap->pm_pdir[pdei(va)]))
+ panic("pmap_enter: missing kernel PTP!");
+#endif
+
+ npte = protection_codes[prot] | PG_V;
+
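+	/*
+	 * translate the pseudo-physical address to a machine address;
+	 * addresses outside the managed range are entered as-is
+	 * (presumably already machine/device addresses).
+	 */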
+ if (pa >= pmap_pa_start && pa < pmap_pa_end)
+ npte |= xpmap_ptom(pa);
+ else {
+ XENPRINTF(("pmap_enter: va %08lx outside pa range %08lx\n",
+ va, pa));
+ npte |= pa;
+ }
+
+ /* XENPRINTK(("npte %p\n", npte)); */
+
+ if (wired)
+ npte |= PG_W;
+
+ if (va < VM_MAXUSER_ADDRESS)
+ npte |= PG_u;
+ else if (va < VM_MAX_ADDRESS)
+ npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
+ if (pmap == pmap_kernel())
+ npte |= pmap_pg_g;
+
+ /* get lock */
+ PMAP_MAP_TO_HEAD_LOCK();
+
+ ptes = pmap_map_ptes(pmap); /* locks pmap */
+ if (pmap == pmap_kernel()) {
+ ptp = NULL;
+ } else {
+ ptp = pmap_get_ptp(pmap, pdei(va));
+ if (ptp == NULL) {
+ if (flags & PMAP_CANFAIL) {
+ error = ENOMEM;
+ goto out;
+ }
+ panic("pmap_enter: get ptp failed");
+ }
+ }
+
+ /*
+	 * Get a first view of the old PTE;
+	 * on SMP the PTE might gain PG_U and PG_M flags
+	 * before we zap it later.
+ */
+ opte = pte_get(&ptes[x86_btop(va)]); /* old PTE */
+ XENPRINTK(("npte %p opte %p ptes %p idx %03x\n",
+ (void *)npte, (void *)opte, ptes, x86_btop(va)));
+
+ /*
+ * is there currently a valid mapping at our VA and does it
+ * map to the same PA as the one we want to map ?
+ */
+
+ if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) {
+
+ /*
+ * first, calculate pm_stats updates. resident count will not
+ * change since we are replacing/changing a valid mapping.
+ * wired count might change...
+ */
+ pmap->pm_stats.wired_count +=
+ ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
+
+ npte |= (opte & PG_PVLIST);
+
+ XENPRINTK(("pmap update opte == pa"));
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+ opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte);
+
+ /*
+ * Any change in the protection level that the CPU
+ * should know about ?
+ */
+ if ((npte & PG_RW)
+ || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) {
+ XENPRINTK(("pmap update opte == pa, prot change"));
+ /*
+ * No need to flush the TLB.
+ * Just add old PG_M, ... flags in new entry.
+ */
+ PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp,
+ opte & (PG_M | PG_U));
+ goto out_ok;
+ }
+
+ /*
+ * Might be cached in the TLB as being writable
+ * if this is on the PVLIST, sync R/M bit
+ */
+ if (opte & PG_PVLIST) {
+ pg = PHYS_TO_VM_PAGE(pa);
+#ifdef DIAGNOSTIC
+ if (pg == NULL)
+ panic("pmap_enter: same pa PG_PVLIST "
+ "mapping with unmanaged page "
+ "pa = 0x%lx (0x%lx)", pa,
+ atop(pa));
+#endif
+ mdpg = &pg->mdpage;
+ old_pvh = &mdpg->mp_pvhead;
+ simple_lock(&old_pvh->pvh_lock);
+ mdpg->mp_attrs |= opte;
+ simple_unlock(&old_pvh->pvh_lock);
+ }
+ goto shootdown_now;
+ }
+
+ pg = PHYS_TO_VM_PAGE(pa);
+ XENPRINTK(("pg %p from %p, init %d\n", pg, (void *)pa,
+ pmap_initialized));
+ if (pmap_initialized && pg != NULL) {
+ /* This is a managed page */
+ npte |= PG_PVLIST;
+ mdpg = &pg->mdpage;
+ new_pvh = &mdpg->mp_pvhead;
+ if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) {
+ /* We can not steal a pve - allocate one */
+ pve = pmap_alloc_pv(pmap, ALLOCPV_NEED);
+ if (pve == NULL) {
+ if (!(flags & PMAP_CANFAIL))
+ panic("pmap_enter: "
+ "no pv entries available");
+ error = ENOMEM;
+ goto out;
+ }
+ }
+ } else {
+ new_pvh = NULL;
+ }
+
+ /*
+ * is there currently a valid mapping at our VA?
+ */
+
+ if (pmap_valid_entry(opte)) {
+
+ /*
+ * changing PAs: we must remove the old one first
+ */
+
+ /*
+ * first, calculate pm_stats updates. resident count will not
+ * change since we are replacing/changing a valid mapping.
+ * wired count might change...
+ */
+ pmap->pm_stats.wired_count +=
+ ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
+
+ if (opte & PG_PVLIST) {
+ pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+ if (pg == NULL)
+ panic("pmap_enter: PG_PVLIST mapping with "
+ "unmanaged page "
+ "pa = 0x%lx (0x%lx)", pa, atop(pa));
+#endif
+ mdpg = &pg->mdpage;
+ old_pvh = &mdpg->mp_pvhead;
+
+ /* new_pvh is NULL if page will not be managed */
+ pmap_lock_pvhs(old_pvh, new_pvh);
+
+ XENPRINTK(("pmap change pa"));
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&ptes[x86_btop(va)]);
+ opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp,
+ npte);
+
+ pve = pmap_remove_pv(old_pvh, pmap, va);
+ KASSERT(pve != 0);
+ mdpg->mp_attrs |= opte;
+
+ if (new_pvh) {
+ pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
+ simple_unlock(&new_pvh->pvh_lock);
+ } else
+ pmap_free_pv(pmap, pve);
+ simple_unlock(&old_pvh->pvh_lock);
+
+ goto shootdown_test;
+ }
+ } else { /* opte not valid */
+ pmap->pm_stats.resident_count++;
+ if (wired)
+ pmap->pm_stats.wired_count++;
+ if (ptp)
+ ptp->wire_count++;
+ }
+
+ if (new_pvh) {
+ simple_lock(&new_pvh->pvh_lock);
+ pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
+ simple_unlock(&new_pvh->pvh_lock);
+ }
+
+ XENPRINTK(("pmap initial setup\n"));
+ maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+ opte = pte_atomic_update_ma(&ptes[x86_btop(va)],
+ maptp, npte); /* zap! */
+
+shootdown_test:
+ /* Update page attributes if needed */
+ if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+ int32_t cpumask = 0;
+#endif
+shootdown_now:
+#if defined(MULTIPROCESSOR)
+ pmap_tlb_shootdown(pmap, va, opte, &cpumask);
+ pmap_tlb_shootnow(cpumask);
+#else
+ /* Don't bother deferring in the single CPU case. */
+ if (pmap_is_curpmap(pmap))
+ pmap_update_pg(va);
+#endif
+ }
+
+out_ok:
+ error = 0;
+
+out:
+ pmap_unmap_ptes(pmap);
+ PMAP_MAP_TO_HEAD_UNLOCK();
+
+ XENPRINTK(("pmap_enter: %d\n", error));
+ return error;
+}
+
+/*
+ * pmap_enter_ma: enter a mapping into a pmap, where pa is already
+ *	a machine address (the new mapping is never treated as managed)
+ *
+ * => must be done "now" ... no lazy-evaluation
+ * => we set pmap => pv_head locking
+ */
+
+int
+pmap_enter_ma(pmap, va, pa, prot, flags)
+ struct pmap *pmap;
+ vaddr_t va;
+ paddr_t pa;
+ vm_prot_t prot;
+ int flags;
+{
+ pt_entry_t *ptes, opte, npte;
+ pt_entry_t *maptp;
+ struct vm_page *ptp, *pg;
+ struct vm_page_md *mdpg;
+ struct pv_head *old_pvh;
+ struct pv_entry *pve = NULL; /* XXX gcc */
+ int error;
+ boolean_t wired = (flags & PMAP_WIRED) != 0;
+
+ XENPRINTK(("pmap_enter_ma(%p, %p, %p, %08x, %08x)\n",
+ pmap, (void *)va, (void *)pa, prot, flags));
+
+#ifdef DIAGNOSTIC
+ /* sanity check: totally out of range? */
+ if (va >= VM_MAX_KERNEL_ADDRESS)
+ panic("pmap_enter: too big");
+
+ if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
+ panic("pmap_enter: trying to map over PDP/APDP!");
+
+ /* sanity check: kernel PTPs should already have been pre-allocated */
+ if (va >= VM_MIN_KERNEL_ADDRESS &&
+ !pmap_valid_entry(pmap->pm_pdir[pdei(va)]))
+ panic("pmap_enter: missing kernel PTP!");
+#endif
+
+ npte = pa | protection_codes[prot] | PG_V;
+ /* XENPRINTK(("npte %p\n", npte)); */
+
+ if (wired)
+ npte |= PG_W;
+
+ if (va < VM_MAXUSER_ADDRESS)
+ npte |= PG_u;
+ else if (va < VM_MAX_ADDRESS)
+ npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
+ if (pmap == pmap_kernel())
+ npte |= pmap_pg_g;
+
+ /* get lock */
+ PMAP_MAP_TO_HEAD_LOCK();
+
+ ptes = pmap_map_ptes(pmap); /* locks pmap */
+ if (pmap == pmap_kernel()) {
+ ptp = NULL;
+ } else {
+ ptp = pmap_get_ptp(pmap, pdei(va));
+ if (ptp == NULL) {
+ if (flags & PMAP_CANFAIL) {
+ error = ENOMEM;
+ goto out;
+ }
+ panic("pmap_enter: get ptp failed");
+ }
+ }
+
+ /*
+	 * Get a first view of the old PTE;
+	 * on SMP the PTE might gain PG_U and PG_M flags
+	 * before we zap it later.
+ */
+ opte = pte_get_ma(&ptes[x86_btop(va)]); /* old PTE */
+ XENPRINTK(("npte %p opte %p ptes %p idx %03x\n",
+ (void *)npte, (void *)opte, ptes, x86_btop(va)));
+ XENPRINTF(("pmap_enter_ma pa %08lx va %08lx opte %08x npte %08x "
+ "wired %d count %ld\n", pa, va, opte, npte, wired,
+ pmap->pm_stats.wired_count));
+
+ /*
+ * is there currently a valid mapping at our VA and does it
+ * map to the same MA as the one we want to map ?
+ */
+
+ if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) {
+
+ /*
+ * first, calculate pm_stats updates. resident count will not
+ * change since we are replacing/changing a valid mapping.
+ * wired count might change...
+ */
+ pmap->pm_stats.wired_count +=
+ ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
+
+ XENPRINTK(("pmap update opte == pa"));
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+ opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte);
+
+ /*
+ * Any change in the protection level that the CPU
+ * should know about ?
+ */
+ if ((npte & PG_RW)
+ || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) {
+ XENPRINTK(("pmap update opte == pa, prot change"));
+ /*
+ * No need to flush the TLB.
+ * Just add old PG_M, ... flags in new entry.
+ */
+ PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp,
+ opte & (PG_M | PG_U));
+ goto out_ok;
+ }
+
+ /*
+ * Might be cached in the TLB as being writable
+ * if this is on the PVLIST, sync R/M bit
+ */
+ KDASSERT((opte & PG_PVLIST) == 0);
+ goto shootdown_now;
+ }
+
+ /*
+ * no managed mapping for pages mapped through pmap_enter_ma.
+ */
+
+ /*
+ * is there currently a valid mapping at our VA?
+ */
+
+ if (pmap_valid_entry(opte)) {
+
+ /*
+ * changing PAs: we must remove the old one first
+ */
+
+ /*
+ * first, calculate pm_stats updates. resident count will not
+ * change since we are replacing/changing a valid mapping.
+ * wired count might change...
+ */
+ pmap->pm_stats.wired_count +=
+ ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
+
+ if (opte & PG_PVLIST) {
+ opte = xpmap_mtop(opte);
+ KDASSERT((opte & PG_FRAME) !=
+ (KERNTEXTOFF - KERNBASE_LOCORE));
+
+ pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+ if (pg == NULL)
+ panic("pmap_enter: PG_PVLIST mapping with "
+ "unmanaged page "
+ "pa = 0x%lx (0x%lx)", pa, atop(pa));
+#endif
+ mdpg = &pg->mdpage;
+ old_pvh = &mdpg->mp_pvhead;
+
+ /* NULL new_pvh since page will not be managed */
+ pmap_lock_pvhs(old_pvh, NULL);
+
+ XENPRINTK(("pmap change pa"));
+ /* zap! */
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&ptes[x86_btop(va)]);
+ opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp,
+ npte);
+
+ pve = pmap_remove_pv(old_pvh, pmap, va);
+ KASSERT(pve != 0);
+ mdpg->mp_attrs |= opte;
+
+ pmap_free_pv(pmap, pve);
+ simple_unlock(&old_pvh->pvh_lock);
+
+ goto shootdown_test;
+ }
+ } else { /* opte not valid */
+ pmap->pm_stats.resident_count++;
+ if (wired)
+ pmap->pm_stats.wired_count++;
+ if (ptp)
+ ptp->wire_count++;
+ }
+
+ XENPRINTK(("pmap initial setup"));
+ maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+ opte = pte_atomic_update_ma(&ptes[x86_btop(va)],
+ maptp, npte); /* zap! */
+
+shootdown_test:
+ /* Update page attributes if needed */
+ if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+ int32_t cpumask = 0;
+#endif
+shootdown_now:
+#if defined(MULTIPROCESSOR)
+ pmap_tlb_shootdown(pmap, va, opte, &cpumask);
+ pmap_tlb_shootnow(cpumask);
+#else
+ /* Don't bother deferring in the single CPU case. */
+ if (pmap_is_curpmap(pmap))
+ pmap_update_pg(va);
+#endif
+ }
+
+out_ok:
+ error = 0;
+
+out:
+ pmap_unmap_ptes(pmap);
+ PMAP_MAP_TO_HEAD_UNLOCK();
+
+ XENPRINTK(("pmap_enter: %d\n", error));
+ return error;
+}
+
+/*
+ * pmap_growkernel: increase usage of KVM space
+ *
+ * => we allocate new PTPs for the kernel and install them in all
+ * the pmaps on the system.
+ */
+
+vaddr_t
+pmap_growkernel(maxkvaddr)
+ vaddr_t maxkvaddr;
+{
+ struct pmap *kpm = pmap_kernel(), *pm;
+ pd_entry_t *mapdp;
+ pt_entry_t *maptp;
+ int needed_kpde; /* needed number of kernel PTPs */
+ int s;
+ paddr_t ptaddr;
+
+ needed_kpde = (u_int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
+ / NBPD;
+ XENPRINTF(("pmap_growkernel %p: %d -> %d\n", (void *)maxkvaddr,
+ nkpde, needed_kpde));
+ if (needed_kpde <= nkpde)
+ goto out; /* we are OK */
+
+ /*
+ * whoops! we need to add kernel PTPs
+ */
+
+ s = splhigh(); /* to be safe */
+ simple_lock(&kpm->pm_obj.vmobjlock);
+
+ for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
+
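+		/* machine address of the kernel PDE slot we are about to fill */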
+ mapdp = (pt_entry_t *)vtomach((vaddr_t)&kpm->pm_pdir[PDSLOT_KERN + nkpde]);
+ if (uvm.page_init_done == FALSE) {
+
+ /*
+ * we're growing the kernel pmap early (from
+ * uvm_pageboot_alloc()). this case must be
+ * handled a little differently.
+ */
+
+ if (uvm_page_physget(&ptaddr) == FALSE)
+ panic("pmap_growkernel: out of memory");
+ pmap_zero_page(ptaddr);
+
+ XENPRINTF(("xxxx maybe not PG_RW\n"));
+ PDE_SET(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, ptaddr | PG_RW | PG_V);
+
+ /* count PTP as resident */
+ kpm->pm_stats.resident_count++;
+ continue;
+ }
+
+ /*
+ * THIS *MUST* BE CODED SO AS TO WORK IN THE
+ * pmap_initialized == FALSE CASE! WE MAY BE
+ * INVOKED WHILE pmap_init() IS RUNNING!
+ */
+
+ if (pmap_alloc_ptp(kpm, PDSLOT_KERN + nkpde) == NULL) {
+ panic("pmap_growkernel: alloc ptp failed");
+ }
+
+ /* PG_u not for kernel */
+ PDE_CLEARBITS(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, PG_u);
+
+ /* distribute new kernel PTP to all active pmaps */
+ simple_lock(&pmaps_lock);
+ for (pm = pmaps.lh_first; pm != NULL;
+ pm = pm->pm_list.le_next) {
+ XENPRINTF(("update\n"));
+ maptp = (pt_entry_t *)vtomach(
+ (vaddr_t)&pm->pm_pdir[PDSLOT_KERN + nkpde]);
+ PDE_COPY(&pm->pm_pdir[PDSLOT_KERN + nkpde], maptp,
+ &kpm->pm_pdir[PDSLOT_KERN + nkpde]);
+ }
+
+ /* Invalidate the PDP cache. */
+ pool_cache_invalidate(&pmap_pdp_cache);
+ pmap_pdp_cache_generation++;
+
+ simple_unlock(&pmaps_lock);
+ }
+
+ simple_unlock(&kpm->pm_obj.vmobjlock);
+ splx(s);
+
+out:
+ XENPRINTF(("pmap_growkernel return %d %p\n", nkpde,
+ (void *)(VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD))));
+ return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
+}
+
+#ifdef DEBUG
+void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
+
+/*
+ * pmap_dump: dump all the mappings from a pmap
+ *
+ * => caller should not be holding any pmap locks
+ */
+
+void
+pmap_dump(pmap, sva, eva)
+ struct pmap *pmap;
+ vaddr_t sva, eva;
+{
+ pt_entry_t *ptes, *pte;
+ vaddr_t blkendva;
+
+ /*
+ * if end is out of range truncate.
+	 * if (end <= start) update to max.
+ */
+
+ if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
+ eva = VM_MAXUSER_ADDRESS;
+
+ /*
+ * we lock in the pmap => pv_head direction
+ */
+
+ PMAP_MAP_TO_HEAD_LOCK();
+ ptes = pmap_map_ptes(pmap); /* locks pmap */
+
+ /*
+ * dumping a range of pages: we dump in PTP sized blocks (4MB)
+ */
+
+ for (/* null */ ; sva < eva ; sva = blkendva) {
+
+ /* determine range of block */
+ blkendva = x86_round_pdr(sva+1);
+ if (blkendva > eva)
+ blkendva = eva;
+
+ /* valid block? */
+ if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
+ continue;
+
+ pte = &ptes[x86_btop(sva)];
+ for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
+ if (!pmap_valid_entry(*pte))
+ continue;
+ XENPRINTF(("va %#lx -> pa %#lx (pte=%#lx)\n",
+			    sva, PTE_GET(pte) & PG_FRAME, PTE_GET(pte)));
+ }
+ }
+ pmap_unmap_ptes(pmap);
+ PMAP_MAP_TO_HEAD_UNLOCK();
+}
+#endif
+
+/******************** TLB shootdown code ********************/
+
+
+void
+pmap_tlb_shootnow(int32_t cpumask)
+{
+ struct cpu_info *self;
+#ifdef MULTIPROCESSOR
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ int s;
+#ifdef DIAGNOSTIC
+ int count = 0;
+#endif
+#endif
+
+ if (cpumask == 0)
+ return;
+
+ self = curcpu();
+#ifdef MULTIPROCESSOR
+ s = splipi();
+ self->ci_tlb_ipi_mask = cpumask;
+#endif
+
+ pmap_do_tlb_shootdown(self); /* do *our* work. */
+
+#ifdef MULTIPROCESSOR
+ splx(s);
+
+ /*
+	 * Send the TLB IPI to other CPUs with pending shootdowns.
+ */
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ if (ci == self)
+ continue;
+ if (cpumask & (1U << ci->ci_cpuid))
+ if (x86_send_ipi(ci, X86_IPI_TLB) != 0)
+ x86_atomic_clearbits_l(&self->ci_tlb_ipi_mask,
+ (1U << ci->ci_cpuid));
+ }
+
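+	/*
+	 * wait until every CPU we signalled has processed its queue and
+	 * cleared its bit in our ci_tlb_ipi_mask.
+	 */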
+ while (self->ci_tlb_ipi_mask != 0) {
+#ifdef DIAGNOSTIC
+ if (count++ > 10000000)
+ panic("TLB IPI rendezvous failed (mask %x)",
+ self->ci_tlb_ipi_mask);
+#endif
+ x86_pause();
+ }
+#endif
+}
+
+/*
+ * pmap_tlb_shootdown:
+ *
+ * Cause the TLB entry for pmap/va to be shot down.
+ */
+void
+pmap_tlb_shootdown(pmap, va, pte, cpumaskp)
+ pmap_t pmap;
+ vaddr_t va;
+ pt_entry_t pte;
+ int32_t *cpumaskp;
+{
+ struct cpu_info *ci, *self;
+ struct pmap_tlb_shootdown_q *pq;
+ struct pmap_tlb_shootdown_job *pj;
+ CPU_INFO_ITERATOR cii;
+ int s;
+
+#ifdef LARGEPAGES
+ if (pte & PG_PS)
+ va &= PG_LGFRAME;
+#endif
+
+ if (pmap_initialized == FALSE || cpus_attached == 0) {
+ pmap_update_pg(va);
+ return;
+ }
+
+ self = curcpu();
+
+ s = splipi();
+#if 0
+ printf("dshootdown %lx\n", va);
+#endif
+
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ /* Note: we queue shootdown events for ourselves here! */
+ if (pmap_is_active(pmap, ci->ci_cpuid) == 0)
+ continue;
+ if (ci != self && !(ci->ci_flags & CPUF_RUNNING))
+ continue;
+ pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
+ __cpu_simple_lock(&pq->pq_slock);
+
+ /*
+ * If there's a global flush already queued, or a
+ * non-global flush, and this pte doesn't have the G
+ * bit set, don't bother.
+ */
+ if (pq->pq_flushg > 0 ||
+ (pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) {
+ __cpu_simple_unlock(&pq->pq_slock);
+ continue;
+ }
+
+#ifdef I386_CPU
+ /*
+ * i386 CPUs can't invalidate a single VA, only
+ * flush the entire TLB, so don't bother allocating
+ * jobs for them -- just queue a `flushu'.
+ *
+ * XXX note that this can be executed for non-i386
+		 * when called early (before identifycpu() has set
+ * cpu_class)
+ */
+ if (cpu_class == CPUCLASS_386) {
+ pq->pq_flushu++;
+ *cpumaskp |= 1U << ci->ci_cpuid;
+ __cpu_simple_unlock(&pq->pq_slock);
+ continue;
+ }
+#endif
+
+ pj = pmap_tlb_shootdown_job_get(pq);
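+		/*
+		 * pq_pte accumulates the flags of all PTEs queued so far;
+		 * if we fail to get a job below it tells us whether a
+		 * global flush is required.
+		 */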
+ pq->pq_pte |= pte;
+ if (pj == NULL) {
+ /*
+ * Couldn't allocate a job entry.
+ * Kill it now for this CPU, unless the failure
+ * was due to too many pending flushes; otherwise,
+			 * tell other cpus to kill everything.
+ */
+ if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) {
+ pmap_update_pg(va);
+ __cpu_simple_unlock(&pq->pq_slock);
+ continue;
+ } else {
+ if (pq->pq_pte & pmap_pg_g)
+ pq->pq_flushg++;
+ else
+ pq->pq_flushu++;
+ /*
+ * Since we've nailed the whole thing,
+ * drain the job entries pending for that
+ * processor.
+ */
+ pmap_tlb_shootdown_q_drain(pq);
+ *cpumaskp |= 1U << ci->ci_cpuid;
+ }
+ } else {
+ pj->pj_pmap = pmap;
+ pj->pj_va = va;
+ pj->pj_pte = pte;
+ TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
+ *cpumaskp |= 1U << ci->ci_cpuid;
+ }
+ __cpu_simple_unlock(&pq->pq_slock);
+ }
+ splx(s);
+}
+
+/*
+ * pmap_do_tlb_shootdown_checktlbstate: check and update ci_tlbstate.
+ *
+ * => called at splipi.
+ * => return TRUE if we need to maintain user tlbs.
+ */
+static __inline boolean_t
+pmap_do_tlb_shootdown_checktlbstate(struct cpu_info *ci)
+{
+
+ KASSERT(ci == curcpu());
+
+ if (ci->ci_tlbstate == TLBSTATE_LAZY) {
+ KASSERT(ci->ci_pmap != pmap_kernel());
+ /*
+ * mostly KASSERT(ci->ci_pmap->pm_cpus & (1U << ci->ci_cpuid));
+ */
+
+ /*
+ * we no longer want tlb shootdown ipis for this pmap.
+ * mark the pmap no longer in use by this processor.
+ */
+
+ x86_atomic_clearbits_l(&ci->ci_pmap->pm_cpus,
+ 1U << ci->ci_cpuid);
+ ci->ci_tlbstate = TLBSTATE_STALE;
+ }
+
+ if (ci->ci_tlbstate == TLBSTATE_STALE)
+ return FALSE;
+
+ return TRUE;
+}
+
+/*
+ * pmap_do_tlb_shootdown:
+ *
+ * Process pending TLB shootdown operations for this processor.
+ */
+void
+pmap_do_tlb_shootdown(struct cpu_info *self)
+{
+ u_long cpu_id = self->ci_cpuid;
+ struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[cpu_id];
+ struct pmap_tlb_shootdown_job *pj;
+ int s;
+#ifdef MULTIPROCESSOR
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+#endif
+ KASSERT(self == curcpu());
+
+ s = splipi();
+
+ __cpu_simple_lock(&pq->pq_slock);
+
+ if (pq->pq_flushg) {
+ COUNT(flushg);
+ pmap_do_tlb_shootdown_checktlbstate(self);
+ tlbflushg();
+ pq->pq_flushg = 0;
+ pq->pq_flushu = 0;
+ pmap_tlb_shootdown_q_drain(pq);
+ } else {
+ /*
+ * TLB flushes for PTEs with PG_G set may be in the queue
+ * after a flushu, they need to be dealt with.
+ */
+ if (pq->pq_flushu) {
+ COUNT(flushu);
+ pmap_do_tlb_shootdown_checktlbstate(self);
+ tlbflush();
+ }
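+		/*
+		 * process the per-VA jobs: kernel/global entries are always
+		 * flushed, user entries only when they belong to the pmap
+		 * currently loaded on this CPU and were not already covered
+		 * by the full flush above.
+		 */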
+ while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) {
+ TAILQ_REMOVE(&pq->pq_head, pj, pj_list);
+
+ if ((pj->pj_pte & pmap_pg_g) ||
+ pj->pj_pmap == pmap_kernel()) {
+ pmap_update_pg(pj->pj_va);
+ } else if (!pq->pq_flushu &&
+ pj->pj_pmap == self->ci_pmap) {
+ if (pmap_do_tlb_shootdown_checktlbstate(self))
+ pmap_update_pg(pj->pj_va);
+ }
+
+ pmap_tlb_shootdown_job_put(pq, pj);
+ }
+
+ pq->pq_flushu = pq->pq_pte = 0;
+ }
+
+#ifdef MULTIPROCESSOR
+ for (CPU_INFO_FOREACH(cii, ci))
+ x86_atomic_clearbits_l(&ci->ci_tlb_ipi_mask,
+ (1U << cpu_id));
+#endif
+ __cpu_simple_unlock(&pq->pq_slock);
+
+ splx(s);
+}
+
+
+/*
+ * pmap_tlb_shootdown_q_drain:
+ *
+ * Drain a processor's TLB shootdown queue. We do not perform
+ * the shootdown operations. This is merely a convenience
+ * function.
+ *
+ * Note: We expect the queue to be locked.
+ */
+void
+pmap_tlb_shootdown_q_drain(pq)
+ struct pmap_tlb_shootdown_q *pq;
+{
+ struct pmap_tlb_shootdown_job *pj;
+
+ while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) {
+ TAILQ_REMOVE(&pq->pq_head, pj, pj_list);
+ pmap_tlb_shootdown_job_put(pq, pj);
+ }
+ pq->pq_pte = 0;
+}
+
+/*
+ * pmap_tlb_shootdown_job_get:
+ *
+ * Get a TLB shootdown job queue entry. This places a limit on
+ * the number of outstanding jobs a processor may have.
+ *
+ * Note: We expect the queue to be locked.
+ */
+struct pmap_tlb_shootdown_job *
+pmap_tlb_shootdown_job_get(pq)
+ struct pmap_tlb_shootdown_q *pq;
+{
+ struct pmap_tlb_shootdown_job *pj;
+
+ if (pq->pq_count >= PMAP_TLB_MAXJOBS)
+ return (NULL);
+
+ __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+ if (pj_free == NULL) {
+ __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+ return NULL;
+ }
+ pj = &pj_free->pja_job;
+ pj_free =
+ (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
+ __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+
+ pq->pq_count++;
+ return (pj);
+}
+
+/*
+ * pmap_tlb_shootdown_job_put:
+ *
+ * Put a TLB shootdown job queue entry onto the free list.
+ *
+ * Note: We expect the queue to be locked.
+ */
+void
+pmap_tlb_shootdown_job_put(pq, pj)
+ struct pmap_tlb_shootdown_q *pq;
+ struct pmap_tlb_shootdown_job *pj;
+{
+
+#ifdef DIAGNOSTIC
+ if (pq->pq_count == 0)
+ panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
+#endif
+ __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+ pj->pj_nextfree = &pj_free->pja_job;
+ pj_free = (union pmap_tlb_shootdown_job_al *)pj;
+ __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+
+ pq->pq_count--;
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c
new file mode 100644
index 0000000000..d65741fbf2
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c
@@ -0,0 +1,550 @@
+/* $NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $ */
+/* NetBSD: sys_machdep.c,v 1.70 2003/10/27 14:11:47 junyoung Exp */
+
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $");
+
+#include "opt_compat_netbsd.h"
+#include "opt_mtrr.h"
+#include "opt_perfctrs.h"
+#include "opt_user_ldt.h"
+#include "opt_vm86.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ioctl.h>
+#include <sys/file.h>
+#include <sys/time.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/buf.h>
+#include <sys/signal.h>
+#include <sys/malloc.h>
+
+#include <sys/mount.h>
+#include <sys/sa.h>
+#include <sys/syscallargs.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/cpu.h>
+#include <machine/cpufunc.h>
+#include <machine/gdt.h>
+#include <machine/psl.h>
+#include <machine/reg.h>
+#include <machine/sysarch.h>
+#include <machine/mtrr.h>
+
+#ifdef VM86
+#include <machine/vm86.h>
+#endif
+
+#ifdef PERFCTRS
+#include <machine/pmc.h>
+#endif
+
+extern struct vm_map *kernel_map;
+
+int i386_iopl(struct lwp *, void *, register_t *);
+int i386_get_ioperm(struct lwp *, void *, register_t *);
+int i386_set_ioperm(struct lwp *, void *, register_t *);
+int i386_get_mtrr(struct lwp *, void *, register_t *);
+int i386_set_mtrr(struct lwp *, void *, register_t *);
+
+#ifdef USER_LDT
+
+#ifdef LDT_DEBUG
+static void i386_print_ldt(int, const struct segment_descriptor *);
+
+static void
+i386_print_ldt(i, d)
+ int i;
+ const struct segment_descriptor *d;
+{
+ printf("[%d] lolimit=0x%x, lobase=0x%x, type=%u, dpl=%u, p=%u, "
+ "hilimit=0x%x, xx=%x, def32=%u, gran=%u, hibase=0x%x\n",
+ i, d->sd_lolimit, d->sd_lobase, d->sd_type, d->sd_dpl, d->sd_p,
+ d->sd_hilimit, d->sd_xx, d->sd_def32, d->sd_gran, d->sd_hibase);
+}
+#endif
+
+int
+i386_get_ldt(l, args, retval)
+ struct lwp *l;
+ void *args;
+ register_t *retval;
+{
+ int error;
+ struct proc *p = l->l_proc;
+ pmap_t pmap = p->p_vmspace->vm_map.pmap;
+ int nldt, num;
+ union descriptor *lp, *cp;
+ struct i386_get_ldt_args ua;
+
+ if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+ return (error);
+
+#ifdef LDT_DEBUG
+ printf("i386_get_ldt: start=%d num=%d descs=%p\n", ua.start,
+ ua.num, ua.desc);
+#endif
+
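+	/* 8192 is the architectural maximum number of LDT entries */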
+ if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 ||
+ ua.start + ua.num > 8192)
+ return (EINVAL);
+
+ cp = malloc(ua.num * sizeof(union descriptor), M_TEMP, M_WAITOK);
+ if (cp == NULL)
+ return ENOMEM;
+
+ simple_lock(&pmap->pm_lock);
+
+ if (pmap->pm_flags & PMF_USER_LDT) {
+ nldt = pmap->pm_ldt_len;
+ lp = pmap->pm_ldt;
+ } else {
+ nldt = NLDT;
+ lp = ldt;
+ }
+
+ if (ua.start > nldt) {
+ simple_unlock(&pmap->pm_lock);
+ free(cp, M_TEMP);
+ return (EINVAL);
+ }
+
+ lp += ua.start;
+ num = min(ua.num, nldt - ua.start);
+#ifdef LDT_DEBUG
+ {
+ int i;
+ for (i = 0; i < num; i++)
+ i386_print_ldt(i, &lp[i].sd);
+ }
+#endif
+
+ memcpy(cp, lp, num * sizeof(union descriptor));
+ simple_unlock(&pmap->pm_lock);
+
+ error = copyout(cp, ua.desc, num * sizeof(union descriptor));
+ if (error == 0)
+ *retval = num;
+
+ free(cp, M_TEMP);
+ return (error);
+}
+
+int
+i386_set_ldt(l, args, retval)
+ struct lwp *l;
+ void *args;
+ register_t *retval;
+{
+ int error, i, n;
+ struct proc *p = l->l_proc;
+ struct pcb *pcb = &l->l_addr->u_pcb;
+ pmap_t pmap = p->p_vmspace->vm_map.pmap;
+ struct i386_set_ldt_args ua;
+ union descriptor *descv;
+ size_t old_len, new_len, ldt_len;
+ union descriptor *old_ldt, *new_ldt;
+
+ if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+ return (error);
+
+ if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 ||
+ ua.start + ua.num > 8192)
+ return (EINVAL);
+
+ descv = malloc(sizeof (*descv) * ua.num, M_TEMP, M_NOWAIT);
+ if (descv == NULL)
+ return (ENOMEM);
+
+ if ((error = copyin(ua.desc, descv, sizeof (*descv) * ua.num)) != 0)
+ goto out;
+
+ /* Check descriptors for access violations. */
+ for (i = 0; i < ua.num; i++) {
+ union descriptor *desc = &descv[i];
+
+ switch (desc->sd.sd_type) {
+ case SDT_SYSNULL:
+ desc->sd.sd_p = 0;
+ break;
+ case SDT_SYS286CGT:
+ case SDT_SYS386CGT:
+ /*
+ * Only allow call gates targeting a segment
+ * in the LDT or a user segment in the fixed
+ * part of the gdt. Segments in the LDT are
+ * constrained (below) to be user segments.
+ */
+ if (desc->gd.gd_p != 0 &&
+ !ISLDT(desc->gd.gd_selector) &&
+ ((IDXSEL(desc->gd.gd_selector) >= NGDT) ||
+ (gdt[IDXSEL(desc->gd.gd_selector)].sd.sd_dpl !=
+ SEL_UPL))) {
+ error = EACCES;
+ goto out;
+ }
+ break;
+ case SDT_MEMEC:
+ case SDT_MEMEAC:
+ case SDT_MEMERC:
+ case SDT_MEMERAC:
+ /* Must be "present" if executable and conforming. */
+ if (desc->sd.sd_p == 0) {
+ error = EACCES;
+ goto out;
+ }
+ break;
+ case SDT_MEMRO:
+ case SDT_MEMROA:
+ case SDT_MEMRW:
+ case SDT_MEMRWA:
+ case SDT_MEMROD:
+ case SDT_MEMRODA:
+ case SDT_MEMRWD:
+ case SDT_MEMRWDA:
+ case SDT_MEME:
+ case SDT_MEMEA:
+ case SDT_MEMER:
+ case SDT_MEMERA:
+ break;
+ default:
+ /*
+ * Make sure that unknown descriptor types are
+ * not marked present.
+ */
+ if (desc->sd.sd_p != 0) {
+ error = EACCES;
+ goto out;
+ }
+ break;
+ }
+
+ if (desc->sd.sd_p != 0) {
+ /* Only user (ring-3) descriptors may be present. */
+ if (desc->sd.sd_dpl != SEL_UPL) {
+ error = EACCES;
+ goto out;
+ }
+ }
+ }
+
+ /* allocate user ldt */
+ simple_lock(&pmap->pm_lock);
+ if (pmap->pm_ldt == 0 || (ua.start + ua.num) > pmap->pm_ldt_len) {
+ if (pmap->pm_flags & PMF_USER_LDT)
+ ldt_len = pmap->pm_ldt_len;
+ else
+ ldt_len = 512;
+ while ((ua.start + ua.num) > ldt_len)
+ ldt_len *= 2;
+ new_len = ldt_len * sizeof(union descriptor);
+
+ simple_unlock(&pmap->pm_lock);
+ new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
+ new_len);
+ simple_lock(&pmap->pm_lock);
+
+ if (pmap->pm_ldt != NULL && ldt_len <= pmap->pm_ldt_len) {
+ /*
+ * Another thread (re)allocated the LDT to
+ * sufficient size while we were blocked in
+ * uvm_km_alloc. Oh well. The new entries
+ * will quite probably not be right, but
+ * hey.. not our problem if user applications
+ * have race conditions like that.
+ */
+ uvm_km_free(kernel_map, (vaddr_t)new_ldt, new_len);
+ goto copy;
+ }
+
+ old_ldt = pmap->pm_ldt;
+
+ if (old_ldt != NULL) {
+ old_len = pmap->pm_ldt_len * sizeof(union descriptor);
+ } else {
+ old_len = NLDT * sizeof(union descriptor);
+ old_ldt = ldt;
+ }
+
+ memcpy(new_ldt, old_ldt, old_len);
+ memset((caddr_t)new_ldt + old_len, 0, new_len - old_len);
+
+ if (old_ldt != ldt)
+ uvm_km_free(kernel_map, (vaddr_t)old_ldt, old_len);
+
+ pmap->pm_ldt = new_ldt;
+ pmap->pm_ldt_len = ldt_len;
+
+ if (pmap->pm_flags & PMF_USER_LDT)
+ ldt_free(pmap);
+ else
+ pmap->pm_flags |= PMF_USER_LDT;
+ ldt_alloc(pmap, new_ldt, new_len);
+ pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
+ if (pcb == curpcb)
+ lldt(pcb->pcb_ldt_sel);
+
+ }
+copy:
+ /* Now actually replace the descriptors. */
+ for (i = 0, n = ua.start; i < ua.num; i++, n++)
+ pmap->pm_ldt[n] = descv[i];
+
+ simple_unlock(&pmap->pm_lock);
+
+ *retval = ua.start;
+
+out:
+ free(descv, M_TEMP);
+ return (error);
+}
+#endif /* USER_LDT */
+
+int
+i386_iopl(l, args, retval)
+ struct lwp *l;
+ void *args;
+ register_t *retval;
+{
+ int error;
+ struct proc *p = l->l_proc;
+ struct pcb *pcb = &l->l_addr->u_pcb;
+ struct i386_iopl_args ua;
+ dom0_op_t op;
+
+ if ((xen_start_info.flags & SIF_PRIVILEGED) == 0)
+ return EPERM;
+
+ if (securelevel > 1)
+ return EPERM;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+ return error;
+
+ if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+ return error;
+
+ pcb->pcb_tss.tss_ioopt &= ~SEL_RPL;
+ if (ua.iopl)
+ pcb->pcb_tss.tss_ioopt |= SEL_UPL; /* i/o pl */
+ else
+ pcb->pcb_tss.tss_ioopt |= SEL_KPL; /* i/o pl */
+
+ /* Force the change at ring 0. */
+ op.cmd = DOM0_IOPL;
+ op.u.iopl.domain = DOMID_SELF;
+ op.u.iopl.iopl = pcb->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */
+ HYPERVISOR_dom0_op(&op);
+
+ return 0;
+}
+
+int
+i386_get_ioperm(l, args, retval)
+ struct lwp *l;
+ void *args;
+ register_t *retval;
+{
+ int error;
+ struct pcb *pcb = &l->l_addr->u_pcb;
+ struct i386_get_ioperm_args ua;
+
+ if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+ return (error);
+
+ return copyout(pcb->pcb_iomap, ua.iomap, sizeof(pcb->pcb_iomap));
+}
+
+int
+i386_set_ioperm(l, args, retval)
+ struct lwp *l;
+ void *args;
+ register_t *retval;
+{
+ int error;
+ struct proc *p = l->l_proc;
+ struct pcb *pcb = &l->l_addr->u_pcb;
+ struct i386_set_ioperm_args ua;
+
+ if (securelevel > 1)
+ return EPERM;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+ return error;
+
+ if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+ return (error);
+
+ return copyin(ua.iomap, pcb->pcb_iomap, sizeof(pcb->pcb_iomap));
+}
+
+#ifdef MTRR
+int
+i386_get_mtrr(struct lwp *l, void *args, register_t *retval)
+{
+ struct i386_get_mtrr_args ua;
+ int error, n;
+ struct proc *p = l->l_proc;
+
+ if (mtrr_funcs == NULL)
+ return ENOSYS;
+
+ error = copyin(args, &ua, sizeof ua);
+ if (error != 0)
+ return error;
+
+ error = copyin(ua.n, &n, sizeof n);
+ if (error != 0)
+ return error;
+
+ error = mtrr_get(ua.mtrrp, &n, p, MTRR_GETSET_USER);
+
+ copyout(&n, ua.n, sizeof (int));
+
+ return error;
+}
+
+int
+i386_set_mtrr(struct lwp *l, void *args, register_t *retval)
+{
+ int error, n;
+ struct i386_set_mtrr_args ua;
+ struct proc *p = l->l_proc;
+
+ if (mtrr_funcs == NULL)
+ return ENOSYS;
+
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error != 0)
+ return error;
+
+ error = copyin(args, &ua, sizeof ua);
+ if (error != 0)
+ return error;
+
+ error = copyin(ua.n, &n, sizeof n);
+ if (error != 0)
+ return error;
+
+ error = mtrr_set(ua.mtrrp, &n, p, MTRR_GETSET_USER);
+ if (n != 0)
+ mtrr_commit();
+
+ copyout(&n, ua.n, sizeof n);
+
+ return error;
+}
+#endif
+
+int
+sys_sysarch(struct lwp *l, void *v, register_t *retval)
+{
+ struct sys_sysarch_args /* {
+ syscallarg(int) op;
+ syscallarg(void *) parms;
+ } */ *uap = v;
+ int error = 0;
+
+ switch(SCARG(uap, op)) {
+#ifdef USER_LDT
+ case I386_GET_LDT:
+ error = i386_get_ldt(l, SCARG(uap, parms), retval);
+ break;
+
+ case I386_SET_LDT:
+ error = i386_set_ldt(l, SCARG(uap, parms), retval);
+ break;
+#endif
+
+ case I386_IOPL:
+ error = i386_iopl(l, SCARG(uap, parms), retval);
+ break;
+
+ case I386_GET_IOPERM:
+ error = i386_get_ioperm(l, SCARG(uap, parms), retval);
+ break;
+
+ case I386_SET_IOPERM:
+ error = i386_set_ioperm(l, SCARG(uap, parms), retval);
+ break;
+
+#ifdef VM86
+ case I386_VM86:
+ error = i386_vm86(l, SCARG(uap, parms), retval);
+ break;
+#ifdef COMPAT_16
+ case I386_OLD_VM86:
+ error = compat_16_i386_vm86(l, SCARG(uap, parms), retval);
+ break;
+#endif
+#endif
+#ifdef MTRR
+ case I386_GET_MTRR:
+ error = i386_get_mtrr(l, SCARG(uap, parms), retval);
+ break;
+ case I386_SET_MTRR:
+ error = i386_set_mtrr(l, SCARG(uap, parms), retval);
+ break;
+#endif
+#ifdef PERFCTRS
+ case I386_PMC_INFO:
+ error = pmc_info(l, SCARG(uap, parms), retval);
+ break;
+
+ case I386_PMC_STARTSTOP:
+ error = pmc_startstop(l, SCARG(uap, parms), retval);
+ break;
+
+ case I386_PMC_READ:
+ error = pmc_read(l, SCARG(uap, parms), retval);
+ break;
+#endif
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
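
For reference, the sys_sysarch() entry point above is what userland hits when it manipulates the per-process LDT or I/O privileges; normally this is wrapped by i386_get_ldt(3)/i386_set_ldt(3) in libi386. A minimal caller sketch, assuming the usual NetBSD <machine/sysarch.h> declarations of sysarch() and struct i386_get_ldt_args (the field names match the ua.start/ua.desc/ua.num usage above); on success the syscall returns the retval set by i386_get_ldt(), i.e. the number of descriptors copied out:

	#include <sys/types.h>
	#include <machine/segments.h>	/* union descriptor */
	#include <machine/sysarch.h>	/* I386_GET_LDT, struct i386_get_ldt_args */
	#include <stdio.h>

	int
	main(void)
	{
		union descriptor descs[8];
		struct i386_get_ldt_args gl;
		int n;

		gl.start = 0;		/* first LDT slot to read */
		gl.desc = descs;	/* user buffer the kernel copies into */
		gl.num = 8;		/* upper bound; clamped to the LDT size above */

		n = sysarch(I386_GET_LDT, &gl);	/* sys_sysarch() -> i386_get_ldt() */
		if (n == -1) {
			perror("sysarch");
			return 1;
		}
		printf("copied %d LDT descriptors\n", n);
		return 0;
	}
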
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S
new file mode 100644
index 0000000000..165b5f06be
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S
@@ -0,0 +1,1587 @@
+/* $NetBSD: vector.S,v 1.1.2.1 2004/05/22 15:57:16 he Exp $ */
+/* NetBSD: 1.13 2004/03/11 11:39:26 yamt Exp */
+
+/*
+ * Copyright 2002 (c) Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Frank van der Linden for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed for the NetBSD Project by
+ * Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ * or promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_ddb.h"
+#include "opt_multiprocessor.h"
+#include "opt_ipkdb.h"
+#include "opt_vm86.h"
+#include "opt_xen.h"
+
+#ifndef XEN
+#include <machine/i8259.h>
+#endif
+#include <machine/i82093reg.h>
+#include <machine/i82489reg.h>
+#include <machine/asm.h>
+#include <machine/frameasm.h>
+#include <machine/segments.h>
+#include <machine/trap.h>
+#include <machine/intr.h>
+#include <machine/psl.h>
+#ifdef XEN
+#include <machine/xen.h>
+#endif
+
+#include <net/netisr.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+
+#include "npx.h"
+#include "assym.h"
+
+#define __HAVE_GENERIC_SOFT_INTERRUPTS /* XXX */
+
+
+/*
+ * Macros for interrupt entry, call to handler, and exit.
+ *
+ * XXX
+ * The interrupt frame is set up to look like a trap frame. This may be a
+ * waste. The only handler which needs a frame is the clock handler, and it
+ * only needs a few bits. Xdoreti() needs a trap frame for handling ASTs, but
+ * it could easily convert the frame on demand.
+ *
+ * The direct costs of setting up a trap frame are two pushl's (error code and
+ * trap number), an addl to get rid of these, and pushing and popping the
+ * callee-saved registers %esi, %edi, %ebx, and %ebp twice.
+ *
+ * If the interrupt frame is made more flexible, INTR can push %eax first and
+ * decide the ipending case with less overhead, e.g., by avoiding loading the
+ * segment registers.
+ *
+ */
+
+#define MY_COUNT _C_LABEL(uvmexp)
+
+/* XXX See comment in locore.s */
+#ifdef __ELF__
+#define XINTR(name,num) Xintr_/**/name/**/num
+#define XSTRAY(name,num) Xstray_/**/name/**/num
+#define XINTR_TSS(irq_num) Xintr_tss_ ## irq_num
+#else
+#define XINTR(name,num) _Xintr_/**/name/**/num
+#define XSTRAY(name,num) _Xstray_/**/name/**/num
+#define XINTR_TSS(irq_num) Xintr_tss_/**/irq_num
+#endif
+
+/*
+ * Store address of TSS in %eax, given a selector in %eax.
+ * Clobbers %eax, %ecx, %edx, but that's ok for its usage.
+ * This is a bit complicated, but it's done to make as few
+ * assumptions as possible about the validity of the environment.
+ * The GDT and the current and previous TSS are known to be OK,
+ * otherwise we would not be here. The only other thing that needs
+ * to be OK is the cpu_info structure for the current CPU.
+ */
+#define GET_TSS \
+ andl $0xfff8,%eax ;\
+ addl CPUVAR(GDT),%eax ;\
+ movl 2(%eax),%edx ;\
+ andl $0xffffff,%edx ;\
+ movzbl 7(%eax),%eax ;\
+ shl $24,%eax ;\
+ orl %edx,%eax
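+
+/*
+ * (Illustration, not from the original: in C terms, with the
+ * machine/segments.h struct segment_descriptor fields, the loads above
+ * just reassemble the split base field of the 8-byte GDT entry:
+ *
+ *	base = sd->sd_lobase | (sd->sd_hibase << 24);
+ *
+ * bytes 2-4 of the entry hold base[23:0] and byte 7 holds base[31:24],
+ * while the initial andl $0xfff8 strips the RPL/TI bits so the selector
+ * becomes a plain byte offset into the GDT.)
+ */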
+
+#if NLAPIC > 0
+#ifdef MULTIPROCESSOR
+IDTVEC(recurse_lapic_ipi)
+ pushfl
+ pushl %cs
+ pushl %esi
+ pushl $0
+ pushl $T_ASTFLT
+ INTRENTRY
+IDTVEC(resume_lapic_ipi)
+ cli
+ jmp 1f
+IDTVEC(intr_lapic_ipi)
+ pushl $0
+ pushl $T_ASTFLT
+ INTRENTRY
+ movl $0,_C_LABEL(local_apic)+LAPIC_EOI
+ movl CPUVAR(ILEVEL),%ebx
+ cmpl $IPL_IPI,%ebx
+ jae 2f
+1:
+ incl CPUVAR(IDEPTH)
+ movl $IPL_IPI,CPUVAR(ILEVEL)
+ sti
+ pushl %ebx
+ call _C_LABEL(x86_ipi_handler)
+ jmp _C_LABEL(Xdoreti)
+2:
+ orl $(1 << LIR_IPI),CPUVAR(IPENDING)
+ sti
+ INTRFASTEXIT
+
+#if defined(DDB)
+IDTVEC(intrddbipi)
+1:
+ str %ax
+ GET_TSS
+ movzwl (%eax),%eax
+ GET_TSS
+ pushl %eax
+ movl $0xff,_C_LABEL(lapic_tpr)
+ movl $0,_C_LABEL(local_apic)+LAPIC_EOI
+ sti
+ call _C_LABEL(ddb_ipi_tss)
+ addl $4,%esp
+ movl $0,_C_LABEL(lapic_tpr)
+ iret
+ jmp 1b
+#endif /* DDB */
+#endif /* MULTIPROCESSOR */
+
+ /*
+ * Interrupt from the local APIC timer.
+ */
+IDTVEC(recurse_lapic_ltimer)
+ pushfl
+ pushl %cs
+ pushl %esi
+ pushl $0
+ pushl $T_ASTFLT
+ INTRENTRY
+IDTVEC(resume_lapic_ltimer)
+ cli
+ jmp 1f
+IDTVEC(intr_lapic_ltimer)
+ pushl $0
+ pushl $T_ASTFLT
+ INTRENTRY
+ movl $0,_C_LABEL(local_apic)+LAPIC_EOI
+ movl CPUVAR(ILEVEL),%ebx
+ cmpl $IPL_CLOCK,%ebx
+ jae 2f
+1:
+ incl CPUVAR(IDEPTH)
+ movl $IPL_CLOCK,CPUVAR(ILEVEL)
+ sti
+ pushl %ebx
+ pushl $0
+ call _C_LABEL(lapic_clockintr)
+ addl $4,%esp
+ jmp _C_LABEL(Xdoreti)
+2:
+ orl $(1 << LIR_TIMER),CPUVAR(IPENDING)
+ sti
+ INTRFASTEXIT
+#endif /* NLAPIC > 0 */
+
+#ifdef MULTIPROCESSOR
+#define LOCK_KERNEL pushl %esp ; call _C_LABEL(x86_intlock) ; addl $4,%esp
+#define UNLOCK_KERNEL pushl %esp ; call _C_LABEL(x86_intunlock) ; addl $4,%esp
+#else
+#define LOCK_KERNEL
+#define UNLOCK_KERNEL
+#endif
+
+#define voidop(num)
+
+
+#define XENINTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
+IDTVEC(recurse_/**/name/**/num) ;\
+ pushfl ;\
+ pushl %cs ;\
+ pushl %esi ;\
+ subl $4,%esp ;\
+ pushl $T_ASTFLT /* trap # for doing ASTs */ ;\
+ INTRENTRY ;\
+IDTVEC(resume_/**/name/**/num) \
+ /*movl %esp,%ecx*/ ;\
+ movl $IREENT_MAGIC,TF_ERR(%esp) ;\
+ movl %ebx,%esi ;\
+ movl CPUVAR(ISOURCES) + (num) * 4, %ebp ;\
+ movl IS_MAXLEVEL(%ebp),%ebx ;\
+ jmp 1f ;\
+IDTVEC(intr_/**/name/**/num) ;\
+ pushl $0 /* dummy error code */ ;\
+ pushl $T_ASTFLT /* trap # for doing ASTs */ ;\
+ INTRENTRY ;\
+ /*movl %esp,%ecx*/ ;\
+ movl CPUVAR(ISOURCES) + (num) * 4, %ebp ;\
+ mask(num) /* mask it in hardware */ ;\
+ early_ack(num) /* and allow other intrs */ ;\
+ testl %ebp,%ebp ;\
+ jz 9f /* stray */ ;\
+ movl IS_MAXLEVEL(%ebp),%ebx ;\
+ movl CPUVAR(ILEVEL),%esi ;\
+ cmpl %ebx,%esi ;\
+ jae 10f /* currently masked; hold it */ ;\
+ incl MY_COUNT+V_INTR /* statistical info */ ;\
+ addl $1,IS_EVCNTLO(%ebp) /* inc event counter */ ;\
+ adcl $0,IS_EVCNTHI(%ebp) ;\
+1: \
+ pushl %esi ;\
+ movl %ebx,CPUVAR(ILEVEL) ;\
+ STI(%eax) ;\
+ incl CPUVAR(IDEPTH) ;\
+ movl IS_HANDLERS(%ebp),%ebx ;\
+ LOCK_KERNEL ;\
+6: \
+ movl IH_LEVEL(%ebx),%edi ;\
+ cmpl %esi,%edi ;\
+ jle 7f ;\
+ pushl %esp ;\
+ pushl IH_ARG(%ebx) ;\
+ movl %edi,CPUVAR(ILEVEL) ;\
+ call *IH_FUN(%ebx) /* call it */ ;\
+ addl $8,%esp /* toss the arg */ ;\
+ movl IH_NEXT(%ebx),%ebx /* next handler in chain */ ;\
+ testl %ebx,%ebx ;\
+ jnz 6b ;\
+5: \
+ UNLOCK_KERNEL ;\
+ CLI(%eax) ;\
+ unmask(num) /* unmask it in hardware */ ;\
+ late_ack(num) ;\
+ STI(%eax) ;\
+ jmp _C_LABEL(Xdoreti) /* lower spl and do ASTs */ ;\
+7: \
+ UNLOCK_KERNEL ;\
+ CLI(%eax) ;\
+ orl $(1 << num),CPUVAR(IPENDING) ;\
+ level_mask(num) ;\
+ late_ack(num) ;\
+ STI(%eax) ;\
+ jmp _C_LABEL(Xdoreti) /* lower spl and do ASTs */ ;\
+10: \
+ CLI(%eax) ;\
+ orl $(1 << num),CPUVAR(IPENDING) ;\
+ level_mask(num) ;\
+6: ; \
+ late_ack(num) ;\
+ STIC(%eax) ;\
+ jz 4f ; \
+ call _C_LABEL(stipending) ; \
+ testl %eax,%eax ; \
+ jnz 1b ; \
+4: INTRFASTEXIT ;\
+9: \
+ unmask(num) ;\
+ jmp 6b
+
+#define hypervisor_asm_unmask(num) \
+ movl irq_to_evtchn + (num) * 4,%ecx ;\
+ movl HYPERVISOR_shared_info,%eax ;\
+ lock ;\
+ btrl %ecx,EVENTS_MASK(%eax)
+
+XENINTRSTUB(xenev,0,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,1,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,2,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,3,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,4,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,5,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,6,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,7,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,8,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,9,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,10,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,11,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,12,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,13,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,14,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,15,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,16,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,17,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,18,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,19,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,20,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,21,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,22,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,23,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,24,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,25,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,26,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,27,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,28,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,29,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,30,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,31,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+
+.globl _C_LABEL(xenev_stubs)
+_C_LABEL(xenev_stubs):
+ .long _C_LABEL(Xintr_xenev0), _C_LABEL(Xrecurse_xenev0)
+ .long _C_LABEL(Xresume_xenev0)
+ .long _C_LABEL(Xintr_xenev1), _C_LABEL(Xrecurse_xenev1)
+ .long _C_LABEL(Xresume_xenev1)
+ .long _C_LABEL(Xintr_xenev2), _C_LABEL(Xrecurse_xenev2)
+ .long _C_LABEL(Xresume_xenev2)
+ .long _C_LABEL(Xintr_xenev3), _C_LABEL(Xrecurse_xenev3)
+ .long _C_LABEL(Xresume_xenev3)
+ .long _C_LABEL(Xintr_xenev4), _C_LABEL(Xrecurse_xenev4)
+ .long _C_LABEL(Xresume_xenev4)
+ .long _C_LABEL(Xintr_xenev5), _C_LABEL(Xrecurse_xenev5)
+ .long _C_LABEL(Xresume_xenev5)
+ .long _C_LABEL(Xintr_xenev6), _C_LABEL(Xrecurse_xenev6)
+ .long _C_LABEL(Xresume_xenev6)
+ .long _C_LABEL(Xintr_xenev7), _C_LABEL(Xrecurse_xenev7)
+ .long _C_LABEL(Xresume_xenev7)
+ .long _C_LABEL(Xintr_xenev8), _C_LABEL(Xrecurse_xenev8)
+ .long _C_LABEL(Xresume_xenev8)
+ .long _C_LABEL(Xintr_xenev9), _C_LABEL(Xrecurse_xenev9)
+ .long _C_LABEL(Xresume_xenev9)
+ .long _C_LABEL(Xintr_xenev10), _C_LABEL(Xrecurse_xenev10)
+ .long _C_LABEL(Xresume_xenev10)
+ .long _C_LABEL(Xintr_xenev11), _C_LABEL(Xrecurse_xenev11)
+ .long _C_LABEL(Xresume_xenev11)
+ .long _C_LABEL(Xintr_xenev12), _C_LABEL(Xrecurse_xenev12)
+ .long _C_LABEL(Xresume_xenev12)
+ .long _C_LABEL(Xintr_xenev13), _C_LABEL(Xrecurse_xenev13)
+ .long _C_LABEL(Xresume_xenev13)
+ .long _C_LABEL(Xintr_xenev14), _C_LABEL(Xrecurse_xenev14)
+ .long _C_LABEL(Xresume_xenev14)
+ .long _C_LABEL(Xintr_xenev15), _C_LABEL(Xrecurse_xenev15)
+ .long _C_LABEL(Xresume_xenev15)
+ .long _C_LABEL(Xintr_xenev16), _C_LABEL(Xrecurse_xenev16)
+ .long _C_LABEL(Xresume_xenev16)
+ .long _C_LABEL(Xintr_xenev17), _C_LABEL(Xrecurse_xenev17)
+ .long _C_LABEL(Xresume_xenev17)
+ .long _C_LABEL(Xintr_xenev18), _C_LABEL(Xrecurse_xenev18)
+ .long _C_LABEL(Xresume_xenev18)
+ .long _C_LABEL(Xintr_xenev19), _C_LABEL(Xrecurse_xenev19)
+ .long _C_LABEL(Xresume_xenev19)
+ .long _C_LABEL(Xintr_xenev20), _C_LABEL(Xrecurse_xenev20)
+ .long _C_LABEL(Xresume_xenev20)
+ .long _C_LABEL(Xintr_xenev21), _C_LABEL(Xrecurse_xenev21)
+ .long _C_LABEL(Xresume_xenev21)
+ .long _C_LABEL(Xintr_xenev22), _C_LABEL(Xrecurse_xenev22)
+ .long _C_LABEL(Xresume_xenev22)
+ .long _C_LABEL(Xintr_xenev23), _C_LABEL(Xrecurse_xenev23)
+ .long _C_LABEL(Xresume_xenev23)
+ .long _C_LABEL(Xintr_xenev24), _C_LABEL(Xrecurse_xenev24)
+ .long _C_LABEL(Xresume_xenev24)
+ .long _C_LABEL(Xintr_xenev25), _C_LABEL(Xrecurse_xenev25)
+ .long _C_LABEL(Xresume_xenev25)
+ .long _C_LABEL(Xintr_xenev26), _C_LABEL(Xrecurse_xenev26)
+ .long _C_LABEL(Xresume_xenev26)
+ .long _C_LABEL(Xintr_xenev27), _C_LABEL(Xrecurse_xenev27)
+ .long _C_LABEL(Xresume_xenev27)
+ .long _C_LABEL(Xintr_xenev28), _C_LABEL(Xrecurse_xenev28)
+ .long _C_LABEL(Xresume_xenev28)
+ .long _C_LABEL(Xintr_xenev29), _C_LABEL(Xrecurse_xenev29)
+ .long _C_LABEL(Xresume_xenev29)
+ .long _C_LABEL(Xintr_xenev30), _C_LABEL(Xrecurse_xenev30)
+ .long _C_LABEL(Xresume_xenev30)
+ .long _C_LABEL(Xintr_xenev31), _C_LABEL(Xrecurse_xenev31)
+ .long _C_LABEL(Xresume_xenev31)
+
+#ifndef XEN
+/*
+ * This macro defines the generic stub code. Its arguments modify it
+ * for specific PICs.
+ */
+
+#define INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
+IDTVEC(recurse_/**/name/**/num) ;\
+ pushfl ;\
+ pushl %cs ;\
+ pushl %esi ;\
+ subl $4,%esp ;\
+ pushl $T_ASTFLT /* trap # for doing ASTs */ ;\
+ INTRENTRY ;\
+IDTVEC(resume_/**/name/**/num) \
+ movl $IREENT_MAGIC,TF_ERR(%esp) ;\
+ movl %ebx,%esi ;\
+ movl CPUVAR(ISOURCES) + (num) * 4, %ebp ;\
+ movl IS_MAXLEVEL(%ebp),%ebx ;\
+ jmp 1f ;\
+IDTVEC(intr_/**/name/**/num) ;\
+ pushl $0 /* dummy error code */ ;\
+ pushl $T_ASTFLT /* trap # for doing ASTs */ ;\
+ INTRENTRY ;\
+ movl CPUVAR(ISOURCES) + (num) * 4, %ebp ;\
+ mask(num) /* mask it in hardware */ ;\
+ early_ack(num) /* and allow other intrs */ ;\
+ testl %ebp,%ebp ;\
+ jz 9f /* stray */ ;\
+ movl IS_MAXLEVEL(%ebp),%ebx ;\
+ movl CPUVAR(ILEVEL),%esi ;\
+ cmpl %ebx,%esi ;\
+ jae 10f /* currently masked; hold it */ ;\
+ incl MY_COUNT+V_INTR /* statistical info */ ;\
+ addl $1,IS_EVCNTLO(%ebp) /* inc event counter */ ;\
+ adcl $0,IS_EVCNTHI(%ebp) ;\
+1: \
+ pushl %esi ;\
+ movl %ebx,CPUVAR(ILEVEL) ;\
+ STI(%eax) ;\
+ incl CPUVAR(IDEPTH) ;\
+ movl IS_HANDLERS(%ebp),%ebx ;\
+ LOCK_KERNEL ;\
+6: \
+ movl IH_LEVEL(%ebx),%edi ;\
+ cmpl %esi,%edi ;\
+ jle 7f ;\
+ pushl IH_ARG(%ebx) ;\
+ movl %edi,CPUVAR(ILEVEL) ;\
+ call *IH_FUN(%ebx) /* call it */ ;\
+ addl $4,%esp /* toss the arg */ ;\
+ movl IH_NEXT(%ebx),%ebx /* next handler in chain */ ;\
+ testl %ebx,%ebx ;\
+ jnz 6b ;\
+5: \
+ UNLOCK_KERNEL ;\
+ CLI(%eax) ;\
+ unmask(num) /* unmask it in hardware */ ;\
+ late_ack(num) ;\
+ STI(%eax) ;\
+ jmp _C_LABEL(Xdoreti) /* lower spl and do ASTs */ ;\
+7: \
+ UNLOCK_KERNEL ;\
+ CLI(%eax) ;\
+ orl $(1 << num),CPUVAR(IPENDING) ;\
+ level_mask(num) ;\
+ late_ack(num) ;\
+ STI(%eax) ;\
+ jmp _C_LABEL(Xdoreti) /* lower spl and do ASTs */ ;\
+10: \
+ CLI(%eax) ;\
+ orl $(1 << num),CPUVAR(IPENDING) ;\
+ level_mask(num) ;\
+ late_ack(num) ;\
+ STIC(%eax) ;\
+ jz 4f ; \
+ call _C_LABEL(stipending) ; \
+ testl %eax,%eax ; \
+ jnz 1b ; \
+4: INTRFASTEXIT ;\
+9: \
+ unmask(num) ;\
+ late_ack(num) ;\
+ STIC(%eax) ;\
+ jz 4f ; \
+ call _C_LABEL(stipending) ; \
+ testl %eax,%eax ; \
+ jnz 1b ; \
+4: INTRFASTEXIT
+
+#define ICUADDR IO_ICU1
+
+INTRSTUB(legacy,0,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,1,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,2,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,3,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,4,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,5,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,6,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,7,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+#undef ICUADDR
+#define ICUADDR IO_ICU2
+
+INTRSTUB(legacy,8,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,9,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,10,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,11,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,12,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,13,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,14,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+INTRSTUB(legacy,15,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+ voidop)
+#endif
+
+#if NIOAPIC > 0
+
+INTRSTUB(ioapic_edge,0,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,1,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,2,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,3,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,4,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,5,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,6,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,7,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,8,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,9,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,10,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,11,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,12,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,13,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,14,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,15,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,16,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,17,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,18,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,19,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,20,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,21,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,22,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,23,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,24,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,25,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,26,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,27,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,28,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,29,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,30,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,31,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+
+INTRSTUB(ioapic_level,0,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,1,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,2,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,3,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,4,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,5,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,6,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,7,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,8,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,9,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,10,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,11,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,12,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,13,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,14,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,15,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,16,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,17,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,18,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,19,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,20,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,21,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,22,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,23,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,24,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,25,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,26,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,27,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,28,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,29,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,30,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,31,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+
+#endif
+
+#ifndef XEN
+.globl _C_LABEL(i8259_stubs)
+_C_LABEL(i8259_stubs):
+ .long _C_LABEL(Xintr_legacy0), _C_LABEL(Xrecurse_legacy0)
+ .long _C_LABEL(Xresume_legacy0)
+ .long _C_LABEL(Xintr_legacy1), _C_LABEL(Xrecurse_legacy1)
+ .long _C_LABEL(Xresume_legacy1)
+ .long _C_LABEL(Xintr_legacy2), _C_LABEL(Xrecurse_legacy2)
+ .long _C_LABEL(Xresume_legacy2)
+ .long _C_LABEL(Xintr_legacy3), _C_LABEL(Xrecurse_legacy3)
+ .long _C_LABEL(Xresume_legacy3)
+ .long _C_LABEL(Xintr_legacy4), _C_LABEL(Xrecurse_legacy4)
+ .long _C_LABEL(Xresume_legacy4)
+ .long _C_LABEL(Xintr_legacy5), _C_LABEL(Xrecurse_legacy5)
+ .long _C_LABEL(Xresume_legacy5)
+ .long _C_LABEL(Xintr_legacy6), _C_LABEL(Xrecurse_legacy6)
+ .long _C_LABEL(Xresume_legacy6)
+ .long _C_LABEL(Xintr_legacy7), _C_LABEL(Xrecurse_legacy7)
+ .long _C_LABEL(Xresume_legacy7)
+ .long _C_LABEL(Xintr_legacy8), _C_LABEL(Xrecurse_legacy8)
+ .long _C_LABEL(Xresume_legacy8)
+ .long _C_LABEL(Xintr_legacy9), _C_LABEL(Xrecurse_legacy9)
+ .long _C_LABEL(Xresume_legacy9)
+ .long _C_LABEL(Xintr_legacy10), _C_LABEL(Xrecurse_legacy10)
+ .long _C_LABEL(Xresume_legacy10)
+ .long _C_LABEL(Xintr_legacy11), _C_LABEL(Xrecurse_legacy11)
+ .long _C_LABEL(Xresume_legacy11)
+ .long _C_LABEL(Xintr_legacy12), _C_LABEL(Xrecurse_legacy12)
+ .long _C_LABEL(Xresume_legacy12)
+ .long _C_LABEL(Xintr_legacy13), _C_LABEL(Xrecurse_legacy13)
+ .long _C_LABEL(Xresume_legacy13)
+ .long _C_LABEL(Xintr_legacy14), _C_LABEL(Xrecurse_legacy14)
+ .long _C_LABEL(Xresume_legacy14)
+ .long _C_LABEL(Xintr_legacy15), _C_LABEL(Xrecurse_legacy15)
+ .long _C_LABEL(Xresume_legacy15)
+#endif
+
+#if NIOAPIC > 0
+.globl _C_LABEL(ioapic_edge_stubs)
+_C_LABEL(ioapic_edge_stubs):
+ .long _C_LABEL(Xintr_ioapic_edge0), _C_LABEL(Xrecurse_ioapic_edge0)
+ .long _C_LABEL(Xresume_ioapic_edge0)
+ .long _C_LABEL(Xintr_ioapic_edge1), _C_LABEL(Xrecurse_ioapic_edge1)
+ .long _C_LABEL(Xresume_ioapic_edge1)
+ .long _C_LABEL(Xintr_ioapic_edge2), _C_LABEL(Xrecurse_ioapic_edge2)
+ .long _C_LABEL(Xresume_ioapic_edge2)
+ .long _C_LABEL(Xintr_ioapic_edge3), _C_LABEL(Xrecurse_ioapic_edge3)
+ .long _C_LABEL(Xresume_ioapic_edge3)
+ .long _C_LABEL(Xintr_ioapic_edge4), _C_LABEL(Xrecurse_ioapic_edge4)
+ .long _C_LABEL(Xresume_ioapic_edge4)
+ .long _C_LABEL(Xintr_ioapic_edge5), _C_LABEL(Xrecurse_ioapic_edge5)
+ .long _C_LABEL(Xresume_ioapic_edge5)
+ .long _C_LABEL(Xintr_ioapic_edge6), _C_LABEL(Xrecurse_ioapic_edge6)
+ .long _C_LABEL(Xresume_ioapic_edge6)
+ .long _C_LABEL(Xintr_ioapic_edge7), _C_LABEL(Xrecurse_ioapic_edge7)
+ .long _C_LABEL(Xresume_ioapic_edge7)
+ .long _C_LABEL(Xintr_ioapic_edge8), _C_LABEL(Xrecurse_ioapic_edge8)
+ .long _C_LABEL(Xresume_ioapic_edge8)
+ .long _C_LABEL(Xintr_ioapic_edge9), _C_LABEL(Xrecurse_ioapic_edge9)
+ .long _C_LABEL(Xresume_ioapic_edge9)
+ .long _C_LABEL(Xintr_ioapic_edge10), _C_LABEL(Xrecurse_ioapic_edge10)
+ .long _C_LABEL(Xresume_ioapic_edge10)
+ .long _C_LABEL(Xintr_ioapic_edge11), _C_LABEL(Xrecurse_ioapic_edge11)
+ .long _C_LABEL(Xresume_ioapic_edge11)
+ .long _C_LABEL(Xintr_ioapic_edge12), _C_LABEL(Xrecurse_ioapic_edge12)
+ .long _C_LABEL(Xresume_ioapic_edge12)
+ .long _C_LABEL(Xintr_ioapic_edge13), _C_LABEL(Xrecurse_ioapic_edge13)
+ .long _C_LABEL(Xresume_ioapic_edge13)
+ .long _C_LABEL(Xintr_ioapic_edge14), _C_LABEL(Xrecurse_ioapic_edge14)
+ .long _C_LABEL(Xresume_ioapic_edge14)
+ .long _C_LABEL(Xintr_ioapic_edge15), _C_LABEL(Xrecurse_ioapic_edge15)
+ .long _C_LABEL(Xresume_ioapic_edge15)
+ .long _C_LABEL(Xintr_ioapic_edge16), _C_LABEL(Xrecurse_ioapic_edge16)
+ .long _C_LABEL(Xresume_ioapic_edge16)
+ .long _C_LABEL(Xintr_ioapic_edge17), _C_LABEL(Xrecurse_ioapic_edge17)
+ .long _C_LABEL(Xresume_ioapic_edge17)
+ .long _C_LABEL(Xintr_ioapic_edge18), _C_LABEL(Xrecurse_ioapic_edge18)
+ .long _C_LABEL(Xresume_ioapic_edge18)
+ .long _C_LABEL(Xintr_ioapic_edge19), _C_LABEL(Xrecurse_ioapic_edge19)
+ .long _C_LABEL(Xresume_ioapic_edge19)
+ .long _C_LABEL(Xintr_ioapic_edge20), _C_LABEL(Xrecurse_ioapic_edge20)
+ .long _C_LABEL(Xresume_ioapic_edge20)
+ .long _C_LABEL(Xintr_ioapic_edge21), _C_LABEL(Xrecurse_ioapic_edge21)
+ .long _C_LABEL(Xresume_ioapic_edge21)
+ .long _C_LABEL(Xintr_ioapic_edge22), _C_LABEL(Xrecurse_ioapic_edge22)
+ .long _C_LABEL(Xresume_ioapic_edge22)
+ .long _C_LABEL(Xintr_ioapic_edge23), _C_LABEL(Xrecurse_ioapic_edge23)
+ .long _C_LABEL(Xresume_ioapic_edge23)
+ .long _C_LABEL(Xintr_ioapic_edge24), _C_LABEL(Xrecurse_ioapic_edge24)
+ .long _C_LABEL(Xresume_ioapic_edge24)
+ .long _C_LABEL(Xintr_ioapic_edge25), _C_LABEL(Xrecurse_ioapic_edge25)
+ .long _C_LABEL(Xresume_ioapic_edge25)
+ .long _C_LABEL(Xintr_ioapic_edge26), _C_LABEL(Xrecurse_ioapic_edge26)
+ .long _C_LABEL(Xresume_ioapic_edge26)
+ .long _C_LABEL(Xintr_ioapic_edge27), _C_LABEL(Xrecurse_ioapic_edge27)
+ .long _C_LABEL(Xresume_ioapic_edge27)
+ .long _C_LABEL(Xintr_ioapic_edge28), _C_LABEL(Xrecurse_ioapic_edge28)
+ .long _C_LABEL(Xresume_ioapic_edge28)
+ .long _C_LABEL(Xintr_ioapic_edge29), _C_LABEL(Xrecurse_ioapic_edge29)
+ .long _C_LABEL(Xresume_ioapic_edge29)
+ .long _C_LABEL(Xintr_ioapic_edge30), _C_LABEL(Xrecurse_ioapic_edge30)
+ .long _C_LABEL(Xresume_ioapic_edge30)
+ .long _C_LABEL(Xintr_ioapic_edge31), _C_LABEL(Xrecurse_ioapic_edge31)
+ .long _C_LABEL(Xresume_ioapic_edge31)
+
+.globl _C_LABEL(ioapic_level_stubs)
+_C_LABEL(ioapic_level_stubs):
+ .long _C_LABEL(Xintr_ioapic_level0), _C_LABEL(Xrecurse_ioapic_level0)
+ .long _C_LABEL(Xresume_ioapic_level0)
+ .long _C_LABEL(Xintr_ioapic_level1), _C_LABEL(Xrecurse_ioapic_level1)
+ .long _C_LABEL(Xresume_ioapic_level1)
+ .long _C_LABEL(Xintr_ioapic_level2), _C_LABEL(Xrecurse_ioapic_level2)
+ .long _C_LABEL(Xresume_ioapic_level2)
+ .long _C_LABEL(Xintr_ioapic_level3), _C_LABEL(Xrecurse_ioapic_level3)
+ .long _C_LABEL(Xresume_ioapic_level3)
+ .long _C_LABEL(Xintr_ioapic_level4), _C_LABEL(Xrecurse_ioapic_level4)
+ .long _C_LABEL(Xresume_ioapic_level4)
+ .long _C_LABEL(Xintr_ioapic_level5), _C_LABEL(Xrecurse_ioapic_level5)
+ .long _C_LABEL(Xresume_ioapic_level5)
+ .long _C_LABEL(Xintr_ioapic_level6), _C_LABEL(Xrecurse_ioapic_level6)
+ .long _C_LABEL(Xresume_ioapic_level6)
+ .long _C_LABEL(Xintr_ioapic_level7), _C_LABEL(Xrecurse_ioapic_level7)
+ .long _C_LABEL(Xresume_ioapic_level7)
+ .long _C_LABEL(Xintr_ioapic_level8), _C_LABEL(Xrecurse_ioapic_level8)
+ .long _C_LABEL(Xresume_ioapic_level8)
+ .long _C_LABEL(Xintr_ioapic_level9), _C_LABEL(Xrecurse_ioapic_level9)
+ .long _C_LABEL(Xresume_ioapic_level9)
+ .long _C_LABEL(Xintr_ioapic_level10), _C_LABEL(Xrecurse_ioapic_level10)
+ .long _C_LABEL(Xresume_ioapic_level10)
+ .long _C_LABEL(Xintr_ioapic_level11), _C_LABEL(Xrecurse_ioapic_level11)
+ .long _C_LABEL(Xresume_ioapic_level11)
+ .long _C_LABEL(Xintr_ioapic_level12), _C_LABEL(Xrecurse_ioapic_level12)
+ .long _C_LABEL(Xresume_ioapic_level12)
+ .long _C_LABEL(Xintr_ioapic_level13), _C_LABEL(Xrecurse_ioapic_level13)
+ .long _C_LABEL(Xresume_ioapic_level13)
+ .long _C_LABEL(Xintr_ioapic_level14), _C_LABEL(Xrecurse_ioapic_level14)
+ .long _C_LABEL(Xresume_ioapic_level14)
+ .long _C_LABEL(Xintr_ioapic_level15), _C_LABEL(Xrecurse_ioapic_level15)
+ .long _C_LABEL(Xresume_ioapic_level15)
+ .long _C_LABEL(Xintr_ioapic_level16), _C_LABEL(Xrecurse_ioapic_level16)
+ .long _C_LABEL(Xresume_ioapic_level16)
+ .long _C_LABEL(Xintr_ioapic_level17), _C_LABEL(Xrecurse_ioapic_level17)
+ .long _C_LABEL(Xresume_ioapic_level17)
+ .long _C_LABEL(Xintr_ioapic_level18), _C_LABEL(Xrecurse_ioapic_level18)
+ .long _C_LABEL(Xresume_ioapic_level18)
+ .long _C_LABEL(Xintr_ioapic_level19), _C_LABEL(Xrecurse_ioapic_level19)
+ .long _C_LABEL(Xresume_ioapic_level19)
+ .long _C_LABEL(Xintr_ioapic_level20), _C_LABEL(Xrecurse_ioapic_level20)
+ .long _C_LABEL(Xresume_ioapic_level20)
+ .long _C_LABEL(Xintr_ioapic_level21), _C_LABEL(Xrecurse_ioapic_level21)
+ .long _C_LABEL(Xresume_ioapic_level21)
+ .long _C_LABEL(Xintr_ioapic_level22), _C_LABEL(Xrecurse_ioapic_level22)
+ .long _C_LABEL(Xresume_ioapic_level22)
+ .long _C_LABEL(Xintr_ioapic_level23), _C_LABEL(Xrecurse_ioapic_level23)
+ .long _C_LABEL(Xresume_ioapic_level23)
+ .long _C_LABEL(Xintr_ioapic_level24), _C_LABEL(Xrecurse_ioapic_level24)
+ .long _C_LABEL(Xresume_ioapic_level24)
+ .long _C_LABEL(Xintr_ioapic_level25), _C_LABEL(Xrecurse_ioapic_level25)
+ .long _C_LABEL(Xresume_ioapic_level25)
+ .long _C_LABEL(Xintr_ioapic_level26), _C_LABEL(Xrecurse_ioapic_level26)
+ .long _C_LABEL(Xresume_ioapic_level26)
+ .long _C_LABEL(Xintr_ioapic_level27), _C_LABEL(Xrecurse_ioapic_level27)
+ .long _C_LABEL(Xresume_ioapic_level27)
+ .long _C_LABEL(Xintr_ioapic_level28), _C_LABEL(Xrecurse_ioapic_level28)
+ .long _C_LABEL(Xresume_ioapic_level28)
+ .long _C_LABEL(Xintr_ioapic_level29), _C_LABEL(Xrecurse_ioapic_level29)
+ .long _C_LABEL(Xresume_ioapic_level29)
+ .long _C_LABEL(Xintr_ioapic_level30), _C_LABEL(Xrecurse_ioapic_level30)
+ .long _C_LABEL(Xresume_ioapic_level30)
+ .long _C_LABEL(Xintr_ioapic_level31), _C_LABEL(Xrecurse_ioapic_level31)
+ .long _C_LABEL(Xresume_ioapic_level31)
+#endif
+
+/*
+ * Symbols that vmstat -i wants, even though they're not used.
+ */
+.globl _C_LABEL(intrnames)
+_C_LABEL(intrnames):
+.globl _C_LABEL(eintrnames)
+_C_LABEL(eintrnames):
+
+.globl _C_LABEL(intrcnt)
+_C_LABEL(intrcnt):
+.globl _C_LABEL(eintrcnt)
+_C_LABEL(eintrcnt):
+
+/*
+ * Soft interrupt handlers
+ */
+
+IDTVEC(softserial)
+ movl $IPL_SOFTSERIAL, CPUVAR(ILEVEL)
+ incl CPUVAR(IDEPTH)
+#ifdef MULTIPROCESSOR
+ call _C_LABEL(x86_softintlock)
+#endif
+ movl CPUVAR(ISOURCES) + SIR_SERIAL * 4, %edi
+ addl $1,IS_EVCNTLO(%edi)
+ adcl $0,IS_EVCNTHI(%edi)
+ pushl $X86_SOFTINTR_SOFTSERIAL
+ call _C_LABEL(softintr_dispatch)
+ addl $4,%esp
+#ifdef MULTIPROCESSOR
+ call _C_LABEL(x86_softintunlock)
+#endif
+ decl CPUVAR(IDEPTH)
+ jmp *%esi
+
+IDTVEC(softnet)
+ movl $IPL_SOFTNET, CPUVAR(ILEVEL)
+ incl CPUVAR(IDEPTH)
+#ifdef MULTIPROCESSOR
+ call _C_LABEL(x86_softintlock)
+#endif
+ movl CPUVAR(ISOURCES) + SIR_NET * 4, %edi
+ addl $1,IS_EVCNTLO(%edi)
+ adcl $0,IS_EVCNTHI(%edi)
+
+ xorl %edi,%edi
+ xchgl _C_LABEL(netisr),%edi
+
+ /* XXX Do the legacy netisrs here for now. */
+#define DONETISR(s, c) \
+ .globl _C_LABEL(c) ;\
+ testl $(1 << s),%edi ;\
+ jz 1f ;\
+ call _C_LABEL(c) ;\
+1:
+#include <net/netisr_dispatch.h>
+
+ pushl $X86_SOFTINTR_SOFTNET
+ call _C_LABEL(softintr_dispatch)
+ addl $4,%esp
+#ifdef MULTIPROCESSOR
+ call _C_LABEL(x86_softintunlock)
+#endif
+ decl CPUVAR(IDEPTH)
+ jmp *%esi
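+
+/*
+ * (Illustration, not from the original: the DONETISR dance in Xsoftnet
+ * above is the assembly form of the usual C pattern for draining the
+ * legacy netisr word, roughly:
+ *
+ *	int isr = netisr;		(fetched and cleared atomically
+ *	netisr = 0;			 by the xchgl above)
+ *	#define DONETISR(bit, fn)	if (isr & (1 << (bit))) fn();
+ *	#include <net/netisr_dispatch.h>
+ *
+ * netisr_dispatch.h expands DONETISR() once per configured protocol,
+ * e.g. invoking ipintr() when the NETISR_IP bit is set.)
+ */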
+
+IDTVEC(softclock)
+ movl $IPL_SOFTCLOCK, CPUVAR(ILEVEL)
+ incl CPUVAR(IDEPTH)
+#ifdef MULTIPROCESSOR
+ call _C_LABEL(x86_softintlock)
+#endif
+ movl CPUVAR(ISOURCES) + SIR_CLOCK * 4, %edi
+ addl $1,IS_EVCNTLO(%edi)
+ adcl $0,IS_EVCNTHI(%edi)
+
+ pushl $X86_SOFTINTR_SOFTCLOCK
+ call _C_LABEL(softintr_dispatch)
+ addl $4,%esp
+#ifdef MULTIPROCESSOR
+ call _C_LABEL(x86_softintunlock)
+#endif
+ decl CPUVAR(IDEPTH)
+ jmp *%esi
+
+/*
+ * Trap and fault vector routines
+ *
+ * On exit from the kernel to user mode, we always need to check for ASTs. In
+ * addition, we need to do this atomically; otherwise an interrupt may occur
+ * which causes an AST, but it won't get processed until the next kernel entry
+ * (possibly the next clock tick). Thus, we disable interrupts before checking,
+ * and only enable them again on the final `iret' or before calling the AST
+ * handler.
+ */
+
+#define TRAP(a) pushl $(a) ; jmp _C_LABEL(alltraps)
+#define ZTRAP(a) pushl $0 ; TRAP(a)
+
+#ifdef IPKDB
+#define BPTTRAP(a) pushl $0; pushl $(a); jmp _C_LABEL(bpttraps)
+#else
+#define BPTTRAP(a) ZTRAP(a)
+#endif
+
+
+ .text
+IDTVEC(trap00)
+ ZTRAP(T_DIVIDE)
+IDTVEC(trap01)
+ BPTTRAP(T_TRCTRAP)
+IDTVEC(trap02)
+ ZTRAP(T_NMI)
+IDTVEC(trap03)
+ BPTTRAP(T_BPTFLT)
+IDTVEC(trap04)
+ ZTRAP(T_OFLOW)
+IDTVEC(trap05)
+ ZTRAP(T_BOUND)
+IDTVEC(trap06)
+ ZTRAP(T_PRIVINFLT)
+IDTVEC(trap07)
+#if NNPX > 0
+ pushl $0 # dummy error code
+ pushl $T_DNA
+ INTRENTRY
+#ifdef XENDEBUG_LOW
+ pushl %esp
+#endif
+ pushl CPUVAR(SELF)
+ call *_C_LABEL(npxdna_func)
+ addl $4,%esp
+#ifdef XENDEBUG_LOW
+ addl $4,%esp
+#endif
+ testl %eax,%eax
+ jz calltrap
+ INTRFASTEXIT
+#else
+ ZTRAP(T_DNA)
+#endif
+IDTVEC(trap08)
+ TRAP(T_DOUBLEFLT)
+IDTVEC(trap09)
+ ZTRAP(T_FPOPFLT)
+IDTVEC(trap0a)
+ TRAP(T_TSSFLT)
+IDTVEC(trap0b)
+ TRAP(T_SEGNPFLT)
+IDTVEC(trap0c)
+ TRAP(T_STKFLT)
+IDTVEC(trap0d)
+ TRAP(T_PROTFLT)
+#ifndef XEN
+IDTVEC(trap0e)
+#ifndef I586_CPU
+ TRAP(T_PAGEFLT)
+#else
+ pushl $T_PAGEFLT
+ INTRENTRY
+ testb $PGEX_U,TF_ERR(%esp)
+ jnz calltrap
+ movl %cr2,%eax
+ subl _C_LABEL(pentium_idt),%eax
+ cmpl $(6*8),%eax
+ jne calltrap
+ movb $T_PRIVINFLT,TF_TRAPNO(%esp)
+ jmp calltrap
+#endif
+#endif
+
+IDTVEC(intrspurious)
+IDTVEC(trap0f)
+ /*
+ * The Pentium Pro local APIC may erroneously call this vector for a
+ * default IR7. Just ignore it.
+ *
+ * (The local APIC does this when CPL is raised while it's on the
+ * way to delivering an interrupt.. presumably enough has been set
+ * up that it's inconvenient to abort delivery completely..)
+ */
+ iret
+
+IDTVEC(trap10)
+#if NNPX > 0
+ /*
+ * Handle like an interrupt so that we can call npxintr to clear the
+ * error. It would be better to handle npx interrupts as traps but
+ * this is difficult for nested interrupts.
+ */
+ pushl $0 # dummy error code
+ pushl $T_ASTFLT
+ INTRENTRY
+ pushl CPUVAR(ILEVEL)
+ pushl %esp
+ incl _C_LABEL(uvmexp)+V_TRAP
+ call _C_LABEL(npxintr)
+ addl $8,%esp
+ INTRFASTEXIT
+#else
+ ZTRAP(T_ARITHTRAP)
+#endif
+IDTVEC(trap11)
+ TRAP(T_ALIGNFLT)
+IDTVEC(trap12)
+IDTVEC(trap13)
+IDTVEC(trap14)
+IDTVEC(trap15)
+IDTVEC(trap16)
+IDTVEC(trap17)
+IDTVEC(trap18)
+IDTVEC(trap19)
+IDTVEC(trap1a)
+IDTVEC(trap1b)
+IDTVEC(trap1c)
+IDTVEC(trap1d)
+IDTVEC(trap1e)
+IDTVEC(trap1f)
+ /* 18 - 31 reserved for future exp */
+ ZTRAP(T_RESERVED)
+
+IDTVEC(exceptions)
+#ifndef XENDEBUG_LOW
+ .long _C_LABEL(Xtrap00), _C_LABEL(Xtrap01)
+ .long _C_LABEL(Xtrap02), _C_LABEL(Xtrap03)
+ .long _C_LABEL(Xtrap04), _C_LABEL(Xtrap05)
+ .long _C_LABEL(Xtrap06), _C_LABEL(Xtrap07)
+ .long _C_LABEL(Xtrap08), _C_LABEL(Xtrap09)
+ .long _C_LABEL(Xtrap0a), _C_LABEL(Xtrap0b)
+ .long _C_LABEL(Xtrap0c), _C_LABEL(Xtrap0d)
+ .long _C_LABEL(Xtrap0e), _C_LABEL(Xtrap0f)
+ .long _C_LABEL(Xtrap10), _C_LABEL(Xtrap11)
+ .long _C_LABEL(Xtrap12), _C_LABEL(Xtrap13)
+ .long _C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
+ .long _C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
+ .long _C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
+ .long _C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
+ .long _C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
+ .long _C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
+#else
+ .long _C_LABEL(divide_error), _C_LABEL(debug)
+ .long _C_LABEL(Xtrap02), _C_LABEL(Xtrap03) #int3)
+ .long _C_LABEL(overflow), _C_LABEL(bounds)
+ .long _C_LABEL(invalid_op), _C_LABEL(device_not_available)
+ .long _C_LABEL(double_fault), _C_LABEL(coprocessor_segment_overrun)
+ .long _C_LABEL(invalid_TSS), _C_LABEL(segment_not_present)
+ .long _C_LABEL(stack_segment)
+ #.long _C_LABEL(general_protection)
+ .long _C_LABEL(Xtrap0d)
+ #.long _C_LABEL(page_fault)
+ .long _C_LABEL(Xtrap0e)
+ .long _C_LABEL(spurious_interrupt_bug)
+ .long _C_LABEL(coprocessor_error), _C_LABEL(alignment_check)
+ .long _C_LABEL(machine_check), _C_LABEL(simd_coprocessor_error)
+ .long _C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
+ .long _C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
+ .long _C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
+ .long _C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
+ .long _C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
+ .long _C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
+#endif
+
+
+IDTVEC(tss_trap08)
+1:
+ str %ax
+ GET_TSS
+ movzwl (%eax),%eax
+ GET_TSS
+ pushl $T_DOUBLEFLT
+ pushl %eax
+ call _C_LABEL(trap_tss)
+ addl $12,%esp
+ iret
+ jmp 1b
+
+/* LINTSTUB: Ignore */
+NENTRY(alltraps)
+ INTRENTRY
+calltrap:
+#ifdef DIAGNOSTIC
+ movl CPUVAR(ILEVEL),%ebx
+#endif /* DIAGNOSTIC */
+ pushl %esp
+ call _C_LABEL(trap)
+ addl $4,%esp
+ testb $CHK_UPL,TF_CS(%esp)
+ jnz alltraps_checkast
+#ifdef VM86
+ testl $PSL_VM,TF_EFLAGS(%esp)
+ jz 6f
+#else
+ jmp 6f
+#endif
+alltraps_checkast:
+ /* Check for ASTs on exit to user mode. */
+ CLI(%eax)
+ CHECK_ASTPENDING(%eax)
+ jz 3f
+5: CLEAR_ASTPENDING(%eax)
+ STI(%eax)
+ movl $T_ASTFLT,TF_TRAPNO(%esp)
+ pushl %esp
+ call _C_LABEL(trap)
+ addl $4,%esp
+ jmp alltraps_checkast /* re-check ASTs */
+3: CHECK_DEFERRED_SWITCH(%eax)
+ jnz 9f
+6: STIC(%eax)
+ jz 4f
+ call _C_LABEL(stipending)
+ #testl %eax,%eax /* XXXcl */
+ #jnz 1b
+4:
+#ifndef DIAGNOSTIC
+ INTRFASTEXIT
+#else
+ cmpl CPUVAR(ILEVEL),%ebx
+ jne 3f
+ INTRFASTEXIT
+3: pushl $4f
+ call _C_LABEL(printf)
+ addl $4,%esp
+#ifdef DDB
+ int $3
+#endif /* DDB */
+ movl %ebx,CPUVAR(ILEVEL)
+ jmp alltraps_checkast /* re-check ASTs */
+4: .asciz "WARNING: SPL NOT LOWERED ON TRAP EXIT\n"
+#endif /* DIAGNOSTIC */
+9: STI(%eax)
+ call _C_LABEL(pmap_load)
+ jmp alltraps_checkast /* re-check ASTs */
+
+/* LINTSTUB: Ignore */
+IDTVEC(trap0e)
+ INTRENTRY
+ movl TF_TRAPNO(%esp),%eax
+ movl $T_PAGEFLT,TF_TRAPNO(%esp)
+#ifdef DIAGNOSTIC
+ movl CPUVAR(ILEVEL),%ebx
+#endif /* DIAGNOSTIC */
+ #pushl %esp
+ pushl %eax
+ movl %esp,%eax
+ addl $4,%eax
+ pushl %eax
+ call _C_LABEL(trap)
+ addl $4,%esp
+ addl $4,%esp
+ testb $CHK_UPL,TF_CS(%esp)
+ jnz trap0e_checkast
+#ifdef VM86
+ testl $PSL_VM,TF_EFLAGS(%esp)
+ jz 6f
+#else
+ jmp 6f
+#endif
+trap0e_checkast:
+ /* Check for ASTs on exit to user mode. */
+ CLI(%eax)
+ CHECK_ASTPENDING(%eax)
+ jz 3f
+5: CLEAR_ASTPENDING(%eax)
+ STI(%eax)
+ movl $T_ASTFLT,TF_TRAPNO(%esp)
+ pushl %esp
+ call _C_LABEL(trap)
+ addl $4,%esp
+ jmp trap0e_checkast /* re-check ASTs */
+3: CHECK_DEFERRED_SWITCH(%eax)
+ jnz 9f
+6: STIC(%eax)
+ jz 4f
+ call _C_LABEL(stipending)
+ #testl %eax,%eax /* XXXcl */
+ #jnz 1b
+4:
+#ifndef DIAGNOSTIC
+ INTRFASTEXIT
+#else
+ cmpl CPUVAR(ILEVEL),%ebx
+ jne 3f
+ INTRFASTEXIT
+3: pushl $4f
+ call _C_LABEL(printf)
+ addl $4,%esp
+#ifdef DDB
+ int $3
+#endif /* DDB */
+ movl %ebx,CPUVAR(ILEVEL)
+ jmp trap0e_checkast /* re-check ASTs */
+4: .asciz "WARNING: SPL NOT LOWERED ON TRAP EXIT\n"
+#endif /* DIAGNOSTIC */
+9: STI(%eax)
+ call _C_LABEL(pmap_load)
+ jmp trap0e_checkast /* re-check ASTs */
+
+#ifdef IPKDB
+/* LINTSTUB: Ignore */
+NENTRY(bpttraps)
+ INTRENTRY
+ call _C_LABEL(ipkdb_trap_glue)
+ testl %eax,%eax
+ jz calltrap
+ INTRFASTEXIT
+
+ipkdbsetup:
+ popl %ecx
+
+ /* Disable write protection: */
+ movl %cr0,%eax
+ pushl %eax
+ andl $~CR0_WP,%eax
+ movl %eax,%cr0
+
+ /* Substitute Protection & Page Fault handlers: */
+ movl _C_LABEL(idt),%edx
+ pushl 13*8(%edx)
+ pushl 13*8+4(%edx)
+ pushl 14*8(%edx)
+ pushl 14*8+4(%edx)
+ movl $fault,%eax
+ movw %ax,13*8(%edx)
+ movw %ax,14*8(%edx)
+ shrl $16,%eax
+ movw %ax,13*8+6(%edx)
+ movw %ax,14*8+6(%edx)
+
+ pushl %ecx
+ ret
+
+ipkdbrestore:
+ popl %ecx
+
+ /* Restore Protection & Page Fault handlers: */
+ movl _C_LABEL(idt),%edx
+ popl 14*8+4(%edx)
+ popl 14*8(%edx)
+ popl 13*8+4(%edx)
+ popl 13*8(%edx)
+
+ /* Restore write protection: */
+ popl %edx
+ movl %edx,%cr0
+
+ pushl %ecx
+ ret
+#endif /* IPKDB */
+
+
+/*
+ * If an error is detected during trap, syscall, or interrupt exit, trap() will
+ * change %eip to point to one of these labels. We clean up the stack, if
+ * necessary, and resume as if we were handling a general protection fault.
+ * This will cause the process to get a SIGBUS.
+ */
+/* LINTSTUB: Var: char resume_iret[1]; */
+NENTRY(resume_iret)
+ ZTRAP(T_PROTFLT)
+/* LINTSTUB: Var: char resume_pop_ds[1]; */
+NENTRY(resume_pop_ds)
+ movl %es,TF_ES(%esp)
+ movl $GSEL(GDATA_SEL, SEL_KPL),%eax
+ movw %ax,%es
+/* LINTSTUB: Var: char resume_pop_es[1]; */
+NENTRY(resume_pop_es)
+ movl %fs,TF_FS(%esp)
+ movl $GSEL(GDATA_SEL, SEL_KPL),%eax
+ movw %ax,%fs
+/* LINTSTUB: Var: char resume_pop_fs[1]; */
+NENTRY(resume_pop_fs)
+ movl %gs,TF_GS(%esp)
+ movl $GSEL(GDATA_SEL, SEL_KPL),%eax
+ movw %ax,%gs
+/* LINTSTUB: Var: char resume_pop_gs[1]; */
+NENTRY(resume_pop_gs)
+ movl $T_PROTFLT,TF_TRAPNO(%esp)
+ jmp calltrap
+
+#ifdef IPKDB
+/* LINTSTUB: Func: int ipkdbfbyte(u_char *c) */
+NENTRY(ipkdbfbyte)
+ pushl %ebp
+ movl %esp,%ebp
+ call ipkdbsetup
+ movl 8(%ebp),%edx
+ movzbl (%edx),%eax
+faultexit:
+ call ipkdbrestore
+ popl %ebp
+ ret
+
+/* LINTSTUB: Func: int ipkdbsbyte(u_char *c, int i) */
+NENTRY(ipkdbsbyte)
+ pushl %ebp
+ movl %esp,%ebp
+ call ipkdbsetup
+ movl 8(%ebp),%edx
+ movl 12(%ebp),%eax
+ movb %al,(%edx)
+ call ipkdbrestore
+ popl %ebp
+ ret
+
+fault:
+ popl %eax /* error code */
+ movl $faultexit,%eax
+ movl %eax,(%esp)
+ movl $-1,%eax
+ iret
+#endif /* IPKDB */
+
+
+
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until weve done all processing. HOWEVER, we must enable events before
+# popping the stack frame (cant be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so wed
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(hypervisor_callback)
+ pushl $0 # dummy error code
+ pushl $T_ASTFLT
+ INTRENTRY
+ movl TF_EIP(%esp),%eax
+ cmpl $scrit,%eax
+ jb 11f
+ cmpl $ecrit,%eax
+ jb critical_region_fixup
+11: push %esp
+ call do_hypervisor_callback
+ add $4,%esp
+ movl HYPERVISOR_shared_info,%esi
+ xorl %eax,%eax
+ movb TF_CS(%esp),%cl
+ test $CHK_UPL,%cl # slow return to ring 2 or 3
+ je safesti
+ movl CPUVAR(ILEVEL),%ebx
+ jmp doreti_checkast
+safesti:XEN_UNBLOCK_EVENTS(%esi) # reenable event callbacks
+scrit: /**** START OF CRITICAL REGION ****/
+ testb $1,evtchn_upcall_pending(%esi)
+ jnz 14f # process more events if necessary...
+ INTRFASTEXIT
+critiret:
+14: XEN_BLOCK_EVENTS(%esi)
+ jmp 11b
+ecrit: /**** END OF CRITICAL REGION ****/
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. The critical region here is tiny, so no
+# lookup table is needed: nothing has been popped unless the saved %eip
+# sits on the final iret (the critiret label above), in which case the
+# whole trap frame of TF_PUSHSIZE+0x8 bytes is already gone and only the
+# iret words remain on the interrupted stack.
+critical_region_fixup:
+ cmpl $(critiret-1),%eax # eip points to iret?
+ jne 1f
+ movl $(TF_PUSHSIZE+0x8),%eax
+ jmp 2f
+1: xorl %eax,%eax
+2:
+ # %eax contains num bytes popped
+ mov %esp,%esi
+ add %eax,%esi # %esi points at end of src region
+ mov %esp,%edi
+ add $(TF_PUSHSIZE+0x8+0xC),%edi # %edi points at end of dst region
+ mov %eax,%ecx
+	shr $2,%ecx		# convert byte count to longwords (copy loop count)
+ je 16f # skip loop if nothing to copy
+15: subl $4,%esi # pre-decrementing copy loop
+ subl $4,%edi
+ movl (%esi),%eax
+ movl %eax,(%edi)
+ loop 15b
+16: movl %edi,%esp # final %edi is top of merged stack
+ jmp 11b
+
+
+# Hypervisor uses this for application faults while it executes.
+ENTRY(failsafe_callback)
+ pop %ds
+ pop %es
+ pop %fs
+ pop %gs
+ call _C_LABEL(xen_failsafe_handler)
+ iret
+
+#ifdef XENDEBUG_LOW
+
+ES = 0x20
+ORIG_EAX = 0x24
+EIP = 0x28
+CS = 0x2C
+
+#define SAVE_ALL \
+ cld; \
+ pushl %es; \
+ pushl %ds; \
+ pushl %eax; \
+ pushl %ebp; \
+ pushl %edi; \
+ pushl %esi; \
+ pushl %edx; \
+ pushl %ecx; \
+ pushl %ebx; \
+ movl $GSEL(GDATA_SEL, SEL_KPL),%edx; \
+ movl %edx,%ds; \
+ movl %edx,%es;
+
+#define RESTORE_ALL \
+ popl %ebx; \
+ popl %ecx; \
+ popl %edx; \
+ popl %esi; \
+ popl %edi; \
+ popl %ebp; \
+ popl %eax; \
+ popl %ds; \
+ popl %es; \
+ addl $4,%esp; \
+ iret; \
+
+ret_from_exception:
+ movb CS(%esp),%cl
+ test $2,%cl # slow return to ring 2 or 3
+ jne safesti
+ RESTORE_ALL
+
+
+ENTRY(divide_error)
+ pushl $0 # no error code
+ pushl $do_divide_error
+do_exception:
+ pushl %ds
+ pushl %eax
+ xorl %eax,%eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ decl %eax # eax = -1
+ pushl %ecx
+ pushl %ebx
+ cld
+ movl %es,%ecx
+ movl ORIG_EAX(%esp), %esi # get the error code
+ movl ES(%esp), %edi # get the function address
+ movl %eax, ORIG_EAX(%esp)
+ movl %ecx, ES(%esp)
+ movl %esp,%edx
+ pushl %esi # push the error code
+ pushl %edx # push the pt_regs pointer
+ movl $(__KERNEL_DS),%edx
+ movl %edx,%ds
+ movl %edx,%es
+ call *%edi
+ addl $8,%esp
+ jmp ret_from_exception
+
+ENTRY(coprocessor_error)
+ pushl $0
+ pushl $do_coprocessor_error
+ jmp do_exception
+
+ENTRY(simd_coprocessor_error)
+ pushl $0
+ pushl $do_simd_coprocessor_error
+ jmp do_exception
+
+ENTRY(device_not_available)
+ iret
+
+ENTRY(debug)
+ pushl $0
+ pushl $do_debug
+ jmp do_exception
+
+ENTRY(int3)
+ pushl $0
+ pushl $do_int3
+ jmp do_exception
+
+ENTRY(overflow)
+ pushl $0
+ pushl $do_overflow
+ jmp do_exception
+
+ENTRY(bounds)
+ pushl $0
+ pushl $do_bounds
+ jmp do_exception
+
+ENTRY(invalid_op)
+ pushl $0
+ pushl $do_invalid_op
+ jmp do_exception
+
+ENTRY(coprocessor_segment_overrun)
+ pushl $0
+ pushl $do_coprocessor_segment_overrun
+ jmp do_exception
+
+ENTRY(double_fault)
+ pushl $do_double_fault
+ jmp do_exception
+
+ENTRY(invalid_TSS)
+ pushl $do_invalid_TSS
+ jmp do_exception
+
+ENTRY(segment_not_present)
+ pushl $do_segment_not_present
+ jmp do_exception
+
+ENTRY(stack_segment)
+ pushl $do_stack_segment
+ jmp do_exception
+
+ENTRY(general_protection)
+ pushl $do_general_protection
+ jmp do_exception
+
+ENTRY(alignment_check)
+ pushl $do_alignment_check
+ jmp do_exception
+
+# This handler is special, because it gets an extra value on its stack,
+# which is the linear faulting address.
+ENTRY(page_fault)
+ pushl %ds
+ pushl %eax
+ xorl %eax,%eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ decl %eax # eax = -1
+ pushl %ecx
+ pushl %ebx
+ cld
+ movl %es,%ecx
+ movl ORIG_EAX(%esp), %esi # get the error code
+ movl ES(%esp), %edi # get the faulting address
+ movl %eax, ORIG_EAX(%esp)
+ movl %ecx, ES(%esp)
+ movl %esp,%edx
+ pushl %edi # push the faulting address
+ pushl %esi # push the error code
+ pushl %edx # push the pt_regs pointer
+ movl $(__KERNEL_DS),%edx
+ movl %edx,%ds
+ movl %edx,%es
+ call do_page_fault
+ addl $12,%esp
+ jmp ret_from_exception
+
+ENTRY(machine_check)
+ pushl $0
+ pushl $do_machine_check
+ jmp do_exception
+
+ENTRY(spurious_interrupt_bug)
+ pushl $0
+ pushl $do_spurious_interrupt_bug
+ jmp do_exception
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c
new file mode 100644
index 0000000000..d51baba078
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c
@@ -0,0 +1,680 @@
+/* $NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $");
+
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+
+#include <uvm/uvm.h>
+
+#include <machine/gdt.h>
+#include <machine/xenfunc.h>
+#include <machine/xenpmap.h>
+
+/* #define XENDEBUG */
+/* #define XENDEBUG_LOW */
+
+#ifdef XENDEBUG
+#define XENPRINTF(x) printf x
+#define XENPRINTK(x) printk x
+#define XENPRINTK2(x) /* printk x */
+
+static char XBUF[256];
+#else
+#define XENPRINTF(x)
+#define XENPRINTK(x)
+#define XENPRINTK2(x)
+#endif
+void printk(char *, ...);
+#define PRINTF(x) printf x
+#define PRINTK(x) printk x
+
+shared_info_t *HYPERVISOR_shared_info;
+union start_info_union start_info_union;
+
+void xen_failsafe_handler(void);
+
+void
+xen_failsafe_handler(void)
+{
+
+ panic("xen_failsafe_handler called!\n");
+}
+
+
+void
+xen_update_descriptor(union descriptor *table, union descriptor *entry)
+{
+ paddr_t pa;
+ pt_entry_t *ptp;
+
+ ptp = kvtopte((vaddr_t)table);
+ pa = (*ptp & PG_FRAME) | ((vaddr_t)table & ~PG_FRAME);
+ if (HYPERVISOR_update_descriptor(pa, entry->raw[0], entry->raw[1]))
+ panic("HYPERVISOR_update_descriptor failed\n");
+}
+
+void
+xen_set_ldt(vaddr_t base, uint32_t entries)
+{
+ vaddr_t va;
+ pt_entry_t *ptp, *maptp;
+
+ for (va = base; va < base + entries * sizeof(union descriptor);
+ va += PAGE_SIZE) {
+ KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
+ ptp = kvtopte(va);
+ maptp = (pt_entry_t *)vtomach((vaddr_t)ptp);
+ XENPRINTF(("xen_set_ldt %p %d %p %p\n", (void *)base,
+ entries, ptp, maptp));
+ PTE_CLEARBITS(ptp, maptp, PG_RW);
+ }
+ PTE_UPDATES_FLUSH();
+
+ xpq_queue_set_ldt(base, entries);
+ xpq_flush_queue();
+}
+
+void
+lgdt(struct region_descriptor *rdp)
+{
+
+ panic("lgdt %p %08x\n", (void *)rdp->rd_base, rdp->rd_limit);
+}
+
+void
+xen_parse_cmdline(int what, union xen_cmdline_parseinfo *xcp)
+{
+ char *cmd_line, *opt, *s;
+ int b, i, ipidx = 0;
+ uint32_t xi_ip[5];
+
+ cmd_line = xen_start_info.cmd_line;
+
+ switch (what) {
+ case XEN_PARSE_BOOTDEV:
+ xcp->xcp_bootdev[0] = 0;
+ break;
+ case XEN_PARSE_CONSOLE:
+ xcp->xcp_console[0] = 0;
+ break;
+ }
+
+ while (cmd_line && *cmd_line) {
+ opt = cmd_line;
+ cmd_line = strchr(opt, ' ');
+ if (cmd_line)
+ *cmd_line = 0;
+
+ switch (what) {
+ case XEN_PARSE_BOOTDEV:
+ if (strncasecmp(opt, "bootdev=", 8) == 0)
+ strncpy(xcp->xcp_bootdev, opt + 8,
+				    sizeof(xcp->xcp_bootdev));
+ break;
+
+ case XEN_PARSE_NETINFO:
+ if (xcp->xcp_netinfo.xi_root &&
+ strncasecmp(opt, "nfsroot=", 8) == 0)
+ strncpy(xcp->xcp_netinfo.xi_root, opt + 8,
+ MNAMELEN);
+
+ if (strncasecmp(opt, "ip=", 3) == 0) {
+ memset(xi_ip, 0, sizeof(xi_ip));
+ opt += 3;
+ ipidx = 0;
+ while (opt && *opt) {
+ s = opt;
+ opt = strchr(opt, ':');
+ if (opt)
+ *opt = 0;
+
+ switch (ipidx) {
+ case 0: /* ip */
+ case 1: /* nfs server */
+ case 2: /* gw */
+ case 3: /* mask */
+ case 4: /* host */
+ if (*s == 0)
+ break;
+ for (i = 0; i < 4; i++) {
+ b = strtoul(s, &s, 10);
+ xi_ip[ipidx] = b + 256
+ * xi_ip[ipidx];
+ if (*s != '.')
+ break;
+ s++;
+ }
+ if (i < 3)
+ xi_ip[ipidx] = 0;
+ break;
+ case 5: /* interface */
+ if (!strncmp(s, "xennet", 6))
+ s += 6;
+ else if (!strncmp(s, "eth", 3))
+ s += 3;
+ else
+ break;
+ if (xcp->xcp_netinfo.xi_ifno
+ == strtoul(s, NULL, 10))
+ memcpy(xcp->
+ xcp_netinfo.xi_ip,
+ xi_ip,
+ sizeof(xi_ip));
+ break;
+ }
+ ipidx++;
+
+ if (opt)
+ *opt++ = ':';
+ }
+ }
+ break;
+
+ case XEN_PARSE_CONSOLE:
+ if (strncasecmp(opt, "console=", 8) == 0)
+ strncpy(xcp->xcp_console, opt + 8,
+ sizeof(xcp->xcp_console));
+ break;
+
+ }
+
+ if (cmd_line)
+ *cmd_line++ = ' ';
+ }
+}
+
+
+
+
+
+#define XEN_PAGE_OFFSET 0xC0100000
+
+static pd_entry_t
+xpmap_get_bootpde(paddr_t va)
+{
+
+ return ((pd_entry_t *)xen_start_info.pt_base)[va >> PDSHIFT];
+}
+
+static pd_entry_t
+xpmap_get_vbootpde(paddr_t va)
+{
+ pd_entry_t pde;
+
+ pde = xpmap_get_bootpde(va);
+ if ((pde & PG_V) == 0)
+ return (pde & ~PG_FRAME);
+ return (pde & ~PG_FRAME) |
+ (xpmap_mtop(pde & PG_FRAME) + KERNBASE);
+}
+
+static pt_entry_t *
+xpmap_get_bootptep(paddr_t va)
+{
+ pd_entry_t pde;
+
+ pde = xpmap_get_vbootpde(va);
+ if ((pde & PG_V) == 0)
+ return (void *)-1;
+ return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]);
+}
+
+static pt_entry_t
+xpmap_get_bootpte(paddr_t va)
+{
+
+ return xpmap_get_bootptep(va)[0];
+}
+
+#if defined(XENDEBUG)
+static void
+xpmap_dump_pt(pt_entry_t *ptp, int p)
+{
+ pt_entry_t pte;
+ int j;
+ int bufpos;
+
+ pte = xpmap_ptom((uint32_t)ptp - KERNBASE);
+ PRINTK(("%03x: %p(%p) %08x\n", p, ptp, (void *)pte, p << PDSHIFT));
+
+ bufpos = 0;
+ for (j = 0; j < PTES_PER_PTP; j++) {
+ if ((ptp[j] & PG_V) == 0)
+ continue;
+ pte = ptp[j] /* & PG_FRAME */;
+ bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ",
+ p, j, pte);
+ if (bufpos > 70) {
+ int k;
+ sprintf(XBUF + bufpos, "\n");
+ PRINTK((XBUF));
+ bufpos = 0;
+ for (k = 0; k < 1000000; k++);
+ }
+ }
+ if (bufpos) {
+ PRINTK((XBUF));
+ PRINTK(("\n"));
+ bufpos = 0;
+ }
+}
+#endif
+
+void
+xpmap_init(void)
+{
+ pd_entry_t *xen_pdp;
+ pt_entry_t *ptp, *sysptp;
+ pt_entry_t pte;
+ uint32_t i, j;
+ int bufpos;
+#if defined(XENDEBUG_LOW)
+ extern char kernel_text, _etext, __bss_start, end, *esym;
+#endif
+
+ xpmap_phys_to_machine_mapping = (void *)xen_start_info.mfn_list;
+
+ xen_pdp = (pd_entry_t *)xen_start_info.pt_base;
+
+ XENPRINTK(("text %p data %p bss %p end %p esym %p\n", &kernel_text,
+ &_etext, &__bss_start, &end, esym));
+ XENPRINTK(("xpmap_init PTD %p nkpde %d upages %d xen_PTD %p p2m-map %p\n",
+ (void *)PTDpaddr, nkpde, UPAGES, xen_pdp,
+ xpmap_phys_to_machine_mapping));
+
+ bufpos = 0;
+
+ XENPRINTK(("shared_inf %08x\n", (paddr_t)xen_start_info.shared_info));
+ XENPRINTK(("c0100000: %08x\n",
+ xpmap_get_bootpte(0xc0100000)));
+
+ /* Map kernel. */
+
+ /* Map kernel data/bss/tables. */
+
+ /* Map ISA I/O memory. */
+
+ /* Map kernel PDEs. */
+
+ /* Install a PDE recursively mapping page directory as a page table! */
+
+ sysptp = (pt_entry_t *)(PTDpaddr + ((1 + UPAGES) << PAGE_SHIFT));
+
+ /* make xen's PDE and PTE pages read-only in our pagetable */
+ for (i = 0; i < xen_start_info.nr_pt_frames; i++) {
+ /* mark PTE page read-only in our table */
+ sysptp[((xen_start_info.pt_base +
+ (i << PAGE_SHIFT) - KERNBASE_LOCORE) &
+ (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW;
+ }
+
+ xpq_flush_queue();
+
+ for (i = 0; i < 1 + UPAGES + nkpde; i++) {
+ /* mark PTE page read-only in xen's table */
+ ptp = xpmap_get_bootptep(PTDpaddr + (i << PAGE_SHIFT));
+ xpq_queue_pte_update(
+ (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp & ~PG_RW);
+ XENPRINTK(("%03x: %p(%p) -> %08x\n", i, ptp,
+ (unsigned long)ptp - KERNTEXTOFF, *ptp));
+
+ /* mark PTE page read-only in our table */
+ sysptp[((PTDpaddr + (i << PAGE_SHIFT) - KERNBASE_LOCORE) &
+ (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW;
+
+ /* update our pte's */
+ ptp = (pt_entry_t *)(PTDpaddr + (i << PAGE_SHIFT));
+#if 0
+ pte = xpmap_ptom((uint32_t)ptp - KERNBASE);
+ XENPRINTK(("%03x: %p(%p) %08x\n", i, ptp, pte, i << PDSHIFT));
+#endif
+ for (j = 0; j < PTES_PER_PTP; j++) {
+ if ((ptp[j] & PG_V) == 0)
+ continue;
+ if (ptp[j] == 0xffffffff)
+ ptp[j] = xen_start_info.shared_info |
+ (PG_V|PG_RW);
+ if (ptp[j] >= KERNTEXTOFF) {
+ pte = ptp[j];
+ ptp[j] = (pte & ~PG_FRAME) |
+ (xpmap_get_bootpte(pte & PG_FRAME) &
+ PG_FRAME);
+ }
+#if defined(XENDEBUG) && 0
+ pte = ptp[j] /* & PG_FRAME */;
+ bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ",
+ i, j, pte);
+ if (bufpos > 70) {
+ int k;
+ sprintf(XBUF + bufpos, "\n");
+ XENPRINTK((XBUF));
+ bufpos = 0;
+ for (k = 0; k < 1000000; k++);
+ }
+ }
+ if (bufpos) {
+ XENPRINTK((XBUF));
+ bufpos = 0;
+#endif
+ }
+ if (i == 0)
+ i = 1 + UPAGES - 1;
+ }
+
+#if 0
+ for (i = 0x300; i < 0x305; i++)
+ if (((pt_entry_t *)xen_start_info.pt_base)[i] & PG_V)
+ xpmap_dump_pt((pt_entry_t *)
+ (xpmap_mtop(((pt_entry_t *)xen_start_info.pt_base)[i] &
+ PG_FRAME) + KERNBASE), i);
+ xpmap_dump_pt((pt_entry_t *)xen_start_info.pt_base, 0);
+#endif
+
+ XENPRINTK(("switching pdp: %p, %08lx, %p, %p, %p\n", (void *)PTDpaddr,
+ PTDpaddr - KERNBASE,
+ (void *)xpmap_ptom(PTDpaddr - KERNBASE),
+ (void *)xpmap_get_bootpte(PTDpaddr),
+ (void *)xpmap_mtop(xpmap_ptom(PTDpaddr - KERNBASE))));
+
+#if defined(XENDEBUG)
+ xpmap_dump_pt((pt_entry_t *)PTDpaddr, 0);
+#endif
+
+ xpq_flush_queue();
+
+ xpq_queue_pin_table(xpmap_get_bootpte(PTDpaddr) & PG_FRAME,
+ XPQ_PIN_L2_TABLE);
+ xpq_queue_pt_switch(xpmap_get_bootpte(PTDpaddr) & PG_FRAME);
+ xpq_queue_unpin_table(
+ xpmap_get_bootpte(xen_start_info.pt_base) & PG_FRAME);
+
+ /* make xen's PDE and PTE pages writable in our pagetable */
+ for (i = 0; i < xen_start_info.nr_pt_frames; i++) {
+ /* mark PTE page writable in our table */
+ ptp = &sysptp[((xen_start_info.pt_base +
+ (i << PAGE_SHIFT) - KERNBASE_LOCORE) &
+ (PD_MASK | PT_MASK)) >> PAGE_SHIFT];
+ xpq_queue_pte_update(
+ (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp |
+ PG_RW);
+ }
+
+ xpq_flush_queue();
+ XENPRINTK(("pt_switch done!\n"));
+}
+
+/*
+ * Do a binary search to find out where physical memory ends on the
+ * real hardware. Xen will fail our updates if they are beyond the
+ * last available page (max_page in xen/common/memory.c).
+ */
+paddr_t
+find_pmap_mem_end(vaddr_t va)
+{
+ mmu_update_t r;
+ int start, end, ok;
+ pt_entry_t old;
+
+ start = xen_start_info.nr_pages;
+ end = HYPERVISOR_VIRT_START >> PAGE_SHIFT;
+
+ r.ptr = (unsigned long)&PTE_BASE[x86_btop(va)];
+ old = PTE_BASE[x86_btop(va)];
+
+ while (start + 1 < end) {
+ r.val = (((start + end) / 2) << PAGE_SHIFT) | PG_V;
+
+ if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0)
+ end = (start + end) / 2;
+ else
+ start = (start + end) / 2;
+ }
+ r.val = old;
+ if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0)
+ printf("pmap_mem_end find: old update failed %08x\n",
+ old);
+
+ return end << PAGE_SHIFT;
+}
+
+
+#if 0
+void xpmap_find_memory(paddr_t);
+void
+xpmap_find_memory(paddr_t first_avail)
+{
+ char buf[256];
+ uint32_t i;
+ int bufpos;
+ paddr_t p;
+
+ bufpos = 0;
+ for (i = ((first_avail - KERNTEXTOFF) >> PAGE_SHIFT);
+ i < xen_start_info.nr_pages; i++) {
+ /* if (xpmap_phys_to_machine_mapping[i] */
+ bufpos += sprintf(buf + bufpos, "%03x:%08x:%08x ",
+ i, (uint32_t)xpmap_phys_to_machine_mapping[i],
+ (uint32_t)xpmap_mtop(xpmap_phys_to_machine_mapping[i] <<
+ PAGE_SHIFT));
+ p = xpmap_phys_to_machine_mapping[i];
+ uvm_page_physload(p, p + 1, p, p + 1, VM_FREELIST_DEFAULT);
+
+ if (bufpos > 70) {
+ int k;
+ sprintf(buf + bufpos, "\n");
+ XENPRINTK((buf));
+ bufpos = 0;
+ for (k = 0; k < 1000000; k++);
+ }
+ }
+ if (bufpos) {
+ XENPRINTK((buf));
+ bufpos = 0;
+ }
+}
+#endif
+
+
+#ifdef XENDEBUG
+void xpq_debug_dump(void);
+#endif
+
+#define XPQUEUE_SIZE 2048
+typedef union xpq_queue {
+ struct {
+ pd_entry_t *ptr;
+ pd_entry_t val;
+ } pde;
+ struct {
+ pt_entry_t *ptr;
+ pt_entry_t val;
+ } pte;
+ struct {
+ paddr_t ptr;
+ uint32_t val;
+ } pa;
+} xpq_queue_t;
+static xpq_queue_t xpq_queue[XPQUEUE_SIZE];
+static int xpq_idx = 0;
+
+void
+xpq_flush_queue()
+{
+ int i, ok;
+
+ XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
+ for (i = 0; i < xpq_idx; i++)
+ XENPRINTK2(("%d: %p %08x\n", i, xpq_queue[i].pde.ptr,
+ xpq_queue[i].pde.val));
+ if (xpq_idx != 0 &&
+ HYPERVISOR_mmu_update((mmu_update_t *)xpq_queue, xpq_idx, &ok) < 0)
+ panic("HYPERVISOR_mmu_update failed\n");
+ xpq_idx = 0;
+}
+
+static inline void
+xpq_increment_idx(void)
+{
+
+ xpq_idx++;
+ if (__predict_false(xpq_idx == XPQUEUE_SIZE))
+ xpq_flush_queue();
+}
+
+void
+xpq_queue_invlpg(vaddr_t va)
+{
+
+ XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va));
+ xpq_queue[xpq_idx].pa.ptr = (va & PG_FRAME) | MMU_EXTENDED_COMMAND;
+ xpq_queue[xpq_idx].pa.val = MMUEXT_INVLPG;
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_pde_update(pd_entry_t *ptr, pd_entry_t val)
+{
+
+ xpq_queue[xpq_idx].pde.ptr = ptr;
+ xpq_queue[xpq_idx].pde.val = val;
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_pte_update(pt_entry_t *ptr, pt_entry_t val)
+{
+
+ xpq_queue[xpq_idx].pte.ptr = ptr;
+ xpq_queue[xpq_idx].pte.val = val;
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_unchecked_pte_update(pt_entry_t *ptr, pt_entry_t val)
+{
+
+ xpq_queue[xpq_idx].pa.ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
+ /* XXXcl UNCHECKED_PT_UPDATE */
+ xpq_queue[xpq_idx].pa.val = val;
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_pt_switch(paddr_t pa)
+{
+
+ XENPRINTK2(("xpq_queue_pt_switch: %p %p\n", (void *)pa, (void *)pa));
+ xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND;
+ xpq_queue[xpq_idx].pa.val = MMUEXT_NEW_BASEPTR;
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_pin_table(paddr_t pa, int type)
+{
+
+ XENPRINTK2(("xpq_queue_pin_table: %p %p\n", (void *)pa, (void *)pa));
+ xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND;
+ switch (type) {
+ case XPQ_PIN_L1_TABLE:
+ xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L1_TABLE;
+ break;
+ case XPQ_PIN_L2_TABLE:
+ xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L2_TABLE;
+ break;
+ }
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_unpin_table(paddr_t pa)
+{
+
+ XENPRINTK2(("xpq_queue_unpin_table: %p %p\n", (void *)pa, (void *)pa));
+ xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND;
+ xpq_queue[xpq_idx].pa.val = MMUEXT_UNPIN_TABLE;
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
+{
+
+ XENPRINTK2(("xpq_queue_set_ldt\n"));
+ KASSERT(va == (va & PG_FRAME));
+ xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND | va;
+ xpq_queue[xpq_idx].pa.val = MMUEXT_SET_LDT |
+ (entries << MMUEXT_CMD_SHIFT);
+ xpq_increment_idx();
+}
+
+void
+xpq_queue_tlb_flush()
+{
+
+ XENPRINTK2(("xpq_queue_tlb_flush\n"));
+ xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND;
+ xpq_queue[xpq_idx].pa.val = MMUEXT_TLB_FLUSH;
+ xpq_increment_idx();
+}
+
+#ifdef XENDEBUG
+void
+xpq_debug_dump()
+{
+ int i;
+
+ XENPRINTK2(("idx: %d\n", xpq_idx));
+ for (i = 0; i < xpq_idx; i++) {
+ sprintf(XBUF, "%p %08x ", xpq_queue[i].pte.ptr,
+ xpq_queue[i].pte.val);
+ if (++i < xpq_idx)
+ sprintf(XBUF + strlen(XBUF), "%p %08x ",
+ xpq_queue[i].pte.ptr, xpq_queue[i].pte.val);
+ if (++i < xpq_idx)
+ sprintf(XBUF + strlen(XBUF), "%p %08x ",
+ xpq_queue[i].pte.ptr, xpq_queue[i].pte.val);
+ if (++i < xpq_idx)
+ sprintf(XBUF + strlen(XBUF), "%p %08x ",
+ xpq_queue[i].pte.ptr, xpq_queue[i].pte.val);
+ XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
+ }
+}
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h
new file mode 100644
index 0000000000..cad97f21e1
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h
@@ -0,0 +1,130 @@
+/* $NetBSD: frameasm.h,v 1.1 2004/03/11 21:44:08 cl Exp $ */
+/* NetBSD: frameasm.h,v 1.4 2004/02/20 17:35:01 yamt Exp */
+
+#ifndef _I386_FRAMEASM_H_
+#define _I386_FRAMEASM_H_
+
+#ifdef _KERNEL_OPT
+#include "opt_multiprocessor.h"
+#endif
+
+/* XXX assym.h */
+#define TRAP_INSTR int $0x82
+#define __HYPERVISOR_stack_switch 4
+#define __HYPERVISOR_fpu_taskswitch 7
+
+#ifndef TRAPLOG
+#define TLOG /**/
+#else
+/*
+ * Fill in trap record
+ */
+#define TLOG \
+9: \
+ movl %fs:CPU_TLOG_OFFSET, %eax; \
+ movl %fs:CPU_TLOG_BASE, %ebx; \
+ addl $SIZEOF_TREC,%eax; \
+ andl $SIZEOF_TLOG-1,%eax; \
+ addl %eax,%ebx; \
+ movl %eax,%fs:CPU_TLOG_OFFSET; \
+ movl %esp,TREC_SP(%ebx); \
+ movl $9b,TREC_HPC(%ebx); \
+ movl TF_EIP(%esp),%eax; \
+ movl %eax,TREC_IPC(%ebx); \
+ rdtsc ; \
+ movl %eax,TREC_TSC(%ebx); \
+ movl $MSR_LASTBRANCHFROMIP,%ecx; \
+ rdmsr ; \
+ movl %eax,TREC_LBF(%ebx); \
+ incl %ecx ; \
+ rdmsr ; \
+ movl %eax,TREC_LBT(%ebx); \
+ incl %ecx ; \
+ rdmsr ; \
+ movl %eax,TREC_IBF(%ebx); \
+ incl %ecx ; \
+ rdmsr ; \
+ movl %eax,TREC_IBT(%ebx)
+#endif
+
+/*
+ * These are used on interrupt or trap entry or exit.
+ */
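+/*
+ * A trap stub built on these would typically look like the following
+ * (illustrative sketch only; the handler name is an assumption):
+ *
+ *	INTRENTRY
+ *	call	_C_LABEL(some_trap_handler)
+ *	INTRFASTEXIT
+ */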
+#define INTRENTRY \
+ cld; \
+ subl $TF_PUSHSIZE,%esp ; \
+ movl %gs,TF_GS(%esp) ; \
+ movl %fs,TF_FS(%esp) ; \
+ movl %eax,TF_EAX(%esp) ; \
+ movl %es,TF_ES(%esp) ; \
+ movl %ds,TF_DS(%esp) ; \
+ movl $GSEL(GDATA_SEL, SEL_KPL),%eax ; \
+ movl %edi,TF_EDI(%esp) ; \
+ movl %esi,TF_ESI(%esp) ; \
+ movl %eax,%ds ; \
+ movl %ebp,TF_EBP(%esp) ; \
+ movl %eax,%es ; \
+ movl %ebx,TF_EBX(%esp) ; \
+ movl %eax,%gs ; \
+ movl %edx,TF_EDX(%esp) ; \
+ movl $GSEL(GCPU_SEL, SEL_KPL),%eax ; \
+ movl %ecx,TF_ECX(%esp) ; \
+ movl %eax,%fs ; \
+ TLOG
+
+#define INTRFASTEXIT \
+ movl TF_GS(%esp),%gs ; \
+ movl TF_FS(%esp),%fs ; \
+ movl TF_ES(%esp),%es ; \
+ movl TF_DS(%esp),%ds ; \
+ movl TF_EDI(%esp),%edi ; \
+ movl TF_ESI(%esp),%esi ; \
+ movl TF_EBP(%esp),%ebp ; \
+ movl TF_EBX(%esp),%ebx ; \
+ movl TF_EDX(%esp),%edx ; \
+ movl TF_ECX(%esp),%ecx ; \
+ movl TF_EAX(%esp),%eax ; \
+ addl $(TF_PUSHSIZE+8),%esp ; \
+ iret
+
+#define DO_DEFERRED_SWITCH(reg) \
+ cmpl $0, CPUVAR(WANT_PMAPLOAD) ; \
+ jz 1f ; \
+ call _C_LABEL(pmap_load) ; \
+ 1:
+
+#define CHECK_DEFERRED_SWITCH(reg) \
+ cmpl $0, CPUVAR(WANT_PMAPLOAD)
+
+#define CHECK_ASTPENDING(reg) movl CPUVAR(CURLWP),reg ; \
+ cmpl $0, reg ; \
+ je 1f ; \
+ movl L_PROC(reg),reg ; \
+ cmpl $0, P_MD_ASTPENDING(reg); \
+ 1:
+#define CLEAR_ASTPENDING(reg) movl $0, P_MD_ASTPENDING(reg)
+
+#if !defined(XEN)
+#define CLI(reg) cli
+#define STI(reg) sti
+#else
+/* XXX assym.h */
+#define EVENTS_MASK 136
+/*
+ * Byte offsets into shared_info_t.  evtchn_upcall_pending is at offset 0,
+ * so its macro intentionally expands to nothing.
+ */
+#define evtchn_upcall_pending /* 0 */
+#define evtchn_upcall_mask 1
+
+#define XEN_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
+#define XEN_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
+#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
+
+#define CLI(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \
+ XEN_BLOCK_EVENTS(reg)
+#define STI(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \
+ XEN_UNBLOCK_EVENTS(reg)
+#define STIC(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \
+ XEN_UNBLOCK_EVENTS(reg) ; \
+ testb $1,evtchn_upcall_pending(reg)
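+
+/*
+ * Illustrative use from assembly (the scratch register and branch target
+ * are assumptions, not part of this header):
+ *
+ *	CLI(%esi)		# block event delivery
+ *	...			# critical section
+ *	STIC(%esi)		# unblock and test for pending events
+ *	jnz	1f		# handle them if any are pending
+ */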
+#endif
+
+#endif /* _I386_FRAMEASM_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
new file mode 100644
index 0000000000..13442d22eb
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
@@ -0,0 +1,423 @@
+/* $NetBSD: hypervisor.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $ */
+
+/*
+ *
+ * Communication to/from hypervisor.
+ *
+ * Copyright (c) 2002-2003, K A Fraser
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef _XEN_HYPERVISOR_H_
+#define _XEN_HYPERVISOR_H_
+
+
+struct hypervisor_attach_args {
+ const char *haa_busname;
+};
+
+struct xencons_attach_args {
+ const char *xa_device;
+};
+
+struct xen_npx_attach_args {
+ const char *xa_device;
+};
+
+
+#define u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+#define s8 int8_t
+#define s16 int16_t
+#define s32 int32_t
+#define s64 int64_t
+
+/* include the hypervisor interface */
+#include <sys/systm.h>
+#include <machine/hypervisor-ifs/hypervisor-if.h>
+#include <machine/hypervisor-ifs/dom0_ops.h>
+#include <machine/hypervisor-ifs/event_channel.h>
+#include <machine/hypervisor-ifs/io/domain_controller.h>
+#include <machine/hypervisor-ifs/io/netif.h>
+
+#undef u8
+#undef u16
+#undef u32
+#undef u64
+#undef s8
+#undef s16
+#undef s32
+#undef s64
+
+
+/*
+ * a placeholder for the start of day information passed up from the hypervisor
+ */
+union start_info_union
+{
+ start_info_t start_info;
+ char padding[512];
+};
+extern union start_info_union start_info_union;
+#define xen_start_info (start_info_union.start_info)
+
+
+/* hypervisor.c */
+void do_hypervisor_callback(struct trapframe *regs);
+void hypervisor_notify_via_evtchn(unsigned int);
+void hypervisor_enable_irq(unsigned int);
+void hypervisor_disable_irq(unsigned int);
+void hypervisor_acknowledge_irq(unsigned int);
+
+/* hypervisor_machdep.c */
+void hypervisor_unmask_event(unsigned int);
+void hypervisor_mask_event(unsigned int);
+void hypervisor_clear_event(unsigned int);
+void hypervisor_force_callback(void);
+
+/*
+ * Assembler stubs for hyper-calls.
+ */
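+/*
+ * Each stub loads the hypercall number into %eax and its arguments into
+ * %ebx/%ecx/%edx/%esi/%edi, then issues TRAP_INSTR (int $0x82); the
+ * hypervisor's return value comes back in %eax.  A minimal, purely
+ * illustrative call site:
+ *
+ *	if (HYPERVISOR_yield() != 0)
+ *		printf("yield hypercall failed\n");
+ */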
+
+static inline int HYPERVISOR_set_trap_table(trap_info_t *table)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_set_trap_table),
+ "b" (table) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count,
+ int *success_count)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_mmu_update),
+ "b" (req), "c" (count), "d" (success_count) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_set_gdt),
+ "b" (frame_list), "c" (entries) : "memory" );
+
+
+ return ret;
+}
+
+static inline int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_stack_switch),
+ "b" (ss), "c" (esp) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_set_callbacks(
+ unsigned long event_selector, unsigned long event_address,
+ unsigned long failsafe_selector, unsigned long failsafe_address)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_set_callbacks),
+ "b" (event_selector), "c" (event_address),
+ "d" (failsafe_selector), "S" (failsafe_address) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_fpu_taskswitch(void)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_fpu_taskswitch) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_yield(void)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+ "b" (SCHEDOP_yield) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_block(void)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+ "b" (SCHEDOP_block) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_shutdown(void)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+ "b" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift))
+ : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_reboot(void)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+ "b" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift))
+ : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_suspend(unsigned long srec)
+{
+ int ret;
+ /* NB. On suspend, control software expects a suspend record in %esi. */
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+ "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)),
+ "S" (srec) : "memory" );
+
+ return ret;
+}
+
+static inline long HYPERVISOR_set_timer_op(uint64_t timeout)
+{
+ int ret;
+ unsigned long timeout_hi = (unsigned long)(timeout>>32);
+ unsigned long timeout_lo = (unsigned long)timeout;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_set_timer_op),
+ "b" (timeout_hi), "c" (timeout_lo) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op)
+{
+ int ret;
+ dom0_op->interface_version = DOM0_INTERFACE_VERSION;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_dom0_op),
+ "b" (dom0_op) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_set_debugreg),
+ "b" (reg), "c" (value) : "memory" );
+
+ return ret;
+}
+
+static inline unsigned long HYPERVISOR_get_debugreg(int reg)
+{
+ unsigned long ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_get_debugreg),
+ "b" (reg) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_update_descriptor(
+ unsigned long pa, unsigned long word1, unsigned long word2)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_update_descriptor),
+ "b" (pa), "c" (word1), "d" (word2) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_set_fast_trap(int idx)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_set_fast_trap),
+ "b" (idx) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_dom_mem_op(unsigned int op,
+ unsigned long *extent_list,
+ unsigned long nr_extents,
+ unsigned int extent_order)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_dom_mem_op),
+ "b" (op), "c" (extent_list), "d" (nr_extents), "S" (extent_order),
+ "D" (DOMID_SELF)
+ : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_multicall(void *call_list, int nr_calls)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_multicall),
+ "b" (call_list), "c" (nr_calls) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_update_va_mapping(
+ unsigned long page_nr, unsigned long new_val, unsigned long flags)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping),
+ "b" (page_nr), "c" (new_val), "d" (flags) : "memory" );
+
+ if (__predict_false(ret < 0))
+ panic("Failed update VA mapping: %08lx, %08lx, %08lx",
+ page_nr, new_val, flags);
+
+ return ret;
+}
+
+static inline int HYPERVISOR_event_channel_op(void *op)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_event_channel_op),
+ "b" (op) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_xen_version(int cmd)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_xen_version),
+ "b" (cmd) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_console_io(int cmd, int count, char *str)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_console_io),
+ "b" (cmd), "c" (count), "d" (str) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_physdev_op(void *physdev_op)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_physdev_op),
+ "b" (physdev_op) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_grant_table_op(void *gnttab_op)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_grant_table_op),
+ "b" (gnttab_op) : "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_update_va_mapping_otherdomain(
+ unsigned long page_nr, unsigned long new_val, unsigned long flags, domid_t domid)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping_otherdomain),
+ "b" (page_nr), "c" (new_val), "d" (flags), "S" (domid) :
+ "memory" );
+
+ return ret;
+}
+
+static inline int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
+{
+ int ret;
+ __asm__ __volatile__ (
+ TRAP_INSTR
+ : "=a" (ret) : "0" (__HYPERVISOR_vm_assist),
+ "b" (cmd), "c" (type) : "memory" );
+
+ return ret;
+}
+
+#endif /* _XEN_HYPERVISOR_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h
new file mode 100644
index 0000000000..32a774b1b6
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h
@@ -0,0 +1,110 @@
+/* $NetBSD: if_xennetvar.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_IF_XENNETVAR_H_
+#define _XEN_IF_XENNETVAR_H_
+
+#include <machine/xen.h>
+
+union xennet_bufarray {
+ struct {
+ struct mbuf *xbtx_m;
+ } xb_tx;
+ struct {
+ vaddr_t xbrx_va;
+ paddr_t xbrx_pa;
+ struct xennet_softc *xbrx_sc;
+ } xb_rx;
+ int xb_next;
+};
+
+struct xennet_txbuf {
+ SLIST_ENTRY(xennet_txbuf) xt_next;
+ struct xennet_softc *xt_sc;
+ paddr_t xt_pa;
+ u_char xt_buf[0];
+};
+#define TXBUF_PER_PAGE 2
+#define TXBUF_BUFSIZE	((PAGE_SIZE / TXBUF_PER_PAGE) - sizeof(struct xennet_txbuf))
+
+struct xennet_softc {
+ struct device sc_dev; /* base device glue */
+ struct ethercom sc_ethercom; /* Ethernet common part */
+
+ int sc_ifno;
+
+ uint8_t sc_enaddr[6];
+
+#ifdef mediacode
+ struct ifmedia sc_media;
+#endif
+
+ /* What is the status of our connection to the remote backend? */
+#define BEST_CLOSED 0
+#define BEST_DISCONNECTED 1
+#define BEST_CONNECTED 2
+ unsigned int sc_backend_state;
+
+ unsigned int sc_evtchn;
+ unsigned int sc_irq;
+
+ netif_tx_interface_t *sc_tx;
+ netif_rx_interface_t *sc_rx;
+
+ uint32_t sc_tx_entries;
+ uint32_t sc_tx_resp_cons;
+
+ uint32_t sc_rx_resp_cons;
+ uint32_t sc_rx_bufs_to_notify;
+
+ union xennet_bufarray sc_tx_bufa[NETIF_TX_RING_SIZE];
+	union xennet_bufarray sc_rx_bufa[NETIF_RX_RING_SIZE];
+
+ SLIST_HEAD(, xennet_txbuf) sc_tx_bufs;
+};
+
+struct xennet_attach_args {
+ const char *xa_device;
+ int xa_handle;
+};
+
+struct nfs_diskless;
+
+int xennet_scan(struct device *, struct xennet_attach_args *, cfprint_t);
+void xennet_start(struct ifnet *);
+int xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
+void xennet_watchdog(struct ifnet *ifp);
+int xennet_bootstatic_callback(struct nfs_diskless *);
+
+#endif /* _XEN_IF_XENNETVAR_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h
new file mode 100644
index 0000000000..1a482ea287
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h
@@ -0,0 +1,533 @@
+/* $NetBSD: pmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $ */
+/* NetBSD: pmap.h,v 1.79 2004/02/20 17:35:01 yamt Exp */
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgment:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * pmap.h: see pmap.c for the history of this pmap module.
+ */
+
+#ifndef _I386_PMAP_H_
+#define _I386_PMAP_H_
+
+#if defined(_KERNEL_OPT)
+#include "opt_user_ldt.h"
+#include "opt_largepages.h"
+#endif
+
+#include "opt_xen.h"
+
+#include <machine/cpufunc.h>
+#include <machine/pte.h>
+#include <machine/xenfunc.h>
+#include <machine/xenpmap.h>
+#include <machine/segments.h>
+#include <uvm/uvm_object.h>
+
+/*
+ * see pte.h for a description of i386 MMU terminology and hardware
+ * interface.
+ *
+ * a pmap describes a process's 4GB virtual address space. this
+ * virtual address space can be broken up into 1024 4MB regions which
+ * are described by PDEs in the PDP. the PDEs are defined as follows:
+ *
+ * (ranges are inclusive -> exclusive, just like vm_map_entry start/end)
+ * (the following assumes that KERNBASE is 0xc0000000)
+ *
+ * PDE#s VA range usage
+ * 0->766 0x0 -> 0xbfc00000 user address space
+ * 767 0xbfc00000-> recursive mapping of PDP (used for
+ * 0xc0000000 linear mapping of PTPs)
+ * 768->1023 0xc0000000-> kernel address space (constant
+ * 0xffc00000 across all pmap's/processes)
+ * 1023 0xffc00000-> "alternate" recursive PDP mapping
+ * <end> (for other pmaps)
+ *
+ *
+ * note: a recursive PDP mapping provides a way to map all the PTEs for
+ * a 4GB address space into a linear chunk of virtual memory. in other
+ * words, the PTE for page 0 is the first int mapped into the 4MB recursive
+ * area. the PTE for page 1 is the second int. the very last int in the
+ * 4MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB
+ * address).
+ *
+ * all pmap's PD's must have the same values in slots 768->1023 so that
+ * the kernel is always mapped in every process. these values are loaded
+ * into the PD at pmap creation time.
+ *
+ * at any one time only one pmap can be active on a processor. this is
+ * the pmap whose PDP is pointed to by processor register %cr3. this pmap
+ * will have all its PTEs mapped into memory at the recursive mapping
+ * point (slot #767 as shown above). when the pmap code wants to find the
+ * PTE for a virtual address, all it has to do is the following:
+ *
+ * address of PTE = (767 * 4MB) + (VA / PAGE_SIZE) * sizeof(pt_entry_t)
+ * = 0xbfc00000 + (VA / 4096) * 4
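+ *
+ * for example (illustrative arithmetic only, using the constants above),
+ * for VA = 0xc0000000 (KERNBASE) this gives
+ * 0xbfc00000 + (0xc0000000 / 4096) * 4 = 0xbff00000.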
+ *
+ * what happens if the pmap layer is asked to perform an operation
+ * on a pmap that is not the one which is currently active? in that
+ * case we take the PA of the PDP of non-active pmap and put it in
+ * slot 1023 of the active pmap. this causes the non-active pmap's
+ * PTEs to get mapped in the final 4MB of the 4GB address space
+ * (e.g. starting at 0xffc00000).
+ *
+ * the following figure shows the effects of the recursive PDP mapping:
+ *
+ * PDP (%cr3)
+ * +----+
+ * | 0| -> PTP#0 that maps VA 0x0 -> 0x400000
+ * | |
+ * | |
+ * | 767| -> points back to PDP (%cr3) mapping VA 0xbfc00000 -> 0xc0000000
+ * | 768| -> first kernel PTP (maps 0xc0000000 -> 0xf0400000)
+ * | |
+ * |1023| -> points to alternate pmap's PDP (maps 0xffc00000 -> end)
+ * +----+
+ *
+ * note that the PDE#767 VA (0xbfc00000) is defined as "PTE_BASE"
+ * note that the PDE#1023 VA (0xffc00000) is defined as "APTE_BASE"
+ *
+ * starting at VA 0xbfc00000 the current active PDP (%cr3) acts as a
+ * PTP:
+ *
+ * PTP#767 == PDP(%cr3) => maps VA 0xbfc00000 -> 0xc0000000
+ * +----+
+ * | 0| -> maps the contents of PTP#0 at VA 0xbfc00000->0xbfc01000
+ * | |
+ * | |
+ * | 767| -> maps contents of PTP#767 (the PDP) at VA 0xbffbf000
+ * | 768| -> maps contents of first kernel PTP
+ * | |
+ * |1023|
+ * +----+
+ *
+ * note that mapping of the PDP at PTP#767's VA (0xbffbf000) is
+ * defined as "PDP_BASE".... within that mapping there are two
+ * defines:
+ * "PDP_PDE" (0xbfeffbfc) is the VA of the PDE in the PDP
+ * which points back to itself.
+ * "APDP_PDE" (0xbfeffffc) is the VA of the PDE in the PDP which
+ * establishes the recursive mapping of the alternate pmap.
+ * to set the alternate PDP, one just has to put the correct
+ * PA info in *APDP_PDE.
+ *
+ * note that in the APTE_BASE space, the APDP appears at VA
+ * "APDP_BASE" (0xfffff000).
+ */
+/* XXX MP should we allocate one APDP_PDE per processor?? */
+
+/*
+ * the following defines identify the slots used as described above.
+ */
+
+#define PDSLOT_PTE ((KERNBASE/NBPD)-1) /* 767: for recursive PDP map */
+#define PDSLOT_KERN (KERNBASE/NBPD) /* 768: start of kernel space */
+#define PDSLOT_APTE	((unsigned)1023-16) /* 1007: alternate recursive slot */
+					/* (the top 16 PDEs are reserved for Xen) */
+
+/*
+ * the following defines give the virtual addresses of various MMU
+ * data structures:
+ * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
+ * PTD_BASE and APTD_BASE: the base VA of the recursive mapping of the PTD
+ * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
+ */
+
+#define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD) )
+#define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD) )
+#define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * PAGE_SIZE)))
+#define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * PAGE_SIZE)))
+#define PDP_PDE (PDP_BASE + PDSLOT_PTE)
+#define APDP_PDE (PDP_BASE + PDSLOT_APTE)
+
+/*
+ * the follow define determines how many PTPs should be set up for the
+ * kernel by locore.s at boot time. this should be large enough to
+ * get the VM system running. once the VM system is running, the
+ * pmap module can add more PTPs to the kernel area on demand.
+ */
+
+#ifndef NKPTP
+#define NKPTP 4 /* 16MB to start */
+#endif
+#define NKPTP_MIN 4 /* smallest value we allow */
+#define NKPTP_MAX (1024 - (KERNBASE/NBPD) - 1)
+ /* largest value (-1 for APTP space) */
+
+/*
+ * pdei/ptei: generate index into PDP/PTP from a VA
+ */
+#define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT)
+#define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT)
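+/*
+ * e.g. (illustrative, assuming the usual i386 values PDSHIFT == 22 and
+ * PGSHIFT == 12): pdei(0xc0100000) == 768 and ptei(0xc0100000) == 256.
+ */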
+
+/*
+ * PTP macros:
+ * a PTP's index is the PD index of the PDE that points to it
+ * a PTP's offset is the byte-offset in the PTE space that this PTP is at
+ * a PTP's VA is the first VA mapped by that PTP
+ *
+ * note that PAGE_SIZE == number of bytes in a PTP (4096 bytes == 1024 entries)
+ * NBPD == number of bytes a PTP can map (4MB)
+ */
+
+#define ptp_i2o(I) ((I) * PAGE_SIZE) /* index => offset */
+#define ptp_o2i(O) ((O) / PAGE_SIZE) /* offset => index */
+#define ptp_i2v(I) ((I) * NBPD) /* index => VA */
+#define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */
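+/*
+ * e.g. (illustrative, with NBPD == 4MB as noted above):
+ * ptp_i2v(768) == 0xc0000000 (KERNBASE) and ptp_v2i(0xc0000000) == 768.
+ */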
+
+/*
+ * PG_AVAIL usage: we make use of the ignored bits of the PTE
+ */
+
+#define PG_W PG_AVAIL1 /* "wired" mapping */
+#define PG_PVLIST PG_AVAIL2 /* mapping has entry on pvlist */
+#define PG_X PG_AVAIL3 /* executable mapping */
+
+/*
+ * Number of PTE's per cache line. 4 byte pte, 32-byte cache line
+ * Used to avoid false sharing of cache lines.
+ */
+#define NPTECL 8
+
+#ifdef _KERNEL
+/*
+ * pmap data structures: see pmap.c for details of locking.
+ */
+
+struct pmap;
+typedef struct pmap *pmap_t;
+
+/*
+ * we maintain a list of all non-kernel pmaps
+ */
+
+LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
+
+/*
+ * the pmap structure
+ *
+ * note that the pm_obj contains the simple_lock, the reference count,
+ * page list, and number of PTPs within the pmap.
+ *
+ * XXX If we ever support processor numbers higher than 31, we'll have
+ * XXX to rethink the CPU mask.
+ */
+
+struct pmap {
+ struct uvm_object pm_obj; /* object (lck by object lock) */
+#define pm_lock pm_obj.vmobjlock
+ LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */
+ pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */
+ u_int32_t pm_pdirpa; /* PA of PD (read-only after create) */
+ struct vm_page *pm_ptphint; /* pointer to a PTP in our pmap */
+ struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */
+
+ vaddr_t pm_hiexec; /* highest executable mapping */
+ int pm_flags; /* see below */
+
+ union descriptor *pm_ldt; /* user-set LDT */
+ int pm_ldt_len; /* number of LDT entries */
+ int pm_ldt_sel; /* LDT selector */
+ u_int32_t pm_cpus; /* mask of CPUs using pmap */
+};
+
+/* pm_flags */
+#define PMF_USER_LDT 0x01 /* pmap has user-set LDT */
+
+/*
+ * for each managed physical page we maintain a list of <PMAP,VA>'s
+ * which it is mapped at. the list is headed by a pv_head structure.
+ * there is one pv_head per managed phys page (allocated at boot time).
+ * the pv_head structure points to a list of pv_entry structures (each
+ * describes one mapping).
+ */
+
+struct pv_entry { /* locked by its list's pvh_lock */
+ SPLAY_ENTRY(pv_entry) pv_node; /* splay-tree node */
+ struct pmap *pv_pmap; /* the pmap */
+ vaddr_t pv_va; /* the virtual address */
+ struct vm_page *pv_ptp; /* the vm_page of the PTP */
+};
+
+/*
+ * pv_entrys are dynamically allocated in chunks from a single page.
+ * we keep track of how many pv_entrys are in use for each page and
+ * we can free pv_entry pages if needed. there is one lock for the
+ * entire allocation system.
+ */
+
+struct pv_page_info {
+ TAILQ_ENTRY(pv_page) pvpi_list;
+ struct pv_entry *pvpi_pvfree;
+ int pvpi_nfree;
+};
+
+/*
+ * number of pv_entry's in a pv_page
+ * (note: won't work on systems where NPBG isn't a constant)
+ */
+
+#define PVE_PER_PVPAGE ((PAGE_SIZE - sizeof(struct pv_page_info)) / \
+ sizeof(struct pv_entry))
+
+/*
+ * a pv_page: where pv_entrys are allocated from
+ */
+
+struct pv_page {
+ struct pv_page_info pvinfo;
+ struct pv_entry pvents[PVE_PER_PVPAGE];
+};
+
+/*
+ * global kernel variables
+ */
+
+/* PTDpaddr: is the physical address of the kernel's PDP */
+extern u_long PTDpaddr;
+
+extern struct pmap kernel_pmap_store; /* kernel pmap */
+extern int nkpde; /* current # of PDEs for kernel */
+extern int pmap_pg_g; /* do we support PG_G? */
+
+/*
+ * macros
+ */
+
+#define pmap_kernel() (&kernel_pmap_store)
+#define pmap_resident_count(pmap) ((pmap)->pm_stats.resident_count)
+#define pmap_wired_count(pmap) ((pmap)->pm_stats.wired_count)
+#define pmap_update(pmap) /* nothing (yet) */
+
+#define pmap_clear_modify(pg) pmap_clear_attrs(pg, PG_M)
+#define pmap_clear_reference(pg) pmap_clear_attrs(pg, PG_U)
+#define pmap_copy(DP,SP,D,L,S)
+#define pmap_is_modified(pg) pmap_test_attrs(pg, PG_M)
+#define pmap_is_referenced(pg) pmap_test_attrs(pg, PG_U)
+#define pmap_move(DP,SP,D,L,S)
+#define pmap_phys_address(ppn) x86_ptob(ppn)
+#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */
+
+
+/*
+ * prototypes
+ */
+
+void pmap_activate(struct lwp *);
+void pmap_bootstrap(vaddr_t);
+boolean_t pmap_clear_attrs(struct vm_page *, int);
+void pmap_deactivate(struct lwp *);
+void pmap_deactivate2(struct lwp *);
+void pmap_page_remove (struct vm_page *);
+void pmap_remove(struct pmap *, vaddr_t, vaddr_t);
+boolean_t pmap_test_attrs(struct vm_page *, int);
+void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
+int pmap_exec_fixup(struct vm_map *, struct trapframe *,
+ struct pcb *);
+void pmap_load(void);
+int pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, vm_prot_t,
+ int);
+
+vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
+
+void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, int32_t *);
+void pmap_tlb_shootnow(int32_t);
+void pmap_do_tlb_shootdown(struct cpu_info *);
+
+#define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */
+
+/*
+ * Do idle page zero'ing uncached to avoid polluting the cache.
+ */
+boolean_t pmap_pageidlezero(paddr_t);
+#define PMAP_PAGEIDLEZERO(pa) pmap_pageidlezero((pa))
+
+/*
+ * inline functions
+ */
+
+/*ARGSUSED*/
+static __inline void
+pmap_remove_all(struct pmap *pmap)
+{
+ /* Nothing. */
+}
+
+/*
+ * pmap_update_pg: flush one page from the TLB (or flush the whole thing
+ * if hardware doesn't support one-page flushing)
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_update_pg(vaddr_t va)
+{
+#if defined(I386_CPU)
+ if (cpu_class == CPUCLASS_386)
+ tlbflush();
+ else
+#endif
+ invlpg((u_int) va);
+}
+
+/*
+ * pmap_update_2pg: flush two pages from the TLB
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_update_2pg(vaddr_t va, vaddr_t vb)
+{
+#if defined(I386_CPU)
+ if (cpu_class == CPUCLASS_386)
+ tlbflush();
+ else
+#endif
+ {
+ invlpg((u_int) va);
+ invlpg((u_int) vb);
+ }
+}
+
+/*
+ * pmap_page_protect: change the protection of all recorded mappings
+ * of a managed page
+ *
+ * => this function is a frontend for pmap_page_remove/pmap_clear_attrs
+ * => we only have to worry about making the page more protected.
+ * unprotecting a page is done on-demand at fault time.
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
+{
+ if ((prot & VM_PROT_WRITE) == 0) {
+ if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
+ (void) pmap_clear_attrs(pg, PG_RW);
+ } else {
+ pmap_page_remove(pg);
+ }
+ }
+}
+
+/*
+ * pmap_protect: change the protection of pages in a pmap
+ *
+ * => this function is a frontend for pmap_remove/pmap_write_protect
+ * => we only have to worry about making the page more protected.
+ * unprotecting a page is done on-demand at fault time.
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
+{
+ if ((prot & VM_PROT_WRITE) == 0) {
+ if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
+ pmap_write_protect(pmap, sva, eva, prot);
+ } else {
+ pmap_remove(pmap, sva, eva);
+ }
+ }
+}
+
+/*
+ * various address inlines
+ *
+ * vtopte: return a pointer to the PTE mapping a VA, works only for
+ * user and PT addresses
+ *
+ * kvtopte: return a pointer to the PTE mapping a kernel VA
+ */
+
+#include <lib/libkern/libkern.h>
+
+static __inline pt_entry_t * __attribute__((__unused__))
+vtopte(vaddr_t va)
+{
+
+ KASSERT(va < (PDSLOT_KERN << PDSHIFT));
+
+ return (PTE_BASE + x86_btop(va));
+}
+
+static __inline pt_entry_t * __attribute__((__unused__))
+kvtopte(vaddr_t va)
+{
+
+ KASSERT(va >= (PDSLOT_KERN << PDSHIFT));
+
+#ifdef LARGEPAGES
+ {
+ pd_entry_t *pde;
+
+ pde = PDP_BASE + pdei(va);
+ if (*pde & PG_PS)
+ return ((pt_entry_t *)pde);
+ }
+#endif
+
+ return (PTE_BASE + x86_btop(va));
+}
+
+/*
+ * vtomach: virtual address to machine address. For use by
+ * machine-dependent code only.
+ */
+
+static inline paddr_t __attribute__((__unused__))
+vtomach(vaddr_t va)
+{
+ pt_entry_t pte;
+
+ pte = PTE_GET(&PTE_BASE[x86_btop(va)]);
+ return xpmap_ptom((pte & PG_FRAME) | (va & ~PG_FRAME));
+}
+
+#define pmap_cpu_has_pg_n() (cpu_class != CPUCLASS_386)
+#define pmap_cpu_has_invlpg() (cpu_class != CPUCLASS_386)
+
+paddr_t vtophys(vaddr_t);
+vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t);
+
+void pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t);
+
+#if defined(USER_LDT)
+void pmap_ldt_cleanup(struct lwp *);
+#define PMAP_FORK
+#endif /* USER_LDT */
+
+/*
+ * Hooks for the pool allocator.
+ */
+#define POOL_VTOPHYS(va) vtophys((vaddr_t) (va))
+
+#endif /* _KERNEL */
+#endif /* _I386_PMAP_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h
new file mode 100644
index 0000000000..48bff484b9
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h
@@ -0,0 +1,247 @@
+/* $NetBSD: xen.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $ */
+
+/*
+ *
+ * Copyright (c) 2003, 2004 Keir Fraser (on behalf of the Xen team)
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef _XEN_H
+#define _XEN_H
+
+#ifndef _LOCORE
+
+struct xen_netinfo {
+ uint32_t xi_ifno;
+ char *xi_root;
+ uint32_t xi_ip[5];
+};
+
+union xen_cmdline_parseinfo {
+ char xcp_bootdev[16]; /* sizeof(dv_xname) */
+ struct xen_netinfo xcp_netinfo;
+ char xcp_console[16];
+};
+
+#define XEN_PARSE_BOOTDEV 0
+#define XEN_PARSE_NETINFO 1
+#define XEN_PARSE_CONSOLE 2
+
+void xen_parse_cmdline(int, union xen_cmdline_parseinfo *);
+
+void xenconscn_attach(void);
+
+void xenmachmem_init(void);
+void xenprivcmd_init(void);
+void xenvfr_init(void);
+
+#ifdef XENDEBUG
+void printk(const char *, ...);
+void vprintk(const char *, va_list);
+#endif
+
+#endif
+
+#endif /* _XEN_H */
+
+/******************************************************************************
+ * os.h
+ *
+ * random collection of macros and definitions
+ */
+
+#ifndef _OS_H_
+#define _OS_H_
+
+/*
+ * These are the segment descriptors provided for us by the hypervisor.
+ * For now, these are hardwired -- guest OSes cannot update the GDT
+ * or LDT.
+ *
+ * It shouldn't be hard to support descriptor-table frobbing -- let me
+ * know if the BSD or XP ports require flexibility here.
+ */
+
+
+/*
+ * these are also defined in hypervisor-if.h but can't be pulled in as
+ * they are used in start of day assembly. Need to clean up the .h files
+ * a bit more...
+ */
+
+#ifndef FLAT_RING1_CS
+#define FLAT_RING1_CS 0x0819
+#define FLAT_RING1_DS 0x0821
+#define FLAT_RING3_CS 0x082b
+#define FLAT_RING3_DS 0x0833
+#endif
+
+#define __KERNEL_CS FLAT_RING1_CS
+#define __KERNEL_DS FLAT_RING1_DS
+
+/* Everything below this point is not included by assembler (.S) files. */
+#ifndef _LOCORE
+
+/* some function prototypes */
+void trap_init(void);
+
+
+/*
+ * STI/CLI equivalents. These basically set and clear the virtual
+ * event_enable flag in the shared_info structure. Note that when
+ * the enable bit is set, there may be pending events to be handled.
+ * We may therefore call into do_hypervisor_callback() directly.
+ */
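+
+/*
+ * A typical (purely illustrative) critical section using the macros below:
+ *
+ *	unsigned long flags;
+ *
+ *	save_and_cli(flags);
+ *	... manipulate state shared with the event callback ...
+ *	restore_flags(flags);
+ */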
+
+#define __save_flags(x) \
+do { \
+ (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask; \
+} while (0)
+
+#define __restore_flags(x) \
+do { \
+ shared_info_t *_shared = HYPERVISOR_shared_info; \
+ __insn_barrier(); \
+ if ((_shared->vcpu_data[0].evtchn_upcall_mask = (x)) == 0) { \
+ __insn_barrier(); \
+ if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \
+ hypervisor_force_callback(); \
+ } \
+} while (0)
+
+#define __cli() \
+do { \
+ HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1; \
+ __insn_barrier(); \
+} while (0)
+
+#define __sti() \
+do { \
+ shared_info_t *_shared = HYPERVISOR_shared_info; \
+ __insn_barrier(); \
+ _shared->vcpu_data[0].evtchn_upcall_mask = 0; \
+ __insn_barrier(); /* unmask then check (avoid races) */ \
+ if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \
+ hypervisor_force_callback(); \
+} while (0)
+
+#define cli() __cli()
+#define sti() __sti()
+#define save_flags(x) __save_flags(x)
+#define restore_flags(x) __restore_flags(x)
+#define save_and_cli(x) do { \
+ __save_flags(x); \
+ __cli(); \
+} while (/* CONSTCOND */ 0)
+#define save_and_sti(x) __save_and_sti(x)
+
+#ifdef MULTIPROCESSOR
+#define __LOCK_PREFIX "lock; "
+#else
+#define __LOCK_PREFIX ""
+#endif
+
+static __inline__ uint32_t
+x86_atomic_xchg(uint32_t *ptr, unsigned long val)
+{
+ unsigned long result;
+
+ __asm __volatile("xchgl %0,%1"
+ :"=r" (result)
+ :"m" (*ptr), "0" (val)
+ :"memory");
+
+ return result;
+}
+
+static __inline__ int
+x86_atomic_test_and_clear_bit(volatile void *ptr, int bitno)
+{
+ int result;
+
+ __asm __volatile(__LOCK_PREFIX
+ "btrl %2,%1 ;"
+ "sbbl %0,%0"
+ :"=r" (result), "=m" (*(volatile uint32_t *)(ptr))
+ :"Ir" (bitno) : "memory");
+ return result;
+}
+
+static __inline__ int
+x86_atomic_test_and_set_bit(volatile void *ptr, int bitno)
+{
+ int result;
+
+ __asm __volatile(__LOCK_PREFIX
+ "btsl %2,%1 ;"
+ "sbbl %0,%0"
+ :"=r" (result), "=m" (*(volatile uint32_t *)(ptr))
+ :"Ir" (bitno) : "memory");
+ return result;
+}
+
+static __inline int
+x86_constant_test_bit(const volatile void *ptr, int bitno)
+{
+ return ((1UL << (bitno & 31)) &
+ (((const volatile uint32_t *) ptr)[bitno >> 5])) != 0;
+}
+
+static __inline int
+x86_variable_test_bit(const volatile void *ptr, int bitno)
+{
+ int result;
+
+ __asm __volatile(
+ "btl %2,%1 ;"
+ "sbbl %0,%0"
+ :"=r" (result)
+ :"m" (*(volatile uint32_t *)(ptr)), "Ir" (bitno));
+ return result;
+}
+
+#define x86_atomic_test_bit(ptr, bitno) \
+ (__builtin_constant_p(bitno) ? \
+ x86_constant_test_bit((ptr),(bitno)) : \
+ x86_variable_test_bit((ptr),(bitno)))
+
+static __inline void
+x86_atomic_set_bit(volatile void *ptr, int bitno)
+{
+ __asm __volatile(__LOCK_PREFIX
+ "btsl %1,%0"
+ :"=m" (*(volatile uint32_t *)(ptr))
+ :"Ir" (bitno));
+}
+
+static __inline void
+x86_atomic_clear_bit(volatile void *ptr, int bitno)
+{
+ __asm __volatile(__LOCK_PREFIX
+ "btrl %1,%0"
+ :"=m" (*(volatile uint32_t *)(ptr))
+ :"Ir" (bitno));
+}
+
+#endif /* !_LOCORE */
+
+#endif /* _OS_H_ */
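
The __cli()/__sti() macros above replace the real CLI/STI instructions with writes to the per-VCPU event mask, and __sti() must re-check the pending flag after unmasking, or an event that arrived while masked would never be delivered. The following is a minimal, self-contained mock of that interplay; everything prefixed mock_ is an illustrative stand-in invented for this sketch, not part of the patch or of the Xen interface.

	#include <stdio.h>

	struct mock_vcpu {
		unsigned char evtchn_upcall_mask;	/* nonzero: event delivery blocked */
		unsigned char evtchn_upcall_pending;	/* an event arrived while blocked  */
	};

	static struct mock_vcpu vcpu0;

	static void
	mock_force_callback(void)
	{
		/* stands in for hypervisor_force_callback() */
		printf("delivering pending event\n");
		vcpu0.evtchn_upcall_pending = 0;
	}

	static void
	mock_cli(void)
	{
		vcpu0.evtchn_upcall_mask = 1;		/* block events, like __cli() */
	}

	static void
	mock_sti(void)
	{
		vcpu0.evtchn_upcall_mask = 0;		/* unmask first ...                   */
		if (vcpu0.evtchn_upcall_pending)	/* ... then re-check, as __sti() does */
			mock_force_callback();
	}

	int
	main(void)
	{
		mock_cli();
		vcpu0.evtchn_upcall_pending = 1;	/* event arrives while masked */
		mock_sti();				/* must notice and deliver it */
		return 0;
	}

The unmask-then-check ordering is the point of the __insn_barrier() calls in the real __sti() and __restore_flags().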
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h
new file mode 100644
index 0000000000..2df026a922
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h
@@ -0,0 +1,135 @@
+/* $NetBSD: xenfunc.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_XENFUNC_H_
+#define _XEN_XENFUNC_H_
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+#include <machine/xenpmap.h>
+#include <machine/pte.h>
+
+#ifdef XENDEBUG_LOW
+#define __PRINTK(x) printk x
+#else
+#define __PRINTK(x)
+#endif
+
+void xen_set_ldt(vaddr_t, uint32_t);
+void xen_update_descriptor(union descriptor *, union descriptor *);
+
+static __inline void
+invlpg(u_int addr)
+{
+ xpq_queue_invlpg(addr);
+ xpq_flush_queue();
+}
+
+static __inline void
+lldt(u_short sel)
+{
+
+ /* __PRINTK(("ldt %x\n", IDXSELN(sel))); */
+ if (sel == GSEL(GLDT_SEL, SEL_KPL))
+ xen_set_ldt((vaddr_t)ldt, NLDT);
+ else
+ xen_set_ldt(cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_base,
+ cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_entries);
+}
+
+static __inline void
+ltr(u_short sel)
+{
+ __PRINTK(("XXX ltr not supported\n"));
+}
+
+static __inline void
+lcr0(u_int val)
+{
+ __PRINTK(("XXX lcr0 not supported\n"));
+}
+
+static __inline u_int
+rcr0(void)
+{
+ __PRINTK(("XXX rcr0 not supported\n"));
+ return 0;
+}
+
+#define lcr3(_v) _lcr3((_v), __FILE__, __LINE__)
+static __inline void
+_lcr3(u_int val, char *file, int line)
+{
+/* __PRINTK(("lcr3 %08x at %s:%d\n", val, file, line)); */
+ xpq_queue_pt_switch(xpmap_ptom(val) & PG_FRAME);
+ xpq_flush_queue();
+}
+
+static __inline void
+tlbflush(void)
+{
+ xpq_queue_tlb_flush();
+ xpq_flush_queue();
+}
+
+static __inline u_int
+rdr6(void)
+{
+ u_int val;
+
+ val = HYPERVISOR_get_debugreg(6);
+ return val;
+}
+
+static __inline void
+ldr6(u_int val)
+{
+
+ HYPERVISOR_set_debugreg(6, val);
+}
+
+static __inline void
+disable_intr(void)
+{
+ __cli();
+}
+
+static __inline void
+enable_intr(void)
+{
+ __sti();
+}
+
+#endif /* _XEN_XENFUNC_H_ */
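
The xenfunc.h wrappers above replace privileged instructions (invlpg, lcr3, full TLB flushes) with requests queued through xpq_queue_*() and then pushed to the hypervisor by xpq_flush_queue(). Below is a compilable sketch of that queue-then-flush pattern; the mock_ identifiers are illustrative stand-ins for the real xpq machinery, and the fixed-size array is an assumption of this sketch only.

	#include <stdio.h>

	#define MOCK_XPQ_MAX 4

	struct mock_xpq_op {
		const char *what;
		unsigned long arg;
	};

	static struct mock_xpq_op mock_queue[MOCK_XPQ_MAX];
	static int mock_queue_len;

	static void
	mock_xpq_flush_queue(void)
	{
		int i;

		/* Stands in for the single hypercall that submits the whole batch. */
		for (i = 0; i < mock_queue_len; i++)
			printf("hypercall: %s(%#lx)\n",
			    mock_queue[i].what, mock_queue[i].arg);
		mock_queue_len = 0;
	}

	static void
	mock_xpq_queue(const char *what, unsigned long arg)
	{
		if (mock_queue_len == MOCK_XPQ_MAX)	/* full: flush before queueing more */
			mock_xpq_flush_queue();
		mock_queue[mock_queue_len].what = what;
		mock_queue[mock_queue_len].arg = arg;
		mock_queue_len++;
	}

	static void
	mock_tlbflush(void)
	{
		/* Mirrors tlbflush() above: queue the request, then flush at once. */
		mock_xpq_queue("tlb_flush", 0);
		mock_xpq_flush_queue();
	}

	int
	main(void)
	{
		mock_xpq_queue("invlpg", 0xc0100000UL);
		mock_tlbflush();
		return 0;
	}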
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h
new file mode 100644
index 0000000000..f3c8c7f2d8
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h
@@ -0,0 +1,193 @@
+/* $NetBSD: xenpmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_XENPMAP_H_
+#define _XEN_XENPMAP_H_
+
+#define INVALID_P2M_ENTRY (~0UL)
+
+void xpq_queue_invlpg(vaddr_t);
+void xpq_queue_pde_update(pd_entry_t *, pd_entry_t);
+void xpq_queue_pte_update(pt_entry_t *, pt_entry_t);
+void xpq_queue_unchecked_pte_update(pt_entry_t *, pt_entry_t);
+void xpq_queue_pt_switch(paddr_t);
+void xpq_flush_queue(void);
+void xpq_queue_set_ldt(vaddr_t, uint32_t);
+void xpq_queue_tlb_flush(void);
+void xpq_queue_pin_table(paddr_t, int);
+void xpq_queue_unpin_table(paddr_t);
+
+extern paddr_t *xpmap_phys_to_machine_mapping;
+
+#define XPQ_PIN_L1_TABLE 1
+#define XPQ_PIN_L2_TABLE 2
+
+#ifndef XEN
+#define PDE_GET(_pdp) \
+ *(_pdp)
+#define PDE_SET(_pdp,_mapdp,_npde) \
+ *(_mapdp) = (_npde)
+#define PDE_CLEAR(_pdp,_mapdp) \
+ *(_mapdp) = 0
+#define PTE_SET(_ptp,_maptp,_npte) \
+ *(_maptp) = (_npte)
+#define PTE_CLEAR(_ptp,_maptp) \
+ *(_maptp) = 0
+#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte) \
+ (_opte) = x86_atomic_testset_ul((_maptp), (_npte))
+#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte) \
+ (_opte) = x86_atomic_testset_ul((_maptp), 0)
+#define PDE_CLEARBITS(_pdp,_mapdp,_bits) \
+ *(_mapdp) &= ~(_bits)
+#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits) \
+ x86_atomic_clearbits_l((_maptp), (_bits))
+#define PTE_SETBITS(_ptp,_maptp,_bits) \
+ *(_maptp) |= (_bits)
+#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits) \
+ x86_atomic_setbits_l((_maptp), (_bits))
+#else
+paddr_t *xpmap_phys_to_machine_mapping;
+
+#define PDE_GET(_pdp) \
+ (pmap_valid_entry(*(_pdp)) ? xpmap_mtop(*(_pdp)) : *(_pdp))
+#define PDE_SET(_pdp,_mapdp,_npde) do { \
+ xpq_queue_pde_update((_mapdp), xpmap_ptom((_npde))); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PDE_CLEAR(_pdp,_mapdp) do { \
+ xpq_queue_pde_update((_mapdp), 0); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_GET(_ptp) \
+ (pmap_valid_entry(*(_ptp)) ? xpmap_mtop(*(_ptp)) : *(_ptp))
+#define PTE_GET_MA(_ptp) \
+ *(_ptp)
+#define PTE_SET(_ptp,_maptp,_npte) do { \
+ xpq_queue_pte_update((_maptp), xpmap_ptom((_npte))); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_SET_MA(_ptp,_maptp,_npte) do { \
+ xpq_queue_pte_update((_maptp), (_npte)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_SET_MA_UNCHECKED(_ptp,_maptp,_npte) do { \
+ xpq_queue_unchecked_pte_update((_maptp), (_npte)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_CLEAR(_ptp,_maptp) do { \
+ xpq_queue_pte_update((_maptp), 0); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte) do { \
+ (_opte) = PTE_GET(_ptp); \
+ xpq_queue_pte_update((_maptp), xpmap_ptom((_npte))); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_SET_MA(_ptp,_maptp,_npte,_opte) do { \
+ (_opte) = *(_ptp); \
+ xpq_queue_pte_update((_maptp), (_npte)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte) do { \
+ (_opte) = PTE_GET(_ptp); \
+ xpq_queue_pte_update((_maptp), 0); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_CLEAR_MA(_ptp,_maptp,_opte) do { \
+ (_opte) = *(_ptp); \
+ xpq_queue_pte_update((_maptp), 0); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PDE_CLEARBITS(_pdp,_mapdp,_bits) do { \
+ xpq_queue_pte_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_CLEARBITS(_ptp,_maptp,_bits) do { \
+ xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PDE_ATOMIC_CLEARBITS(_pdp,_mapdp,_bits) do { \
+ xpq_queue_pde_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits) do { \
+ xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_SETBITS(_ptp,_maptp,_bits) do { \
+ xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PDE_ATOMIC_SETBITS(_pdp,_mapdp,_bits) do { \
+ xpq_queue_pde_update((_mapdp), *(_pdp) | ((_bits) & ~PG_FRAME)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits) do { \
+ xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PDE_COPY(_dpdp,_madpdp,_spdp) do { \
+ xpq_queue_pde_update((_madpdp), *(_spdp)); \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+#define PTE_UPDATES_FLUSH() do { \
+ xpq_flush_queue(); \
+} while (/*CONSTCOND*/0)
+
+#endif
+
+#define XPMAP_OFFSET (KERNTEXTOFF - KERNBASE_LOCORE)
+static __inline paddr_t
+xpmap_mtop(paddr_t mpa)
+{
+ return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) +
+ XPMAP_OFFSET) | (mpa & ~PG_FRAME);
+}
+
+static __inline paddr_t
+xpmap_ptom(paddr_t ppa)
+{
+ return (xpmap_phys_to_machine_mapping[(ppa -
+ XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT)
+ | (ppa & ~PG_FRAME);
+}
+
+static __inline paddr_t
+xpmap_ptom_masked(paddr_t ppa)
+{
+ return (xpmap_phys_to_machine_mapping[(ppa -
+ XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT);
+}
+
+#endif /* _XEN_XENPMAP_H_ */
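
xpmap_ptom()/xpmap_mtop() above translate between pseudo-physical and machine addresses by swapping the frame number through the p2m (and global machine-to-phys) tables while preserving the offset within the page. Here is a self-contained mock of that arithmetic using a three-page toy domain; the mock_ names, table sizes, and the MOCK_OFFSET value are assumptions of this sketch, not values from the patch.

	#include <assert.h>
	#include <stdint.h>

	#define MOCK_PAGE_SHIFT	12
	#define MOCK_PAGE_MASK	0xfffU		/* ~PG_FRAME: offset-in-page bits */
	#define MOCK_OFFSET	0U		/* stands in for XPMAP_OFFSET */

	/* Toy domain: pseudo-physical frame i lives in machine frame p2m[i]. */
	static uint32_t p2m[3] = { 7, 2, 5 };	/* phys frame -> machine frame */
	static uint32_t m2p[8];			/* machine frame -> phys frame */

	static uint32_t
	mock_ptom(uint32_t ppa)
	{
		return (p2m[(ppa - MOCK_OFFSET) >> MOCK_PAGE_SHIFT] << MOCK_PAGE_SHIFT)
		    | (ppa & MOCK_PAGE_MASK);
	}

	static uint32_t
	mock_mtop(uint32_t mpa)
	{
		return ((m2p[mpa >> MOCK_PAGE_SHIFT] << MOCK_PAGE_SHIFT) + MOCK_OFFSET)
		    | (mpa & MOCK_PAGE_MASK);
	}

	int
	main(void)
	{
		unsigned i;

		for (i = 0; i < 3; i++)		/* build the inverse table */
			m2p[p2m[i]] = i;

		/* Page offsets survive the round trip; only the frame number changes. */
		assert(mock_mtop(mock_ptom(0x1abc)) == 0x1abc);
		assert(mock_ptom(0x2010) == ((5u << MOCK_PAGE_SHIFT) | 0x010));
		return 0;
	}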
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c b/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c
new file mode 100644
index 0000000000..dda715fa54
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c
@@ -0,0 +1,505 @@
+/* $NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $ */
+/* NetBSD: bus_space.c,v 1.2 2003/03/14 18:47:53 christos Exp */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
+ * Simulation Facility, NASA Ames Research Center.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $");
+
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/extent.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/bus.h>
+
+#include <dev/isa/isareg.h>
+#include <machine/isa_machdep.h>
+
+#include <machine/hypervisor.h>
+#include <machine/xenpmap.h>
+
+/*
+ * Extent maps to manage I/O and memory space. Allocate
+ * storage for 8 regions in each, initially. Later, ioport_malloc_safe
+ * will indicate that it's safe to use malloc() to dynamically allocate
+ * region descriptors.
+ *
+ * N.B. At least two regions are _always_ allocated from the iomem
+ * extent map; (0 -> ISA hole) and (end of ISA hole -> end of RAM).
+ *
+ * The extent maps are not static! Machine-dependent ISA and EISA
+ * routines need access to them for bus address space allocation.
+ */
+static long ioport_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)];
+static long iomem_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)];
+struct extent *ioport_ex;
+struct extent *iomem_ex;
+static int ioport_malloc_safe;
+
+int x86_mem_add_mapping __P((bus_addr_t, bus_size_t,
+ int, bus_space_handle_t *));
+
+void
+x86_bus_space_init()
+{
+ /*
+ * Initialize the I/O port and I/O mem extent maps.
+ * Note: we don't have to check the return value since
+ * creation of a fixed extent map will never fail (since
+ * descriptor storage has already been allocated).
+ *
+ * N.B. The iomem extent manages _all_ physical addresses
+ * on the machine. When the amount of RAM is found, the two
+ * extents of RAM are allocated from the map (0 -> ISA hole
+ * and end of ISA hole -> end of RAM).
+ */
+ ioport_ex = extent_create("ioport", 0x0, 0xffff, M_DEVBUF,
+ (caddr_t)ioport_ex_storage, sizeof(ioport_ex_storage),
+ EX_NOCOALESCE|EX_NOWAIT);
+ iomem_ex = extent_create("iomem", 0x0, 0xffffffff, M_DEVBUF,
+ (caddr_t)iomem_ex_storage, sizeof(iomem_ex_storage),
+ EX_NOCOALESCE|EX_NOWAIT);
+
+	/* We are a privileged guest OS - we should have I/O privileges. */
+ if (xen_start_info.flags & SIF_PRIVILEGED) {
+ dom0_op_t op;
+ op.cmd = DOM0_IOPL;
+ op.u.iopl.domain = DOMID_SELF;
+ op.u.iopl.iopl = 1;
+ if (HYPERVISOR_dom0_op(&op) != 0)
+ panic("Unable to obtain IOPL, "
+ "despite being SIF_PRIVILEGED");
+ }
+}
+
+void
+x86_bus_space_mallocok()
+{
+
+ ioport_malloc_safe = 1;
+}
+
+int
+x86_memio_map(t, bpa, size, flags, bshp)
+ bus_space_tag_t t;
+ bus_addr_t bpa;
+ bus_size_t size;
+ int flags;
+ bus_space_handle_t *bshp;
+{
+ int error;
+ struct extent *ex;
+
+ /*
+ * Pick the appropriate extent map.
+ */
+ if (t == X86_BUS_SPACE_IO) {
+ if (flags & BUS_SPACE_MAP_LINEAR)
+ return (EOPNOTSUPP);
+ ex = ioport_ex;
+ } else if (t == X86_BUS_SPACE_MEM)
+ ex = iomem_ex;
+ else
+ panic("x86_memio_map: bad bus space tag");
+
+ /*
+ * Before we go any further, let's make sure that this
+ * region is available.
+ */
+ error = extent_alloc_region(ex, bpa, size,
+ EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0));
+ if (error)
+ return (error);
+
+ /*
+ * For I/O space, that's all she wrote.
+ */
+ if (t == X86_BUS_SPACE_IO) {
+ *bshp = bpa;
+ return (0);
+ }
+
+ /*
+ * For memory space, map the bus physical address to
+ * a kernel virtual address.
+ */
+ error = x86_mem_add_mapping(bpa, size,
+ (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp);
+ if (error) {
+ if (extent_free(ex, bpa, size, EX_NOWAIT |
+ (ioport_malloc_safe ? EX_MALLOCOK : 0))) {
+ printf("x86_memio_map: pa 0x%lx, size 0x%lx\n",
+ bpa, size);
+ printf("x86_memio_map: can't free region\n");
+ }
+ }
+
+ return (error);
+}
+
+int
+_x86_memio_map(t, bpa, size, flags, bshp)
+ bus_space_tag_t t;
+ bus_addr_t bpa;
+ bus_size_t size;
+ int flags;
+ bus_space_handle_t *bshp;
+{
+
+ /*
+ * For I/O space, just fill in the handle.
+ */
+ if (t == X86_BUS_SPACE_IO) {
+ if (flags & BUS_SPACE_MAP_LINEAR)
+ return (EOPNOTSUPP);
+ *bshp = bpa;
+ return (0);
+ }
+
+ /*
+ * For memory space, map the bus physical address to
+ * a kernel virtual address.
+ */
+ return (x86_mem_add_mapping(bpa, size,
+ (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp));
+}
+
+int
+x86_memio_alloc(t, rstart, rend, size, alignment, boundary, flags,
+ bpap, bshp)
+ bus_space_tag_t t;
+ bus_addr_t rstart, rend;
+ bus_size_t size, alignment, boundary;
+ int flags;
+ bus_addr_t *bpap;
+ bus_space_handle_t *bshp;
+{
+ struct extent *ex;
+ u_long bpa;
+ int error;
+
+ /*
+ * Pick the appropriate extent map.
+ */
+ if (t == X86_BUS_SPACE_IO) {
+ if (flags & BUS_SPACE_MAP_LINEAR)
+ return (EOPNOTSUPP);
+ ex = ioport_ex;
+ } else if (t == X86_BUS_SPACE_MEM)
+ ex = iomem_ex;
+ else
+ panic("x86_memio_alloc: bad bus space tag");
+
+ /*
+ * Sanity check the allocation against the extent's boundaries.
+ */
+ if (rstart < ex->ex_start || rend > ex->ex_end)
+ panic("x86_memio_alloc: bad region start/end");
+
+ /*
+ * Do the requested allocation.
+ */
+ error = extent_alloc_subregion(ex, rstart, rend, size, alignment,
+ boundary,
+ EX_FAST | EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0),
+ &bpa);
+
+ if (error)
+ return (error);
+
+ /*
+ * For I/O space, that's all she wrote.
+ */
+ if (t == X86_BUS_SPACE_IO) {
+ *bshp = *bpap = bpa;
+ return (0);
+ }
+
+ /*
+ * For memory space, map the bus physical address to
+ * a kernel virtual address.
+ */
+ error = x86_mem_add_mapping(bpa, size,
+ (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp);
+ if (error) {
+ if (extent_free(iomem_ex, bpa, size, EX_NOWAIT |
+ (ioport_malloc_safe ? EX_MALLOCOK : 0))) {
+ printf("x86_memio_alloc: pa 0x%lx, size 0x%lx\n",
+ bpa, size);
+ printf("x86_memio_alloc: can't free region\n");
+ }
+ }
+
+ *bpap = bpa;
+
+ return (error);
+}
+
+int
+x86_mem_add_mapping(bpa, size, cacheable, bshp)
+ bus_addr_t bpa;
+ bus_size_t size;
+ int cacheable;
+ bus_space_handle_t *bshp;
+{
+ u_long pa, endpa;
+ vaddr_t va;
+ pt_entry_t *pte;
+ pt_entry_t *maptp;
+ int32_t cpumask = 0;
+
+ pa = x86_trunc_page(bpa);
+ endpa = x86_round_page(bpa + size);
+
+#ifdef DIAGNOSTIC
+ if (endpa <= pa)
+ panic("x86_mem_add_mapping: overflow");
+#endif
+
+ if (bpa >= IOM_BEGIN && (bpa + size) <= IOM_END) {
+ va = (vaddr_t)ISA_HOLE_VADDR(pa);
+ } else {
+ va = uvm_km_valloc(kernel_map, endpa - pa);
+ if (va == 0)
+ return (ENOMEM);
+ }
+
+ *bshp = (bus_space_handle_t)(va + (bpa & PGOFSET));
+
+ for (; pa < endpa; pa += PAGE_SIZE, va += PAGE_SIZE) {
+ pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
+
+ /*
+ * PG_N doesn't exist on 386's, so we assume that
+ * the mainboard has wired up device space non-cacheable
+ * on those machines.
+ *
+ * Note that it's not necessary to use atomic ops to
+ * fiddle with the PTE here, because we don't care
+ * about mod/ref information.
+ *
+ * XXX should hand this bit to pmap_kenter_pa to
+ * save the extra invalidate!
+ *
+ * XXX extreme paranoia suggests tlb shootdown belongs here.
+ */
+ if (pmap_cpu_has_pg_n()) {
+ pte = kvtopte(va);
+ maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+ if (cacheable)
+ PTE_CLEARBITS(pte, maptp, PG_N);
+ else
+ PTE_SETBITS(pte, maptp, PG_N);
+ pmap_tlb_shootdown(pmap_kernel(), va, *pte,
+ &cpumask);
+ }
+ }
+
+ pmap_tlb_shootnow(cpumask);
+ pmap_update(pmap_kernel());
+
+ return 0;
+}
+
+/*
+ * void _x86_memio_unmap(bus_space_tag bst, bus_space_handle bsh,
+ * bus_size_t size, bus_addr_t *adrp)
+ *
+ * This function unmaps memory- or I/O-space mapped by the function
+ * _x86_memio_map().  It works much like x86_memio_unmap(), but it does
+ * not consult the kernel's built-in extent maps, and it returns the
+ * physical address of the bus space for the convenience of an external
+ * extent manager.
+ */
+void
+_x86_memio_unmap(t, bsh, size, adrp)
+ bus_space_tag_t t;
+ bus_space_handle_t bsh;
+ bus_size_t size;
+ bus_addr_t *adrp;
+{
+ u_long va, endva;
+ bus_addr_t bpa;
+
+ /*
+ * Find the correct extent and bus physical address.
+ */
+ if (t == X86_BUS_SPACE_IO) {
+ bpa = bsh;
+ } else if (t == X86_BUS_SPACE_MEM) {
+ if (bsh >= atdevbase && (bsh + size) <= (atdevbase + IOM_SIZE)) {
+ bpa = (bus_addr_t)ISA_PHYSADDR(bsh);
+ } else {
+
+ va = x86_trunc_page(bsh);
+ endva = x86_round_page(bsh + size);
+
+#ifdef DIAGNOSTIC
+ if (endva <= va) {
+ panic("_x86_memio_unmap: overflow");
+ }
+#endif
+
+#if __NetBSD_Version__ > 104050000
+ if (pmap_extract(pmap_kernel(), va, &bpa) == FALSE) {
+ panic("_x86_memio_unmap:"
+ " wrong virtual address");
+ }
+ bpa += (bsh & PGOFSET);
+#else
+ bpa = pmap_extract(pmap_kernel(), va) + (bsh & PGOFSET);
+#endif
+
+ pmap_kremove(va, endva - va);
+ /*
+ * Free the kernel virtual mapping.
+ */
+ uvm_km_free(kernel_map, va, endva - va);
+ }
+ } else {
+ panic("_x86_memio_unmap: bad bus space tag");
+ }
+
+ if (adrp != NULL) {
+ *adrp = bpa;
+ }
+}
+
+void
+x86_memio_unmap(t, bsh, size)
+ bus_space_tag_t t;
+ bus_space_handle_t bsh;
+ bus_size_t size;
+{
+ struct extent *ex;
+ u_long va, endva;
+ bus_addr_t bpa;
+
+ /*
+ * Find the correct extent and bus physical address.
+ */
+ if (t == X86_BUS_SPACE_IO) {
+ ex = ioport_ex;
+ bpa = bsh;
+ } else if (t == X86_BUS_SPACE_MEM) {
+ ex = iomem_ex;
+
+ if (bsh >= atdevbase &&
+ (bsh + size) <= (atdevbase + IOM_SIZE)) {
+ bpa = (bus_addr_t)ISA_PHYSADDR(bsh);
+ goto ok;
+ }
+
+ va = x86_trunc_page(bsh);
+ endva = x86_round_page(bsh + size);
+
+#ifdef DIAGNOSTIC
+ if (endva <= va)
+ panic("x86_memio_unmap: overflow");
+#endif
+
+ (void) pmap_extract(pmap_kernel(), va, &bpa);
+ bpa += (bsh & PGOFSET);
+
+ pmap_kremove(va, endva - va);
+ /*
+ * Free the kernel virtual mapping.
+ */
+ uvm_km_free(kernel_map, va, endva - va);
+ } else
+ panic("x86_memio_unmap: bad bus space tag");
+
+ok:
+ if (extent_free(ex, bpa, size,
+ EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0))) {
+ printf("x86_memio_unmap: %s 0x%lx, size 0x%lx\n",
+ (t == X86_BUS_SPACE_IO) ? "port" : "pa", bpa, size);
+ printf("x86_memio_unmap: can't free region\n");
+ }
+}
+
+void
+x86_memio_free(t, bsh, size)
+ bus_space_tag_t t;
+ bus_space_handle_t bsh;
+ bus_size_t size;
+{
+
+ /* x86_memio_unmap() does all that we need to do. */
+ x86_memio_unmap(t, bsh, size);
+}
+
+int
+x86_memio_subregion(t, bsh, offset, size, nbshp)
+ bus_space_tag_t t;
+ bus_space_handle_t bsh;
+ bus_size_t offset, size;
+ bus_space_handle_t *nbshp;
+{
+
+ *nbshp = bsh + offset;
+ return (0);
+}
+
+paddr_t
+x86_memio_mmap(t, addr, off, prot, flags)
+ bus_space_tag_t t;
+ bus_addr_t addr;
+ off_t off;
+ int prot;
+ int flags;
+{
+
+ /* Can't mmap I/O space. */
+ if (t == X86_BUS_SPACE_IO)
+ return (-1);
+
+ /*
+ * "addr" is the base address of the device we're mapping.
+ * "off" is the offset into that device.
+ *
+ * Note we are called for each "page" in the device that
+ * the upper layers want to map.
+ */
+ return (x86_btop(addr + off));
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c
new file mode 100644
index 0000000000..6783f69363
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c
@@ -0,0 +1,234 @@
+/* $NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_xen.h"
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/device.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+#include <machine/cpu_counter.h>
+
+#include <dev/clock_subr.h>
+
+#include "config_time.h" /* for CONFIG_TIME */
+
+static int xen_timer_handler(void *, struct trapframe *);
+
+/* These are periodically updated in shared_info, and then copied here. */
+static unsigned long shadow_tsc_stamp;
+static u_int64_t shadow_system_time;
+static unsigned long shadow_time_version;
+static struct timeval shadow_tv;
+
+static int timeset;
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area. Must be called at splclock.
+ */
+static void
+get_time_values_from_xen(void)
+{
+ do {
+ shadow_time_version = HYPERVISOR_shared_info->time_version2;
+ __insn_barrier();
+ shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
+ shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec;
+ shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp;
+ shadow_system_time = HYPERVISOR_shared_info->system_time;
+ __insn_barrier();
+ } while (shadow_time_version != HYPERVISOR_shared_info->time_version1);
+}
+
+void
+inittodr(time_t base)
+{
+ int s;
+
+ /*
+	 * If the file system time is more than a year older than the
+	 * kernel, warn and then set the base time to CONFIG_TIME.
+ */
+ if (base && base < (CONFIG_TIME-SECYR)) {
+ printf("WARNING: preposterous time in file system\n");
+ base = CONFIG_TIME;
+ }
+
+ s = splclock();
+ get_time_values_from_xen();
+ splx(s);
+
+ time.tv_usec = shadow_tv.tv_usec;
+ time.tv_sec = shadow_tv.tv_sec + rtc_offset * 60;
+#ifdef DEBUG_CLOCK
+ printf("readclock: %ld (%ld)\n", time.tv_sec, base);
+#endif
+ if (base != 0 && base < time.tv_sec - 5*SECYR)
+ printf("WARNING: file system time much less than clock time\n");
+ else if (base > time.tv_sec + 5*SECYR) {
+ printf("WARNING: clock time much less than file system time\n");
+ printf("WARNING: using file system time\n");
+ goto fstime;
+ }
+
+ timeset = 1;
+ return;
+
+fstime:
+ timeset = 1;
+ time.tv_sec = base;
+ printf("WARNING: CHECK AND RESET THE DATE!\n");
+}
+
+void
+resettodr()
+{
+#ifdef DOM0OPS
+ dom0_op_t op;
+ int s;
+#endif
+#ifdef DEBUG_CLOCK
+ struct clock_ymdhms dt;
+#endif
+
+ /*
+ * We might have been called by boot() due to a crash early
+ * on. Don't reset the clock chip in this case.
+ */
+ if (!timeset)
+ return;
+
+#ifdef DEBUG_CLOCK
+ clock_secs_to_ymdhms(time.tv_sec - rtc_offset * 60, &dt);
+
+ printf("setclock: %d/%d/%d %02d:%02d:%02d\n", dt.dt_year,
+ dt.dt_mon, dt.dt_day, dt.dt_hour, dt.dt_min, dt.dt_sec);
+#endif
+#ifdef DOM0OPS
+ if (xen_start_info.dom_id == 0) {
+ s = splclock();
+
+ op.cmd = DOM0_SETTIME;
+ op.u.settime.secs = time.tv_sec - rtc_offset * 60;
+ op.u.settime.usecs = time.tv_usec;
+ op.u.settime.system_time = shadow_system_time;
+ HYPERVISOR_dom0_op(&op);
+
+ splx(s);
+ }
+#endif
+}
+
+void
+startrtclock()
+{
+
+}
+
+/*
+ * Wait approximately `n' microseconds.
+ */
+void
+xen_delay(int n)
+{
+ u_int64_t when;
+
+ get_time_values_from_xen();
+ when = shadow_system_time + n * 1000;
+ while (shadow_system_time < when)
+ get_time_values_from_xen();
+}
+
+void
+xen_microtime(struct timeval *tv)
+{
+
+ *tv = time;
+}
+
+void
+xen_initclocks()
+{
+ int irq = bind_virq_to_irq(VIRQ_TIMER);
+
+ event_set_handler(irq, (int (*)(void *))xen_timer_handler,
+ NULL, IPL_CLOCK);
+ hypervisor_enable_irq(irq);
+}
+
+static int
+xen_timer_handler(void *arg, struct trapframe *regs)
+{
+#if defined(I586_CPU) || defined(I686_CPU)
+ static int microset_iter; /* call cc_microset once/sec */
+ struct cpu_info *ci = curcpu();
+
+ /*
+ * If we have a cycle counter, do the microset thing.
+ */
+ if (ci->ci_feature_flags & CPUID_TSC) {
+ if (
+#if defined(MULTIPROCESSOR)
+ CPU_IS_PRIMARY(ci) &&
+#endif
+ (microset_iter--) == 0) {
+ microset_iter = hz - 1;
+#if defined(MULTIPROCESSOR)
+ x86_broadcast_ipi(X86_IPI_MICROSET);
+#endif
+ cc_microset_time = time;
+ cc_microset(ci);
+ }
+ }
+#endif
+
+ get_time_values_from_xen();
+
+ hardclock((struct clockframe *)regs);
+
+ return 0;
+}
+
+void
+setstatclockrate(int arg)
+{
+}
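
get_time_values_from_xen() above uses the time_version1/time_version2 pair as a lock-free consistency check: snapshot one counter, copy the shadow values, and retry if the other counter no longer matches. A reduced, compilable mock of that reader loop follows; the mock_ names and the initial values are invented for illustration, and the real shared_info carries more fields than shown here.

	#include <stdio.h>

	/* Stand-in for the shared_info time fields used above. */
	struct mock_shared {
		volatile unsigned long time_version1;	/* bumped before an update */
		volatile unsigned long time_version2;	/* bumped after an update  */
		volatile unsigned long wc_sec, wc_usec;
	};

	static struct mock_shared shared = { 2, 2, 1094496000UL, 0 };

	static void
	mock_get_time(unsigned long *sec, unsigned long *usec)
	{
		unsigned long v;

		do {
			v = shared.time_version2;	/* snapshot the "after" stamp */
			*sec  = shared.wc_sec;		/* copy the shadow values     */
			*usec = shared.wc_usec;
		} while (v != shared.time_version1);	/* retry if an update raced us */
	}

	int
	main(void)
	{
		unsigned long s, u;

		mock_get_time(&s, &u);
		printf("wallclock %lu.%06lu\n", s, u);
		return 0;
	}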
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c
new file mode 100644
index 0000000000..0f5a9fe788
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c
@@ -0,0 +1,226 @@
+/* $NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/device.h>
+
+#include "xencons.h"
+#include "xennet.h"
+#include "xbd.h"
+#include "xenkbc.h"
+#include "vga_xen.h"
+#include "npx.h"
+
+#include "opt_xen.h"
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+
+#ifdef DOM0OPS
+#include <sys/dirent.h>
+#include <sys/stat.h>
+#include <sys/tree.h>
+#include <sys/vnode.h>
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/kernfs/kernfs.h>
+#include <machine/kernfs_machdep.h>
+#endif
+
+#if NXENNET > 0
+#include <net/if.h>
+#include <net/if_ether.h>
+#include <net/if_media.h>
+#include <machine/if_xennetvar.h>
+#endif
+
+#if NXBD > 0
+#include <sys/buf.h>
+#include <sys/disk.h>
+#include <dev/dkvar.h>
+#include <machine/xbdvar.h>
+#endif
+
+#if NXENKBC > 0
+#include <dev/pckbport/pckbportvar.h>
+#include <machine/xenkbcvar.h>
+#endif
+
+#if NVGA_XEN > 0
+#include <machine/bus.h>
+#include <machine/vga_xenvar.h>
+#endif
+
+int hypervisor_match(struct device *, struct cfdata *, void *);
+void hypervisor_attach(struct device *, struct device *, void *);
+
+CFATTACH_DECL(hypervisor, sizeof(struct device),
+ hypervisor_match, hypervisor_attach, NULL, NULL);
+
+int hypervisor_print(void *, const char *);
+
+union hypervisor_attach_cookie {
+ const char *hac_device; /* first elem of all */
+#if NXENKBC > 0
+ struct xenkbc_attach_args hac_xenkbc;
+#endif
+#if NVGA_XEN > 0
+ struct xen_vga_attach_args hac_vga_xen;
+#endif
+#if NXENCONS > 0
+ struct xencons_attach_args hac_xencons;
+#endif
+#if NXENNET > 0
+ struct xennet_attach_args hac_xennet;
+#endif
+#if NXBD > 0
+ struct xbd_attach_args hac_xbd;
+#endif
+#if NNPX > 0
+ struct xen_npx_attach_args hac_xennpx;
+#endif
+};
+
+
+/*
+ * Probe for the hypervisor; always succeeds.
+ */
+int
+hypervisor_match(parent, match, aux)
+ struct device *parent;
+ struct cfdata *match;
+ void *aux;
+{
+ struct hypervisor_attach_args *haa = aux;
+
+ if (strcmp(haa->haa_busname, "hypervisor") == 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Attach the hypervisor.
+ */
+void
+hypervisor_attach(parent, self, aux)
+ struct device *parent, *self;
+ void *aux;
+{
+ union hypervisor_attach_cookie hac;
+
+ printf("\n");
+
+ init_events();
+
+#if NXENKBC > 0
+ hac.hac_xenkbc.xa_device = "xenkbc";
+ config_found(self, &hac.hac_xenkbc, hypervisor_print);
+#endif
+
+#if NVGA_XEN > 0
+ hac.hac_vga_xen.xa_device = "vga_xen";
+ hac.hac_vga_xen.xa_iot = X86_BUS_SPACE_IO;
+ hac.hac_vga_xen.xa_memt = X86_BUS_SPACE_MEM;
+ config_found(self, &hac.hac_vga_xen, hypervisor_print);
+#endif
+
+#if NXENCONS > 0
+ hac.hac_xencons.xa_device = "xencons";
+ config_found(self, &hac.hac_xencons, hypervisor_print);
+#endif
+#if NXENNET > 0
+ hac.hac_xennet.xa_device = "xennet";
+ xennet_scan(self, &hac.hac_xennet, hypervisor_print);
+#endif
+#if NXBD > 0
+ hac.hac_xbd.xa_device = "xbd";
+ xbd_scan(self, &hac.hac_xbd, hypervisor_print);
+#endif
+#if NNPX > 0
+ hac.hac_xennpx.xa_device = "npx";
+ config_found(self, &hac.hac_xennpx, hypervisor_print);
+#endif
+#ifdef DOM0OPS
+ if (xen_start_info.flags & SIF_PRIVILEGED) {
+ xenkernfs_init();
+ xenprivcmd_init();
+ xenmachmem_init();
+ xenvfr_init();
+ }
+#endif
+}
+
+int
+hypervisor_print(aux, parent)
+ void *aux;
+ const char *parent;
+{
+ union hypervisor_attach_cookie *hac = aux;
+
+ if (parent)
+ aprint_normal("%s at %s", hac->hac_device, parent);
+ return (UNCONF);
+}
+
+void
+hypervisor_notify_via_evtchn(unsigned int port)
+{
+ evtchn_op_t op;
+
+ op.cmd = EVTCHNOP_send;
+ op.u.send.local_port = port;
+ (void)HYPERVISOR_event_channel_op(&op);
+}
+
+#ifdef DOM0OPS
+
+#define DIR_MODE (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
+
+kernfs_parentdir_t *kernxen_pkt;
+
+void
+xenkernfs_init()
+{
+ kernfs_entry_t *dkt;
+
+ KERNFS_ALLOCENTRY(dkt, M_TEMP, M_WAITOK);
+ KERNFS_INITENTRY(dkt, DT_DIR, "xen", NULL, KFSsubdir, VDIR, DIR_MODE);
+ kernfs_addentry(NULL, dkt);
+ kernxen_pkt = KERNFS_ENTOPARENTDIR(dkt);
+}
+#endif
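
hypervisor_attach() above hands each child driver one member of union hypervisor_attach_cookie, and hypervisor_print() reads the device name back through hac_device; this works because every member either is, or begins with, that string pointer. A reduced mock of this common-first-member union idiom is sketched below; all mock_ types and names are invented for illustration and do not correspond to real attach-args structures.

	#include <stdio.h>

	struct mock_console_args { const char *name; int unit;   };
	struct mock_net_args     { const char *name; int handle; };

	union mock_cookie {
		const char *device;		/* first element of all members */
		struct mock_console_args cons;
		struct mock_net_args     net;
	};

	static void
	mock_print(union mock_cookie *c, const char *parent)
	{
		/* Reads the name through the common first member. */
		printf("%s at %s\n", c->device, parent);
	}

	int
	main(void)
	{
		union mock_cookie c;

		c.net.name = "xennet";
		c.net.handle = 0;
		mock_print(&c, "hypervisor0");
		return 0;
	}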
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
new file mode 100644
index 0000000000..51219a980f
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
@@ -0,0 +1,1241 @@
+/* $NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $");
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/syslog.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/device.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#if NRND > 0
+#include <sys/rnd.h>
+#endif
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_dl.h>
+#include <net/if_ether.h>
+
+#ifdef mediacode
+#include <net/if_media.h>
+#endif
+
+#ifdef INET
+#include <netinet/in.h>
+#include <netinet/if_inarp.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#endif
+
+#include <nfs/rpcv2.h>
+
+#include <nfs/nfsproto.h>
+#include <nfs/nfs.h>
+#include <nfs/nfsmount.h>
+#include <nfs/nfsdiskless.h>
+
+#include "bpfilter.h"
+#if NBPFILTER > 0
+#include <net/bpf.h>
+#include <net/bpfdesc.h>
+#endif
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_page.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+#include <machine/ctrl_if.h>
+
+#include <machine/if_xennetvar.h>
+
+#ifdef DEBUG
+#define XENNET_DEBUG
+#endif
+#if defined(XENNET_DEBUG) && !defined(DEBUG)
+#define DEBUG
+#endif
+/* #define XENNET_DEBUG_DUMP */
+
+#ifdef XENNET_DEBUG
+#define XEDB_FOLLOW 0x01
+#define XEDB_INIT 0x02
+#define XEDB_EVENT 0x04
+#define XEDB_MBUF 0x08
+#define XEDB_MEM 0x10
+int xennet_debug = 0x0;
+#define DPRINTF(x) if (xennet_debug) printf x;
+#define DPRINTFN(n,x) if (xennet_debug & (n)) printf x;
+#else
+#define DPRINTF(x)
+#define DPRINTFN(n,x)
+#endif
+#define PRINTF(x) printf x;
+
+#ifdef XENNET_DEBUG_DUMP
+static void xennet_hex_dump(unsigned char *, size_t, char *, int);
+#endif
+
+int xennet_match (struct device *, struct cfdata *, void *);
+void xennet_attach (struct device *, struct device *, void *);
+static void xennet_ctrlif_rx(ctrl_msg_t *, unsigned long);
+static void xennet_driver_status_change(netif_fe_driver_status_changed_t *);
+static void xennet_status_change(netif_fe_interface_status_changed_t *);
+static void xennet_tx_mbuf_free(struct mbuf *, caddr_t, size_t, void *);
+static void xennet_rx_mbuf_free(struct mbuf *, caddr_t, size_t, void *);
+static int xen_network_handler(void *);
+static void network_tx_buf_gc(struct xennet_softc *);
+static void network_alloc_rx_buffers(struct xennet_softc *);
+static void network_alloc_tx_buffers(struct xennet_softc *);
+void xennet_init(struct xennet_softc *);
+void xennet_reset(struct xennet_softc *);
+#ifdef mediacode
+static int xennet_mediachange (struct ifnet *);
+static void xennet_mediastatus(struct ifnet *, struct ifmediareq *);
+#endif
+
+CFATTACH_DECL(xennet, sizeof(struct xennet_softc),
+ xennet_match, xennet_attach, NULL, NULL);
+
+#define TX_MAX_ENTRIES (NETIF_TX_RING_SIZE - 2)
+#define RX_MAX_ENTRIES (NETIF_RX_RING_SIZE - 2)
+#define TX_ENTRIES 128
+#define RX_ENTRIES 128
+
+static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
+static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
+static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
+
+/** Network interface info. */
+struct xennet_ctrl {
+ /** Number of interfaces. */
+ int xc_interfaces;
+ /** Number of connected interfaces. */
+ int xc_connected;
+ /** Error code. */
+ int xc_err;
+
+ cfprint_t xc_cfprint;
+ struct device *xc_parent;
+};
+
+static struct xennet_ctrl netctrl = { -1, 0, 0 };
+
+#ifdef mediacode
+static int xennet_media[] = {
+ IFM_ETHER|IFM_AUTO,
+};
+static int nxennet_media = (sizeof(xennet_media)/sizeof(xennet_media[0]));
+#endif
+
+
+int
+xennet_scan(struct device *self, struct xennet_attach_args *xneta,
+ cfprint_t print)
+{
+ ctrl_msg_t cmsg;
+ netif_fe_driver_status_changed_t st;
+ int err = 0;
+
+ if ((xen_start_info.flags & SIF_INITDOMAIN) ||
+ (xen_start_info.flags & SIF_NET_BE_DOMAIN))
+ return 0;
+
+ netctrl.xc_parent = self;
+ netctrl.xc_cfprint = print;
+
+ printf("Initialising Xen virtual ethernet frontend driver.\n");
+
+ (void)ctrl_if_register_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
+
+ /* Send a driver-UP notification to the domain controller. */
+ cmsg.type = CMSG_NETIF_FE;
+ cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
+ cmsg.length = sizeof(netif_fe_driver_status_changed_t);
+ st.status = NETIF_DRIVER_STATUS_UP;
+ st.max_handle = 0;
+ memcpy(cmsg.msg, &st, sizeof(st));
+ ctrl_if_send_message_block(&cmsg, NULL, 0, 0);
+
+#if 0
+ err = xennet_wait_for_interfaces();
+ if (err)
+ ctrl_if_unregister_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx);
+#endif
+
+ return err;
+}
+
+int
+xennet_match(struct device *parent, struct cfdata *match, void *aux)
+{
+ struct xennet_attach_args *xa = (struct xennet_attach_args *)aux;
+
+ if (strcmp(xa->xa_device, "xennet") == 0)
+ return 1;
+ return 0;
+}
+
+void
+xennet_attach(struct device *parent, struct device *self, void *aux)
+{
+ struct xennet_attach_args *xneta = (struct xennet_attach_args *)aux;
+ struct xennet_softc *sc = (struct xennet_softc *)self;
+ struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+ int idx;
+
+ aprint_normal(": Xen Virtual Network Interface\n");
+
+ sc->sc_ifno = xneta->xa_handle;
+
+ /* Initialize ifnet structure. */
+ memcpy(ifp->if_xname, sc->sc_dev.dv_xname, IFNAMSIZ);
+ ifp->if_softc = sc;
+ ifp->if_start = xennet_start;
+ ifp->if_ioctl = xennet_ioctl;
+ ifp->if_watchdog = xennet_watchdog;
+ ifp->if_flags = IFF_BROADCAST | IFF_NOTRAILERS;
+
+#ifdef mediacode
+ ifmedia_init(&sc->sc_media, 0, xennet_mediachange,
+ xennet_mediastatus);
+ for (idx = 0; idx < nxennet_media; idx++)
+ ifmedia_add(&sc->sc_media, xennet_media[idx], 0, NULL);
+ ifmedia_set(&sc->sc_media, xennet_media[0]);
+#endif
+
+ for (idx = 0; idx < NETIF_TX_RING_SIZE; idx++)
+ sc->sc_tx_bufa[idx].xb_next = idx + 1;
+ for (idx = 0; idx < NETIF_RX_RING_SIZE; idx++)
+ sc->sc_rx_bufa[idx].xb_next = idx + 1;
+}
+
+static struct xennet_softc *
+find_device(int handle)
+{
+ struct device *dv;
+ struct xennet_softc *xs = NULL;
+
+ for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+ if (dv->dv_cfattach == NULL ||
+ dv->dv_cfattach->ca_attach != xennet_attach)
+ continue;
+ xs = (struct xennet_softc *)dv;
+ if (xs->sc_ifno == handle)
+ break;
+ }
+ return xs;
+}
+
+static void
+xennet_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+ int respond = 1;
+
+ switch (msg->subtype) {
+ case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
+ if (msg->length != sizeof(netif_fe_interface_status_changed_t))
+ goto error;
+ xennet_status_change(
+ (netif_fe_interface_status_changed_t *)&msg->msg[0]);
+ break;
+
+ case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
+ if (msg->length != sizeof(netif_fe_driver_status_changed_t))
+ goto error;
+ xennet_driver_status_change(
+ (netif_fe_driver_status_changed_t *)&msg->msg[0]);
+ break;
+
+ error:
+ default:
+ msg->length = 0;
+ break;
+ }
+
+ if (respond)
+ ctrl_if_send_response(msg);
+}
+
+static void
+xennet_driver_status_change(netif_fe_driver_status_changed_t *status)
+{
+ struct xennet_attach_args xneta;
+ int i;
+
+ DPRINTFN(XEDB_EVENT, ("> max_handle=%d\n", status->max_handle));
+
+ /* XXX FIXME: Abuse of 'max_handle' as interface count. */
+ netctrl.xc_interfaces = status->max_handle;
+ netctrl.xc_connected = 0;
+
+ xneta.xa_device = "xennet";
+
+ for (i = 0; i < netctrl.xc_interfaces; i++) {
+ xneta.xa_handle = i;
+ config_found(netctrl.xc_parent, &xneta, netctrl.xc_cfprint);
+ }
+}
+
+static void
+xennet_status_change(netif_fe_interface_status_changed_t *status)
+{
+ ctrl_msg_t cmsg;
+ netif_fe_interface_connect_t up;
+ struct xennet_softc *sc;
+ struct ifnet *ifp;
+ struct vm_page *pg_tx, *pg_rx;
+
+ DPRINTFN(XEDB_EVENT, (">\n"));
+ DPRINTFN(XEDB_EVENT, ("> status=%d handle=%d mac=%02x:%02x:%02x:%02x:%02x:%02x\n",
+ status->status,
+ status->handle,
+ status->mac[0], status->mac[1], status->mac[2],
+ status->mac[3], status->mac[4], status->mac[5]));
+
+ if (netctrl.xc_interfaces <= 0) {
+ printf("Status change: no interfaces\n");
+ return;
+ }
+
+ sc = find_device(status->handle);
+ if (sc == NULL) {
+ printf("Status change: invalid netif handle %u\n",
+ status->handle);
+ return;
+ }
+ ifp = &sc->sc_ethercom.ec_if;
+
+ switch (status->status) {
+ case NETIF_INTERFACE_STATUS_DESTROYED:
+ printf("Unexpected netif-DESTROYED message in state %d\n",
+ sc->sc_backend_state);
+ break;
+
+ case NETIF_INTERFACE_STATUS_DISCONNECTED:
+#if 0
+ if (sc->sc_backend_state != BEST_CLOSED) {
+ printk("Unexpected netif-DISCONNECTED message"
+ " in state %d\n", sc->sc_backend_state);
+ printk("Attempting to reconnect network interface\n");
+
+ /* Begin interface recovery.
+ *
+ * NB. Whilst we're recovering, we turn the
+ * carrier state off. We take measures to
+ * ensure that this device isn't used for
+ * anything. We also stop the queue for this
+ * device. Various different approaches
+ * (e.g. continuing to buffer packets) have
+ * been tested but don't appear to improve the
+ * overall impact on TCP connections.
+ *
+ * TODO: (MAW) Change the Xend<->Guest
+ * protocol so that a recovery is initiated by
+ * a special "RESET" message - disconnect
+ * could just mean we're not allowed to use
+ * this interface any more.
+ */
+
+ /* Stop old i/f to prevent errors whilst we
+ * rebuild the state. */
+ spin_lock_irq(&np->tx_lock);
+ spin_lock(&np->rx_lock);
+ netif_stop_queue(dev);
+ np->backend_state = BEST_DISCONNECTED;
+ spin_unlock(&np->rx_lock);
+ spin_unlock_irq(&np->tx_lock);
+
+ /* Free resources. */
+ free_irq(np->irq, dev);
+ unbind_evtchn_from_irq(np->evtchn);
+ free_page((unsigned long)np->tx);
+ free_page((unsigned long)np->rx);
+ }
+#endif
+
+ /* Move from CLOSED to DISCONNECTED state. */
+ sc->sc_tx = (netif_tx_interface_t *)
+ uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE);
+ if (sc->sc_tx == NULL)
+ panic("netif: no tx va");
+ sc->sc_rx = (netif_rx_interface_t *)
+ uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE);
+ if (sc->sc_rx == NULL)
+ panic("netif: no rx va");
+ pg_tx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+ if (pg_tx == NULL) {
+ panic("netif: no tx pages");
+ }
+ pmap_kenter_pa((vaddr_t)sc->sc_tx, VM_PAGE_TO_PHYS(pg_tx),
+ VM_PROT_READ | VM_PROT_WRITE);
+ pg_rx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+ if (pg_rx == NULL) {
+ panic("netif: no rx pages");
+ }
+ pmap_kenter_pa((vaddr_t)sc->sc_rx, VM_PAGE_TO_PHYS(pg_rx),
+ VM_PROT_READ | VM_PROT_WRITE);
+ sc->sc_backend_state = BEST_DISCONNECTED;
+
+ /* Construct an interface-CONNECT message for the
+ * domain controller. */
+ cmsg.type = CMSG_NETIF_FE;
+ cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT;
+ cmsg.length = sizeof(netif_fe_interface_connect_t);
+ up.handle = status->handle;
+ up.tx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_tx)) >> PAGE_SHIFT;
+ up.rx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_rx)) >> PAGE_SHIFT;
+ memcpy(cmsg.msg, &up, sizeof(up));
+
+ /* Tell the controller to bring up the interface. */
+ ctrl_if_send_message_block(&cmsg, NULL, 0, 0);
+ break;
+
+ case NETIF_INTERFACE_STATUS_CONNECTED:
+ if (sc->sc_backend_state == BEST_CLOSED) {
+ printf("Unexpected netif-CONNECTED message"
+ " in state %d\n", sc->sc_backend_state);
+ break;
+ }
+
+ memcpy(sc->sc_enaddr, status->mac, ETHER_ADDR_LEN);
+#if 0
+ if (xen_start_info.flags & SIF_PRIVILEGED) {
+ /* XXX for domain-0 change out ethernet address to be
+ * different than the physical address since arp
+ * replies from other domains will report the physical
+ * address.
+ */
+ if (sc->sc_enaddr[0] != 0xaa)
+ sc->sc_enaddr[0] = 0xaa;
+ else
+ sc->sc_enaddr[0] = 0xab;
+ }
+#endif
+
+ /* Recovery procedure: */
+
+ /* Step 1: Reinitialise variables. */
+ sc->sc_rx_resp_cons = sc->sc_tx_resp_cons = /* sc->sc_tx_full = */ 0;
+ sc->sc_rx->event = sc->sc_tx->event = 1;
+
+ /* Step 2: Rebuild the RX and TX ring contents. */
+ network_alloc_rx_buffers(sc);
+ SLIST_INIT(&sc->sc_tx_bufs);
+ network_alloc_tx_buffers(sc);
+
+ /* Step 3: All public and private state should now be
+ * sane. Get ready to start sending and receiving
+ * packets and give the driver domain a kick because
+ * we've probably just requeued some packets.
+ */
+ sc->sc_backend_state = BEST_CONNECTED;
+ __insn_barrier();
+ hypervisor_notify_via_evtchn(status->evtchn);
+ network_tx_buf_gc(sc);
+
+ if_attach(ifp);
+ ether_ifattach(ifp, sc->sc_enaddr);
+
+ sc->sc_evtchn = status->evtchn;
+ sc->sc_irq = bind_evtchn_to_irq(sc->sc_evtchn);
+ event_set_handler(sc->sc_irq, &xen_network_handler, sc, IPL_NET);
+ hypervisor_enable_irq(sc->sc_irq);
+ netctrl.xc_connected++;
+
+ aprint_normal("%s: MAC address %s\n", sc->sc_dev.dv_xname,
+ ether_sprintf(sc->sc_enaddr));
+
+#if NRND > 0
+ rnd_attach_source(&sc->rnd_source, sc->sc_dev.dv_xname,
+ RND_TYPE_NET, 0);
+#endif
+ break;
+
+ default:
+ printf("Status change to unknown value %d\n",
+ status->status);
+ break;
+ }
+}
+
+static void
+xennet_tx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg)
+{
+ struct xennet_txbuf *txbuf = (struct xennet_txbuf *)arg;
+
+ DPRINTFN(XEDB_MBUF, ("xennet_tx_mbuf_free %p pa %p\n", txbuf,
+ (void *)txbuf->xt_pa));
+ SLIST_INSERT_HEAD(&txbuf->xt_sc->sc_tx_bufs, txbuf, xt_next);
+ pool_cache_put(&mbpool_cache, m);
+}
+
+static void
+xennet_rx_push_buffer(struct xennet_softc *sc, int id)
+{
+ NETIF_RING_IDX ringidx;
+ int nr_pfns;
+
+ ringidx = sc->sc_rx->req_prod;
+ nr_pfns = 0;
+
+ DPRINTFN(XEDB_MEM, ("readding page va %p pa %p ma %p/%p to rx_ring "
+ "at %d with id %d\n",
+ (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_va,
+ (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_pa,
+ (void *)(PTE_BASE[x86_btop
+ (sc->sc_rx_bufa[id].xb_rx.xbrx_va)] &
+ PG_FRAME),
+ (void *)xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa),
+ ringidx, id));
+
+ sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id;
+
+ rx_pfn_array[nr_pfns] = xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa)
+ >> PAGE_SHIFT;
+
+ /* Remove this page from pseudo phys map before
+ * passing back to Xen. */
+ xpmap_phys_to_machine_mapping[(sc->sc_rx_bufa[id].xb_rx.xbrx_pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
+ INVALID_P2M_ENTRY;
+
+ rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
+ rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT;
+ rx_mcl[nr_pfns].args[1] = 0;
+ rx_mcl[nr_pfns].args[2] = 0;
+
+ nr_pfns++;
+
+ sc->sc_rx_bufs_to_notify++;
+
+ ringidx++;
+
+ /*
+ * We may have allocated buffers which have entries
+ * outstanding in the page update queue -- make sure we flush
+ * those first!
+ */
+ xpq_flush_queue();
+
+ /* After all PTEs have been zapped we blow away stale TLB entries. */
+ rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
+
+ /* Give away a batch of pages. */
+ rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
+ rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
+ rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
+ rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
+ rx_mcl[nr_pfns].args[3] = 0;
+ rx_mcl[nr_pfns].args[4] = DOMID_SELF;
+
+ /* Zap PTEs and give away pages in one big multicall. */
+ (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
+
+ /* Check return status of HYPERVISOR_dom_mem_op(). */
+ if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
+ panic("Unable to reduce memory reservation\n");
+
+ /* Above is a suitable barrier to ensure backend will see requests. */
+ sc->sc_rx->req_prod = ringidx;
+}
+
+static void
+xennet_rx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg)
+{
+ union xennet_bufarray *xb = (union xennet_bufarray *)arg;
+ struct xennet_softc *sc = xb->xb_rx.xbrx_sc;
+ int id = (xb - sc->sc_rx_bufa);
+
+ DPRINTFN(XEDB_MBUF, ("xennet_rx_mbuf_free id %d, mbuf %p, buf %p, "
+ "size %d\n", id, m, buf, size));
+
+ xennet_rx_push_buffer(sc, id);
+
+ pool_cache_put(&mbpool_cache, m);
+}
+
+static int
+xen_network_handler(void *arg)
+{
+ struct xennet_softc *sc = arg;
+ struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+ netif_rx_response_t *rx;
+ paddr_t pa;
+ NETIF_RING_IDX ringidx;
+ mmu_update_t *mmu = rx_mmu;
+ multicall_entry_t *mcl = rx_mcl;
+ struct mbuf *m;
+
+ network_tx_buf_gc(sc);
+
+ again:
+ for (ringidx = sc->sc_rx_resp_cons;
+ ringidx != sc->sc_rx->resp_prod;
+ ringidx++) {
+ rx = &sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].resp;
+
+ if (rx->status < 0)
+ panic("rx->status < 0");
+ /* XXXcl check rx->status for error */
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL) {
+ printf("xennet: rx no mbuf\n");
+ break;
+ }
+
+ pa = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_pa;
+
+ DPRINTFN(XEDB_EVENT, ("rx event %d for id %d, size %d, "
+ "status %d, ma %08lx, pa %08lx\n", ringidx,
+ rx->id, rx->status, rx->status, rx->addr, pa));
+
+ /* Remap the page. */
+ mmu->ptr = (rx->addr & PG_FRAME) | MMU_MACHPHYS_UPDATE;
+ mmu->val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
+ mmu++;
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT;
+ mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW;
+ mcl->args[2] = UVMF_FLUSH_TLB; // 0;
+ mcl++;
+
+ xpmap_phys_to_machine_mapping
+ [(pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
+ rx->addr >> PAGE_SHIFT;
+
+ /* Do all the remapping work, and M->P updates, in one
+ * big hypercall. */
+ if ((mcl - rx_mcl) != 0) {
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)rx_mmu;
+ mcl->args[1] = mmu - rx_mmu;
+ mcl->args[2] = 0;
+ mcl++;
+ (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
+ }
+ if (0)
+ printf("page mapped at va %08lx -> %08x/%08lx\n",
+ sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va,
+ PTE_BASE[x86_btop(sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)],
+ rx->addr);
+ mmu = rx_mmu;
+ mcl = rx_mcl;
+
+ DPRINTFN(XEDB_MBUF, ("rx packet mbuf %p va %p pa %p/%p "
+ "ma %p\n", m,
+ (void *)sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va,
+ (void *)(xpmap_mtop(PTE_BASE[x86_btop
+ (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME)), (void *)pa,
+ (void *)(PTE_BASE[x86_btop
+ (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME)));
+
+ m->m_len = m->m_pkthdr.len = rx->status;
+ m->m_pkthdr.rcvif = ifp;
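+ /*
+ * If other receive buffers are still outstanding, loan this
+ * page to the mbuf as external storage; xennet_rx_mbuf_free()
+ * returns it to the ring when the mbuf is freed.
+ */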
+ if (sc->sc_rx->req_prod != sc->sc_rx->resp_prod) {
+ MEXTADD(m, (void *)(sc->sc_rx_bufa[rx->id].xb_rx.
+ xbrx_va + (rx->addr & PAGE_MASK)), rx->status, M_DEVBUF,
+ xennet_rx_mbuf_free,
+ &sc->sc_rx_bufa[rx->id]);
+ } else {
+ /*
+ * This was our last receive buffer: allocate
+ * memory, copy the data and push the receive
+ * buffer back to the hypervisor.
+ */
+ MEXTMALLOC(m, rx->status, M_DONTWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ printf("xennet: rx no mbuf 2\n");
+ m_free(m);
+ break;
+ }
+ memcpy(m->m_data, (void *)(sc->sc_rx_bufa[rx->id].
+ xb_rx.xbrx_va + (rx->addr & PAGE_MASK)), rx->status);
+ xennet_rx_push_buffer(sc, rx->id);
+ }
+
+#ifdef XENNET_DEBUG_DUMP
+ xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "r", rx->id);
+#endif
+
+#if NBPFILTER > 0
+ /*
+ * Pass packet to bpf if there is a listener.
+ */
+ if (ifp->if_bpf)
+ bpf_mtap(ifp->if_bpf, m);
+#endif
+
+ ifp->if_ipackets++;
+
+ /* Pass the packet up. */
+ (*ifp->if_input)(ifp, m);
+ }
+
+ sc->sc_rx_resp_cons = ringidx;
+ sc->sc_rx->event = sc->sc_rx_resp_cons + 1;
+
+ if (sc->sc_rx->resp_prod != ringidx)
+ goto again;
+
+ return 0;
+}
+
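+/*
+ * The buffer arrays double as free lists: entry 0 is the list head
+ * and free entries are chained through their xb_next fields.
+ */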
+static inline int
+get_bufarray_entry(union xennet_bufarray *a)
+{
+ int idx;
+
+ idx = a[0].xb_next;
+ a[0].xb_next = a[idx].xb_next;
+ return idx;
+}
+
+static inline void
+put_bufarray_entry(union xennet_bufarray *a, int idx)
+{
+
+ a[idx].xb_next = a[0].xb_next;
+ a[0].xb_next = idx;
+}
+
+static void
+network_tx_buf_gc(struct xennet_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+ NETIF_RING_IDX idx, prod;
+
+ do {
+ prod = sc->sc_tx->resp_prod;
+
+ for (idx = sc->sc_tx_resp_cons; idx != prod; idx++) {
+ DPRINTFN(XEDB_EVENT, ("tx event at pos %d, status: "
+ "%d, id: %d, mbuf %p, buf %p\n", idx,
+ sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.status,
+ sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id,
+ sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m,
+ mtod(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m, void *)));
+ m_freem(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m);
+ put_bufarray_entry(sc->sc_tx_bufa,
+ sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id);
+ sc->sc_tx_entries--; /* atomic */
+ }
+
+ sc->sc_tx_resp_cons = prod;
+
+ /*
+ * Set a new event, then check for race with update of
+ * tx_cons.
+ */
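+ /* Request the next tx event once about half of the
+ * outstanding entries have completed. */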
+ sc->sc_tx->event = /* atomic */
+ prod + (sc->sc_tx_entries >> 1) + 1;
+ __insn_barrier();
+ } while (prod != sc->sc_tx->resp_prod);
+
+ if (sc->sc_tx->resp_prod == sc->sc_tx->req_prod)
+ ifp->if_timer = 0;
+ /* KDASSERT(sc->sc_net_idx->tx_req_prod == */
+ /* TX_RING_ADD(sc->sc_net_idx->tx_resp_prod, sc->sc_tx_entries)); */
+}
+
+static void
+network_alloc_rx_buffers(struct xennet_softc *sc)
+{
+ vaddr_t rxpages, va;
+ paddr_t pa;
+ struct vm_page *pg;
+ int id, nr_pfns;
+ NETIF_RING_IDX ringidx;
+ int s;
+
+ ringidx = sc->sc_rx->req_prod;
+ if (0) printf("network_alloc_rx_buffers prod %d cons %d\n", ringidx,
+ sc->sc_rx_resp_cons);
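+ /* Skip the refill while more than half of RX_MAX_ENTRIES buffers are still posted. */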
+ if ((ringidx - sc->sc_rx_resp_cons) > (RX_MAX_ENTRIES / 2))
+ return;
+
+ nr_pfns = 0;
+
+ rxpages = uvm_km_valloc_align(kernel_map, RX_ENTRIES * PAGE_SIZE,
+ PAGE_SIZE);
+
+ s = splnet();
+ for (va = rxpages; va < rxpages + RX_ENTRIES * PAGE_SIZE;
+ va += PAGE_SIZE) {
+ pg = uvm_pagealloc(NULL, 0, NULL, 0);
+ if (pg == NULL)
+ panic("network_alloc_rx_buffers: no pages");
+ pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ | VM_PROT_WRITE);
+
+ id = get_bufarray_entry(sc->sc_rx_bufa);
+ sc->sc_rx_bufa[id].xb_rx.xbrx_va = va;
+ sc->sc_rx_bufa[id].xb_rx.xbrx_sc = sc;
+
+ pa = VM_PAGE_TO_PHYS(pg);
+ DPRINTFN(XEDB_MEM, ("adding page va %p pa %p/%p "
+ "ma %p/%p to rx_ring at %d with id %d\n", (void *)va,
+ (void *)(VM_PAGE_TO_PHYS(pg) & PG_FRAME), (void *)xpmap_mtop(PTE_BASE[x86_btop(va)]),
+ (void *)(PTE_BASE[x86_btop(va)] & PG_FRAME),
+ (void *)xpmap_ptom(VM_PAGE_TO_PHYS(pg)),
+ ringidx, id));
+ sc->sc_rx_bufa[id].xb_rx.xbrx_pa = pa;
+ sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id;
+
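+ /* Remember the machine frame so it can be handed back to Xen below. */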
+ rx_pfn_array[nr_pfns] = xpmap_ptom(pa) >> PAGE_SHIFT;
+
+ /* Remove this page from pseudo phys map before
+ * passing back to Xen. */
+ xpmap_phys_to_machine_mapping[(pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
+ INVALID_P2M_ENTRY;
+
+ rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
+ rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT;
+ rx_mcl[nr_pfns].args[1] = 0;
+ rx_mcl[nr_pfns].args[2] = 0;
+
+ nr_pfns++;
+
+ sc->sc_rx_bufs_to_notify++;
+
+ ringidx++;
+ if ((ringidx - sc->sc_rx_resp_cons) == RX_MAX_ENTRIES)
+ break;
+ }
+
+ if (nr_pfns == 0) {
+ splx(s);
+ return;
+ }
+
+ /*
+ * We may have allocated buffers which have entries
+ * outstanding in the page update queue -- make sure we flush
+ * those first!
+ */
+ xpq_flush_queue();
+
+ /* After all PTEs have been zapped we blow away stale TLB entries. */
+ rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
+
+ /* Give away a batch of pages. */
+ rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
+ rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
+ rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
+ rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
+ rx_mcl[nr_pfns].args[3] = 0;
+ rx_mcl[nr_pfns].args[4] = DOMID_SELF;
+
+ /* Zap PTEs and give away pages in one big multicall. */
+ (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
+
+ /* Check return status of HYPERVISOR_dom_mem_op(). */
+ if (rx_mcl[nr_pfns].args[5] != nr_pfns)
+ panic("Unable to reduce memory reservation\n");
+
+ /* The above is a suitable barrier to ensure the backend sees our requests. */
+ sc->sc_rx->req_prod = ringidx;
+
+ splx(s);
+
+}
+
+static void
+network_alloc_tx_buffers(struct xennet_softc *sc)
+{
+ vaddr_t txpages, va;
+ struct vm_page *pg;
+ struct xennet_txbuf *txbuf;
+ int i;
+
+ txpages = uvm_km_valloc_align(kernel_map,
+ (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE, PAGE_SIZE);
+ for (va = txpages;
+ va < txpages + (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE;
+ va += PAGE_SIZE) {
+ pg = uvm_pagealloc(NULL, 0, NULL, 0);
+ if (pg == NULL)
+ panic("network_alloc_tx_buffers: no pages");
+ pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ | VM_PROT_WRITE);
+
+ for (i = 0; i < TXBUF_PER_PAGE; i++) {
+ txbuf = (struct xennet_txbuf *)
+ (va + i * (PAGE_SIZE / TXBUF_PER_PAGE));
+ txbuf->xt_sc = sc;
+ txbuf->xt_pa = VM_PAGE_TO_PHYS(pg) +
+ i * (PAGE_SIZE / TXBUF_PER_PAGE) +
+ sizeof(struct xennet_txbuf);
+ SLIST_INSERT_HEAD(&sc->sc_tx_bufs, txbuf, xt_next);
+ }
+ }
+}
+
+/*
+ * Called at splnet.
+ */
+void
+xennet_start(struct ifnet *ifp)
+{
+ struct xennet_softc *sc = ifp->if_softc;
+ struct mbuf *m, *new_m;
+ struct xennet_txbuf *txbuf;
+ netif_tx_request_t *txreq;
+ NETIF_RING_IDX idx;
+ paddr_t pa;
+ int bufid;
+
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start()\n", sc->sc_dev.dv_xname));
+
+#ifdef DIAGNOSTIC
+ IFQ_POLL(&ifp->if_snd, m);
+ if (m == 0)
+ panic("%s: No packet to start", sc->sc_dev.dv_xname);
+#endif
+
+ if ((ifp->if_flags & (IFF_RUNNING | IFF_OACTIVE)) != IFF_RUNNING)
+ return;
+
+ idx = sc->sc_tx->req_prod;
+ while (/*CONSTCOND*/1) {
+
+ IFQ_POLL(&ifp->if_snd, m);
+ if (m == NULL)
+ break;
+
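+ /* Find the physical address of the packet data, whether it
+ * lives in an external cluster or in the mbuf itself. */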
+ switch (m->m_flags & (M_EXT|M_EXT_CLUSTER)) {
+ case M_EXT|M_EXT_CLUSTER:
+ pa = m->m_ext.ext_paddr +
+ (m->m_data - m->m_ext.ext_buf);
+ break;
+ default:
+ case 0:
+ pa = m->m_paddr + M_BUFOFFSET(m) +
+ (m->m_data - M_BUFADDR(m));
+ break;
+ }
+
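+ /*
+ * A tx request describes a single contiguous machine-address
+ * range, so a packet that is fragmented across mbufs or that
+ * crosses a page boundary is first coalesced into a private
+ * tx buffer.
+ */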
+ if (m->m_pkthdr.len != m->m_len ||
+ (pa ^ (pa + m->m_pkthdr.len)) & PG_FRAME) {
+ txbuf = SLIST_FIRST(&sc->sc_tx_bufs);
+ if (txbuf == NULL) {
+ // printf("xennet: no tx bufs\n");
+ break;
+ }
+
+ MGETHDR(new_m, M_DONTWAIT, MT_DATA);
+ if (new_m == NULL) {
+ printf("xennet: no mbuf\n");
+ break;
+ }
+
+ SLIST_REMOVE_HEAD(&sc->sc_tx_bufs, xt_next);
+ IFQ_DEQUEUE(&ifp->if_snd, m);
+
+ KASSERT(m->m_flags & M_PKTHDR);
+ M_COPY_PKTHDR(new_m, m);
+ m_copydata(m, 0, m->m_pkthdr.len, txbuf->xt_buf);
+ MEXTADD(new_m, txbuf->xt_buf, m->m_pkthdr.len,
+ M_DEVBUF, xennet_tx_mbuf_free, txbuf);
+ new_m->m_ext.ext_paddr = txbuf->xt_pa;
+ new_m->m_len = new_m->m_pkthdr.len = m->m_pkthdr.len;
+
+ m_freem(m);
+ m = new_m;
+
+ pa = m->m_ext.ext_paddr +
+ (m->m_data - m->m_ext.ext_buf);
+ } else
+ IFQ_DEQUEUE(&ifp->if_snd, m);
+
+ bufid = get_bufarray_entry(sc->sc_tx_bufa);
+ sc->sc_tx_bufa[bufid].xb_tx.xbtx_m = m;
+
+ DPRINTFN(XEDB_MBUF, ("xennet_start id %d, mbuf %p, buf %p/%p, "
+ "size %d\n", bufid, m, mtod(m, void *),
+ (void *)pa, m->m_pkthdr.len));
+#ifdef XENNET_DEBUG_DUMP
+ xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "s", bufid);
+#endif
+
+ txreq = &sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].req;
+ txreq->id = bufid;
+ txreq->addr = xpmap_ptom(pa);
+ txreq->size = m->m_pkthdr.len;
+
+ __insn_barrier();
+ idx++;
+ sc->sc_tx->req_prod = idx;
+
+ sc->sc_tx_entries++; /* XXX atomic */
+
+#ifdef XENNET_DEBUG
+ DPRINTFN(XEDB_MEM, ("packet addr %p/%p, physical %p/%p, "
+ "m_paddr %p, len %d/%d\n", M_BUFADDR(m), mtod(m, void *),
+ (void *)*kvtopte(mtod(m, vaddr_t)),
+ (void *)xpmap_mtop(*kvtopte(mtod(m, vaddr_t))),
+ (void *)m->m_paddr, m->m_pkthdr.len, m->m_len));
+#endif
+
+#if NBPFILTER > 0
+ /*
+ * Pass packet to bpf if there is a listener.
+ */
+ if (ifp->if_bpf)
+ bpf_mtap(ifp->if_bpf, m);
+#endif
+ }
+
+ ifp->if_flags &= ~IFF_OACTIVE;
+
+ network_tx_buf_gc(sc);
+
+ __insn_barrier();
+ if (sc->sc_tx->resp_prod != idx)
+ hypervisor_notify_via_evtchn(sc->sc_evtchn);
+
+ ifp->if_timer = 5;
+
+ ifp->if_opackets++;
+
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start() done\n",
+ sc->sc_dev.dv_xname));
+}
+
+int
+xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ struct xennet_softc *sc = ifp->if_softc;
+ struct ifaddr *ifa = (struct ifaddr *)data;
+#ifdef mediacode
+ struct ifreq *ifr = (struct ifreq *)data;
+#endif
+ int s, error = 0;
+
+ s = splnet();
+
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl()\n", sc->sc_dev.dv_xname));
+
+ switch(cmd) {
+ case SIOCSIFADDR:
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFADDR\n",
+ sc->sc_dev.dv_xname));
+ ifp->if_flags |= IFF_UP;
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ xennet_init(sc);
+ arp_ifinit(ifp, ifa);
+ break;
+#endif
+ default:
+ xennet_init(sc);
+ break;
+ }
+ break;
+
+ case SIOCSIFFLAGS:
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFFLAGS\n",
+ sc->sc_dev.dv_xname));
+ break;
+
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*MULTI\n",
+ sc->sc_dev.dv_xname));
+ break;
+
+#ifdef mediacode
+ case SIOCGIFMEDIA:
+ case SIOCSIFMEDIA:
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*IFMEDIA\n",
+ sc->sc_dev.dv_xname));
+ error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
+ break;
+#endif
+
+ default:
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl(0x%lx) unknown cmd\n",
+ sc->sc_dev.dv_xname, cmd));
+ error = EINVAL;
+ break;
+ }
+
+ splx(s);
+
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() returning %d\n",
+ sc->sc_dev.dv_xname, error));
+
+ return error;
+}
+
+void
+xennet_watchdog(struct ifnet *ifp)
+{
+
+ panic("xennet_watchdog\n");
+}
+
+void
+xennet_init(struct xennet_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_init()\n", sc->sc_dev.dv_xname));
+
+ if (ifp->if_flags & IFF_UP) {
+ if ((ifp->if_flags & IFF_RUNNING) == 0)
+ xennet_reset(sc);
+
+ ifp->if_flags |= IFF_RUNNING;
+ ifp->if_flags &= ~IFF_OACTIVE;
+ ifp->if_timer = 0;
+ } else {
+ ifp->if_flags &= ~IFF_RUNNING;
+ xennet_reset(sc);
+ }
+}
+
+void
+xennet_reset(struct xennet_softc *sc)
+{
+
+ DPRINTFN(XEDB_FOLLOW, ("%s: xennet_reset()\n", sc->sc_dev.dv_xname));
+}
+
+#ifdef mediacode
+/*
+ * Media change callback.
+ */
+static int
+xennet_mediachange(struct ifnet *ifp)
+{
+ struct xennet_softc *sc = ifp->if_softc;
+
+ switch (IFM_SUBTYPE(sc->sc_media.ifm_media)) {
+ case IFM_AUTO:
+ break;
+ default:
+ return (1);
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * Media status callback.
+ */
+static void
+xennet_mediastatus(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+ struct xennet_softc *sc = ifp->if_softc;
+
+ if (IFM_SUBTYPE(ifmr->ifm_active) == IFM_AUTO)
+ ifmr->ifm_active = sc->sc_media.ifm_cur->ifm_data;
+
+ ifmr->ifm_status &= ~IFM_AVALID;
+}
+#endif
+
+int
+xennet_bootstatic_callback(struct nfs_diskless *nd)
+{
+ struct ifnet *ifp = nd->nd_ifp;
+ struct xennet_softc *sc = (struct xennet_softc *)ifp->if_softc;
+ union xen_cmdline_parseinfo xcp;
+ struct sockaddr_in *sin;
+
+ memset(&xcp, 0, sizeof(xcp.xcp_netinfo));
+ xcp.xcp_netinfo.xi_ifno = sc->sc_ifno;
+ xcp.xcp_netinfo.xi_root = nd->nd_root.ndm_host;
+ xen_parse_cmdline(XEN_PARSE_NETINFO, &xcp);
+
+ nd->nd_myip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[0]);
+ nd->nd_gwip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[2]);
+ nd->nd_mask.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[3]);
+
+ sin = (struct sockaddr_in *) &nd->nd_root.ndm_saddr;
+ memset((caddr_t)sin, 0, sizeof(*sin));
+ sin->sin_len = sizeof(*sin);
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[1]);
+
+ return (NFS_BOOTSTATIC_HAS_MYIP|NFS_BOOTSTATIC_HAS_GWIP|
+ NFS_BOOTSTATIC_HAS_MASK|NFS_BOOTSTATIC_HAS_SERVADDR|
+ NFS_BOOTSTATIC_HAS_SERVER);
+}
+
+
+#ifdef XENNET_DEBUG_DUMP
+#define XCHR(x) "0123456789abcdef"[(x) & 0xf]
+static void
+xennet_hex_dump(unsigned char *pkt, size_t len, char *type, int id)
+{
+ size_t i, j;
+
+ printf("pkt %p len %d/%x type %s id %d\n", pkt, len, len, type, id);
+ printf("00000000 ");
+ for(i=0; i<len; i++) {
+ printf("%c%c ", XCHR(pkt[i]>>4), XCHR(pkt[i]));
+ if ((i+1) % 16 == 8)
+ printf(" ");
+ if ((i+1) % 16 == 0) {
+ printf(" %c", '|');
+ for(j=0; j<16; j++)
+ printf("%c", pkt[i-15+j]>=32 &&
+ pkt[i-15+j]<127?pkt[i-15+j]:'.');
+ printf("%c\n%c%c%c%c%c%c%c%c ", '|',
+ XCHR((i+1)>>28), XCHR((i+1)>>24),
+ XCHR((i+1)>>20), XCHR((i+1)>>16),
+ XCHR((i+1)>>12), XCHR((i+1)>>8),
+ XCHR((i+1)>>4), XCHR(i+1));
+ }
+ }
+ printf("\n");
+}
+#undef XCHR
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c
new file mode 100644
index 0000000000..b72ffc95a1
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c
@@ -0,0 +1,1368 @@
+/* $NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $");
+
+#include "xbd.h"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/ioctl.h>
+#include <sys/device.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/conf.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+
+#include <uvm/uvm.h>
+
+#include <dev/dkvar.h>
+#include <machine/xbdvar.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/hypervisor-ifs/hypervisor-if.h>
+#include <machine/hypervisor-ifs/vbd.h>
+#include <machine/evtchn.h>
+
+
+static void xbd_attach(struct device *, struct device *, void *);
+static int xbd_detach(struct device *, int);
+
+#if NXBD > 0
+int xbd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(xbd, sizeof(struct xbd_softc),
+ xbd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver xbd_cd;
+#endif
+
+#if NWD > 0
+int xbd_wd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(wd, sizeof(struct xbd_softc),
+ xbd_wd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver wd_cd;
+#endif
+
+#if NSD > 0
+int xbd_sd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(sd, sizeof(struct xbd_softc),
+ xbd_sd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver sd_cd;
+#endif
+
+#if NCD > 0
+int xbd_cd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(cd, sizeof(struct xbd_softc),
+ xbd_cd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver cd_cd;
+#endif
+
+
+dev_type_open(xbdopen);
+dev_type_close(xbdclose);
+dev_type_read(xbdread);
+dev_type_write(xbdwrite);
+dev_type_ioctl(xbdioctl);
+dev_type_ioctl(xbdioctl_cdev);
+dev_type_strategy(xbdstrategy);
+dev_type_dump(xbddump);
+dev_type_size(xbdsize);
+
+#if NXBD > 0
+const struct bdevsw xbd_bdevsw = {
+ xbdopen, xbdclose, xbdstrategy, xbdioctl,
+ xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw xbd_cdevsw = {
+ xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+ nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_major;
+#endif
+
+#if NWD > 0
+const struct bdevsw wd_bdevsw = {
+ xbdopen, xbdclose, xbdstrategy, xbdioctl,
+ xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw wd_cdevsw = {
+ xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+ nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_wd_major;
+static dev_t xbd_wd_cdev_major;
+#endif
+
+#if NSD > 0
+const struct bdevsw sd_bdevsw = {
+ xbdopen, xbdclose, xbdstrategy, xbdioctl,
+ xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw sd_cdevsw = {
+ xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+ nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_sd_major;
+static dev_t xbd_sd_cdev_major;
+#endif
+
+#if NCD > 0
+const struct bdevsw cd_bdevsw = {
+ xbdopen, xbdclose, xbdstrategy, xbdioctl,
+ xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw cd_cdevsw = {
+ xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+ nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_cd_major;
+static dev_t xbd_cd_cdev_major;
+#endif
+
+
+static int xbdstart(struct dk_softc *, struct buf *);
+static int xbd_response_handler(void *);
+static void xbd_update_create_kthread(void *);
+static void xbd_update_kthread(void *);
+static int xbd_update_handler(void *);
+
+static int xbdinit(struct xbd_softc *, xen_disk_t *, struct dk_intf *);
+
+/* Pseudo-disk Interface */
+static struct dk_intf dkintf_esdi = {
+ DTYPE_ESDI,
+ "Xen Virtual ESDI",
+ xbdopen,
+ xbdclose,
+ xbdstrategy,
+ xbdstart,
+};
+#if NSD > 0
+static struct dk_intf dkintf_scsi = {
+ DTYPE_SCSI,
+ "Xen Virtual SCSI",
+ xbdopen,
+ xbdclose,
+ xbdstrategy,
+ xbdstart,
+};
+#endif
+
+#if NXBD > 0
+static struct xbd_attach_args xbd_ata = {
+ .xa_device = "xbd",
+ .xa_dkintf = &dkintf_esdi,
+};
+#endif
+
+#if NWD > 0
+static struct xbd_attach_args wd_ata = {
+ .xa_device = "wd",
+ .xa_dkintf = &dkintf_esdi,
+};
+#endif
+
+#if NSD > 0
+static struct xbd_attach_args sd_ata = {
+ .xa_device = "sd",
+ .xa_dkintf = &dkintf_scsi,
+};
+#endif
+
+#if NCD > 0
+static struct xbd_attach_args cd_ata = {
+ .xa_device = "cd",
+ .xa_dkintf = &dkintf_esdi,
+};
+#endif
+
+static struct sysctlnode *diskcookies;
+
+
+#if defined(XBDDEBUG) && !defined(DEBUG)
+#define DEBUG
+#endif
+
+#ifdef DEBUG
+int xbddebug = 0;
+
+#define XBDB_FOLLOW 0x1
+#define XBDB_IO 0x2
+#define XBDB_SETUP 0x4
+#define XBDB_HOTPLUG 0x8
+
+#define IFDEBUG(x,y) if (xbddebug & (x)) y
+#define DPRINTF(x,y) IFDEBUG(x, printf y)
+#define DPRINTF_FOLLOW(y) DPRINTF(XBDB_FOLLOW, y)
+#define DEBUG_MARK_UNUSED(_xr) (_xr)->xr_sc = (void *)0xdeadbeef
+
+struct xbdreq *xbd_allxr;
+#else
+#define IFDEBUG(x,y)
+#define DPRINTF(x,y)
+#define DPRINTF_FOLLOW(y)
+#define DEBUG_MARK_UNUSED(_xr)
+#endif
+
+#ifdef DIAGNOSTIC
+#define DIAGPANIC(x) panic x
+#define DIAGCONDPANIC(x,y) if (x) panic y
+#else
+#define DIAGPANIC(x)
+#define DIAGCONDPANIC(x,y)
+#endif
+
+
+struct xbdreq {
+ union {
+ SLIST_ENTRY(xbdreq) _unused; /* ptr. to next free xbdreq */
+ SIMPLEQ_ENTRY(xbdreq) _suspended;
+ /* link when on suspended queue. */
+ } _link;
+ struct xbdreq *xr_parent; /* ptr. to parent xbdreq */
+ struct buf *xr_bp; /* ptr. to original I/O buf */
+ daddr_t xr_bn; /* block no. to process */
+ long xr_bqueue; /* bytes left to queue */
+ long xr_bdone; /* bytes left */
+ vaddr_t xr_data; /* ptr. to data to be proc. */
+ vaddr_t xr_aligned; /* ptr. to aligned data */
+ long xr_breq; /* bytes in this req. */
+ struct xbd_softc *xr_sc; /* ptr. to xbd softc */
+};
+#define xr_unused _link._unused
+#define xr_suspended _link._suspended
+
+SLIST_HEAD(,xbdreq) xbdreqs =
+ SLIST_HEAD_INITIALIZER(xbdreqs);
+static SIMPLEQ_HEAD(, xbdreq) xbdr_suspended =
+ SIMPLEQ_HEAD_INITIALIZER(xbdr_suspended);
+
+#define CANGET_XBDREQ() (!SLIST_EMPTY(&xbdreqs))
+
+#define GET_XBDREQ(_xr) do { \
+ (_xr) = SLIST_FIRST(&xbdreqs); \
+ if (__predict_true(_xr)) \
+ SLIST_REMOVE_HEAD(&xbdreqs, xr_unused); \
+} while (/*CONSTCOND*/0)
+
+#define PUT_XBDREQ(_xr) do { \
+ DEBUG_MARK_UNUSED(_xr); \
+ SLIST_INSERT_HEAD(&xbdreqs, _xr, xr_unused); \
+} while (/*CONSTCOND*/0)
+
+static struct bufq_state bufq;
+static int bufq_users = 0;
+
+#define XEN_MAJOR(_dev) ((_dev) >> 8)
+#define XEN_MINOR(_dev) ((_dev) & 0xff)
+
+#define XEN_SCSI_DISK0_MAJOR 8
+#define XEN_SCSI_DISK1_MAJOR 65
+#define XEN_SCSI_DISK2_MAJOR 66
+#define XEN_SCSI_DISK3_MAJOR 67
+#define XEN_SCSI_DISK4_MAJOR 68
+#define XEN_SCSI_DISK5_MAJOR 69
+#define XEN_SCSI_DISK6_MAJOR 70
+#define XEN_SCSI_DISK7_MAJOR 71
+#define XEN_SCSI_DISK8_MAJOR 128
+#define XEN_SCSI_DISK9_MAJOR 129
+#define XEN_SCSI_DISK10_MAJOR 130
+#define XEN_SCSI_DISK11_MAJOR 131
+#define XEN_SCSI_DISK12_MAJOR 132
+#define XEN_SCSI_DISK13_MAJOR 133
+#define XEN_SCSI_DISK14_MAJOR 134
+#define XEN_SCSI_DISK15_MAJOR 135
+#define XEN_SCSI_CDROM_MAJOR 11
+
+#define XEN_IDE0_MAJOR 3
+#define XEN_IDE1_MAJOR 22
+#define XEN_IDE2_MAJOR 33
+#define XEN_IDE3_MAJOR 34
+#define XEN_IDE4_MAJOR 56
+#define XEN_IDE5_MAJOR 57
+#define XEN_IDE6_MAJOR 88
+#define XEN_IDE7_MAJOR 89
+#define XEN_IDE8_MAJOR 90
+#define XEN_IDE9_MAJOR 91
+
+#define XEN_BSHIFT 9 /* log2(XEN_BSIZE) */
+#define XEN_BSIZE (1 << XEN_BSHIFT)
+
+#define MAX_VBDS 64
+static int nr_vbds;
+static xen_disk_t *vbd_info;
+
+static blk_ring_t *blk_ring = NULL;
+static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */
+static BLK_RING_IDX req_prod; /* Private request producer. */
+static BLK_RING_IDX last_req_prod; /* Request producer at last trap. */
+
+#define STATE_ACTIVE 0
+#define STATE_SUSPENDED 1
+#define STATE_CLOSED 2
+static unsigned int state = STATE_SUSPENDED;
+
+
+#define XBDUNIT(x) DISKUNIT(x)
+#define GETXBD_SOFTC(_xs, x) if (!((_xs) = getxbd_softc(x))) return ENXIO
+#define GETXBD_SOFTC_CDEV(_xs, x) do { \
+ dev_t bx = devsw_chr2blk((x)); \
+ if (bx == NODEV) \
+ return ENXIO; \
+ if (!((_xs) = getxbd_softc(bx))) \
+ return ENXIO; \
+} while (/*CONSTCOND*/0)
+
+static struct xbd_softc *
+getxbd_softc(dev_t dev)
+{
+ int unit = XBDUNIT(dev);
+
+ DPRINTF_FOLLOW(("getxbd_softc(0x%x): major = %d unit = %d\n", dev,
+ major(dev), unit));
+#if NXBD > 0
+ if (major(dev) == xbd_major)
+ return device_lookup(&xbd_cd, unit);
+#endif
+#if NWD > 0
+ if (major(dev) == xbd_wd_major || major(dev) == xbd_wd_cdev_major)
+ return device_lookup(&wd_cd, unit);
+#endif
+#if NSD > 0
+ if (major(dev) == xbd_sd_major || major(dev) == xbd_sd_cdev_major)
+ return device_lookup(&sd_cd, unit);
+#endif
+#if NCD > 0
+ if (major(dev) == xbd_cd_major || major(dev) == xbd_cd_cdev_major)
+ return device_lookup(&cd_cd, unit);
+#endif
+ return NULL;
+}
+
+static int
+get_vbd_info(xen_disk_t *disk_info)
+{
+ int err;
+ block_io_op_t op;
+
+ /* Probe for disk information. */
+ memset(&op, 0, sizeof(op));
+ op.cmd = BLOCK_IO_OP_VBD_PROBE;
+ op.u.probe_params.domain = 0;
+ op.u.probe_params.xdi.max = MAX_VBDS;
+ op.u.probe_params.xdi.disks = disk_info;
+ op.u.probe_params.xdi.count = 0;
+
+ err = HYPERVISOR_block_io_op(&op);
+ if (err) {
+ printf("WARNING: Could not probe disks (%d)\n", err);
+ DIAGPANIC(("get_vbd_info: Could not probe disks (%d)", err));
+ return -1;
+ }
+
+ return op.u.probe_params.xdi.count;
+}
+
+static void
+reset_interface(void)
+{
+ block_io_op_t op;
+
+ op.cmd = BLOCK_IO_OP_RESET;
+ if (HYPERVISOR_block_io_op(&op) != 0)
+ printf("xbd: Possible blkdev trouble: couldn't reset ring\n");
+}
+
+static void
+init_interface(void)
+{
+ block_io_op_t op;
+
+ reset_interface();
+
+ if (blk_ring == NULL) {
+ op.cmd = BLOCK_IO_OP_RING_ADDRESS;
+ (void)HYPERVISOR_block_io_op(&op);
+
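+ /* Map the shared block-interface ring page returned by
+ * the hypervisor. */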
+ blk_ring = (blk_ring_t *)uvm_km_valloc_align(kernel_map,
+ PAGE_SIZE, PAGE_SIZE);
+ pmap_kenter_ma((vaddr_t)blk_ring, op.u.ring_mfn << PAGE_SHIFT,
+ VM_PROT_READ|VM_PROT_WRITE);
+ DPRINTF(XBDB_SETUP, ("init_interface: "
+ "ring va %p and wired to %p\n",
+ blk_ring, (void *)(op.u.ring_mfn << PAGE_SHIFT)));
+
+ blk_ring->req_prod = blk_ring->resp_prod =
+ resp_cons = req_prod = last_req_prod = 0;
+
+ event_set_handler(_EVENT_BLKDEV, &xbd_response_handler,
+ NULL, IPL_BIO);
+ hypervisor_enable_event(_EVENT_BLKDEV);
+ }
+
+ __insn_barrier();
+ state = STATE_ACTIVE;
+}
+
+static void
+enable_update_events(struct device *self)
+{
+
+ kthread_create(xbd_update_create_kthread, self);
+ event_set_handler(_EVENT_VBD_UPD, &xbd_update_handler, self, IPL_BIO);
+ hypervisor_enable_event(_EVENT_VBD_UPD);
+}
+
+static void
+signal_requests_to_xen(void)
+{
+ block_io_op_t op;
+
+ DPRINTF(XBDB_IO, ("signal_requests_to_xen: %d -> %d\n",
+ blk_ring->req_prod, MASK_BLK_IDX(req_prod)));
+ blk_ring->req_prod = MASK_BLK_IDX(req_prod);
+ last_req_prod = req_prod;
+
+ op.cmd = BLOCK_IO_OP_SIGNAL;
+ HYPERVISOR_block_io_op(&op);
+ return;
+}
+
+static void
+setup_sysctl(void)
+{
+ struct sysctlnode *pnode;
+
+ sysctl_createv(NULL, 0, NULL, NULL,
+ 0,
+ CTLTYPE_NODE, "machdep", NULL,
+ NULL, 0, NULL, 0,
+ CTL_MACHDEP, CTL_EOL);
+
+ sysctl_createv(NULL, 0, NULL, &pnode,
+ 0,
+ CTLTYPE_NODE, "domain0", NULL,
+ NULL, 0, NULL, 0,
+ CTL_MACHDEP, CTL_CREATE, CTL_EOL);
+
+ if (pnode == NULL)
+ return;
+
+ sysctl_createv(NULL, 0, &pnode, &pnode,
+ 0,
+ CTLTYPE_NODE, "diskcookie", NULL,
+ NULL, 0, NULL, 0,
+ CTL_CREATE, CTL_EOL);
+
+ if (pnode)
+ diskcookies = pnode;
+}
+
+static struct xbd_attach_args *
+get_xbda(xen_disk_t *xd)
+{
+
+ switch (XEN_MAJOR(xd->device)) {
+#if NSD > 0
+ case XEN_SCSI_DISK0_MAJOR:
+ case XEN_SCSI_DISK1_MAJOR ... XEN_SCSI_DISK7_MAJOR:
+ case XEN_SCSI_DISK8_MAJOR ... XEN_SCSI_DISK15_MAJOR:
+ if (xd->capacity == 0)
+ return NULL;
+ return &sd_ata;
+ case XEN_SCSI_CDROM_MAJOR:
+ return &cd_ata;
+#endif
+#if NWD > 0
+ case XEN_IDE0_MAJOR:
+ case XEN_IDE1_MAJOR:
+ case XEN_IDE2_MAJOR:
+ case XEN_IDE3_MAJOR:
+ case XEN_IDE4_MAJOR:
+ case XEN_IDE5_MAJOR:
+ case XEN_IDE6_MAJOR:
+ case XEN_IDE7_MAJOR:
+ case XEN_IDE8_MAJOR:
+ case XEN_IDE9_MAJOR:
+ switch (XD_TYPE(xd->info)) {
+ case XD_TYPE_CDROM:
+ return &cd_ata;
+ case XD_TYPE_DISK:
+ if (xd->capacity == 0)
+ return NULL;
+ return &wd_ata;
+ default:
+ return NULL;
+ }
+ break;
+#endif
+ default:
+ if (xd->capacity == 0)
+ return NULL;
+ return &xbd_ata;
+ }
+ return NULL;
+}
+
+int
+xbd_scan(struct device *self, struct xbd_attach_args *mainbus_xbda,
+ cfprint_t print)
+{
+ struct xbdreq *xr;
+ struct xbd_attach_args *xbda;
+ xen_disk_t *xd;
+ int i;
+
+ init_interface();
+ if (xen_start_info.flags & SIF_PRIVILEGED)
+ setup_sysctl();
+
+#if NXBD > 0
+ xbd_major = devsw_name2blk("xbd", NULL, 0);
+#endif
+#if NWD > 0
+ xbd_wd_major = devsw_name2blk("wd", NULL, 0);
+ /* XXX Also handle the cdev majors since stuff like
+ * read_sector calls strategy on the cdev. This only works if
+ * all the majors we care about are different.
+ */
+ xbd_wd_cdev_major = major(devsw_blk2chr(makedev(xbd_wd_major, 0)));
+#endif
+#if NSD > 0
+ xbd_sd_major = devsw_name2blk("sd", NULL, 0);
+ xbd_sd_cdev_major = major(devsw_blk2chr(makedev(xbd_sd_major, 0)));
+#endif
+#if NCD > 0
+ xbd_cd_major = devsw_name2blk("cd", NULL, 0);
+ xbd_cd_cdev_major = major(devsw_blk2chr(makedev(xbd_cd_major, 0)));
+#endif
+
+ MALLOC(xr, struct xbdreq *, BLK_RING_SIZE * sizeof(struct xbdreq),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+#ifdef DEBUG
+ xbd_allxr = xr;
+#endif
+
+ /* XXX Xen 1.2: We cannot use all BLK_RING_SIZE slots, since
+ * Xen 1.2 keeps indexes masked in the ring and the case where
+ * we queue all slots at once is handled incorrectly.
+ */
+ for (i = 0; i < BLK_RING_SIZE - 1; i++)
+ PUT_XBDREQ(&xr[i]);
+
+ MALLOC(vbd_info, xen_disk_t *, MAX_VBDS * sizeof(xen_disk_t),
+ M_DEVBUF, M_WAITOK);
+ memset(vbd_info, 0, MAX_VBDS * sizeof(xen_disk_t));
+ nr_vbds = get_vbd_info(vbd_info);
+ if (nr_vbds <= 0)
+ goto out;
+
+ for (i = 0; i < nr_vbds; i++) {
+ xd = &vbd_info[i];
+ xbda = get_xbda(xd);
+ if (xbda) {
+ xbda->xa_xd = xd;
+ config_found(self, xbda, print);
+ }
+ }
+
+ enable_update_events(self);
+
+ return 0;
+
+ out:
+ FREE(vbd_info, M_DEVBUF);
+ vbd_info = NULL;
+ FREE(xr, M_DEVBUF);
+#ifdef DEBUG
+ xbd_allxr = NULL;
+#endif
+ SLIST_INIT(&xbdreqs);
+ return 0;
+}
+
+#if NXBD > 0
+int
+xbd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+ struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+ if (strcmp(xa->xa_device, "xbd") == 0)
+ return 1;
+ return 0;
+}
+#endif
+
+#if NWD > 0
+int
+xbd_wd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+ struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+ if (strcmp(xa->xa_device, "wd") == 0)
+ return 1;
+ return 0;
+}
+#endif
+
+#if NSD > 0
+int
+xbd_sd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+ struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+ if (strcmp(xa->xa_device, "sd") == 0)
+ return 1;
+ return 0;
+}
+#endif
+
+#if NCD > 0
+int
+xbd_cd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+ struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+ if (strcmp(xa->xa_device, "cd") == 0)
+ return 1;
+ return 0;
+}
+#endif
+
+static void
+xbd_attach(struct device *parent, struct device *self, void *aux)
+{
+ struct xbd_attach_args *xbda = (struct xbd_attach_args *)aux;
+ struct xbd_softc *xs = (struct xbd_softc *)self;
+
+ aprint_normal(": Xen Virtual Block Device");
+
+ simple_lock_init(&xs->sc_slock);
+ dk_sc_init(&xs->sc_dksc, xs, xs->sc_dev.dv_xname);
+ xbdinit(xs, xbda->xa_xd, xbda->xa_dkintf);
+ if (diskcookies) {
+ /* XXX beware that xs->sc_xd_device is a long */
+ sysctl_createv(NULL, 0, &diskcookies, NULL,
+ 0,
+ CTLTYPE_INT, xs->sc_dev.dv_xname, NULL,
+ NULL, 0, &xs->sc_xd_device, 0,
+ CTL_CREATE, CTL_EOL);
+ }
+}
+
+static int
+xbd_detach(struct device *dv, int flags)
+{
+ struct xbd_softc *xs = (struct xbd_softc *)dv;
+
+ /*
+ * Mark disk about to be removed (between now and when the xs
+ * will be freed).
+ */
+ xs->sc_shutdown = 1;
+
+ /* And give it some time to settle if it's busy. */
+ if (xs->sc_dksc.sc_dkdev.dk_busy > 0)
+ tsleep(&xs, PWAIT, "xbdetach", hz);
+
+ /* Detach the disk. */
+ disk_detach(&xs->sc_dksc.sc_dkdev);
+
+ /* XXX decrement bufq_users and free? */
+
+ /* XXX no need to remove sysctl nodes since they only exist
+ * in domain0 and domain0's devices are never removed.
+ */
+
+ return 0;
+}
+
+int
+xbdopen(dev_t dev, int flags, int fmt, struct proc *p)
+{
+ struct xbd_softc *xs;
+
+ DPRINTF_FOLLOW(("xbdopen(0x%04x, %d)\n", dev, flags));
+ switch (fmt) {
+ case S_IFCHR:
+ GETXBD_SOFTC_CDEV(xs, dev);
+ break;
+ case S_IFBLK:
+ GETXBD_SOFTC(xs, dev);
+ break;
+ default:
+ return ENXIO;
+ }
+ return dk_open(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p);
+}
+
+int
+xbdclose(dev_t dev, int flags, int fmt, struct proc *p)
+{
+ struct xbd_softc *xs;
+
+ DPRINTF_FOLLOW(("xbdclose(%d, %d)\n", dev, flags));
+ switch (fmt) {
+ case S_IFCHR:
+ GETXBD_SOFTC_CDEV(xs, dev);
+ break;
+ case S_IFBLK:
+ GETXBD_SOFTC(xs, dev);
+ break;
+ default:
+ return ENXIO;
+ }
+ return dk_close(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p);
+}
+
+void
+xbdstrategy(struct buf *bp)
+{
+ struct xbd_softc *xs = getxbd_softc(bp->b_dev);
+
+ DPRINTF_FOLLOW(("xbdstrategy(%p): b_bcount = %ld\n", bp,
+ (long)bp->b_bcount));
+
+ if (xs == NULL || xs->sc_shutdown) {
+ bp->b_flags |= B_ERROR;
+ bp->b_error = EIO;
+ biodone(bp);
+ return;
+ }
+
+ dk_strategy(xs->sc_di, &xs->sc_dksc, bp);
+ return;
+}
+
+int
+xbdsize(dev_t dev)
+{
+ struct xbd_softc *xs = getxbd_softc(dev);
+
+ DPRINTF_FOLLOW(("xbdsize(%d)\n", dev));
+ if (xs == NULL || xs->sc_shutdown)
+ return -1;
+ return dk_size(xs->sc_di, &xs->sc_dksc, dev);
+}
+
+static void
+map_align(struct xbdreq *xr)
+{
+ int s;
+
+ s = splvm();
+ xr->xr_aligned = uvm_km_kmemalloc1(kmem_map, NULL,
+ xr->xr_bqueue, XEN_BSIZE, UVM_UNKNOWN_OFFSET,
+ 0/* UVM_KMF_NOWAIT */);
+ splx(s);
+ DPRINTF(XBDB_IO, ("map_align(%p): bp %p addr %p align 0x%08lx "
+ "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data,
+ xr->xr_aligned, xr->xr_bqueue));
+ xr->xr_data = xr->xr_aligned;
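+ /* For writes, pre-fill the bounce buffer with the caller's data. */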
+ if ((xr->xr_bp->b_flags & B_READ) == 0)
+ memcpy((void *)xr->xr_aligned, xr->xr_bp->b_data,
+ xr->xr_bqueue);
+}
+
+static void
+unmap_align(struct xbdreq *xr)
+{
+ int s;
+
+ if (xr->xr_bp->b_flags & B_READ)
+ memcpy(xr->xr_bp->b_data, (void *)xr->xr_aligned,
+ xr->xr_bp->b_bcount);
+ DPRINTF(XBDB_IO, ("unmap_align(%p): bp %p addr %p align 0x%08lx "
+ "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data,
+ xr->xr_aligned, xr->xr_bp->b_bcount));
+ s = splvm();
+ uvm_km_free(kmem_map, xr->xr_aligned, xr->xr_bp->b_bcount);
+ splx(s);
+ xr->xr_aligned = (vaddr_t)0;
+}
+
+static void
+fill_ring(struct xbdreq *xr)
+{
+ struct xbdreq *pxr = xr->xr_parent;
+ paddr_t pa;
+ unsigned long ma;
+ vaddr_t addr, off;
+ blk_ring_req_entry_t *ring_req;
+ int breq, nr_sectors;
+
+ /* Fill out a communications ring structure. */
+ ring_req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req;
+ ring_req->id = (unsigned long)xr;
+ ring_req->operation = pxr->xr_bp->b_flags & B_READ ? XEN_BLOCK_READ :
+ XEN_BLOCK_WRITE;
+ ring_req->sector_number = (xen_sector_t)pxr->xr_bn;
+ ring_req->device = pxr->xr_sc->sc_xd_device;
+
+ DPRINTF(XBDB_IO, ("fill_ring(%d): bp %p sector %llu pxr %p xr %p\n",
+ MASK_BLK_IDX(req_prod), pxr->xr_bp, (unsigned long long)pxr->xr_bn,
+ pxr, xr));
+
+ xr->xr_breq = 0;
+ ring_req->nr_segments = 0;
+ addr = trunc_page(pxr->xr_data);
+ off = pxr->xr_data - addr;
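+ /*
+ * Walk the request a page at a time, translating each
+ * pseudo-physical page to its machine address and packing
+ * (machine address | sector count) into the segment list.
+ */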
+ while (pxr->xr_bqueue > 0) {
+#if 0
+ pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map),
+ addr, &pa);
+#else
+ pmap_extract(pmap_kernel(), addr, &pa);
+#endif
+ ma = xpmap_ptom_masked(pa) + off;
+ DIAGCONDPANIC((ma & (XEN_BSIZE - 1)) != 0,
+ ("xbd request ma not sector aligned"));
+
+ if (pxr->xr_bqueue > PAGE_SIZE - off)
+ breq = PAGE_SIZE - off;
+ else
+ breq = pxr->xr_bqueue;
+ nr_sectors = breq >> XEN_BSHIFT;
+ DIAGCONDPANIC(nr_sectors >= XEN_BSIZE,
+ ("xbd request nr_sectors >= XEN_BSIZE"));
+
+ DPRINTF(XBDB_IO, ("fill_ring(%d): va 0x%08lx pa 0x%08lx "
+ "ma 0x%08lx, sectors %d, left %ld/%ld\n",
+ MASK_BLK_IDX(req_prod), addr, pa, ma, nr_sectors,
+ pxr->xr_bqueue >> XEN_BSHIFT, pxr->xr_bqueue));
+
+ ring_req->buffer_and_sects[ring_req->nr_segments++] =
+ ma | nr_sectors;
+ addr += PAGE_SIZE;
+ pxr->xr_bqueue -= breq;
+ pxr->xr_bn += nr_sectors;
+ xr->xr_breq += breq;
+ off = 0;
+ if (ring_req->nr_segments == MAX_BLK_SEGS)
+ break;
+ }
+ pxr->xr_data = addr;
+
+ req_prod++;
+}
+
+static void
+xbdresume(void)
+{
+ struct xbdreq *pxr, *xr;
+ struct xbd_softc *xs;
+ struct buf *bp;
+
+ while ((pxr = SIMPLEQ_FIRST(&xbdr_suspended)) != NULL) {
+ DPRINTF(XBDB_IO, ("xbdstart: resuming xbdreq %p for bp %p\n",
+ pxr, pxr->xr_bp));
+ bp = pxr->xr_bp;
+ xs = getxbd_softc(bp->b_dev);
+ if (xs == NULL || xs->sc_shutdown) {
+ bp->b_flags |= B_ERROR;
+ bp->b_error = EIO;
+ }
+ if (bp->b_flags & B_ERROR) {
+ pxr->xr_bdone -= pxr->xr_bqueue;
+ pxr->xr_bqueue = 0;
+ if (pxr->xr_bdone == 0) {
+ bp->b_resid = bp->b_bcount;
+ if (pxr->xr_aligned)
+ unmap_align(pxr);
+ PUT_XBDREQ(pxr);
+ if (xs)
+ disk_unbusy(&xs->sc_dksc.sc_dkdev,
+ (bp->b_bcount - bp->b_resid),
+ (bp->b_flags & B_READ));
+ biodone(bp);
+ }
+ continue;
+ }
+ while (__predict_true(pxr->xr_bqueue > 0)) {
+ GET_XBDREQ(xr);
+ if (__predict_false(xr == NULL))
+ goto out;
+ xr->xr_parent = pxr;
+ fill_ring(xr);
+ }
+ DPRINTF(XBDB_IO, ("xbdstart: resumed xbdreq %p for bp %p\n",
+ pxr, bp));
+ SIMPLEQ_REMOVE_HEAD(&xbdr_suspended, xr_suspended);
+ }
+
+ out:
+ return;
+}
+
+static int
+xbdstart(struct dk_softc *dksc, struct buf *bp)
+{
+ struct xbd_softc *xs;
+ struct xbdreq *pxr, *xr;
+ struct partition *pp;
+ daddr_t bn;
+ int ret, runqueue;
+
+ DPRINTF_FOLLOW(("xbdstart(%p, %p)\n", dksc, bp));
+
+ runqueue = 1;
+ ret = -1;
+
+ xs = getxbd_softc(bp->b_dev);
+ if (xs == NULL || xs->sc_shutdown) {
+ bp->b_flags |= B_ERROR;
+ bp->b_error = EIO;
+ biodone(bp);
+ return 0;
+ }
+ dksc = &xs->sc_dksc;
+
+ /* XXXrcd:
+ * Translate partition-relative blocks to absolute blocks;
+ * this probably belongs (somehow) in dksubr.c, since it
+ * is independent of the underlying code... This will require
+ * that the interface be expanded slightly, though.
+ */
+ bn = bp->b_blkno;
+ if (DISKPART(bp->b_dev) != RAW_PART) {
+ pp = &xs->sc_dksc.sc_dkdev.dk_label->
+ d_partitions[DISKPART(bp->b_dev)];
+ bn += pp->p_offset;
+ }
+
+ DPRINTF(XBDB_IO, ("xbdstart: addr %p, sector %llu, "
+ "count %ld [%s]\n", bp->b_data, (unsigned long long)bn,
+ bp->b_bcount, bp->b_flags & B_READ ? "read" : "write"));
+
+ GET_XBDREQ(pxr);
+ if (__predict_false(pxr == NULL))
+ goto out;
+
+ disk_busy(&dksc->sc_dkdev); /* XXX: put in dksubr.c */
+ /*
+ * We have a request slot, return 0 to make dk_start remove
+ * the bp from the work queue.
+ */
+ ret = 0;
+
+ pxr->xr_bp = bp;
+ pxr->xr_parent = pxr;
+ pxr->xr_bn = bn;
+ pxr->xr_bqueue = bp->b_bcount;
+ pxr->xr_bdone = bp->b_bcount;
+ pxr->xr_data = (vaddr_t)bp->b_data;
+ pxr->xr_sc = xs;
+
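+ /* The ring packs the sector count into the low bits of each
+ * buffer address, so data must be sector aligned; bounce
+ * unaligned requests through an aligned buffer. */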
+ if (pxr->xr_data & (XEN_BSIZE - 1))
+ map_align(pxr);
+
+ fill_ring(pxr);
+
+ while (__predict_false(pxr->xr_bqueue > 0)) {
+ GET_XBDREQ(xr);
+ if (__predict_false(xr == NULL))
+ break;
+ xr->xr_parent = pxr;
+ fill_ring(xr);
+ }
+
+ if (__predict_false(pxr->xr_bqueue > 0)) {
+ SIMPLEQ_INSERT_TAIL(&xbdr_suspended, pxr,
+ xr_suspended);
+ DPRINTF(XBDB_IO, ("xbdstart: suspended xbdreq %p "
+ "for bp %p\n", pxr, bp));
+ } else if (CANGET_XBDREQ() && BUFQ_PEEK(&bufq) != NULL) {
+ /*
+ * We have enough resources to start another bp and
+ * there are additional bps on the queue, dk_start
+ * will call us again and we'll run the queue then.
+ */
+ runqueue = 0;
+ }
+
+ out:
+ if (runqueue && last_req_prod != req_prod)
+ signal_requests_to_xen();
+
+ return ret;
+}
+
+static int
+xbd_response_handler(void *arg)
+{
+ struct buf *bp;
+ struct xbd_softc *xs;
+ blk_ring_resp_entry_t *ring_resp;
+ struct xbdreq *pxr, *xr;
+ int i;
+
+ for (i = resp_cons; i != blk_ring->resp_prod; i = BLK_RING_INC(i)) {
+ ring_resp = &blk_ring->ring[MASK_BLK_IDX(i)].resp;
+ xr = (struct xbdreq *)ring_resp->id;
+ pxr = xr->xr_parent;
+
+ DPRINTF(XBDB_IO, ("xbd_response_handler(%d): pxr %p xr %p "
+ "bdone %04lx breq %04lx\n", i, pxr, xr, pxr->xr_bdone,
+ xr->xr_breq));
+ pxr->xr_bdone -= xr->xr_breq;
+ DIAGCONDPANIC(pxr->xr_bdone < 0,
+ ("xbd_response_handler: pxr->xr_bdone < 0"));
+
+ if (__predict_false(ring_resp->status)) {
+ pxr->xr_bp->b_flags |= B_ERROR;
+ pxr->xr_bp->b_error = EIO;
+ }
+
+ if (xr != pxr) {
+ PUT_XBDREQ(xr);
+ if (!SIMPLEQ_EMPTY(&xbdr_suspended))
+ xbdresume();
+ }
+
+ if (pxr->xr_bdone == 0) {
+ bp = pxr->xr_bp;
+ xs = getxbd_softc(bp->b_dev);
+ if (xs == NULL) { /* don't fail bp if we're shutdown */
+ bp->b_flags |= B_ERROR;
+ bp->b_error = EIO;
+ }
+ DPRINTF(XBDB_IO, ("xbd_response_handler(%d): "
+ "completed bp %p\n", i, bp));
+ if (bp->b_flags & B_ERROR)
+ bp->b_resid = bp->b_bcount;
+ else
+ bp->b_resid = 0;
+
+ if (pxr->xr_aligned)
+ unmap_align(pxr);
+
+ PUT_XBDREQ(pxr);
+ if (xs)
+ disk_unbusy(&xs->sc_dksc.sc_dkdev,
+ (bp->b_bcount - bp->b_resid),
+ (bp->b_flags & B_READ));
+ biodone(bp);
+ if (!SIMPLEQ_EMPTY(&xbdr_suspended))
+ xbdresume();
+ /* XXX possible lockup if this was the only
+ * active device and requests were held back in
+ * the queue.
+ */
+ if (xs)
+ dk_iodone(xs->sc_di, &xs->sc_dksc);
+ }
+ }
+ resp_cons = i;
+ /* check if xbdresume queued any requests */
+ if (last_req_prod != req_prod)
+ signal_requests_to_xen();
+ return 0;
+}
+
+static struct device *
+find_device(xen_disk_t *xd)
+{
+ struct device *dv;
+ struct xbd_softc *xs;
+
+ for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+ if (dv->dv_cfattach == NULL ||
+ dv->dv_cfattach->ca_attach != xbd_attach)
+ continue;
+ xs = (struct xbd_softc *)dv;
+ if (xs->sc_xd_device == xd->device)
+ break;
+ }
+ return dv;
+}
+
+static void
+xbd_update_create_kthread(void *arg)
+{
+
+ kthread_create1(xbd_update_kthread, arg, NULL, "xbdupdate");
+}
+
+static void
+xbd_update_kthread(void *arg)
+{
+ struct device *parent = arg;
+ struct xbd_attach_args *xbda;
+ struct device *dev;
+ xen_disk_t *xd;
+ xen_disk_t *vbd_info_update, *vbd_info_old;
+ int i, j, new_nr_vbds;
+ extern int hypervisor_print(void *, const char *);
+
+ MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS *
+ sizeof(xen_disk_t), M_DEVBUF, M_WAITOK);
+
+ for (;;) {
+ memset(vbd_info_update, 0, MAX_VBDS * sizeof(xen_disk_t));
+ new_nr_vbds = get_vbd_info(vbd_info_update);
+
+ if (memcmp(vbd_info, vbd_info_update, MAX_VBDS *
+ sizeof(xen_disk_t)) == 0) {
+ FREE(vbd_info_update, M_DEVBUF);
+ tsleep(parent, PWAIT, "xbdupd", 0);
+ MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS *
+ sizeof(xen_disk_t), M_DEVBUF, M_WAITOK);
+ continue;
+ }
+
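+ /*
+ * Walk the old and new VBD lists in step: detach devices
+ * that have disappeared, attach newly appeared ones and
+ * leave matching entries alone.
+ */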
+ j = 0;
+ for (i = 0; i < new_nr_vbds; i++) {
+ while (j < nr_vbds &&
+ vbd_info[j].device < vbd_info_update[i].device) {
+ DPRINTF(XBDB_HOTPLUG,
+ ("delete device %x size %lx\n",
+ vbd_info[j].device,
+ vbd_info[j].capacity));
+ xd = &vbd_info[j];
+ dev = find_device(xd);
+ if (dev)
+ config_detach(dev, DETACH_FORCE);
+ j++;
+ }
+ if (j < nr_vbds &&
+ vbd_info[j].device == vbd_info_update[i].device) {
+ DPRINTF(XBDB_HOTPLUG,
+ ("update device %x size %lx size %lx\n",
+ vbd_info_update[i].device,
+ vbd_info[j].capacity,
+ vbd_info_update[i].capacity));
+ j++;
+ } else {
+ DPRINTF(XBDB_HOTPLUG,
+ ("add device %x size %lx\n",
+ vbd_info_update[i].device,
+ vbd_info_update[i].capacity));
+ xd = &vbd_info_update[i];
+ xbda = get_xbda(xd);
+ if (xbda) {
+ xbda->xa_xd = xd;
+ config_found(parent, xbda, hypervisor_print);
+ }
+ }
+ }
+
+ while (j < nr_vbds) {
+ DPRINTF(XBDB_HOTPLUG, ("delete device %x\n",
+ vbd_info[j].device));
+ xd = &vbd_info[j];
+ dev = find_device(xd);
+ if (dev)
+ config_detach(dev, DETACH_FORCE);
+ j++;
+ }
+
+ nr_vbds = new_nr_vbds;
+
+ vbd_info_old = vbd_info;
+ vbd_info = vbd_info_update;
+ vbd_info_update = vbd_info_old;
+ }
+}
+
+static int
+xbd_update_handler(void *arg)
+{
+
+ wakeup(arg);
+
+ return 0;
+}
+
+/* XXX: we should probably put these into dksubr.c, mostly */
+int
+xbdread(dev_t dev, struct uio *uio, int flags)
+{
+ struct xbd_softc *xs;
+ struct dk_softc *dksc;
+
+ DPRINTF_FOLLOW(("xbdread(%d, %p, %d)\n", dev, uio, flags));
+ GETXBD_SOFTC_CDEV(xs, dev);
+ dksc = &xs->sc_dksc;
+ if ((dksc->sc_flags & DKF_INITED) == 0)
+ return ENXIO;
+ /* XXX see the comments about minphys in ccd.c */
+ return physio(xbdstrategy, NULL, dev, B_READ, minphys, uio);
+}
+
+/* XXX: we should probably put these into dksubr.c, mostly */
+int
+xbdwrite(dev_t dev, struct uio *uio, int flags)
+{
+ struct xbd_softc *xs;
+ struct dk_softc *dksc;
+
+ DPRINTF_FOLLOW(("xbdwrite(%d, %p, %d)\n", dev, uio, flags));
+ GETXBD_SOFTC_CDEV(xs, dev);
+ dksc = &xs->sc_dksc;
+ if ((dksc->sc_flags & DKF_INITED) == 0)
+ return ENXIO;
+ /* XXX see the comments about minphys in ccd.c */
+ return physio(xbdstrategy, NULL, dev, B_WRITE, minphys, uio);
+}
+
+int
+xbdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+ struct xbd_softc *xs;
+ struct dk_softc *dksc;
+ int ret;
+
+ DPRINTF_FOLLOW(("xbdioctl(%d, %08lx, %p, %d, %p)\n",
+ dev, cmd, data, flag, p));
+ GETXBD_SOFTC(xs, dev);
+ dksc = &xs->sc_dksc;
+
+ if ((ret = lockmgr(&dksc->sc_lock, LK_EXCLUSIVE, NULL)) != 0)
+ return ret;
+
+ switch (cmd) {
+ default:
+ ret = dk_ioctl(xs->sc_di, dksc, dev, cmd, data, flag, p);
+ break;
+ }
+
+ lockmgr(&dksc->sc_lock, LK_RELEASE, NULL);
+ return ret;
+}
+
+int
+xbdioctl_cdev(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+ dev_t bdev;
+
+ bdev = devsw_chr2blk(dev);
+ if (bdev == NODEV)
+ return ENXIO;
+ return xbdioctl(bdev, cmd, data, flag, p);
+}
+
+int
+xbddump(dev_t dev, daddr_t blkno, caddr_t va, size_t size)
+{
+ struct xbd_softc *xs;
+
+ DPRINTF_FOLLOW(("xbddump(%d, %" PRId64 ", %p, %lu)\n", dev, blkno, va,
+ (unsigned long)size));
+ GETXBD_SOFTC(xs, dev);
+ return dk_dump(xs->sc_di, &xs->sc_dksc, dev, blkno, va, size);
+}
+
+static int
+xbdinit(struct xbd_softc *xs, xen_disk_t *xd, struct dk_intf *dkintf)
+{
+ struct dk_geom *pdg;
+ char buf[9];
+ int ret;
+
+ ret = 0;
+
+ xs->sc_dksc.sc_size = xd->capacity;
+ xs->sc_xd_device = xd->device;
+ xs->sc_di = dkintf;
+ xs->sc_shutdown = 0;
+
+ /*
+ * XXX here we should probe the underlying device. If we
+ * are accessing a partition of type RAW_PART, then
+ * we should populate our initial geometry with the
+ * geometry that we discover from the device.
+ */
+ pdg = &xs->sc_dksc.sc_geom;
+ pdg->pdg_secsize = DEV_BSIZE;
+ pdg->pdg_ntracks = 1;
+ pdg->pdg_nsectors = 1024 * (1024 / pdg->pdg_secsize);
+ pdg->pdg_ncylinders = xs->sc_dksc.sc_size / pdg->pdg_nsectors;
+
+ /*
+ * We have one shared bufq for all devices because otherwise
+ * requests can stall: if no free request slots are available
+ * in xbdstart and this device has no requests in flight,
+ * nothing would trigger a dk_start from the interrupt
+ * handler.
+ * XXX this assumes that we can just memcpy struct bufq_state
+ * to share it between devices.
+ * XXX we reference count the usage so that we can de-alloc
+ * the bufq if all devices are deconfigured.
+ */
+ if (bufq_users == 0) {
+ bufq_alloc(&bufq, BUFQ_FCFS);
+ bufq_users = 1;
+ }
+ memcpy(&xs->sc_dksc.sc_bufq, &bufq, sizeof(struct bufq_state));
+
+ xs->sc_dksc.sc_flags |= DKF_INITED;
+
+ /* Attach the disk. */
+ disk_attach(&xs->sc_dksc.sc_dkdev);
+
+ /* Try and read the disklabel. */
+ dk_getdisklabel(xs->sc_di, &xs->sc_dksc, 0 /* XXX ? */);
+
+ format_bytes(buf, sizeof(buf), (uint64_t)xs->sc_dksc.sc_size *
+ pdg->pdg_secsize);
+ printf(" %s\n", buf);
+
+/* out: */
+ return ret;
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c
new file mode 100644
index 0000000000..8181f2b9b3
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c
@@ -0,0 +1,444 @@
+/* $NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *
+ * Copyright (c) 2002-2003, K A Fraser & R Neugebauer
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $");
+
+#define XENDEBUG
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#ifdef XENDEBUG
+
+#define PRINTK_BUFSIZE 1024
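+/* Format into a static buffer and write it straight to the Xen console. */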
+void
+printk(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+ static char buf[PRINTK_BUFSIZE];
+
+ va_start(ap, fmt);
+ ret = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap);
+ va_end(ap);
+ buf[ret] = 0;
+ (void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+void
+vprintk(const char *fmt, va_list ap)
+{
+ int ret;
+ static char buf[PRINTK_BUFSIZE];
+
+ ret = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap);
+ buf[ret] = 0;
+ (void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+#endif
+
+#ifdef XENDEBUG_LOW
+
+int xen_once = 0;
+
+void hypervisor_callback(void);
+void failsafe_callback(void);
+
+void xen_dbglow_init(void);
+void
+xen_dbglow_init()
+{
+ start_info_t *si;
+#if 0
+ int i;
+#endif
+
+ si = &xen_start_info;
+
+ HYPERVISOR_set_callbacks(
+ __KERNEL_CS, (unsigned long)hypervisor_callback,
+ __KERNEL_CS, (unsigned long)failsafe_callback);
+
+ trap_init();
+
+ /* __sti(); */
+
+ /* print out some useful information */
+ printk(version);
+ printk("start_info: %p\n", si);
+ printk(" nr_pages: %lu", si->nr_pages);
+ printk(" shared_inf: %p (was %p)\n", HYPERVISOR_shared_info,
+ si->shared_info);
+ printk(" pt_base: %p", (void *)si->pt_base);
+ printk(" mod_start: 0x%lx\n", si->mod_start);
+ printk(" mod_len: %lu\n", si->mod_len);
+#if 0
+ printk(" net_rings: ");
+ for (i = 0; i < MAX_DOMAIN_VIFS; i++) {
+ if (si->net_rings[i] == 0)
+ break;
+ printk(" %lx", si->net_rings[i]);
+ };
+ printk("\n");
+ printk(" blk_ring: 0x%lx\n", si->blk_ring);
+#endif
+ printk(" dom_id: %d\n", si->dom_id);
+ printk(" flags: 0x%lx\n", si->flags);
+ printk(" cmd_line: %s\n", si->cmd_line ?
+ (const char *)si->cmd_line : "NULL");
+}
+
+
+void xen_dbg0(char *);
+void
+xen_dbg0(char *end)
+{
+ struct cpu_info *ci;
+
+ ci = &cpu_info_primary;
+ if (xen_once)
+ printk("xencpu level %d ipending %08x master %08x\n",
+ ci->ci_ilevel, ci->ci_ipending,
+ HYPERVISOR_shared_info->events_mask);
+ /* ipending %08x imask %08x iunmask %08x */
+ /* ci->ci_imask[IPL_NET], ci->ci_iunmask[IPL_NET]); */
+}
+
+void xen_dbg1(void *esp, int ss);
+void
+xen_dbg1(void *esp, int ss)
+{
+#if 1
+ struct cpu_info *ci;
+
+ ci = &cpu_info_primary;
+ if (xen_once)
+ printk("xenhighlevel %d ipending %08x master %08x events %08x\n",
+ ci->ci_ilevel, ci->ci_ipending,
+ HYPERVISOR_shared_info->events_mask, HYPERVISOR_shared_info->events);
+#else
+ printk("stack switch %p %d/%d, sp %p\n", esp, ss, IDXSEL(ss), &ss);
+#endif
+}
+
+void xen_dbg2(void);
+void
+xen_dbg2(void)
+{
+ if (xen_once)
+ printk("xen_dbg2\n");
+}
+
+void xen_dbg3(void *, void *);
+void
+xen_dbg3(void *ss, void *esp)
+{
+ if (xen_once)
+ printk("xen_dbg3 %p %p\n", ss, esp);
+}
+
+void xen_dbg4(void *);
+void
+xen_dbg4(void *esi)
+{
+
+ printk("xen_dbg4 %p\n", esi);
+ for(;;);
+}
+
+
+
+
+static void do_exit(void);
+
+/*
+ * These are assembler stubs in vector.S.
+ * They are the actual entry points for virtual exceptions.
+ */
+void divide_error(void);
+void debug(void);
+void int3(void);
+void overflow(void);
+void bounds(void);
+void invalid_op(void);
+void device_not_available(void);
+void double_fault(void);
+void coprocessor_segment_overrun(void);
+void invalid_TSS(void);
+void segment_not_present(void);
+void stack_segment(void);
+void general_protection(void);
+void page_fault(void);
+void coprocessor_error(void);
+void simd_coprocessor_error(void);
+void alignment_check(void);
+void spurious_interrupt_bug(void);
+void machine_check(void);
+
+static void
+dump_regs(struct pt_regs *regs)
+{
+ int in_kernel = 1;
+ unsigned long esp;
+ unsigned short ss;
+
+ esp = (unsigned long) (&regs->esp);
+ ss = __KERNEL_DS;
+ if (regs->xcs & 2) {
+ in_kernel = 0;
+ esp = regs->esp;
+ ss = regs->xss & 0xffff;
+ }
+ printf("EIP: %04x:[<%08lx>]\n",
+ 0xffff & regs->xcs, regs->eip);
+ printf("EFLAGS: %08lx\n",regs->eflags);
+ printf("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
+ regs->eax, regs->ebx, regs->ecx, regs->edx);
+ printf("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
+ regs->esi, regs->edi, regs->ebp, esp);
+ printf("ds: %04x es: %04x ss: %04x\n",
+ regs->xds & 0xffff, regs->xes & 0xffff, ss);
+ printf("\n");
+}
+
+
+static inline void
+dump_code(unsigned eip)
+{
+ unsigned *ptr = (unsigned *)eip;
+ int x;
+
+ printk("Bytes at eip:\n");
+ for (x = -4; x < 5; x++)
+ printf("%x", ptr[x]);
+}
+
+
+/*
+ * C handlers here have their parameter-list constructed by the
+ * assembler stubs above. Each one gets a pointer to a list
+ * of register values (to be restored at end of exception).
+ * Some will also receive an error code -- this is the code that
+ * was generated by the processor for the underlying real exception.
+ *
+ * Note that the page-fault exception is special. It also receives
+ * the faulting linear address. Normally this would be found in
+ * register CR2, but that is not accessible in a virtualised OS.
+ */
+
+static inline void
+do_trap(int trapnr, char *str, struct pt_regs *regs, long error_code)
+{
+
+ printk("FATAL: Unhandled Trap (see mini-os:traps.c)");
+ printf("%d %s", trapnr, str);
+ dump_regs(regs);
+ dump_code(regs->eip);
+
+ do_exit();
+}
+
+#define DO_ERROR(trapnr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code); \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ do_trap(trapnr, str, regs, error_code); \
+}
+
+#define DO_ERROR_INFO(trapnr, str, name, sicode, siaddr) \
+void do_##name(struct pt_regs *regs, long error_code); \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ do_trap(trapnr, str, regs, error_code); \
+}
+
+DO_ERROR_INFO( 0, "divide error", divide_error, FPE_INTDIV, regs->eip)
+DO_ERROR( 3, "int3", int3)
+DO_ERROR( 4, "overflow", overflow)
+DO_ERROR( 5, "bounds", bounds)
+DO_ERROR_INFO( 6, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
+DO_ERROR( 7, "device not available", device_not_available)
+DO_ERROR( 8, "double fault", double_fault)
+DO_ERROR( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, "invalid TSS", invalid_TSS)
+DO_ERROR(11, "segment not present", segment_not_present)
+DO_ERROR(12, "stack segment", stack_segment)
+DO_ERROR_INFO(17, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR(18, "machine check", machine_check)
+
+void do_page_fault(struct pt_regs *, long, unsigned long);
+void
+do_page_fault(struct pt_regs *regs, long error_code, unsigned long address)
+{
+
+ printk("Page fault\n");
+ printk("Address: 0x%lx", address);
+ printk("Error Code: 0x%lx", error_code);
+ printk("eip: \t 0x%lx", regs->eip);
+ do_exit();
+}
+
+void do_general_protection(struct pt_regs *, long);
+void
+do_general_protection(struct pt_regs *regs, long error_code)
+{
+
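+ /* Disable further event delivery while we dump state and exit. */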
+ HYPERVISOR_shared_info->events_mask = 0;
+ printk("GPF\n");
+ printk("Error Code: 0x%lx", error_code);
+ dump_regs(regs);
+ dump_code(regs->eip);
+ do_exit();
+}
+
+
+void do_debug(struct pt_regs *, long);
+void
+do_debug(struct pt_regs *regs, long error_code)
+{
+
+ printk("Debug exception\n");
+#define TF_MASK 0x100
+ regs->eflags &= ~TF_MASK;
+ dump_regs(regs);
+ do_exit();
+}
+
+
+
+void do_coprocessor_error(struct pt_regs *, long);
+void
+do_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+
+ printk("Copro error\n");
+ dump_regs(regs);
+ dump_code(regs->eip);
+ do_exit();
+}
+
+void simd_math_error(void *);
+void
+simd_math_error(void *eip)
+{
+
+ printk("SIMD error\n");
+}
+
+void do_simd_coprocessor_error(struct pt_regs *, long);
+void
+do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+
+ printk("SIMD copro error\n");
+}
+
+void do_spurious_interrupt_bug(struct pt_regs *, long);
+void
+do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+{
+}
+
+static void
+do_exit(void)
+{
+
+ HYPERVISOR_exit();
+}
+
+/*
+ * Submit a virtual IDT to the hypervisor. This consists of tuples
+ * (interrupt vector, privilege ring, CS:EIP of handler).
+ * The 'privilege ring' field specifies the least-privileged ring that
+ * can trap to that vector using a software-interrupt instruction (INT).
+ */
+static trap_info_t trap_table[] = {
+ { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
+ { 1, 0, __KERNEL_CS, (unsigned long)debug },
+ { 3, 3, __KERNEL_CS, (unsigned long)int3 },
+ { 4, 3, __KERNEL_CS, (unsigned long)overflow },
+ { 5, 3, __KERNEL_CS, (unsigned long)bounds },
+ { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
+ { 7, 0, __KERNEL_CS, (unsigned long)device_not_available },
+ { 8, 0, __KERNEL_CS, (unsigned long)double_fault },
+ { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
+ { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
+ { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
+ { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
+ { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
+ { 14, 0, __KERNEL_CS, (unsigned long)page_fault },
+ { 15, 0, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
+ { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
+ { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
+ { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
+ { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
+ { 0, 0, 0, 0 }
+};
+
+void
+trap_init(void)
+{
+
+ HYPERVISOR_set_trap_table(trap_table);
+}
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c
new file mode 100644
index 0000000000..a151e3dd83
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c
@@ -0,0 +1,352 @@
+/* $NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $");
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/proc.h>
+#include <sys/tty.h>
+#include <sys/systm.h>
+#include <sys/device.h>
+#include <sys/conf.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#include <dev/cons.h>
+
+#include <ddb/db_output.h> /* XXX for db_max_line */
+
+static int xencons_isconsole = 0;
+
+#define XENCONS_UNIT(x) (minor(x))
+#define XENCONS_BURST 128
+
+int xencons_match (struct device *, struct cfdata *, void *);
+void xencons_attach (struct device *, struct device *, void *);
+/* int xencons_intr (void *); */
+void xencons_init (void);
+
+struct xencons_softc {
+ struct device sc_dev;
+ struct tty *sc_tty;
+};
+
+CFATTACH_DECL(xencons, sizeof(struct xencons_softc),
+ xencons_match, xencons_attach, NULL, NULL);
+
+extern struct cfdriver xencons_cd;
+
+dev_type_open(xencons_open);
+dev_type_close(xencons_close);
+dev_type_read(xencons_read);
+dev_type_write(xencons_write);
+dev_type_ioctl(xencons_ioctl);
+dev_type_stop(xencons_stop);
+dev_type_tty(xencons_tty);
+dev_type_poll(xencons_poll);
+
+const struct cdevsw xencons_cdevsw = {
+ xencons_open, xencons_close, xencons_read, xencons_write,
+ xencons_ioctl, xencons_stop, xencons_tty, xencons_poll,
+ NULL, ttykqfilter, D_TTY
+};
+
+
+void xenconscn_attach(void);
+int xenconscn_getc(dev_t);
+void xenconscn_putc(dev_t, int);
+void xenconscn_pollc(dev_t, int);
+
+static struct consdev xencons = {
+ NULL, NULL, xenconscn_getc, xenconscn_putc, xenconscn_pollc,
+ NULL, NULL, NULL, NODEV, CN_NORMAL
+};
+
+void xencons_start (struct tty *);
+int xencons_param (struct tty *, struct termios *);
+
+int
+xencons_match(struct device *parent, struct cfdata *match, void *aux)
+{
+ struct xencons_attach_args *xa = (struct xencons_attach_args *)aux;
+
+ if (strcmp(xa->xa_device, "xencons") == 0)
+ return 1;
+ return 0;
+}
+
+void
+xencons_attach(struct device *parent, struct device *self, void *aux)
+{
+ struct xencons_softc *sc = (void *)self;
+
+ aprint_normal(": Xen Virtual Console Driver\n");
+
+ if (xencons_isconsole) {
+ int maj;
+
+ /* Locate the major number. */
+ maj = cdevsw_lookup_major(&xencons_cdevsw);
+
+ /* There can be only one, but it can have any unit number. */
+ cn_tab->cn_dev = makedev(maj, sc->sc_dev.dv_unit);
+
+ aprint_verbose("%s: console major %d, unit %d\n",
+ sc->sc_dev.dv_xname, maj, sc->sc_dev.dv_unit);
+
+ /* Set db_max_line to avoid paging. */
+ db_max_line = 0x7fffffff;
+ }
+}
+
+int
+xencons_open(dev_t dev, int flag, int mode, struct proc *p)
+{
+ struct xencons_softc *sc;
+ int unit = XENCONS_UNIT(dev);
+ struct tty *tp;
+
+ sc = device_lookup(&xencons_cd, unit);
+ if (sc == NULL)
+ return (ENXIO);
+
+ if (!sc->sc_tty) {
+ tp = sc->sc_tty = ttymalloc();
+ tty_attach(tp);
+ } else
+ tp = sc->sc_tty;
+
+ tp->t_oproc = xencons_start;
+ tp->t_param = xencons_param;
+ tp->t_dev = dev;
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ ttychars(tp);
+ tp->t_iflag = TTYDEF_IFLAG;
+ tp->t_oflag = TTYDEF_OFLAG;
+ tp->t_cflag = TTYDEF_CFLAG;
+ tp->t_lflag = TTYDEF_LFLAG;
+ tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED;
+ xencons_param(tp, &tp->t_termios);
+ ttsetwater(tp);
+ } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0)
+ return (EBUSY);
+ tp->t_state |= TS_CARR_ON;
+
+ return ((*tp->t_linesw->l_open)(dev, tp));
+}
+
+int
+xencons_close(dev_t dev, int flag, int mode, struct proc *p)
+{
+ struct xencons_softc *sc = device_lookup(&xencons_cd,
+ XENCONS_UNIT(dev));
+ struct tty *tp = sc->sc_tty;
+
+ if (tp == NULL)
+ return (0);
+ (*tp->t_linesw->l_close)(tp, flag);
+ ttyclose(tp);
+#ifdef notyet /* XXX */
+ ttyfree(tp);
+#endif
+ return (0);
+}
+
+int
+xencons_read(dev_t dev, struct uio *uio, int flag)
+{
+ struct xencons_softc *sc = device_lookup(&xencons_cd,
+ XENCONS_UNIT(dev));
+ struct tty *tp = sc->sc_tty;
+
+ return ((*tp->t_linesw->l_read)(tp, uio, flag));
+}
+
+int
+xencons_write(dev_t dev, struct uio *uio, int flag)
+{
+ struct xencons_softc *sc = device_lookup(&xencons_cd,
+ XENCONS_UNIT(dev));
+ struct tty *tp = sc->sc_tty;
+
+ return ((*tp->t_linesw->l_write)(tp, uio, flag));
+}
+
+int
+xencons_poll(dev_t dev, int events, struct proc *p)
+{
+ struct xencons_softc *sc = device_lookup(&xencons_cd,
+ XENCONS_UNIT(dev));
+ struct tty *tp = sc->sc_tty;
+
+ return ((*tp->t_linesw->l_poll)(tp, events, p));
+}
+
+struct tty *
+xencons_tty(dev_t dev)
+{
+ struct xencons_softc *sc = device_lookup(&xencons_cd,
+ XENCONS_UNIT(dev));
+ struct tty *tp = sc->sc_tty;
+
+ return (tp);
+}
+
+int
+xencons_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+ struct xencons_softc *sc = device_lookup(&xencons_cd,
+ XENCONS_UNIT(dev));
+ struct tty *tp = sc->sc_tty;
+ int error;
+
+ error = (*tp->t_linesw->l_ioctl)(tp, cmd, data, flag, p);
+ if (error != EPASSTHROUGH)
+ return (error);
+
+ error = ttioctl(tp, cmd, data, flag, p);
+ if (error != EPASSTHROUGH)
+ return (error);
+
+ switch (cmd) {
+ default:
+ return (EPASSTHROUGH);
+ }
+
+#ifdef DIAGNOSTIC
+ panic("xencons_ioctl: impossible");
+#endif
+}
+
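+/*
+ * Start output: copy up to XENCONS_BURST characters from the tty output
+ * queue to the hypervisor console and reschedule if data remains.
+ */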
+void
+xencons_start(struct tty *tp)
+{
+ struct clist *cl;
+ int s, len;
+ u_char buf[XENCONS_BURST+1];
+
+ s = spltty();
+ if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP))
+ goto out;
+ tp->t_state |= TS_BUSY;
+ splx(s);
+
+ /*
+ * We need to do this outside spl since it could be fairly
+ * expensive and we don't want our serial ports to overflow.
+ */
+ cl = &tp->t_outq;
+ len = q_to_b(cl, buf, XENCONS_BURST);
+ (void)HYPERVISOR_console_io(CONSOLEIO_write, len, buf);
+
+ s = spltty();
+ tp->t_state &= ~TS_BUSY;
+ if (cl->c_cc) {
+ tp->t_state |= TS_TIMEOUT;
+ callout_reset(&tp->t_rstrt_ch, 1, ttrstrt, tp);
+ }
+ if (cl->c_cc <= tp->t_lowat) {
+ if (tp->t_state & TS_ASLEEP) {
+ tp->t_state &= ~TS_ASLEEP;
+ wakeup(cl);
+ }
+ selwakeup(&tp->t_wsel);
+ }
+out:
+ splx(s);
+}
+
+void
+xencons_stop(struct tty *tp, int flag)
+{
+
+}
+
+
+
+void
+xenconscn_attach(void)
+{
+
+ cn_tab = &xencons;
+
+ xencons_isconsole = 1;
+}
+
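+/*
+ * Low-level console input is not implemented; a read from the console
+ * device spins forever.
+ */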
+int
+xenconscn_getc(dev_t dev)
+{
+
+ printf("\n");
+ for (;;);
+}
+
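+/*
+ * Buffer low-level console output and flush it to the hypervisor
+ * console via HYPERVISOR_console_io().
+ */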
+#define MAXLINELEN 1024
+void
+xenconscn_putc(dev_t dev, int c)
+{
+ static char buf[MAXLINELEN+1];
+ static int bufpos = 0;
+
+ buf[bufpos++] = c;
+ if (c == '\n' || bufpos == MAXLINELEN) {
+ buf[bufpos] = 0;
+ (void)HYPERVISOR_console_io(CONSOLEIO_write, bufpos, buf);
+ bufpos = 0;
+ }
+}
+
+void
+xenconscn_pollc(dev_t dev, int on)
+{
+
+}
+
+/*
+ * Set line parameters.
+ */
+int
+xencons_param(struct tty *tp, struct termios *t)
+{
+
+ tp->t_ispeed = t->c_ispeed;
+ tp->t_ospeed = t->c_ospeed;
+ tp->t_cflag = t->c_cflag;
+ return (0);
+}
+
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
new file mode 100644
index 0000000000..e54615567b
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
@@ -0,0 +1,600 @@
+/* $NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2004 Ben Harris.
+ * Copyright (c) 1998
+ * Matthias Drochner. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $");
+
+#include <sys/param.h>
+#include <sys/device.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+
+#include <dev/pckbport/pckbportvar.h>
+#include <dev/ic/i8042reg.h>
+
+#include <machine/intr.h>
+
+#include <machine/xenkbcvar.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/hypervisor-ifs/kbd.h>
+#include <machine/evtchn.h>
+
+#define KBC_DELAY DELAY(1000)
+#define KBC_TIMEOUT 250
+
+#define XENKBC_NSLOTS 2
+
+/* data per slave device */
+struct xenkbc_slotdata {
+ int xsd_polling; /* don't process data in interrupt handler */
+ int xsd_poll_data; /* data read from inr handler if polling */
+ int xsd_poll_stat; /* status read from inr handler if polling */
+#if NRND > 0
+ rndsource_element_t xsd_rnd_source;
+#endif
+};
+
+struct xenkbc_internal {
+ struct xenkbc_softc *xi_sc;
+ struct pckbport_tag *xi_pt;
+ struct xenkbc_slotdata *xi_slotdata[XENKBC_NSLOTS];
+ int xi_flags;
+ int xi_data;
+ int xi_8042cmdbyte;
+};
+
+#define XI_CONSOLE_FLAG 0x01
+#define XI_HASAUX_FLAG 0x02
+
+#define XI_CONSOLE(xi) ((xi)->xi_flags & XI_CONSOLE_FLAG)
+#define XI_HASAUX(xi) ((xi)->xi_flags & XI_HASAUX_FLAG)
+
+#define XI_SETCONSOLE(xi,on) \
+ ((on) ? ((xi)->xi_flags |= XI_CONSOLE_FLAG) : \
+ ((xi)->xi_flags &= ~XI_CONSOLE_FLAG))
+#define XI_SETHASAUX(xi,on) \
+ ((on) ? ((xi)->xi_flags |= XI_HASAUX_FLAG) : \
+ ((xi)->xi_flags &= ~XI_HASAUX_FLAG))
+
+static int xenkbc_match(struct device *, struct cfdata *, void *);
+static void xenkbc_attach(struct device *, struct device *, void *);
+
+static int xenkbc_xt_translation(void *, pckbport_slot_t, int);
+static void xenkbc_init_slotdata(struct xenkbc_slotdata *);
+
+static int xenkbc_get8042cmd (struct xenkbc_internal *);
+static int xenkbc_put8042cmd (struct xenkbc_internal *);
+static int xenkbc_send_devcmd(void *, pckbport_slot_t, u_char);
+static int xenkbc_send_cmd(void *, u_char);
+static int xenkbc_send_data(void *, u_char);
+static int xenkbc_poll_data1(void *, pckbport_slot_t);
+
+static void xenkbc_slot_enable(void *, pckbport_slot_t, int);
+static void xenkbc_intr_establish(void *, pckbport_slot_t);
+static void xenkbc_set_poll(void *, pckbport_slot_t, int);
+
+static int xenkbc_intr(void *);
+
+CFATTACH_DECL(xenkbc, sizeof(struct xenkbc_softc),
+ xenkbc_match, xenkbc_attach, NULL, NULL);
+
+static struct pckbport_accessops const xenkbc_ops = {
+ xenkbc_xt_translation,
+ xenkbc_send_devcmd,
+ xenkbc_poll_data1,
+ xenkbc_slot_enable,
+ xenkbc_intr_establish,
+ xenkbc_set_poll
+};
+
+static struct xenkbc_internal xenkbc_consdata;
+static struct xenkbc_slotdata xenkbc_cons_slotdata;
+
+/* #define XENKBCDEBUG */
+#ifdef XENKBCDEBUG
+#define DPRINTF(x) printf x
+#else
+#define DPRINTF(x)
+#endif
+
+
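+/*
+ * Fetch one (status, data) pair from the hypervisor keyboard interface.
+ * The data byte is stashed in xi_data; the status byte is returned,
+ * with 0 meaning nothing was pending.
+ */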
+static int
+xenkbc_getstatus(struct xenkbc_internal *xi)
+{
+ long res;
+
+ res = HYPERVISOR_kbd_op(KBD_OP_READ, 0);
+ if (res < 0) {
+ xi->xi_data = 0;
+ return 0;
+ }
+ xi->xi_data = KBD_CODE_SCANCODE(res);
+ return KBD_CODE_STATUS(res);
+}
+
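+/* Wait for the controller's input buffer to drain; returns 0 on timeout. */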
+static int
+xenkbc_wait_output(struct xenkbc_internal *xi)
+{
+ u_int i;
+
+ for (i = KBC_TIMEOUT; i; i--) {
+ if ((xenkbc_getstatus(xi) & KBS_IBF) == 0)
+ return (1);
+ KBC_DELAY;
+ }
+ return (0);
+}
+
+static int
+xenkbc_match(struct device *parent, struct cfdata *cf, void *aux)
+{
+ struct xenkbc_attach_args *xa = aux;
+
+ if ((xen_start_info.flags & SIF_PRIVILEGED) == 0)
+ return 0;
+
+ if (strcmp(xa->xa_device, "xenkbc"))
+ return 0;
+
+ return 1;
+}
+
+static int
+xenkbc_attach_slot(struct xenkbc_softc *xs, pckbport_slot_t slot)
+{
+ struct xenkbc_internal *xi = xs->sc_xi;
+ struct device *child;
+ int alloced = 0;
+
+ if (xi->xi_slotdata[slot] == NULL) {
+ xi->xi_slotdata[slot] = malloc(sizeof(struct xenkbc_slotdata),
+ M_DEVBUF, M_NOWAIT);
+ if (xi->xi_slotdata[slot] == NULL) {
+ printf("%s: no memory\n", xs->sc_dev.dv_xname);
+ return 0;
+ }
+ xenkbc_init_slotdata(xi->xi_slotdata[slot]);
+ alloced++;
+ }
+
+ child = pckbport_attach_slot(&xs->sc_dev, xi->xi_pt, slot);
+
+ if (child == NULL && alloced) {
+ free(xi->xi_slotdata[slot], M_DEVBUF);
+ xi->xi_slotdata[slot] = NULL;
+ }
+
+#if NRND > 0
+ if (child != NULL && xi->xi_slotdata[slot] != NULL)
+ rnd_attach_source(&xi->xi_slotdata[slot]->xsd_rnd_source,
+ child->dv_xname, RND_TYPE_TTY, 0);
+#endif
+
+ return child != NULL;
+}
+
+static void
+xenkbc_attach(struct device *parent, struct device *self, void *aux)
+{
+ /* struct xenkbc_attach_args *xa = aux; */
+ struct xenkbc_softc *xs = (struct xenkbc_softc *)self;
+ struct xenkbc_internal *xi;
+ int res;
+ u_char cmdbits = 0;
+
+ if (XI_CONSOLE(&xenkbc_consdata))
+ xi = &xenkbc_consdata;
+ else {
+ xi = malloc(sizeof(struct xenkbc_internal), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (xi == NULL) {
+ aprint_error(": no memory\n");
+ return;
+ }
+ xi->xi_8042cmdbyte = KC8_CPU;
+ }
+
+ aprint_normal(": Xen Keyboard/Mouse Device\n");
+
+ xs->sc_xi = xi;
+ xi->xi_sc = xs;
+
+ event_set_handler(_EVENT_PS2, &xenkbc_intr, xi, IPL_TTY);
+ hypervisor_enable_event(_EVENT_PS2);
+
+ xi->xi_pt = pckbport_attach(xi, &xenkbc_ops);
+
+ /* flush */
+ xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT);
+
+ /* set initial cmd byte */
+ if (!xenkbc_put8042cmd(xi)) {
+ printf("kbc: cmd word write error\n");
+ return;
+ }
+
+ if (xenkbc_attach_slot(xs, PCKBPORT_KBD_SLOT))
+ cmdbits |= KC8_KENABLE;
+
+ /*
+ * Check aux port ok.
+ */
+ if (!xenkbc_send_cmd(xi, KBC_AUXECHO)) {
+ printf("kbc: aux echo error 1\n");
+ goto nomouse;
+ }
+ if (!xenkbc_wait_output(xi)) {
+ printf("kbc: aux echo error 2\n");
+ goto nomouse;
+ }
+ XI_SETHASAUX(xi, 1);
+ xenkbc_send_data(xi, 0x5a); /* a random value */
+ res = xenkbc_poll_data1(xi, PCKBPORT_AUX_SLOT);
+ if (res != -1) {
+ /*
+ * In most cases, the 0x5a gets echoed.
+ * Some older controllers (Gateway 2000 circa 1993)
+ * return 0xfe here.
+ * We are satisfied if there is anything in the
+ * aux output buffer.
+ */
+ if (xenkbc_attach_slot(xs, PCKBPORT_AUX_SLOT))
+ cmdbits |= KC8_MENABLE;
+ } else {
+#ifdef XENKBCDEBUG
+ printf("kbc: aux echo test failed\n");
+#endif
+ XI_SETHASAUX(xi, 0);
+ }
+
+ nomouse:
+ /* enable needed interrupts */
+ xi->xi_8042cmdbyte |= cmdbits;
+ if (!xenkbc_put8042cmd(xi))
+ printf("kbc: cmd word write error\n");
+}
+
+static void
+xenkbc_init_slotdata(struct xenkbc_slotdata *xsd)
+{
+
+ xsd->xsd_polling = 0;
+}
+
+/*
+ * Get the current command byte.
+ */
+static int
+xenkbc_get8042cmd(struct xenkbc_internal *xi)
+{
+ int data;
+
+ if (!xenkbc_send_cmd(xi, K_RDCMDBYTE))
+ return 0;
+ data = xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT);
+ if (data == -1)
+ return 0;
+ xi->xi_8042cmdbyte = data;
+ return 1;
+}
+
+/*
+ * Pass command byte to keyboard controller (8042).
+ */
+static int
+xenkbc_put8042cmd(struct xenkbc_internal *xi)
+{
+
+ if (!xenkbc_send_cmd(xi, K_LDCMDBYTE))
+ return 0;
+ if (!xenkbc_wait_output(xi))
+ return 0;
+ return xenkbc_send_data(xi, xi->xi_8042cmdbyte);
+}
+
+static int
+xenkbc_send_devcmd(void *cookie, pckbport_slot_t slot, u_char devcmd)
+{
+
+ DPRINTF(("send_devcmd %x\n", devcmd));
+
+ if (slot == PCKBPORT_AUX_SLOT) {
+ if (!xenkbc_send_cmd(cookie, KBC_AUXWRITE)) {
+ DPRINTF(("xenkbc_send_devcmd: KBC_AUXWRITE failed\n"));
+ return 0;
+ }
+ }
+ if (!xenkbc_wait_output(cookie)) {
+ DPRINTF(("xenkbc_send_devcmd: wait_output failed\n"));
+ return 0;
+ }
+ return xenkbc_send_data(cookie, devcmd);
+}
+
+static int
+xenkbc_send_cmd(void *cookie, u_char cmd)
+{
+ struct xenkbc_internal *xi = cookie;
+
+ DPRINTF(("send_cmd %x\n", cmd));
+ xenkbc_wait_output(xi);
+ return !HYPERVISOR_kbd_op(KBD_OP_WRITECOMMAND, cmd);
+}
+
+static int
+xenkbc_send_data(void *cookie, u_char output)
+{
+ struct xenkbc_internal *xi = cookie;
+
+ DPRINTF(("send_data %x\n", output));
+ xenkbc_wait_output(xi);
+ return !HYPERVISOR_kbd_op(KBD_OP_WRITEOUTPUT, output);
+}
+
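+/*
+ * Poll for a byte destined for the given slot, discarding data that
+ * belongs to the other slot.  A byte stashed by the interrupt handler
+ * while polling is returned first.  Returns -1 on timeout.
+ */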
+static int
+xenkbc_poll_data1(void *cookie, pckbport_slot_t slot)
+{
+ struct xenkbc_internal *xi = cookie;
+ struct xenkbc_slotdata *xsd = xi->xi_slotdata[slot];
+ int s;
+ u_char stat, c;
+ int i = 1000;
+
+ s = splhigh();
+
+ if (xsd && xsd->xsd_polling && xsd->xsd_poll_data != -1 &&
+ xsd->xsd_poll_stat != -1) {
+ stat = xsd->xsd_poll_stat;
+ c = xsd->xsd_poll_data;
+ xsd->xsd_poll_data = -1;
+ xsd->xsd_poll_stat = -1;
+ goto process;
+ }
+
+ DELAY(10);
+ for (; i; i--) {
+ stat = xenkbc_getstatus(xi);
+ if (stat & KBS_DIB) {
+ c = xi->xi_data;
+ DELAY(10);
+ process:
+ if (XI_HASAUX(xi) && (stat & 0x20)) { /* aux data */
+ if (slot != PCKBPORT_AUX_SLOT) {
+#ifdef XENKBCDEBUG
+ printf("lost aux 0x%x\n", c);
+#endif
+ continue;
+ }
+ } else {
+ if (slot == PCKBPORT_AUX_SLOT) {
+#ifdef XENKBCDEBUG
+ printf("lost kbd 0x%x\n", c);
+#endif
+ continue;
+ }
+ }
+ splx(s);
+ DPRINTF(("poll -> %x stat %x\n", c, stat));
+ return c;
+ }
+ }
+
+ DPRINTF(("poll failed -> -1\n"));
+ splx(s);
+ return -1;
+}
+
+/*
+ * switch scancode translation on / off
+ * return nonzero on success
+ */
+static int
+xenkbc_xt_translation(void *cookie, pckbport_slot_t slot, int on)
+{
+ struct xenkbc_internal *xi = cookie;
+ int ison;
+
+ if (slot != PCKBPORT_KBD_SLOT) {
+ /* translation only for kbd slot */
+ if (on)
+ return 0;
+ else
+ return 1;
+ }
+
+ ison = xi->xi_8042cmdbyte & KC8_TRANS;
+ if ((on && ison) || (!on && !ison))
+ return 1;
+
+ xi->xi_8042cmdbyte ^= KC8_TRANS;
+ if (!xenkbc_put8042cmd(xi))
+ return 0;
+
+ /* read back to be sure */
+ if (!xenkbc_get8042cmd(xi))
+ return 0;
+
+ ison = xi->xi_8042cmdbyte & KC8_TRANS;
+ if ((on && ison) || (!on && !ison))
+ return 1;
+ return 0;
+}
+
+static const struct xenkbc_portcmd {
+ u_char cmd_en, cmd_dis;
+} xenkbc_portcmd[2] = {
+ {
+ KBC_KBDENABLE, KBC_KBDDISABLE,
+ }, {
+ KBC_AUXENABLE, KBC_AUXDISABLE,
+ }
+};
+
+static void
+xenkbc_slot_enable(void *cookie, pckbport_slot_t slot, int on)
+{
+ struct xenkbc_internal *xi = cookie;
+ const struct xenkbc_portcmd *cmd;
+
+ cmd = &xenkbc_portcmd[slot];
+
+ DPRINTF(("slot enable %d -> %d\n", slot, on));
+ xenkbc_send_cmd(xi, on ? cmd->cmd_en : cmd->cmd_dis);
+}
+
+
+static void
+xenkbc_intr_establish(void *cookie, pckbport_slot_t slot)
+{
+
+}
+
+static void
+xenkbc_set_poll(void *cookie, pckbport_slot_t slot, int on)
+{
+ struct xenkbc_internal *xi = cookie;
+
+ DPRINTF(("xenkbc_set_poll %d -> %d\n", slot, on));
+
+ xi->xi_slotdata[slot]->xsd_polling = on;
+
+ if (on) {
+ xi->xi_slotdata[slot]->xsd_poll_data = -1;
+ xi->xi_slotdata[slot]->xsd_poll_stat = -1;
+ } else {
+ int s;
+
+ /*
+ * If disabling polling on a device that's been configured,
+ * make sure there are no bytes left in the FIFO, holding up
+ * the interrupt line. Otherwise we won't get any further
+ * interrupts.
+ */
+ s = spltty();
+ xenkbc_intr(xi);
+ splx(s);
+ }
+}
+
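+/*
+ * Interrupt handler: drain pending keyboard/aux data and hand each byte
+ * to pckbport, or stash it for xenkbc_poll_data1() when the slot is
+ * being polled.
+ */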
+static int
+xenkbc_intr(void *self)
+{
+ struct xenkbc_internal *xi = self;
+ u_char stat;
+ pckbport_slot_t slot;
+ struct xenkbc_slotdata *xsd;
+ int served = 0;
+
+ for (;;) {
+ stat = xenkbc_getstatus(xi);
+ if (!(stat & KBS_DIB))
+ break;
+
+ served = 1;
+
+ slot = (XI_HASAUX(xi) && (stat & 0x20)) ?
+ PCKBPORT_AUX_SLOT : PCKBPORT_KBD_SLOT;
+ xsd = xi->xi_slotdata[slot];
+
+ if (xsd == NULL)
+ continue;
+
+#if NRND > 0
+ rnd_add_uint32(&xsd->xsd_rnd_source,
+ (stat << 8) | xi->xi_data);
+#endif
+
+ if (xsd->xsd_polling) {
+ xsd->xsd_poll_data = xi->xi_data;
+ xsd->xsd_poll_stat = stat;
+ break; /* xenkbc_poll_data() will get it */
+ }
+
+ pckbportintr(xi->xi_pt, slot, xi->xi_data);
+ }
+
+ return served;
+}
+
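+/*
+ * Console attachment: initialise the command byte and console slot data
+ * so the keyboard can serve as the console before the xenkbc device
+ * attaches.
+ */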
+int
+xenkbc_cnattach(pckbport_slot_t slot)
+{
+ struct xenkbc_internal *xi = &xenkbc_consdata;
+ int ret;
+
+ /* flush */
+ (void) xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT);
+
+ /* init cmd byte, enable ports */
+ xenkbc_consdata.xi_8042cmdbyte = KC8_CPU;
+ if (!xenkbc_put8042cmd(xi)) {
+ printf("kbc: cmd word write error\n");
+ return EIO;
+ }
+
+ ret = pckbport_cnattach(xi, &xenkbc_ops, slot);
+
+ xi->xi_slotdata[slot] = &xenkbc_cons_slotdata;
+ xenkbc_init_slotdata(xi->xi_slotdata[slot]);
+ XI_SETCONSOLE(xi, 1);
+
+ return ret;
+}
diff --git a/netbsd-2.0-xen-sparse/sys/nfs/files.nfs b/netbsd-2.0-xen-sparse/sys/nfs/files.nfs
new file mode 100644
index 0000000000..228c0c890f
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/nfs/files.nfs
@@ -0,0 +1,34 @@
+# $NetBSD: files.nfs,v 1.3 2004/03/11 21:48:43 cl Exp $
+
+deffs fs_nfs.h NFS
+
+defflag opt_nfs_boot.h NFS_BOOT_BOOTP NFS_BOOT_BOOTPARAM NFS_BOOT_DHCP
+ NFS_BOOT_GATEWAY NFS_BOOT_TCP
+ NFS_BOOT_BOOTSTATIC
+
+defparam opt_nfs_boot.h NFS_BOOT_BOOTP_REQFILE NFS_BOOT_OPTIONS
+ NFS_BOOT_RWSIZE
+ NFS_BOOTSTATIC_MYIP NFS_BOOTSTATIC_GWIP
+ NFS_BOOTSTATIC_MASK NFS_BOOTSTATIC_SERVADDR
+ NFS_BOOTSTATIC_SERVER
+
+defflag opt_nfs.h NFS_V2_ONLY
+
+defflag NFSSERVER
+
+file nfs/krpc_subr.c nfs
+file nfs/nfs_bio.c nfs
+file nfs/nfs_boot.c nfs
+file nfs/nfs_bootdhcp.c nfs & (nfs_boot_bootp | nfs_boot_dhcp)
+file nfs/nfs_bootparam.c nfs & nfs_boot_bootparam
+file nfs/nfs_bootstatic.c nfs & nfs_boot_bootstatic
+file nfs/nfs_kq.c nfs
+file nfs/nfs_node.c nfs
+file nfs/nfs_nqlease.c nfsserver | nfs
+file nfs/nfs_serv.c nfsserver
+file nfs/nfs_socket.c nfsserver | nfs
+file nfs/nfs_srvcache.c nfsserver
+file nfs/nfs_subs.c nfsserver | nfs
+file nfs/nfs_syscalls.c nfsserver | nfs
+file nfs/nfs_vfsops.c nfs
+file nfs/nfs_vnops.c nfs