diff options
Diffstat (limited to 'netbsd-2.0-xen-sparse')
27 files changed, 20351 insertions, 0 deletions
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN new file mode 100644 index 0000000000..2fbb9998ac --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN @@ -0,0 +1,176 @@ +# $NetBSD: XEN,v 1.1.2.2 2004/07/15 20:19:34 he Exp $ + +include "arch/xen/conf/std.xen" + +options INCLUDE_CONFIG_FILE # embed config file in kernel binary + +#options UVMHIST +#options UVMHIST_PRINT +#options SYSCALL_DEBUG + +maxusers 32 # estimated number of users + +# +options XEN +#options DOM0OPS +options HZ=50 + +#options I586_CPU +options I686_CPU + +#options VM86 # virtual 8086 emulation +#options USER_LDT # user-settable LDT; used by WINE + +#options MTRR # memory-type range register syscall support + +#options CONSDEVNAME="\"xencons\"" +#options CONS_OVERRIDE + +options INSECURE # disable kernel security levels - X needs this + +options RTC_OFFSET=0 # hardware clock is this many mins. west of GMT +#options NTP # NTP phase/frequency locked loop + +options KTRACE # system call tracing via ktrace(1) +#options SYSTRACE # system call vetting via systrace(1) + +options SYSVMSG # System V-like message queues +options SYSVSEM # System V-like semaphores +#options SEMMNI=10 # number of semaphore identifiers +#options SEMMNS=60 # number of semaphores in system +#options SEMUME=10 # max number of undo entries per process +#options SEMMNU=30 # number of undo structures in system +options SYSVSHM # System V-like memory sharing +#options SHMMAXPGS=2048 # 2048 pages is the default +options P1003_1B_SEMAPHORE # p1003.1b semaphore support + +options LKM # loadable kernel modules + +options USERCONF # userconf(4) support +options SYSCTL_INCLUDE_DESCR # Include sysctl descriptions in kernel + +# Diagnostic/debugging support options +options DIAGNOSTIC # expensive kernel consistency checks +options DEBUG # expensive debugging checks/support +options KMEMSTATS # kernel memory statistics (vmstat -m) +options DDB # in-kernel debugger +options 
DDB_ONPANIC=1 # see also sysctl(8): `ddb.onpanic' +options DDB_HISTORY_SIZE=512 # enable history editing in DDB +#options KGDB # remote debugger +#options KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600 +makeoptions DEBUG="-g" # compile full symbol table + +#options COMPAT_14 # NetBSD 1.4 +#options COMPAT_15 # NetBSD 1.5 +options COMPAT_16 # NetBSD 1.6 + +##options COMPAT_LINUX # binary compatibility with Linux +#options COMPAT_FREEBSD # binary compatibility with FreeBSD +#options COMPAT_MACH # binary compatibility with Mach binaries +#options COMPAT_DARWIN # binary compatibility with Darwin binaries +#options EXEC_MACHO # exec MACH-O binaries +#options COMPAT_PECOFF # kernel support to run Win32 apps + +file-system FFS # UFS +file-system EXT2FS # second extended file system (linux) +#file-system LFS # log-structured file system +#file-system MFS # memory file system +file-system NFS # Network File System client +#file-system NTFS # Windows/NT file system (experimental) +#file-system CD9660 # ISO 9660 + Rock Ridge file system +#file-system MSDOSFS # MS-DOS file system +file-system FDESC # /dev/fd +file-system KERNFS # /kern +file-system NULLFS # loopback file system +#file-system OVERLAY # overlay file system +#file-system PORTAL # portal filesystem (still experimental) +file-system PROCFS # /proc +#file-system UMAPFS # NULLFS + uid and gid remapping +#file-system UNION # union file system +#file-system SMBFS # experimental - CIFS; also needs nsmb (below) + +#options QUOTA # UFS quotas +#options SOFTDEP # FFS soft updates support. 
+#options NFSSERVER # Network File System server + +options GATEWAY # packet forwarding +options INET # IP + ICMP + TCP + UDP +options INET6 # IPV6 +options IPSEC # IP security +options IPSEC_ESP # IP security (encryption part; define w/IPSEC) +options MROUTING # IP multicast routing +options PFIL_HOOKS # pfil(9) packet filter hooks +options IPFILTER_LOG # ipmon(8) log support + +options NFS_BOOT_DHCP,NFS_BOOT_BOOTPARAM,NFS_BOOT_BOOTSTATIC +#options NFS_BOOTSTATIC_MYIP="\"169.254.1.2\"" +#options NFS_BOOTSTATIC_GWIP="\"169.254.1.1\"" +#options NFS_BOOTSTATIC_MASK="\"255.255.255.0\"" +#options NFS_BOOTSTATIC_SERVADDR="\"169.254.1.1\"" +#options NFS_BOOTSTATIC_SERVER="\"server:/path/to/root\"" + +options WSEMUL_VT100 # VT100 / VT220 emulation +options WS_KERNEL_FG=WSCOL_GREEN +options WSDISPLAY_COMPAT_PCVT # emulate some ioctls +options WSDISPLAY_COMPAT_SYSCONS # emulate some ioctls +options WSDISPLAY_COMPAT_USL # VT handling +options WSDISPLAY_COMPAT_RAWKBD # can get raw scancodes +options WSDISPLAY_DEFAULTSCREENS=4 +options PCDISPLAY_SOFTCURSOR + +config netbsd root on ? type ? +#config netbsd root on wd0a type ffs +#config netbsd root on xennet0 type nfs + +mainbus0 at root + +cpu* at mainbus? + +hypervisor* at mainbus? # Xen hypervisor + +npx0 at hypervisor? # x86 math coprocessor + +xencons* at hypervisor? # Xen virtual console +xennet* at hypervisor? # Xen virtual network interface + +#xbd* at hypervisor? # Xen virtual block device +#wd* at hypervisor? # Xen vbd (wd identity) +#sd* at hypervisor? # Xen vbd (sd identity) +#cd* at hypervisor? # Xen vbd (cd identity) + +#xenkbc* at hypervisor? # Xen Keyboard/Mouse Interface +#pckbd* at xenkbc? # Keyboard +#vga* at hypervisor? # Xen VGA display +#pms* at xenkbc? # PS/2 Mouse for wsmouse + +#wskbd* at pckbd? console ? +#wsdisplay* at vga? console ? +#wsmouse* at pms? 
mux 0 + + +include "arch/xen/conf/GENERIC.local" + + +pseudo-device ccd 4 # concatenated/striped disk devices +#pseudo-device cgd 4 # cryptographic disk devices +#pseudo-device md 1 # memory disk device (ramdisk) +#pseudo-device vnd 4 # disk-like interface to files + +pseudo-device bpfilter 8 # Berkeley packet filter +pseudo-device ipfilter # IP filter (firewall) and NAT +pseudo-device loop # network loopback +#pseudo-device tun 2 # network tunneling over tty +#pseudo-device gre 2 # generic L3 over IP tunnel +#pseudo-device gif 4 # IPv[46] over IPv[46] tunnel (RFC1933) +#pseudo-device faith 1 # IPv[46] tcp relay translation i/f +#pseudo-device stf 1 # 6to4 IPv6 over IPv4 encapsulation +#pseudo-device vlan # IEEE 802.1q encapsulation +#pseudo-device bridge # simple inter-network bridging + +pseudo-device pty # pseudo-terminals +pseudo-device rnd # /dev/random and in-kernel generator +pseudo-device clockctl # user control of clock subsystem + +pseudo-device wsmux # mouse & keyboard multiplexor +pseudo-device wsfont +pseudo-device ksyms # /dev/ksyms diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen new file mode 100644 index 0000000000..12f6bfa1d5 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen @@ -0,0 +1,232 @@ +# $NetBSD: files.xen,v 1.3.2.1 2004/05/22 15:59:02 he Exp $ +# NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp +# NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp + +maxpartitions 8 + +maxusers 2 16 128 + +# Processor type options. +defflag opt_cputype.h I686_CPU + +# delay before cpu_reset() for reboot. 
+defparam CPURESET_DELAY + +# No unmapped page below kernel stack +defflag NOREDZONE + +# Beep on halt +defflag opt_beep.h BEEP_ONHALT +defparam opt_beep.h BEEP_ONHALT_COUNT +defparam opt_beep.h BEEP_ONHALT_PITCH BEEP_ONHALT_PERIOD + +file arch/xen/i386/autoconf.c +file arch/i386/i386/db_dbgreg.S ddb | kstack_check_dr0 +file arch/i386/i386/db_disasm.c ddb +file arch/i386/i386/db_interface.c ddb +file arch/i386/i386/db_memrw.c ddb | kgdb +file arch/i386/i386/db_trace.c ddb +file kern/subr_disk_mbr.c disk +file arch/xen/i386/gdt.c +file arch/xen/i386/hypervisor_machdep.c +file arch/i386/i386/in_cksum.S inet | inet6 +file arch/i386/i386/ipkdb_glue.c ipkdb +file arch/i386/i386/kgdb_machdep.c kgdb +file arch/xen/i386/machdep.c +file arch/xen/i386/identcpu.c +file arch/i386/i386/math_emulate.c math_emulate +file arch/i386/i386/mem.c +file kern/kern_microtime.c i586_cpu | i686_cpu +file arch/i386/i386/mtrr_k6.c mtrr +file netns/ns_cksum.c ns +file arch/xen/i386/pmap.c +file arch/i386/i386/process_machdep.c +file arch/i386/i386/procfs_machdep.c procfs +file arch/xen/i386/sys_machdep.c +file arch/i386/i386/syscall.c +file arch/xen/i386/trap.c +file arch/i386/i386/vm_machdep.c +file arch/xen/i386/xen_machdep.c + +file arch/xen/xen/xen_debug.c + +file arch/xen/xen/clock.c +file arch/xen/xen/evtchn.c +file arch/xen/xen/ctrl_if.c + +file dev/cons.c + +file arch/i386/i386/mptramp.S multiprocessor +file arch/i386/i386/ipifuncs.c multiprocessor + +file arch/i386/i386/pmc.c perfctrs + +file crypto/des/arch/i386/des_enc.S des +file crypto/des/arch/i386/des_cbc.S des + +file crypto/blowfish/arch/i386/bf_enc.S blowfish +file crypto/blowfish/arch/i386/bf_cbc.S blowfish & !i386_cpu + +# +# Machine-independent SCSI drivers +# + +#xxx include "dev/scsipi/files.scsipi" + +# +# Machine-independent ATA drivers +# + +#xxx include "dev/ata/files.ata" + +# Memory Disk for install floppy +file dev/md_root.c memory_disk_hooks + +# +define mainbus { [apid = -1] } + +file arch/x86/x86/bus_dma.c 
+file arch/xen/x86/bus_space.c +file arch/x86/x86/cacheinfo.c +file arch/xen/x86/consinit.c +file arch/xen/x86/intr.c +file arch/x86/x86/ipi.c multiprocessor +file arch/x86/x86/lock_machdep.c lockdebug +file arch/x86/x86/softintr.c + +include "arch/xen/conf/files.compat" + +# +# System bus types +# + +device mainbus: mainbus +attach mainbus at root +file arch/xen/i386/mainbus.c mainbus + +# Xen hypervisor +device hypervisor { } +attach hypervisor at mainbus +file arch/xen/xen/hypervisor.c hypervisor needs-flag + +# Numeric Processing Extension; Math Co-processor +device npx +file arch/xen/i386/npx.c npx needs-flag + +attach npx at hypervisor with npx_hv +file arch/xen/i386/npx_hv.c npx_hv + +# Xen console support +device xencons: tty +attach xencons at hypervisor +file arch/xen/xen/xencons.c xencons needs-flag + +include "dev/wscons/files.wscons" +include "dev/wsfont/files.wsfont" + +include "dev/pckbport/files.pckbport" + +# CPUS + +define cpu { [apid = -1] } +device cpu +attach cpu at mainbus +file arch/xen/i386/cpu.c cpu + +# +# Compatibility modules +# + +# VM86 mode +file arch/i386/i386/vm86.c vm86 + +# VM86 in kernel +file arch/i386/i386/kvm86.c kvm86 +file arch/i386/i386/kvm86call.S kvm86 + +# Binary compatibility with previous NetBSD releases (COMPAT_XX) +file arch/i386/i386/compat_13_machdep.c compat_13 | compat_aout +file arch/i386/i386/compat_16_machdep.c compat_16 | compat_ibcs2 + +# SVR4 binary compatibility (COMPAT_SVR4) +include "compat/svr4/files.svr4" +file arch/i386/i386/svr4_machdep.c compat_svr4 +file arch/i386/i386/svr4_sigcode.S compat_svr4 +file arch/i386/i386/svr4_syscall.c compat_svr4 + +# MACH binary compatibility (COMPAT_MACH) +include "compat/mach/files.mach" +file arch/i386/i386/mach_machdep.c compat_mach | compat_darwin +file arch/i386/i386/mach_sigcode.S compat_mach | compat_darwin +file arch/i386/i386/mach_syscall.c compat_mach | compat_darwin +file arch/i386/i386/macho_machdep.c exec_macho + +# DARWIN binary compatibility 
(COMPAT_DARWIN) +include "compat/darwin/files.darwin" +file arch/i386/i386/darwin_machdep.c compat_darwin + +# iBCS-2 binary compatibility (COMPAT_IBCS2) +include "compat/ibcs2/files.ibcs2" +file arch/i386/i386/ibcs2_machdep.c compat_ibcs2 +file arch/i386/i386/ibcs2_sigcode.S compat_ibcs2 +file arch/i386/i386/ibcs2_syscall.c compat_ibcs2 + +# Linux binary compatibility (COMPAT_LINUX) +include "compat/linux/files.linux" +include "compat/linux/arch/i386/files.linux_i386" +file arch/i386/i386/linux_sigcode.S compat_linux +file arch/i386/i386/linux_syscall.c compat_linux +file arch/i386/i386/linux_trap.c compat_linux + +# FreeBSD binary compatibility (COMPAT_FREEBSD) +include "compat/freebsd/files.freebsd" +file arch/i386/i386/freebsd_machdep.c compat_freebsd +file arch/i386/i386/freebsd_sigcode.S compat_freebsd +file arch/i386/i386/freebsd_syscall.c compat_freebsd + +# a.out binary compatibility (COMPAT_AOUT) +include "compat/aout/files.aout" + +# Win32 binary compatibility (COMPAT_PECOFF) +include "compat/pecoff/files.pecoff" + +# OSS audio driver compatibility +include "compat/ossaudio/files.ossaudio" + +# Xen devices + +# Network driver +device xennet: arp, ether, ifnet +attach xennet at hypervisor +file arch/xen/xen/if_xennet.c xennet needs-flag + +# Block device driver and wd/sd/cd identities +device xbd: disk +attach xbd at hypervisor +file arch/xen/xen/xbd.c xbd | wd | sd | cd needs-flag + +device wd: disk +attach wd at hypervisor + +device sd: disk +attach sd at hypervisor + +device cd: disk +attach cd at hypervisor + +# Keyboard +device xenkbc: pckbport +attach xenkbc at hypervisor +file arch/xen/xen/xenkbc.c xenkbc needs-flag + +# Generic VGA +attach vga at hypervisor with vga_xen +file arch/xen/xen/vga_xen.c vga_xen needs-flag + +# Domain-0 operations +defflag opt_xen.h DOM0OPS +file arch/xen/xen/machmem.c dom0ops +file arch/xen/xen/privcmd.c dom0ops +file arch/xen/xen/vfr.c dom0ops + +include "arch/xen/conf/majors.i386" diff --git 
a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c new file mode 100644 index 0000000000..766b7aaee2 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c @@ -0,0 +1,630 @@ +/* $NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $ */ +/* NetBSD: autoconf.c,v 1.75 2003/12/30 12:33:22 pk Exp */ + +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)autoconf.c 7.1 (Berkeley) 5/9/91 + */ + +/* + * Setup the system to run on the current machine. + * + * Configure() is called at boot time and initializes the vba + * device tables and the memory controller monitoring. Available + * devices are determined (from possibilities mentioned in ioconf.c), + * and the drivers are initialized. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $"); + +#include "opt_compat_oldboot.h" +#include "opt_multiprocessor.h" +#include "opt_nfs_boot.h" +#include "xennet.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/disklabel.h> +#include <sys/conf.h> +#ifdef COMPAT_OLDBOOT +#include <sys/reboot.h> +#endif +#include <sys/device.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/dkio.h> +#include <sys/proc.h> +#include <sys/user.h> + +#ifdef NFS_BOOT_BOOTSTATIC +#include <net/if.h> +#include <net/if_ether.h> +#include <netinet/in.h> +#include <nfs/rpcv2.h> +#include <nfs/nfsproto.h> +#include <nfs/nfs.h> +#include <nfs/nfsmount.h> +#include <nfs/nfsdiskless.h> +#include <machine/if_xennetvar.h> +#endif + +#include <machine/pte.h> +#include <machine/cpu.h> +#include <machine/gdt.h> +#include <machine/pcb.h> +#include <machine/bootinfo.h> + +#include "ioapic.h" +#include "lapic.h" + +#if NIOAPIC > 0 +#include <machine/i82093var.h> +#endif + +#if NLAPIC > 0 
+#include <machine/i82489var.h> +#endif + +static int match_harddisk(struct device *, struct btinfo_bootdisk *); +static void matchbiosdisks(void); +static void findroot(void); +static int is_valid_disk(struct device *); + +extern struct disklist *i386_alldisks; +extern int i386_ndisks; + +#include "bios32.h" +#if NBIOS32 > 0 +#include <machine/bios32.h> +#endif + +#include "opt_pcibios.h" +#ifdef PCIBIOS +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> +#include <i386/pci/pcibios.h> +#endif + +#include "opt_kvm86.h" +#ifdef KVM86 +#include <machine/kvm86.h> +#endif + +#include "opt_xen.h" + +struct device *booted_device; +int booted_partition; + +/* + * Determine i/o configuration for a machine. + */ +void +cpu_configure(void) +{ + + startrtclock(); + +#if NBIOS32 > 0 + bios32_init(); +#endif +#ifdef PCIBIOS + pcibios_init(); +#endif + + /* kvm86 needs a TSS */ + i386_proc0_tss_ldt_init(); +#ifdef KVM86 + kvm86_init(); +#endif + + if (config_rootfound("mainbus", NULL) == NULL) + panic("configure: mainbus not configured"); + +#ifdef INTRDEBUG + intr_printconfig(); +#endif + +#if NIOAPIC > 0 + lapic_set_lvt(); + ioapic_enable(); +#endif + /* resync cr0 after FPU configuration */ + lwp0.l_addr->u_pcb.pcb_cr0 = rcr0(); +#ifdef MULTIPROCESSOR + /* propagate this to the idle pcb's. */ + cpu_init_idle_pcbs(); +#endif + + spl0(); +#if NLAPIC > 0 + lapic_tpr = 0; +#endif +} + +void +cpu_rootconf(void) +{ + findroot(); + matchbiosdisks(); + + printf("boot device: %s\n", + booted_device ? booted_device->dv_xname : "<unknown>"); + + setroot(booted_device, booted_partition); +} + +/* + * XXX ugly bit of code. But, this is the only safe time that the + * match between BIOS disks and native disks can be done. 
+ */ +static void +matchbiosdisks(void) +{ + struct btinfo_biosgeom *big; + struct bi_biosgeom_entry *be; + struct device *dv; + int i, ck, error, m, n; + struct vnode *tv; + char mbr[DEV_BSIZE]; + int dklist_size; + int bmajor; + + big = lookup_bootinfo(BTINFO_BIOSGEOM); + + if (big == NULL) + return; + + /* + * First, count all native disks + */ + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) + if (is_valid_disk(dv)) + i386_ndisks++; + + if (i386_ndisks == 0) + return; + + dklist_size = sizeof (struct disklist) + (i386_ndisks - 1) * + sizeof (struct nativedisk_info); + + /* XXX M_TEMP is wrong */ + i386_alldisks = malloc(dklist_size, M_TEMP, M_NOWAIT); + if (i386_alldisks == NULL) + return; + + memset(i386_alldisks, 0, dklist_size); + + i386_alldisks->dl_nnativedisks = i386_ndisks; + i386_alldisks->dl_nbiosdisks = big->num; + for (i = 0; i < big->num; i++) { + i386_alldisks->dl_biosdisks[i].bi_dev = big->disk[i].dev; + i386_alldisks->dl_biosdisks[i].bi_sec = big->disk[i].sec; + i386_alldisks->dl_biosdisks[i].bi_head = big->disk[i].head; + i386_alldisks->dl_biosdisks[i].bi_cyl = big->disk[i].cyl; + i386_alldisks->dl_biosdisks[i].bi_lbasecs = big->disk[i].totsec; + i386_alldisks->dl_biosdisks[i].bi_flags = big->disk[i].flags; +#ifdef GEOM_DEBUG +#ifdef NOTYET + printf("disk %x: flags %x, interface %x, device %llx\n", + big->disk[i].dev, big->disk[i].flags, + big->disk[i].interface_path, big->disk[i].device_path); +#endif +#endif + } + + /* + * XXX code duplication from findroot() + */ + n = -1; + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (dv->dv_class != DV_DISK) + continue; +#ifdef GEOM_DEBUG + printf("matchbiosdisks: trying to match (%s) %s\n", + dv->dv_xname, dv->dv_cfdata->cf_name); +#endif + if (is_valid_disk(dv)) { + n++; + sprintf(i386_alldisks->dl_nativedisks[n].ni_devname, + "%s%d", dv->dv_cfdata->cf_name, + dv->dv_unit); + + bmajor = devsw_name2blk(dv->dv_xname, NULL, 0); + if (bmajor == -1) + return; 
+ + if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, RAW_PART), + &tv)) + panic("matchbiosdisks: can't alloc vnode"); + + error = VOP_OPEN(tv, FREAD, NOCRED, 0); + if (error) { + vput(tv); + continue; + } + error = vn_rdwr(UIO_READ, tv, mbr, DEV_BSIZE, 0, + UIO_SYSSPACE, 0, NOCRED, NULL, 0); + VOP_CLOSE(tv, FREAD, NOCRED, 0); + if (error) { +#ifdef GEOM_DEBUG + printf("matchbiosdisks: %s: MBR read failure\n", + dv->dv_xname); +#endif + continue; + } + + for (ck = i = 0; i < DEV_BSIZE; i++) + ck += mbr[i]; + for (m = i = 0; i < big->num; i++) { + be = &big->disk[i]; +#ifdef GEOM_DEBUG + printf("match %s with %d ", dv->dv_xname, i); + printf("dev ck %x bios ck %x\n", ck, be->cksum); +#endif + if (be->flags & BI_GEOM_INVALID) + continue; + if (be->cksum == ck && + !memcmp(&mbr[MBR_PART_OFFSET], be->dosparts, + MBR_PART_COUNT * + sizeof (struct mbr_partition))) { +#ifdef GEOM_DEBUG + printf("matched bios disk %x with %s\n", + be->dev, dv->dv_xname); +#endif + i386_alldisks->dl_nativedisks[n]. + ni_biosmatches[m++] = i; + } + } + i386_alldisks->dl_nativedisks[n].ni_nmatches = m; + vput(tv); + } + } +} + +#ifdef COMPAT_OLDBOOT +u_long bootdev = 0; /* should be dev_t, but not until 32 bits */ +#endif + +/* + * helper function for "findroot()": + * return nonzero if disk device matches bootinfo + */ +static int +match_harddisk(struct device *dv, struct btinfo_bootdisk *bid) +{ + struct vnode *tmpvn; + int error; + struct disklabel label; + int found = 0; + int bmajor; + + /* + * A disklabel is required here. The + * bootblocks don't refuse to boot from + * a disk without a label, but this is + * normally not wanted. + */ + if (bid->labelsector == -1) + return(0); + + /* + * lookup major number for disk block device + */ + bmajor = devsw_name2blk(dv->dv_xname, NULL, 0); + if (bmajor == -1) + return(0); /* XXX panic() ??? */ + + /* + * Fake a temporary vnode for the disk, open + * it, and read the disklabel for comparison. 
+ */ + if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, bid->partition), &tmpvn)) + panic("findroot can't alloc vnode"); + error = VOP_OPEN(tmpvn, FREAD, NOCRED, 0); + if (error) { +#ifndef DEBUG + /* + * Ignore errors caused by missing + * device, partition or medium. + */ + if (error != ENXIO && error != ENODEV) +#endif + printf("findroot: can't open dev %s%c (%d)\n", + dv->dv_xname, 'a' + bid->partition, error); + vput(tmpvn); + return(0); + } + error = VOP_IOCTL(tmpvn, DIOCGDINFO, &label, FREAD, NOCRED, 0); + if (error) { + /* + * XXX can't happen - open() would + * have errored out (or faked up one) + */ + printf("can't get label for dev %s%c (%d)\n", + dv->dv_xname, 'a' + bid->partition, error); + goto closeout; + } + + /* compare with our data */ + if (label.d_type == bid->label.type && + label.d_checksum == bid->label.checksum && + !strncmp(label.d_packname, bid->label.packname, 16)) + found = 1; + +closeout: + VOP_CLOSE(tmpvn, FREAD, NOCRED, 0); + vput(tmpvn); + return(found); +} + +/* + * Attempt to find the device from which we were booted. + * If we can do so, and not instructed not to do so, + * change rootdev to correspond to the load device. + */ +void +findroot(void) +{ + struct btinfo_bootdisk *bid; + struct device *dv; + union xen_cmdline_parseinfo xcp; +#ifdef COMPAT_OLDBOOT + int i, majdev, unit, part; + char buf[32]; +#endif + + if (booted_device) + return; + + if (lookup_bootinfo(BTINFO_NETIF)) { + /* + * We got netboot interface information, but + * "device_register()" couldn't match it to a configured + * device. Bootdisk information cannot be present at the + * same time, so give up. + */ + printf("findroot: netboot interface not found\n"); + return; + } + + bid = lookup_bootinfo(BTINFO_BOOTDISK); + if (bid) { + /* + * Scan all disk devices for ones that match the passed data. + * Don't break if one is found, to get possible multiple + * matches - for problem tracking. 
Use the first match anyway + * because lower device numbers are more likely to be the + * boot device. + */ + for (dv = alldevs.tqh_first; dv != NULL; + dv = dv->dv_list.tqe_next) { + if (dv->dv_class != DV_DISK) + continue; + + if (!strcmp(dv->dv_cfdata->cf_name, "fd")) { + /* + * Assume the configured unit number matches + * the BIOS device number. (This is the old + * behaviour.) Needs some ideas how to handle + * BIOS's "swap floppy drive" options. + */ + if ((bid->biosdev & 0x80) || + dv->dv_unit != bid->biosdev) + continue; + + goto found; + } + + if (is_valid_disk(dv)) { + /* + * Don't trust BIOS device numbers, try + * to match the information passed by the + * bootloader instead. + */ + if ((bid->biosdev & 0x80) == 0 || + !match_harddisk(dv, bid)) + continue; + + goto found; + } + + /* no "fd", "wd", "sd", "ld", "ed" */ + continue; + +found: + if (booted_device) { + printf("warning: double match for boot " + "device (%s, %s)\n", + booted_device->dv_xname, dv->dv_xname); + continue; + } + booted_device = dv; + booted_partition = bid->partition; + } + + if (booted_device) + return; + } + + xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp); + + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (is_valid_disk(dv) == 0) + continue; + + if (xcp.xcp_bootdev[0] == 0) { + booted_device = dv; + break; + } + + if (strncmp(xcp.xcp_bootdev, dv->dv_xname, + strlen(dv->dv_xname))) + continue; + + if (strlen(xcp.xcp_bootdev) > strlen(dv->dv_xname)) { + booted_partition = toupper( + xcp.xcp_bootdev[strlen(dv->dv_xname)]) - 'A'; + } + + booted_device = dv; + break; + } + + if (booted_device) + return; + +#ifdef COMPAT_OLDBOOT +#if 0 + printf("howto %x bootdev %x ", boothowto, bootdev); +#endif + + if ((bootdev & B_MAGICMASK) != (u_long)B_DEVMAGIC) + return; + + majdev = (bootdev >> B_TYPESHIFT) & B_TYPEMASK; + name = devsw_blk2name(majdev); + if (name == NULL) + return; + + part = (bootdev >> B_PARTITIONSHIFT) & B_PARTITIONMASK; + unit = (bootdev >> 
B_UNITSHIFT) & B_UNITMASK; + + sprintf(buf, "%s%d", name, unit); + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (strcmp(buf, dv->dv_xname) == 0) { + booted_device = dv; + booted_partition = part; + return; + } + } +#endif +} + +#include "pci.h" + +#include <dev/isa/isavar.h> +#if NPCI > 0 +#include <dev/pci/pcivar.h> +#endif + +void +device_register(struct device *dev, void *aux) +{ + /* + * Handle network interfaces here, the attachment information is + * not available driver independantly later. + * For disks, there is nothing useful available at attach time. + */ +#if NXENNET > 0 + if (dev->dv_class == DV_IFNET) { + union xen_cmdline_parseinfo xcp; + + xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp); + if (strncmp(xcp.xcp_bootdev, dev->dv_xname, 16) == 0) { +#ifdef NFS_BOOT_BOOTSTATIC + nfs_bootstatic_callback = xennet_bootstatic_callback; +#endif + goto found; + } + } +#endif + if (dev->dv_class == DV_IFNET) { + struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF); + if (bin == NULL) + return; + + /* + * We don't check the driver name against the device name + * passed by the boot ROM. The ROM should stay usable + * if the driver gets obsoleted. + * The physical attachment information (checked below) + * must be sufficient to identify the device. + */ + + if (bin->bus == BI_BUS_ISA && + !strcmp(dev->dv_parent->dv_cfdata->cf_name, "isa")) { + struct isa_attach_args *iaa = aux; + + /* compare IO base address */ + /* XXXJRT what about multiple I/O addrs? */ + if (iaa->ia_nio > 0 && + bin->addr.iobase == iaa->ia_io[0].ir_addr) + goto found; + } +#if NPCI > 0 + if (bin->bus == BI_BUS_PCI && + !strcmp(dev->dv_parent->dv_cfdata->cf_name, "pci")) { + struct pci_attach_args *paa = aux; + int b, d, f; + + /* + * Calculate BIOS representation of: + * + * <bus,device,function> + * + * and compare. 
+ */ + pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f); + if (bin->addr.tag == ((b << 8) | (d << 3) | f)) + goto found; + } +#endif + } + return; + +found: + if (booted_device) { + /* XXX should be a "panic()" */ + printf("warning: double match for boot device (%s, %s)\n", + booted_device->dv_xname, dev->dv_xname); + return; + } + booted_device = dev; +} + +static int +is_valid_disk(struct device *dv) +{ + const char *name; + + if (dv->dv_class != DV_DISK) + return (0); + + name = dv->dv_cfdata->cf_name; + + return (strcmp(name, "sd") == 0 || strcmp(name, "wd") == 0 || + strcmp(name, "ld") == 0 || strcmp(name, "ed") == 0 || + strcmp(name, "xbd") == 0); +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c new file mode 100644 index 0000000000..23dd52f1d3 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c @@ -0,0 +1,408 @@ +/* $NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $ */ +/* NetBSD: gdt.c,v 1.32 2004/02/13 11:36:13 wiz Exp */ + +/*- + * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by John T. Kohl and Charles M. Hannum. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. 
Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $"); + +#include "opt_multiprocessor.h" +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/user.h> + +#include <uvm/uvm.h> + +#include <machine/gdt.h> + +int gdt_size[2]; /* total number of GDT entries */ +int gdt_count[2]; /* number of GDT entries in use */ +int gdt_next[2]; /* next available slot for sweeping */ +int gdt_free[2]; /* next free slot; terminated with GNULL_SEL */ + +struct lock gdt_lock_store; + +static __inline void gdt_lock(void); +static __inline void gdt_unlock(void); +void gdt_init(void); +void gdt_grow(int); +int gdt_get_slot(void); +int gdt_get_slot1(int); +void gdt_put_slot(int); +void gdt_put_slot1(int, int); + +/* + * Lock and unlock the GDT, to avoid races in case gdt_{ge,pu}t_slot() sleep + * waiting for memory. 
 *
 * Note that the locking done here is not sufficient for multiprocessor
 * systems.  A freshly allocated slot will still be of type SDT_SYSNULL for
 * some time after the GDT is unlocked, so gdt_compact() could attempt to
 * reclaim it.
 */
/* Take the global GDT lock (may sleep; see comment above). */
static __inline void
gdt_lock()
{

	(void) lockmgr(&gdt_lock_store, LK_EXCLUSIVE, NULL);
}

/* Release the global GDT lock. */
static __inline void
gdt_unlock()
{

	(void) lockmgr(&gdt_lock_store, LK_RELEASE, NULL);
}

/*
 * Install one segment descriptor into slot 'sel' of every CPU's GDT.
 * On Xen the descriptor must be written via the hypervisor, since the
 * GDT pages are mapped read-only for the guest.
 * TSS descriptors are rejected here: under Xen the TSS does not live in
 * the guest GDT (see the disabled printk below).
 */
void
setgdt(int sel, void *base, size_t limit,
    int type, int dpl, int def32, int gran)
{
	struct segment_descriptor sd;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	if (type == SDT_SYS386TSS) {
		/* printk("XXX TSS descriptor not supported in GDT\n"); */
		return;
	}

	setsegment(&sd, base, limit, type, dpl, def32, gran);
	/* Propagate to every CPU that already has a shadow GDT. */
	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_gdt != NULL) {
#ifndef XEN
			ci->ci_gdt[sel].sd = sd;
#else
			xen_update_descriptor(&ci->ci_gdt[sel],
			    (union descriptor *)&sd);
#endif
		}
	}
}

/*
 * Initialize the GDT subsystem.  Called from autoconf().
 */
void
gdt_init()
{
	size_t max_len, min_len;
	union descriptor *old_gdt;
	struct vm_page *pg;
	vaddr_t va;
	struct cpu_info *ci = &cpu_info_primary;

	lockinit(&gdt_lock_store, PZERO, "gdtlck", 0, 0);

	max_len = MAXGDTSIZ * sizeof(gdt[0]);
	min_len = MINGDTSIZ * sizeof(gdt[0]);

	/* Region 0: ordinary descriptors, starts populated with NGDT slots. */
	gdt_size[0] = MINGDTSIZ;
	gdt_count[0] = NGDT;
	gdt_next[0] = NGDT;
	gdt_free[0] = GNULL_SEL;

	/*
	 * Region 1: second descriptor area (used by ldt_alloc() via
	 * gdt_get_slot1(1)); lives at VA offset max_len from the start of
	 * the GDT mapping — see gdt_grow().
	 */
	gdt_size[1] = 0;
	gdt_count[1] = MAXGDTSIZ;
	gdt_next[1] = MAXGDTSIZ;
	gdt_free[1] = GNULL_SEL;

	/*
	 * Reserve VA for both regions (max_len + max_len), but only back
	 * the first min_len bytes with pages now; gdt_grow() maps more
	 * on demand.
	 */
	old_gdt = gdt;
	gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len + max_len);
	for (va = (vaddr_t)gdt; va < (vaddr_t)gdt + min_len; va += PAGE_SIZE) {
		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
		if (pg == NULL) {
			panic("gdt_init: no pages");
		}
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ | VM_PROT_WRITE);
	}
	memcpy(gdt, old_gdt, NGDT * sizeof(gdt[0]));
	ci->ci_gdt = gdt;
	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
	    SDT_MEMRWA, SEL_KPL, 1, 1);

	gdt_init_cpu(ci);
}

/*
 * Allocate shadow GDT for a slave CPU.
 */
void
gdt_alloc_cpu(struct cpu_info *ci)
{
	int max_len = MAXGDTSIZ * sizeof(gdt[0]);
	int min_len = MINGDTSIZ * sizeof(gdt[0]);
	struct vm_page *pg;
	vaddr_t va;

	ci->ci_gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len);
	/* Back the initial min_len bytes; sleep for pages if needed. */
	for (va = (vaddr_t)ci->ci_gdt; va < (vaddr_t)ci->ci_gdt + min_len;
	    va += PAGE_SIZE) {
		while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO))
		       == NULL) {
			uvm_wait("gdt_alloc_cpu");
		}
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ | VM_PROT_WRITE);
	}
	/* Start from the primary CPU's current descriptors. */
	memset(ci->ci_gdt, 0, min_len);
	memcpy(ci->ci_gdt, gdt, gdt_count[0] * sizeof(gdt[0]));
	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
	    SDT_MEMRWA, SEL_KPL, 1, 1);
}


/*
 * Load appropriate gdt descriptor; we better be running on *ci
 * (for the most part, this is how a CPU knows who it is).
+ */ +void +gdt_init_cpu(struct cpu_info *ci) +{ +#ifndef XEN + struct region_descriptor region; + size_t max_len; + + max_len = MAXGDTSIZ * sizeof(gdt[0]); + setregion(®ion, ci->ci_gdt, max_len - 1); + lgdt(®ion); +#else + size_t len = gdt_size[0] * sizeof(gdt[0]); + unsigned long frames[len >> PAGE_SHIFT]; + vaddr_t va; + pt_entry_t *ptp; + pt_entry_t *maptp; + int f; + + for (va = (vaddr_t)ci->ci_gdt, f = 0; + va < (vaddr_t)ci->ci_gdt + len; + va += PAGE_SIZE, f++) { + KASSERT(va >= VM_MIN_KERNEL_ADDRESS); + ptp = kvtopte(va); + frames[f] = *ptp >> PAGE_SHIFT; + maptp = (pt_entry_t *)vtomach((vaddr_t)ptp); + PTE_CLEARBITS(ptp, maptp, PG_RW); + } + PTE_UPDATES_FLUSH(); + /* printk("loading gdt %x, %d entries, %d pages", */ + /* frames[0] << PAGE_SHIFT, gdt_size[0], len >> PAGE_SHIFT); */ + if (HYPERVISOR_set_gdt(frames, gdt_size[0])) + panic("HYPERVISOR_set_gdt failed!\n"); + lgdt_finish(); +#endif +} + +#ifdef MULTIPROCESSOR + +void +gdt_reload_cpu(struct cpu_info *ci) +{ + struct region_descriptor region; + size_t max_len; + + max_len = MAXGDTSIZ * sizeof(gdt[0]); + setregion(®ion, ci->ci_gdt, max_len - 1); + lgdt(®ion); +} +#endif + + +/* + * Grow the GDT. 
 */
void
gdt_grow(int which)
{
	size_t old_len, new_len, max_len;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct vm_page *pg;
	vaddr_t va;

	/* Double the region size; new_len/old_len are byte lengths. */
	old_len = gdt_size[which] * sizeof(gdt[0]);
	gdt_size[which] <<= 1;
	new_len = old_len << 1;

	if (which != 0) {
		/*
		 * Region 1 lives at offset max_len in the primary CPU's
		 * GDT mapping (see gdt_init()); on first growth (old_len
		 * == 0) start it at MINGDTSIZ entries.
		 */
		max_len = MAXGDTSIZ * sizeof(gdt[0]);
		if (old_len == 0) {
			gdt_size[which] = MINGDTSIZ;
			new_len = gdt_size[which] * sizeof(gdt[0]);
		}
		for (va = (vaddr_t)(cpu_info_primary.ci_gdt) + old_len + max_len;
		     va < (vaddr_t)(cpu_info_primary.ci_gdt) + new_len + max_len;
		     va += PAGE_SIZE) {
			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
			    NULL) {
				uvm_wait("gdt_grow");
			}
			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
			    VM_PROT_READ | VM_PROT_WRITE);
		}
		return;
	}

	/* Region 0 is replicated on every CPU's shadow GDT: grow them all. */
	for (CPU_INFO_FOREACH(cii, ci)) {
		for (va = (vaddr_t)(ci->ci_gdt) + old_len;
		     va < (vaddr_t)(ci->ci_gdt) + new_len;
		     va += PAGE_SIZE) {
			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
			    NULL) {
				uvm_wait("gdt_grow");
			}
			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
			    VM_PROT_READ | VM_PROT_WRITE);
		}
	}
}

/*
 * Allocate a GDT slot as follows:
 * 1) If there are entries on the free list, use those.
 * 2) If there are fewer than gdt_size entries in use, there are free slots
 *    near the end that we can sweep through.
 * 3) As a last resort, we increase the size of the GDT, and sweep through
 *    the new slots.
 */
int
gdt_get_slot()
{
	/* Convenience wrapper: allocate from region 0. */
	return gdt_get_slot1(0);
}

int
gdt_get_slot1(int which)
{
	size_t offset;
	int slot;

	gdt_lock();

	if (gdt_free[which] != GNULL_SEL) {
		/* Reuse a previously freed slot; the free list is chained
		 * through the gd_selector field of dead descriptors. */
		slot = gdt_free[which];
		gdt_free[which] = gdt[slot].gd.gd_selector;
	} else {
		/*
		 * NOTE(review): 'offset' is computed in BYTES
		 * (which * MAXGDTSIZ * sizeof(gdt[0])) but compared
		 * against gdt_next/gdt_count, which elsewhere hold slot
		 * indices — for which != 0 this looks like a unit
		 * mismatch; confirm against the stock NetBSD gdt.c.
		 */
		offset = which * MAXGDTSIZ * sizeof(gdt[0]);
		if (gdt_next[which] != gdt_count[which] + offset)
			panic("gdt_get_slot botch 1");
		if (gdt_next[which] - offset >= gdt_size[which]) {
			if (gdt_size[which] >= MAXGDTSIZ)
				panic("gdt_get_slot botch 2");
			gdt_grow(which);
		}
		slot = gdt_next[which]++;
	}

	gdt_count[which]++;
	gdt_unlock();
	return (slot);
}

/*
 * Deallocate a GDT slot, putting it on the free list.
 */
void
gdt_put_slot(int slot)
{
	gdt_put_slot1(slot, 0);
}

void
gdt_put_slot1(int slot, int which)
{

	gdt_lock();
	gdt_count[which]--;

	/* Mark dead and thread onto the region's free list. */
	gdt[slot].gd.gd_type = SDT_SYSNULL;
	gdt[slot].gd.gd_selector = gdt_free[which];
	gdt_free[which] = slot;

	gdt_unlock();
}

/*
 * Allocate a GDT slot holding a TSS descriptor for 'pcb' and return its
 * kernel selector.  NOTE(review): under Xen, setgdt() silently refuses
 * SDT_SYS386TSS, so the returned selector's slot stays null — confirm
 * callers tolerate this.
 */
int
tss_alloc(struct pcb *pcb)
{
	int slot;

	slot = gdt_get_slot();
	setgdt(slot, &pcb->pcb_tss, sizeof(struct pcb) - 1,
	    SDT_SYS386TSS, SEL_KPL, 0, 0);
	return GSEL(slot, SEL_KPL);
}

void
tss_free(int sel)
{

	gdt_put_slot(IDXSEL(sel));
}

/*
 * Caller must have pmap locked for both of these functions.
 */
void
ldt_alloc(struct pmap *pmap, union descriptor *ldt, size_t len)
{
	int slot;

	/* LDT descriptors are allocated from region 1 of the GDT. */
	slot = gdt_get_slot1(1);
#ifndef XEN
	setgdt(slot, ldt, len - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
#else
	/*
	 * Xen keeps LDT base/size in its own descriptor format rather
	 * than a native SDT_SYSLDT segment descriptor.
	 */
	cpu_info_primary.ci_gdt[slot].ld.ld_base = (uint32_t)ldt;
	cpu_info_primary.ci_gdt[slot].ld.ld_entries =
	    len / sizeof(union descriptor);
#endif
	pmap->pm_ldt_sel = GSEL(slot, SEL_KPL);
}

void
ldt_free(struct pmap *pmap)
{
	int slot;

	slot = IDXSEL(pmap->pm_ldt_sel);

	gdt_put_slot1(slot, 1);
}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
new file mode 100644
index 0000000000..e08b5a64bd
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
@@ -0,0 +1,230 @@
/* $NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $ */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/****************************************************************************** + * hypervisor.c + * + * Communication to/from hypervisor. + * + * Copyright (c) 2002-2004, K A Fraser + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $");

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>

#include <machine/xen.h>
#include <machine/hypervisor.h>
#include <machine/evtchn.h>

/*
 * Force a proper event-channel callback from Xen after clearing the
 * callback mask. We do this in a very simple manner, by making a call
 * down into Xen. The pending flag will be checked by Xen on return.
 */
void
hypervisor_force_callback(void)
{

	(void)HYPERVISOR_xen_version(0);
}

/*
 * Scan the shared-info event-channel bitmaps for pending events, mark
 * the corresponding IRQs pending on this CPU, and return nonzero if any
 * pending IRQ's handler level exceeds the current interrupt level (i.e.
 * an interrupt should be processed now).
 */
int stipending(void);
int
stipending()
{
	uint32_t l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	int irq;
	shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	int ret;

	ret = 0;
	ci = curcpu();

#if 0
	if (HYPERVISOR_shared_info->events)
		printf("stipending events %08lx mask %08lx ilevel %d\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
#endif

	/*
	 * we're only called after STIC, so we know that we'll have to
	 * STI at the end
	 */
	cli();
	while (s->vcpu_data[0].evtchn_upcall_pending) {
		s->vcpu_data[0].evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
		/* Two-level scan: selector word, then 32-bit pending words. */
		while ((l1i = ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1 << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			while ((l2i = ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1 << l2i);

				port = (l1i << 5) + l2i;
				if ((irq = evtchn_to_irq[port]) != -1) {
					hypervisor_acknowledge_irq(irq);
					ci->ci_ipending |= (1 << irq);
					if (ret == 0 && ci->ci_ilevel <
					    ci->ci_isources[irq]->is_handlers
					    ->ih_level)
						ret = 1;
				}
#if 0 /* XXXcl dev/evtchn */
				else
					evtchn_device_upcall(port);
#endif
			}
		}
	}
	sti();

#if 0
	if (ci->ci_ipending & 0x1)
		printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
		    ci->ci_ipending);
#endif

	return (ret);
}

/*
 * Upcall entry from Xen: dispatch every pending, unmasked event channel
 * to its IRQ handler via do_event().  Same two-level bitmap scan as
 * stipending() above.
 */
void do_hypervisor_callback(struct trapframe *regs)
{
	uint32_t l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	int irq;
	shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	int level;

	ci = curcpu();
	level = ci->ci_ilevel;

	while (s->vcpu_data[0].evtchn_upcall_pending) {
		s->vcpu_data[0].evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
		while ((l1i = ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1 << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			while ((l2i = ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1 << l2i);

				port = (l1i << 5) + l2i;
				if ((irq = evtchn_to_irq[port]) != -1)
					do_event(irq, regs);
#if 0 /* XXXcl dev/evtchn */
				else
					evtchn_device_upcall(port);
#endif
			}
		}
	}

#ifdef DIAGNOSTIC
	if (level != ci->ci_ilevel)
		printf("hypervisor done %08x level %d/%d ipending %08x\n",
		    HYPERVISOR_shared_info->evtchn_pending_sel, level,
		    ci->ci_ilevel, ci->ci_ipending);
#endif
}

/* Unmask an event channel, re-raising it if it was pending while masked. */
void hypervisor_unmask_event(unsigned int ev)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	x86_atomic_clear_bit(&s->evtchn_mask[0], ev);
	/*
	 * The following is basically the equivalent of
	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
	 * interrupt edge' if the channel is masked.
	 */
	if (x86_atomic_test_bit(&s->evtchn_pending[0], ev) &&
	    !x86_atomic_test_and_set_bit(&s->evtchn_pending_sel, ev>>5)) {
		s->vcpu_data[0].evtchn_upcall_pending = 1;
		if (!s->vcpu_data[0].evtchn_upcall_mask)
			hypervisor_force_callback();
	}
}

/* Mask an event channel: pending bits accumulate but no upcall fires. */
void hypervisor_mask_event(unsigned int ev)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	x86_atomic_set_bit(&s->evtchn_mask[0], ev);
}

/* Clear a channel's pending bit (acknowledge). */
void hypervisor_clear_event(unsigned int ev)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	x86_atomic_clear_bit(&s->evtchn_pending[0], ev);
}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
new file mode 100644
index 0000000000..45af67272f
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
@@ -0,0 +1,2000 @@
/* $NetBSD: locore.S,v 1.2.2.1 2004/05/22 15:59:48 he Exp $ */
/* NetBSD: locore.S,v 1.26 2004/04/12 13:17:46 yamt Exp */

/*-
 * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)locore.s 7.3 (Berkeley) 5/13/91 + */ + +#include "opt_compat_netbsd.h" +#include "opt_compat_oldboot.h" +#include "opt_cputype.h" +#include "opt_ddb.h" +#include "opt_ipkdb.h" +#include "opt_lockdebug.h" +#include "opt_multiprocessor.h" +#include "opt_realmem.h" +#include "opt_user_ldt.h" +#include "opt_vm86.h" +#include "opt_xen.h" + +#include "npx.h" +#include "assym.h" +#include "apm.h" +#include "lapic.h" +#include "ioapic.h" +#include "ksyms.h" + +#include <sys/errno.h> +#include <sys/syscall.h> + +#include <machine/cputypes.h> +#include <machine/param.h> +#include <machine/pte.h> +#include <machine/segments.h> +#include <machine/specialreg.h> +#include <machine/trap.h> +#include <machine/bootinfo.h> + +#if NLAPIC > 0 +#include <machine/i82489reg.h> +#endif + +/* LINTSTUB: include <sys/types.h> */ +/* LINTSTUB: include <machine/cpu.h> */ +/* LINTSTUB: include <sys/systm.h> */ + +#include <machine/asm.h> + +#if defined(MULTIPROCESSOR) + +#define SET_CURLWP(lwp,cpu) \ + movl CPUVAR(SELF),cpu ; \ + movl lwp,CPUVAR(CURLWP) ; \ + movl cpu,L_CPU(lwp) + +#else + +#define SET_CURLWP(lwp,tcpu) movl lwp,CPUVAR(CURLWP) +#define GET_CURLWP(reg) movl CPUVAR(CURLWP),reg + +#endif + +#define GET_CURPCB(reg) movl CPUVAR(CURPCB),reg +#define SET_CURPCB(reg) movl reg,CPUVAR(CURPCB) + +#define CLEAR_RESCHED(reg) movl reg,CPUVAR(RESCHED) + +/* XXX temporary kluge; these should not be here */ +/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */ +#include <dev/isa/isareg.h> + + +/* Disallow old names for REALBASEMEM */ +#ifdef BIOSBASEMEM +#error BIOSBASEMEM option deprecated; use REALBASEMEM only if memory size reported by latest boot block is incorrect +#endif + +/* Disallow old names for REALEXTMEM */ +#ifdef EXTMEM_SIZE +#error EXTMEM_SIZE option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect +#endif +#ifdef BIOSEXTMEM +#error BIOSEXTMEM option deprecated; use REALEXTMEM only if memory size reported by latest boot 
block is incorrect +#endif + +#include <machine/frameasm.h> + + +#ifdef MULTIPROCESSOR +#include <machine/i82489reg.h> +#endif + +/* + * PTmap is recursive pagemap at top of virtual address space. + * Within PTmap, the page directory can be found (third indirection). + * + * XXX 4 == sizeof pde + */ + .set _C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT) + .set _C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE) + .set _C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4) + +/* + * APTmap, APTD is the alternate recursive pagemap. + * It's used when modifying another process's page tables. + * + * XXX 4 == sizeof pde + */ + .set _C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT) + .set _C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE) + .set _C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4) + + +/* + * Xen guest identifier and loader selection + */ +.section __xen_guest + .asciz "GUEST_OS=netbsd,GUEST_VER=2.0,XEN_VER=2.0,LOADER=generic" + + +/* + * Initialization + */ + .data + + .globl _C_LABEL(cpu) + .globl _C_LABEL(esym),_C_LABEL(boothowto) + .globl _C_LABEL(bootinfo),_C_LABEL(atdevbase) +#ifdef COMPAT_OLDBOOT + .globl _C_LABEL(bootdev) +#endif + .globl _C_LABEL(proc0paddr),_C_LABEL(PTDpaddr) + .globl _C_LABEL(biosbasemem),_C_LABEL(biosextmem) + .globl _C_LABEL(gdt) +#ifdef I586_CPU + .globl _C_LABEL(idt) +#endif + .globl _C_LABEL(lapic_tpr) + +#if NLAPIC > 0 +#ifdef __ELF__ + .align PAGE_SIZE +#else + .align 12 +#endif + .globl _C_LABEL(local_apic), _C_LABEL(lapic_id) +_C_LABEL(local_apic): + .space LAPIC_ID +_C_LABEL(lapic_id): + .long 0x00000000 + .space LAPIC_TPRI-(LAPIC_ID+4) +_C_LABEL(lapic_tpr): + .space LAPIC_PPRI-LAPIC_TPRI +_C_LABEL(lapic_ppr): + .space LAPIC_ISR-LAPIC_PPRI +_C_LABEL(lapic_isr): + .space PAGE_SIZE-LAPIC_ISR +#else +_C_LABEL(lapic_tpr): + .long 0 +#endif + + +_C_LABEL(cpu): .long 0 # are we 386, 386sx, or 486, + # or Pentium, or.. 
+_C_LABEL(esym): .long 0 # ptr to end of syms +_C_LABEL(atdevbase): .long 0 # location of start of iomem in virtual +_C_LABEL(proc0paddr): .long 0 +_C_LABEL(PTDpaddr): .long 0 # paddr of PTD, for libkvm +#ifndef REALBASEMEM +_C_LABEL(biosbasemem): .long 0 # base memory reported by BIOS +#else +_C_LABEL(biosbasemem): .long REALBASEMEM +#endif +#ifndef REALEXTMEM +_C_LABEL(biosextmem): .long 0 # extended memory reported by BIOS +#else +_C_LABEL(biosextmem): .long REALEXTMEM +#endif + +#include <machine/xen.h> +#define __HYPERVISOR_yield 8 + + .space 512 +tmpstk: + .long tmpstk, __KERNEL_DS + + +#define _RELOC(x) ((x)) +#define RELOC(x) _RELOC(_C_LABEL(x)) + +/* XXX assym.h */ +#define MOD_START 48 +#define MOD_LEN 56 +/* XXX assym.h */ + + .text + .globl _C_LABEL(kernel_text) + .set _C_LABEL(kernel_text),KERNTEXTOFF + + .globl start +start: + cld + + lss tmpstk,%esp # bootstrap stack end location + + movl %esi,%ebx # save start_info pointer + +#if (NKSYMS || defined(DDB) || defined(LKM)) && !defined(SYMTAB_SPACE) + /* Save the symbol locations. */ + movl MOD_START(%ebx),%esi + addl MOD_LEN(%ebx),%esi + movl %esi,RELOC(esym) +#endif + + /* Clear BSS first so that there are no surprises... */ + xorl %eax,%eax + movl $RELOC(__bss_start),%edi + movl $RELOC(_end),%ecx + subl %edi,%ecx + rep stosb + + movl %ebx,RELOC(avail_start) + + /* Copy the necessary stuff from start_info structure. */ + /* We need to copy shared_info early, so that sti/cli work */ + movl %ebx,%esi + movl $RELOC(start_info_union),%edi + movl $128,%ecx + rep movsl + + /* (howto, [bootdev], bootinfo, basemem, extmem). */ + xorl %eax,%eax + movl %eax,RELOC(boothowto) +#ifdef COMPAT_OLDBOOT + movl %eax,RELOC(bootdev) +#endif + movl $0x20000,%eax + movl %eax,RELOC(boothowto) + + /* First, reset the PSL. */ + pushl $PSL_MBO + popfl + + /* Clear segment registers; always null in proc0. 
*/ + xorl %eax,%eax + movw %ax,%fs + movw %ax,%gs + decl %eax + movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL + + xorl %eax,%eax + cpuid + movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL + +/* + * Virtual address space of kernel: + * + * text | data | bss | [syms] | page dir | proc0 kstack + * 0 1 2 3 + */ +#define PROC0PDIR ((0) * PAGE_SIZE) +#define PROC0STACK ((1) * PAGE_SIZE) +#define SYSMAP ((1+UPAGES) * PAGE_SIZE) +#define TABLESIZE ((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */ + + /* Find end of kernel image. */ + movl RELOC(avail_start),%edi + /* Calculate where to start the bootstrap tables. */ + movl %edi,%esi + + /* + * Calculate the size of the kernel page table directory, and + * how many entries it will have. + */ + movl RELOC(nkpde),%ecx # get nkpde + cmpl $NKPTP_MIN,%ecx # larger than min? + jge 1f + movl $NKPTP_MIN,%ecx # set at min + jmp 2f +1: cmpl $NKPTP_MAX,%ecx # larger than max? + jle 2f + movl $NKPTP_MAX,%ecx +2: + + /* Clear memory for bootstrap tables. */ + shll $PGSHIFT,%ecx + addl $TABLESIZE,%ecx + addl %esi,%ecx # end of tables + movl %ecx,RELOC(gdt) + addl $PAGE_SIZE,%ecx + movl %ecx,RELOC(avail_start) + subl %edi,%ecx # size of tables + shrl $2,%ecx + xorl %eax,%eax + cld + rep + stosl + +/* + * fillkpt + * eax = pte (page frame | control | status) + * ebx = page table address + * ecx = number of pages to map + */ +#define fillkpt \ +1: movl %eax,(%ebx) ; \ + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ + addl $4,%ebx ; /* next pte */ \ + loop 1b ; + +/* + * Build initial page tables. + */ + /* Calculate end of text segment, rounded to a page. */ + leal (RELOC(etext)+PGOFSET),%edx + andl $~PGOFSET,%edx + + /* Skip over the first 1MB. */ + movl $KERNTEXTOFF,%eax + movl %eax,%ecx + subl $KERNBASE_LOCORE,%ecx + shrl $PGSHIFT,%ecx + leal (SYSMAP)(%esi,%ecx,4),%ebx + + /* Map the kernel text read-only. 
*/ + movl %edx,%ecx + subl %eax,%ecx + shrl $PGSHIFT,%ecx + orl $(PG_V|PG_KR),%eax + fillkpt + + /* Map the data, BSS, and bootstrap tables read-write. */ + movl RELOC(avail_start),%ecx + # end of tables + subl %edx,%ecx # subtract end of text + shrl $PGSHIFT,%ecx + leal (PG_V|PG_KW)(%edx),%eax + fillkpt + + movl $0xffffffff,(%ebx) + addl $4,%ebx + +/* + * Construct a page table directory. + */ + /* Map kernel PDEs. */ + movl RELOC(nkpde),%ecx # for this many pde s, + leal (PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx # kernel pde offset + leal (SYSMAP+PG_V|PG_KW)(%esi),%eax # pte for KPT in proc 0, + fillkpt + + /* Install a PDE recursively mapping page directory as a page table! */ + leal (PROC0PDIR+PG_V/*|PG_KW*/)(%esi),%eax # pte for ptd + movl %eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi) # recursive PD slot + + /* Save phys. addr of PTD, for libkvm. */ + movl %esi,RELOC(PTDpaddr) + + call xpmap_init + + /* cr0 is 0x8005003b */ + + /* Relocate atdevbase. */ + movl _C_LABEL(avail_start),%edx + movl %edx,_C_LABEL(HYPERVISOR_shared_info) + addl $PAGE_SIZE,%edx # shared_inf + movl %edx,_C_LABEL(atdevbase) + + /* Set up bootstrap stack. */ + leal (PROC0STACK)(%esi),%eax + movl %eax,_C_LABEL(proc0paddr) + leal (USPACE-FRAMESIZE)(%eax),%esp + subl $KERNBASE_LOCORE,%esi + movl %esi,PCB_CR3(%eax) # pcb->pcb_cr3 + xorl %ebp,%ebp # mark end of frames + + movl _C_LABEL(atdevbase),%eax + pushl %eax + call _C_LABEL(init386) # wire 386 chip for unix operation + addl $4,%esp + +#ifdef SAFARI_FIFO_HACK + movb $5,%al + movw $0x37b,%dx + outb %al,%dx + movw $0x37f,%dx + inb %dx,%al + movb %al,%cl + + orb $1,%cl + + movb $5,%al + movw $0x37b,%dx + outb %al,%dx + movw $0x37f,%dx + movb %cl,%al + outb %al,%dx +#endif /* SAFARI_FIFO_HACK */ + + call _C_LABEL(main) + +/* + * void proc_trampoline(void); + * This is a trampoline function pushed onto the stack of a newly created + * process in order to do some additional setup. 
The trampoline is entered by
 * cpu_switch()ing to the process, so we abuse the callee-saved registers used
 * by cpu_switch() to store the information about the stub to call.
 * NOTE: This function does not have a normal calling sequence!
 */
/* LINTSTUB: Func: void proc_trampoline(void) */
NENTRY(proc_trampoline)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(proc_trampoline_mp)
#endif
	movl	$IPL_NONE,CPUVAR(ILEVEL)	# drop to IPL_NONE for the new lwp
	pushl	%ebx				# %ebx = argument for the stub
	call	*%esi				# %esi = stub function to invoke
	addl	$4,%esp
	DO_DEFERRED_SWITCH(%eax)
	INTRFASTEXIT				# return to user via trap frame
	/* NOTREACHED */

/*****************************************************************************/
#ifdef COMPAT_16
/*
 * Signal trampoline; copied to top of user stack.
 */
/* LINTSTUB: Var: char sigcode[1], esigcode[1]; */
NENTRY(sigcode)
	/*
	 * Handler has returned here as if we called it.  The sigcontext
	 * is on the stack after the 3 args "we" pushed.
	 */
	leal	12(%esp),%eax		# get pointer to sigcontext
	movl	%eax,4(%esp)		# put it in the argument slot
					# fake return address already there
	movl	$SYS_compat_16___sigreturn14,%eax
	int	$0x80			# enter kernel with args on stack
	movl	$SYS_exit,%eax
	int	$0x80			# exit if sigreturn fails
	.globl	_C_LABEL(esigcode)
_C_LABEL(esigcode):
#endif

/*****************************************************************************/

/*
 * The following primitives are used to fill and copy regions of memory.
 */

/*
 * XXX No section 9 man page for fillw.
 * fillw seems to be very sparsely used (only in pccons it seems.)
 * One wonders if it couldn't be done without.
 * -- Perry Metzger, May 7, 2001
 */
/*
 * void fillw(short pattern, void *addr, size_t len);
 * Write len copies of pattern at addr.
 */
/* LINTSTUB: Func: void fillw(short pattern, void *addr, size_t len) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax		# pattern (low 16 bits)
	movl	12(%esp),%edi		# destination
	movw	%ax,%cx
	rorl	$16,%eax		# replicate pattern into both
	movw	%cx,%ax			#  halves of %eax
	cld
	movl	16(%esp),%ecx
	shrl	%ecx			# do longwords
	rep
	stosl
	movl	16(%esp),%ecx
	andl	$1,%ecx			# do remainder
	rep
	stosw
	popl	%edi
	ret

/*
 * int kcopy(const void *from, void *to, size_t len);
 * Copy len bytes, abort on fault.
 */
/* LINTSTUB: Func: int kcopy(const void *from, void *to, size_t len) */
ENTRY(kcopy)
	pushl	%esi
	pushl	%edi
	GET_CURPCB(%eax)		# load curpcb into eax and set on-fault
	pushl	PCB_ONFAULT(%eax)	# save previous onfault handler
	movl	$_C_LABEL(kcopy_fault), PCB_ONFAULT(%eax)

	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax		# overlapping?
	jb	1f
	cld				# nope, copy forward
	shrl	$2,%ecx			# copy by 32-bit words
	rep
	movsl
	movl	24(%esp),%ecx
	andl	$3,%ecx			# any bytes left?
	rep
	movsb

	GET_CURPCB(%edx)		# XXX save curpcb?
	popl	PCB_ONFAULT(%edx)	# restore previous onfault handler
	popl	%edi
	popl	%esi
	xorl	%eax,%eax		# return 0 (success)
	ret

	ALIGN_TEXT
1:	addl	%ecx,%edi		# copy backward
	addl	%ecx,%esi
	std
	andl	$3,%ecx			# any fractional bytes?
	decl	%edi
	decl	%esi
	rep
	movsb
	movl	24(%esp),%ecx		# copy remainder by 32-bit words
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	cld				# restore direction flag

	GET_CURPCB(%edx)
	popl	PCB_ONFAULT(%edx)
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	ret

/*****************************************************************************/

/*
 * The following primitives are used to copy data in and out of the user's
 * address space.
 */

/*
 * Default to the lowest-common-denominator.  We will improve it
 * later.
 */
#if defined(I386_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i386_copyout)
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
#elif defined(I486_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
#elif defined(I586_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
#elif defined(I686_CPU)
#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
#endif

	.data

	/* Indirect dispatch slots: patched at runtime to the best
	 * copyin/copyout variant for the detected CPU. */
	.globl	_C_LABEL(copyout_func)
_C_LABEL(copyout_func):
	.long	DEFAULT_COPYOUT

	.globl	_C_LABEL(copyin_func)
_C_LABEL(copyin_func):
	.long	DEFAULT_COPYIN

	.text

/*
 * int copyout(const void *from, void *to, size_t len);
 * Copy len bytes into the user's address space.
 * see copyout(9)
 */
/* LINTSTUB: Func: int copyout(const void *kaddr, void *uaddr, size_t len) */
ENTRY(copyout)
	DO_DEFERRED_SWITCH(%eax)
	jmp	*_C_LABEL(copyout_func)

#if defined(I386_CPU)
/* LINTSTUB: Func: int i386_copyout(const void *kaddr, void *uaddr, size_t len) */
ENTRY(i386_copyout)
	pushl	%esi
	pushl	%edi
	pushl	$0

	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%eax

	/*
	 * We check that the end of the destination buffer is not past the end
	 * of the user's address space.  If it's not, then we only need to
	 * check that each page is writable.  The 486 will do this for us; the
	 * 386 will not.  (We assume that pages in user space that are not
	 * writable by the user are not writable by the kernel either.)
	 */
	movl	%edi,%edx
	addl	%eax,%edx
	jc	_C_LABEL(copy_efault)
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	_C_LABEL(copy_efault)

	testl	%eax,%eax		# anything to do?
	jz	3f

	/*
	 * We have to check each PTE for (write) permission, since the CPU
	 * doesn't do it for us.
	 */

	/* Compute number of pages. */
	movl	%edi,%ecx
	andl	$PGOFSET,%ecx
	addl	%eax,%ecx
	decl	%ecx
	shrl	$PGSHIFT,%ecx

	/* Compute PTE offset for start address. */
	shrl	$PGSHIFT,%edi

	GET_CURPCB(%edx)
	movl	$2f,PCB_ONFAULT(%edx)

1:	/* Check PTE for each page. */
	testb	$PG_RW,_C_LABEL(PTmap)(,%edi,4)
	jz	2f

4:	incl	%edi
	decl	%ecx
	jns	1b

	movl	20(%esp),%edi
	movl	24(%esp),%eax
	jmp	3f

2:	/* Simulate a trap. */
	pushl	%ecx
	movl	%edi,%eax
	shll	$PGSHIFT,%eax
	pushl	%eax
	call	_C_LABEL(trapwrite)	# trapwrite(addr)
	addl	$4,%esp			# pop argument
	popl	%ecx
	testl	%eax,%eax		# if not ok, return EFAULT
	jz	4b
	jmp	_C_LABEL(copy_efault)

3:	GET_CURPCB(%edx)
	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)

	/* bcopy(%esi, %edi, %eax); */
	cld
	movl	%eax,%ecx
	shrl	$2,%ecx
	rep
	movsl
	movl	%eax,%ecx
	andl	$3,%ecx
	rep
	movsb

	popl	PCB_ONFAULT(%edx)
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	ret
#endif /* I386_CPU */

#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
/* LINTSTUB: Func: int i486_copyout(const void *kaddr, void *uaddr, size_t len) */
ENTRY(i486_copyout)
	pushl	%esi
	pushl	%edi
	pushl	$0

	movl	16(%esp),%esi
	movl	20(%esp),%edi
	movl	24(%esp),%eax

	/*
	 * We check that the end of the destination buffer is not past the end
	 * of the user's address space.
	 */
	movl	%edi,%edx
	addl	%eax,%edx
	jc	_C_LABEL(copy_efault)
	cmpl	$VM_MAXUSER_ADDRESS,%edx
	ja	_C_LABEL(copy_efault)

	GET_CURPCB(%edx)
	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)

	/* bcopy(%esi, %edi, %eax); */
	cld
	movl	%eax,%ecx
	shrl	$2,%ecx
	rep
	movsl
	movl	%eax,%ecx
	andl	$3,%ecx
	rep
	movsb

	popl	PCB_ONFAULT(%edx)
	popl	%edi
	popl	%esi
	xorl	%eax,%eax
	ret
#endif /* I486_CPU || I586_CPU || I686_CPU */

/*
 * int copyin(const void *from, void *to, size_t len);
 * Copy len bytes from the user's address space.
+ * see copyin(9) + */ +/* LINTSTUB: Func: int copyin(const void *uaddr, void *kaddr, size_t len) */ +ENTRY(copyin) + DO_DEFERRED_SWITCH(%eax) + jmp *_C_LABEL(copyin_func) + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I586_CPU) || \ + defined(I686_CPU) +/* LINTSTUB: Func: int i386_copyin(const void *uaddr, void *kaddr, size_t len) */ +ENTRY(i386_copyin) + pushl %esi + pushl %edi + GET_CURPCB(%eax) + pushl $0 + movl $_C_LABEL(copy_fault),PCB_ONFAULT(%eax) + + movl 16(%esp),%esi + movl 20(%esp),%edi + movl 24(%esp),%eax + + /* + * We check that the end of the destination buffer is not past the end + * of the user's address space. If it's not, then we only need to + * check that each page is readable, and the CPU will do that for us. + */ + movl %esi,%edx + addl %eax,%edx + jc _C_LABEL(copy_efault) + cmpl $VM_MAXUSER_ADDRESS,%edx + ja _C_LABEL(copy_efault) + + /* bcopy(%esi, %edi, %eax); */ + cld + movl %eax,%ecx + shrl $2,%ecx + rep + movsl + movl %eax,%ecx + andl $3,%ecx + rep + movsb + + GET_CURPCB(%edx) + popl PCB_ONFAULT(%edx) + popl %edi + popl %esi + xorl %eax,%eax + ret +#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */ + +/* LINTSTUB: Ignore */ +NENTRY(copy_efault) + movl $EFAULT,%eax + +/* + * kcopy_fault is used by kcopy and copy_fault is used by copyin/out. + * + * they're distinguished for lazy pmap switching. see trap(). + */ +/* LINTSTUB: Ignore */ +NENTRY(kcopy_fault) + GET_CURPCB(%edx) + popl PCB_ONFAULT(%edx) + popl %edi + popl %esi + ret + +/* LINTSTUB: Ignore */ +NENTRY(copy_fault) + GET_CURPCB(%edx) + popl PCB_ONFAULT(%edx) + popl %edi + popl %esi + ret + +/* + * int copyoutstr(const void *from, void *to, size_t maxlen, size_t *lencopied); + * Copy a NUL-terminated string, at most maxlen characters long, into the + * user's address space. Return the number of characters copied (including the + * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else + * return 0 or EFAULT. 
+ * see copyoutstr(9) + */ +/* LINTSTUB: Func: int copyoutstr(const void *kaddr, void *uaddr, size_t len, size_t *done) */ +ENTRY(copyoutstr) + pushl %esi + pushl %edi + + DO_DEFERRED_SWITCH(%eax) + + movl 12(%esp),%esi # esi = from + movl 16(%esp),%edi # edi = to + movl 20(%esp),%edx # edx = maxlen + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 5f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + /* Compute number of bytes in first page. */ + movl %edi,%eax + andl $PGOFSET,%eax + movl $PAGE_SIZE,%ecx + subl %eax,%ecx # ecx = PAGE_SIZE - (src % PAGE_SIZE) + + GET_CURPCB(%eax) + movl $6f,PCB_ONFAULT(%eax) + +1: /* + * Once per page, check that we are still within the bounds of user + * space, and check for a write fault. + */ + cmpl $VM_MAXUSER_ADDRESS,%edi + jae _C_LABEL(copystr_efault) + + /* Compute PTE offset. */ + movl %edi,%eax + shrl $PGSHIFT,%eax # calculate pte address + + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 2f + +6: /* Simulate a trap. */ + pushl %edx + pushl %edi + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear argument from stack + popl %edx + testl %eax,%eax + jnz _C_LABEL(copystr_efault) + +2: /* Copy up to end of this page. */ + subl %ecx,%edx # predecrement total count + jnc 3f + addl %edx,%ecx # ecx += (edx - ecx) = edx + xorl %edx,%edx + +3: decl %ecx + js 4f + lodsb + stosb + testb %al,%al + jnz 3b + + /* Success -- 0 byte reached. */ + addl %ecx,%edx # add back residual for this page + xorl %eax,%eax + jmp copystr_return + +4: /* Go to next page, if any. */ + movl $PAGE_SIZE,%ecx + testl %edx,%edx + jnz 1b + + /* edx is zero -- return ENAMETOOLONG. */ + movl $ENAMETOOLONG,%eax + jmp copystr_return +#endif /* I386_CPU */ + +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) +5: GET_CURPCB(%eax) + movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%eax) + /* + * Get min(%edx, VM_MAXUSER_ADDRESS-%edi). 
+ */ + movl $VM_MAXUSER_ADDRESS,%eax + subl %edi,%eax + cmpl %edx,%eax + jae 1f + movl %eax,%edx + movl %eax,20(%esp) + +1: incl %edx + cld + +1: decl %edx + jz 2f + lodsb + stosb + testb %al,%al + jnz 1b + + /* Success -- 0 byte reached. */ + decl %edx + xorl %eax,%eax + jmp copystr_return + +2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */ + cmpl $VM_MAXUSER_ADDRESS,%edi + jae _C_LABEL(copystr_efault) + movl $ENAMETOOLONG,%eax + jmp copystr_return +#endif /* I486_CPU || I586_CPU || I686_CPU */ + +/* + * int copyinstr(const void *from, void *to, size_t maxlen, size_t *lencopied); + * Copy a NUL-terminated string, at most maxlen characters long, from the + * user's address space. Return the number of characters copied (including the + * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else + * return 0 or EFAULT. + * see copyinstr(9) + */ +/* LINTSTUB: Func: int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) */ +ENTRY(copyinstr) + pushl %esi + pushl %edi + + DO_DEFERRED_SWITCH(%eax) + + GET_CURPCB(%ecx) + movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%ecx) + + movl 12(%esp),%esi # %esi = from + movl 16(%esp),%edi # %edi = to + movl 20(%esp),%edx # %edx = maxlen + + /* + * Get min(%edx, VM_MAXUSER_ADDRESS-%esi). + */ + movl $VM_MAXUSER_ADDRESS,%eax + subl %esi,%eax + cmpl %edx,%eax + jae 1f + movl %eax,%edx + movl %eax,20(%esp) + +1: incl %edx + cld + +1: decl %edx + jz 2f + lodsb + stosb + testb %al,%al + jnz 1b + + /* Success -- 0 byte reached. */ + decl %edx + xorl %eax,%eax + jmp copystr_return + +2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */ + cmpl $VM_MAXUSER_ADDRESS,%esi + jae _C_LABEL(copystr_efault) + movl $ENAMETOOLONG,%eax + jmp copystr_return + +/* LINTSTUB: Ignore */ +NENTRY(copystr_efault) + movl $EFAULT,%eax + +/* LINTSTUB: Ignore */ +NENTRY(copystr_fault) +copystr_return: + /* Set *lencopied and return %eax. 
*/ + GET_CURPCB(%ecx) + movl $0,PCB_ONFAULT(%ecx) + movl 20(%esp),%ecx + subl %edx,%ecx + movl 24(%esp),%edx + testl %edx,%edx + jz 8f + movl %ecx,(%edx) + +8: popl %edi + popl %esi + ret + +/* + * int copystr(const void *from, void *to, size_t maxlen, size_t *lencopied); + * Copy a NUL-terminated string, at most maxlen characters long. Return the + * number of characters copied (including the NUL) in *lencopied. If the + * string is too long, return ENAMETOOLONG; else return 0. + * see copystr(9) + */ +/* LINTSTUB: Func: int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done) */ +ENTRY(copystr) + pushl %esi + pushl %edi + + movl 12(%esp),%esi # esi = from + movl 16(%esp),%edi # edi = to + movl 20(%esp),%edx # edx = maxlen + incl %edx + cld + +1: decl %edx + jz 4f + lodsb + stosb + testb %al,%al + jnz 1b + + /* Success -- 0 byte reached. */ + decl %edx + xorl %eax,%eax + jmp 6f + +4: /* edx is zero -- return ENAMETOOLONG. */ + movl $ENAMETOOLONG,%eax + +6: /* Set *lencopied and return %eax. */ + movl 20(%esp),%ecx + subl %edx,%ecx + movl 24(%esp),%edx + testl %edx,%edx + jz 7f + movl %ecx,(%edx) + +7: popl %edi + popl %esi + ret + +/* + * long fuword(const void *uaddr); + * Fetch an int from the user's address space. + * see fuword(9) + */ +/* LINTSTUB: Func: long fuword(const void *base) */ +ENTRY(fuword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-4,%edx + ja _C_LABEL(fusuaddrfault) + GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + movl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * int fusword(const void *uaddr); + * Fetch a short from the user's address space. 
+ * see fusword(9) + */ +/* LINTSTUB: Func: int fusword(const void *base) */ +ENTRY(fusword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + movzwl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * int fuswintr(const void *uaddr); + * Fetch a short from the user's address space. Can be called during an + * interrupt. + * see fuswintr(9) + */ +/* LINTSTUB: Func: int fuswintr(const void *base) */ +ENTRY(fuswintr) + cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE) + jnz _C_LABEL(fusuaddrfault) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + movl CPUVAR(CURLWP),%ecx + movl L_ADDR(%ecx),%ecx + movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx) + movzwl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * int fubyte(const void *uaddr); + * Fetch a byte from the user's address space. + * see fubyte(9) + */ +/* LINTSTUB: Func: int fubyte(const void *base) */ +ENTRY(fubyte) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-1,%edx + ja _C_LABEL(fusuaddrfault) + GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + movzbl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * Handle faults from [fs]u*(). Clean up and return -1. + */ +/* LINTSTUB: Ignore */ +NENTRY(fusufault) + movl $0,PCB_ONFAULT(%ecx) + movl $-1,%eax + ret + +/* + * Handle faults from [fs]u*(). Clean up and return -1. This differs from + * fusufault() in that trap() will recognize it and return immediately rather + * than trying to page fault. + */ +/* LINTSTUB: Ignore */ +NENTRY(fusubail) + movl $0,PCB_ONFAULT(%ecx) + movl $-1,%eax + ret + +/* + * Handle earlier faults from [fs]u*(), due to our of range addresses. + */ +/* LINTSTUB: Ignore */ +NENTRY(fusuaddrfault) + movl $-1,%eax + ret + +/* + * int suword(void *uaddr, long x); + * Store an int in the user's address space. 
+ * see suword(9) + */ +/* LINTSTUB: Func: int suword(void *base, long c) */ +ENTRY(suword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-4,%edx + ja _C_LABEL(fusuaddrfault) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + GET_CURPCB(%eax) + movl $3f,PCB_ONFAULT(%eax) + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + +3: /* Simulate a trap. */ + pushl %edx + pushl %edx + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear parameter from the stack + popl %edx + GET_CURPCB(%ecx) + testl %eax,%eax + jnz _C_LABEL(fusufault) + +1: /* XXX also need to check the following 3 bytes for validity! */ +#endif + +2: GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + + movl 8(%esp),%eax + movl %eax,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * int susword(void *uaddr, short x); + * Store a short in the user's address space. + * see susword(9) + */ +/* LINTSTUB: Func: int susword(void *base, short c) */ +ENTRY(susword) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + GET_CURPCB(%eax) + movl $3f,PCB_ONFAULT(%eax) + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + +3: /* Simulate a trap. */ + pushl %edx + pushl %edx + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear parameter from the stack + popl %edx + GET_CURPCB(%ecx) + testl %eax,%eax + jnz _C_LABEL(fusufault) + +1: /* XXX also need to check the following byte for validity! 
*/ +#endif + +2: GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + + movl 8(%esp),%eax + movw %ax,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * int suswintr(void *uaddr, short x); + * Store a short in the user's address space. Can be called during an + * interrupt. + * see suswintr(9) + */ +/* LINTSTUB: Func: int suswintr(void *base, short c) */ +ENTRY(suswintr) + cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE) + jnz _C_LABEL(fusuaddrfault) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja _C_LABEL(fusuaddrfault) + movl CPUVAR(CURLWP),%ecx + movl L_ADDR(%ecx),%ecx + movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + + /* Simulate a trap. */ + jmp _C_LABEL(fusubail) + +1: /* XXX also need to check the following byte for validity! */ +#endif + +2: movl 8(%esp),%eax + movw %ax,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * int subyte(void *uaddr, char x); + * Store a byte in the user's address space. + * see subyte(9) + */ +/* LINTSTUB: Func: int subyte(void *base, int c) */ +ENTRY(subyte) + DO_DEFERRED_SWITCH(%eax) + movl 4(%esp),%edx + cmpl $VM_MAXUSER_ADDRESS-1,%edx + ja _C_LABEL(fusuaddrfault) + +#if defined(I386_CPU) +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) + jne 2f +#endif /* I486_CPU || I586_CPU || I686_CPU */ + + GET_CURPCB(%eax) + movl $3f,PCB_ONFAULT(%eax) + + movl %edx,%eax + shrl $PGSHIFT,%eax # calculate pte address + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) + jnz 1f + +3: /* Simulate a trap. 
*/ + pushl %edx + pushl %edx + call _C_LABEL(trapwrite) # trapwrite(addr) + addl $4,%esp # clear parameter from the stack + popl %edx + GET_CURPCB(%ecx) + testl %eax,%eax + jnz _C_LABEL(fusufault) + +1: +#endif + +2: GET_CURPCB(%ecx) + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) + + movb 8(%esp),%al + movb %al,(%edx) + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + ret + +/*****************************************************************************/ + +/* + * The following is i386-specific nonsense. + */ + +/* + * void lgdt_finish(void); + * Finish load a new GDT pointer (do any necessary cleanup). + * XXX It's somewhat questionable whether reloading all the segment registers + * is necessary, since the actual descriptor data is not changed except by + * process creation and exit, both of which clean up via task switches. OTOH, + * this only happens at run time when the GDT is resized. + */ +/* LINTSTUB: Func: void lgdt_finish(void) */ +NENTRY(lgdt_finish) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%ds + movw %ax,%es + movw %ax,%gs + movw %ax,%ss + movl $GSEL(GCPU_SEL, SEL_KPL),%eax + movw %ax,%fs + /* Reload code selector by doing intersegment return. */ + popl %eax + pushl $GSEL(GCODE_SEL, SEL_KPL) + pushl %eax + lret + +/*****************************************************************************/ + +/* + * These functions are primarily used by DDB. 
+ */ + +/* LINTSTUB: Func: int setjmp (label_t *l) */ +ENTRY(setjmp) + movl 4(%esp),%eax + movl %ebx,(%eax) # save ebx + movl %esp,4(%eax) # save esp + movl %ebp,8(%eax) # save ebp + movl %esi,12(%eax) # save esi + movl %edi,16(%eax) # save edi + movl (%esp),%edx # get rta + movl %edx,20(%eax) # save eip + xorl %eax,%eax # return (0); + ret + +/* LINTSTUB: Func: void longjmp (label_t *l) */ +ENTRY(longjmp) + movl 4(%esp),%eax + movl (%eax),%ebx # restore ebx + movl 4(%eax),%esp # restore esp + movl 8(%eax),%ebp # restore ebp + movl 12(%eax),%esi # restore esi + movl 16(%eax),%edi # restore edi + movl 20(%eax),%edx # get rta + movl %edx,(%esp) # put in return frame + xorl %eax,%eax # return (1); + incl %eax + ret + +/*****************************************************************************/ + + .globl _C_LABEL(sched_whichqs),_C_LABEL(sched_qs) + .globl _C_LABEL(uvmexp),_C_LABEL(panic) + +#ifdef DIAGNOSTIC +NENTRY(switch_error) + pushl $1f +3: call _C_LABEL(panic) + /* NOTREACHED */ +1: .asciz "cpu_switch" +#endif /* DIAGNOSTIC */ + +/* + * void cpu_switch(struct lwp *) + * Find a runnable process and switch to it. Wait if necessary. If the new + * process is the same as the old one, we short-circuit the context save and + * restore. + * + * Note that the stack frame layout is known to "struct switchframe" + * in <machine/frame.h> and to the code in cpu_fork() which initializes + * it for a new lwp. + */ +ENTRY(cpu_switch) + pushl %ebx + pushl %esi + pushl %edi + +#ifdef DEBUG + cmpl $IPL_SCHED,CPUVAR(ILEVEL) + jae 1f + pushl $2f + call _C_LABEL(panic) + /* NOTREACHED */ +2: .asciz "not splsched() in cpu_switch!" +1: +#endif /* DEBUG */ + + movl 16(%esp),%esi # current + + /* + * Clear curlwp so that we don't accumulate system time while idle. + * This also insures that schedcpu() will move the old lwp to + * the correct queue if it happens to get called from the spllower() + * below and changes the priority. (See corresponding comment in + * userret()). 
+ */ + movl $0,CPUVAR(CURLWP) + /* + * First phase: find new lwp. + * + * Registers: + * %eax - queue head, scratch, then zero + * %ebx - queue number + * %ecx - cached value of whichqs + * %edx - next lwp in queue + * %esi - old lwp + * %edi - new lwp + */ + + /* Look for new lwp. */ + CLI(%ecx) # splhigh doesn't do a cli + movl _C_LABEL(sched_whichqs),%ecx + bsfl %ecx,%ebx # find a full q + jnz switch_dequeue + + /* + * idling: save old context. + * + * Registers: + * %eax, %ecx - scratch + * %esi - old lwp, then old pcb + * %edi - idle pcb + */ + + pushl %esi + call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc) + addl $4,%esp + + movl L_ADDR(%esi),%esi + + /* Save stack pointers. */ + movl %esp,PCB_ESP(%esi) + movl %ebp,PCB_EBP(%esi) + + /* Find idle PCB for this CPU */ +#ifndef MULTIPROCESSOR + movl $_C_LABEL(lwp0),%ebx + movl L_ADDR(%ebx),%edi + movl L_MD_TSS_SEL(%ebx),%edx +#else + movl CPUVAR(IDLE_PCB),%edi + movl CPUVAR(IDLE_TSS_SEL),%edx +#endif + movl $0,CPUVAR(CURLWP) /* In case we fault... */ + + /* Restore the idle context (avoid interrupts) */ + CLI(%ecx) + + /* Restore stack pointers. */ + movl PCB_ESP(%edi),%esp + movl PCB_EBP(%edi),%ebp + + pushl %edi + call _C_LABEL(i386_switch_context) + addl $4,%esp + + /* Record new pcb. */ + SET_CURPCB(%edi) + + xorl %esi,%esi + STI(%eax) +idle_unlock: +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) + call _C_LABEL(sched_unlock_idle) +#endif + /* Interrupts are okay again. */ + pushl $IPL_NONE # spl0() + call _C_LABEL(Xspllower) # process pending interrupts + addl $4,%esp + jmp idle_start +idle_zero: + STIC(%eax) + jz 4f + call _C_LABEL(stipending) + testl %eax,%eax + jz 4f + pushl $IPL_NONE + call _C_LABEL(Xspllower) + addl $4,%esp +4: + call _C_LABEL(uvm_pageidlezero) + CLI(%eax) + cmpl $0,_C_LABEL(sched_whichqs) + jnz idle_exit +idle_loop: + /* Try to zero some pages. 
*/ + movl _C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx + testl %ecx,%ecx + jnz idle_zero + STIC(%eax) + jz 4f + call _C_LABEL(stipending) + testl %eax,%eax + jz 4f + pushl $IPL_NONE + call _C_LABEL(Xspllower) + addl $4,%esp + jmp idle_start +4: + movl $__HYPERVISOR_yield,%eax + TRAP_INSTR +NENTRY(mpidle) +idle_start: + CLI(%eax) + cmpl $0,_C_LABEL(sched_whichqs) + jz idle_loop +idle_exit: + movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh + STI(%eax) +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) + call _C_LABEL(sched_lock_idle) +#endif + movl _C_LABEL(sched_whichqs),%ecx + bsfl %ecx,%ebx + jz idle_unlock + +#ifdef XENDEBUG_LOW + pushl %ecx + call _C_LABEL(xen_dbg1) + xorl %ecx,%ecx + movl %ecx,_C_LABEL(xen_once) + popl %ecx +#endif +switch_dequeue: + /* + * we're running at splhigh(), but it's otherwise okay to take + * interrupts here. + */ + STI(%edi) + leal _C_LABEL(sched_qs)(,%ebx,8),%eax # select q + + movl L_FORW(%eax),%edi # unlink from front of process q +#ifdef DIAGNOSTIC + cmpl %edi,%eax # linked to self (i.e. nothing queued)? + je _C_LABEL(switch_error) # not possible +#endif /* DIAGNOSTIC */ + movl L_FORW(%edi),%edx + movl %edx,L_FORW(%eax) + movl %eax,L_BACK(%edx) + + cmpl %edx,%eax # q empty? + jne 3f + + btrl %ebx,%ecx # yes, clear to indicate empty + movl %ecx,_C_LABEL(sched_whichqs) # update q status + +3: /* We just did it. */ + xorl %eax,%eax + CLEAR_RESCHED(%eax) + +switch_resume: +#ifdef DIAGNOSTIC + cmpl %eax,L_WCHAN(%edi) # Waiting for something? + jne _C_LABEL(switch_error) # Yes; shouldn't be queued. + cmpb $LSRUN,L_STAT(%edi) # In run state? + jne _C_LABEL(switch_error) # No; shouldn't be queued. +#endif /* DIAGNOSTIC */ + + /* Isolate lwp. XXX Is this necessary? */ + movl %eax,L_BACK(%edi) + + /* Record new lwp. */ + movb $LSONPROC,L_STAT(%edi) # l->l_stat = LSONPROC + SET_CURLWP(%edi,%ecx) + + /* Skip context switch if same lwp. */ + xorl %ebx,%ebx + cmpl %edi,%esi + je switch_return + + /* If old lwp exited, don't bother. 
*/ + testl %esi,%esi + jz switch_exited + + /* + * Second phase: save old context. + * + * Registers: + * %eax, %ecx - scratch + * %esi - old lwp, then old pcb + * %edi - new lwp + */ + + pushl %esi + call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc) + addl $4,%esp + + movl L_ADDR(%esi),%esi + + /* Save stack pointers. */ + movl %esp,PCB_ESP(%esi) + movl %ebp,PCB_EBP(%esi) + +switch_exited: + /* + * Third phase: restore saved context. + * + * Registers: + * %eax, %ebx, %ecx, %edx - scratch + * %esi - new pcb + * %edi - new lwp + */ + + /* No interrupts while loading new state. */ + CLI(%eax) + movl L_ADDR(%edi),%esi + + /* Restore stack pointers. */ + movl PCB_ESP(%esi),%esp + movl PCB_EBP(%esi),%ebp + +#if 0 + /* Don't bother with the rest if switching to a system process. */ + testl $P_SYSTEM,L_FLAG(%edi); XXX NJWLWP lwp's don't have P_SYSTEM! + jnz switch_restored ; XXX skip stack_switch+pmap_activate +#endif + + pushl %edi + call _C_LABEL(pmap_activate) # pmap_activate(p) + addl $4,%esp + + pushl %esi + call _C_LABEL(i386_switch_context) + addl $4,%esp + + /* Record new pcb. */ + SET_CURPCB(%esi) + + /* Interrupts are okay again. */ + STI(%edi) + +/* + * Check for restartable atomic sequences (RAS) + */ + movl CPUVAR(CURLWP),%edi + movl L_PROC(%edi),%esi + cmpl $0,P_RASLIST(%esi) + jne 2f +1: + movl $1,%ebx + +switch_return: +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) + call _C_LABEL(sched_unlock_idle) +#endif + pushl $IPL_NONE # spl0() + call _C_LABEL(Xspllower) # process pending interrupts + addl $4,%esp + movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh() + + movl %ebx,%eax + + popl %edi + popl %esi + popl %ebx + ret + +2: # check RAS list + movl L_MD_REGS(%edi),%ebx + movl TF_EIP(%ebx),%eax + pushl %eax + pushl %esi + call _C_LABEL(ras_lookup) + addl $8,%esp + cmpl $-1,%eax + je 1b + movl %eax,TF_EIP(%ebx) + jmp 1b + +/* + * void cpu_switchto(struct lwp *current, struct lwp *next) + * Switch to the specified next LWP. 
+ */ +ENTRY(cpu_switchto) + pushl %ebx + pushl %esi + pushl %edi + +#ifdef DEBUG + cmpl $IPL_SCHED,CPUVAR(ILEVEL) + jae 1f + pushl $2f + call _C_LABEL(panic) + /* NOTREACHED */ +2: .asciz "not splsched() in cpu_switchto!" +1: +#endif /* DEBUG */ + + movl 16(%esp),%esi # current + movl 20(%esp),%edi # next + + /* + * Clear curlwp so that we don't accumulate system time while idle. + * This also insures that schedcpu() will move the old process to + * the correct queue if it happens to get called from the spllower() + * below and changes the priority. (See corresponding comment in + * usrret()). + * + * XXX Is this necessary? We know we won't go idle. + */ + movl $0,CPUVAR(CURLWP) + + /* + * We're running at splhigh(), but it's otherwise okay to take + * interrupts here. + */ + STI(%eax) + + /* Jump into the middle of cpu_switch */ + xorl %eax,%eax + jmp switch_resume + +/* + * void cpu_exit(struct lwp *l) + * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's + * if multiprocessor) and deallocate the address space and kernel stack for p. + * Then jump into cpu_switch(), as if we were in the idle proc all along. + */ +#ifndef MULTIPROCESSOR + .globl _C_LABEL(lwp0) +#endif + .globl _C_LABEL(uvmspace_free),_C_LABEL(kernel_map) + .globl _C_LABEL(uvm_km_free),_C_LABEL(tss_free) +/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */ +ENTRY(cpu_exit) + movl 4(%esp),%edi # old process +#ifndef MULTIPROCESSOR + movl $_C_LABEL(lwp0),%ebx + movl L_ADDR(%ebx),%esi + movl L_MD_TSS_SEL(%ebx),%edx +#else + movl CPUVAR(IDLE_PCB),%esi + movl CPUVAR(IDLE_TSS_SEL),%edx +#endif + /* In case we fault... */ + movl $0,CPUVAR(CURLWP) + + /* Restore the idle context. */ + CLI(%eax) + + /* Restore stack pointers. */ + movl PCB_ESP(%esi),%esp + movl PCB_EBP(%esi),%ebp + + pushl %esi + call _C_LABEL(i386_switch_context) + addl $4,%esp + + /* Record new pcb. */ + SET_CURPCB(%esi) + + /* Interrupts are okay again. 
*/ + STI(%eax) + + /* + * Schedule the dead LWP's stack to be freed. + */ + pushl %edi + call _C_LABEL(lwp_exit2) + addl $4,%esp + + /* Jump into cpu_switch() with the right state. */ + xorl %esi,%esi + movl %esi,CPUVAR(CURLWP) + jmp idle_start + +/* + * void savectx(struct pcb *pcb); + * Update pcb, saving current processor state. + */ +/* LINTSTUB: Func: void savectx(struct pcb *pcb) */ +ENTRY(savectx) + movl 4(%esp),%edx # edx = p->p_addr + + /* Save stack pointers. */ + movl %esp,PCB_ESP(%edx) + movl %ebp,PCB_EBP(%edx) + + ret + +/* + * Old call gate entry for syscall + */ +/* LINTSTUB: Var: char Xosyscall[1]; */ +IDTVEC(osyscall) + /* Set eflags in trap frame. */ + pushfl + popl 8(%esp) + pushl $7 # size of instruction for restart + jmp syscall1 + +/* + * Trap gate entry for syscall + */ +/* LINTSTUB: Var: char Xsyscall[1]; */ +IDTVEC(syscall) + pushl $2 # size of instruction for restart +syscall1: + pushl $T_ASTFLT # trap # for doing ASTs + INTRENTRY + +#ifdef DIAGNOSTIC + cmpl $0, CPUVAR(WANT_PMAPLOAD) + jz 1f + pushl $6f + call _C_LABEL(printf) + addl $4, %esp +1: + movl CPUVAR(ILEVEL),%ebx + testl %ebx,%ebx + jz 1f + pushl $5f + call _C_LABEL(printf) + addl $4,%esp +#ifdef DDB + int $3 +#endif +1: +#endif /* DIAGNOSTIC */ + movl CPUVAR(CURLWP),%edx + movl %esp,L_MD_REGS(%edx) # save pointer to frame + movl L_PROC(%edx),%edx + pushl %esp + call *P_MD_SYSCALL(%edx) # get pointer to syscall() function + addl $4,%esp +syscall_checkast: + /* Check for ASTs on exit to user mode. */ + CLI(%eax) + CHECK_ASTPENDING(%eax) + je 1f + /* Always returning to user mode here. */ + CLEAR_ASTPENDING(%eax) + STI(%eax) + /* Pushed T_ASTFLT into tf_trapno on entry. 
*/ + pushl %esp + call _C_LABEL(trap) + addl $4,%esp + jmp syscall_checkast +1: STI(%eax) + CHECK_DEFERRED_SWITCH(%eax) + jnz 9f +#ifndef DIAGNOSTIC + INTRFASTEXIT +#else /* DIAGNOSTIC */ + cmpl $IPL_NONE,CPUVAR(ILEVEL) + jne 3f + INTRFASTEXIT +3: pushl $4f + call _C_LABEL(printf) + addl $4,%esp +#ifdef DDB + int $3 +#endif /* DDB */ + movl $IPL_NONE,CPUVAR(ILEVEL) + jmp 2b +4: .asciz "WARNING: SPL NOT LOWERED ON SYSCALL EXIT\n" +5: .asciz "WARNING: SPL NOT ZERO ON SYSCALL ENTRY\n" +6: .asciz "WARNING: WANT PMAPLOAD ON SYSCALL ENTRY\n" +#endif /* DIAGNOSTIC */ +9: call _C_LABEL(pmap_load) + jmp syscall_checkast /* re-check ASTs */ + +#if NNPX > 0 +/* + * Special interrupt handlers. Someday intr0-intr15 will be used to count + * interrupts. We'll still need a special exception 16 handler. The busy + * latch stuff in probintr() can be moved to npxprobe(). + */ + +/* LINTSTUB: Func: void probeintr(void) */ +NENTRY(probeintr) + ss + incl _C_LABEL(npx_intrs_while_probing) + pushl %eax + movb $0x20,%al # EOI (asm in strings loses cpp features) + outb %al,$0xa0 # IO_ICU2 + outb %al,$0x20 # IO_ICU1 + movb $0,%al + outb %al,$0xf0 # clear BUSY# latch + popl %eax + iret + +/* LINTSTUB: Func: void probetrap(void) */ +NENTRY(probetrap) + ss + incl _C_LABEL(npx_traps_while_probing) + fnclex + iret + +/* LINTSTUB: Func: int npx586bug1(int a, int b) */ +NENTRY(npx586bug1) + fildl 4(%esp) # x + fildl 8(%esp) # y + fld %st(1) + fdiv %st(1),%st # x/y + fmulp %st,%st(1) # (x/y)*y + fsubrp %st,%st(1) # x-(x/y)*y + pushl $0 + fistpl (%esp) + popl %eax + ret +#endif /* NNPX > 0 */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c new file mode 100644 index 0000000000..61d2898096 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c @@ -0,0 +1,2561 @@ +/* $NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $ */ +/* NetBSD: machdep.c,v 1.552 2004/03/24 15:34:49 atatat Exp */ + +/*- + * Copyright (c) 1996, 
1997, 1998, 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace + * Simulation Facility, NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)machdep.c 7.4 (Berkeley) 6/3/91 + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $"); + +#include "opt_beep.h" +#include "opt_compat_ibcs2.h" +#include "opt_compat_mach.h" /* need to get the right segment def */ +#include "opt_compat_netbsd.h" +#include "opt_compat_svr4.h" +#include "opt_cpureset_delay.h" +#include "opt_cputype.h" +#include "opt_ddb.h" +#include "opt_ipkdb.h" +#include "opt_kgdb.h" +#include "opt_mtrr.h" +#include "opt_multiprocessor.h" +#include "opt_realmem.h" +#include "opt_user_ldt.h" +#include "opt_vm86.h" +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/signal.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/user.h> +#include <sys/exec.h> +#include <sys/buf.h> +#include <sys/reboot.h> +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/msgbuf.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/extent.h> +#include <sys/syscallargs.h> +#include <sys/core.h> +#include <sys/kcore.h> +#include <sys/ucontext.h> +#include <machine/kcore.h> +#include <sys/ras.h> +#include <sys/sa.h> +#include <sys/savar.h> +#include <sys/ksyms.h> + +#ifdef IPKDB +#include <ipkdb/ipkdb.h> +#endif + +#ifdef KGDB +#include <sys/kgdb.h> +#endif + +#include <dev/cons.h> + +#include <uvm/uvm_extern.h> +#include <uvm/uvm_page.h> + 
+#include <sys/sysctl.h> + +#include <machine/cpu.h> +#include <machine/cpufunc.h> +#include <machine/cpuvar.h> +#include <machine/gdt.h> +#include <machine/pio.h> +#include <machine/psl.h> +#include <machine/reg.h> +#include <machine/specialreg.h> +#include <machine/bootinfo.h> +#include <machine/mtrr.h> +#include <machine/evtchn.h> + +#include <dev/isa/isareg.h> +#include <machine/isa_machdep.h> +#include <dev/ic/i8042reg.h> + +#ifdef DDB +#include <machine/db_machdep.h> +#include <ddb/db_extern.h> +#endif + +#ifdef VM86 +#include <machine/vm86.h> +#endif + +#include "acpi.h" +#include "apm.h" +#include "bioscall.h" + +#if NBIOSCALL > 0 +#include <machine/bioscall.h> +#endif + +#if NACPI > 0 +#include <dev/acpi/acpivar.h> +#define ACPI_MACHDEP_PRIVATE +#include <machine/acpi_machdep.h> +#endif + +#if NAPM > 0 +#include <machine/apmvar.h> +#endif + +#include "isa.h" +#include "isadma.h" +#include "npx.h" +#include "ksyms.h" + +#include "mca.h" +#if NMCA > 0 +#include <machine/mca_machdep.h> /* for mca_busprobe() */ +#endif + +#ifdef MULTIPROCESSOR /* XXX */ +#include <machine/mpbiosvar.h> /* XXX */ +#endif /* XXX */ + +#include <machine/xen.h> +#include <machine/hypervisor.h> + +#if defined(DDB) || defined(KGDB) +#include <ddb/db_interface.h> +#include <ddb/db_output.h> + +void ddb_trap_hook(int); +#endif + +/* #define XENDEBUG */ +/* #define XENDEBUG_LOW */ + +#ifdef XENDEBUG +extern void printk(char *, ...); +#define XENPRINTF(x) printf x +#define XENPRINTK(x) printk x +#else +#define XENPRINTF(x) +#define XENPRINTK(x) +#endif +#define PRINTK(x) printf x + +#ifdef XENDEBUG_LOW +void xen_dbglow_init(void); +#endif + +#ifndef BEEP_ONHALT_COUNT +#define BEEP_ONHALT_COUNT 3 +#endif +#ifndef BEEP_ONHALT_PITCH +#define BEEP_ONHALT_PITCH 1500 +#endif +#ifndef BEEP_ONHALT_PERIOD +#define BEEP_ONHALT_PERIOD 250 +#endif + +/* the following is used externally (sysctl_hw) */ +char machine[] = "i386"; /* CPU "architecture" */ +char machine_arch[] = "i386"; /* machine == 
machine_arch */ + +char bootinfo[BOOTINFO_MAXSIZE]; + +struct bi_devmatch *i386_alldisks = NULL; +int i386_ndisks = 0; + +#ifdef CPURESET_DELAY +int cpureset_delay = CPURESET_DELAY; +#else +int cpureset_delay = 2000; /* default to 2s */ +#endif + +#ifdef MTRR +struct mtrr_funcs *mtrr_funcs; +#endif + +#ifdef COMPAT_NOMID +static int exec_nomid(struct proc *, struct exec_package *); +#endif + +int physmem; +int dumpmem_low; +int dumpmem_high; +unsigned int cpu_feature; +int cpu_class; +int i386_fpu_present; +int i386_fpu_exception; +int i386_fpu_fdivbug; + +int i386_use_fxsave; +int i386_has_sse; +int i386_has_sse2; + +int tmx86_has_longrun; + +vaddr_t msgbuf_vaddr; +paddr_t msgbuf_paddr; + +vaddr_t idt_vaddr; +paddr_t idt_paddr; + +#ifdef I586_CPU +vaddr_t pentium_idt_vaddr; +#endif + +struct vm_map *exec_map = NULL; +struct vm_map *mb_map = NULL; +struct vm_map *phys_map = NULL; + +extern paddr_t avail_start, avail_end; +extern paddr_t pmap_pa_start, pmap_pa_end; + +#ifdef ISA_CLOCK +void (*delay_func)(int) = i8254_delay; +void (*microtime_func)(struct timeval *) = i8254_microtime; +void (*initclock_func)(void) = i8254_initclocks; +#else +void (*delay_func)(int) = xen_delay; +void (*microtime_func)(struct timeval *) = xen_microtime; +void (*initclock_func)(void) = xen_initclocks; +#endif + +void hypervisor_callback(void); +void failsafe_callback(void); + +/* + * Size of memory segments, before any memory is stolen. 
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];	/* physical RAM segments */
int mem_cluster_cnt;				/* number of valid entries above */

int cpu_dump(void);
int cpu_dumpsize(void);
u_long cpu_dump_mempagecnt(void);
void dumpsys(void);
void init386(paddr_t);
void initgdt(void);

#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
void add_mem_cluster(u_int64_t, u_int64_t, u_int32_t);
#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */

extern int time_adjusted;

/*
 * Machine-dependent startup code: map the kernel message buffer,
 * print the memory configuration, and carve the exec/physio/mbuf
 * submaps out of kernel_map.
 */
void
cpu_startup()
{
	int x;
	vaddr_t minaddr, maxaddr;
	char pbuf[9];

	/*
	 * Initialize error message buffer (at end of core).
	 */
	msgbuf_vaddr = uvm_km_valloc(kernel_map, x86_round_page(MSGBUFSIZE));
	if (msgbuf_vaddr == 0)
		panic("failed to valloc msgbuf_vaddr");

	/* msgbuf_paddr was init'd in pmap; map it page by page */
	for (x = 0; x < btoc(MSGBUFSIZE); x++)
		pmap_kenter_pa((vaddr_t)msgbuf_vaddr + x * PAGE_SIZE,
		    msgbuf_paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE);
	pmap_update(pmap_kernel());

	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));

	printf("%s", version);

#ifdef TRAPLOG
	/*
	 * Enable recording of branch from/to in MSR's
	 */
	wrmsr(MSR_DEBUGCTLMSR, 0x1);
#endif

	format_bytes(pbuf, sizeof(pbuf), ptoa(physmem));
	printf("total memory = %s\n", pbuf);

	minaddr = 0;

	/*
	 * Allocate a submap for exec arguments.  This map effectively
	 * limits the number of processes exec'ing at any time.
	 */
	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);

	/*
	 * Allocate a submap for physio
	 */
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    VM_PHYS_SIZE, 0, FALSE, NULL);

	/*
	 * Finally, allocate mbuf cluster submap.
	 */
	mb_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    nmbclusters * mclbytes, VM_MAP_INTRSAFE, FALSE, NULL);

	format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free));
	printf("avail memory = %s\n", pbuf);

	/* Safe for i/o port / memory space allocation to use malloc now. */
	x86_bus_space_mallocok();
}

/*
 * Set up proc0's TSS and LDT.  On Xen the TSS is not loaded with ltr;
 * instead the kernel stack is registered with the hypervisor via
 * HYPERVISOR_stack_switch.
 */
void
i386_proc0_tss_ldt_init()
{
	struct pcb *pcb;
	int x;

	gdt_init();

	cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb;

	pcb->pcb_tss.tss_ioopt =
	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
	    | SEL_KPL;		/* i/o pl */

	/* deny all I/O ports: every bit set in the bitmap */
	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
		pcb->pcb_iomap[x] = 0xffffffff;

	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
	pcb->pcb_cr0 = rcr0();
	pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	pcb->pcb_tss.tss_esp0 = (int)lwp0.l_addr + USPACE - 16;
	/* trapframe sits just below the kernel stack top */
	lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
	lwp0.l_md.md_tss_sel = tss_alloc(pcb);

#ifndef XEN
	ltr(lwp0.l_md.md_tss_sel);
	lldt(pcb->pcb_ldt_sel);
#else
	HYPERVISOR_fpu_taskswitch();
	XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
	    (void *)pcb->pcb_tss.tss_esp0,
	    pcb->pcb_tss.tss_ss0, IDXSEL(pcb->pcb_tss.tss_ss0)));
	HYPERVISOR_stack_switch(pcb->pcb_tss.tss_ss0, pcb->pcb_tss.tss_esp0);
#endif
}

/*
 * Set up TSS and LDT for a new PCB (per-CPU idle pcb).
 */

void
i386_init_pcb_tss_ldt(struct cpu_info *ci)
{
	int x;
	struct pcb *pcb = ci->ci_idle_pcb;

	pcb->pcb_tss.tss_ioopt =
	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
	    | SEL_KPL;		/* i/o pl */
	/* deny all I/O ports */
	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
		pcb->pcb_iomap[x] = 0xffffffff;

	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
	pcb->pcb_cr0 = rcr0();

	ci->ci_idle_tss_sel = tss_alloc(pcb);
}

/*
 * Switch context:
 * - honor CR0_TS in saved CR0 and request DNA exception on FPU use
 * - switch stack pointer for user->kernel transition
 * On Xen this is done with hypercalls rather than by reloading TR;
 * a privileged (dom0) guest also re-requests its I/O privilege level.
 */
void
i386_switch_context(struct pcb *new)
{
	dom0_op_t op;
	struct cpu_info *ci;

	ci = curcpu();
	if (ci->ci_fpused) {
		/* ask hypervisor to raise DNA on next FPU use */
		HYPERVISOR_fpu_taskswitch();
		ci->ci_fpused = 0;
	}

	HYPERVISOR_stack_switch(new->pcb_tss.tss_ss0, new->pcb_tss.tss_esp0);

	if (xen_start_info.flags & SIF_PRIVILEGED) {
		op.cmd = DOM0_IOPL;
		op.u.iopl.domain = DOMID_SELF;
		op.u.iopl.iopl = new->pcb_tss.tss_ioopt & SEL_RPL;	/* i/o pl */
		HYPERVISOR_dom0_op(&op);
	}
}

/*
 * sysctl helper routine for machdep.tm* nodes.
 */
static int
sysctl_machdep_tm_longrun(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int io, error;

	/* Transmeta Crusoe LongRun only; reject on other CPUs */
	if (!tmx86_has_longrun)
		return (EOPNOTSUPP);

	node = *rnode;
	node.sysctl_data = &io;

	/* fetch the current value for the node being queried */
	switch (rnode->sysctl_num) {
	case CPU_TMLR_MODE:
		io = (int)(crusoe_longrun = tmx86_get_longrun_mode());
		break;
	case CPU_TMLR_FREQUENCY:
		tmx86_get_longrun_status_all();
		io = crusoe_frequency;
		break;
	case CPU_TMLR_VOLTAGE:
		tmx86_get_longrun_status_all();
		io = crusoe_voltage;
		break;
	case CPU_TMLR_PERCENTAGE:
		tmx86_get_longrun_status_all();
		io = crusoe_percentage;
		break;
	default:
		return (EOPNOTSUPP);
	}

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	/* only the mode node is writable */
	if (rnode->sysctl_num == CPU_TMLR_MODE) {
		if (tmx86_set_longrun_mode(io))
			crusoe_longrun = (u_int)io;
		else
			return (EINVAL);
	}

	return (0);
}

/*
 * sysctl helper routine for machdep.booted_kernel
 */
static int
sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
{
	struct btinfo_bootpath *bibp;
	struct sysctlnode node;

	/* bootloader may not have passed a boot path */
	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
	if(!bibp)
		return(ENOENT); /* ??? */

	node = *rnode;
	node.sysctl_data = bibp->bootpath;
	node.sysctl_size = sizeof(bibp->bootpath);
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}

/*
 * sysctl helper routine for machdep.diskinfo
 */
static int
sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
{
	struct sysctlnode node;

	node = *rnode;
	node.sysctl_data = i386_alldisks;
	/* struct disklist has one nativedisk_info built in; add the rest */
	node.sysctl_size = sizeof(struct disklist) +
	    (i386_ndisks - 1) * sizeof(struct nativedisk_info);
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}

/*
 * machine dependent system variables.
 */
SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "machdep", NULL,
		       NULL, 0, NULL, 0,
		       CTL_MACHDEP, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "console_device", NULL,
		       sysctl_consdev, 0, NULL, sizeof(dev_t),
		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "biosbasemem", NULL,
		       NULL, 0, &biosbasemem, 0,
		       CTL_MACHDEP, CPU_BIOSBASEMEM, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "biosextmem", NULL,
		       NULL, 0, &biosextmem, 0,
		       CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "nkpde", NULL,
		       NULL, 0, &nkpde, 0,
		       CTL_MACHDEP, CPU_NKPDE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "booted_kernel", NULL,
		       sysctl_machdep_booted_kernel, 0, NULL, 0,
		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "diskinfo", NULL,
		       sysctl_machdep_diskinfo, 0, NULL, 0,
		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "fpu_present", NULL,
		       NULL, 0, &i386_fpu_present, 0,
		       CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "osfxsr", NULL,
		       NULL, 0, &i386_use_fxsave, 0,
		       CTL_MACHDEP, CPU_OSFXSR, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "sse", NULL,
		       NULL, 0, &i386_has_sse, 0,
		       CTL_MACHDEP, CPU_SSE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "sse2", NULL,
		       NULL, 0, &i386_has_sse2, 0,
		       CTL_MACHDEP, CPU_SSE2, CTL_EOL);
	/* the four tm_longrun_* nodes share one helper; only mode is RW */
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "tm_longrun_mode", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_MODE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "tm_longrun_frequency", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_FREQUENCY, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "tm_longrun_voltage", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_VOLTAGE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "tm_longrun_percentage", NULL,
		       sysctl_machdep_tm_longrun, 0, NULL, 0,
		       CTL_MACHDEP, CPU_TMLR_PERCENTAGE, CTL_EOL);
}

/*
 * Compute the user stack address a signal frame should be built at,
 * honoring any registered sigaltstack.  Sets *onstack when the frame
 * goes onto the alternate signal stack.
 */
void *
getframe(struct lwp *l, int sig, int *onstack)
{
	struct proc *p = l->l_proc;
	struct sigctx *ctx = &p->p_sigctx;
	struct trapframe *tf = l->l_md.md_regs;

	/* Do we need to jump onto the signal stack? */
	*onstack = (ctx->ps_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
	    && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
	if (*onstack)
		return (char *)ctx->ps_sigstk.ss_sp + ctx->ps_sigstk.ss_size;
#ifdef VM86
	/* vm86 mode: linear stack address is ss<<4 + sp */
	if (tf->tf_eflags & PSL_VM)
		return (void *)(tf->tf_esp + (tf->tf_ss << 4));
	else
#endif
		return (void *)tf->tf_esp;
}

/*
 * Build context to run handler in.  We invoke the handler
 * directly, only returning via the trampoline.  Note the
 * trampoline version numbers are coordinated with machine-
 * dependent code in libc.
 */
void
buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
{
	struct trapframe *tf = l->l_md.md_regs;

	/* point the trapframe at the handler with flat user segments */
	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_eip = (int)catcher;
	tf->tf_cs = GSEL(sel, SEL_UPL);
	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
	tf->tf_esp = (int)fp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
}

/*
 * Deliver a signal using the siginfo (version 2) trampoline ABI:
 * copy a struct sigframe_siginfo onto the user stack and redirect
 * the lwp to the handler.
 */
static void
sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
{
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    GUCODEBIG_SEL : GUCODE_SEL;
	struct sigacts *ps = p->p_sigacts;
	int onstack;
	int sig = ksi->ksi_signo;
	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
	sig_t catcher = SIGACTION(p, sig).sa_handler;
	struct trapframe *tf = l->l_md.md_regs;

	fp--;

	/* Build stack frame for signal trampoline. */
	switch (ps->sa_sigdesc[sig].sd_vers) {
	case 0:		/* handled by sendsig_sigcontext */
	case 1:		/* handled by sendsig_sigcontext */
	default:	/* unknown version */
		printf("nsendsig: bad version %d\n",
		    ps->sa_sigdesc[sig].sd_vers);
		sigexit(l, SIGILL);
	case 2:
		break;
	}

	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
	frame.sf_signum = sig;
	frame.sf_sip = &fp->sf_si;
	frame.sf_ucp = &fp->sf_uc;
	frame.sf_si._info = ksi->ksi_info;
	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
	frame.sf_uc.uc_sigmask = *mask;
	frame.sf_uc.uc_link = NULL;
	frame.sf_uc.uc_flags |= (p->p_sigctx.ps_sigstk.ss_flags & SS_ONSTACK)
	    ? _UC_SETSTACK : _UC_CLRSTACK;
	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);

	if (tf->tf_eflags & PSL_VM)
		(*p->p_emul->e_syscall_intern)(p);

	if (copyout(&frame, fp, sizeof(frame)) != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	buildcontext(l, sel, catcher, fp);

	/* Remember that we're now on the signal stack. */
	if (onstack)
		p->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
}

/*
 * Signal delivery entry point: dispatch to the old sigcontext ABI
 * (trampoline versions < 2, COMPAT_16) or the siginfo ABI.
 */
void
sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
{
#ifdef COMPAT_16
	if (curproc->p_sigacts->sa_sigdesc[ksi->ksi_signo].sd_vers < 2)
		sendsig_sigcontext(ksi, mask);
	else
#endif
		sendsig_siginfo(ksi, mask);
}

/*
 * Build a scheduler-activations upcall frame on the given stack and
 * redirect the lwp to the upcall handler.
 */
void
cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas,
    void *ap, void *sp, sa_upcall_t upcall)
{
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	struct saframe *sf, frame;
	struct trapframe *tf;

	tf = l->l_md.md_regs;

	/* Finally, copy out the rest of the frame. */
	frame.sa_type = type;
	frame.sa_sas = sas;
	frame.sa_events = nevents;
	frame.sa_interrupted = ninterrupted;
	frame.sa_arg = ap;
	frame.sa_ra = 0;

	sf = (struct saframe *)sp - 1;
	if (copyout(&frame, sf, sizeof(frame)) != 0) {
		/* Copying onto the stack didn't work. Die. */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	tf->tf_eip = (int) upcall;
	tf->tf_esp = (int) sf;
	tf->tf_ebp = 0; /* indicate call-frame-top to debuggers */
	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    GSEL(GUCODEBIG_SEL, SEL_UPL) : GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
}

int	waittime = -1;
struct pcb dumppcb;

/*
 * Halt, reboot, or power down the machine.  On Xen, power-down ends
 * in HYPERVISOR_shutdown() rather than a hardware poweroff.
 */
void
cpu_reboot(int howto, char *bootstr)
{

	if (cold) {
		howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;
	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
		waittime = 0;
		vfs_shutdown();
		/*
		 * If we've been adjusting the clock, the todr
		 * will be out of synch; adjust it now.
		 */
		if (time_adjusted != 0)
			resettodr();
	}

	/* Disable interrupts. */
	splhigh();

	/* Do a dump if requested. */
	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
		dumpsys();

haltsys:
	doshutdownhooks();

#ifdef MULTIPROCESSOR
	x86_broadcast_ipi(X86_IPI_HALT);
#endif

	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
#if NACPI > 0
		if (acpi_softc != NULL) {
			delay(500000);
			acpi_enter_sleep_state(acpi_softc, ACPI_STATE_S5);
			printf("WARNING: ACPI powerdown failed!\n");
		}
#endif
#if NAPM > 0 && !defined(APM_NO_POWEROFF)
		/* turn off, if we can.  But try to turn disk off and
		 * wait a bit first--some disk drives are slow to clean up
		 * and users have reported disk corruption.
		 */
		delay(500000);
		apm_set_powstate(APM_DEV_DISK(0xff), APM_SYS_OFF);
		delay(500000);
		apm_set_powstate(APM_DEV_ALLDEVS, APM_SYS_OFF);
		printf("WARNING: APM powerdown failed!\n");
		/*
		 * RB_POWERDOWN implies RB_HALT... fall into it...
		 */
#endif
		HYPERVISOR_shutdown();
	}

	if (howto & RB_HALT) {
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");

#ifdef BEEP_ONHALT
		{
			int c;
			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
				sysbeep(BEEP_ONHALT_PITCH,
				    BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
			}
		}
#endif

		cnpollc(1);	/* for proper keyboard command handling */
		if (cngetc() == 0) {
			/* no console attached, so just hlt */
			for(;;) {
				__asm __volatile("hlt");
			}
		}
		cnpollc(0);
	}

	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for(;;) ;
	/*NOTREACHED*/
}

/*
 * These variables are needed by /sbin/savecore
 */
u_int32_t dumpmag = 0x8fca0101;	/* magic number */
int 	dumpsize = 0;		/* pages */
long	dumplo = 0; 		/* blocks */

/*
 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
 * Returns the header size in disk blocks, or -1 if it won't fit in one block.
 */
int
cpu_dumpsize()
{
	int size;

	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
	if (roundup(size, dbtob(1)) != dbtob(1))
		return (-1);

	return (1);
}

/*
 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
 */
u_long
cpu_dump_mempagecnt()
{
	u_long i, n;

	n = 0;
	for (i = 0; i < mem_cluster_cnt; i++)
		n += atop(mem_clusters[i].size);
	return (n);
}

/*
 * cpu_dump: dump the machine-dependent kernel core dump headers.
 */
int
cpu_dump()
{
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	char buf[dbtob(1)];
	kcore_seg_t *segp;
	cpu_kcore_hdr_t *cpuhdrp;
	phys_ram_seg_t *memsegp;
	const struct bdevsw *bdev;
	int i;

	bdev = bdevsw_lookup(dumpdev);
	if (bdev == NULL)
		return (ENXIO);
	dump = bdev->d_dump;

	/* headers are packed into one disk block: seg, cpu hdr, segments */
	memset(buf, 0, sizeof buf);
	segp = (kcore_seg_t *)buf;
	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
	memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) +
	    ALIGN(sizeof(*cpuhdrp))];

	/*
	 * Generate a segment header.
	 */
	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));

	/*
	 * Add the machine-dependent header info.
	 */
	cpuhdrp->ptdpaddr = PTDpaddr;
	cpuhdrp->nmemsegs = mem_cluster_cnt;

	/*
	 * Fill in the memory segment descriptors.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		memsegp[i].start = mem_clusters[i].start;
		memsegp[i].size = mem_clusters[i].size;
	}

	return (dump(dumpdev, dumplo, (caddr_t)buf, dbtob(1)));
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 */
void
cpu_dumpconf()
{
	const struct bdevsw *bdev;
	int nblks, dumpblks;	/* size of dump area */

	if (dumpdev == NODEV)
		goto bad;
	bdev = bdevsw_lookup(dumpdev);
	if (bdev == NULL)
		panic("dumpconf: bad dumpdev=0x%x", dumpdev);
	if (bdev->d_psize == NULL)
		goto bad;
	nblks = (*bdev->d_psize)(dumpdev);
	if (nblks <= ctod(1))
		goto bad;

	dumpblks = cpu_dumpsize();
	if (dumpblks < 0)
		goto bad;
	dumpblks += ctod(cpu_dump_mempagecnt());

	/* If dump won't fit (incl. room for possible label), punt. */
	if (dumpblks > (nblks - ctod(1)))
		goto bad;

	/* Put dump at end of partition */
	dumplo = nblks - dumpblks;

	/* dumpsize is in page units, and doesn't include headers. */
	dumpsize = cpu_dump_mempagecnt();
	return;

 bad:
	dumpsize = 0;
}

/*
 * Doadump comes here after turning off memory management and
 * getting on the dump stack, either when called above, or by
 * the auto-restart code.
 */
#define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
static vaddr_t dumpspace;

/* Reserve one page of VA for dumpsys() to map physical pages through. */
vaddr_t
reserve_dumppages(vaddr_t p)
{

	dumpspace = p;
	return (p + BYTES_PER_DUMP);
}

/*
 * Write a crash dump of all physical memory clusters to dumpdev,
 * headers first, printing progress in MB as it goes.
 */
void
dumpsys()
{
	u_long totalbytesleft, bytes, i, n, memseg;
	u_long maddr;
	int psize;
	daddr_t blkno;
	const struct bdevsw *bdev;
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	int error;

	/* Save registers. */
	savectx(&dumppcb);

	if (dumpdev == NODEV)
		return;

	bdev = bdevsw_lookup(dumpdev);
	if (bdev == NULL || bdev->d_psize == NULL)
		return;

	/*
	 * For dumps during autoconfiguration,
	 * if dump device has already configured...
	 */
	if (dumpsize == 0)
		cpu_dumpconf();
	if (dumplo <= 0 || dumpsize == 0) {
		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
		    minor(dumpdev));
		return;
	}
	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
	    minor(dumpdev), dumplo);

	psize = (*bdev->d_psize)(dumpdev);
	printf("dump ");
	if (psize == -1) {
		printf("area unavailable\n");
		return;
	}

#if 0	/* XXX this doesn't work.  grr. */
	/* toss any characters present prior to dump */
	while (sget() != NULL); /*syscons and pccons differ */
#endif

	if ((error = cpu_dump()) != 0)
		goto err;

	totalbytesleft = ptoa(cpu_dump_mempagecnt());
	blkno = dumplo + cpu_dumpsize();
	dump = bdev->d_dump;
	error = 0;

	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
		maddr = mem_clusters[memseg].start;
		bytes = mem_clusters[memseg].size;

		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
			/* Print out how many MBs we have left to go. */
			if ((totalbytesleft % (1024*1024)) == 0)
				printf("%ld ", totalbytesleft / (1024 * 1024));

			/* Limit size for next transfer. */
			n = bytes - i;
			if (n > BYTES_PER_DUMP)
				n = BYTES_PER_DUMP;

			/* map the physical page at dumpspace, then write it */
			(void) pmap_map(dumpspace, maddr, maddr + n,
			    VM_PROT_READ);

			error = (*dump)(dumpdev, blkno, (caddr_t)dumpspace, n);
			if (error)
				goto err;
			maddr += n;
			blkno += btodb(n);		/* XXX? */

#if 0	/* XXX this doesn't work.  grr. */
			/* operator aborting dump? */
			if (sget() != NULL) {
				error = EINTR;
				break;
			}
#endif
		}
	}

 err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * Clear registers on exec
 */
void
setregs(struct lwp *l, struct exec_package *pack, u_long stack)
{
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	struct pcb *pcb = &l->l_addr->u_pcb;
	struct trapframe *tf;

#if NNPX > 0
	/* If we were using the FPU, forget about it. */
	if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
		npxsave_lwp(l, 0);
#endif

#ifdef USER_LDT
	pmap_ldt_cleanup(l);
#endif

	l->l_md.md_flags &= ~MDL_USEDFPU;
	/* reset FPU control word to the default for the save format in use */
	if (i386_use_fxsave) {
		pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __NetBSD_NPXCW__;
		pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
	} else
		pcb->pcb_savefpu.sv_87.sv_env.en_cw = __NetBSD_NPXCW__;

	tf = l->l_md.md_regs;
	tf->tf_gs = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_fs = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_edi = 0;
	tf->tf_esi = 0;
	tf->tf_ebp = 0;
	tf->tf_ebx = (int)l->l_proc->p_psstr;
	tf->tf_edx = 0;
	tf->tf_ecx = 0;
	tf->tf_eax = 0;
	tf->tf_eip = pack->ep_entry;
	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
	tf->tf_eflags = PSL_USERSET;
	tf->tf_esp = stack;
	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
}

/*
 * Initialize segments and descriptor tables
 */

union	descriptor *gdt, *ldt;
struct gate_descriptor *idt;
char idt_allocmap[NIDT];
struct simplelock idt_lock = SIMPLELOCK_INITIALIZER;
#ifdef I586_CPU
union	descriptor *pentium_idt;
#endif
extern  struct user *proc0paddr;

/* Fill in a gate descriptor (interrupt/trap/call gate). */
void
setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
    int sel)
{

	gd->gd_looffset = (int)func;
	gd->gd_selector = sel;
	gd->gd_stkcpy = args;
	gd->gd_xx = 0;
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (int)func >> 16;
}

/* Mark a gate descriptor not-present and zero its fields. */
void
unsetgate(struct gate_descriptor *gd)
{
	gd->gd_p = 0;
	gd->gd_hioffset = 0;
	gd->gd_looffset = 0;
	gd->gd_selector = 0;
	gd->gd_xx = 0;
	gd->gd_stkcpy = 0;
	gd->gd_type = 0;
	gd->gd_dpl = 0;
}


/* Fill in a region descriptor for lgdt/lidt. */
void
setregion(struct region_descriptor *rd, void *base, size_t limit)
{

	rd->rd_limit = (int)limit;
	rd->rd_base = (int)base;
}

/* Fill in a memory segment descriptor. */
void
setsegment(struct segment_descriptor *sd, void *base, size_t limit, int type,
    int dpl, int def32, int gran)
{

	sd->sd_lolimit = (int)limit;
	sd->sd_lobase = (int)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (int)limit >> 16;
	sd->sd_xx = 0;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (int)base >> 24;
}

#define	IDTVEC(name)	__CONCAT(X, name)
typedef void (vector)(void);
extern vector IDTVEC(syscall);
extern vector IDTVEC(osyscall);
extern vector *IDTVEC(exceptions)[];
#ifdef COMPAT_SVR4
extern vector IDTVEC(svr4_fasttrap);
#endif /* COMPAT_SVR4 */
#ifdef COMPAT_MACH
extern vector IDTVEC(mach_trap);
#endif
/* Xen expects traps as a trap_info_t table, not a hardware IDT. */
#define MAX_XEN_IDT 128
trap_info_t xen_idt[MAX_XEN_IDT];
int xen_idt_idx;

#define	KBTOB(x)	((size_t)(x) * 1024UL)

/* NOTE(review): panics unconditionally — lidt must not be used under Xen. */
void cpu_init_idt()
{
	struct region_descriptor region;

	panic("cpu_init_idt");
#ifdef I586_CPU
	setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
#else
	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
#endif
	lidt(&region);
}

#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
/*
 * Record one BIOS memory-map entry as a physical RAM cluster,
 * reserving its range in the iomem extent map.  Entries above 4GB
 * are skipped (i386 without PAE cannot address them).
 */
void
add_mem_cluster(u_int64_t seg_start, u_int64_t seg_end, u_int32_t type)
{
	extern struct extent *iomem_ex;
	int i;

	if (seg_end > 0x100000000ULL) {
		printf("WARNING: skipping large "
		    "memory map entry: "
		    "0x%qx/0x%qx/0x%x\n",
		    seg_start,
		    (seg_end - seg_start),
		    type);
		return;
	}

	/*
	 * XXX Chop the last page off the size so that
	 * XXX it can fit in avail_end.
	 */
	if (seg_end == 0x100000000ULL)
		seg_end -= PAGE_SIZE;

	if (seg_end <= seg_start)
		return;

	for (i = 0; i < mem_cluster_cnt; i++) {
		if ((mem_clusters[i].start == round_page(seg_start))
		    && (mem_clusters[i].size
			== trunc_page(seg_end) - mem_clusters[i].start)) {
#ifdef DEBUG_MEMLOAD
			printf("WARNING: skipping duplicate segment entry\n");
#endif
			return;
		}
	}

	/*
	 * Allocate the physical addresses used by RAM
	 * from the iomem extent map.  This is done before
	 * the addresses are page rounded just to make
	 * sure we get them all.
	 */
	if (extent_alloc_region(iomem_ex, seg_start,
	    seg_end - seg_start, EX_NOWAIT)) {
		/* XXX What should we do? */
		printf("WARNING: CAN'T ALLOCATE "
		    "MEMORY SEGMENT "
		    "(0x%qx/0x%qx/0x%x) FROM "
		    "IOMEM EXTENT MAP!\n",
		    seg_start, seg_end - seg_start, type);
		return;
	}

	/*
	 * If it's not free memory, skip it.
	 */
	if (type != BIM_Memory)
		return;

	/* XXX XXX XXX */
	if (mem_cluster_cnt >= VM_PHYSSEG_MAX)
		panic("init386: too many memory segments");

	seg_start = round_page(seg_start);
	seg_end = trunc_page(seg_end);

	if (seg_start == seg_end)
		return;

	mem_clusters[mem_cluster_cnt].start = seg_start;
	mem_clusters[mem_cluster_cnt].size =
	    seg_end - seg_start;

	if (avail_end < seg_end)
		avail_end = seg_end;
	physmem += atop(mem_clusters[mem_cluster_cnt].size);
	mem_cluster_cnt++;
}
#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */

/*
 * Build the initial GDT.  Natively this ends with lgdt; under Xen the
 * GDT page is remapped read-only (the hypervisor validates descriptor
 * writes) and registered with HYPERVISOR_set_gdt.
 */
void
initgdt()
{
#if !defined(XEN)
	struct region_descriptor region;
#else
	paddr_t frames[16];
#endif

#if !defined(XEN)
	gdt = tgdt;
	memset(gdt, 0, NGDT*sizeof(*gdt));
#endif
	/* make gdt gates and memory segments */
	setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 1, 1);
	setsegment(&gdt[GDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 1, 1);
	setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUCODEBIG_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUDATA_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
	    SDT_MEMRWA, SEL_UPL, 1, 1);
#ifdef COMPAT_MACH
	setgate(&gdt[GMACHCALLS_SEL].gd, &IDTVEC(mach_trap), 1,
	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
#if NBIOSCALL > 0
	/* bios trampoline GDT entries */
	setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 0,
	    0);
	setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 0,
	    0);
#endif
	setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary,
	    sizeof(struct cpu_info)-1, SDT_MEMRWA, SEL_KPL, 1, 1);

#if !defined(XEN)
	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
	lgdt(&region);
#else
	/* machine frame number of the GDT page */
	frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
	/* pmap_kremove((vaddr_t)gdt, PAGE_SIZE); */
	/* remap read-only: Xen must mediate all GDT writes */
	pmap_kenter_pa((vaddr_t)gdt, (uint32_t)gdt - KERNBASE,
	    VM_PROT_READ);
	XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
	    LAST_RESERVED_GDT_ENTRY + 1));
	if (HYPERVISOR_set_gdt(frames, LAST_RESERVED_GDT_ENTRY + 1))
		panic("HYPERVISOR_set_gdt failed!\n");
	lgdt_finish();
#endif
}

/*
 * init386: early machine-dependent initialization — probe CPU
 * features, set up the memory map, console, and descriptor tables.
 * (Continues past this view.)
 */
void
init386(paddr_t first_avail)
{
#if !defined(XEN)
	union descriptor *tgdt;
#endif
	extern void consinit(void);
#if !defined(XEN)
	extern struct extent *iomem_ex;
#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
	struct btinfo_memmap *bim;
#endif
	struct region_descriptor region;
#endif
	int x;
#if !defined(XEN)
	int first16q;
	u_int64_t seg_start, seg_end;
	u_int64_t seg_start1, seg_end1;
#endif
	paddr_t realmode_reserved_start;
	psize_t realmode_reserved_size;
	int needs_earlier_install_pte0;
#if NBIOSCALL > 0
	extern int biostramp_image_size;
	extern u_char biostramp_image[];
#endif

	XENPRINTK(("HYPERVISOR_shared_info %p\n", HYPERVISOR_shared_info));
#ifdef XENDEBUG_LOW
	xen_dbglow_init();
#endif

	cpu_probe_features(&cpu_info_primary);
	cpu_feature = cpu_info_primary.ci_feature_flags;

	/* not on Xen... these features are unusable or hypervisor-managed */
	cpu_feature &= ~(CPUID_PGE|CPUID_PSE|CPUID_MTRR|CPUID_FXSR);

	lwp0.l_addr = proc0paddr;
	cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb;

	XENPRINTK(("proc0paddr %p pcb %p first_avail %p\n",
	    proc0paddr, cpu_info_primary.ci_curpcb, (void *)first_avail));
	XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PTDpaddr,
	    (void *)atdevbase));

	x86_bus_space_init();
	consinit();	/* XXX SHOULD NOT BE DONE HERE */
	/*
	 * Initialize PAGE_SIZE-dependent variables.
+ */ + uvm_setpagesize(); + + /* + * Saving SSE registers won't work if the save area isn't + * 16-byte aligned. + */ + if (offsetof(struct user, u_pcb.pcb_savefpu) & 0xf) + panic("init386: pcb_savefpu not 16-byte aligned"); + + /* + * Start with 2 color bins -- this is just a guess to get us + * started. We'll recolor when we determine the largest cache + * sizes on the system. + */ + uvmexp.ncolors = 2; + +#if !defined(XEN) + /* + * BIOS leaves data in physical page 0 + * Even if it didn't, our VM system doesn't like using zero as a + * physical page number. + * We may also need pages in low memory (one each) for secondary CPU + * startup, for BIOS calls, and for ACPI, plus a page table page to map + * them into the first few pages of the kernel's pmap. + */ + avail_start = PAGE_SIZE; +#else + /* Make sure the end of the space used by the kernel is rounded. */ + first_avail = round_page(first_avail); + avail_start = first_avail - KERNBASE; + avail_end = ptoa(xen_start_info.nr_pages) + + (KERNTEXTOFF - KERNBASE_LOCORE); + pmap_pa_start = (KERNTEXTOFF - KERNBASE_LOCORE); + pmap_pa_end = avail_end; + mem_clusters[0].start = avail_start; + mem_clusters[0].size = avail_end - avail_start; + mem_cluster_cnt++; + physmem += atop(mem_clusters[0].size); +#endif + + /* + * reserve memory for real-mode call + */ + needs_earlier_install_pte0 = 0; + realmode_reserved_start = 0; + realmode_reserved_size = 0; +#if NBIOSCALL > 0 + /* save us a page for trampoline code */ + realmode_reserved_size += PAGE_SIZE; + needs_earlier_install_pte0 = 1; +#endif +#ifdef MULTIPROCESSOR /* XXX */ +#if !defined(XEN) + KASSERT(avail_start == PAGE_SIZE); /* XXX */ +#endif + if (realmode_reserved_size < MP_TRAMPOLINE) /* XXX */ + realmode_reserved_size = MP_TRAMPOLINE; /* XXX */ + needs_earlier_install_pte0 = 1; /* XXX */ +#endif /* XXX */ +#if NACPI > 0 + /* trampoline code for wake handler */ + realmode_reserved_size += ptoa(acpi_md_get_npages_of_wakecode()+1); + needs_earlier_install_pte0 = 1; 
+#endif + if (needs_earlier_install_pte0) { + /* page table for directory entry 0 */ + realmode_reserved_size += PAGE_SIZE; + } + if (realmode_reserved_size>0) { + realmode_reserved_start = avail_start; + avail_start += realmode_reserved_size; + } + +#ifdef DEBUG_MEMLOAD + printf("mem_cluster_count: %d\n", mem_cluster_cnt); +#endif + + /* + * Call pmap initialization to make new kernel address space. + * We must do this before loading pages into the VM system. + */ + pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE); + +#if !defined(XEN) +#if !defined(REALBASEMEM) && !defined(REALEXTMEM) + /* + * Check to see if we have a memory map from the BIOS (passed + * to us by the boot program. + */ + bim = lookup_bootinfo(BTINFO_MEMMAP); + if (bim != NULL && bim->num > 0) { +#ifdef DEBUG_MEMLOAD + printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num); +#endif + for (x = 0; x < bim->num; x++) { +#ifdef DEBUG_MEMLOAD + printf(" addr 0x%qx size 0x%qx type 0x%x\n", + bim->entry[x].addr, + bim->entry[x].size, + bim->entry[x].type); +#endif + + /* + * If the segment is not memory, skip it. + */ + switch (bim->entry[x].type) { + case BIM_Memory: + case BIM_ACPI: + case BIM_NVS: + break; + default: + continue; + } + + /* + * Sanity check the entry. + * XXX Need to handle uint64_t in extent code + * XXX and 64-bit physical addresses in i386 + * XXX port. + */ + seg_start = bim->entry[x].addr; + seg_end = bim->entry[x].addr + bim->entry[x].size; + + /* + * Avoid Compatibility Holes. + * XXX Holes within memory space that allow access + * XXX to be directed to the PC-compatible frame buffer + * XXX (0xa0000-0xbffff),to adapter ROM space + * XXX (0xc0000-0xdffff), and to system BIOS space + * XXX (0xe0000-0xfffff). + * XXX Some laptop(for example,Toshiba Satellite2550X) + * XXX report this area and occurred problems, + * XXX so we avoid this area. 
+ */ + if (seg_start < 0x100000 && seg_end > 0xa0000) { + printf("WARNING: memory map entry overlaps " + "with ``Compatibility Holes'': " + "0x%qx/0x%qx/0x%x\n", seg_start, + seg_end - seg_start, bim->entry[x].type); + add_mem_cluster(seg_start, 0xa0000, + bim->entry[x].type); + add_mem_cluster(0x100000, seg_end, + bim->entry[x].type); + } else + add_mem_cluster(seg_start, seg_end, + bim->entry[x].type); + } + } +#endif /* ! REALBASEMEM && ! REALEXTMEM */ + /* + * If the loop above didn't find any valid segment, fall back to + * former code. + */ + if (mem_cluster_cnt == 0) { + /* + * Allocate the physical addresses used by RAM from the iomem + * extent map. This is done before the addresses are + * page rounded just to make sure we get them all. + */ + if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), + EX_NOWAIT)) { + /* XXX What should we do? */ + printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM " + "IOMEM EXTENT MAP!\n"); + } + mem_clusters[0].start = 0; + mem_clusters[0].size = trunc_page(KBTOB(biosbasemem)); + physmem += atop(mem_clusters[0].size); + if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem), + EX_NOWAIT)) { + /* XXX What should we do? */ + printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM " + "IOMEM EXTENT MAP!\n"); + } +#if NISADMA > 0 + /* + * Some motherboards/BIOSes remap the 384K of RAM that would + * normally be covered by the ISA hole to the end of memory + * so that it can be used. However, on a 16M system, this + * would cause bounce buffers to be allocated and used. + * This is not desirable behaviour, as more than 384K of + * bounce buffers might be allocated. As a work-around, + * we round memory down to the nearest 1M boundary if + * we're using any isadma devices and the remapped memory + * is what puts us over 16M. 
+ */ + if (biosextmem > (15*1024) && biosextmem < (16*1024)) { + char pbuf[9]; + + format_bytes(pbuf, sizeof(pbuf), + biosextmem - (15*1024)); + printf("Warning: ignoring %s of remapped memory\n", + pbuf); + biosextmem = (15*1024); + } +#endif + mem_clusters[1].start = IOM_END; + mem_clusters[1].size = trunc_page(KBTOB(biosextmem)); + physmem += atop(mem_clusters[1].size); + + mem_cluster_cnt = 2; + + avail_end = IOM_END + trunc_page(KBTOB(biosextmem)); + } + /* + * If we have 16M of RAM or less, just put it all on + * the default free list. Otherwise, put the first + * 16M of RAM on a lower priority free list (so that + * all of the ISA DMA'able memory won't be eaten up + * first-off). + */ + if (avail_end <= (16 * 1024 * 1024)) + first16q = VM_FREELIST_DEFAULT; + else + first16q = VM_FREELIST_FIRST16; + + /* Make sure the end of the space used by the kernel is rounded. */ + first_avail = round_page(first_avail); +#endif + + XENPRINTK(("load the memory cluster %p(%d) - %p(%ld)\n", + (void *)avail_start, (int)atop(avail_start), + (void *)avail_end, (int)atop(avail_end))); + uvm_page_physload(atop(avail_start), atop(avail_end), + atop(avail_start), atop(avail_end), + VM_FREELIST_DEFAULT); + +#if !defined(XEN) + + /* + * Now, load the memory clusters (which have already been + * rounded and truncated) into the VM system. + * + * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL + * IS LOADED AT IOM_END (1M). + */ + for (x = 0; x < mem_cluster_cnt; x++) { + seg_start = mem_clusters[x].start; + seg_end = mem_clusters[x].start + mem_clusters[x].size; + seg_start1 = 0; + seg_end1 = 0; + + /* + * Skip memory before our available starting point. 
+ */ + if (seg_end <= avail_start) + continue; + + if (avail_start >= seg_start && avail_start < seg_end) { + if (seg_start != 0) + panic("init386: memory doesn't start at 0"); + seg_start = avail_start; + if (seg_start == seg_end) + continue; + } + + /* + * If this segment contains the kernel, split it + * in two, around the kernel. + */ + if (seg_start <= IOM_END && first_avail <= seg_end) { + seg_start1 = first_avail; + seg_end1 = seg_end; + seg_end = IOM_END; + } + + /* First hunk */ + if (seg_start != seg_end) { + if (seg_start < (16 * 1024 * 1024) && + first16q != VM_FREELIST_DEFAULT) { + u_int64_t tmp; + + if (seg_end > (16 * 1024 * 1024)) + tmp = (16 * 1024 * 1024); + else + tmp = seg_end; + + if (tmp != seg_start) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx " + "(0x%lx-0x%lx)\n", + seg_start, tmp, + atop(seg_start), atop(tmp)); +#endif + uvm_page_physload(atop(seg_start), + atop(tmp), atop(seg_start), + atop(tmp), first16q); + } + seg_start = tmp; + } + + if (seg_start != seg_end) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n", + seg_start, seg_end, + atop(seg_start), atop(seg_end)); +#endif + uvm_page_physload(atop(seg_start), + atop(seg_end), atop(seg_start), + atop(seg_end), VM_FREELIST_DEFAULT); + } + } + + /* Second hunk */ + if (seg_start1 != seg_end1) { + if (seg_start1 < (16 * 1024 * 1024) && + first16q != VM_FREELIST_DEFAULT) { + u_int64_t tmp; + + if (seg_end1 > (16 * 1024 * 1024)) + tmp = (16 * 1024 * 1024); + else + tmp = seg_end1; + + if (tmp != seg_start1) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx " + "(0x%lx-0x%lx)\n", + seg_start1, tmp, + atop(seg_start1), atop(tmp)); +#endif + uvm_page_physload(atop(seg_start1), + atop(tmp), atop(seg_start1), + atop(tmp), first16q); + } + seg_start1 = tmp; + } + + if (seg_start1 != seg_end1) { +#ifdef DEBUG_MEMLOAD + printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n", + seg_start1, seg_end1, + atop(seg_start1), atop(seg_end1)); +#endif + 
uvm_page_physload(atop(seg_start1), + atop(seg_end1), atop(seg_start1), + atop(seg_end1), VM_FREELIST_DEFAULT); + } + } + } +#endif + + /* + * Steal memory for the message buffer (at end of core). + */ + { + struct vm_physseg *vps; + psize_t sz = round_page(MSGBUFSIZE); + psize_t reqsz = sz; + + for (x = 0; x < vm_nphysseg; x++) { + vps = &vm_physmem[x]; + if (ptoa(vps->avail_end) == avail_end) + goto found; + } + panic("init386: can't find end of memory"); + + found: + /* Shrink so it'll fit in the last segment. */ + if ((vps->avail_end - vps->avail_start) < atop(sz)) + sz = ptoa(vps->avail_end - vps->avail_start); + + vps->avail_end -= atop(sz); + vps->end -= atop(sz); + msgbuf_paddr = ptoa(vps->avail_end); + + /* Remove the last segment if it now has no pages. */ + if (vps->start == vps->end) { + for (vm_nphysseg--; x < vm_nphysseg; x++) + vm_physmem[x] = vm_physmem[x + 1]; + } + + /* Now find where the new avail_end is. */ + for (avail_end = 0, x = 0; x < vm_nphysseg; x++) + if (vm_physmem[x].avail_end > avail_end) + avail_end = vm_physmem[x].avail_end; + avail_end = ptoa(avail_end); + + /* Warn if the message buffer had to be shrunk. */ + if (sz != reqsz) + printf("WARNING: %ld bytes not available for msgbuf " + "in last cluster (%ld used)\n", reqsz, sz); + } + + /* + * install PT page for the first 4M if needed. 
+ */ + if (needs_earlier_install_pte0) { + paddr_t paddr; +#ifdef DIAGNOSTIC + if (realmode_reserved_size < PAGE_SIZE) { + panic("cannot steal memory for first 4M PT page."); + } +#endif + paddr=realmode_reserved_start+realmode_reserved_size-PAGE_SIZE; + pmap_enter(pmap_kernel(), (vaddr_t)vtopte(0), paddr, + VM_PROT_READ|VM_PROT_WRITE, + PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE); + pmap_update(pmap_kernel()); + /* make sure it is clean before using */ + memset(vtopte(0), 0, PAGE_SIZE); + realmode_reserved_size -= PAGE_SIZE; + } + +#if NBIOSCALL > 0 + /* + * this should be caught at kernel build time, but put it here + * in case someone tries to fake it out... + */ +#ifdef DIAGNOSTIC + if (realmode_reserved_start > BIOSTRAMP_BASE || + (realmode_reserved_start+realmode_reserved_size) < (BIOSTRAMP_BASE+ + PAGE_SIZE)) { + panic("cannot steal memory for PT page of bioscall."); + } + if (biostramp_image_size > PAGE_SIZE) + panic("biostramp_image_size too big: %x vs. %x", + biostramp_image_size, PAGE_SIZE); +#endif + pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, /* virtual */ + (paddr_t)BIOSTRAMP_BASE, /* physical */ + VM_PROT_ALL); /* protection */ + pmap_update(pmap_kernel()); + memcpy((caddr_t)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size); +#ifdef DEBUG_BIOSCALL + printf("biostramp installed @ %x\n", BIOSTRAMP_BASE); +#endif + realmode_reserved_size -= PAGE_SIZE; + realmode_reserved_start += PAGE_SIZE; +#endif + +#if NACPI > 0 + /* + * Steal memory for the acpi wake code + */ + { + paddr_t paddr, p; + psize_t sz; + int npg; + + paddr = realmode_reserved_start; + npg = acpi_md_get_npages_of_wakecode(); + sz = ptoa(npg); +#ifdef DIAGNOSTIC + if (realmode_reserved_size < sz) { + panic("cannot steal memory for ACPI wake code."); + } +#endif + + /* identical mapping */ + p = paddr; + for (x=0; x<npg; x++) { + printf("kenter: 0x%08X\n", (unsigned)p); + pmap_kenter_pa((vaddr_t)p, p, VM_PROT_ALL); + p += PAGE_SIZE; + } + pmap_update(pmap_kernel()); + + 
acpi_md_install_wakecode(paddr); + + realmode_reserved_size -= sz; + realmode_reserved_start += sz; + } +#endif + + pmap_enter(pmap_kernel(), idt_vaddr, idt_paddr, + VM_PROT_READ|VM_PROT_WRITE, PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE); + pmap_update(pmap_kernel()); + memset((void *)idt_vaddr, 0, PAGE_SIZE); + +#if !defined(XEN) + idt = (struct gate_descriptor *)idt_vaddr; +#ifdef I586_CPU + pmap_enter(pmap_kernel(), pentium_idt_vaddr, idt_paddr, + VM_PROT_READ, PMAP_WIRED|VM_PROT_READ); + pentium_idt = (union descriptor *)pentium_idt_vaddr; +#endif +#endif + pmap_update(pmap_kernel()); + + initgdt(); + + HYPERVISOR_set_callbacks( + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback, + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); + +#if !defined(XEN) + tgdt = gdt; + gdt = (union descriptor *) + ((char *)idt + NIDT * sizeof (struct gate_descriptor)); + ldt = gdt + NGDT; + + memcpy(gdt, tgdt, NGDT*sizeof(*gdt)); + + setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1, + SDT_SYSLDT, SEL_KPL, 0, 0); +#else + ldt = (union descriptor *)idt_vaddr; +#endif + + /* make ldt gates and memory segments */ + setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1, + SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); + + ldt[LUCODE_SEL] = gdt[GUCODE_SEL]; + ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL]; + ldt[LUDATA_SEL] = gdt[GUDATA_SEL]; + ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; + +#if !defined(XEN) + /* exceptions */ + for (x = 0; x < 32; x++) { + setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386TGT, + (x == 3 || x == 4) ? 
SEL_UPL : SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + idt_allocmap[x] = 1; + } + + /* new-style interrupt gate for syscalls */ + setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386TGT, SEL_UPL, + GSEL(GCODE_SEL, SEL_KPL)); + idt_allocmap[128] = 1; +#ifdef COMPAT_SVR4 + setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386TGT, + SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); + idt_allocmap[0xd2] = 1; +#endif /* COMPAT_SVR4 */ +#endif + + memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT); + xen_idt_idx = 0; + for (x = 0; x < 32; x++) { + KASSERT(xen_idt_idx < MAX_XEN_IDT); + xen_idt[xen_idt_idx].vector = x; + xen_idt[xen_idt_idx].flags = + (x == 3 || x == 4) ? SEL_UPL : SEL_XEN; + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); + xen_idt[xen_idt_idx].address = + (uint32_t)IDTVEC(exceptions)[x]; + xen_idt_idx++; + } + KASSERT(xen_idt_idx < MAX_XEN_IDT); + xen_idt[xen_idt_idx].vector = 128; + xen_idt[xen_idt_idx].flags = SEL_UPL; + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); + xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall); + xen_idt_idx++; +#ifdef COMPAT_SVR4 + KASSERT(xen_idt_idx < MAX_XEN_IDT); + xen_idt[xen_idt_idx].vector = 0xd2; + xen_idt[xen_idt_idx].flags = SEL_UPL; + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); + xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap); + xen_idt_idx++; +#endif /* COMPAT_SVR4 */ + +#if !defined(XEN) + setregion(®ion, gdt, NGDT * sizeof(gdt[0]) - 1); + lgdt(®ion); +#else + lldt(GSEL(GLDT_SEL, SEL_KPL)); +#endif + +#if !defined(XEN) + cpu_init_idt(); +#else + db_trap_callback = ddb_trap_hook; + + XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt)); + if (HYPERVISOR_set_trap_table(xen_idt)) + panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt); +#endif + +#if NKSYMS || defined(DDB) || defined(LKM) + { + extern int end; + extern int *esym; + struct btinfo_symtab *symtab; + +#ifdef DDB + db_machine_init(); +#endif + + symtab = lookup_bootinfo(BTINFO_SYMTAB); + + if (symtab) { + symtab->ssym += 
KERNBASE; + symtab->esym += KERNBASE; + ksyms_init(symtab->nsym, (int *)symtab->ssym, + (int *)symtab->esym); + } + else + ksyms_init(*(int *)&end, ((int *)&end) + 1, esym); + } +#endif +#ifdef DDB + if (boothowto & RB_KDB) + Debugger(); +#endif +#ifdef IPKDB + ipkdb_init(); + if (boothowto & RB_KDB) + ipkdb_connect(0); +#endif +#ifdef KGDB + kgdb_port_init(); + if (boothowto & RB_KDB) { + kgdb_debug_init = 1; + kgdb_connect(1); + } +#endif + +#if NMCA > 0 + /* check for MCA bus, needed to be done before ISA stuff - if + * MCA is detected, ISA needs to use level triggered interrupts + * by default */ + mca_busprobe(); +#endif + +#if defined(XEN) + events_default_setup(); +#else + intr_default_setup(); +#endif + + /* Initialize software interrupts. */ + softintr_init(); + + splraise(IPL_IPI); + enable_intr(); + + if (physmem < btoc(2 * 1024 * 1024)) { + printf("warning: too little memory available; " + "have %lu bytes, want %lu bytes\n" + "running in degraded mode\n" + "press a key to confirm\n\n", + ptoa(physmem), 2*1024*1024UL); + cngetc(); + } + +#ifdef __HAVE_CPU_MAXPROC + /* Make sure maxproc is sane */ + if (maxproc > cpu_maxproc()) + maxproc = cpu_maxproc(); +#endif +} + +#ifdef COMPAT_NOMID +static int +exec_nomid(struct proc *p, struct exec_package *epp) +{ + int error; + u_long midmag, magic; + u_short mid; + struct exec *execp = epp->ep_hdr; + + /* check on validity of epp->ep_hdr performed by exec_out_makecmds */ + + midmag = ntohl(execp->a_midmag); + mid = (midmag >> 16) & 0xffff; + magic = midmag & 0xffff; + + if (magic == 0) { + magic = (execp->a_midmag & 0xffff); + mid = MID_ZERO; + } + + midmag = mid << 16 | magic; + + switch (midmag) { + case (MID_ZERO << 16) | ZMAGIC: + /* + * 386BSD's ZMAGIC format: + */ + error = exec_aout_prep_oldzmagic(p, epp); + break; + + case (MID_ZERO << 16) | QMAGIC: + /* + * BSDI's QMAGIC format: + * same as new ZMAGIC format, but with different magic number + */ + error = exec_aout_prep_zmagic(p, epp); + break; + + case 
(MID_ZERO << 16) | NMAGIC: + /* + * BSDI's NMAGIC format: + * same as NMAGIC format, but with different magic number + * and with text starting at 0. + */ + error = exec_aout_prep_oldnmagic(p, epp); + break; + + case (MID_ZERO << 16) | OMAGIC: + /* + * BSDI's OMAGIC format: + * same as OMAGIC format, but with different magic number + * and with text starting at 0. + */ + error = exec_aout_prep_oldomagic(p, epp); + break; + + default: + error = ENOEXEC; + } + + return error; +} +#endif + +/* + * cpu_exec_aout_makecmds(): + * CPU-dependent a.out format hook for execve(). + * + * Determine of the given exec package refers to something which we + * understand and, if so, set up the vmcmds for it. + * + * On the i386, old (386bsd) ZMAGIC binaries and BSDI QMAGIC binaries + * if COMPAT_NOMID is given as a kernel option. + */ +int +cpu_exec_aout_makecmds(struct proc *p, struct exec_package *epp) +{ + int error = ENOEXEC; + +#ifdef COMPAT_NOMID + if ((error = exec_nomid(p, epp)) == 0) + return error; +#endif /* ! COMPAT_NOMID */ + + return error; +} + +void * +lookup_bootinfo(int type) +{ + struct btinfo_common *help; + int n = *(int*)bootinfo; + help = (struct btinfo_common *)(bootinfo + sizeof(int)); + while(n--) { + if(help->type == type) + return(help); + help = (struct btinfo_common *)((char*)help + help->len); + } + return(0); +} + +#include <dev/ic/mc146818reg.h> /* for NVRAM POST */ +#include <i386/isa/nvram.h> /* for NVRAM POST */ + +void +cpu_reset() +{ + + disable_intr(); + +#if 0 + /* + * Ensure the NVRAM reset byte contains something vaguely sane. + */ + + outb(IO_RTC, NVRAM_RESET); + outb(IO_RTC+1, NVRAM_RESET_RST); + + /* + * The keyboard controller has 4 random output pins, one of which is + * connected to the RESET pin on the CPU in many PCs. We tell the + * keyboard controller to pulse this line a couple of times. 
+ */ + outb(IO_KBD + KBCMDP, KBC_PULSE0); + delay(100000); + outb(IO_KBD + KBCMDP, KBC_PULSE0); + delay(100000); +#endif + + HYPERVISOR_reboot(); + + for (;;); +} + +void +cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags) +{ + const struct trapframe *tf = l->l_md.md_regs; + __greg_t *gr = mcp->__gregs; + __greg_t ras_eip; + + /* Save register context. */ +#ifdef VM86 + if (tf->tf_eflags & PSL_VM) { + gr[_REG_GS] = tf->tf_vm86_gs; + gr[_REG_FS] = tf->tf_vm86_fs; + gr[_REG_ES] = tf->tf_vm86_es; + gr[_REG_DS] = tf->tf_vm86_ds; + gr[_REG_EFL] = get_vflags(l); + } else +#endif + { + gr[_REG_GS] = tf->tf_gs; + gr[_REG_FS] = tf->tf_fs; + gr[_REG_ES] = tf->tf_es; + gr[_REG_DS] = tf->tf_ds; + gr[_REG_EFL] = tf->tf_eflags; + } + gr[_REG_EDI] = tf->tf_edi; + gr[_REG_ESI] = tf->tf_esi; + gr[_REG_EBP] = tf->tf_ebp; + gr[_REG_EBX] = tf->tf_ebx; + gr[_REG_EDX] = tf->tf_edx; + gr[_REG_ECX] = tf->tf_ecx; + gr[_REG_EAX] = tf->tf_eax; + gr[_REG_EIP] = tf->tf_eip; + gr[_REG_CS] = tf->tf_cs; + gr[_REG_ESP] = tf->tf_esp; + gr[_REG_UESP] = tf->tf_esp; + gr[_REG_SS] = tf->tf_ss; + gr[_REG_TRAPNO] = tf->tf_trapno; + gr[_REG_ERR] = tf->tf_err; + + if ((ras_eip = (__greg_t)ras_lookup(l->l_proc, + (caddr_t) gr[_REG_EIP])) != -1) + gr[_REG_EIP] = ras_eip; + + *flags |= _UC_CPU; + + /* Save floating point register context, if any. */ + if ((l->l_md.md_flags & MDL_USEDFPU) != 0) { +#if NNPX > 0 + /* + * If this process is the current FP owner, dump its + * context to the PCB first. + * XXX npxsave() also clears the FPU state; depending on the + * XXX application this might be a penalty. 
+ */ + if (l->l_addr->u_pcb.pcb_fpcpu) { + npxsave_lwp(l, 1); + } +#endif + if (i386_use_fxsave) { + memcpy(&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm, + sizeof (mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm)); + *flags |= _UC_FXSAVE; + } else { + memcpy(&mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, + &l->l_addr->u_pcb.pcb_savefpu.sv_87, + sizeof (mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state)); + } +#if 0 + /* Apparently nothing ever touches this. */ + ucp->mcp.mc_fp.fp_emcsts = l->l_addr->u_pcb.pcb_saveemc; +#endif + *flags |= _UC_FPU; + } +} + +int +cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags) +{ + struct trapframe *tf = l->l_md.md_regs; + __greg_t *gr = mcp->__gregs; + + /* Restore register context, if any. */ + if ((flags & _UC_CPU) != 0) { +#ifdef VM86 + if (gr[_REG_EFL] & PSL_VM) { + tf->tf_vm86_gs = gr[_REG_GS]; + tf->tf_vm86_fs = gr[_REG_FS]; + tf->tf_vm86_es = gr[_REG_ES]; + tf->tf_vm86_ds = gr[_REG_DS]; + set_vflags(l, gr[_REG_EFL]); + if (flags & _UC_VM) { + void syscall_vm86(struct trapframe *); + l->l_proc->p_md.md_syscall = syscall_vm86; + } + } else +#endif + { + /* + * Check for security violations. If we're returning + * to protected mode, the CPU will validate the segment + * registers automatically and generate a trap on + * violations. We handle the trap, rather than doing + * all of the checking here. 
+ */ + if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) || + !USERMODE(gr[_REG_CS], gr[_REG_EFL])) { + printf("cpu_setmcontext error: uc EFL: 0x%08x" + " tf EFL: 0x%08x uc CS: 0x%x\n", + gr[_REG_EFL], tf->tf_eflags, gr[_REG_CS]); + return (EINVAL); + } + tf->tf_gs = gr[_REG_GS]; + tf->tf_fs = gr[_REG_FS]; + tf->tf_es = gr[_REG_ES]; + tf->tf_ds = gr[_REG_DS]; + /* Only change the user-alterable part of eflags */ + tf->tf_eflags &= ~PSL_USER; + tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER); + } + tf->tf_edi = gr[_REG_EDI]; + tf->tf_esi = gr[_REG_ESI]; + tf->tf_ebp = gr[_REG_EBP]; + tf->tf_ebx = gr[_REG_EBX]; + tf->tf_edx = gr[_REG_EDX]; + tf->tf_ecx = gr[_REG_ECX]; + tf->tf_eax = gr[_REG_EAX]; + tf->tf_eip = gr[_REG_EIP]; + tf->tf_cs = gr[_REG_CS]; + tf->tf_esp = gr[_REG_UESP]; + tf->tf_ss = gr[_REG_SS]; + } + + /* Restore floating point register context, if any. */ + if ((flags & _UC_FPU) != 0) { +#if NNPX > 0 + /* + * If we were using the FPU, forget that we were. + */ + if (l->l_addr->u_pcb.pcb_fpcpu != NULL) + npxsave_lwp(l, 0); +#endif + if (flags & _UC_FXSAVE) { + if (i386_use_fxsave) { + memcpy( + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm, + &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, + sizeof (&l->l_addr->u_pcb.pcb_savefpu.sv_xmm)); + } else { + /* This is a weird corner case */ + process_xmm_to_s87((struct savexmm *) + &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, + &l->l_addr->u_pcb.pcb_savefpu.sv_87); + } + } else { + if (i386_use_fxsave) { + process_s87_to_xmm((struct save87 *) + &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm); + } else { + memcpy(&l->l_addr->u_pcb.pcb_savefpu.sv_87, + &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, + sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_87)); + } + } + /* If not set already. */ + l->l_md.md_flags |= MDL_USEDFPU; +#if 0 + /* Apparently unused. 
*/ + l->l_addr->u_pcb.pcb_saveemc = mcp->mc_fp.fp_emcsts; +#endif + } + if (flags & _UC_SETSTACK) + l->l_proc->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK; + if (flags & _UC_CLRSTACK) + l->l_proc->p_sigctx.ps_sigstk.ss_flags &= ~SS_ONSTACK; + return (0); +} + +void +cpu_initclocks() +{ + (*initclock_func)(); +} + +#ifdef MULTIPROCESSOR +void +need_resched(struct cpu_info *ci) +{ + + if (ci->ci_want_resched) + return; + + ci->ci_want_resched = 1; + if ((ci)->ci_curlwp != NULL) + aston((ci)->ci_curlwp->l_proc); + else if (ci != curcpu()) + x86_send_ipi(ci, 0); +} +#endif + +/* + * Allocate an IDT vector slot within the given range. + * XXX needs locking to avoid MP allocation races. + */ + +int +idt_vec_alloc(int low, int high) +{ + int vec; + + simple_lock(&idt_lock); + for (vec = low; vec <= high; vec++) { + if (idt_allocmap[vec] == 0) { + idt_allocmap[vec] = 1; + simple_unlock(&idt_lock); + return vec; + } + } + simple_unlock(&idt_lock); + return 0; +} + +void +idt_vec_set(int vec, void (*function)(void)) +{ + /* + * Vector should be allocated, so no locking needed. + */ + KASSERT(idt_allocmap[vec] == 1); + setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); +} + +void +idt_vec_free(int vec) +{ + simple_lock(&idt_lock); + unsetgate(&idt[vec]); + idt_allocmap[vec] = 0; + simple_unlock(&idt_lock); +} + +/* + * Number of processes is limited by number of available GDT slots. + */ +int +cpu_maxproc(void) +{ +#ifdef USER_LDT + return ((MAXGDTSIZ - NGDT) / 2); +#else + return (MAXGDTSIZ - NGDT); +#endif +} + +#if defined(DDB) || defined(KGDB) + +/* + * Callback to output a backtrace when entering ddb. 
+ */ +void +ddb_trap_hook(int where) +{ + static int once = 0; + db_addr_t db_dot; + + if (once != 0 || where != 1) + return; + once = 1; + + if (curlwp != NULL) { + db_printf("Stopped"); + if (curproc == NULL) + db_printf("; curlwp = %p," + " curproc is NULL at\t", curlwp); + else + db_printf(" in pid %d.%d (%s) at\t", + curproc->p_pid, curlwp->l_lid, + curproc->p_comm); + } else + db_printf("Stopped at\t"); + db_dot = PC_REGS(DDB_REGS); + db_print_loc_and_inst(db_dot); + + db_stack_trace_print((db_expr_t) db_dot, FALSE, 65535, + "", db_printf); +#ifdef DEBUG + db_show_regs((db_expr_t) db_dot, FALSE, 65535, ""); +#endif +} + +#endif /* DDB || KGDB */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c new file mode 100644 index 0000000000..8e031eb242 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c @@ -0,0 +1,4522 @@ +/* $NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $ */ +/* NetBSD: pmap.c,v 1.172 2004/04/12 13:17:46 yamt Exp */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * pmap.c: i386 pmap module rewrite + * Chuck Cranor <chuck@ccrc.wustl.edu> + * 11-Aug-97 + * + * history of this pmap module: in addition to my own input, i used + * the following references for this rewrite of the i386 pmap: + * + * [1] the NetBSD i386 pmap. this pmap appears to be based on the + * BSD hp300 pmap done by Mike Hibler at University of Utah. + * it was then ported to the i386 by William Jolitz of UUNET + * Technologies, Inc. Then Charles M. Hannum of the NetBSD + * project fixed some bugs and provided some speed ups. + * + * [2] the FreeBSD i386 pmap. this pmap seems to be the + * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson + * and David Greenman. + * + * [3] the Mach pmap. this pmap, from CMU, seems to have migrated + * between several processors. the VAX version was done by + * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 + * version was done by Lance Berc, Mike Kupfer, Bob Baron, + * David Golub, and Richard Draves. the alpha version was + * done by Alessandro Forin (CMU/Mach) and Chris Demetriou + * (NetBSD/alpha). 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $"); + +#include "opt_cputype.h" +#include "opt_user_ldt.h" +#include "opt_largepages.h" +#include "opt_lockdebug.h" +#include "opt_multiprocessor.h" +#include "opt_kstack_dr0.h" +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/pool.h> +#include <sys/user.h> +#include <sys/kernel.h> + +#include <uvm/uvm.h> + +#include <machine/atomic.h> +#include <machine/cpu.h> +#include <machine/specialreg.h> +#include <machine/gdt.h> + +#include <dev/isa/isareg.h> +#include <machine/isa_machdep.h> + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/xenpmap.h> + +void xpmap_find_pte(paddr_t); + +/* #define XENDEBUG */ + +#ifdef XENDEBUG +#define XENPRINTF(x) printf x +#define XENPRINTK(x) printf x +#else +#define XENPRINTF(x) +#define XENPRINTK(x) +#endif +#define PRINTF(x) printf x +#define PRINTK(x) printf x + + +/* + * general info: + * + * - for an explanation of how the i386 MMU hardware works see + * the comments in <machine/pte.h>. + * + * - for an explanation of the general memory structure used by + * this pmap (including the recursive mapping), see the comments + * in <machine/pmap.h>. + * + * this file contains the code for the "pmap module." the module's + * job is to manage the hardware's virtual to physical address mappings. + * note that there are two levels of mapping in the VM system: + * + * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's + * to map ranges of virtual address space to objects/files. for + * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only + * to the file /bin/ls starting at offset zero." note that + * the upper layer mapping is not concerned with how individual + * vm_pages are mapped. + * + * [2] the lower layer of the VM system (the pmap) maintains the mappings + * from virtual addresses. 
it is concerned with which vm_page is + * mapped where. for example, when you run /bin/ls and start + * at page 0x1000 the fault routine may lookup the correct page + * of the /bin/ls file and then ask the pmap layer to establish + * a mapping for it. + * + * note that information in the lower layer of the VM system can be + * thrown away since it can easily be reconstructed from the info + * in the upper layer. + * + * data structures we use include: + * + * - struct pmap: describes the address space of one thread + * - struct pv_entry: describes one <PMAP,VA> mapping of a PA + * - struct pv_head: there is one pv_head per managed page of + * physical memory. the pv_head points to a list of pv_entry + * structures which describe all the <PMAP,VA> pairs that this + * page is mapped in. this is critical for page based operations + * such as pmap_page_protect() [change protection on _all_ mappings + * of a page] + * - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's. + * if we run out of pv_entry's we allocate a new pv_page and free + * its pv_entrys. + * - pmap_remove_record: a list of virtual addresses whose mappings + * have been changed. used for TLB flushing. + */ + +/* + * memory allocation + * + * - there are three data structures that we must dynamically allocate: + * + * [A] new process' page directory page (PDP) + * - plan 1: done at pmap_create() we use + * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this + * allocation. + * + * if we are low in free physical memory then we sleep in + * uvm_km_alloc -- in this case this is ok since we are creating + * a new pmap and should not be holding any locks. + * + * if the kernel is totally out of virtual space + * (i.e. uvm_km_alloc returns NULL), then we panic. + * + * XXX: the fork code currently has no way to return an "out of + * memory, try again" error code since uvm_fork [fka vm_fork] + * is a void function. 
+ * + * [B] new page tables pages (PTP) + * - call uvm_pagealloc() + * => success: zero page, add to pm_pdir + * => failure: we are out of free vm_pages, let pmap_enter() + * tell UVM about it. + * + * note: for kernel PTPs, we start with NKPTP of them. as we map + * kernel memory (at uvm_map time) we check to see if we've grown + * the kernel pmap. if so, we call the optional function + * pmap_growkernel() to grow the kernel PTPs in advance. + * + * [C] pv_entry structures + * - plan 1: try to allocate one off the free list + * => success: done! + * => failure: no more free pv_entrys on the list + * - plan 2: try to allocate a new pv_page to add a chunk of + * pv_entrys to the free list + * [a] obtain a free, unmapped, VA in kmem_map. either + * we have one saved from a previous call, or we allocate + * one now using a "vm_map_lock_try" in uvm_map + * => success: we have an unmapped VA, continue to [b] + * => failure: unable to lock kmem_map or out of VA in it. + * move on to plan 3. + * [b] allocate a page in kmem_object for the VA + * => success: map it in, free the pv_entry's, DONE! + * => failure: kmem_object locked, no free vm_pages, etc. + * save VA for later call to [a], go to plan 3. + * If we fail, we simply let pmap_enter() tell UVM about it. + */ + +/* + * locking + * + * we have the following locks that we must contend with: + * + * "normal" locks: + * + * - pmap_main_lock + * this lock is used to prevent deadlock and/or provide mutex + * access to the pmap system. most operations lock the pmap + * structure first, then they lock the pv_lists (if needed). + * however, some operations such as pmap_page_protect lock + * the pv_lists and then lock pmaps. in order to prevent a + * cycle, we require a mutex lock when locking the pv_lists + * first. thus, the "pmap = >pv_list" lockers must gain a + * read-lock on pmap_main_lock before locking the pmap. and + * the "pv_list => pmap" lockers must gain a write-lock on + * pmap_main_lock before locking. 
since only one thread + * can write-lock a lock at a time, this provides mutex. + * + * "simple" locks: + * + * - pmap lock (per pmap, part of uvm_object) + * this lock protects the fields in the pmap structure including + * the non-kernel PDEs in the PDP, and the PTEs. it also locks + * in the alternate PTE space (since that is determined by the + * entry in the PDP). + * + * - pvh_lock (per pv_head) + * this lock protects the pv_entry list which is chained off the + * pv_head structure for a specific managed PA. it is locked + * when traversing the list (e.g. adding/removing mappings, + * syncing R/M bits, etc.) + * + * - pvalloc_lock + * this lock protects the data structures which are used to manage + * the free list of pv_entry structures. + * + * - pmaps_lock + * this lock protects the list of active pmaps (headed by "pmaps"). + * we lock it when adding or removing pmaps from this list. + * + */ + +/* + * locking data structures + */ + +static struct simplelock pvalloc_lock; +static struct simplelock pmaps_lock; + +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) +static struct lock pmap_main_lock; + +#define PMAP_MAP_TO_HEAD_LOCK() \ + (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL) +#define PMAP_MAP_TO_HEAD_UNLOCK() \ + (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL) + +#define PMAP_HEAD_TO_MAP_LOCK() \ + (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL) +#define PMAP_HEAD_TO_MAP_UNLOCK() \ + spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0) + +#else + +#define PMAP_MAP_TO_HEAD_LOCK() /* null */ +#define PMAP_MAP_TO_HEAD_UNLOCK() /* null */ + +#define PMAP_HEAD_TO_MAP_LOCK() /* null */ +#define PMAP_HEAD_TO_MAP_UNLOCK() /* null */ + +#endif + +#define COUNT(x) /* nothing */ + +/* + * TLB Shootdown: + * + * When a mapping is changed in a pmap, the TLB entry corresponding to + * the virtual address must be invalidated on all processors. 
In order + * to accomplish this on systems with multiple processors, messages are + * sent from the processor which performs the mapping change to all + * processors on which the pmap is active. For other processors, the + * ASN generation numbers for that processor is invalidated, so that + * the next time the pmap is activated on that processor, a new ASN + * will be allocated (which implicitly invalidates all TLB entries). + * + * Shootdown job queue entries are allocated using a simple special- + * purpose allocator for speed. + */ +struct pmap_tlb_shootdown_job { + TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list; + vaddr_t pj_va; /* virtual address */ + pmap_t pj_pmap; /* the pmap which maps the address */ + pt_entry_t pj_pte; /* the PTE bits */ + struct pmap_tlb_shootdown_job *pj_nextfree; +}; + +#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 32 +union pmap_tlb_shootdown_job_al { + struct pmap_tlb_shootdown_job pja_job; + char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN]; +}; + +struct pmap_tlb_shootdown_q { + TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head; + int pq_pte; /* aggregate PTE bits */ + int pq_count; /* number of pending requests */ + __cpu_simple_lock_t pq_slock; /* spin lock on queue */ + int pq_flushg; /* pending flush global */ + int pq_flushu; /* pending flush user */ +} pmap_tlb_shootdown_q[X86_MAXPROCS]; + +#define PMAP_TLB_MAXJOBS 16 + +void pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *); +struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get + (struct pmap_tlb_shootdown_q *); +void pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *, + struct pmap_tlb_shootdown_job *); + +__cpu_simple_lock_t pmap_tlb_shootdown_job_lock; +union pmap_tlb_shootdown_job_al *pj_page, *pj_free; + +/* + * global data structures + */ + +struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ + +/* + * nkpde is the number of kernel PTPs allocated for the kernel at + * boot time (NKPTP is a compile time override). 
this number can + * grow dynamically as needed (but once allocated, we never free + * kernel PTPs). + */ + +int nkpde = NKPTP; +#ifdef NKPDE +#error "obsolete NKPDE: use NKPTP" +#endif + +/* + * pmap_pg_g: if our processor supports PG_G in the PTE then we + * set pmap_pg_g to PG_G (otherwise it is zero). + */ + +int pmap_pg_g = 0; + +#ifdef LARGEPAGES +/* + * pmap_largepages: if our processor supports PG_PS and we are + * using it, this is set to TRUE. + */ + +int pmap_largepages; +#endif + +/* + * i386 physical memory comes in a big contig chunk with a small + * hole toward the front of it... the following two paddr_t's + * (shared with machdep.c) describe the physical address space + * of this machine. + */ +paddr_t avail_start; /* PA of first available physical page */ +paddr_t avail_end; /* PA of last available physical page */ + +paddr_t pmap_pa_start; /* PA of first physical page for this domain */ +paddr_t pmap_pa_end; /* PA of last physical page for this domain */ + + /* MA of last physical page of the machine */ +paddr_t pmap_mem_end = HYPERVISOR_VIRT_START; /* updated for domain-0 */ + +/* + * other data structures + */ + +static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ +static boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */ + +/* + * the following two vaddr_t's are used during system startup + * to keep track of how much of the kernel's VM space we have used. + * once the system is started, the management of the remaining kernel + * VM space is turned over to the kernel_map vm_map. 
+ */ + +static vaddr_t virtual_avail; /* VA of first free KVA */ +static vaddr_t virtual_end; /* VA of last free KVA */ + + +/* + * pv_page management structures: locked by pvalloc_lock + */ + +TAILQ_HEAD(pv_pagelist, pv_page); +static struct pv_pagelist pv_freepages; /* list of pv_pages with free entrys */ +static struct pv_pagelist pv_unusedpgs; /* list of unused pv_pages */ +static int pv_nfpvents; /* # of free pv entries */ +static struct pv_page *pv_initpage; /* bootstrap page from kernel_map */ +static vaddr_t pv_cachedva; /* cached VA for later use */ + +#define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */ +#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2)) + /* high water mark */ + +static __inline int +pv_compare(struct pv_entry *a, struct pv_entry *b) +{ + if (a->pv_pmap < b->pv_pmap) + return (-1); + else if (a->pv_pmap > b->pv_pmap) + return (1); + else if (a->pv_va < b->pv_va) + return (-1); + else if (a->pv_va > b->pv_va) + return (1); + else + return (0); +} + +SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare); +SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare); + +/* + * linked list of all non-kernel pmaps + */ + +static struct pmap_head pmaps; + +/* + * pool that pmap structures are allocated from + */ + +struct pool pmap_pmap_pool; + +/* + * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a + * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing + * due to false sharing. 
+ */ + +#ifdef MULTIPROCESSOR +#define PTESLEW(pte, id) ((pte)+(id)*NPTECL) +#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) +#else +#define PTESLEW(pte, id) (pte) +#define VASLEW(va,id) (va) +#endif + +/* + * special VAs and the PTEs that map them + */ +static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte; +static caddr_t csrcp, cdstp, zerop, ptpp; + +/* + * pool and cache that PDPs are allocated from + */ + +struct pool pmap_pdp_pool; +struct pool_cache pmap_pdp_cache; +u_int pmap_pdp_cache_generation; + +int pmap_pdp_ctor(void *, void *, int); +void pmap_pdp_dtor(void *, void *); + +caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ + +extern vaddr_t msgbuf_vaddr; +extern paddr_t msgbuf_paddr; + +extern vaddr_t idt_vaddr; /* we allocate IDT early */ +extern paddr_t idt_paddr; + +#if defined(I586_CPU) +/* stuff to fix the pentium f00f bug */ +extern vaddr_t pentium_idt_vaddr; +#endif + + +/* + * local prototypes + */ + +static struct pv_entry *pmap_add_pvpage(struct pv_page *, boolean_t); +static struct vm_page *pmap_alloc_ptp(struct pmap *, int); +static struct pv_entry *pmap_alloc_pv(struct pmap *, int); /* see codes below */ +#define ALLOCPV_NEED 0 /* need PV now */ +#define ALLOCPV_TRY 1 /* just try to allocate, don't steal */ +#define ALLOCPV_NONEED 2 /* don't need PV, just growing cache */ +static struct pv_entry *pmap_alloc_pvpage(struct pmap *, int); +static void pmap_enter_pv(struct pv_head *, + struct pv_entry *, struct pmap *, + vaddr_t, struct vm_page *); +static void pmap_free_pv(struct pmap *, struct pv_entry *); +static void pmap_free_pvs(struct pmap *, struct pv_entry *); +static void pmap_free_pv_doit(struct pv_entry *); +static void pmap_free_pvpage(void); +static struct vm_page *pmap_get_ptp(struct pmap *, int); +static boolean_t pmap_is_curpmap(struct pmap *); +static boolean_t pmap_is_active(struct pmap *, int); +static pt_entry_t *pmap_map_ptes(struct pmap *); +static struct pv_entry 
*pmap_remove_pv(struct pv_head *, struct pmap *, + vaddr_t); +static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); +static boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, + pt_entry_t *, vaddr_t, int32_t *, int); +static void pmap_remove_ptes(struct pmap *, struct vm_page *, + vaddr_t, vaddr_t, vaddr_t, int32_t *, + int); +#define PMAP_REMOVE_ALL 0 /* remove all mappings */ +#define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ + +static vaddr_t pmap_tmpmap_pa(paddr_t); +static pt_entry_t *pmap_tmpmap_pvepte(struct pv_entry *); +static void pmap_tmpunmap_pa(void); +static void pmap_tmpunmap_pvepte(struct pv_entry *); +static void pmap_unmap_ptes(struct pmap *); + +static boolean_t pmap_reactivate(struct pmap *); + +#ifdef DEBUG +u_int curapdp; +#endif + +/* + * p m a p i n l i n e h e l p e r f u n c t i o n s + */ + +/* + * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? + * of course the kernel is always loaded + */ + +__inline static boolean_t +pmap_is_curpmap(pmap) + struct pmap *pmap; +{ + + return((pmap == pmap_kernel()) || + (pmap == curcpu()->ci_pmap)); +} + +/* + * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 
+ */ + +__inline static boolean_t +pmap_is_active(pmap, cpu_id) + struct pmap *pmap; + int cpu_id; +{ + + return (pmap == pmap_kernel() || + (pmap->pm_cpus & (1U << cpu_id)) != 0); +} + +/* + * pmap_tmpmap_pa: map a page in for tmp usage + */ + +__inline static vaddr_t +pmap_tmpmap_pa(pa) + paddr_t pa; +{ +#ifdef MULTIPROCESSOR + int id = cpu_number(); +#endif + pt_entry_t *ptpte = PTESLEW(ptp_pte, id); + pt_entry_t *maptp; + caddr_t ptpva = VASLEW(ptpp, id); +#if defined(DIAGNOSTIC) + if (*ptpte) + panic("pmap_tmpmap_pa: ptp_pte in use?"); +#endif + maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte); + PTE_SET(ptpte, maptp, PG_V | PG_RW | pa); /* always a new mapping */ + return((vaddr_t)ptpva); +} + +/* + * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa) + */ + +__inline static void +pmap_tmpunmap_pa() +{ +#ifdef MULTIPROCESSOR + int id = cpu_number(); +#endif + pt_entry_t *ptpte = PTESLEW(ptp_pte, id); + pt_entry_t *maptp; + caddr_t ptpva = VASLEW(ptpp, id); +#if defined(DIAGNOSTIC) + if (!pmap_valid_entry(*ptp_pte)) + panic("pmap_tmpunmap_pa: our pte invalid?"); +#endif + maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte); + PTE_CLEAR(ptpte, maptp); /* zap! */ + pmap_update_pg((vaddr_t)ptpva); +#ifdef MULTIPROCESSOR + /* + * No need for tlb shootdown here, since ptp_pte is per-CPU. + */ +#endif +} + +/* + * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry + * + * => do NOT use this on kernel mappings [why? because pv_ptp may be NULL] + */ + +__inline static pt_entry_t * +pmap_tmpmap_pvepte(pve) + struct pv_entry *pve; +{ +#ifdef DIAGNOSTIC + if (pve->pv_pmap == pmap_kernel()) + panic("pmap_tmpmap_pvepte: attempt to map kernel"); +#endif + + /* is it current pmap? use direct mapping... 
*/ + if (pmap_is_curpmap(pve->pv_pmap)) + return(vtopte(pve->pv_va)); + + return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp))) + + ptei((unsigned)pve->pv_va)); +} + +/* + * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte + */ + +__inline static void +pmap_tmpunmap_pvepte(pve) + struct pv_entry *pve; +{ + /* was it current pmap? if so, return */ + if (pmap_is_curpmap(pve->pv_pmap)) + return; + + pmap_tmpunmap_pa(); +} + +__inline static void +pmap_apte_flush(struct pmap *pmap) +{ +#if defined(MULTIPROCESSOR) + struct pmap_tlb_shootdown_q *pq; + struct cpu_info *ci, *self = curcpu(); + CPU_INFO_ITERATOR cii; + int s; +#endif + + tlbflush(); /* flush TLB on current processor */ +#if defined(MULTIPROCESSOR) + /* + * Flush the APTE mapping from all other CPUs that + * are using the pmap we are using (who's APTE space + * is the one we've just modified). + * + * XXXthorpej -- find a way to defer the IPI. + */ + for (CPU_INFO_FOREACH(cii, ci)) { + if (ci == self) + continue; + if (pmap_is_active(pmap, ci->ci_cpuid)) { + pq = &pmap_tlb_shootdown_q[ci->ci_cpuid]; + s = splipi(); + __cpu_simple_lock(&pq->pq_slock); + pq->pq_flushu++; + __cpu_simple_unlock(&pq->pq_slock); + splx(s); + x86_send_ipi(ci, X86_IPI_TLB); + } + } +#endif +} + +/* + * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in + * + * => we lock enough pmaps to keep things locked in + * => must be undone with pmap_unmap_ptes before returning + */ + +__inline static pt_entry_t * +pmap_map_ptes(pmap) + struct pmap *pmap; +{ + pd_entry_t opde; + pd_entry_t *mapdp; + struct pmap *ourpmap; + struct cpu_info *ci; + + /* the kernel's pmap is always accessible */ + if (pmap == pmap_kernel()) { + return(PTE_BASE); + } + + ci = curcpu(); + if (ci->ci_want_pmapload && + vm_map_pmap(&ci->ci_curlwp->l_proc->p_vmspace->vm_map) == pmap) + pmap_load(); + + /* if curpmap then we are always mapped */ + if (pmap_is_curpmap(pmap)) { + simple_lock(&pmap->pm_obj.vmobjlock); + 
return(PTE_BASE); + } + + ourpmap = ci->ci_pmap; + + /* need to lock both curpmap and pmap: use ordered locking */ + if ((unsigned) pmap < (unsigned) ourpmap) { + simple_lock(&pmap->pm_obj.vmobjlock); + simple_lock(&ourpmap->pm_obj.vmobjlock); + } else { + simple_lock(&ourpmap->pm_obj.vmobjlock); + simple_lock(&pmap->pm_obj.vmobjlock); + } + + /* need to load a new alternate pt space into curpmap? */ + COUNT(apdp_pde_map); + opde = PDE_GET(APDP_PDE); + if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) { + XENPRINTF(("APDP_PDE %p %p/%p set %p/%p\n", + pmap, + (void *)vtophys((vaddr_t)APDP_PDE), + (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)), + (void *)pmap->pm_pdirpa, + (void *)xpmap_ptom(pmap->pm_pdirpa))); + mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE); + PDE_SET(APDP_PDE, mapdp, pmap->pm_pdirpa /* | PG_RW */ | PG_V); +#ifdef DEBUG + curapdp = pmap->pm_pdirpa; +#endif + if (pmap_valid_entry(opde)) + pmap_apte_flush(ourpmap); + XENPRINTF(("APDP_PDE set done\n")); + } + return(APTE_BASE); +} + +/* + * pmap_unmap_ptes: unlock the PTE mapping of "pmap" + */ + +__inline static void +pmap_unmap_ptes(pmap) + struct pmap *pmap; +{ +#if defined(MULTIPROCESSOR) + pd_entry_t *mapdp; +#endif + + if (pmap == pmap_kernel()) { + return; + } + if (pmap_is_curpmap(pmap)) { + simple_unlock(&pmap->pm_obj.vmobjlock); + } else { + struct pmap *ourpmap = curcpu()->ci_pmap; + +#if defined(MULTIPROCESSOR) + mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE); + PDE_CLEAR(APDP_PDE, mapdp); + pmap_apte_flush(ourpmap); +#endif +#ifdef DEBUG + curapdp = 0; +#endif + XENPRINTF(("APDP_PDE clear %p/%p set %p/%p\n", + (void *)vtophys((vaddr_t)APDP_PDE), + (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)), + (void *)pmap->pm_pdirpa, + (void *)xpmap_ptom(pmap->pm_pdirpa))); + COUNT(apdp_pde_unmap); + simple_unlock(&pmap->pm_obj.vmobjlock); + simple_unlock(&ourpmap->pm_obj.vmobjlock); + } +} + +__inline static void +pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, 
pt_entry_t npte) +{ + if (curproc == NULL || curproc->p_vmspace == NULL || + pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) + return; + + if ((opte ^ npte) & PG_X) + pmap_update_pg(va); + + /* + * Executability was removed on the last executable change. + * Reset the code segment to something conservative and + * let the trap handler deal with setting the right limit. + * We can't do that because of locking constraints on the vm map. + */ + + if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { + struct trapframe *tf = curlwp->l_md.md_regs; + struct pcb *pcb = &curlwp->l_addr->u_pcb; + + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); + pm->pm_hiexec = I386_MAX_EXE_ADDR; + } +} + +__inline static pt_entry_t +pte_mtop(pt_entry_t pte) +{ + pt_entry_t ppte; + + KDASSERT(pmap_valid_entry(pte)); + ppte = xpmap_mtop(pte); + if ((ppte & PG_FRAME) == XPMAP_OFFSET) { + XENPRINTF(("pte_mtop: null page %08x -> %08x\n", + ppte, pte)); + ppte = pte; + } + + return ppte; +} + +__inline static pt_entry_t +pte_get_ma(pt_entry_t *pte) +{ + + return *pte; +} + +__inline static pt_entry_t +pte_get(pt_entry_t *pte) +{ + + if (pmap_valid_entry(*pte)) + return pte_mtop(*pte); + return *pte; +} + +__inline static pt_entry_t +pte_atomic_update_ma(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte) +{ + pt_entry_t opte; + + XENPRINTK(("pte_atomic_update_ma pte %p mapte %p npte %08x\n", + pte, mapte, npte)); + opte = PTE_GET_MA(pte); + if (opte > pmap_mem_end) { + /* must remove opte unchecked */ + if (npte > pmap_mem_end) + /* must set npte unchecked */ + xpq_queue_unchecked_pte_update(mapte, npte); + else { + /* must set npte checked */ + xpq_queue_unchecked_pte_update(mapte, 0); + xpq_queue_pte_update(mapte, npte); + } + } else { + /* must remove opte checked */ + if (npte > pmap_mem_end) { + /* must set npte unchecked */ + xpq_queue_pte_update(mapte, 0); + xpq_queue_unchecked_pte_update(mapte, npte); + } else + /* must set npte checked */ + 
xpq_queue_pte_update(mapte, npte); + } + xpq_flush_queue(); + + return opte; +} + +__inline static pt_entry_t +pte_atomic_update(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte) +{ + pt_entry_t opte; + + opte = pte_atomic_update_ma(pte, mapte, npte); + + return pte_mtop(opte); +} + +/* + * Fixup the code segment to cover all potential executable mappings. + * returns 0 if no changes to the code segment were made. + */ + +int +pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) +{ + struct vm_map_entry *ent; + struct pmap *pm = vm_map_pmap(map); + vaddr_t va = 0; + + vm_map_lock_read(map); + for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { + + /* + * This entry has greater va than the entries before. + * We need to make it point to the last page, not past it. + */ + + if (ent->protection & VM_PROT_EXECUTE) + va = trunc_page(ent->end) - PAGE_SIZE; + } + vm_map_unlock_read(map); + if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) + return (0); + + pm->pm_hiexec = va; + if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); + } else { + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); + return (0); + } + return (1); +} + +/* + * p m a p k e n t e r f u n c t i o n s + * + * functions to quickly enter/remove pages from the kernel address + * space. pmap_kremove is exported to MI kernel. we make use of + * the recursive PTE mappings. + */ + +/* + * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking + * + * => no need to lock anything, assume va is already allocated + * => should be faster than normal pmap enter function + */ + +void +pmap_kenter_pa(va, pa, prot) + vaddr_t va; + paddr_t pa; + vm_prot_t prot; +{ + pt_entry_t *pte, opte, npte; + pt_entry_t *maptp; + + if (va < VM_MIN_KERNEL_ADDRESS) + pte = vtopte(va); + else + pte = kvtopte(va); + + npte = ((prot & VM_PROT_WRITE) ? 
PG_RW : PG_RO) | + PG_V | pmap_pg_g; + + if (pa >= pmap_pa_start && pa < pmap_pa_end) { + npte |= xpmap_ptom(pa); + } else { + XENPRINTF(("pmap_kenter: va %08lx outside pa range %08lx\n", + va, pa)); + npte |= pa; + } + + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */ + XENPRINTK(("pmap_kenter_pa(%p,%p) %p, was %08x now %08x\n", (void *)va, + (void *)pa, pte, opte, npte)); +#ifdef LARGEPAGES + /* XXX For now... */ + if (opte & PG_PS) + panic("pmap_kenter_pa: PG_PS"); +#endif + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; + + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. */ + pmap_update_pg(va); +#endif + } +} + +/* + * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking + * + * => no need to lock anything, assume va is already allocated + * => should be faster than normal pmap enter function + */ + +void pmap_kenter_ma __P((vaddr_t, paddr_t, vm_prot_t)); + +void +pmap_kenter_ma(va, ma, prot) + vaddr_t va; + paddr_t ma; + vm_prot_t prot; +{ + pt_entry_t *pte, opte, npte; + pt_entry_t *maptp; + + KASSERT (va >= VM_MIN_KERNEL_ADDRESS); + pte = kvtopte(va); + + npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) | + PG_V | pmap_pg_g; + + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */ + XENPRINTK(("pmap_kenter_ma(%p,%p) %p, was %08x\n", (void *)va, + (void *)ma, pte, opte)); +#ifdef LARGEPAGES + /* XXX For now... */ + if (opte & PG_PS) + panic("pmap_kenter_ma: PG_PS"); +#endif + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; + + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. 
*/ + pmap_update_pg(va); +#endif + } +} + +/* + * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking + * + * => no need to lock anything + * => caller must dispose of any vm_page mapped in the va range + * => note: not an inline function + * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE + * => we assume kernel only unmaps valid addresses and thus don't bother + * checking the valid bit before doing TLB flushing + */ + +void +pmap_kremove(va, len) + vaddr_t va; + vsize_t len; +{ + pt_entry_t *pte, opte; + pt_entry_t *maptp; + int32_t cpumask = 0; + + XENPRINTK(("pmap_kremove va %p, len %08lx\n", (void *)va, len)); + len >>= PAGE_SHIFT; + for ( /* null */ ; len ; len--, va += PAGE_SIZE) { + if (va < VM_MIN_KERNEL_ADDRESS) + pte = vtopte(va); + else + pte = kvtopte(va); + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update_ma(pte, maptp, 0); /* zap! */ + XENPRINTK(("pmap_kremove pte %p, was %08x\n", pte, opte)); +#ifdef LARGEPAGES + /* XXX For now... */ + if (opte & PG_PS) + panic("pmap_kremove: PG_PS"); +#endif +#ifdef DIAGNOSTIC + if (opte & PG_PVLIST) + panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", + va); +#endif + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); + } + pmap_tlb_shootnow(cpumask); +} + +/* + * p m a p i n i t f u n c t i o n s + * + * pmap_bootstrap and pmap_init are called during system startup + * to init the pmap module. pmap_bootstrap() does a low level + * init just to get things rolling. pmap_init() finishes the job. + */ + +/* + * pmap_bootstrap: get the system in a state where it can run with VM + * properly enabled (called before main()). the VM system is + * fully init'd later... + * + * => on i386, locore.s has already enabled the MMU by allocating + * a PDP for the kernel, and nkpde PTP's for the kernel. 
 * => kva_start is the first free virtual address in kernel space
 */

void
pmap_bootstrap(kva_start)
	vaddr_t kva_start;
{
	struct pmap *kpm;
	vaddr_t kva;
	pt_entry_t *pte;
	pt_entry_t *maptp;	/* machine address of a PTE, for Xen updates */
	int i;

	/*
	 * set up our local static global vars that keep track of the
	 * usage of KVM before kernel_map is set up
	 */

	virtual_avail = kva_start;		/* first free KVA */
	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */

	/*
	 * find out where physical memory ends on the real hardware.
	 * (only meaningful for a privileged / dom0 guest)
	 */

	if (xen_start_info.flags & SIF_PRIVILEGED)
		pmap_mem_end = find_pmap_mem_end(kva_start);

	/*
	 * set up protection_codes: we need to be able to convert from
	 * a MI protection code (some combo of VM_PROT...) to something
	 * we can jam into a i386 PTE.
	 */

	protection_codes[VM_PROT_NONE] = 0;			/* --- */
	protection_codes[VM_PROT_EXECUTE] = PG_X;		/* --x */
	protection_codes[VM_PROT_READ] = PG_RO;			/* -r- */
	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO|PG_X;/* -rx */
	protection_codes[VM_PROT_WRITE] = PG_RW;		/* w-- */
	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW|PG_X;/* w-x */
	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW;	/* wr- */
	protection_codes[VM_PROT_ALL] = PG_RW|PG_X;		/* wrx */

	/*
	 * now we init the kernel's pmap
	 *
	 * the kernel pmap's pm_obj is not used for much.   however, in
	 * user pmaps the pm_obj contains the list of active PTPs.
	 * the pm_obj currently does not have a pager.   it might be possible
	 * to add a pager that would allow a process to read-only mmap its
	 * own page tables (fast user level vtophys?).   this may or may not
	 * be useful.
	 */

	kpm = pmap_kernel();
	simple_lock_init(&kpm->pm_obj.vmobjlock);
	kpm->pm_obj.pgops = NULL;
	TAILQ_INIT(&kpm->pm_obj.memq);
	kpm->pm_obj.uo_npages = 0;
	kpm->pm_obj.uo_refs = 1;
	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
	/* the boot PDP is the one Xen handed us, recorded in lwp0's pcb */
	kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
	XENPRINTF(("pm_pdirpa %p PTDpaddr %p\n",
	    (void *)lwp0.l_addr->u_pcb.pcb_cr3, (void *)PTDpaddr));
	kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3;
	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
	    x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);

	/*
	 * the above is just a rough estimate and not critical to the proper
	 * operation of the system.
	 */

	/*
	 * Begin to enable global TLB entries if they are supported.
	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
	 * which happens in cpu_init(), which is run on each cpu
	 * (and happens later)
	 */

	if (cpu_feature & CPUID_PGE) {
		pmap_pg_g = PG_G;		/* enable software */

		/* add PG_G attribute to already mapped kernel pages */
		/*
		 * NOTE(review): the preprocessor nesting below looks wrong
		 * for the !XEN case -- the closing brace of the
		 * pmap_valid_entry() block and PTE_UPDATES_FLUSH() live
		 * inside the #else arm only, so a !XEN build would not
		 * compile.  Harmless here (XEN is always defined for this
		 * file) but worth confirming/cleaning up.
		 */
		for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
		     kva += PAGE_SIZE)
			if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) {
#if !defined(XEN)
				PTE_BASE[x86_btop(kva)] |= PG_G;
#else
				/* PTE writes must go through the hypervisor */
				maptp = (pt_entry_t *)vtomach(
				    (vaddr_t)&PTE_BASE[x86_btop(kva)]);
				PTE_SETBITS(&PTE_BASE[x86_btop(kva)], maptp,
				    PG_G);
			}
		PTE_UPDATES_FLUSH();
#endif
	}

#ifdef LARGEPAGES
	/*
	 * enable large pages if they are supported.
	 */

	if (cpu_feature & CPUID_PSE) {
		paddr_t pa;
		vaddr_t kva_end;
		pd_entry_t *pde;
		pd_entry_t *mapdp;	/* machine address of the PDE */
		extern char _etext;

		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
		pmap_largepages = 1;	/* enable software */

		/*
		 * the TLB must be flushed after enabling large pages
		 * on Pentium CPUs, according to section 3.6.2.2 of
		 * "Intel Architecture Software Developer's Manual,
		 * Volume 3: System Programming".
		 */
		tlbflush();

		/*
		 * now, remap the kernel text using large pages.  we
		 * assume that the linker has properly aligned the
		 * .data segment to a 4MB boundary.
		 */
		kva_end = roundup((vaddr_t)&_etext, NBPD);
		for (pa = 0, kva = KERNBASE; kva < kva_end;
		     kva += NBPD, pa += NBPD) {
			pde = &kpm->pm_pdir[pdei(kva)];
			mapdp = (pt_entry_t *)vtomach((vaddr_t)pde);
			PDE_SET(pde, mapdp, pa | pmap_pg_g | PG_PS |
			    PG_KR | PG_V);	/* zap! */
			tlbflush();
		}
	}
#endif /* LARGEPAGES */

	/*
	 * now we allocate the "special" VAs which are used for tmp mappings
	 * by the pmap (and other modules).    we allocate the VAs by advancing
	 * virtual_avail (note that there are no pages mapped at these VAs).
	 * we find the PTE that maps the allocated VA via the linear PTE
	 * mapping.
	 */

	pte = PTE_BASE + x86_btop(virtual_avail);

#ifdef MULTIPROCESSOR
	/*
	 * Waste some VA space to avoid false sharing of cache lines
	 * for page table pages: Give each possible CPU a cache line
	 * of PTE's (8) to play with, though we only need 4.  We could
	 * recycle some of this waste by putting the idle stacks here
	 * as well; we could waste less space if we knew the largest
	 * CPU ID beforehand.
	 */
	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;

	cdstp = (caddr_t) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;

	zerop = (caddr_t) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;

	ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;

	virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL;
	pte += X86_MAXPROCS * NPTECL;
#else
	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;	/* allocate */
	virtual_avail += PAGE_SIZE; pte++;			/* advance */

	cdstp = (caddr_t) virtual_avail;  cdst_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	zerop = (caddr_t) virtual_avail;  zero_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	ptpp = (caddr_t) virtual_avail;  ptp_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;
#endif

	/*
	 * NOTE(review): the format string labels the VAs (csrcp, cdstp,
	 * zerop, ptpp) but the arguments are the PTE pointers -- confirm
	 * which was intended before relying on this debug output.
	 */
	XENPRINTK(("pmap_bootstrap csrcp %p cdstp %p zerop %p ptpp %p\n",
	    csrc_pte, cdst_pte, zero_pte, ptp_pte));
	/*
	 * Nothing after this point actually needs pte;
	 */
	pte = (void *)0xdeadbeef;	/* poison to catch stray use */

	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
	vmmap = (char *)virtual_avail;			/* don't need pte */
	virtual_avail += PAGE_SIZE;

	msgbuf_vaddr = virtual_avail;			/* don't need pte */
	virtual_avail += round_page(MSGBUFSIZE);

	idt_vaddr = virtual_avail;			/* don't need pte */
	virtual_avail += PAGE_SIZE;
	idt_paddr = avail_start;			/* steal a page */
	avail_start += PAGE_SIZE;

#if defined(I586_CPU)
	/* pentium f00f bug stuff */
	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
	virtual_avail += PAGE_SIZE;
#endif

	/*
	 * now we reserve some VM for mapping pages when doing a crash dump
	 */

	virtual_avail = reserve_dumppages(virtual_avail);

	/*
	 * init the static-global locks and global lists.
	 */

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
	spinlockinit(&pmap_main_lock, "pmaplk", 0);
#endif
	simple_lock_init(&pvalloc_lock);
	simple_lock_init(&pmaps_lock);
	LIST_INIT(&pmaps);
	TAILQ_INIT(&pv_freepages);
	TAILQ_INIT(&pv_unusedpgs);

	/*
	 * initialize the pmap pool.
	 */

	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl",
	    &pool_allocator_nointr);

	/*
	 * Initialize the TLB shootdown queues.
	 */

	__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);

	for (i = 0; i < X86_MAXPROCS; i++) {
		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
		__cpu_simple_lock_init(&pmap_tlb_shootdown_q[i].pq_slock);
	}

	/*
	 * initialize the PDE pool and cache.
	 */
	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl",
	    &pool_allocator_nointr);
	pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool,
	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);

	/*
	 * ensure the TLB is sync'd with reality by flushing it...
	 */

	tlbflush();
}

/*
 * pmap_init: called from uvm_init, our job is to get the pmap
 * system ready to manage mappings... this mainly means initing
 * the pv_entry stuff.
 */

void
pmap_init()
{
	int i;

	/*
	 * now we need to free enough pv_entry structures to allow us to get
	 * the kmem_map/kmem_object allocated and inited (done after this
	 * function is finished).  to do this we allocate one bootstrap page out
	 * of kernel_map and use it to provide an initial pool of pv_entry
	 * structures.   we never free this page.
	 */

	pv_initpage = (struct pv_page *) uvm_km_alloc(kernel_map, PAGE_SIZE);
	if (pv_initpage == NULL)
		panic("pmap_init: pv_initpage");
	pv_cachedva = 0;   /* a VA we have allocated but not used yet */
	pv_nfpvents = 0;
	(void) pmap_add_pvpage(pv_initpage, FALSE);

	/* bootstrap page of TLB-shootdown job structures, linked free list */
	pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE);
	if (pj_page == NULL)
		panic("pmap_init: pj_page");

	for (i = 0;
	     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
	     i++)
		pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
	pj_page[i].pja_job.pj_nextfree = NULL;
	pj_free = &pj_page[0];

	/*
	 * done: pmap module is up (and ready for business)
	 */

	pmap_initialized = TRUE;
}

/*
 * p v _ e n t r y   f u n c t i o n s
 */

/*
 * pv_entry allocation functions:
 *   the main pv_entry allocation functions are:
 *     pmap_alloc_pv: allocate a pv_entry structure
 *     pmap_free_pv: free one pv_entry
 *     pmap_free_pvs: free a list of pv_entrys
 *
 * the rest are helper functions
 */

/*
 * pmap_alloc_pv: inline function to allocate a pv_entry structure
 * => we lock pvalloc_lock
 * => if we fail, we call out to pmap_alloc_pvpage
 * => 3 modes:
 *    ALLOCPV_NEED   = we really need a pv_entry, even if we have to steal it
 *    ALLOCPV_TRY    = we want a pv_entry, but not enough to steal
 *    ALLOCPV_NONEED = we are trying to grow our free list, don't really need
 *			one now
 *
 * "try" is for optional functions like pmap_copy().
 */

__inline static struct pv_entry *
pmap_alloc_pv(pmap, mode)
	struct pmap *pmap;
	int mode;
{
	struct pv_page *pvpage;
	struct pv_entry *pv;

	simple_lock(&pvalloc_lock);

	pvpage = TAILQ_FIRST(&pv_freepages);
	if (pvpage != NULL) {
		pvpage->pvinfo.pvpi_nfree--;
		if (pvpage->pvinfo.pvpi_nfree == 0) {
			/* nothing left in this one? drop from free list */
			TAILQ_REMOVE(&pv_freepages, pvpage, pvinfo.pvpi_list);
		}
		/* pop the head of the page's internal free list */
		pv = pvpage->pvinfo.pvpi_pvfree;
		KASSERT(pv);
		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
		pv_nfpvents--;  /* took one from pool */
	} else {
		pv = NULL;		/* need more of them */
	}

	/*
	 * if below low water mark or we didn't get a pv_entry we try and
	 * create more pv_entrys ...
	 */

	if (pv_nfpvents < PVE_LOWAT || pv == NULL) {
		if (pv == NULL)
			pv = pmap_alloc_pvpage(pmap, (mode == ALLOCPV_TRY) ?
			    mode : ALLOCPV_NEED);
		else
			(void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED);
	}
	simple_unlock(&pvalloc_lock);
	return(pv);
}

/*
 * pmap_alloc_pvpage: maybe allocate a new pvpage
 *
 * if need_entry is false: try and allocate a new pv_page
 * if need_entry is true: try and allocate a new pv_page and return a
 * new pv_entry from it.   if we are unable to allocate a pv_page
 * we make a last ditch effort to steal a pv_page from some other
 * mapping.    if that fails, we panic...
 *
 * => we assume that the caller holds pvalloc_lock
 */

static struct pv_entry *
pmap_alloc_pvpage(pmap, mode)
	struct pmap *pmap;
	int mode;
{
	struct vm_page *pg;
	struct pv_page *pvpage;
	struct pv_entry *pv;
	int s;

	/*
	 * if we need_entry and we've got unused pv_pages, allocate from there
	 */

	pvpage = TAILQ_FIRST(&pv_unusedpgs);
	if (mode != ALLOCPV_NONEED && pvpage != NULL) {

		/* move it to pv_freepages list */
		TAILQ_REMOVE(&pv_unusedpgs, pvpage, pvinfo.pvpi_list);
		TAILQ_INSERT_HEAD(&pv_freepages, pvpage, pvinfo.pvpi_list);

		/* allocate a pv_entry */
		pvpage->pvinfo.pvpi_nfree--;  /* can't go to zero */
		pv = pvpage->pvinfo.pvpi_pvfree;
		KASSERT(pv);
		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
		pv_nfpvents--;  /* took one from pool */
		return(pv);
	}

	/*
	 * see if we've got a cached unmapped VA that we can map a page in.
	 * if not, try to allocate one.
	 */

	if (pv_cachedva == 0) {
		s = splvm();   /* must protect kmem_map with splvm! */
		pv_cachedva = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE,
		    UVM_KMF_TRYLOCK|UVM_KMF_VALLOC);
		splx(s);
		if (pv_cachedva == 0) {
			return (NULL);
		}
	}

	pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL,
	    UVM_PGA_USERESERVE);
	if (pg == NULL)
		return (NULL);
	pg->flags &= ~PG_BUSY;	/* never busy */

	/*
	 * add a mapping for our new pv_page and free its entrys (save one!)
	 *
	 * NOTE: If we are allocating a PV page for the kernel pmap, the
	 * pmap is already locked!  (...but entering the mapping is safe...)
	 */

	pmap_kenter_pa(pv_cachedva, VM_PAGE_TO_PHYS(pg),
	    VM_PROT_READ | VM_PROT_WRITE);
	pmap_update(pmap_kernel());
	pvpage = (struct pv_page *) pv_cachedva;
	pv_cachedva = 0;
	return (pmap_add_pvpage(pvpage, mode != ALLOCPV_NONEED));
}

/*
 * pmap_add_pvpage: add a pv_page's pv_entrys to the free list
 *
 * => caller must hold pvalloc_lock
 * => if need_entry is true, we allocate and return one pv_entry
 */

static struct pv_entry *
pmap_add_pvpage(pvp, need_entry)
	struct pv_page *pvp;
	boolean_t need_entry;
{
	int tofree, lcv;

	/* do we need to return one? */
	tofree = (need_entry) ? PVE_PER_PVPAGE - 1 : PVE_PER_PVPAGE;

	/* thread entries [0, tofree) onto the page's internal free list */
	pvp->pvinfo.pvpi_pvfree = NULL;
	pvp->pvinfo.pvpi_nfree = tofree;
	for (lcv = 0 ; lcv < tofree ; lcv++) {
		SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
		    pvp->pvinfo.pvpi_pvfree;
		pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
	}
	if (need_entry)
		TAILQ_INSERT_TAIL(&pv_freepages, pvp, pvinfo.pvpi_list);
	else
		TAILQ_INSERT_TAIL(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
	pv_nfpvents += tofree;
	/* if need_entry, the last (never-threaded) slot is the one returned */
	return((need_entry) ? &pvp->pvents[lcv] : NULL);
}

/*
 * pmap_free_pv_doit: actually free a pv_entry
 *
 * => do not call this directly!  instead use either
 *    1. pmap_free_pv ==> free a single pv_entry
 *    2.
 pmap_free_pvs => free a list of pv_entrys
 * => we must be holding pvalloc_lock
 */

__inline static void
pmap_free_pv_doit(pv)
	struct pv_entry *pv;
{
	struct pv_page *pvp;

	/* the owning pv_page is found by truncating the entry's address */
	pvp = (struct pv_page *) x86_trunc_page(pv);
	pv_nfpvents++;
	pvp->pvinfo.pvpi_nfree++;

	/* nfree == 1 => fully allocated page just became partly allocated */
	if (pvp->pvinfo.pvpi_nfree == 1) {
		TAILQ_INSERT_HEAD(&pv_freepages, pvp, pvinfo.pvpi_list);
	}

	/* free it: push onto the page's internal free list */
	SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
	pvp->pvinfo.pvpi_pvfree = pv;

	/*
	 * are all pv_page's pv_entry's free?  move it to unused queue.
	 */

	if (pvp->pvinfo.pvpi_nfree == PVE_PER_PVPAGE) {
		TAILQ_REMOVE(&pv_freepages, pvp, pvinfo.pvpi_list);
		TAILQ_INSERT_HEAD(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
	}
}

/*
 * pmap_free_pv: free a single pv_entry
 *
 * => we gain the pvalloc_lock
 */

__inline static void
pmap_free_pv(pmap, pv)
	struct pmap *pmap;
	struct pv_entry *pv;
{
	simple_lock(&pvalloc_lock);
	pmap_free_pv_doit(pv);

	/*
	 * Can't free the PV page if the PV entries were associated with
	 * the kernel pmap; the pmap is already locked.
	 */
	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
	    pmap != pmap_kernel())
		pmap_free_pvpage();

	simple_unlock(&pvalloc_lock);
}

/*
 * pmap_free_pvs: free a list of pv_entrys
 *
 * => we gain the pvalloc_lock
 */

__inline static void
pmap_free_pvs(pmap, pvs)
	struct pmap *pmap;
	struct pv_entry *pvs;
{
	struct pv_entry *nextpv;

	simple_lock(&pvalloc_lock);

	/* walk the SPLAY_RIGHT-linked list, freeing each entry */
	for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
		nextpv = SPLAY_RIGHT(pvs, pv_node);
		pmap_free_pv_doit(pvs);
	}

	/*
	 * Can't free the PV page if the PV entries were associated with
	 * the kernel pmap; the pmap is already locked.
	 */
	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
	    pmap != pmap_kernel())
		pmap_free_pvpage();

	simple_unlock(&pvalloc_lock);
}


/*
 * pmap_free_pvpage: try and free an unused pv_page structure
 *
 * => assume caller is holding the pvalloc_lock and that
 *	there is a page on the pv_unusedpgs list
 * => if we can't get a lock on the kmem_map we try again later
 */

static void
pmap_free_pvpage()
{
	int s;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	struct pv_page *pvp;

	s = splvm(); /* protect kmem_map */

	pvp = TAILQ_FIRST(&pv_unusedpgs);

	/*
	 * note: watch out for pv_initpage which is allocated out of
	 * kernel_map rather than kmem_map.
	 */

	if (pvp == pv_initpage)
		map = kernel_map;
	else
		map = kmem_map;
	if (vm_map_lock_try(map)) {

		/* remove pvp from pv_unusedpgs */
		TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list);

		/* unmap the page */
		dead_entries = NULL;
		uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE,
		    &dead_entries);
		vm_map_unlock(map);

		if (dead_entries != NULL)
			uvm_unmap_detach(dead_entries, 0);

		pv_nfpvents -= PVE_PER_PVPAGE;  /* update free count */
	}
	/*
	 * NOTE(review): pv_initpage is cleared even when vm_map_lock_try()
	 * failed above (i.e. the page was NOT actually freed) -- confirm
	 * this is intended; it looks like it belongs inside the success
	 * branch.
	 */
	if (pvp == pv_initpage)
		/* no more initpage, we've freed it */
		pv_initpage = NULL;

	splx(s);
}

/*
 * pmap_lock_pvhs: Lock pvh1 and optional pvh2
 * Observe locking order when locking both pvhs
 * (always lock the lower-addressed pv_head first to avoid deadlock)
 */

__inline static void
pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2)
{

	if (pvh2 == NULL) {
		simple_lock(&pvh1->pvh_lock);
		return;
	}

	if (pvh1 < pvh2) {
		simple_lock(&pvh1->pvh_lock);
		simple_lock(&pvh2->pvh_lock);
	} else {
		simple_lock(&pvh2->pvh_lock);
		simple_lock(&pvh1->pvh_lock);
	}
}


/*
 * main pv_entry manipulation functions:
 *   pmap_enter_pv: enter a mapping onto a pv_head list
 *   pmap_remove_pv: remove a mapping from a pv_head list
 *
 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
 *       the pvh before calling
 */

/*
 * pmap_enter_pv: enter a mapping onto a pv_head list
 *
 * => caller should hold the proper lock on pmap_main_lock
 * => caller should have pmap locked
 * => caller should have the pv_head locked
 * => caller should adjust ptp's wire_count before calling
 */

__inline static void
pmap_enter_pv(pvh, pve, pmap, va, ptp)
	struct pv_head *pvh;
	struct pv_entry *pve;	/* preallocated pve for us to use */
	struct pmap *pmap;
	vaddr_t va;
	struct vm_page *ptp;	/* PTP in pmap that maps this VA */
{
	pve->pv_pmap = pmap;
	pve->pv_va = va;
	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
	SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */
}

/*
 * pmap_remove_pv: try to remove a mapping from a pv_list
 *
 * => caller should hold proper lock on pmap_main_lock
 * => pmap should be locked
 * => caller should hold lock on pv_head [so that attrs can be adjusted]
 * => caller should adjust ptp's wire_count and free PTP if needed
 * => we return the removed pve
 */

__inline static struct pv_entry *
pmap_remove_pv(pvh, pmap, va)
	struct pv_head *pvh;
	struct pmap *pmap;
	vaddr_t va;
{
	struct pv_entry tmp, *pve;

	/* look up by (pmap, va) key using a stack-local probe entry */
	tmp.pv_pmap = pmap;
	tmp.pv_va = va;
	pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
	if (pve == NULL)
		return (NULL);
	SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
	return(pve);				/* return removed pve */
}

/*
 * p t p   f u n c t i o n s
 */

/*
 * pmap_alloc_ptp: allocate a PTP for a PMAP
 *
 * => pmap should already be locked by caller
 * => we use the ptp's wire_count to count the number of active mappings
 *	in the PTP (we start it at one to prevent any chance this PTP
 *	will ever leak onto the active/inactive queues)
 */

__inline static struct vm_page *
pmap_alloc_ptp(pmap, pde_index)
	struct pmap *pmap;
	int pde_index;
{
	struct vm_page *ptp;
	pd_entry_t *mapdp;	/* machine address of the PDE slot */

	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
	    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
	if (ptp == NULL)
		return(NULL);

	/* got one! */
	ptp->flags &= ~PG_BUSY;	/* never busy */
	ptp->wire_count = 1;	/* no mappings yet */
	/* hook the new PTP into the page directory via the hypervisor */
	mapdp = (pt_entry_t *)vtomach((vaddr_t)&pmap->pm_pdir[pde_index]);
	PDE_SET(&pmap->pm_pdir[pde_index], mapdp,
	    (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V));
	pmap->pm_stats.resident_count++;	/* count PTP as resident */
	pmap->pm_ptphint = ptp;
	return(ptp);
}

/*
 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
 *
 * => pmap should NOT be pmap_kernel()
 * => pmap should be locked
 */

static struct vm_page *
pmap_get_ptp(pmap, pde_index)
	struct pmap *pmap;
	int pde_index;
{
	struct vm_page *ptp;

	if (pmap_valid_entry(pmap->pm_pdir[pde_index])) {

		/* valid... check hint (saves us a PA->PG lookup) */
		if (pmap->pm_ptphint &&
		    (PDE_GET(&pmap->pm_pdir[pde_index]) & PG_FRAME) ==
		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
			return(pmap->pm_ptphint);

		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
#ifdef DIAGNOSTIC
		if (ptp == NULL)
			panic("pmap_get_ptp: unmanaged user PTP");
#endif
		pmap->pm_ptphint = ptp;
		return(ptp);
	}

	/* allocate a new PTP (updates ptphint) */
	return(pmap_alloc_ptp(pmap, pde_index));
}

/*
 * p m a p  l i f e c y c l e   f u n c t i o n s
 */

/*
 * pmap_pdp_ctor: constructor for the PDP cache.
 * Builds a fresh page directory page and pins it as an L2 table with Xen.
 */

int
pmap_pdp_ctor(void *arg, void *object, int flags)
{
	pd_entry_t *pdir = object;
	paddr_t pdirpa;

	/*
	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
	 * WE MUST NOT BLOCK!
	 */

	/* fetch the physical address of the page directory. */
	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);

	XENPRINTF(("pmap_pdp_ctor %p %p\n", pdir, (void *)pdirpa));

	/* zero init area (user VA slots) */
	memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t));

	/* put in recursive PDE to map the PTEs */
	pdir[PDSLOT_PTE] = xpmap_ptom(pdirpa | PG_V /* | PG_KW */);

	/* put in kernel VM PDEs */
	memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN],
	    nkpde * sizeof(pd_entry_t));

	/* zero the rest */
	memset(&pdir[PDSLOT_KERN + nkpde], 0,
	    PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));

	/* remap read-only: Xen requires pagetable pages not be writable */
	pmap_enter(pmap_kernel(), (vaddr_t)pdir, pdirpa, VM_PROT_READ,
	    VM_PROT_READ);
	pmap_update(pmap_kernel());

	/* pin page type */
	xpq_queue_pin_table(xpmap_ptom(pdirpa), XPQ_PIN_L2_TABLE);
	xpq_flush_queue();

	return (0);
}

/*
 * pmap_pdp_dtor: destructor for the PDP cache -- unpin the L2 table.
 */

void
pmap_pdp_dtor(void *arg, void *object)
{
	pd_entry_t *pdir = object;
	paddr_t pdirpa;

	/* fetch the physical address of the page directory. */
	pdirpa = PDE_GET(&pdir[PDSLOT_PTE]) & PG_FRAME;

	XENPRINTF(("pmap_pdp_dtor %p %p\n", pdir, (void *)pdirpa));

	/* unpin page type */
	xpq_queue_unpin_table(xpmap_ptom(pdirpa));
	xpq_flush_queue();
}

/*
 * pmap_create: create a pmap
 *
 * => note: old pmap interface took a "size" args which allowed for
 *	the creation of "software only" pmaps (not in bsd).
 */

struct pmap *
pmap_create()
{
	struct pmap *pmap;
	u_int gen;

	XENPRINTF(("pmap_create\n"));
	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);

	/* init uvm_object */
	simple_lock_init(&pmap->pm_obj.vmobjlock);
	pmap->pm_obj.pgops = NULL;	/* currently not a mappable object */
	TAILQ_INIT(&pmap->pm_obj.memq);
	pmap->pm_obj.uo_npages = 0;
	pmap->pm_obj.uo_refs = 1;
	pmap->pm_stats.wired_count = 0;
	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
	pmap->pm_ptphint = NULL;
	pmap->pm_hiexec = 0;
	pmap->pm_flags = 0;
	pmap->pm_cpus = 0;

	/* init the LDT */
	pmap->pm_ldt = NULL;
	pmap->pm_ldt_len = 0;
	pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);

	/* allocate PDP */

	/*
	 * we need to lock pmaps_lock to prevent nkpde from changing on
	 * us.  note that there is no need to splvm to protect us from
	 * malloc since malloc allocates out of a submap and we should
	 * have already allocated kernel PTPs to cover the range...
	 *
	 * NOTE: WE MUST NOT BLOCK WHILE HOLDING THE `pmap_lock', nor
	 * must we call pmap_growkernel() while holding it!
	 */

 try_again:
	/*
	 * the generation number detects a pmap_growkernel() that raced
	 * with our (possibly blocking) pool_cache_get(): if it changed,
	 * the PDP we got may have stale kernel PDEs, so throw it back
	 * and retry.
	 */
	gen = pmap_pdp_cache_generation;
	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);

	simple_lock(&pmaps_lock);

	if (gen != pmap_pdp_cache_generation) {
		simple_unlock(&pmaps_lock);
		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
		goto try_again;
	}

	pmap->pm_pdirpa = PDE_GET(&pmap->pm_pdir[PDSLOT_PTE]) & PG_FRAME;
	XENPRINTF(("pmap_create %p set pm_pdirpa %p/%p slotval %p\n", pmap,
	    (void *)pmap->pm_pdirpa,
	    (void *)xpmap_ptom(pmap->pm_pdirpa),
	    (void *)pmap->pm_pdir[PDSLOT_PTE]));

	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);

	simple_unlock(&pmaps_lock);

	return (pmap);
}

/*
 * pmap_destroy: drop reference count on pmap.   free pmap if
 *	reference count goes to zero.
 */

void
pmap_destroy(pmap)
	struct pmap *pmap;
{
	int refs;
#ifdef DIAGNOSTIC
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
#endif /* DIAGNOSTIC */

	/*
	 * drop reference count
	 */

	simple_lock(&pmap->pm_obj.vmobjlock);
	refs = --pmap->pm_obj.uo_refs;
	simple_unlock(&pmap->pm_obj.vmobjlock);
	if (refs > 0) {
		return;
	}

#ifdef DIAGNOSTIC
	for (CPU_INFO_FOREACH(cii, ci))
		if (ci->ci_pmap == pmap)
			panic("destroying pmap being used");
#endif /* DIAGNOSTIC */

	/*
	 * reference count is zero, free pmap resources and then free pmap.
	 */

	XENPRINTF(("pmap_destroy %p pm_pdirpa %p/%p\n", pmap,
	    (void *)pmap->pm_pdirpa,
	    (void *)xpmap_ptom(pmap->pm_pdirpa)));

	/*
	 * remove it from global list of pmaps
	 */

	simple_lock(&pmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	simple_unlock(&pmaps_lock);

	/*
	 * destroyed pmap shouldn't have remaining PTPs
	 */

	KASSERT(pmap->pm_obj.uo_npages == 0);
	KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq));

	/*
	 * MULTIPROCESSOR -- no need to flush out of other processors'
	 * APTE space because we do that in pmap_unmap_ptes().
	 */
	/* return the PDP to the cache (dtor will unpin it with Xen) */
	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);

#ifdef USER_LDT
	if (pmap->pm_flags & PMF_USER_LDT) {
		/*
		 * no need to switch the LDT; this address space is gone,
		 * nothing is using it.
		 *
		 * No need to lock the pmap for ldt_free (or anything else),
		 * we're the last one to use it.
		 */
		ldt_free(pmap);
		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
		    pmap->pm_ldt_len * sizeof(union descriptor));
	}
#endif

	pool_put(&pmap_pmap_pool, pmap);
}

/*
 * Add a reference to the specified pmap.
 */

void
pmap_reference(pmap)
	struct pmap *pmap;
{
	simple_lock(&pmap->pm_obj.vmobjlock);
	pmap->pm_obj.uo_refs++;
	simple_unlock(&pmap->pm_obj.vmobjlock);
}

#if defined(PMAP_FORK)
/*
 * pmap_fork: perform any necessary data structure manipulation when
 * a VM space is forked.
 */

void
pmap_fork(pmap1, pmap2)
	struct pmap *pmap1, *pmap2;
{
	simple_lock(&pmap1->pm_obj.vmobjlock);
	simple_lock(&pmap2->pm_obj.vmobjlock);

#ifdef USER_LDT
	/* Copy the LDT, if necessary. */
	if (pmap1->pm_flags & PMF_USER_LDT) {
		union descriptor *new_ldt;
		size_t len;

		len = pmap1->pm_ldt_len * sizeof(union descriptor);
		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len);
		memcpy(new_ldt, pmap1->pm_ldt, len);
		pmap2->pm_ldt = new_ldt;
		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
		pmap2->pm_flags |= PMF_USER_LDT;
		ldt_alloc(pmap2, new_ldt, len);
	}
#endif /* USER_LDT */

	simple_unlock(&pmap2->pm_obj.vmobjlock);
	simple_unlock(&pmap1->pm_obj.vmobjlock);
}
#endif /* PMAP_FORK */

#ifdef USER_LDT
/*
 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
 * restore the default.
 */

void
pmap_ldt_cleanup(l)
	struct lwp *l;
{
	struct pcb *pcb = &l->l_addr->u_pcb;
	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
	union descriptor *old_ldt = NULL;
	size_t len = 0;

	simple_lock(&pmap->pm_obj.vmobjlock);

	if (pmap->pm_flags & PMF_USER_LDT) {
		ldt_free(pmap);
		pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
		/* if this lwp is running, reload the LDT register now */
		if (pcb == curpcb)
			lldt(pcb->pcb_ldt_sel);
		old_ldt = pmap->pm_ldt;
		len = pmap->pm_ldt_len * sizeof(union descriptor);
		pmap->pm_ldt = NULL;
		pmap->pm_ldt_len = 0;
		pmap->pm_flags &= ~PMF_USER_LDT;
	}

	simple_unlock(&pmap->pm_obj.vmobjlock);

	/* free outside the lock; uvm_km_free may block */
	if (old_ldt != NULL)
		uvm_km_free(kernel_map, (vaddr_t)old_ldt, len);
}
#endif /* USER_LDT */

/*
 * pmap_activate: activate a process' pmap
 *
 * => called from cpu_switch()
 * => if lwp is the curlwp, then set ci_want_pmapload so that
 *    actual MMU context switch will be done by pmap_load() later
 */

void
pmap_activate(l)
	struct lwp *l;
{
	struct cpu_info *ci = curcpu();
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);

	if (l == ci->ci_curlwp) {
		struct pcb *pcb;

		KASSERT(ci->ci_want_pmapload == 0);
		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
#ifdef KSTACK_CHECK_DR0
		/*
		 * setup breakpoint on the top of stack
		 */
		if (l == &lwp0)
			dr0(0, 0, 0, 0);
		else
			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
#endif

		/*
		 * no need to switch to kernel vmspace because
		 * it's a subset of any vmspace.
		 */

		if (pmap == pmap_kernel()) {
			ci->ci_want_pmapload = 0;
			return;
		}

		pcb = &l->l_addr->u_pcb;
		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;

		/* defer the actual %cr3/LDT switch to pmap_load() */
		ci->ci_want_pmapload = 1;
	}
}

/*
 * pmap_reactivate: try to regain reference to the pmap.
 * Returns TRUE if our lazy mapping of this pmap is still valid
 * (no TLB shootdown happened in between), FALSE if the TLB is stale.
 */

static boolean_t
pmap_reactivate(struct pmap *pmap)
{
	struct cpu_info *ci = curcpu();
	u_int32_t cpumask = 1U << ci->ci_cpuid;
	int s;
	boolean_t result;
	u_int32_t oldcpus;

	/*
	 * if we still have a lazy reference to this pmap,
	 * we can assume that there was no tlb shootdown
	 * for this pmap in the meantime.
	 */

	s = splipi();	/* protect from tlb shootdown ipis. */
	oldcpus = pmap->pm_cpus;
	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
	if (oldcpus & cpumask) {
		KASSERT(ci->ci_tlbstate == TLBSTATE_LAZY);
		/* got it */
		result = TRUE;
	} else {
		KASSERT(ci->ci_tlbstate == TLBSTATE_STALE);
		result = FALSE;
	}
	ci->ci_tlbstate = TLBSTATE_VALID;
	splx(s);

	return result;
}

/*
 * pmap_load: actually switch pmap.
 (fill in %cr3 and LDT info)
 */

void
pmap_load()
{
	struct cpu_info *ci = curcpu();
	u_int32_t cpumask = 1U << ci->ci_cpuid;
	struct pmap *pmap;
	struct pmap *oldpmap;
	struct lwp *l;
	struct pcb *pcb;
	pd_entry_t *mapdp;	/* machine address of the APDP slot */
	int s;

	KASSERT(ci->ci_want_pmapload);

	l = ci->ci_curlwp;
	KASSERT(l != NULL);
	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	KASSERT(pmap != pmap_kernel());
	oldpmap = ci->ci_pmap;

	pcb = ci->ci_curpcb;
	KASSERT(pcb == &l->l_addr->u_pcb);
	/* loaded by pmap_activate */
	KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);

	if (pmap == oldpmap) {
		if (!pmap_reactivate(pmap)) {

			/*
			 * pmap has been changed during deactivated.
			 * our tlb may be stale.
			 */

			tlbflush();
		}

		ci->ci_want_pmapload = 0;
		return;
	}

	/*
	 * actually switch pmap.
	 */

	x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask);

	KASSERT((pmap->pm_cpus & cpumask) == 0);

	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
	pmap_reference(pmap);
	KERNEL_UNLOCK();

	/*
	 * mark the pmap in use by this processor.
	 */

	s = splipi();
	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
	ci->ci_pmap = pmap;
	ci->ci_tlbstate = TLBSTATE_VALID;
	splx(s);

	/*
	 * clear apdp slot before loading %cr3 since Xen only allows
	 * linear pagetable mappings in the current pagetable.
	 */
	KDASSERT(curapdp == 0);
	mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
	PDE_CLEAR(APDP_PDE, mapdp);

	/*
	 * update tss and load corresponding registers.
	 */

	lldt(pcb->pcb_ldt_sel);
	pcb->pcb_cr3 = pmap->pm_pdirpa;
	lcr3(pcb->pcb_cr3);

	ci->ci_want_pmapload = 0;

	/* drop our reference to the pmap we replaced */
	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
	pmap_destroy(oldpmap);
	KERNEL_UNLOCK();
}

/*
 * pmap_deactivate: deactivate a process' pmap
 */

void
pmap_deactivate(l)
	struct lwp *l;
{

	if (l == curlwp)
		pmap_deactivate2(l);
}

/*
 * pmap_deactivate2: context switch version of pmap_deactivate.
 * always treat l as curlwp.
 */

void
pmap_deactivate2(l)
	struct lwp *l;
{
	struct pmap *pmap;
	struct cpu_info *ci = curcpu();

	if (ci->ci_want_pmapload) {
		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
		    != pmap_kernel());
		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);

		/*
		 * userspace has not been touched.
		 * nothing to do here.
		 */

		ci->ci_want_pmapload = 0;
		return;
	}

	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);

	if (pmap == pmap_kernel()) {
		return;
	}

	KASSERT(ci->ci_pmap == pmap);

	/* keep the mapping loaded but mark it reclaimable (lazy) */
	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
	ci->ci_tlbstate = TLBSTATE_LAZY;
	XENPRINTF(("pmap_deactivate %p ebp %p esp %p\n",
	    l, (void *)l->l_addr->u_pcb.pcb_ebp,
	    (void *)l->l_addr->u_pcb.pcb_esp));
}

/*
 * end of lifecycle functions
 */

/*
 * some misc. functions
 */

/*
 * pmap_extract: extract a PA for the given VA
 */

boolean_t
pmap_extract(pmap, va, pap)
	struct pmap *pmap;
	vaddr_t va;
	paddr_t *pap;
{
	pt_entry_t *ptes, pte;
	pd_entry_t pde;

	if (__predict_true((pde = PDE_GET(&pmap->pm_pdir[pdei(va)])) != 0)) {
#ifdef LARGEPAGES
		if (pde & PG_PS) {
			/* 4MB superpage: the PDE itself holds the frame */
			if (pap != NULL)
				*pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME);
			return (TRUE);
		}
#endif

		ptes = pmap_map_ptes(pmap);
		pte = PTE_GET(&ptes[x86_btop(va)]);
		pmap_unmap_ptes(pmap);

		if (__predict_true((pte & PG_V) != 0)) {
			if (pap != NULL)
				*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
			return (TRUE);
		}
	}
	return (FALSE);
}


/*
 * vtophys: virtual address to physical address.  For use by
 * machine-dependent code only.
 */

paddr_t
vtophys(va)
	vaddr_t va;
{
	paddr_t pa;

	if (pmap_extract(pmap_kernel(), va, &pa) == TRUE)
		return (pa);
	return (0);
}


/*
 * pmap_virtual_space: used during bootup [pmap_steal_memory] to
 * determine the bounds of the kernel virtual address space.
 */

void
pmap_virtual_space(startp, endp)
	vaddr_t *startp;
	vaddr_t *endp;
{
	*startp = virtual_avail;
	*endp = virtual_end;
}

/*
 * pmap_map: map a range of PAs into kvm
 *
 * => used during crash dump
 * => XXX: pmap_map() should be phased out?
 */

vaddr_t
pmap_map(va, spa, epa, prot)
	vaddr_t va;
	paddr_t spa, epa;
	vm_prot_t prot;
{
	while (spa < epa) {
		pmap_enter(pmap_kernel(), va, spa, prot, 0);
		va += PAGE_SIZE;
		spa += PAGE_SIZE;
	}
	pmap_update(pmap_kernel());
	return va;
}

/*
 * pmap_zero_page: zero a page
 */

void
pmap_zero_page(pa)
	paddr_t pa;
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *zpte = PTESLEW(zero_pte, id);	/* per-CPU zero PTE */
	pt_entry_t *maptp;
	caddr_t zerova = VASLEW(zerop, id);		/* per-CPU zero VA */

#ifdef DIAGNOSTIC
	if (PTE_GET(zpte))
		panic("pmap_zero_page: lock botch");
#endif

	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */

	memset(zerova, 0, PAGE_SIZE);			/* zero */
	PTE_CLEAR(zpte, maptp);				/* zap! */
}

/*
 * pmap_pageidlezero: the same, for the idle loop page zero'er.
 * Returns TRUE if the page was zero'd, FALSE if we aborted for
 * some reason.
 */

boolean_t
pmap_pageidlezero(pa)
	paddr_t pa;
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *zpte = PTESLEW(zero_pte, id);
	pt_entry_t *maptp;
	caddr_t zerova = VASLEW(zerop, id);
	boolean_t rv = TRUE;
	int i, *ptr;

#ifdef DIAGNOSTIC
	if (PTE_GET(zpte))
		/*
		 * NOTE(review): message names "pmap_zero_page_uncached";
		 * looks like a copy/paste leftover -- confirm before fixing.
		 */
		panic("pmap_zero_page_uncached: lock botch");
#endif
	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
	for (i = 0, ptr = (int *) zerova; i < PAGE_SIZE / sizeof(int); i++) {
		if (sched_whichqs != 0) {

			/*
			 * A process has become ready.  Abort now,
			 * so we don't keep it waiting while we
			 * do slow memory access to finish this
			 * page.
			 */

			rv = FALSE;
			break;
		}
		*ptr++ = 0;
	}

	PTE_CLEAR(zpte, maptp);		/* zap! */
	return (rv);
}

/*
 * pmap_copy_page: copy a page
 */

void
pmap_copy_page(srcpa, dstpa)
	paddr_t srcpa, dstpa;
{
#ifdef MULTIPROCESSOR
	int id = cpu_number();
#endif
	pt_entry_t *spte = PTESLEW(csrc_pte,id), *maspte;
	pt_entry_t *dpte = PTESLEW(cdst_pte,id), *madpte;
	caddr_t csrcva = VASLEW(csrcp, id);
	caddr_t cdstva = VASLEW(cdstp, id);

#ifdef DIAGNOSTIC
	if (PTE_GET(spte) || PTE_GET(dpte))
		panic("pmap_copy_page: lock botch");
#endif

	/* map both pages into the per-CPU copy windows */
	maspte = (pt_entry_t *)vtomach((vaddr_t)spte);
	madpte = (pt_entry_t *)vtomach((vaddr_t)dpte);
	PTE_SET(spte, maspte, (srcpa & PG_FRAME) | PG_V | PG_RW);
	PTE_SET(dpte, madpte, (dstpa & PG_FRAME) | PG_V | PG_RW);
	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
	memcpy(cdstva, csrcva, PAGE_SIZE);
	PTE_CLEAR(spte, maspte);	/* zap! */
	PTE_CLEAR(dpte, madpte);	/* zap! */
}

/*
 * p m a p   r e m o v e   f u n c t i o n s
 *
 * functions that remove mappings
 */

/*
 * pmap_remove_ptes: remove PTEs from a PTP
 *
 * => must have proper locking on pmap_master_lock
 * => caller must hold pmap's lock
 * => PTP must be mapped into KVA
 * => PTP should be null if pmap == pmap_kernel()
 */

static void
pmap_remove_ptes(pmap, ptp, ptpva, startva, endva, cpumaskp, flags)
	struct pmap *pmap;
	struct vm_page *ptp;
	vaddr_t ptpva;
	vaddr_t startva, endva;
	int32_t *cpumaskp;
	int flags;
{
	struct pv_entry *pv_tofree = NULL;	/* list of pv_entrys to free */
	struct pv_entry *pve;
	pt_entry_t *pte = (pt_entry_t *) ptpva;
	pt_entry_t opte;
	pt_entry_t *maptp;

	/*
	 * note that ptpva points to the PTE that maps startva.   this may
	 * or may not be the first PTE in the PTP.
	 *
	 * we loop through the PTP while there are still PTEs to look at
	 * and the wire_count is greater than 1 (because we use the wire_count
	 * to keep track of the number of real PTEs in the PTP).
	 */

	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
			     ; pte++, startva += PAGE_SIZE) {
		struct vm_page *pg;
		struct vm_page_md *mdpg;

		if (!pmap_valid_entry(*pte))
			continue;			/* VA not mapped */
		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
			continue;
		}

		/* atomically save the old PTE and zap! it */
		maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
		opte = pte_atomic_update(pte, maptp, 0);
		pmap_exec_account(pmap, startva, opte, 0);

		if (opte & PG_W)
			pmap->pm_stats.wired_count--;
		pmap->pm_stats.resident_count--;

		/* referenced entries may be cached in a TLB somewhere */
		if (opte & PG_U)
			pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);

		if (ptp) {
			ptp->wire_count--;		/* dropping a PTE */
			/* Make sure that the PDE is flushed */
			if ((ptp->wire_count <= 1) && !(opte & PG_U))
				pmap_tlb_shootdown(pmap, startva, opte,
				    cpumaskp);
		}

		/*
		 * if we are not on a pv_head list we are done.
		 */

		if ((opte & PG_PVLIST) == 0) {
#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
			if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
				panic("pmap_remove_ptes: managed page without "
				      "PG_PVLIST for 0x%lx", startva);
#endif
			continue;
		}

		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
#ifdef DIAGNOSTIC
		if (pg == NULL)
			panic("pmap_remove_ptes: unmanaged page marked "
			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
			      startva, (u_long)(opte & PG_FRAME));
#endif
		mdpg = &pg->mdpage;

		/* sync R/M bits */
		simple_lock(&mdpg->mp_pvhead.pvh_lock);
		mdpg->mp_attrs |= (opte & (PG_U|PG_M));
		pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva);
		simple_unlock(&mdpg->mp_pvhead.pvh_lock);

		if (pve) {
			/* defer freeing until after the loop */
			SPLAY_RIGHT(pve, pv_node) = pv_tofree;
			pv_tofree = pve;
		}

		/* end of "for" loop: time for next pte */
	}
	if (pv_tofree)
		pmap_free_pvs(pmap, pv_tofree);
}


/*
 * pmap_remove_pte: remove a single PTE from a PTP
 *
 * => must have proper locking on pmap_master_lock
 * => caller must hold pmap's lock
 * => PTP must be mapped into KVA
 * => PTP should be null if pmap == pmap_kernel()
 * => returns true if we removed a mapping
 */

static boolean_t
pmap_remove_pte(pmap, ptp, pte, va, cpumaskp, flags)
	struct pmap *pmap;
	struct vm_page *ptp;
	pt_entry_t *pte;
	vaddr_t va;
	int32_t *cpumaskp;
	int flags;
{
	pt_entry_t opte;
	pt_entry_t *maptp;
	struct pv_entry *pve;
	struct vm_page *pg;
	struct vm_page_md *mdpg;

	if (!pmap_valid_entry(*pte))
		return(FALSE);		/* VA not mapped */
	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
		return(FALSE);
	}

	/* atomically save the old PTE and zap!
it */ + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + opte = pte_atomic_update(pte, maptp, 0); + + XENPRINTK(("pmap_remove_pte %p, was %08x\n", pte, opte)); + pmap_exec_account(pmap, va, opte, 0); + + if (opte & PG_W) + pmap->pm_stats.wired_count--; + pmap->pm_stats.resident_count--; + + if (opte & PG_U) + pmap_tlb_shootdown(pmap, va, opte, cpumaskp); + + if (ptp) { + ptp->wire_count--; /* dropping a PTE */ + /* Make sure that the PDE is flushed */ + if ((ptp->wire_count <= 1) && !(opte & PG_U)) + pmap_tlb_shootdown(pmap, va, opte, cpumaskp); + + } + /* + * if we are not on a pv_head list we are done. + */ + + if ((opte & PG_PVLIST) == 0) { +#if defined(DIAGNOSTIC) && !defined(DOM0OPS) + if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL) + panic("pmap_remove_pte: managed page without " + "PG_PVLIST for 0x%lx", va); +#endif + return(TRUE); + } + + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_remove_pte: unmanaged page marked " + "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va, + (u_long)(opte & PG_FRAME)); +#endif + mdpg = &pg->mdpage; + + /* sync R/M bits */ + simple_lock(&mdpg->mp_pvhead.pvh_lock); + mdpg->mp_attrs |= (opte & (PG_U|PG_M)); + pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va); + simple_unlock(&mdpg->mp_pvhead.pvh_lock); + + if (pve) + pmap_free_pv(pmap, pve); + return(TRUE); +} + +/* + * pmap_remove: top level mapping removal function + * + * => caller should not be holding any pmap locks + */ + +void +pmap_remove(pmap, sva, eva) + struct pmap *pmap; + vaddr_t sva, eva; +{ + pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); +} + +/* + * pmap_do_remove: mapping removal guts + * + * => caller should not be holding any pmap locks + */ + +static void +pmap_do_remove(pmap, sva, eva, flags) + struct pmap *pmap; + vaddr_t sva, eva; + int flags; +{ + pt_entry_t *ptes, opte; + pt_entry_t *maptp; + boolean_t result; + paddr_t ptppa; + vaddr_t blkendva; + struct vm_page *ptp; + int32_t cpumask = 0; + TAILQ_HEAD(, vm_page) 
empty_ptps; + struct cpu_info *ci; + struct pmap *curpmap; + + /* + * we lock in the pmap => pv_head direction + */ + + TAILQ_INIT(&empty_ptps); + + PMAP_MAP_TO_HEAD_LOCK(); + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + + ci = curcpu(); + curpmap = ci->ci_pmap; + + /* + * removing one page? take shortcut function. + */ + + if (sva + PAGE_SIZE == eva) { + if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) { + + /* PA of the PTP */ + ptppa = PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME; + + /* get PTP if non-kernel mapping */ + if (pmap == pmap_kernel()) { + /* we never free kernel PTPs */ + ptp = NULL; + } else { + if (pmap->pm_ptphint && + VM_PAGE_TO_PHYS(pmap->pm_ptphint) == + ptppa) { + ptp = pmap->pm_ptphint; + } else { + ptp = PHYS_TO_VM_PAGE(ptppa); +#ifdef DIAGNOSTIC + if (ptp == NULL) + panic("pmap_remove: unmanaged " + "PTP detected"); +#endif + } + } + + /* do it! */ + result = pmap_remove_pte(pmap, ptp, + &ptes[x86_btop(sva)], sva, &cpumask, flags); + + /* + * if mapping removed and the PTP is no longer + * being used, free it! + */ + + if (result && ptp && ptp->wire_count <= 1) { + /* zap! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&pmap->pm_pdir[pdei(sva)]); + PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)], + maptp, opte); +#if defined(MULTIPROCESSOR) + /* + * XXXthorpej Redundant shootdown can happen + * here if we're using APTE space. + */ +#endif + pmap_tlb_shootdown(curpmap, + ((vaddr_t)ptes) + ptp->offset, opte, + &cpumask); +#if defined(MULTIPROCESSOR) + /* + * Always shoot down the pmap's self-mapping + * of the PTP. + * XXXthorpej Redundant shootdown can happen + * here if pmap == curpmap (not APTE space). 
+ */ + pmap_tlb_shootdown(pmap, + ((vaddr_t)PTE_BASE) + ptp->offset, opte, + &cpumask); +#endif + pmap->pm_stats.resident_count--; + if (pmap->pm_ptphint == ptp) + pmap->pm_ptphint = + TAILQ_FIRST(&pmap->pm_obj.memq); + ptp->wire_count = 0; + ptp->flags |= PG_ZERO; + uvm_pagerealloc(ptp, NULL, 0); + TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + } + } + pmap_tlb_shootnow(cpumask); + pmap_unmap_ptes(pmap); /* unlock pmap */ + PMAP_MAP_TO_HEAD_UNLOCK(); + /* Now we can free unused ptps */ + TAILQ_FOREACH(ptp, &empty_ptps, listq) + uvm_pagefree(ptp); + return; + } + + cpumask = 0; + + for (/* null */ ; sva < eva ; sva = blkendva) { + + /* determine range of block */ + blkendva = x86_round_pdr(sva+1); + if (blkendva > eva) + blkendva = eva; + + /* + * XXXCDC: our PTE mappings should never be removed + * with pmap_remove! if we allow this (and why would + * we?) then we end up freeing the pmap's page + * directory page (PDP) before we are finished using + * it when we hit in in the recursive mapping. this + * is BAD. + * + * long term solution is to move the PTEs out of user + * address space. and into kernel address space (up + * with APTE). then we can set VM_MAXUSER_ADDRESS to + * be VM_MAX_ADDRESS. + */ + + if (pdei(sva) == PDSLOT_PTE) + /* XXXCDC: ugly hack to avoid freeing PDP here */ + continue; + + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) + /* valid block? 
*/ + continue; + + /* PA of the PTP */ + ptppa = (PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME); + + /* get PTP if non-kernel mapping */ + if (pmap == pmap_kernel()) { + /* we never free kernel PTPs */ + ptp = NULL; + } else { + if (pmap->pm_ptphint && + VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) { + ptp = pmap->pm_ptphint; + } else { + ptp = PHYS_TO_VM_PAGE(ptppa); +#ifdef DIAGNOSTIC + if (ptp == NULL) + panic("pmap_remove: unmanaged PTP " + "detected"); +#endif + } + } + pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[x86_btop(sva)], + sva, blkendva, &cpumask, flags); + + /* if PTP is no longer being used, free it! */ + if (ptp && ptp->wire_count <= 1) { + /* zap! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&pmap->pm_pdir[pdei(sva)]); + PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)], + maptp, opte); +#if defined(MULTIPROCESSOR) + /* + * XXXthorpej Redundant shootdown can happen here + * if we're using APTE space. + */ +#endif + pmap_tlb_shootdown(curpmap, + ((vaddr_t)ptes) + ptp->offset, opte, &cpumask); +#if defined(MULTIPROCESSOR) + /* + * Always shoot down the pmap's self-mapping + * of the PTP. + * XXXthorpej Redundant shootdown can happen here + * if pmap == curpmap (not APTE space). + */ + pmap_tlb_shootdown(pmap, + ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask); +#endif + pmap->pm_stats.resident_count--; + if (pmap->pm_ptphint == ptp) /* update hint? 
*/ + pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first; + ptp->wire_count = 0; + ptp->flags |= PG_ZERO; + /* Postpone free to shootdown */ + uvm_pagerealloc(ptp, NULL, 0); + TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + } + } + + pmap_tlb_shootnow(cpumask); + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); + /* Now we can free unused ptps */ + TAILQ_FOREACH(ptp, &empty_ptps, listq) + uvm_pagefree(ptp); +} + +/* + * pmap_page_remove: remove a managed vm_page from all pmaps that map it + * + * => we set pv_head => pmap locking + * => R/M bits are sync'd back to attrs + */ + +void +pmap_page_remove(pg) + struct vm_page *pg; +{ + struct pv_head *pvh; + struct pv_entry *pve, *npve, *killlist = NULL; + pt_entry_t *ptes, opte; + pt_entry_t *maptp; + int32_t cpumask = 0; + TAILQ_HEAD(, vm_page) empty_ptps; + struct vm_page *ptp; + struct cpu_info *ci; + struct pmap *curpmap; + +#ifdef DIAGNOSTIC + int bank, off; + + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); + if (bank == -1) + panic("pmap_page_remove: unmanaged page?"); +#endif + + pvh = &pg->mdpage.mp_pvhead; + if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { + return; + } + + TAILQ_INIT(&empty_ptps); + + /* set pv_head => pmap locking */ + PMAP_HEAD_TO_MAP_LOCK(); + + ci = curcpu(); + curpmap = ci->ci_pmap; + + /* XXX: needed if we hold head->map lock? 
*/ + simple_lock(&pvh->pvh_lock); + + for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) { + npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve); + ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ + +#ifdef DIAGNOSTIC + if (pve->pv_ptp && + (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) & + PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { + printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n", + pg, pve->pv_va, pve->pv_ptp); + printf("pmap_page_remove: PTP's phys addr: " + "actual=%lx, recorded=%lx\n", + (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) + & PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp)); + panic("pmap_page_remove: mapped managed page has " + "invalid pv_ptp field"); + } +#endif + + /* atomically save the old PTE and zap! it */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); + opte = pte_atomic_update(&ptes[x86_btop(pve->pv_va)], + maptp, 0); + + if (opte & PG_W) + pve->pv_pmap->pm_stats.wired_count--; + pve->pv_pmap->pm_stats.resident_count--; + + /* Shootdown only if referenced */ + if (opte & PG_U) + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, + &cpumask); + + /* sync R/M bits */ + pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M)); + + /* update the PTP reference count. free if last reference. */ + if (pve->pv_ptp) { + pve->pv_ptp->wire_count--; + if (pve->pv_ptp->wire_count <= 1) { + /* + * Do we have to shootdown the page just to + * get the pte out of the TLB ? + */ + if(!(opte & PG_U)) + pmap_tlb_shootdown(pve->pv_pmap, + pve->pv_va, opte, &cpumask); + + /* zap! */ + maptp = (pt_entry_t *)vtomach((vaddr_t) + &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]); + PTE_ATOMIC_CLEAR(&pve->pv_pmap->pm_pdir + [pdei(pve->pv_va)], maptp, opte); + pmap_tlb_shootdown(curpmap, + ((vaddr_t)ptes) + pve->pv_ptp->offset, + opte, &cpumask); +#if defined(MULTIPROCESSOR) + /* + * Always shoot down the other pmap's + * self-mapping of the PTP. 
+ */ + pmap_tlb_shootdown(pve->pv_pmap, + ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset, + opte, &cpumask); +#endif + pve->pv_pmap->pm_stats.resident_count--; + /* update hint? */ + if (pve->pv_pmap->pm_ptphint == pve->pv_ptp) + pve->pv_pmap->pm_ptphint = + pve->pv_pmap->pm_obj.memq.tqh_first; + pve->pv_ptp->wire_count = 0; + pve->pv_ptp->flags |= PG_ZERO; + /* Free only after the shootdown */ + uvm_pagerealloc(pve->pv_ptp, NULL, 0); + TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, + listq); + } + } + pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */ + SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */ + SPLAY_RIGHT(pve, pv_node) = killlist; /* mark it for death */ + killlist = pve; + } + pmap_free_pvs(NULL, killlist); + simple_unlock(&pvh->pvh_lock); + PMAP_HEAD_TO_MAP_UNLOCK(); + pmap_tlb_shootnow(cpumask); + + /* Now we can free unused ptps */ + TAILQ_FOREACH(ptp, &empty_ptps, listq) + uvm_pagefree(ptp); +} + +/* + * p m a p a t t r i b u t e f u n c t i o n s + * functions that test/change managed page's attributes + * since a page can be mapped multiple times we must check each PTE that + * maps it by going down the pv lists. + */ + +/* + * pmap_test_attrs: test a page's attributes + * + * => we set pv_head => pmap locking + */ + +boolean_t +pmap_test_attrs(pg, testbits) + struct vm_page *pg; + int testbits; +{ + struct vm_page_md *mdpg; + int *myattrs; + struct pv_head *pvh; + struct pv_entry *pve; + volatile pt_entry_t *ptes; + pt_entry_t pte; + +#if DIAGNOSTIC + int bank, off; + + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); + if (bank == -1) + panic("pmap_test_attrs: unmanaged page?"); +#endif + mdpg = &pg->mdpage; + + /* + * before locking: see if attributes are already set and if so, + * return! 
+ */ + + myattrs = &mdpg->mp_attrs; + if (*myattrs & testbits) + return(TRUE); + + /* test to see if there is a list before bothering to lock */ + pvh = &mdpg->mp_pvhead; + if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { + return(FALSE); + } + + /* nope, gonna have to do it the hard way */ + PMAP_HEAD_TO_MAP_LOCK(); + /* XXX: needed if we hold head->map lock? */ + simple_lock(&pvh->pvh_lock); + + for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); + pve != NULL && (*myattrs & testbits) == 0; + pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) { + ptes = pmap_map_ptes(pve->pv_pmap); + pte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); /* XXX flags only? */ + pmap_unmap_ptes(pve->pv_pmap); + *myattrs |= pte; + } + + /* + * note that we will exit the for loop with a non-null pve if + * we have found the bits we are testing for. + */ + + simple_unlock(&pvh->pvh_lock); + PMAP_HEAD_TO_MAP_UNLOCK(); + return((*myattrs & testbits) != 0); +} + +/* + * pmap_clear_attrs: clear the specified attribute for a page. + * + * => we set pv_head => pmap locking + * => we return TRUE if we cleared one of the bits we were asked to + */ + +boolean_t +pmap_clear_attrs(pg, clearbits) + struct vm_page *pg; + int clearbits; +{ + struct vm_page_md *mdpg; + u_int32_t result; + struct pv_head *pvh; + struct pv_entry *pve; + pt_entry_t *ptes, opte; + pt_entry_t *maptp; + int *myattrs; + int32_t cpumask = 0; + +#ifdef DIAGNOSTIC + int bank, off; + + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); + if (bank == -1) + panic("pmap_change_attrs: unmanaged page?"); +#endif + mdpg = &pg->mdpage; + + PMAP_HEAD_TO_MAP_LOCK(); + pvh = &mdpg->mp_pvhead; + /* XXX: needed if we hold head->map lock? 
*/ + simple_lock(&pvh->pvh_lock); + + myattrs = &mdpg->mp_attrs; + result = *myattrs & clearbits; + *myattrs &= ~clearbits; + + SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) { +#ifdef DIAGNOSTIC + if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])) + panic("pmap_change_attrs: mapping without PTP " + "detected"); +#endif + + ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ + opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); + if (opte & clearbits) { + /* We need to do something */ + if (clearbits == PG_RW) { + result |= PG_RW; + + /* + * On write protect we might not need to flush + * the TLB + */ + + /* First zap the RW bit! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); + PTE_ATOMIC_CLEARBITS( + &ptes[x86_btop(pve->pv_va)], + maptp, PG_RW); + opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); + + /* + * Then test if it is not cached as RW the TLB + */ + if (!(opte & PG_M)) + goto no_tlb_shootdown; + } + + /* + * Since we need a shootdown me might as well + * always clear PG_U AND PG_M. + */ + + /* zap! 
*/ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); + PTE_ATOMIC_SET(&ptes[x86_btop(pve->pv_va)], maptp, + (opte & ~(PG_U | PG_M)), opte); + + result |= (opte & clearbits); + *myattrs |= (opte & ~(clearbits)); + + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, + &cpumask); + } +no_tlb_shootdown: + pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */ + } + + simple_unlock(&pvh->pvh_lock); + PMAP_HEAD_TO_MAP_UNLOCK(); + + pmap_tlb_shootnow(cpumask); + return(result != 0); +} + + +/* + * p m a p p r o t e c t i o n f u n c t i o n s + */ + +/* + * pmap_page_protect: change the protection of all recorded mappings + * of a managed page + * + * => NOTE: this is an inline function in pmap.h + */ + +/* see pmap.h */ + +/* + * pmap_protect: set the protection in of the pages in a pmap + * + * => NOTE: this is an inline function in pmap.h + */ + +/* see pmap.h */ + +/* + * pmap_write_protect: write-protect pages in a pmap + */ + +void +pmap_write_protect(pmap, sva, eva, prot) + struct pmap *pmap; + vaddr_t sva, eva; + vm_prot_t prot; +{ + pt_entry_t *ptes, *epte; + pt_entry_t *maptp; +#ifndef XEN + volatile +#endif + pt_entry_t *spte; + vaddr_t blockend; + int32_t cpumask = 0; + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + + /* should be ok, but just in case ... */ + sva &= PG_FRAME; + eva &= PG_FRAME; + + for (/* null */ ; sva < eva ; sva = blockend) { + + blockend = (sva & PD_MASK) + NBPD; + if (blockend > eva) + blockend = eva; + + /* + * XXXCDC: our PTE mappings should never be write-protected! + * + * long term solution is to move the PTEs out of user + * address space. and into kernel address space (up + * with APTE). then we can set VM_MAXUSER_ADDRESS to + * be VM_MAX_ADDRESS. + */ + + /* XXXCDC: ugly hack to avoid freeing PDP here */ + if (pdei(sva) == PDSLOT_PTE) + continue; + + /* empty block? 
*/ + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) + continue; + +#ifdef DIAGNOSTIC + if (sva >= VM_MAXUSER_ADDRESS && + sva < VM_MAX_ADDRESS) + panic("pmap_write_protect: PTE space"); +#endif + + spte = &ptes[x86_btop(sva)]; + epte = &ptes[x86_btop(blockend)]; + + for (/*null */; spte < epte ; spte++) { + if ((PTE_GET(spte) & (PG_RW|PG_V)) == (PG_RW|PG_V)) { + maptp = (pt_entry_t *)vtomach((vaddr_t)spte); + PTE_ATOMIC_CLEARBITS(spte, maptp, PG_RW); + if (PTE_GET(spte) & PG_M) + pmap_tlb_shootdown(pmap, + x86_ptob(spte - ptes), + PTE_GET(spte), &cpumask); + } + } + } + + /* + * if we kept a removal record and removed some pages update the TLB + */ + + pmap_tlb_shootnow(cpumask); + pmap_unmap_ptes(pmap); /* unlocks pmap */ +} + +/* + * end of protection functions + */ + +/* + * pmap_unwire: clear the wired bit in the PTE + * + * => mapping should already be in map + */ + +void +pmap_unwire(pmap, va) + struct pmap *pmap; + vaddr_t va; +{ + pt_entry_t *ptes; + pt_entry_t *maptp; + + if (pmap_valid_entry(pmap->pm_pdir[pdei(va)])) { + ptes = pmap_map_ptes(pmap); /* locks pmap */ + +#ifdef DIAGNOSTIC + if (!pmap_valid_entry(ptes[x86_btop(va)])) + panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); +#endif + if ((ptes[x86_btop(va)] & PG_W) != 0) { + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(va)]); + PTE_ATOMIC_CLEARBITS(&ptes[x86_btop(va)], maptp, PG_W); + pmap->pm_stats.wired_count--; + } +#ifdef DIAGNOSTIC + else { + printf("pmap_unwire: wiring for pmap %p va 0x%lx " + "didn't change!\n", pmap, va); + } +#endif + pmap_unmap_ptes(pmap); /* unlocks map */ + } +#ifdef DIAGNOSTIC + else { + panic("pmap_unwire: invalid PDE"); + } +#endif +} + +/* + * pmap_collect: free resources held by a pmap + * + * => optional function. + * => called when a process is swapped out to free memory. + */ + +void +pmap_collect(pmap) + struct pmap *pmap; +{ + /* + * free all of the pt pages by removing the physical mappings + * for its entire address space. 
+ */ + + pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS, + PMAP_REMOVE_SKIPWIRED); +} + +/* + * pmap_copy: copy mappings from one pmap to another + * + * => optional function + * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) + */ + +/* + * defined as macro in pmap.h + */ + +/* + * pmap_enter: enter a mapping into a pmap + * + * => must be done "now" ... no lazy-evaluation + * => we set pmap => pv_head locking + */ + +int +pmap_enter(pmap, va, pa, prot, flags) + struct pmap *pmap; + vaddr_t va; + paddr_t pa; + vm_prot_t prot; + int flags; +{ + pt_entry_t *ptes, opte, npte; + struct vm_page *ptp, *pg; + struct vm_page_md *mdpg; + struct pv_head *old_pvh, *new_pvh; + struct pv_entry *pve = NULL; /* XXX gcc */ + int error; + boolean_t wired = (flags & PMAP_WIRED) != 0; + pt_entry_t *maptp; + + XENPRINTK(("pmap_enter(%p, %p, %p, %08x, %08x)\n", + pmap, (void *)va, (void *)pa, prot, flags)); + +#ifdef DIAGNOSTIC + /* sanity check: totally out of range? */ + if (va >= VM_MAX_KERNEL_ADDRESS) + panic("pmap_enter: too big"); + + if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) + panic("pmap_enter: trying to map over PDP/APDP!"); + + /* sanity check: kernel PTPs should already have been pre-allocated */ + if (va >= VM_MIN_KERNEL_ADDRESS && + !pmap_valid_entry(pmap->pm_pdir[pdei(va)])) + panic("pmap_enter: missing kernel PTP!"); +#endif + + npte = protection_codes[prot] | PG_V; + + if (pa >= pmap_pa_start && pa < pmap_pa_end) + npte |= xpmap_ptom(pa); + else { + XENPRINTF(("pmap_enter: va %08lx outside pa range %08lx\n", + va, pa)); + npte |= pa; + } + + /* XENPRINTK(("npte %p\n", npte)); */ + + if (wired) + npte |= PG_W; + + if (va < VM_MAXUSER_ADDRESS) + npte |= PG_u; + else if (va < VM_MAX_ADDRESS) + npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? 
*/ + if (pmap == pmap_kernel()) + npte |= pmap_pg_g; + + /* get lock */ + PMAP_MAP_TO_HEAD_LOCK(); + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + if (pmap == pmap_kernel()) { + ptp = NULL; + } else { + ptp = pmap_get_ptp(pmap, pdei(va)); + if (ptp == NULL) { + if (flags & PMAP_CANFAIL) { + error = ENOMEM; + goto out; + } + panic("pmap_enter: get ptp failed"); + } + } + + /* + * Get first view on old PTE + * on SMP the PTE might gain PG_U and PG_M flags + * before we zap it later + */ + opte = pte_get(&ptes[x86_btop(va)]); /* old PTE */ + XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", + (void *)npte, (void *)opte, ptes, x86_btop(va))); + + /* + * is there currently a valid mapping at our VA and does it + * map to the same PA as the one we want to map ? + */ + + if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) { + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + npte |= (opte & PG_PVLIST); + + XENPRINTK(("pmap update opte == pa")); + /* zap! */ + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte); + + /* + * Any change in the protection level that the CPU + * should know about ? + */ + if ((npte & PG_RW) + || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) { + XENPRINTK(("pmap update opte == pa, prot change")); + /* + * No need to flush the TLB. + * Just add old PG_M, ... flags in new entry. 
+ */ + PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp, + opte & (PG_M | PG_U)); + goto out_ok; + } + + /* + * Might be cached in the TLB as being writable + * if this is on the PVLIST, sync R/M bit + */ + if (opte & PG_PVLIST) { + pg = PHYS_TO_VM_PAGE(pa); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_enter: same pa PG_PVLIST " + "mapping with unmanaged page " + "pa = 0x%lx (0x%lx)", pa, + atop(pa)); +#endif + mdpg = &pg->mdpage; + old_pvh = &mdpg->mp_pvhead; + simple_lock(&old_pvh->pvh_lock); + mdpg->mp_attrs |= opte; + simple_unlock(&old_pvh->pvh_lock); + } + goto shootdown_now; + } + + pg = PHYS_TO_VM_PAGE(pa); + XENPRINTK(("pg %p from %p, init %d\n", pg, (void *)pa, + pmap_initialized)); + if (pmap_initialized && pg != NULL) { + /* This is a managed page */ + npte |= PG_PVLIST; + mdpg = &pg->mdpage; + new_pvh = &mdpg->mp_pvhead; + if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) { + /* We can not steal a pve - allocate one */ + pve = pmap_alloc_pv(pmap, ALLOCPV_NEED); + if (pve == NULL) { + if (!(flags & PMAP_CANFAIL)) + panic("pmap_enter: " + "no pv entries available"); + error = ENOMEM; + goto out; + } + } + } else { + new_pvh = NULL; + } + + /* + * is there currently a valid mapping at our VA? + */ + + if (pmap_valid_entry(opte)) { + + /* + * changing PAs: we must remove the old one first + */ + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + if (opte & PG_PVLIST) { + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_enter: PG_PVLIST mapping with " + "unmanaged page " + "pa = 0x%lx (0x%lx)", pa, atop(pa)); +#endif + mdpg = &pg->mdpage; + old_pvh = &mdpg->mp_pvhead; + + /* new_pvh is NULL if page will not be managed */ + pmap_lock_pvhs(old_pvh, new_pvh); + + XENPRINTK(("pmap change pa")); + /* zap! 
*/ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, + npte); + + pve = pmap_remove_pv(old_pvh, pmap, va); + KASSERT(pve != 0); + mdpg->mp_attrs |= opte; + + if (new_pvh) { + pmap_enter_pv(new_pvh, pve, pmap, va, ptp); + simple_unlock(&new_pvh->pvh_lock); + } else + pmap_free_pv(pmap, pve); + simple_unlock(&old_pvh->pvh_lock); + + goto shootdown_test; + } + } else { /* opte not valid */ + pmap->pm_stats.resident_count++; + if (wired) + pmap->pm_stats.wired_count++; + if (ptp) + ptp->wire_count++; + } + + if (new_pvh) { + simple_lock(&new_pvh->pvh_lock); + pmap_enter_pv(new_pvh, pve, pmap, va, ptp); + simple_unlock(&new_pvh->pvh_lock); + } + + XENPRINTK(("pmap initial setup\n")); + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], + maptp, npte); /* zap! */ + +shootdown_test: + /* Update page attributes if needed */ + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; +#endif +shootdown_now: +#if defined(MULTIPROCESSOR) + pmap_tlb_shootdown(pmap, va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. */ + if (pmap_is_curpmap(pmap)) + pmap_update_pg(va); +#endif + } + +out_ok: + error = 0; + +out: + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); + + XENPRINTK(("pmap_enter: %d\n", error)); + return error; +} + +/* + * pmap_enter_ma: enter a mapping into a pmap + * + * => must be done "now" ... 
no lazy-evaluation + * => we set pmap => pv_head locking + */ + +int +pmap_enter_ma(pmap, va, pa, prot, flags) + struct pmap *pmap; + vaddr_t va; + paddr_t pa; + vm_prot_t prot; + int flags; +{ + pt_entry_t *ptes, opte, npte; + pt_entry_t *maptp; + struct vm_page *ptp, *pg; + struct vm_page_md *mdpg; + struct pv_head *old_pvh; + struct pv_entry *pve = NULL; /* XXX gcc */ + int error; + boolean_t wired = (flags & PMAP_WIRED) != 0; + + XENPRINTK(("pmap_enter_ma(%p, %p, %p, %08x, %08x)\n", + pmap, (void *)va, (void *)pa, prot, flags)); + +#ifdef DIAGNOSTIC + /* sanity check: totally out of range? */ + if (va >= VM_MAX_KERNEL_ADDRESS) + panic("pmap_enter: too big"); + + if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) + panic("pmap_enter: trying to map over PDP/APDP!"); + + /* sanity check: kernel PTPs should already have been pre-allocated */ + if (va >= VM_MIN_KERNEL_ADDRESS && + !pmap_valid_entry(pmap->pm_pdir[pdei(va)])) + panic("pmap_enter: missing kernel PTP!"); +#endif + + npte = pa | protection_codes[prot] | PG_V; + /* XENPRINTK(("npte %p\n", npte)); */ + + if (wired) + npte |= PG_W; + + if (va < VM_MAXUSER_ADDRESS) + npte |= PG_u; + else if (va < VM_MAX_ADDRESS) + npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? 
*/ + if (pmap == pmap_kernel()) + npte |= pmap_pg_g; + + /* get lock */ + PMAP_MAP_TO_HEAD_LOCK(); + + ptes = pmap_map_ptes(pmap); /* locks pmap */ + if (pmap == pmap_kernel()) { + ptp = NULL; + } else { + ptp = pmap_get_ptp(pmap, pdei(va)); + if (ptp == NULL) { + if (flags & PMAP_CANFAIL) { + error = ENOMEM; + goto out; + } + panic("pmap_enter: get ptp failed"); + } + } + + /* + * Get first view on old PTE + * on SMP the PTE might gain PG_U and PG_M flags + * before we zap it later + */ + opte = pte_get_ma(&ptes[x86_btop(va)]); /* old PTE */ + XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", + (void *)npte, (void *)opte, ptes, x86_btop(va))); + XENPRINTF(("pmap_enter_ma pa %08lx va %08lx opte %08x npte %08x " + "wired %d count %ld\n", pa, va, opte, npte, wired, + pmap->pm_stats.wired_count)); + + /* + * is there currently a valid mapping at our VA and does it + * map to the same MA as the one we want to map ? + */ + + if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) { + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + XENPRINTK(("pmap update opte == pa")); + /* zap! */ + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte); + + /* + * Any change in the protection level that the CPU + * should know about ? + */ + if ((npte & PG_RW) + || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) { + XENPRINTK(("pmap update opte == pa, prot change")); + /* + * No need to flush the TLB. + * Just add old PG_M, ... flags in new entry. 
+ */ + PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp, + opte & (PG_M | PG_U)); + goto out_ok; + } + + /* + * Might be cached in the TLB as being writable + * if this is on the PVLIST, sync R/M bit + */ + KDASSERT((opte & PG_PVLIST) == 0); + goto shootdown_now; + } + + /* + * no managed mapping for pages mapped through pmap_enter_ma. + */ + + /* + * is there currently a valid mapping at our VA? + */ + + if (pmap_valid_entry(opte)) { + + /* + * changing PAs: we must remove the old one first + */ + + /* + * first, calculate pm_stats updates. resident count will not + * change since we are replacing/changing a valid mapping. + * wired count might change... + */ + pmap->pm_stats.wired_count += + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); + + if (opte & PG_PVLIST) { + opte = xpmap_mtop(opte); + KDASSERT((opte & PG_FRAME) != + (KERNTEXTOFF - KERNBASE_LOCORE)); + + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); +#ifdef DIAGNOSTIC + if (pg == NULL) + panic("pmap_enter: PG_PVLIST mapping with " + "unmanaged page " + "pa = 0x%lx (0x%lx)", pa, atop(pa)); +#endif + mdpg = &pg->mdpage; + old_pvh = &mdpg->mp_pvhead; + + /* NULL new_pvh since page will not be managed */ + pmap_lock_pvhs(old_pvh, NULL); + + XENPRINTK(("pmap change pa")); + /* zap! */ + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, + npte); + + pve = pmap_remove_pv(old_pvh, pmap, va); + KASSERT(pve != 0); + mdpg->mp_attrs |= opte; + + pmap_free_pv(pmap, pve); + simple_unlock(&old_pvh->pvh_lock); + + goto shootdown_test; + } + } else { /* opte not valid */ + pmap->pm_stats.resident_count++; + if (wired) + pmap->pm_stats.wired_count++; + if (ptp) + ptp->wire_count++; + } + + XENPRINTK(("pmap initial setup")); + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], + maptp, npte); /* zap! 
*/ + +shootdown_test: + /* Update page attributes if needed */ + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { +#if defined(MULTIPROCESSOR) + int32_t cpumask = 0; +#endif +shootdown_now: +#if defined(MULTIPROCESSOR) + pmap_tlb_shootdown(pmap, va, opte, &cpumask); + pmap_tlb_shootnow(cpumask); +#else + /* Don't bother deferring in the single CPU case. */ + if (pmap_is_curpmap(pmap)) + pmap_update_pg(va); +#endif + } + +out_ok: + error = 0; + +out: + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); + + XENPRINTK(("pmap_enter: %d\n", error)); + return error; +} + +/* + * pmap_growkernel: increase usage of KVM space + * + * => we allocate new PTPs for the kernel and install them in all + * the pmaps on the system. + */ + +vaddr_t +pmap_growkernel(maxkvaddr) + vaddr_t maxkvaddr; +{ + struct pmap *kpm = pmap_kernel(), *pm; + pd_entry_t *mapdp; + pt_entry_t *maptp; + int needed_kpde; /* needed number of kernel PTPs */ + int s; + paddr_t ptaddr; + + needed_kpde = (u_int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1)) + / NBPD; + XENPRINTF(("pmap_growkernel %p: %d -> %d\n", (void *)maxkvaddr, + nkpde, needed_kpde)); + if (needed_kpde <= nkpde) + goto out; /* we are OK */ + + /* + * whoops! we need to add kernel PTPs + */ + + s = splhigh(); /* to be safe */ + simple_lock(&kpm->pm_obj.vmobjlock); + + for (/*null*/ ; nkpde < needed_kpde ; nkpde++) { + + mapdp = (pt_entry_t *)vtomach((vaddr_t)&kpm->pm_pdir[PDSLOT_KERN + nkpde]); + if (uvm.page_init_done == FALSE) { + + /* + * we're growing the kernel pmap early (from + * uvm_pageboot_alloc()). this case must be + * handled a little differently. 
+ */ + + if (uvm_page_physget(&ptaddr) == FALSE) + panic("pmap_growkernel: out of memory"); + pmap_zero_page(ptaddr); + + XENPRINTF(("xxxx maybe not PG_RW\n")); + PDE_SET(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, ptaddr | PG_RW | PG_V); + + /* count PTP as resident */ + kpm->pm_stats.resident_count++; + continue; + } + + /* + * THIS *MUST* BE CODED SO AS TO WORK IN THE + * pmap_initialized == FALSE CASE! WE MAY BE + * INVOKED WHILE pmap_init() IS RUNNING! + */ + + if (pmap_alloc_ptp(kpm, PDSLOT_KERN + nkpde) == NULL) { + panic("pmap_growkernel: alloc ptp failed"); + } + + /* PG_u not for kernel */ + PDE_CLEARBITS(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, PG_u); + + /* distribute new kernel PTP to all active pmaps */ + simple_lock(&pmaps_lock); + for (pm = pmaps.lh_first; pm != NULL; + pm = pm->pm_list.le_next) { + XENPRINTF(("update\n")); + maptp = (pt_entry_t *)vtomach( + (vaddr_t)&pm->pm_pdir[PDSLOT_KERN + nkpde]); + PDE_COPY(&pm->pm_pdir[PDSLOT_KERN + nkpde], maptp, + &kpm->pm_pdir[PDSLOT_KERN + nkpde]); + } + + /* Invalidate the PDP cache. */ + pool_cache_invalidate(&pmap_pdp_cache); + pmap_pdp_cache_generation++; + + simple_unlock(&pmaps_lock); + } + + simple_unlock(&kpm->pm_obj.vmobjlock); + splx(s); + +out: + XENPRINTF(("pmap_growkernel return %d %p\n", nkpde, + (void *)(VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD)))); + return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD)); +} + +#ifdef DEBUG +void pmap_dump(struct pmap *, vaddr_t, vaddr_t); + +/* + * pmap_dump: dump all the mappings from a pmap + * + * => caller should not be holding any pmap locks + */ + +void +pmap_dump(pmap, sva, eva) + struct pmap *pmap; + vaddr_t sva, eva; +{ + pt_entry_t *ptes, *pte; + vaddr_t blkendva; + + /* + * if end is out of range truncate. + * if (end == start) update to max. 
+ */ + + if (eva > VM_MAXUSER_ADDRESS || eva <= sva) + eva = VM_MAXUSER_ADDRESS; + + /* + * we lock in the pmap => pv_head direction + */ + + PMAP_MAP_TO_HEAD_LOCK(); + ptes = pmap_map_ptes(pmap); /* locks pmap */ + + /* + * dumping a range of pages: we dump in PTP sized blocks (4MB) + */ + + for (/* null */ ; sva < eva ; sva = blkendva) { + + /* determine range of block */ + blkendva = x86_round_pdr(sva+1); + if (blkendva > eva) + blkendva = eva; + + /* valid block? */ + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) + continue; + + pte = &ptes[x86_btop(sva)]; + for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { + if (!pmap_valid_entry(*pte)) + continue; + XENPRINTF(("va %#lx -> pa %#lx (pte=%#lx)\n", + sva, PTE_GET(pte), PTE_GET(pte) & PG_FRAME)); + } + } + pmap_unmap_ptes(pmap); + PMAP_MAP_TO_HEAD_UNLOCK(); +} +#endif + +/******************** TLB shootdown code ********************/ + + +void +pmap_tlb_shootnow(int32_t cpumask) +{ + struct cpu_info *self; +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + int s; +#ifdef DIAGNOSTIC + int count = 0; +#endif +#endif + + if (cpumask == 0) + return; + + self = curcpu(); +#ifdef MULTIPROCESSOR + s = splipi(); + self->ci_tlb_ipi_mask = cpumask; +#endif + + pmap_do_tlb_shootdown(self); /* do *our* work. */ + +#ifdef MULTIPROCESSOR + splx(s); + + /* + * Send the TLB IPI to other CPUs pending shootdowns. + */ + for (CPU_INFO_FOREACH(cii, ci)) { + if (ci == self) + continue; + if (cpumask & (1U << ci->ci_cpuid)) + if (x86_send_ipi(ci, X86_IPI_TLB) != 0) + x86_atomic_clearbits_l(&self->ci_tlb_ipi_mask, + (1U << ci->ci_cpuid)); + } + + while (self->ci_tlb_ipi_mask != 0) { +#ifdef DIAGNOSTIC + if (count++ > 10000000) + panic("TLB IPI rendezvous failed (mask %x)", + self->ci_tlb_ipi_mask); +#endif + x86_pause(); + } +#endif +} + +/* + * pmap_tlb_shootdown: + * + * Cause the TLB entry for pmap/va to be shot down. 
+ */ +void +pmap_tlb_shootdown(pmap, va, pte, cpumaskp) + pmap_t pmap; + vaddr_t va; + pt_entry_t pte; + int32_t *cpumaskp; +{ + struct cpu_info *ci, *self; + struct pmap_tlb_shootdown_q *pq; + struct pmap_tlb_shootdown_job *pj; + CPU_INFO_ITERATOR cii; + int s; + +#ifdef LARGEPAGES + if (pte & PG_PS) + va &= PG_LGFRAME; +#endif + + if (pmap_initialized == FALSE || cpus_attached == 0) { + pmap_update_pg(va); + return; + } + + self = curcpu(); + + s = splipi(); +#if 0 + printf("dshootdown %lx\n", va); +#endif + + for (CPU_INFO_FOREACH(cii, ci)) { + /* Note: we queue shootdown events for ourselves here! */ + if (pmap_is_active(pmap, ci->ci_cpuid) == 0) + continue; + if (ci != self && !(ci->ci_flags & CPUF_RUNNING)) + continue; + pq = &pmap_tlb_shootdown_q[ci->ci_cpuid]; + __cpu_simple_lock(&pq->pq_slock); + + /* + * If there's a global flush already queued, or a + * non-global flush, and this pte doesn't have the G + * bit set, don't bother. + */ + if (pq->pq_flushg > 0 || + (pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) { + __cpu_simple_unlock(&pq->pq_slock); + continue; + } + +#ifdef I386_CPU + /* + * i386 CPUs can't invalidate a single VA, only + * flush the entire TLB, so don't bother allocating + * jobs for them -- just queue a `flushu'. + * + * XXX note that this can be executed for non-i386 + * when called * early (before identifycpu() has set + * cpu_class) + */ + if (cpu_class == CPUCLASS_386) { + pq->pq_flushu++; + *cpumaskp |= 1U << ci->ci_cpuid; + __cpu_simple_unlock(&pq->pq_slock); + continue; + } +#endif + + pj = pmap_tlb_shootdown_job_get(pq); + pq->pq_pte |= pte; + if (pj == NULL) { + /* + * Couldn't allocate a job entry. + * Kill it now for this CPU, unless the failure + * was due to too many pending flushes; otherwise, + * tell other cpus to kill everything.. 
+ */ + if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) { + pmap_update_pg(va); + __cpu_simple_unlock(&pq->pq_slock); + continue; + } else { + if (pq->pq_pte & pmap_pg_g) + pq->pq_flushg++; + else + pq->pq_flushu++; + /* + * Since we've nailed the whole thing, + * drain the job entries pending for that + * processor. + */ + pmap_tlb_shootdown_q_drain(pq); + *cpumaskp |= 1U << ci->ci_cpuid; + } + } else { + pj->pj_pmap = pmap; + pj->pj_va = va; + pj->pj_pte = pte; + TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list); + *cpumaskp |= 1U << ci->ci_cpuid; + } + __cpu_simple_unlock(&pq->pq_slock); + } + splx(s); +} + +/* + * pmap_do_tlb_shootdown_checktlbstate: check and update ci_tlbstate. + * + * => called at splipi. + * => return TRUE if we need to maintain user tlbs. + */ +static __inline boolean_t +pmap_do_tlb_shootdown_checktlbstate(struct cpu_info *ci) +{ + + KASSERT(ci == curcpu()); + + if (ci->ci_tlbstate == TLBSTATE_LAZY) { + KASSERT(ci->ci_pmap != pmap_kernel()); + /* + * mostly KASSERT(ci->ci_pmap->pm_cpus & (1U << ci->ci_cpuid)); + */ + + /* + * we no longer want tlb shootdown ipis for this pmap. + * mark the pmap no longer in use by this processor. + */ + + x86_atomic_clearbits_l(&ci->ci_pmap->pm_cpus, + 1U << ci->ci_cpuid); + ci->ci_tlbstate = TLBSTATE_STALE; + } + + if (ci->ci_tlbstate == TLBSTATE_STALE) + return FALSE; + + return TRUE; +} + +/* + * pmap_do_tlb_shootdown: + * + * Process pending TLB shootdown operations for this processor. 
+ */ +void +pmap_do_tlb_shootdown(struct cpu_info *self) +{ + u_long cpu_id = self->ci_cpuid; + struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[cpu_id]; + struct pmap_tlb_shootdown_job *pj; + int s; +#ifdef MULTIPROCESSOR + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; +#endif + KASSERT(self == curcpu()); + + s = splipi(); + + __cpu_simple_lock(&pq->pq_slock); + + if (pq->pq_flushg) { + COUNT(flushg); + pmap_do_tlb_shootdown_checktlbstate(self); + tlbflushg(); + pq->pq_flushg = 0; + pq->pq_flushu = 0; + pmap_tlb_shootdown_q_drain(pq); + } else { + /* + * TLB flushes for PTEs with PG_G set may be in the queue + * after a flushu, they need to be dealt with. + */ + if (pq->pq_flushu) { + COUNT(flushu); + pmap_do_tlb_shootdown_checktlbstate(self); + tlbflush(); + } + while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) { + TAILQ_REMOVE(&pq->pq_head, pj, pj_list); + + if ((pj->pj_pte & pmap_pg_g) || + pj->pj_pmap == pmap_kernel()) { + pmap_update_pg(pj->pj_va); + } else if (!pq->pq_flushu && + pj->pj_pmap == self->ci_pmap) { + if (pmap_do_tlb_shootdown_checktlbstate(self)) + pmap_update_pg(pj->pj_va); + } + + pmap_tlb_shootdown_job_put(pq, pj); + } + + pq->pq_flushu = pq->pq_pte = 0; + } + +#ifdef MULTIPROCESSOR + for (CPU_INFO_FOREACH(cii, ci)) + x86_atomic_clearbits_l(&ci->ci_tlb_ipi_mask, + (1U << cpu_id)); +#endif + __cpu_simple_unlock(&pq->pq_slock); + + splx(s); +} + + +/* + * pmap_tlb_shootdown_q_drain: + * + * Drain a processor's TLB shootdown queue. We do not perform + * the shootdown operations. This is merely a convenience + * function. + * + * Note: We expect the queue to be locked. + */ +void +pmap_tlb_shootdown_q_drain(pq) + struct pmap_tlb_shootdown_q *pq; +{ + struct pmap_tlb_shootdown_job *pj; + + while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) { + TAILQ_REMOVE(&pq->pq_head, pj, pj_list); + pmap_tlb_shootdown_job_put(pq, pj); + } + pq->pq_pte = 0; +} + +/* + * pmap_tlb_shootdown_job_get: + * + * Get a TLB shootdown job queue entry. 
This places a limit on + * the number of outstanding jobs a processor may have. + * + * Note: We expect the queue to be locked. + */ +struct pmap_tlb_shootdown_job * +pmap_tlb_shootdown_job_get(pq) + struct pmap_tlb_shootdown_q *pq; +{ + struct pmap_tlb_shootdown_job *pj; + + if (pq->pq_count >= PMAP_TLB_MAXJOBS) + return (NULL); + + __cpu_simple_lock(&pmap_tlb_shootdown_job_lock); + if (pj_free == NULL) { + __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock); + return NULL; + } + pj = &pj_free->pja_job; + pj_free = + (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree; + __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock); + + pq->pq_count++; + return (pj); +} + +/* + * pmap_tlb_shootdown_job_put: + * + * Put a TLB shootdown job queue entry onto the free list. + * + * Note: We expect the queue to be locked. + */ +void +pmap_tlb_shootdown_job_put(pq, pj) + struct pmap_tlb_shootdown_q *pq; + struct pmap_tlb_shootdown_job *pj; +{ + +#ifdef DIAGNOSTIC + if (pq->pq_count == 0) + panic("pmap_tlb_shootdown_job_put: queue length inconsistency"); +#endif + __cpu_simple_lock(&pmap_tlb_shootdown_job_lock); + pj->pj_nextfree = &pj_free->pja_job; + pj_free = (union pmap_tlb_shootdown_job_al *)pj; + __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock); + + pq->pq_count--; +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c new file mode 100644 index 0000000000..d65741fbf2 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c @@ -0,0 +1,550 @@ +/* $NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $ */ +/* NetBSD: sys_machdep.c,v 1.70 2003/10/27 14:11:47 junyoung Exp */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $"); + +#include "opt_compat_netbsd.h" +#include "opt_mtrr.h" +#include "opt_perfctrs.h" +#include "opt_user_ldt.h" +#include "opt_vm86.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl.h> +#include <sys/file.h> +#include <sys/time.h> +#include <sys/proc.h> +#include <sys/user.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/buf.h> +#include <sys/signal.h> +#include <sys/malloc.h> + +#include <sys/mount.h> +#include <sys/sa.h> +#include <sys/syscallargs.h> + +#include <uvm/uvm_extern.h> + +#include <machine/cpu.h> +#include <machine/cpufunc.h> +#include <machine/gdt.h> +#include <machine/psl.h> +#include <machine/reg.h> +#include <machine/sysarch.h> +#include <machine/mtrr.h> + +#ifdef VM86 +#include <machine/vm86.h> +#endif + +#ifdef PERFCTRS +#include <machine/pmc.h> +#endif + +extern struct vm_map *kernel_map; + +int i386_iopl(struct lwp *, void *, register_t *); +int i386_get_ioperm(struct lwp *, void *, register_t *); +int i386_set_ioperm(struct lwp *, void *, register_t *); +int i386_get_mtrr(struct lwp *, void *, register_t *); +int i386_set_mtrr(struct lwp *, void *, register_t *); + +#ifdef USER_LDT + +#ifdef LDT_DEBUG +static void i386_print_ldt(int, const struct segment_descriptor *); + +static void +i386_print_ldt(i, d) + int i; + const struct segment_descriptor *d; +{ + printf("[%d] lolimit=0x%x, lobase=0x%x, type=%u, dpl=%u, p=%u, " + "hilimit=0x%x, xx=%x, def32=%u, gran=%u, hibase=0x%x\n", + i, d->sd_lolimit, d->sd_lobase, d->sd_type, d->sd_dpl, d->sd_p, + d->sd_hilimit, d->sd_xx, d->sd_def32, d->sd_gran, d->sd_hibase); +} +#endif + +int +i386_get_ldt(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct proc *p = l->l_proc; + pmap_t pmap = p->p_vmspace->vm_map.pmap; + int nldt, num; + union descriptor *lp, *cp; + struct i386_get_ldt_args ua; + + if ((error = 
copyin(args, &ua, sizeof(ua))) != 0) + return (error); + +#ifdef LDT_DEBUG + printf("i386_get_ldt: start=%d num=%d descs=%p\n", ua.start, + ua.num, ua.desc); +#endif + + if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 || + ua.start + ua.num > 8192) + return (EINVAL); + + cp = malloc(ua.num * sizeof(union descriptor), M_TEMP, M_WAITOK); + if (cp == NULL) + return ENOMEM; + + simple_lock(&pmap->pm_lock); + + if (pmap->pm_flags & PMF_USER_LDT) { + nldt = pmap->pm_ldt_len; + lp = pmap->pm_ldt; + } else { + nldt = NLDT; + lp = ldt; + } + + if (ua.start > nldt) { + simple_unlock(&pmap->pm_lock); + free(cp, M_TEMP); + return (EINVAL); + } + + lp += ua.start; + num = min(ua.num, nldt - ua.start); +#ifdef LDT_DEBUG + { + int i; + for (i = 0; i < num; i++) + i386_print_ldt(i, &lp[i].sd); + } +#endif + + memcpy(cp, lp, num * sizeof(union descriptor)); + simple_unlock(&pmap->pm_lock); + + error = copyout(cp, ua.desc, num * sizeof(union descriptor)); + if (error == 0) + *retval = num; + + free(cp, M_TEMP); + return (error); +} + +int +i386_set_ldt(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error, i, n; + struct proc *p = l->l_proc; + struct pcb *pcb = &l->l_addr->u_pcb; + pmap_t pmap = p->p_vmspace->vm_map.pmap; + struct i386_set_ldt_args ua; + union descriptor *descv; + size_t old_len, new_len, ldt_len; + union descriptor *old_ldt, *new_ldt; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return (error); + + if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 || + ua.start + ua.num > 8192) + return (EINVAL); + + descv = malloc(sizeof (*descv) * ua.num, M_TEMP, M_NOWAIT); + if (descv == NULL) + return (ENOMEM); + + if ((error = copyin(ua.desc, descv, sizeof (*descv) * ua.num)) != 0) + goto out; + + /* Check descriptors for access violations. 
*/ + for (i = 0; i < ua.num; i++) { + union descriptor *desc = &descv[i]; + + switch (desc->sd.sd_type) { + case SDT_SYSNULL: + desc->sd.sd_p = 0; + break; + case SDT_SYS286CGT: + case SDT_SYS386CGT: + /* + * Only allow call gates targeting a segment + * in the LDT or a user segment in the fixed + * part of the gdt. Segments in the LDT are + * constrained (below) to be user segments. + */ + if (desc->gd.gd_p != 0 && + !ISLDT(desc->gd.gd_selector) && + ((IDXSEL(desc->gd.gd_selector) >= NGDT) || + (gdt[IDXSEL(desc->gd.gd_selector)].sd.sd_dpl != + SEL_UPL))) { + error = EACCES; + goto out; + } + break; + case SDT_MEMEC: + case SDT_MEMEAC: + case SDT_MEMERC: + case SDT_MEMERAC: + /* Must be "present" if executable and conforming. */ + if (desc->sd.sd_p == 0) { + error = EACCES; + goto out; + } + break; + case SDT_MEMRO: + case SDT_MEMROA: + case SDT_MEMRW: + case SDT_MEMRWA: + case SDT_MEMROD: + case SDT_MEMRODA: + case SDT_MEMRWD: + case SDT_MEMRWDA: + case SDT_MEME: + case SDT_MEMEA: + case SDT_MEMER: + case SDT_MEMERA: + break; + default: + /* + * Make sure that unknown descriptor types are + * not marked present. + */ + if (desc->sd.sd_p != 0) { + error = EACCES; + goto out; + } + break; + } + + if (desc->sd.sd_p != 0) { + /* Only user (ring-3) descriptors may be present. 
*/ + if (desc->sd.sd_dpl != SEL_UPL) { + error = EACCES; + goto out; + } + } + } + + /* allocate user ldt */ + simple_lock(&pmap->pm_lock); + if (pmap->pm_ldt == 0 || (ua.start + ua.num) > pmap->pm_ldt_len) { + if (pmap->pm_flags & PMF_USER_LDT) + ldt_len = pmap->pm_ldt_len; + else + ldt_len = 512; + while ((ua.start + ua.num) > ldt_len) + ldt_len *= 2; + new_len = ldt_len * sizeof(union descriptor); + + simple_unlock(&pmap->pm_lock); + new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, + new_len); + simple_lock(&pmap->pm_lock); + + if (pmap->pm_ldt != NULL && ldt_len <= pmap->pm_ldt_len) { + /* + * Another thread (re)allocated the LDT to + * sufficient size while we were blocked in + * uvm_km_alloc. Oh well. The new entries + * will quite probably not be right, but + * hey.. not our problem if user applications + * have race conditions like that. + */ + uvm_km_free(kernel_map, (vaddr_t)new_ldt, new_len); + goto copy; + } + + old_ldt = pmap->pm_ldt; + + if (old_ldt != NULL) { + old_len = pmap->pm_ldt_len * sizeof(union descriptor); + } else { + old_len = NLDT * sizeof(union descriptor); + old_ldt = ldt; + } + + memcpy(new_ldt, old_ldt, old_len); + memset((caddr_t)new_ldt + old_len, 0, new_len - old_len); + + if (old_ldt != ldt) + uvm_km_free(kernel_map, (vaddr_t)old_ldt, old_len); + + pmap->pm_ldt = new_ldt; + pmap->pm_ldt_len = ldt_len; + + if (pmap->pm_flags & PMF_USER_LDT) + ldt_free(pmap); + else + pmap->pm_flags |= PMF_USER_LDT; + ldt_alloc(pmap, new_ldt, new_len); + pcb->pcb_ldt_sel = pmap->pm_ldt_sel; + if (pcb == curpcb) + lldt(pcb->pcb_ldt_sel); + + } +copy: + /* Now actually replace the descriptors. 
*/ + for (i = 0, n = ua.start; i < ua.num; i++, n++) + pmap->pm_ldt[n] = descv[i]; + + simple_unlock(&pmap->pm_lock); + + *retval = ua.start; + +out: + free(descv, M_TEMP); + return (error); +} +#endif /* USER_LDT */ + +int +i386_iopl(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct proc *p = l->l_proc; + struct pcb *pcb = &l->l_addr->u_pcb; + struct i386_iopl_args ua; + dom0_op_t op; + + if ((xen_start_info.flags & SIF_PRIVILEGED) == 0) + return EPERM; + + if (securelevel > 1) + return EPERM; + + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return error; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return error; + + pcb->pcb_tss.tss_ioopt &= ~SEL_RPL; + if (ua.iopl) + pcb->pcb_tss.tss_ioopt |= SEL_UPL; /* i/o pl */ + else + pcb->pcb_tss.tss_ioopt |= SEL_KPL; /* i/o pl */ + + /* Force the change at ring 0. */ + op.cmd = DOM0_IOPL; + op.u.iopl.domain = DOMID_SELF; + op.u.iopl.iopl = pcb->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */ + HYPERVISOR_dom0_op(&op); + + return 0; +} + +int +i386_get_ioperm(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct pcb *pcb = &l->l_addr->u_pcb; + struct i386_get_ioperm_args ua; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return (error); + + return copyout(pcb->pcb_iomap, ua.iomap, sizeof(pcb->pcb_iomap)); +} + +int +i386_set_ioperm(l, args, retval) + struct lwp *l; + void *args; + register_t *retval; +{ + int error; + struct proc *p = l->l_proc; + struct pcb *pcb = &l->l_addr->u_pcb; + struct i386_set_ioperm_args ua; + + if (securelevel > 1) + return EPERM; + + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return error; + + if ((error = copyin(args, &ua, sizeof(ua))) != 0) + return (error); + + return copyin(ua.iomap, pcb->pcb_iomap, sizeof(pcb->pcb_iomap)); +} + +#ifdef MTRR +int +i386_get_mtrr(struct lwp *l, void *args, register_t *retval) +{ + struct i386_get_mtrr_args ua; + int error, n; + struct proc *p 
= l->l_proc; + + if (mtrr_funcs == NULL) + return ENOSYS; + + error = copyin(args, &ua, sizeof ua); + if (error != 0) + return error; + + error = copyin(ua.n, &n, sizeof n); + if (error != 0) + return error; + + error = mtrr_get(ua.mtrrp, &n, p, MTRR_GETSET_USER); + + copyout(&n, ua.n, sizeof (int)); + + return error; +} + +int +i386_set_mtrr(struct lwp *l, void *args, register_t *retval) +{ + int error, n; + struct i386_set_mtrr_args ua; + struct proc *p = l->l_proc; + + if (mtrr_funcs == NULL) + return ENOSYS; + + error = suser(p->p_ucred, &p->p_acflag); + if (error != 0) + return error; + + error = copyin(args, &ua, sizeof ua); + if (error != 0) + return error; + + error = copyin(ua.n, &n, sizeof n); + if (error != 0) + return error; + + error = mtrr_set(ua.mtrrp, &n, p, MTRR_GETSET_USER); + if (n != 0) + mtrr_commit(); + + copyout(&n, ua.n, sizeof n); + + return error; +} +#endif + +int +sys_sysarch(struct lwp *l, void *v, register_t *retval) +{ + struct sys_sysarch_args /* { + syscallarg(int) op; + syscallarg(void *) parms; + } */ *uap = v; + int error = 0; + + switch(SCARG(uap, op)) { +#ifdef USER_LDT + case I386_GET_LDT: + error = i386_get_ldt(l, SCARG(uap, parms), retval); + break; + + case I386_SET_LDT: + error = i386_set_ldt(l, SCARG(uap, parms), retval); + break; +#endif + + case I386_IOPL: + error = i386_iopl(l, SCARG(uap, parms), retval); + break; + + case I386_GET_IOPERM: + error = i386_get_ioperm(l, SCARG(uap, parms), retval); + break; + + case I386_SET_IOPERM: + error = i386_set_ioperm(l, SCARG(uap, parms), retval); + break; + +#ifdef VM86 + case I386_VM86: + error = i386_vm86(l, SCARG(uap, parms), retval); + break; +#ifdef COMPAT_16 + case I386_OLD_VM86: + error = compat_16_i386_vm86(l, SCARG(uap, parms), retval); + break; +#endif +#endif +#ifdef MTRR + case I386_GET_MTRR: + error = i386_get_mtrr(l, SCARG(uap, parms), retval); + break; + case I386_SET_MTRR: + error = i386_set_mtrr(l, SCARG(uap, parms), retval); + break; +#endif +#ifdef PERFCTRS + 
case I386_PMC_INFO: + error = pmc_info(l, SCARG(uap, parms), retval); + break; + + case I386_PMC_STARTSTOP: + error = pmc_startstop(l, SCARG(uap, parms), retval); + break; + + case I386_PMC_READ: + error = pmc_read(l, SCARG(uap, parms), retval); + break; +#endif + + default: + error = EINVAL; + break; + } + return (error); +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S new file mode 100644 index 0000000000..165b5f06be --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S @@ -0,0 +1,1587 @@ +/* $NetBSD: vector.S,v 1.1.2.1 2004/05/22 15:57:16 he Exp $ */ +/* NetBSD: 1.13 2004/03/11 11:39:26 yamt Exp */ + +/* + * Copyright 2002 (c) Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Frank van der Linden for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_ddb.h" +#include "opt_multiprocessor.h" +#include "opt_ipkdb.h" +#include "opt_vm86.h" +#include "opt_xen.h" + +#ifndef XEN +#include <machine/i8259.h> +#endif +#include <machine/i82093reg.h> +#include <machine/i82489reg.h> +#include <machine/asm.h> +#include <machine/frameasm.h> +#include <machine/segments.h> +#include <machine/trap.h> +#include <machine/intr.h> +#include <machine/psl.h> +#ifdef XEN +#include <machine/xen.h> +#endif + +#include <net/netisr.h> + +#include "ioapic.h" +#include "lapic.h" + +#include "npx.h" +#include "assym.h" + +#define __HAVE_GENERIC_SOFT_INTERRUPTS /* XXX */ + + +/* + * Macros for interrupt entry, call to handler, and exit. + * + * XXX + * The interrupt frame is set up to look like a trap frame. This may be a + * waste. The only handler which needs a frame is the clock handler, and it + * only needs a few bits. Xdoreti() needs a trap frame for handling ASTs, but + * it could easily convert the frame on demand. + * + * The direct costs of setting up a trap frame are two pushl's (error code and + * trap number), an addl to get rid of these, and pushing and popping the + * callee-saved registers %esi, %edi, %ebx, and %ebp twice. + * + * If the interrupt frame is made more flexible, INTR can push %eax first and + * decide the ipending case with less overhead, e.g., by avoiding loading the + * segment registers. 
+ * + */ + +#define MY_COUNT _C_LABEL(uvmexp) + +/* XXX See comment in locore.s */ +#ifdef __ELF__ +#define XINTR(name,num) Xintr_/**/name/**/num +#define XSTRAY(name,num) Xstray_/**/name/**/num +#define XINTR_TSS(irq_num) Xintr_tss_ ## irq_num +#else +#define XINTR(name,num) _Xintr_/**/name/**/num +#define XSTRAY(name,num) _Xstray_/**/name/**/num +#define XINTR_TSS(irq_num) Xintr_tss_/**/irq_num +#endif + +/* + * Store address of TSS in %eax, given a selector in %eax. + * Clobbers %eax, %ecx, %edx, but that's ok for its usage. + * This is a bit complicated, but it's done to make as few + * assumptions as possible about the validity of the environment. + * The GDT and the current and previous TSS are known to be OK, + * otherwise we would not be here. The only other thing that needs + * to be OK is the cpu_info structure for the current CPU. + */ +#define GET_TSS \ + andl $0xfff8,%eax ;\ + addl CPUVAR(GDT),%eax ;\ + movl 2(%eax),%edx ;\ + andl $0xffffff,%edx ;\ + movzbl 7(%eax),%eax ;\ + shl $24,%eax ;\ + orl %edx,%eax + +#if NLAPIC > 0 +#ifdef MULTIPROCESSOR +IDTVEC(recurse_lapic_ipi) + pushfl + pushl %cs + pushl %esi + pushl $0 + pushl $T_ASTFLT + INTRENTRY +IDTVEC(resume_lapic_ipi) + cli + jmp 1f +IDTVEC(intr_lapic_ipi) + pushl $0 + pushl $T_ASTFLT + INTRENTRY + movl $0,_C_LABEL(local_apic)+LAPIC_EOI + movl CPUVAR(ILEVEL),%ebx + cmpl $IPL_IPI,%ebx + jae 2f +1: + incl CPUVAR(IDEPTH) + movl $IPL_IPI,CPUVAR(ILEVEL) + sti + pushl %ebx + call _C_LABEL(x86_ipi_handler) + jmp _C_LABEL(Xdoreti) +2: + orl $(1 << LIR_IPI),CPUVAR(IPENDING) + sti + INTRFASTEXIT + +#if defined(DDB) +IDTVEC(intrddbipi) +1: + str %ax + GET_TSS + movzwl (%eax),%eax + GET_TSS + pushl %eax + movl $0xff,_C_LABEL(lapic_tpr) + movl $0,_C_LABEL(local_apic)+LAPIC_EOI + sti + call _C_LABEL(ddb_ipi_tss) + addl $4,%esp + movl $0,_C_LABEL(lapic_tpr) + iret + jmp 1b +#endif /* DDB */ +#endif /* MULTIPROCESSOR */ + + /* + * Interrupt from the local APIC timer. 
	 */
IDTVEC(recurse_lapic_ltimer)
	pushfl
	pushl	%cs
	pushl	%esi
	pushl	$0
	pushl	$T_ASTFLT
	INTRENTRY
IDTVEC(resume_lapic_ltimer)
	cli
	jmp	1f
IDTVEC(intr_lapic_ltimer)
	pushl	$0
	pushl	$T_ASTFLT
	INTRENTRY
	movl	$0,_C_LABEL(local_apic)+LAPIC_EOI	/* ack the lapic */
	movl	CPUVAR(ILEVEL),%ebx
	cmpl	$IPL_CLOCK,%ebx
	jae	2f			/* masked at current IPL: defer */
1:
	incl	CPUVAR(IDEPTH)
	movl	$IPL_CLOCK,CPUVAR(ILEVEL)
	sti
	pushl	%ebx			/* saved IPL, consumed by Xdoreti */
	pushl	$0			/* dummy frame arg */
	call	_C_LABEL(lapic_clockintr)
	addl	$4,%esp
	jmp	_C_LABEL(Xdoreti)
2:
	orl	$(1 << LIR_TIMER),CPUVAR(IPENDING)
	sti
	INTRFASTEXIT
#endif /* NLAPIC > 0 */

/* Big-lock helpers: interrupts take the kernel lock on MP. */
#ifdef MULTIPROCESSOR
#define LOCK_KERNEL	pushl %esp ; call _C_LABEL(x86_intlock) ; addl $4,%esp
#define UNLOCK_KERNEL	pushl %esp ; call _C_LABEL(x86_intunlock) ; addl $4,%esp
#else
#define LOCK_KERNEL
#define UNLOCK_KERNEL
#endif

/* Placeholder argument for the *_ack/mask/unmask macro slots below. */
#define voidop(num)

/*
 * XENINTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask)
 *
 * Generate the recurse/resume/intr entry stubs for Xen event channel
 * <num>.  Same shape as the generic INTRSTUB below, but uses STI/CLI/
 * STIC (hypervisor event-mask primitives) instead of hardware sti/cli,
 * and re-runs stipending() on exit to pick up events that arrived while
 * masked.  Note the local label `6' is defined twice: once for the
 * handler loop and once on the hold-pending exit path; the stray path
 * at `9' deliberately joins the *second* one via `jmp 6b'.
 */
#define	XENINTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
IDTVEC(recurse_/**/name/**/num)					;\
	pushfl							;\
	pushl	%cs						;\
	pushl	%esi						;\
	subl	$4,%esp						;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
IDTVEC(resume_/**/name/**/num)					\
	/*movl	%esp,%ecx*/					;\
	movl	$IREENT_MAGIC,TF_ERR(%esp)			;\
	movl	%ebx,%esi					;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	jmp	1f						;\
IDTVEC(intr_/**/name/**/num)					;\
	pushl	$0		/* dummy error code */		;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
	/*movl	%esp,%ecx*/					;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	mask(num)		/* mask it in hardware */	;\
	early_ack(num)		/* and allow other intrs */	;\
	testl	%ebp,%ebp					;\
	jz	9f		/* stray */			;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	movl	CPUVAR(ILEVEL),%esi				;\
	cmpl	%ebx,%esi					;\
	jae	10f		/* currently masked; hold it */	;\
	incl	MY_COUNT+V_INTR	/* statistical info */		;\
	addl	$1,IS_EVCNTLO(%ebp)	/* inc event counter */	;\
	adcl	$0,IS_EVCNTHI(%ebp)				;\
1:								\
	pushl	%esi						;\
	movl	%ebx,CPUVAR(ILEVEL)				;\
	STI(%eax)						;\
	incl	CPUVAR(IDEPTH)					;\
	movl	IS_HANDLERS(%ebp),%ebx				;\
	LOCK_KERNEL						;\
6:								\
	movl	IH_LEVEL(%ebx),%edi				;\
	cmpl	%esi,%edi					;\
	jle	7f						;\
	pushl	%esp						;\
	pushl	IH_ARG(%ebx)					;\
	movl	%edi,CPUVAR(ILEVEL)				;\
	call	*IH_FUN(%ebx)	/* call it */			;\
	addl	$8,%esp		/* toss the arg */		;\
	movl	IH_NEXT(%ebx),%ebx	/* next handler in chain */	;\
	testl	%ebx,%ebx					;\
	jnz	6b						;\
5:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	unmask(num)		/* unmask it in hardware */	;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
7:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
10:								\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
6: ;								\
	late_ack(num)						;\
	STIC(%eax)						;\
	jz	4f						;\
	call	_C_LABEL(stipending)				;\
	testl	%eax,%eax					;\
	jnz	1b						;\
4:	INTRFASTEXIT						;\
9:								\
	unmask(num)						;\
	jmp	6b

/*
 * Clear the event-channel mask bit for <num> in the shared-info page,
 * translating the IRQ number through irq_to_evtchn[] first.
 */
#define hypervisor_asm_unmask(num)			\
	movl	irq_to_evtchn + (num) * 4,%ecx		;\
	movl	HYPERVISOR_shared_info,%eax		;\
	lock						;\
	btrl	%ecx,EVENTS_MASK(%eax)

XENINTRSTUB(xenev,0,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,1,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,2,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,3,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,4,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,5,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,6,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,7,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,8,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,9,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,10,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,11,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,12,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,13,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,14,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,15,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,16,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,17,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,18,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,19,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,20,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,21,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,22,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,23,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,24,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,25,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,26,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,27,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,28,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,29,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,30,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
XENINTRSTUB(xenev,31,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)

/*
 * Dispatch table consumed by the C interrupt code: three words per
 * event channel (intr, recurse, resume entry points), in channel order.
 */
.globl _C_LABEL(xenev_stubs)
_C_LABEL(xenev_stubs):
	.long _C_LABEL(Xintr_xenev0), _C_LABEL(Xrecurse_xenev0)
	.long _C_LABEL(Xresume_xenev0)
	.long _C_LABEL(Xintr_xenev1), _C_LABEL(Xrecurse_xenev1)
	.long _C_LABEL(Xresume_xenev1)
	.long _C_LABEL(Xintr_xenev2), _C_LABEL(Xrecurse_xenev2)
	.long _C_LABEL(Xresume_xenev2)
	.long _C_LABEL(Xintr_xenev3), _C_LABEL(Xrecurse_xenev3)
	.long _C_LABEL(Xresume_xenev3)
	.long _C_LABEL(Xintr_xenev4), _C_LABEL(Xrecurse_xenev4)
	.long _C_LABEL(Xresume_xenev4)
	.long _C_LABEL(Xintr_xenev5), _C_LABEL(Xrecurse_xenev5)
	.long _C_LABEL(Xresume_xenev5)
	.long _C_LABEL(Xintr_xenev6), _C_LABEL(Xrecurse_xenev6)
	.long _C_LABEL(Xresume_xenev6)
	.long _C_LABEL(Xintr_xenev7), _C_LABEL(Xrecurse_xenev7)
	.long _C_LABEL(Xresume_xenev7)
	.long _C_LABEL(Xintr_xenev8), _C_LABEL(Xrecurse_xenev8)
	.long _C_LABEL(Xresume_xenev8)
	.long _C_LABEL(Xintr_xenev9), _C_LABEL(Xrecurse_xenev9)
	.long _C_LABEL(Xresume_xenev9)
	.long _C_LABEL(Xintr_xenev10), _C_LABEL(Xrecurse_xenev10)
	.long _C_LABEL(Xresume_xenev10)
	.long _C_LABEL(Xintr_xenev11), _C_LABEL(Xrecurse_xenev11)
	.long _C_LABEL(Xresume_xenev11)
	.long _C_LABEL(Xintr_xenev12), _C_LABEL(Xrecurse_xenev12)
	.long _C_LABEL(Xresume_xenev12)
	.long _C_LABEL(Xintr_xenev13), _C_LABEL(Xrecurse_xenev13)
	.long _C_LABEL(Xresume_xenev13)
	.long _C_LABEL(Xintr_xenev14), _C_LABEL(Xrecurse_xenev14)
	.long _C_LABEL(Xresume_xenev14)
	.long _C_LABEL(Xintr_xenev15), _C_LABEL(Xrecurse_xenev15)
	.long _C_LABEL(Xresume_xenev15)
	.long _C_LABEL(Xintr_xenev16), _C_LABEL(Xrecurse_xenev16)
	.long _C_LABEL(Xresume_xenev16)
	.long _C_LABEL(Xintr_xenev17), _C_LABEL(Xrecurse_xenev17)
	.long _C_LABEL(Xresume_xenev17)
	.long _C_LABEL(Xintr_xenev18), _C_LABEL(Xrecurse_xenev18)
	.long _C_LABEL(Xresume_xenev18)
	.long _C_LABEL(Xintr_xenev19), _C_LABEL(Xrecurse_xenev19)
	.long _C_LABEL(Xresume_xenev19)
	.long _C_LABEL(Xintr_xenev20), _C_LABEL(Xrecurse_xenev20)
	.long _C_LABEL(Xresume_xenev20)
	.long _C_LABEL(Xintr_xenev21), _C_LABEL(Xrecurse_xenev21)
	.long _C_LABEL(Xresume_xenev21)
	.long _C_LABEL(Xintr_xenev22), _C_LABEL(Xrecurse_xenev22)
	.long _C_LABEL(Xresume_xenev22)
	.long _C_LABEL(Xintr_xenev23), _C_LABEL(Xrecurse_xenev23)
	.long _C_LABEL(Xresume_xenev23)
	.long _C_LABEL(Xintr_xenev24), _C_LABEL(Xrecurse_xenev24)
	.long _C_LABEL(Xresume_xenev24)
	.long _C_LABEL(Xintr_xenev25), _C_LABEL(Xrecurse_xenev25)
	.long _C_LABEL(Xresume_xenev25)
	.long _C_LABEL(Xintr_xenev26), _C_LABEL(Xrecurse_xenev26)
	.long _C_LABEL(Xresume_xenev26)
	.long _C_LABEL(Xintr_xenev27), _C_LABEL(Xrecurse_xenev27)
	.long _C_LABEL(Xresume_xenev27)
	.long _C_LABEL(Xintr_xenev28), _C_LABEL(Xrecurse_xenev28)
	.long _C_LABEL(Xresume_xenev28)
	.long _C_LABEL(Xintr_xenev29), _C_LABEL(Xrecurse_xenev29)
	.long _C_LABEL(Xresume_xenev29)
	.long _C_LABEL(Xintr_xenev30), _C_LABEL(Xrecurse_xenev30)
	.long _C_LABEL(Xresume_xenev30)
	.long _C_LABEL(Xintr_xenev31), _C_LABEL(Xrecurse_xenev31)
	.long _C_LABEL(Xresume_xenev31)

#ifndef XEN
/*
 * This macro defines the generic stub code. Its arguments modify it
 * for specific PICs (i8259, I/O APIC): the *_ack/mask/unmask/level_mask
 * slots are macros expanded at the marked points.
 */

#define	INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
IDTVEC(recurse_/**/name/**/num)					;\
	pushfl							;\
	pushl	%cs						;\
	pushl	%esi						;\
	subl	$4,%esp						;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
IDTVEC(resume_/**/name/**/num)					\
	movl	$IREENT_MAGIC,TF_ERR(%esp)			;\
	movl	%ebx,%esi					;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	jmp	1f						;\
IDTVEC(intr_/**/name/**/num)					;\
	pushl	$0		/* dummy error code */		;\
	pushl	$T_ASTFLT	/* trap # for doing ASTs */	;\
	INTRENTRY						;\
	movl	CPUVAR(ISOURCES) + (num) * 4, %ebp		;\
	mask(num)		/* mask it in hardware */	;\
	early_ack(num)		/* and allow other intrs */	;\
	testl	%ebp,%ebp					;\
	jz	9f		/* stray */			;\
	movl	IS_MAXLEVEL(%ebp),%ebx				;\
	movl	CPUVAR(ILEVEL),%esi				;\
	cmpl	%ebx,%esi					;\
	jae	10f		/* currently masked; hold it */	;\
	incl	MY_COUNT+V_INTR	/* statistical info */		;\
	addl	$1,IS_EVCNTLO(%ebp)	/* inc event counter */	;\
	adcl	$0,IS_EVCNTHI(%ebp)				;\
1:								\
	pushl	%esi						;\
	movl	%ebx,CPUVAR(ILEVEL)				;\
	STI(%eax)						;\
	incl	CPUVAR(IDEPTH)					;\
	movl	IS_HANDLERS(%ebp),%ebx				;\
	LOCK_KERNEL						;\
6:								\
	movl	IH_LEVEL(%ebx),%edi				;\
	cmpl	%esi,%edi					;\
	jle	7f						;\
	pushl	IH_ARG(%ebx)					;\
	movl	%edi,CPUVAR(ILEVEL)				;\
	call	*IH_FUN(%ebx)	/* call it */			;\
	addl	$4,%esp		/* toss the arg */		;\
	movl	IH_NEXT(%ebx),%ebx	/* next handler in chain */	;\
	testl	%ebx,%ebx					;\
	jnz	6b						;\
5:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	unmask(num)		/* unmask it in hardware */	;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
7:								\
	UNLOCK_KERNEL						;\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
	late_ack(num)						;\
	STI(%eax)						;\
	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
10:								\
	CLI(%eax)						;\
	orl	$(1 << num),CPUVAR(IPENDING)			;\
	level_mask(num)						;\
	late_ack(num)						;\
	STIC(%eax)						;\
	jz	4f						;\
	call	_C_LABEL(stipending)				;\
	testl	%eax,%eax					;\
	jnz	1b						;\
4:	INTRFASTEXIT						;\
9:								\
	unmask(num)						;\
	late_ack(num)						;\
	STIC(%eax)						;\
	jz	4f						;\
	call	_C_LABEL(stipending)				;\
	testl	%eax,%eax					;\
	jnz	1b						;\
4:	INTRFASTEXIT

/* ICUADDR selects which i8259 the i8259_asm_* helper macros poke. */
#define ICUADDR IO_ICU1

INTRSTUB(legacy,0,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,1,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,2,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,3,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,4,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,5,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,6,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,7,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
#undef ICUADDR
#define ICUADDR IO_ICU2

INTRSTUB(legacy,8,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,9,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,10,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,11,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,12,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,13,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,14,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
INTRSTUB(legacy,15,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
    voidop)
#endif

#if NIOAPIC > 0

/* Edge-triggered I/O APIC pins: late ack only, no mask/unmask. */
INTRSTUB(ioapic_edge,0,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,1,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,2,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,3,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,4,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,5,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,6,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,7,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,8,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,9,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,10,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,11,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,12,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,13,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,14,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,15,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,16,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,17,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,18,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,19,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,20,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,21,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,22,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,23,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,24,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,25,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,26,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,27,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,28,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,29,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,30,voidop,ioapic_asm_ack,voidop,voidop,voidop)
INTRSTUB(ioapic_edge,31,voidop,ioapic_asm_ack,voidop,voidop,voidop)

/* Level-triggered I/O APIC pins: mask while held pending, unmask on exit. */
INTRSTUB(ioapic_level,0,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,1,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,2,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,3,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,4,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,5,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,6,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,7,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,8,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,9,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,10,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,11,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,12,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,13,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,14,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,15,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,16,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,17,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,18,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,19,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,20,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,21,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,22,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,23,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,24,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,25,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,26,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,27,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,28,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,29,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,30,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
INTRSTUB(ioapic_level,31,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)

#endif

#ifndef XEN
/*
 * i8259 dispatch table: three words per legacy IRQ
 * (intr, recurse, resume entry points).
 */
.globl _C_LABEL(i8259_stubs)
_C_LABEL(i8259_stubs):
	.long _C_LABEL(Xintr_legacy0), _C_LABEL(Xrecurse_legacy0)
	.long _C_LABEL(Xresume_legacy0)
	.long _C_LABEL(Xintr_legacy1), _C_LABEL(Xrecurse_legacy1)
	.long _C_LABEL(Xresume_legacy1)
	.long _C_LABEL(Xintr_legacy2), _C_LABEL(Xrecurse_legacy2)
	.long _C_LABEL(Xresume_legacy2)
	.long _C_LABEL(Xintr_legacy3), _C_LABEL(Xrecurse_legacy3)
	.long _C_LABEL(Xresume_legacy3)
	.long _C_LABEL(Xintr_legacy4), _C_LABEL(Xrecurse_legacy4)
	.long _C_LABEL(Xresume_legacy4)
	.long _C_LABEL(Xintr_legacy5), _C_LABEL(Xrecurse_legacy5)
	.long _C_LABEL(Xresume_legacy5)
	.long _C_LABEL(Xintr_legacy6), _C_LABEL(Xrecurse_legacy6)
	.long _C_LABEL(Xresume_legacy6)
	.long _C_LABEL(Xintr_legacy7), _C_LABEL(Xrecurse_legacy7)
	.long _C_LABEL(Xresume_legacy7)
	.long _C_LABEL(Xintr_legacy8), _C_LABEL(Xrecurse_legacy8)
	.long _C_LABEL(Xresume_legacy8)
	.long _C_LABEL(Xintr_legacy9), _C_LABEL(Xrecurse_legacy9)
	.long _C_LABEL(Xresume_legacy9)
	.long _C_LABEL(Xintr_legacy10), _C_LABEL(Xrecurse_legacy10)
	.long _C_LABEL(Xresume_legacy10)
	.long _C_LABEL(Xintr_legacy11), _C_LABEL(Xrecurse_legacy11)
	.long _C_LABEL(Xresume_legacy11)
	.long _C_LABEL(Xintr_legacy12), _C_LABEL(Xrecurse_legacy12)
	.long _C_LABEL(Xresume_legacy12)
	.long _C_LABEL(Xintr_legacy13), _C_LABEL(Xrecurse_legacy13)
	.long _C_LABEL(Xresume_legacy13)
	.long _C_LABEL(Xintr_legacy14), _C_LABEL(Xrecurse_legacy14)
	.long _C_LABEL(Xresume_legacy14)
	.long _C_LABEL(Xintr_legacy15), _C_LABEL(Xrecurse_legacy15)
	.long _C_LABEL(Xresume_legacy15)
#endif

#if NIOAPIC > 0
/* I/O APIC edge-triggered dispatch table, same layout as above. */
.globl _C_LABEL(ioapic_edge_stubs)
_C_LABEL(ioapic_edge_stubs):
	.long _C_LABEL(Xintr_ioapic_edge0), _C_LABEL(Xrecurse_ioapic_edge0)
	.long _C_LABEL(Xresume_ioapic_edge0)
	.long _C_LABEL(Xintr_ioapic_edge1), _C_LABEL(Xrecurse_ioapic_edge1)
	.long _C_LABEL(Xresume_ioapic_edge1)
	.long _C_LABEL(Xintr_ioapic_edge2), _C_LABEL(Xrecurse_ioapic_edge2)
	.long _C_LABEL(Xresume_ioapic_edge2)
	.long _C_LABEL(Xintr_ioapic_edge3), _C_LABEL(Xrecurse_ioapic_edge3)
	.long _C_LABEL(Xresume_ioapic_edge3)
	.long _C_LABEL(Xintr_ioapic_edge4), _C_LABEL(Xrecurse_ioapic_edge4)
	.long _C_LABEL(Xresume_ioapic_edge4)
	.long _C_LABEL(Xintr_ioapic_edge5), _C_LABEL(Xrecurse_ioapic_edge5)
	.long _C_LABEL(Xresume_ioapic_edge5)
	.long _C_LABEL(Xintr_ioapic_edge6), _C_LABEL(Xrecurse_ioapic_edge6)
	.long _C_LABEL(Xresume_ioapic_edge6)
	.long _C_LABEL(Xintr_ioapic_edge7), _C_LABEL(Xrecurse_ioapic_edge7)
	.long _C_LABEL(Xresume_ioapic_edge7)
	.long _C_LABEL(Xintr_ioapic_edge8), _C_LABEL(Xrecurse_ioapic_edge8)
	.long _C_LABEL(Xresume_ioapic_edge8)
	.long _C_LABEL(Xintr_ioapic_edge9), _C_LABEL(Xrecurse_ioapic_edge9)
	.long _C_LABEL(Xresume_ioapic_edge9)
	.long _C_LABEL(Xintr_ioapic_edge10), _C_LABEL(Xrecurse_ioapic_edge10)
	.long _C_LABEL(Xresume_ioapic_edge10)
	.long _C_LABEL(Xintr_ioapic_edge11), _C_LABEL(Xrecurse_ioapic_edge11)
	.long _C_LABEL(Xresume_ioapic_edge11)
	.long _C_LABEL(Xintr_ioapic_edge12), _C_LABEL(Xrecurse_ioapic_edge12)
	.long _C_LABEL(Xresume_ioapic_edge12)
	.long _C_LABEL(Xintr_ioapic_edge13), _C_LABEL(Xrecurse_ioapic_edge13)
	.long _C_LABEL(Xresume_ioapic_edge13)
	.long _C_LABEL(Xintr_ioapic_edge14), _C_LABEL(Xrecurse_ioapic_edge14)
	.long _C_LABEL(Xresume_ioapic_edge14)
	.long _C_LABEL(Xintr_ioapic_edge15), _C_LABEL(Xrecurse_ioapic_edge15)
	.long _C_LABEL(Xresume_ioapic_edge15)
	.long _C_LABEL(Xintr_ioapic_edge16), _C_LABEL(Xrecurse_ioapic_edge16)
	.long _C_LABEL(Xresume_ioapic_edge16)
	.long _C_LABEL(Xintr_ioapic_edge17), _C_LABEL(Xrecurse_ioapic_edge17)
	.long _C_LABEL(Xresume_ioapic_edge17)
	.long _C_LABEL(Xintr_ioapic_edge18), _C_LABEL(Xrecurse_ioapic_edge18)
	.long _C_LABEL(Xresume_ioapic_edge18)
	.long _C_LABEL(Xintr_ioapic_edge19), _C_LABEL(Xrecurse_ioapic_edge19)
	.long _C_LABEL(Xresume_ioapic_edge19)
	.long _C_LABEL(Xintr_ioapic_edge20), _C_LABEL(Xrecurse_ioapic_edge20)
	.long _C_LABEL(Xresume_ioapic_edge20)
	.long _C_LABEL(Xintr_ioapic_edge21), _C_LABEL(Xrecurse_ioapic_edge21)
	.long _C_LABEL(Xresume_ioapic_edge21)
	.long _C_LABEL(Xintr_ioapic_edge22), _C_LABEL(Xrecurse_ioapic_edge22)
	.long _C_LABEL(Xresume_ioapic_edge22)
	.long _C_LABEL(Xintr_ioapic_edge23), _C_LABEL(Xrecurse_ioapic_edge23)
	.long _C_LABEL(Xresume_ioapic_edge23)
	.long _C_LABEL(Xintr_ioapic_edge24), _C_LABEL(Xrecurse_ioapic_edge24)
	.long _C_LABEL(Xresume_ioapic_edge24)
	.long _C_LABEL(Xintr_ioapic_edge25), _C_LABEL(Xrecurse_ioapic_edge25)
	.long _C_LABEL(Xresume_ioapic_edge25)
	.long _C_LABEL(Xintr_ioapic_edge26), _C_LABEL(Xrecurse_ioapic_edge26)
	.long _C_LABEL(Xresume_ioapic_edge26)
	.long _C_LABEL(Xintr_ioapic_edge27), _C_LABEL(Xrecurse_ioapic_edge27)
	.long _C_LABEL(Xresume_ioapic_edge27)
	.long _C_LABEL(Xintr_ioapic_edge28), _C_LABEL(Xrecurse_ioapic_edge28)
	.long _C_LABEL(Xresume_ioapic_edge28)
	.long _C_LABEL(Xintr_ioapic_edge29), _C_LABEL(Xrecurse_ioapic_edge29)
	.long _C_LABEL(Xresume_ioapic_edge29)
	.long _C_LABEL(Xintr_ioapic_edge30), _C_LABEL(Xrecurse_ioapic_edge30)
	.long _C_LABEL(Xresume_ioapic_edge30)
	.long _C_LABEL(Xintr_ioapic_edge31), _C_LABEL(Xrecurse_ioapic_edge31)
	.long _C_LABEL(Xresume_ioapic_edge31)

/* I/O APIC level-triggered dispatch table, same layout as above. */
.globl _C_LABEL(ioapic_level_stubs)
_C_LABEL(ioapic_level_stubs):
	.long _C_LABEL(Xintr_ioapic_level0), _C_LABEL(Xrecurse_ioapic_level0)
	.long _C_LABEL(Xresume_ioapic_level0)
	.long _C_LABEL(Xintr_ioapic_level1), _C_LABEL(Xrecurse_ioapic_level1)
	.long _C_LABEL(Xresume_ioapic_level1)
	.long _C_LABEL(Xintr_ioapic_level2), _C_LABEL(Xrecurse_ioapic_level2)
	.long _C_LABEL(Xresume_ioapic_level2)
	.long _C_LABEL(Xintr_ioapic_level3), _C_LABEL(Xrecurse_ioapic_level3)
	.long _C_LABEL(Xresume_ioapic_level3)
	.long _C_LABEL(Xintr_ioapic_level4), _C_LABEL(Xrecurse_ioapic_level4)
	.long _C_LABEL(Xresume_ioapic_level4)
	.long _C_LABEL(Xintr_ioapic_level5), _C_LABEL(Xrecurse_ioapic_level5)
	.long _C_LABEL(Xresume_ioapic_level5)
	.long _C_LABEL(Xintr_ioapic_level6), _C_LABEL(Xrecurse_ioapic_level6)
	.long _C_LABEL(Xresume_ioapic_level6)
	.long _C_LABEL(Xintr_ioapic_level7), _C_LABEL(Xrecurse_ioapic_level7)
	.long _C_LABEL(Xresume_ioapic_level7)
	.long _C_LABEL(Xintr_ioapic_level8), _C_LABEL(Xrecurse_ioapic_level8)
	.long _C_LABEL(Xresume_ioapic_level8)
	.long _C_LABEL(Xintr_ioapic_level9), _C_LABEL(Xrecurse_ioapic_level9)
	.long _C_LABEL(Xresume_ioapic_level9)
	.long _C_LABEL(Xintr_ioapic_level10), _C_LABEL(Xrecurse_ioapic_level10)
	.long _C_LABEL(Xresume_ioapic_level10)
	.long _C_LABEL(Xintr_ioapic_level11), _C_LABEL(Xrecurse_ioapic_level11)
	.long _C_LABEL(Xresume_ioapic_level11)
	.long _C_LABEL(Xintr_ioapic_level12), _C_LABEL(Xrecurse_ioapic_level12)
	.long _C_LABEL(Xresume_ioapic_level12)
	.long _C_LABEL(Xintr_ioapic_level13), _C_LABEL(Xrecurse_ioapic_level13)
	.long _C_LABEL(Xresume_ioapic_level13)
	.long _C_LABEL(Xintr_ioapic_level14), _C_LABEL(Xrecurse_ioapic_level14)
	.long _C_LABEL(Xresume_ioapic_level14)
	.long _C_LABEL(Xintr_ioapic_level15), _C_LABEL(Xrecurse_ioapic_level15)
	.long _C_LABEL(Xresume_ioapic_level15)
	.long _C_LABEL(Xintr_ioapic_level16), _C_LABEL(Xrecurse_ioapic_level16)
	.long _C_LABEL(Xresume_ioapic_level16)
	.long _C_LABEL(Xintr_ioapic_level17), _C_LABEL(Xrecurse_ioapic_level17)
	.long _C_LABEL(Xresume_ioapic_level17)
	.long _C_LABEL(Xintr_ioapic_level18), _C_LABEL(Xrecurse_ioapic_level18)
	.long _C_LABEL(Xresume_ioapic_level18)
	.long _C_LABEL(Xintr_ioapic_level19), _C_LABEL(Xrecurse_ioapic_level19)
	.long _C_LABEL(Xresume_ioapic_level19)
	.long _C_LABEL(Xintr_ioapic_level20), _C_LABEL(Xrecurse_ioapic_level20)
	.long _C_LABEL(Xresume_ioapic_level20)
	.long _C_LABEL(Xintr_ioapic_level21), _C_LABEL(Xrecurse_ioapic_level21)
	.long _C_LABEL(Xresume_ioapic_level21)
	.long _C_LABEL(Xintr_ioapic_level22), _C_LABEL(Xrecurse_ioapic_level22)
	.long _C_LABEL(Xresume_ioapic_level22)
	.long _C_LABEL(Xintr_ioapic_level23), _C_LABEL(Xrecurse_ioapic_level23)
	.long _C_LABEL(Xresume_ioapic_level23)
	.long _C_LABEL(Xintr_ioapic_level24), _C_LABEL(Xrecurse_ioapic_level24)
	.long _C_LABEL(Xresume_ioapic_level24)
	.long _C_LABEL(Xintr_ioapic_level25), _C_LABEL(Xrecurse_ioapic_level25)
	.long _C_LABEL(Xresume_ioapic_level25)
	.long _C_LABEL(Xintr_ioapic_level26), _C_LABEL(Xrecurse_ioapic_level26)
	.long _C_LABEL(Xresume_ioapic_level26)
	.long _C_LABEL(Xintr_ioapic_level27), _C_LABEL(Xrecurse_ioapic_level27)
	.long _C_LABEL(Xresume_ioapic_level27)
	.long _C_LABEL(Xintr_ioapic_level28), _C_LABEL(Xrecurse_ioapic_level28)
	.long _C_LABEL(Xresume_ioapic_level28)
	.long _C_LABEL(Xintr_ioapic_level29), _C_LABEL(Xrecurse_ioapic_level29)
	.long _C_LABEL(Xresume_ioapic_level29)
	.long _C_LABEL(Xintr_ioapic_level30), _C_LABEL(Xrecurse_ioapic_level30)
	.long _C_LABEL(Xresume_ioapic_level30)
	.long _C_LABEL(Xintr_ioapic_level31), _C_LABEL(Xrecurse_ioapic_level31)
	.long _C_LABEL(Xresume_ioapic_level31)
#endif

/*
 * Symbols that vmstat -i wants, even though they're not used.
 */
.globl _C_LABEL(intrnames)
_C_LABEL(intrnames):
.globl _C_LABEL(eintrnames)
_C_LABEL(eintrnames):

.globl _C_LABEL(intrcnt)
_C_LABEL(intrcnt):
.globl _C_LABEL(eintrcnt)
_C_LABEL(eintrcnt):

/*
 * Soft interrupt handlers.  Each raises ILEVEL, bumps its event
 * counter, dispatches via softintr_dispatch(), then returns through
 * the address in %esi (presumably placed there by the spl/Xdoreti
 * code that invoked the stub -- confirm against Xdoreti).
 */

IDTVEC(softserial)
	movl	$IPL_SOFTSERIAL, CPUVAR(ILEVEL)
	incl	CPUVAR(IDEPTH)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintlock)
#endif
	movl	CPUVAR(ISOURCES) + SIR_SERIAL * 4, %edi
	addl	$1,IS_EVCNTLO(%edi)	/* 64-bit event counter bump */
	adcl	$0,IS_EVCNTHI(%edi)
	pushl	$X86_SOFTINTR_SOFTSERIAL
	call	_C_LABEL(softintr_dispatch)
	addl	$4,%esp
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintunlock)
#endif
	decl	CPUVAR(IDEPTH)
	jmp	*%esi

IDTVEC(softnet)
	movl	$IPL_SOFTNET, CPUVAR(ILEVEL)
	incl	CPUVAR(IDEPTH)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintlock)
#endif
	movl	CPUVAR(ISOURCES) + SIR_NET * 4, %edi
	addl	$1,IS_EVCNTLO(%edi)
	adcl	$0,IS_EVCNTHI(%edi)

	xorl	%edi,%edi
	xchgl	_C_LABEL(netisr),%edi	/* atomically grab+clear netisr bits */

	/* XXX Do the legacy netisrs here for now.
	 */
/* Call handler c when bit s is set in the netisr word held in %edi. */
#define DONETISR(s, c) \
	.globl	_C_LABEL(c)	;\
	testl	$(1 << s),%edi	;\
	jz	1f		;\
	call	_C_LABEL(c)	;\
1:
#include <net/netisr_dispatch.h>

	pushl	$X86_SOFTINTR_SOFTNET
	call	_C_LABEL(softintr_dispatch)
	addl	$4,%esp
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintunlock)
#endif
	decl	CPUVAR(IDEPTH)
	jmp	*%esi

IDTVEC(softclock)
	movl	$IPL_SOFTCLOCK, CPUVAR(ILEVEL)
	incl	CPUVAR(IDEPTH)
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintlock)
#endif
	movl	CPUVAR(ISOURCES) + SIR_CLOCK * 4, %edi
	addl	$1,IS_EVCNTLO(%edi)	/* 64-bit event counter bump */
	adcl	$0,IS_EVCNTHI(%edi)

	pushl	$X86_SOFTINTR_SOFTCLOCK
	call	_C_LABEL(softintr_dispatch)
	addl	$4,%esp
#ifdef MULTIPROCESSOR
	call	_C_LABEL(x86_softintunlock)
#endif
	decl	CPUVAR(IDEPTH)
	jmp	*%esi

/*
 * Trap and fault vector routines
 *
 * On exit from the kernel to user mode, we always need to check for ASTs. In
 * addition, we need to do this atomically; otherwise an interrupt may occur
 * which causes an AST, but it won't get processed until the next kernel entry
 * (possibly the next clock tick). Thus, we disable interrupt before checking,
 * and only enable them again on the final `iret' or before calling the AST
 * handler.
 */

/* Build a trap-frame entry: TRAP pushes the trap number (error code
 * already pushed by the CPU); ZTRAP pushes a zero error code first. */
#define	TRAP(a)		pushl $(a) ; jmp _C_LABEL(alltraps)
#define	ZTRAP(a)	pushl $0 ; TRAP(a)

#ifdef IPKDB
#define	BPTTRAP(a)	pushl $0; pushl $(a); jmp _C_LABEL(bpttraps)
#else
#define	BPTTRAP(a)	ZTRAP(a)
#endif


	.text
IDTVEC(trap00)
	ZTRAP(T_DIVIDE)
IDTVEC(trap01)
	BPTTRAP(T_TRCTRAP)
IDTVEC(trap02)
	ZTRAP(T_NMI)
IDTVEC(trap03)
	BPTTRAP(T_BPTFLT)
IDTVEC(trap04)
	ZTRAP(T_OFLOW)
IDTVEC(trap05)
	ZTRAP(T_BOUND)
IDTVEC(trap06)
	ZTRAP(T_PRIVINFLT)
IDTVEC(trap07)
#if NNPX > 0
	/* Device-not-available: hand off to the FPU switch hook. */
	pushl	$0			# dummy error code
	pushl	$T_DNA
	INTRENTRY
#ifdef XENDEBUG_LOW
	pushl	%esp
#endif
	pushl	CPUVAR(SELF)
	call	*_C_LABEL(npxdna_func)
	addl	$4,%esp
#ifdef XENDEBUG_LOW
	addl	$4,%esp
#endif
	testl	%eax,%eax
	jz	calltrap		/* hook declined: treat as trap */
	INTRFASTEXIT
#else
	ZTRAP(T_DNA)
#endif
IDTVEC(trap08)
	TRAP(T_DOUBLEFLT)
IDTVEC(trap09)
	ZTRAP(T_FPOPFLT)
IDTVEC(trap0a)
	TRAP(T_TSSFLT)
IDTVEC(trap0b)
	TRAP(T_SEGNPFLT)
IDTVEC(trap0c)
	TRAP(T_STKFLT)
IDTVEC(trap0d)
	TRAP(T_PROTFLT)
#ifndef XEN
IDTVEC(trap0e)
#ifndef I586_CPU
	TRAP(T_PAGEFLT)
#else
	/*
	 * Pentium "F00F" workaround: a kernel page fault on IDT entry 6
	 * (offset 6*8 from pentium_idt) is converted into a
	 * privileged-instruction fault before calling trap().
	 */
	pushl	$T_PAGEFLT
	INTRENTRY
	testb	$PGEX_U,TF_ERR(%esp)
	jnz	calltrap
	movl	%cr2,%eax
	subl	_C_LABEL(pentium_idt),%eax
	cmpl	$(6*8),%eax
	jne	calltrap
	movb	$T_PRIVINFLT,TF_TRAPNO(%esp)
	jmp	calltrap
#endif
#endif

IDTVEC(intrspurious)
IDTVEC(trap0f)
	/*
	 * The Pentium Pro local APIC may erroneously call this vector for a
	 * default IR7.  Just ignore it.
	 *
	 * (The local APIC does this when CPL is raised while it's on the
	 * way to delivering an interrupt.. presumably enough has been set
	 * up that it's inconvenient to abort delivery completely..)
	 */
	iret

IDTVEC(trap10)
#if NNPX > 0
	/*
	 * Handle like an interrupt so that we can call npxintr to clear the
	 * error. It would be better to handle npx interrupts as traps but
	 * this is difficult for nested interrupts.
	 */
	pushl	$0			# dummy error code
	pushl	$T_ASTFLT
	INTRENTRY
	pushl	CPUVAR(ILEVEL)
	pushl	%esp
	incl	_C_LABEL(uvmexp)+V_TRAP
	call	_C_LABEL(npxintr)
	addl	$8,%esp
	INTRFASTEXIT
#else
	ZTRAP(T_ARITHTRAP)
#endif
IDTVEC(trap11)
	TRAP(T_ALIGNFLT)
/* Vectors 0x12-0x1f share one T_RESERVED entry. */
IDTVEC(trap12)
IDTVEC(trap13)
IDTVEC(trap14)
IDTVEC(trap15)
IDTVEC(trap16)
IDTVEC(trap17)
IDTVEC(trap18)
IDTVEC(trap19)
IDTVEC(trap1a)
IDTVEC(trap1b)
IDTVEC(trap1c)
IDTVEC(trap1d)
IDTVEC(trap1e)
IDTVEC(trap1f)
	/* 18 - 31 reserved for future exp */
	ZTRAP(T_RESERVED)

/* Exception entry-point table handed to the hypervisor/IDT setup code. */
IDTVEC(exceptions)
#ifndef XENDEBUG_LOW
	.long	_C_LABEL(Xtrap00), _C_LABEL(Xtrap01)
	.long	_C_LABEL(Xtrap02), _C_LABEL(Xtrap03)
	.long	_C_LABEL(Xtrap04), _C_LABEL(Xtrap05)
	.long	_C_LABEL(Xtrap06), _C_LABEL(Xtrap07)
	.long	_C_LABEL(Xtrap08), _C_LABEL(Xtrap09)
	.long	_C_LABEL(Xtrap0a), _C_LABEL(Xtrap0b)
	.long	_C_LABEL(Xtrap0c), _C_LABEL(Xtrap0d)
	.long	_C_LABEL(Xtrap0e), _C_LABEL(Xtrap0f)
	.long	_C_LABEL(Xtrap10), _C_LABEL(Xtrap11)
	.long	_C_LABEL(Xtrap12), _C_LABEL(Xtrap13)
	.long	_C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
	.long	_C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
	.long	_C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
	.long	_C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
	.long	_C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
	.long	_C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
#else
	/* Debug build: mostly Linux-style low-level handlers instead. */
	.long	_C_LABEL(divide_error), _C_LABEL(debug)
	.long	_C_LABEL(Xtrap02), _C_LABEL(Xtrap03) #int3)
	.long	_C_LABEL(overflow), _C_LABEL(bounds)
	.long	_C_LABEL(invalid_op), _C_LABEL(device_not_available)
	.long	_C_LABEL(double_fault), _C_LABEL(coprocessor_segment_overrun)
	.long	_C_LABEL(invalid_TSS), _C_LABEL(segment_not_present)
	.long	_C_LABEL(stack_segment)
	#.long	_C_LABEL(general_protection)
	.long	_C_LABEL(Xtrap0d)
	#.long	_C_LABEL(page_fault)
	.long	_C_LABEL(Xtrap0e)
	.long	_C_LABEL(spurious_interrupt_bug)
	.long	_C_LABEL(coprocessor_error), _C_LABEL(alignment_check)
	.long	_C_LABEL(machine_check), _C_LABEL(simd_coprocessor_error)
	.long	_C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
	.long	_C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
	.long	_C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
	.long	_C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
	.long	_C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
	.long	_C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
#endif


/*
 * Double fault via a task gate: walk the previous-task link (two
 * GET_TSS hops) and pass that TSS to trap_tss().
 */
IDTVEC(tss_trap08)
1:
	str	%ax
	GET_TSS
	movzwl	(%eax),%eax
	GET_TSS
	pushl	$T_DOUBLEFLT
	pushl	%eax
	call	_C_LABEL(trap_tss)
	addl	$12,%esp
	iret
	jmp	1b

/*
 * Common trap entry: build the frame, call trap(), then loop on the
 * AST / deferred-pmap-switch checks until the return to user mode is
 * clean.  Under DIAGNOSTIC the saved IPL is compared on exit and a
 * warning printed if a handler forgot to lower spl.
 */
/* LINTSTUB: Ignore */
NENTRY(alltraps)
	INTRENTRY
calltrap:
#ifdef DIAGNOSTIC
	movl	CPUVAR(ILEVEL),%ebx
#endif /* DIAGNOSTIC */
	pushl	%esp
	call	_C_LABEL(trap)
	addl	$4,%esp
	testb	$CHK_UPL,TF_CS(%esp)
	jnz	alltraps_checkast
#ifdef VM86
	testl	$PSL_VM,TF_EFLAGS(%esp)
	jz	6f
#else
	jmp	6f
#endif
alltraps_checkast:
	/* Check for ASTs on exit to user mode. */
	CLI(%eax)
	CHECK_ASTPENDING(%eax)
	jz	3f
5:	CLEAR_ASTPENDING(%eax)
	STI(%eax)
	movl	$T_ASTFLT,TF_TRAPNO(%esp)
	pushl	%esp
	call	_C_LABEL(trap)
	addl	$4,%esp
	jmp	alltraps_checkast	/* re-check ASTs */
3:	CHECK_DEFERRED_SWITCH(%eax)
	jnz	9f
6:	STIC(%eax)
	jz	4f
	call	_C_LABEL(stipending)
	#testl	%eax,%eax	/* XXXcl */
	#jnz	1b
4:
#ifndef DIAGNOSTIC
	INTRFASTEXIT
#else
	cmpl	CPUVAR(ILEVEL),%ebx
	jne	3f
	INTRFASTEXIT
3:	pushl	$4f
	call	_C_LABEL(printf)
	addl	$4,%esp
#ifdef DDB
	int	$3
#endif /* DDB */
	movl	%ebx,CPUVAR(ILEVEL)
	jmp	alltraps_checkast	/* re-check ASTs */
4:	.asciz	"WARNING: SPL NOT LOWERED ON TRAP EXIT\n"
#endif /* DIAGNOSTIC */
9:	STI(%eax)
	call	_C_LABEL(pmap_load)
	jmp	alltraps_checkast	/* re-check ASTs */

/*
 * Xen page-fault entry: the trap number slot initially carries the
 * faulting address (stored there by the hypervisor entry path); it is
 * moved to %eax, replaced with T_PAGEFLT, and pushed as an extra
 * argument ahead of the frame pointer for trap().
 * NOTE(review): continues past the end of this chunk.
 */
/* LINTSTUB: Ignore */
IDTVEC(trap0e)
	INTRENTRY
	movl	TF_TRAPNO(%esp),%eax
	movl	$T_PAGEFLT,TF_TRAPNO(%esp)
#ifdef DIAGNOSTIC
	movl	CPUVAR(ILEVEL),%ebx
#endif /* DIAGNOSTIC */
	#pushl	%esp
	pushl	%eax
	movl	%esp,%eax
	addl	$4,%eax
	pushl	%eax
	call	_C_LABEL(trap)
	addl	$4,%esp
	addl	$4,%esp
	testb	$CHK_UPL,TF_CS(%esp)
	jnz	trap0e_checkast
#ifdef VM86
	testl	$PSL_VM,TF_EFLAGS(%esp)
	jz	6f
#else
	jmp
6f +#endif +trap0e_checkast: + /* Check for ASTs on exit to user mode. */ + CLI(%eax) + CHECK_ASTPENDING(%eax) + jz 3f +5: CLEAR_ASTPENDING(%eax) + STI(%eax) + movl $T_ASTFLT,TF_TRAPNO(%esp) + pushl %esp + call _C_LABEL(trap) + addl $4,%esp + jmp trap0e_checkast /* re-check ASTs */ +3: CHECK_DEFERRED_SWITCH(%eax) + jnz 9f +6: STIC(%eax) + jz 4f + call _C_LABEL(stipending) + #testl %eax,%eax /* XXXcl */ + #jnz 1b +4: +#ifndef DIAGNOSTIC + INTRFASTEXIT +#else + cmpl CPUVAR(ILEVEL),%ebx + jne 3f + INTRFASTEXIT +3: pushl $4f + call _C_LABEL(printf) + addl $4,%esp +#ifdef DDB + int $3 +#endif /* DDB */ + movl %ebx,CPUVAR(ILEVEL) + jmp trap0e_checkast /* re-check ASTs */ +4: .asciz "WARNING: SPL NOT LOWERED ON TRAP EXIT\n" +#endif /* DIAGNOSTIC */ +9: STI(%eax) + call _C_LABEL(pmap_load) + jmp trap0e_checkast /* re-check ASTs */ + +#ifdef IPKDB +/* LINTSTUB: Ignore */ +NENTRY(bpttraps) + INTRENTRY + call _C_LABEL(ipkdb_trap_glue) + testl %eax,%eax + jz calltrap + INTRFASTEXIT + +ipkdbsetup: + popl %ecx + + /* Disable write protection: */ + movl %cr0,%eax + pushl %eax + andl $~CR0_WP,%eax + movl %eax,%cr0 + + /* Substitute Protection & Page Fault handlers: */ + movl _C_LABEL(idt),%edx + pushl 13*8(%edx) + pushl 13*8+4(%edx) + pushl 14*8(%edx) + pushl 14*8+4(%edx) + movl $fault,%eax + movw %ax,13*8(%edx) + movw %ax,14*8(%edx) + shrl $16,%eax + movw %ax,13*8+6(%edx) + movw %ax,14*8+6(%edx) + + pushl %ecx + ret + +ipkdbrestore: + popl %ecx + + /* Restore Protection & Page Fault handlers: */ + movl _C_LABEL(idt),%edx + popl 14*8+4(%edx) + popl 14*8(%edx) + popl 13*8+4(%edx) + popl 13*8(%edx) + + /* Restore write protection: */ + popl %edx + movl %edx,%cr0 + + pushl %ecx + ret +#endif /* IPKDB */ + + +/* + * If an error is detected during trap, syscall, or interrupt exit, trap() will + * change %eip to point to one of these labels. We clean up the stack, if + * necessary, and resume as if we were handling a general protection fault. 
+ * This will cause the process to get a SIGBUS. + */ +/* LINTSTUB: Var: char resume_iret[1]; */ +NENTRY(resume_iret) + ZTRAP(T_PROTFLT) +/* LINTSTUB: Var: char resume_pop_ds[1]; */ +NENTRY(resume_pop_ds) + movl %es,TF_ES(%esp) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%es +/* LINTSTUB: Var: char resume_pop_es[1]; */ +NENTRY(resume_pop_es) + movl %fs,TF_FS(%esp) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%fs +/* LINTSTUB: Var: char resume_pop_fs[1]; */ +NENTRY(resume_pop_fs) + movl %gs,TF_GS(%esp) + movl $GSEL(GDATA_SEL, SEL_KPL),%eax + movw %ax,%gs +/* LINTSTUB: Var: char resume_pop_gs[1]; */ +NENTRY(resume_pop_gs) + movl $T_PROTFLT,TF_TRAPNO(%esp) + jmp calltrap + +#ifdef IPKDB +/* LINTSTUB: Func: int ipkdbfbyte(u_char *c) */ +NENTRY(ipkdbfbyte) + pushl %ebp + movl %esp,%ebp + call ipkdbsetup + movl 8(%ebp),%edx + movzbl (%edx),%eax +faultexit: + call ipkdbrestore + popl %ebp + ret + +/* LINTSTUB: Func: int ipkdbsbyte(u_char *c, int i) */ +NENTRY(ipkdbsbyte) + pushl %ebp + movl %esp,%ebp + call ipkdbsetup + movl 8(%ebp),%edx + movl 12(%ebp),%eax + movb %al,(%edx) + call ipkdbrestore + popl %ebp + ret + +fault: + popl %eax /* error code */ + movl $faultexit,%eax + movl %eax,(%esp) + movl $-1,%eax + iret +#endif /* IPKDB */ + + + +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until weve done all processing. HOWEVER, we must enable events before +# popping the stack frame (cant be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so wed +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. 
+ENTRY(hypervisor_callback) + pushl $0 # dummy error code + pushl $T_ASTFLT + INTRENTRY + movl TF_EIP(%esp),%eax + cmpl $scrit,%eax + jb 11f + cmpl $ecrit,%eax + jb critical_region_fixup +11: push %esp + call do_hypervisor_callback + add $4,%esp + movl HYPERVISOR_shared_info,%esi + xorl %eax,%eax + movb TF_CS(%esp),%cl + test $CHK_UPL,%cl # slow return to ring 2 or 3 + je safesti + movl CPUVAR(ILEVEL),%ebx + jmp doreti_checkast +safesti:XEN_UNBLOCK_EVENTS(%esi) # reenable event callbacks +scrit: /**** START OF CRITICAL REGION ****/ + testb $1,evtchn_upcall_pending(%esi) + jnz 14f # process more events if necessary... + INTRFASTEXIT +critiret: +14: XEN_BLOCK_EVENTS(%esi) + jmp 11b +ecrit: /**** END OF CRITICAL REGION ****/ +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + cmpl $(critiret-1),%eax # eip points to iret? + jne 1f + movl $(TF_PUSHSIZE+0x8),%eax + jmp 2f +1: xorl %eax,%eax +2: + # %eax contains num bytes popped + mov %esp,%esi + add %eax,%esi # %esi points at end of src region + mov %esp,%edi + add $(TF_PUSHSIZE+0x8+0xC),%edi # %edi points at end of dst region + mov %eax,%ecx + shr $2,%ecx # convert words to bytes + je 16f # skip loop if nothing to copy +15: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 15b +16: movl %edi,%esp # final %edi is top of merged stack + jmp 11b + + +# Hypervisor uses this for application faults while it executes. 
+ENTRY(failsafe_callback) + pop %ds + pop %es + pop %fs + pop %gs + call _C_LABEL(xen_failsafe_handler) + iret + +#ifdef XENDEBUG_LOW + +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + pushl %ds; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + movl $GSEL(GDATA_SEL, SEL_KPL),%edx; \ + movl %edx,%ds; \ + movl %edx,%es; + +#define RESTORE_ALL \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax; \ + popl %ds; \ + popl %es; \ + addl $4,%esp; \ + iret; \ + +ret_from_exception: + movb CS(%esp),%cl + test $2,%cl # slow return to ring 2 or 3 + jne safesti + RESTORE_ALL + + +ENTRY(divide_error) + pushl $0 # no error code + pushl $do_divide_error +do_exception: + pushl %ds + pushl %eax + xorl %eax,%eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + movl %es,%ecx + movl ORIG_EAX(%esp), %esi # get the error code + movl ES(%esp), %edi # get the function address + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl %esp,%edx + pushl %esi # push the error code + pushl %edx # push the pt_regs pointer + movl $(__KERNEL_DS),%edx + movl %edx,%ds + movl %edx,%es + call *%edi + addl $8,%esp + jmp ret_from_exception + +ENTRY(coprocessor_error) + pushl $0 + pushl $do_coprocessor_error + jmp do_exception + +ENTRY(simd_coprocessor_error) + pushl $0 + pushl $do_simd_coprocessor_error + jmp do_exception + +ENTRY(device_not_available) + iret + +ENTRY(debug) + pushl $0 + pushl $do_debug + jmp do_exception + +ENTRY(int3) + pushl $0 + pushl $do_int3 + jmp do_exception + +ENTRY(overflow) + pushl $0 + pushl $do_overflow + jmp do_exception + +ENTRY(bounds) + pushl $0 + pushl $do_bounds + jmp do_exception + +ENTRY(invalid_op) + pushl $0 + pushl $do_invalid_op + jmp do_exception + +ENTRY(coprocessor_segment_overrun) + pushl $0 + pushl $do_coprocessor_segment_overrun + 
jmp do_exception + +ENTRY(double_fault) + pushl $do_double_fault + jmp do_exception + +ENTRY(invalid_TSS) + pushl $do_invalid_TSS + jmp do_exception + +ENTRY(segment_not_present) + pushl $do_segment_not_present + jmp do_exception + +ENTRY(stack_segment) + pushl $do_stack_segment + jmp do_exception + +ENTRY(general_protection) + pushl $do_general_protection + jmp do_exception + +ENTRY(alignment_check) + pushl $do_alignment_check + jmp do_exception + +# This handler is special, because it gets an extra value on its stack, +# which is the linear faulting address. +ENTRY(page_fault) + pushl %ds + pushl %eax + xorl %eax,%eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + movl %es,%ecx + movl ORIG_EAX(%esp), %esi # get the error code + movl ES(%esp), %edi # get the faulting address + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl %esp,%edx + pushl %edi # push the faulting address + pushl %esi # push the error code + pushl %edx # push the pt_regs pointer + movl $(__KERNEL_DS),%edx + movl %edx,%ds + movl %edx,%es + call do_page_fault + addl $12,%esp + jmp ret_from_exception + +ENTRY(machine_check) + pushl $0 + pushl $do_machine_check + jmp do_exception + +ENTRY(spurious_interrupt_bug) + pushl $0 + pushl $do_spurious_interrupt_bug + jmp do_exception +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c new file mode 100644 index 0000000000..d51baba078 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c @@ -0,0 +1,680 @@ +/* $NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $"); + +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mount.h> + +#include <uvm/uvm.h> + +#include <machine/gdt.h> +#include <machine/xenfunc.h> +#include <machine/xenpmap.h> + +/* #define XENDEBUG */ +/* #define XENDEBUG_LOW */ + +#ifdef XENDEBUG +#define XENPRINTF(x) printf x +#define XENPRINTK(x) printk x +#define XENPRINTK2(x) /* printk x */ + +static char XBUF[256]; +#else +#define XENPRINTF(x) +#define XENPRINTK(x) +#define XENPRINTK2(x) +#endif +void printk(char *, ...); +#define PRINTF(x) printf x +#define PRINTK(x) printk x + +shared_info_t *HYPERVISOR_shared_info; +union start_info_union start_info_union; + +void xen_failsafe_handler(void); + +void +xen_failsafe_handler(void) +{ + + panic("xen_failsafe_handler called!\n"); +} + + +void +xen_update_descriptor(union descriptor *table, union descriptor *entry) +{ + paddr_t pa; + pt_entry_t *ptp; + + ptp = kvtopte((vaddr_t)table); + pa = (*ptp & PG_FRAME) | ((vaddr_t)table & ~PG_FRAME); + if (HYPERVISOR_update_descriptor(pa, entry->raw[0], entry->raw[1])) + panic("HYPERVISOR_update_descriptor failed\n"); +} + +void +xen_set_ldt(vaddr_t base, uint32_t entries) +{ + vaddr_t va; + pt_entry_t *ptp, *maptp; + + for (va = base; va < base + entries * sizeof(union descriptor); + va += PAGE_SIZE) { + KASSERT(va >= VM_MIN_KERNEL_ADDRESS); + ptp = kvtopte(va); + maptp = (pt_entry_t *)vtomach((vaddr_t)ptp); + XENPRINTF(("xen_set_ldt %p %d %p %p\n", (void *)base, + entries, ptp, maptp)); + PTE_CLEARBITS(ptp, maptp, PG_RW); + } + PTE_UPDATES_FLUSH(); + + xpq_queue_set_ldt(base, entries); + xpq_flush_queue(); +} + +void +lgdt(struct region_descriptor *rdp) +{ + + panic("lgdt %p %08x\n", (void *)rdp->rd_base, rdp->rd_limit); +} + +void +xen_parse_cmdline(int what, union xen_cmdline_parseinfo *xcp) +{ + char *cmd_line, *opt, *s; + int b, i, ipidx = 0; + uint32_t xi_ip[5]; + + 
cmd_line = xen_start_info.cmd_line; + + switch (what) { + case XEN_PARSE_BOOTDEV: + xcp->xcp_bootdev[0] = 0; + break; + case XEN_PARSE_CONSOLE: + xcp->xcp_console[0] = 0; + break; + } + + while (cmd_line && *cmd_line) { + opt = cmd_line; + cmd_line = strchr(opt, ' '); + if (cmd_line) + *cmd_line = 0; + + switch (what) { + case XEN_PARSE_BOOTDEV: + if (strncasecmp(opt, "bootdev=", 8) == 0) + strncpy(xcp->xcp_bootdev, opt + 8, + sizeof(xcp->xcp_console)); + break; + + case XEN_PARSE_NETINFO: + if (xcp->xcp_netinfo.xi_root && + strncasecmp(opt, "nfsroot=", 8) == 0) + strncpy(xcp->xcp_netinfo.xi_root, opt + 8, + MNAMELEN); + + if (strncasecmp(opt, "ip=", 3) == 0) { + memset(xi_ip, 0, sizeof(xi_ip)); + opt += 3; + ipidx = 0; + while (opt && *opt) { + s = opt; + opt = strchr(opt, ':'); + if (opt) + *opt = 0; + + switch (ipidx) { + case 0: /* ip */ + case 1: /* nfs server */ + case 2: /* gw */ + case 3: /* mask */ + case 4: /* host */ + if (*s == 0) + break; + for (i = 0; i < 4; i++) { + b = strtoul(s, &s, 10); + xi_ip[ipidx] = b + 256 + * xi_ip[ipidx]; + if (*s != '.') + break; + s++; + } + if (i < 3) + xi_ip[ipidx] = 0; + break; + case 5: /* interface */ + if (!strncmp(s, "xennet", 6)) + s += 6; + else if (!strncmp(s, "eth", 3)) + s += 3; + else + break; + if (xcp->xcp_netinfo.xi_ifno + == strtoul(s, NULL, 10)) + memcpy(xcp-> + xcp_netinfo.xi_ip, + xi_ip, + sizeof(xi_ip)); + break; + } + ipidx++; + + if (opt) + *opt++ = ':'; + } + } + break; + + case XEN_PARSE_CONSOLE: + if (strncasecmp(opt, "console=", 8) == 0) + strncpy(xcp->xcp_console, opt + 8, + sizeof(xcp->xcp_console)); + break; + + } + + if (cmd_line) + *cmd_line++ = ' '; + } +} + + + + + +#define XEN_PAGE_OFFSET 0xC0100000 + +static pd_entry_t +xpmap_get_bootpde(paddr_t va) +{ + + return ((pd_entry_t *)xen_start_info.pt_base)[va >> PDSHIFT]; +} + +static pd_entry_t +xpmap_get_vbootpde(paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_bootpde(va); + if ((pde & PG_V) == 0) + return (pde & ~PG_FRAME); + return 
(pde & ~PG_FRAME) | + (xpmap_mtop(pde & PG_FRAME) + KERNBASE); +} + +static pt_entry_t * +xpmap_get_bootptep(paddr_t va) +{ + pd_entry_t pde; + + pde = xpmap_get_vbootpde(va); + if ((pde & PG_V) == 0) + return (void *)-1; + return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); +} + +static pt_entry_t +xpmap_get_bootpte(paddr_t va) +{ + + return xpmap_get_bootptep(va)[0]; +} + +#if defined(XENDEBUG) +static void +xpmap_dump_pt(pt_entry_t *ptp, int p) +{ + pt_entry_t pte; + int j; + int bufpos; + + pte = xpmap_ptom((uint32_t)ptp - KERNBASE); + PRINTK(("%03x: %p(%p) %08x\n", p, ptp, (void *)pte, p << PDSHIFT)); + + bufpos = 0; + for (j = 0; j < PTES_PER_PTP; j++) { + if ((ptp[j] & PG_V) == 0) + continue; + pte = ptp[j] /* & PG_FRAME */; + bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ", + p, j, pte); + if (bufpos > 70) { + int k; + sprintf(XBUF + bufpos, "\n"); + PRINTK((XBUF)); + bufpos = 0; + for (k = 0; k < 1000000; k++); + } + } + if (bufpos) { + PRINTK((XBUF)); + PRINTK(("\n")); + bufpos = 0; + } +} +#endif + +void +xpmap_init(void) +{ + pd_entry_t *xen_pdp; + pt_entry_t *ptp, *sysptp; + pt_entry_t pte; + uint32_t i, j; + int bufpos; +#if defined(XENDEBUG_LOW) + extern char kernel_text, _etext, __bss_start, end, *esym; +#endif + + xpmap_phys_to_machine_mapping = (void *)xen_start_info.mfn_list; + + xen_pdp = (pd_entry_t *)xen_start_info.pt_base; + + XENPRINTK(("text %p data %p bss %p end %p esym %p\n", &kernel_text, + &_etext, &__bss_start, &end, esym)); + XENPRINTK(("xpmap_init PTD %p nkpde %d upages %d xen_PTD %p p2m-map %p\n", + (void *)PTDpaddr, nkpde, UPAGES, xen_pdp, + xpmap_phys_to_machine_mapping)); + + bufpos = 0; + + XENPRINTK(("shared_inf %08x\n", (paddr_t)xen_start_info.shared_info)); + XENPRINTK(("c0100000: %08x\n", + xpmap_get_bootpte(0xc0100000))); + + /* Map kernel. */ + + /* Map kernel data/bss/tables. */ + + /* Map ISA I/O memory. */ + + /* Map kernel PDEs. 
*/ + + /* Install a PDE recursively mapping page directory as a page table! */ + + sysptp = (pt_entry_t *)(PTDpaddr + ((1 + UPAGES) << PAGE_SHIFT)); + + /* make xen's PDE and PTE pages read-only in our pagetable */ + for (i = 0; i < xen_start_info.nr_pt_frames; i++) { + /* mark PTE page read-only in our table */ + sysptp[((xen_start_info.pt_base + + (i << PAGE_SHIFT) - KERNBASE_LOCORE) & + (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW; + } + + xpq_flush_queue(); + + for (i = 0; i < 1 + UPAGES + nkpde; i++) { + /* mark PTE page read-only in xen's table */ + ptp = xpmap_get_bootptep(PTDpaddr + (i << PAGE_SHIFT)); + xpq_queue_pte_update( + (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp & ~PG_RW); + XENPRINTK(("%03x: %p(%p) -> %08x\n", i, ptp, + (unsigned long)ptp - KERNTEXTOFF, *ptp)); + + /* mark PTE page read-only in our table */ + sysptp[((PTDpaddr + (i << PAGE_SHIFT) - KERNBASE_LOCORE) & + (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW; + + /* update our pte's */ + ptp = (pt_entry_t *)(PTDpaddr + (i << PAGE_SHIFT)); +#if 0 + pte = xpmap_ptom((uint32_t)ptp - KERNBASE); + XENPRINTK(("%03x: %p(%p) %08x\n", i, ptp, pte, i << PDSHIFT)); +#endif + for (j = 0; j < PTES_PER_PTP; j++) { + if ((ptp[j] & PG_V) == 0) + continue; + if (ptp[j] == 0xffffffff) + ptp[j] = xen_start_info.shared_info | + (PG_V|PG_RW); + if (ptp[j] >= KERNTEXTOFF) { + pte = ptp[j]; + ptp[j] = (pte & ~PG_FRAME) | + (xpmap_get_bootpte(pte & PG_FRAME) & + PG_FRAME); + } +#if defined(XENDEBUG) && 0 + pte = ptp[j] /* & PG_FRAME */; + bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ", + i, j, pte); + if (bufpos > 70) { + int k; + sprintf(XBUF + bufpos, "\n"); + XENPRINTK((XBUF)); + bufpos = 0; + for (k = 0; k < 1000000; k++); + } + } + if (bufpos) { + XENPRINTK((XBUF)); + bufpos = 0; +#endif + } + if (i == 0) + i = 1 + UPAGES - 1; + } + +#if 0 + for (i = 0x300; i < 0x305; i++) + if (((pt_entry_t *)xen_start_info.pt_base)[i] & PG_V) + xpmap_dump_pt((pt_entry_t *) + (xpmap_mtop(((pt_entry_t 
*)xen_start_info.pt_base)[i] & + PG_FRAME) + KERNBASE), i); + xpmap_dump_pt((pt_entry_t *)xen_start_info.pt_base, 0); +#endif + + XENPRINTK(("switching pdp: %p, %08lx, %p, %p, %p\n", (void *)PTDpaddr, + PTDpaddr - KERNBASE, + (void *)xpmap_ptom(PTDpaddr - KERNBASE), + (void *)xpmap_get_bootpte(PTDpaddr), + (void *)xpmap_mtop(xpmap_ptom(PTDpaddr - KERNBASE)))); + +#if defined(XENDEBUG) + xpmap_dump_pt((pt_entry_t *)PTDpaddr, 0); +#endif + + xpq_flush_queue(); + + xpq_queue_pin_table(xpmap_get_bootpte(PTDpaddr) & PG_FRAME, + XPQ_PIN_L2_TABLE); + xpq_queue_pt_switch(xpmap_get_bootpte(PTDpaddr) & PG_FRAME); + xpq_queue_unpin_table( + xpmap_get_bootpte(xen_start_info.pt_base) & PG_FRAME); + + /* make xen's PDE and PTE pages writable in our pagetable */ + for (i = 0; i < xen_start_info.nr_pt_frames; i++) { + /* mark PTE page writable in our table */ + ptp = &sysptp[((xen_start_info.pt_base + + (i << PAGE_SHIFT) - KERNBASE_LOCORE) & + (PD_MASK | PT_MASK)) >> PAGE_SHIFT]; + xpq_queue_pte_update( + (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp | + PG_RW); + } + + xpq_flush_queue(); + XENPRINTK(("pt_switch done!\n")); +} + +/* + * Do a binary search to find out where physical memory ends on the + * real hardware. Xen will fail our updates if they are beyond the + * last available page (max_page in xen/common/memory.c). 
+ */ +paddr_t +find_pmap_mem_end(vaddr_t va) +{ + mmu_update_t r; + int start, end, ok; + pt_entry_t old; + + start = xen_start_info.nr_pages; + end = HYPERVISOR_VIRT_START >> PAGE_SHIFT; + + r.ptr = (unsigned long)&PTE_BASE[x86_btop(va)]; + old = PTE_BASE[x86_btop(va)]; + + while (start + 1 < end) { + r.val = (((start + end) / 2) << PAGE_SHIFT) | PG_V; + + if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0) + end = (start + end) / 2; + else + start = (start + end) / 2; + } + r.val = old; + if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0) + printf("pmap_mem_end find: old update failed %08x\n", + old); + + return end << PAGE_SHIFT; +} + + +#if 0 +void xpmap_find_memory(paddr_t); +void +xpmap_find_memory(paddr_t first_avail) +{ + char buf[256]; + uint32_t i; + int bufpos; + paddr_t p; + + bufpos = 0; + for (i = ((first_avail - KERNTEXTOFF) >> PAGE_SHIFT); + i < xen_start_info.nr_pages; i++) { + /* if (xpmap_phys_to_machine_mapping[i] */ + bufpos += sprintf(buf + bufpos, "%03x:%08x:%08x ", + i, (uint32_t)xpmap_phys_to_machine_mapping[i], + (uint32_t)xpmap_mtop(xpmap_phys_to_machine_mapping[i] << + PAGE_SHIFT)); + p = xpmap_phys_to_machine_mapping[i]; + uvm_page_physload(p, p + 1, p, p + 1, VM_FREELIST_DEFAULT); + + if (bufpos > 70) { + int k; + sprintf(buf + bufpos, "\n"); + XENPRINTK((buf)); + bufpos = 0; + for (k = 0; k < 1000000; k++); + } + } + if (bufpos) { + XENPRINTK((buf)); + bufpos = 0; + } +} +#endif + + +#ifdef XENDEBUG +void xpq_debug_dump(void); +#endif + +#define XPQUEUE_SIZE 2048 +typedef union xpq_queue { + struct { + pd_entry_t *ptr; + pd_entry_t val; + } pde; + struct { + pt_entry_t *ptr; + pt_entry_t val; + } pte; + struct { + paddr_t ptr; + uint32_t val; + } pa; +} xpq_queue_t; +static xpq_queue_t xpq_queue[XPQUEUE_SIZE]; +static int xpq_idx = 0; + +void +xpq_flush_queue() +{ + int i, ok; + + XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx)); + for (i = 0; i < xpq_idx; i++) + XENPRINTK2(("%d: %p %08x\n", i, xpq_queue[i].pde.ptr, + 
xpq_queue[i].pde.val)); + if (xpq_idx != 0 && + HYPERVISOR_mmu_update((mmu_update_t *)xpq_queue, xpq_idx, &ok) < 0) + panic("HYPERVISOR_mmu_update failed\n"); + xpq_idx = 0; +} + +static inline void +xpq_increment_idx(void) +{ + + xpq_idx++; + if (__predict_false(xpq_idx == XPQUEUE_SIZE)) + xpq_flush_queue(); +} + +void +xpq_queue_invlpg(vaddr_t va) +{ + + XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va)); + xpq_queue[xpq_idx].pa.ptr = (va & PG_FRAME) | MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = MMUEXT_INVLPG; + xpq_increment_idx(); +} + +void +xpq_queue_pde_update(pd_entry_t *ptr, pd_entry_t val) +{ + + xpq_queue[xpq_idx].pde.ptr = ptr; + xpq_queue[xpq_idx].pde.val = val; + xpq_increment_idx(); +} + +void +xpq_queue_pte_update(pt_entry_t *ptr, pt_entry_t val) +{ + + xpq_queue[xpq_idx].pte.ptr = ptr; + xpq_queue[xpq_idx].pte.val = val; + xpq_increment_idx(); +} + +void +xpq_queue_unchecked_pte_update(pt_entry_t *ptr, pt_entry_t val) +{ + + xpq_queue[xpq_idx].pa.ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE; + /* XXXcl UNCHECKED_PT_UPDATE */ + xpq_queue[xpq_idx].pa.val = val; + xpq_increment_idx(); +} + +void +xpq_queue_pt_switch(paddr_t pa) +{ + + XENPRINTK2(("xpq_queue_pt_switch: %p %p\n", (void *)pa, (void *)pa)); + xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = MMUEXT_NEW_BASEPTR; + xpq_increment_idx(); +} + +void +xpq_queue_pin_table(paddr_t pa, int type) +{ + + XENPRINTK2(("xpq_queue_pin_table: %p %p\n", (void *)pa, (void *)pa)); + xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND; + switch (type) { + case XPQ_PIN_L1_TABLE: + xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L1_TABLE; + break; + case XPQ_PIN_L2_TABLE: + xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L2_TABLE; + break; + } + xpq_increment_idx(); +} + +void +xpq_queue_unpin_table(paddr_t pa) +{ + + XENPRINTK2(("xpq_queue_unpin_table: %p %p\n", (void *)pa, (void *)pa)); + xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = 
MMUEXT_UNPIN_TABLE; + xpq_increment_idx(); +} + +void +xpq_queue_set_ldt(vaddr_t va, uint32_t entries) +{ + + XENPRINTK2(("xpq_queue_set_ldt\n")); + KASSERT(va == (va & PG_FRAME)); + xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND | va; + xpq_queue[xpq_idx].pa.val = MMUEXT_SET_LDT | + (entries << MMUEXT_CMD_SHIFT); + xpq_increment_idx(); +} + +void +xpq_queue_tlb_flush() +{ + + XENPRINTK2(("xpq_queue_tlb_flush\n")); + xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND; + xpq_queue[xpq_idx].pa.val = MMUEXT_TLB_FLUSH; + xpq_increment_idx(); +} + +#ifdef XENDEBUG +void +xpq_debug_dump() +{ + int i; + + XENPRINTK2(("idx: %d\n", xpq_idx)); + for (i = 0; i < xpq_idx; i++) { + sprintf(XBUF, "%p %08x ", xpq_queue[i].pte.ptr, + xpq_queue[i].pte.val); + if (++i < xpq_idx) + sprintf(XBUF + strlen(XBUF), "%p %08x ", + xpq_queue[i].pte.ptr, xpq_queue[i].pte.val); + if (++i < xpq_idx) + sprintf(XBUF + strlen(XBUF), "%p %08x ", + xpq_queue[i].pte.ptr, xpq_queue[i].pte.val); + if (++i < xpq_idx) + sprintf(XBUF + strlen(XBUF), "%p %08x ", + xpq_queue[i].pte.ptr, xpq_queue[i].pte.val); + XENPRINTK2(("%d: %s\n", xpq_idx, XBUF)); + } +} +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h new file mode 100644 index 0000000000..cad97f21e1 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h @@ -0,0 +1,130 @@ +/* $NetBSD: frameasm.h,v 1.1 2004/03/11 21:44:08 cl Exp $ */ +/* NetBSD: frameasm.h,v 1.4 2004/02/20 17:35:01 yamt Exp */ + +#ifndef _I386_FRAMEASM_H_ +#define _I386_FRAMEASM_H_ + +#ifdef _KERNEL_OPT +#include "opt_multiprocessor.h" +#endif + +/* XXX assym.h */ +#define TRAP_INSTR int $0x82 +#define __HYPERVISOR_stack_switch 4 +#define __HYPERVISOR_fpu_taskswitch 7 + +#ifndef TRAPLOG +#define TLOG /**/ +#else +/* + * Fill in trap record + */ +#define TLOG \ +9: \ + movl %fs:CPU_TLOG_OFFSET, %eax; \ + movl %fs:CPU_TLOG_BASE, %ebx; \ + addl $SIZEOF_TREC,%eax; \ + andl 
$SIZEOF_TLOG-1,%eax; \ + addl %eax,%ebx; \ + movl %eax,%fs:CPU_TLOG_OFFSET; \ + movl %esp,TREC_SP(%ebx); \ + movl $9b,TREC_HPC(%ebx); \ + movl TF_EIP(%esp),%eax; \ + movl %eax,TREC_IPC(%ebx); \ + rdtsc ; \ + movl %eax,TREC_TSC(%ebx); \ + movl $MSR_LASTBRANCHFROMIP,%ecx; \ + rdmsr ; \ + movl %eax,TREC_LBF(%ebx); \ + incl %ecx ; \ + rdmsr ; \ + movl %eax,TREC_LBT(%ebx); \ + incl %ecx ; \ + rdmsr ; \ + movl %eax,TREC_IBF(%ebx); \ + incl %ecx ; \ + rdmsr ; \ + movl %eax,TREC_IBT(%ebx) +#endif + +/* + * These are used on interrupt or trap entry or exit. + */ +#define INTRENTRY \ + cld; \ + subl $TF_PUSHSIZE,%esp ; \ + movl %gs,TF_GS(%esp) ; \ + movl %fs,TF_FS(%esp) ; \ + movl %eax,TF_EAX(%esp) ; \ + movl %es,TF_ES(%esp) ; \ + movl %ds,TF_DS(%esp) ; \ + movl $GSEL(GDATA_SEL, SEL_KPL),%eax ; \ + movl %edi,TF_EDI(%esp) ; \ + movl %esi,TF_ESI(%esp) ; \ + movl %eax,%ds ; \ + movl %ebp,TF_EBP(%esp) ; \ + movl %eax,%es ; \ + movl %ebx,TF_EBX(%esp) ; \ + movl %eax,%gs ; \ + movl %edx,TF_EDX(%esp) ; \ + movl $GSEL(GCPU_SEL, SEL_KPL),%eax ; \ + movl %ecx,TF_ECX(%esp) ; \ + movl %eax,%fs ; \ + TLOG + +#define INTRFASTEXIT \ + movl TF_GS(%esp),%gs ; \ + movl TF_FS(%esp),%fs ; \ + movl TF_ES(%esp),%es ; \ + movl TF_DS(%esp),%ds ; \ + movl TF_EDI(%esp),%edi ; \ + movl TF_ESI(%esp),%esi ; \ + movl TF_EBP(%esp),%ebp ; \ + movl TF_EBX(%esp),%ebx ; \ + movl TF_EDX(%esp),%edx ; \ + movl TF_ECX(%esp),%ecx ; \ + movl TF_EAX(%esp),%eax ; \ + addl $(TF_PUSHSIZE+8),%esp ; \ + iret + +#define DO_DEFERRED_SWITCH(reg) \ + cmpl $0, CPUVAR(WANT_PMAPLOAD) ; \ + jz 1f ; \ + call _C_LABEL(pmap_load) ; \ + 1: + +#define CHECK_DEFERRED_SWITCH(reg) \ + cmpl $0, CPUVAR(WANT_PMAPLOAD) + +#define CHECK_ASTPENDING(reg) movl CPUVAR(CURLWP),reg ; \ + cmpl $0, reg ; \ + je 1f ; \ + movl L_PROC(reg),reg ; \ + cmpl $0, P_MD_ASTPENDING(reg); \ + 1: +#define CLEAR_ASTPENDING(reg) movl $0, P_MD_ASTPENDING(reg) + +#if !defined(XEN) +#define CLI(reg) cli +#define STI(reg) sti +#else +/* XXX assym.h */ +#define 
EVENTS_MASK 136 +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define XEN_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) +#define XEN_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(%reg) + +#define CLI(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \ + XEN_BLOCK_EVENTS(reg) +#define STI(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \ + XEN_UNBLOCK_EVENTS(reg) +#define STIC(reg) movl _C_LABEL(HYPERVISOR_shared_info),reg ; \ + XEN_UNBLOCK_EVENTS(reg) ; \ + testb $1,evtchn_upcall_pending(reg) +#endif + +#endif /* _I386_FRAMEASM_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h new file mode 100644 index 0000000000..13442d22eb --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h @@ -0,0 +1,423 @@ +/* $NetBSD: hypervisor.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $ */ + +/* + * + * Communication to/from hypervisor. + * + * Copyright (c) 2002-2003, K A Fraser + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +#ifndef _XEN_HYPERVISOR_H_ +#define _XEN_HYPERVISOR_H_ + + +struct hypervisor_attach_args { + const char *haa_busname; +}; + +struct xencons_attach_args { + const char *xa_device; +}; + +struct xen_npx_attach_args { + const char *xa_device; +}; + + +#define u8 uint8_t +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t +#define s8 int8_t +#define s16 int16_t +#define s32 int32_t +#define s64 int64_t + +/* include the hypervisor interface */ +#include <sys/systm.h> +#include <machine/hypervisor-ifs/hypervisor-if.h> +#include <machine/hypervisor-ifs/dom0_ops.h> +#include <machine/hypervisor-ifs/event_channel.h> +#include <machine/hypervisor-ifs/io/domain_controller.h> +#include <machine/hypervisor-ifs/io/netif.h> + +#undef u8 +#undef u16 +#undef u32 +#undef u64 +#undef s8 +#undef s16 +#undef s32 +#undef s64 + + +/* + * a placeholder for the start of day information passed up from the hypervisor + */ +union start_info_union +{ + start_info_t start_info; + char padding[512]; +}; +extern union start_info_union start_info_union; +#define xen_start_info (start_info_union.start_info) + + +/* hypervisor.c */ +void do_hypervisor_callback(struct trapframe *regs); +void hypervisor_notify_via_evtchn(unsigned int); +void hypervisor_enable_irq(unsigned int); +void hypervisor_disable_irq(unsigned int); +void hypervisor_acknowledge_irq(unsigned int); + +/* hypervisor_machdep.c */ +void hypervisor_unmask_event(unsigned int); +void hypervisor_mask_event(unsigned int); +void hypervisor_clear_event(unsigned int); +void hypervisor_force_callback(void); + +/* + * Assembler stubs for hyper-calls. 
+ */ + +static inline int HYPERVISOR_set_trap_table(trap_info_t *table) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_trap_table), + "b" (table) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count, + int *success_count) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_mmu_update), + "b" (req), "c" (count), "d" (success_count) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_gdt), + "b" (frame_list), "c" (entries) : "memory" ); + + + return ret; +} + +static inline int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_stack_switch), + "b" (ss), "c" (esp) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_callbacks), + "b" (event_selector), "c" (event_address), + "d" (failsafe_selector), "S" (failsafe_address) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_fpu_taskswitch(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_fpu_taskswitch) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_yield(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_yield) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_block(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_block) : "memory" ); + + return ret; +} + +static inline int 
HYPERVISOR_shutdown(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_reboot(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_suspend(unsigned long srec) +{ + int ret; + /* NB. On suspend, control software expects a suspend record in %esi. */ + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), + "S" (srec) : "memory" ); + + return ret; +} + +static inline long HYPERVISOR_set_timer_op(uint64_t timeout) +{ + int ret; + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_timer_op), + "b" (timeout_hi), "c" (timeout_lo) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op) +{ + int ret; + dom0_op->interface_version = DOM0_INTERFACE_VERSION; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_dom0_op), + "b" (dom0_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_debugreg), + "b" (reg), "c" (value) : "memory" ); + + return ret; +} + +static inline unsigned long HYPERVISOR_get_debugreg(int reg) +{ + unsigned long ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_get_debugreg), + "b" (reg) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_descriptor( + unsigned long pa, unsigned long word1, unsigned long 
word2) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_descriptor), + "b" (pa), "c" (word1), "d" (word2) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_fast_trap(int idx) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_fast_trap), + "b" (idx) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_dom_mem_op(unsigned int op, + unsigned long *extent_list, + unsigned long nr_extents, + unsigned int extent_order) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_dom_mem_op), + "b" (op), "c" (extent_list), "d" (nr_extents), "S" (extent_order), + "D" (DOMID_SELF) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_multicall(void *call_list, int nr_calls) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_multicall), + "b" (call_list), "c" (nr_calls) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_va_mapping( + unsigned long page_nr, unsigned long new_val, unsigned long flags) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping), + "b" (page_nr), "c" (new_val), "d" (flags) : "memory" ); + + if (__predict_false(ret < 0)) + panic("Failed update VA mapping: %08lx, %08lx, %08lx", + page_nr, new_val, flags); + + return ret; +} + +static inline int HYPERVISOR_event_channel_op(void *op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_event_channel_op), + "b" (op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_xen_version(int cmd) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_xen_version), + "b" (cmd) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_console_io(int cmd, int count, char *str) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" 
(__HYPERVISOR_console_io), + "b" (cmd), "c" (count), "d" (str) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_physdev_op(void *physdev_op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_physdev_op), + "b" (physdev_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_grant_table_op(void *gnttab_op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_grant_table_op), + "b" (gnttab_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_va_mapping_otherdomain( + unsigned long page_nr, unsigned long new_val, unsigned long flags, domid_t domid) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping_otherdomain), + "b" (page_nr), "c" (new_val), "d" (flags), "S" (domid) : + "memory" ); + + return ret; +} + +static inline int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_vm_assist), + "b" (cmd), "c" (type) : "memory" ); + + return ret; +} + +#endif /* _XEN_HYPERVISOR_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h new file mode 100644 index 0000000000..32a774b1b6 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h @@ -0,0 +1,110 @@ +/* $NetBSD: if_xennetvar.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_IF_XENNETVAR_H_ +#define _XEN_IF_XENNETVAR_H_ + +#include <machine/xen.h> + +union xennet_bufarray { + struct { + struct mbuf *xbtx_m; + } xb_tx; + struct { + vaddr_t xbrx_va; + paddr_t xbrx_pa; + struct xennet_softc *xbrx_sc; + } xb_rx; + int xb_next; +}; + +struct xennet_txbuf { + SLIST_ENTRY(xennet_txbuf) xt_next; + struct xennet_softc *xt_sc; + paddr_t xt_pa; + u_char xt_buf[0]; +}; +#define TXBUF_PER_PAGE 2 +#define TXBUF_BUFSIZE (PAGE_SIZE / TXBUF_PER_PAGE) - sizeof(struct xennet_txbuf) + +struct xennet_softc { + struct device sc_dev; /* base device glue */ + struct ethercom sc_ethercom; /* Ethernet common part */ + + int sc_ifno; + + uint8_t sc_enaddr[6]; + +#ifdef mediacode + struct ifmedia sc_media; +#endif + + /* What is the status of our connection to the remote backend? */ +#define BEST_CLOSED 0 +#define BEST_DISCONNECTED 1 +#define BEST_CONNECTED 2 + unsigned int sc_backend_state; + + unsigned int sc_evtchn; + unsigned int sc_irq; + + netif_tx_interface_t *sc_tx; + netif_rx_interface_t *sc_rx; + + uint32_t sc_tx_entries; + uint32_t sc_tx_resp_cons; + + uint32_t sc_rx_resp_cons; + uint32_t sc_rx_bufs_to_notify; + + union xennet_bufarray sc_tx_bufa[NETIF_TX_RING_SIZE]; + union xennet_bufarray sc_rx_bufa[NETIF_TX_RING_SIZE]; + + SLIST_HEAD(, xennet_txbuf) sc_tx_bufs; +}; + +struct xennet_attach_args { + const char *xa_device; + int xa_handle; +}; + +struct nfs_diskless; + +int xennet_scan(struct device *, struct xennet_attach_args *, cfprint_t); +void xennet_start(struct ifnet *); +int xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); +void xennet_watchdog(struct ifnet *ifp); +int xennet_bootstatic_callback(struct nfs_diskless *); + +#endif /* _XEN_IF_XENNETVAR_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h new file mode 100644 index 0000000000..1a482ea287 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h @@ -0,0 +1,533 @@ +/* 
$NetBSD: pmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $ */ +/* NetBSD: pmap.h,v 1.79 2004/02/20 17:35:01 yamt Exp */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgment: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * pmap.h: see pmap.c for the history of this pmap module. 
+ */ + +#ifndef _I386_PMAP_H_ +#define _I386_PMAP_H_ + +#if defined(_KERNEL_OPT) +#include "opt_user_ldt.h" +#include "opt_largepages.h" +#endif + +#include "opt_xen.h" + +#include <machine/cpufunc.h> +#include <machine/pte.h> +#include <machine/xenfunc.h> +#include <machine/xenpmap.h> +#include <machine/segments.h> +#include <uvm/uvm_object.h> + +/* + * see pte.h for a description of i386 MMU terminology and hardware + * interface. + * + * a pmap describes a processes' 4GB virtual address space. this + * virtual address space can be broken up into 1024 4MB regions which + * are described by PDEs in the PDP. the PDEs are defined as follows: + * + * (ranges are inclusive -> exclusive, just like vm_map_entry start/end) + * (the following assumes that KERNBASE is 0xc0000000) + * + * PDE#s VA range usage + * 0->766 0x0 -> 0xbfc00000 user address space + * 767 0xbfc00000-> recursive mapping of PDP (used for + * 0xc0000000 linear mapping of PTPs) + * 768->1023 0xc0000000-> kernel address space (constant + * 0xffc00000 across all pmap's/processes) + * 1023 0xffc00000-> "alternate" recursive PDP mapping + * <end> (for other pmaps) + * + * + * note: a recursive PDP mapping provides a way to map all the PTEs for + * a 4GB address space into a linear chunk of virtual memory. in other + * words, the PTE for page 0 is the first int mapped into the 4MB recursive + * area. the PTE for page 1 is the second int. the very last int in the + * 4MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB + * address). + * + * all pmap's PD's must have the same values in slots 768->1023 so that + * the kernel is always mapped in every process. these values are loaded + * into the PD at pmap creation time. + * + * at any one time only one pmap can be active on a processor. this is + * the pmap whose PDP is pointed to by processor register %cr3. this pmap + * will have all its PTEs mapped into memory at the recursive mapping + * point (slot #767 as show above). 
when the pmap code wants to find the + * PTE for a virtual address, all it has to do is the following: + * + * address of PTE = (767 * 4MB) + (VA / PAGE_SIZE) * sizeof(pt_entry_t) + * = 0xbfc00000 + (VA / 4096) * 4 + * + * what happens if the pmap layer is asked to perform an operation + * on a pmap that is not the one which is currently active? in that + * case we take the PA of the PDP of non-active pmap and put it in + * slot 1023 of the active pmap. this causes the non-active pmap's + * PTEs to get mapped in the final 4MB of the 4GB address space + * (e.g. starting at 0xffc00000). + * + * the following figure shows the effects of the recursive PDP mapping: + * + * PDP (%cr3) + * +----+ + * | 0| -> PTP#0 that maps VA 0x0 -> 0x400000 + * | | + * | | + * | 767| -> points back to PDP (%cr3) mapping VA 0xbfc00000 -> 0xc0000000 + * | 768| -> first kernel PTP (maps 0xc0000000 -> 0xf0400000) + * | | + * |1023| -> points to alternate pmap's PDP (maps 0xffc00000 -> end) + * +----+ + * + * note that the PDE#767 VA (0xbfc00000) is defined as "PTE_BASE" + * note that the PDE#1023 VA (0xffc00000) is defined as "APTE_BASE" + * + * starting at VA 0xbfc00000 the current active PDP (%cr3) acts as a + * PTP: + * + * PTP#767 == PDP(%cr3) => maps VA 0xbfc00000 -> 0xc0000000 + * +----+ + * | 0| -> maps the contents of PTP#0 at VA 0xbfc00000->0xbfc01000 + * | | + * | | + * | 767| -> maps contents of PTP#767 (the PDP) at VA 0xbffbf000 + * | 768| -> maps contents of first kernel PTP + * | | + * |1023| + * +----+ + * + * note that mapping of the PDP at PTP#767's VA (0xbffbf000) is + * defined as "PDP_BASE".... within that mapping there are two + * defines: + * "PDP_PDE" (0xbfeffbfc) is the VA of the PDE in the PDP + * which points back to itself. + * "APDP_PDE" (0xbfeffffc) is the VA of the PDE in the PDP which + * establishes the recursive mapping of the alternate pmap. + * to set the alternate PDP, one just has to put the correct + * PA info in *APDP_PDE. 
+ * + * note that in the APTE_BASE space, the APDP appears at VA + * "APDP_BASE" (0xfffff000). + */ +/* XXX MP should we allocate one APDP_PDE per processor?? */ + +/* + * the following defines identify the slots used as described above. + */ + +#define PDSLOT_PTE ((KERNBASE/NBPD)-1) /* 767: for recursive PDP map */ +#define PDSLOT_KERN (KERNBASE/NBPD) /* 768: start of kernel space */ +#define PDSLOT_APTE ((unsigned)1023-16) /* 1023: alternative recursive slot */ + +/* + * the following defines give the virtual addresses of various MMU + * data structures: + * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings + * PTD_BASE and APTD_BASE: the base VA of the recursive mapping of the PTD + * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP + */ + +#define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD) ) +#define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD) ) +#define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * PAGE_SIZE))) +#define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * PAGE_SIZE))) +#define PDP_PDE (PDP_BASE + PDSLOT_PTE) +#define APDP_PDE (PDP_BASE + PDSLOT_APTE) + +/* + * the follow define determines how many PTPs should be set up for the + * kernel by locore.s at boot time. this should be large enough to + * get the VM system running. once the VM system is running, the + * pmap module can add more PTPs to the kernel area on demand. 
+ */ + +#ifndef NKPTP +#define NKPTP 4 /* 16MB to start */ +#endif +#define NKPTP_MIN 4 /* smallest value we allow */ +#define NKPTP_MAX (1024 - (KERNBASE/NBPD) - 1) + /* largest value (-1 for APTP space) */ + +/* + * pdei/ptei: generate index into PDP/PTP from a VA + */ +#define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT) +#define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT) + +/* + * PTP macros: + * a PTP's index is the PD index of the PDE that points to it + * a PTP's offset is the byte-offset in the PTE space that this PTP is at + * a PTP's VA is the first VA mapped by that PTP + * + * note that PAGE_SIZE == number of bytes in a PTP (4096 bytes == 1024 entries) + * NBPD == number of bytes a PTP can map (4MB) + */ + +#define ptp_i2o(I) ((I) * PAGE_SIZE) /* index => offset */ +#define ptp_o2i(O) ((O) / PAGE_SIZE) /* offset => index */ +#define ptp_i2v(I) ((I) * NBPD) /* index => VA */ +#define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */ + +/* + * PG_AVAIL usage: we make use of the ignored bits of the PTE + */ + +#define PG_W PG_AVAIL1 /* "wired" mapping */ +#define PG_PVLIST PG_AVAIL2 /* mapping has entry on pvlist */ +#define PG_X PG_AVAIL3 /* executable mapping */ + +/* + * Number of PTE's per cache line. 4 byte pte, 32-byte cache line + * Used to avoid false sharing of cache lines. + */ +#define NPTECL 8 + +#ifdef _KERNEL +/* + * pmap data structures: see pmap.c for details of locking. + */ + +struct pmap; +typedef struct pmap *pmap_t; + +/* + * we maintain a list of all non-kernel pmaps + */ + +LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */ + +/* + * the pmap structure + * + * note that the pm_obj contains the simple_lock, the reference count, + * page list, and number of PTPs within the pmap. + * + * XXX If we ever support processor numbers higher than 31, we'll have + * XXX to rethink the CPU mask. 
+ */ + +struct pmap { + struct uvm_object pm_obj; /* object (lck by object lock) */ +#define pm_lock pm_obj.vmobjlock + LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */ + pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */ + u_int32_t pm_pdirpa; /* PA of PD (read-only after create) */ + struct vm_page *pm_ptphint; /* pointer to a PTP in our pmap */ + struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */ + + vaddr_t pm_hiexec; /* highest executable mapping */ + int pm_flags; /* see below */ + + union descriptor *pm_ldt; /* user-set LDT */ + int pm_ldt_len; /* number of LDT entries */ + int pm_ldt_sel; /* LDT selector */ + u_int32_t pm_cpus; /* mask of CPUs using pmap */ +}; + +/* pm_flags */ +#define PMF_USER_LDT 0x01 /* pmap has user-set LDT */ + +/* + * for each managed physical page we maintain a list of <PMAP,VA>'s + * which it is mapped at. the list is headed by a pv_head structure. + * there is one pv_head per managed phys page (allocated at boot time). + * the pv_head structure points to a list of pv_entry structures (each + * describes one mapping). + */ + +struct pv_entry { /* locked by its list's pvh_lock */ + SPLAY_ENTRY(pv_entry) pv_node; /* splay-tree node */ + struct pmap *pv_pmap; /* the pmap */ + vaddr_t pv_va; /* the virtual address */ + struct vm_page *pv_ptp; /* the vm_page of the PTP */ +}; + +/* + * pv_entrys are dynamically allocated in chunks from a single page. + * we keep track of how many pv_entrys are in use for each page and + * we can free pv_entry pages if needed. there is one lock for the + * entire allocation system. 
+ */ + +struct pv_page_info { + TAILQ_ENTRY(pv_page) pvpi_list; + struct pv_entry *pvpi_pvfree; + int pvpi_nfree; +}; + +/* + * number of pv_entry's in a pv_page + * (note: won't work on systems where NPBG isn't a constant) + */ + +#define PVE_PER_PVPAGE ((PAGE_SIZE - sizeof(struct pv_page_info)) / \ + sizeof(struct pv_entry)) + +/* + * a pv_page: where pv_entrys are allocated from + */ + +struct pv_page { + struct pv_page_info pvinfo; + struct pv_entry pvents[PVE_PER_PVPAGE]; +}; + +/* + * global kernel variables + */ + +/* PTDpaddr: is the physical address of the kernel's PDP */ +extern u_long PTDpaddr; + +extern struct pmap kernel_pmap_store; /* kernel pmap */ +extern int nkpde; /* current # of PDEs for kernel */ +extern int pmap_pg_g; /* do we support PG_G? */ + +/* + * macros + */ + +#define pmap_kernel() (&kernel_pmap_store) +#define pmap_resident_count(pmap) ((pmap)->pm_stats.resident_count) +#define pmap_wired_count(pmap) ((pmap)->pm_stats.wired_count) +#define pmap_update(pmap) /* nothing (yet) */ + +#define pmap_clear_modify(pg) pmap_clear_attrs(pg, PG_M) +#define pmap_clear_reference(pg) pmap_clear_attrs(pg, PG_U) +#define pmap_copy(DP,SP,D,L,S) +#define pmap_is_modified(pg) pmap_test_attrs(pg, PG_M) +#define pmap_is_referenced(pg) pmap_test_attrs(pg, PG_U) +#define pmap_move(DP,SP,D,L,S) +#define pmap_phys_address(ppn) x86_ptob(ppn) +#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? 
*/ + + +/* + * prototypes + */ + +void pmap_activate(struct lwp *); +void pmap_bootstrap(vaddr_t); +boolean_t pmap_clear_attrs(struct vm_page *, int); +void pmap_deactivate(struct lwp *); +void pmap_deactivate2(struct lwp *); +void pmap_page_remove (struct vm_page *); +void pmap_remove(struct pmap *, vaddr_t, vaddr_t); +boolean_t pmap_test_attrs(struct vm_page *, int); +void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t); +int pmap_exec_fixup(struct vm_map *, struct trapframe *, + struct pcb *); +void pmap_load(void); +int pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, vm_prot_t, + int); + +vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ + +void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, int32_t *); +void pmap_tlb_shootnow(int32_t); +void pmap_do_tlb_shootdown(struct cpu_info *); + +#define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */ + +/* + * Do idle page zero'ing uncached to avoid polluting the cache. + */ +boolean_t pmap_pageidlezero(paddr_t); +#define PMAP_PAGEIDLEZERO(pa) pmap_pageidlezero((pa)) + +/* + * inline functions + */ + +/*ARGSUSED*/ +static __inline void +pmap_remove_all(struct pmap *pmap) +{ + /* Nothing. 
*/ +} + +/* + * pmap_update_pg: flush one page from the TLB (or flush the whole thing + * if hardware doesn't support one-page flushing) + */ + +__inline static void __attribute__((__unused__)) +pmap_update_pg(vaddr_t va) +{ +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) + tlbflush(); + else +#endif + invlpg((u_int) va); +} + +/* + * pmap_update_2pg: flush two pages from the TLB + */ + +__inline static void __attribute__((__unused__)) +pmap_update_2pg(vaddr_t va, vaddr_t vb) +{ +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) + tlbflush(); + else +#endif + { + invlpg((u_int) va); + invlpg((u_int) vb); + } +} + +/* + * pmap_page_protect: change the protection of all recorded mappings + * of a managed page + * + * => this function is a frontend for pmap_page_remove/pmap_clear_attrs + * => we only have to worry about making the page more protected. + * unprotecting a page is done on-demand at fault time. + */ + +__inline static void __attribute__((__unused__)) +pmap_page_protect(struct vm_page *pg, vm_prot_t prot) +{ + if ((prot & VM_PROT_WRITE) == 0) { + if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) { + (void) pmap_clear_attrs(pg, PG_RW); + } else { + pmap_page_remove(pg); + } + } +} + +/* + * pmap_protect: change the protection of pages in a pmap + * + * => this function is a frontend for pmap_remove/pmap_write_protect + * => we only have to worry about making the page more protected. + * unprotecting a page is done on-demand at fault time. 
+ */ + +__inline static void __attribute__((__unused__)) +pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) +{ + if ((prot & VM_PROT_WRITE) == 0) { + if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) { + pmap_write_protect(pmap, sva, eva, prot); + } else { + pmap_remove(pmap, sva, eva); + } + } +} + +/* + * various address inlines + * + * vtopte: return a pointer to the PTE mapping a VA, works only for + * user and PT addresses + * + * kvtopte: return a pointer to the PTE mapping a kernel VA + */ + +#include <lib/libkern/libkern.h> + +static __inline pt_entry_t * __attribute__((__unused__)) +vtopte(vaddr_t va) +{ + + KASSERT(va < (PDSLOT_KERN << PDSHIFT)); + + return (PTE_BASE + x86_btop(va)); +} + +static __inline pt_entry_t * __attribute__((__unused__)) +kvtopte(vaddr_t va) +{ + + KASSERT(va >= (PDSLOT_KERN << PDSHIFT)); + +#ifdef LARGEPAGES + { + pd_entry_t *pde; + + pde = PDP_BASE + pdei(va); + if (*pde & PG_PS) + return ((pt_entry_t *)pde); + } +#endif + + return (PTE_BASE + x86_btop(va)); +} + +/* + * vtomach: virtual address to machine address. For use by + * machine-dependent code only. + */ + +static inline paddr_t __attribute__((__unused__)) +vtomach(vaddr_t va) +{ + pt_entry_t pte; + + pte = PTE_GET(&PTE_BASE[x86_btop(va)]); + return xpmap_ptom((pte & PG_FRAME) | (va & ~PG_FRAME)); +} + +#define pmap_cpu_has_pg_n() (cpu_class != CPUCLASS_386) +#define pmap_cpu_has_invlpg() (cpu_class != CPUCLASS_386) + +paddr_t vtophys(vaddr_t); +vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t); + +void pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t); + +#if defined(USER_LDT) +void pmap_ldt_cleanup(struct lwp *); +#define PMAP_FORK +#endif /* USER_LDT */ + +/* + * Hooks for the pool allocator. 
+ */ +#define POOL_VTOPHYS(va) vtophys((vaddr_t) (va)) + +#endif /* _KERNEL */ +#endif /* _I386_PMAP_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h new file mode 100644 index 0000000000..48bff484b9 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h @@ -0,0 +1,247 @@ +/* $NetBSD: xen.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $ */ + +/* + * + * Copyright (c) 2003, 2004 Keir Fraser (on behalf of the Xen team) + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + + +#ifndef _XEN_H +#define _XEN_H + +#ifndef _LOCORE + +struct xen_netinfo { + uint32_t xi_ifno; + char *xi_root; + uint32_t xi_ip[5]; +}; + +union xen_cmdline_parseinfo { + char xcp_bootdev[16]; /* sizeof(dv_xname) */ + struct xen_netinfo xcp_netinfo; + char xcp_console[16]; +}; + +#define XEN_PARSE_BOOTDEV 0 +#define XEN_PARSE_NETINFO 1 +#define XEN_PARSE_CONSOLE 2 + +void xen_parse_cmdline(int, union xen_cmdline_parseinfo *); + +void xenconscn_attach(void); + +void xenmachmem_init(void); +void xenprivcmd_init(void); +void xenvfr_init(void); + +#ifdef XENDEBUG +void printk(const char *, ...); +void vprintk(const char *, va_list); +#endif + +#endif + +#endif /* _XEN_H */ + +/****************************************************************************** + * os.h + * + * random collection of macros and definition + */ + +#ifndef _OS_H_ +#define _OS_H_ + +/* + * These are the segment descriptors provided for us by the hypervisor. + * For now, these are hardwired -- guest OSes cannot update the GDT + * or LDT. + * + * It shouldn't be hard to support descriptor-table frobbing -- let me + * know if the BSD or XP ports require flexibility here. + */ + + +/* + * these are also defined in hypervisor-if.h but can't be pulled in as + * they are used in start of day assembly. Need to clean up the .h files + * a bit more... + */ + +#ifndef FLAT_RING1_CS +#define FLAT_RING1_CS 0x0819 +#define FLAT_RING1_DS 0x0821 +#define FLAT_RING3_CS 0x082b +#define FLAT_RING3_DS 0x0833 +#endif + +#define __KERNEL_CS FLAT_RING1_CS +#define __KERNEL_DS FLAT_RING1_DS + +/* Everything below this point is not included by assembler (.S) files. */ +#ifndef _LOCORE + +/* some function prototypes */ +void trap_init(void); + + +/* + * STI/CLI equivalents. These basically set and clear the virtual + * event_enable flag in the shared_info structure. Note that when + * the enable bit is set, there may be pending events to be handled. 
+ * We may therefore call into do_hypervisor_callback() directly. + */ + +#define __save_flags(x) \ +do { \ + (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask; \ +} while (0) + +#define __restore_flags(x) \ +do { \ + shared_info_t *_shared = HYPERVISOR_shared_info; \ + __insn_barrier(); \ + if ((_shared->vcpu_data[0].evtchn_upcall_mask = (x)) == 0) { \ + __insn_barrier(); \ + if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \ + hypervisor_force_callback(); \ + } \ +} while (0) + +#define __cli() \ +do { \ + HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1; \ + __insn_barrier(); \ +} while (0) + +#define __sti() \ +do { \ + shared_info_t *_shared = HYPERVISOR_shared_info; \ + __insn_barrier(); \ + _shared->vcpu_data[0].evtchn_upcall_mask = 0; \ + __insn_barrier(); /* unmask then check (avoid races) */ \ + if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \ + hypervisor_force_callback(); \ +} while (0) + +#define cli() __cli() +#define sti() __sti() +#define save_flags(x) __save_flags(x) +#define restore_flags(x) __restore_flags(x) +#define save_and_cli(x) do { \ + __save_flags(x); \ + __cli(); \ +} while (/* CONSTCOND */ 0) +#define save_and_sti(x) __save_and_sti(x) + +#ifdef MULTIPROCESSOR +#define __LOCK_PREFIX "lock; " +#else +#define __LOCK_PREFIX "" +#endif + +static __inline__ uint32_t +x86_atomic_xchg(uint32_t *ptr, unsigned long val) +{ + unsigned long result; + + __asm __volatile("xchgl %0,%1" + :"=r" (result) + :"m" (*ptr), "0" (val) + :"memory"); + + return result; +} + +static __inline__ int +x86_atomic_test_and_clear_bit(volatile void *ptr, int bitno) +{ + int result; + + __asm __volatile(__LOCK_PREFIX + "btrl %2,%1 ;" + "sbbl %0,%0" + :"=r" (result), "=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno) : "memory"); + return result; +} + +static __inline__ int +x86_atomic_test_and_set_bit(volatile void *ptr, int bitno) +{ + int result; + + __asm __volatile(__LOCK_PREFIX + "btsl %2,%1 ;" + "sbbl 
%0,%0" + :"=r" (result), "=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno) : "memory"); + return result; +} + +static __inline int +x86_constant_test_bit(const volatile void *ptr, int bitno) +{ + return ((1UL << (bitno & 31)) & + (((const volatile uint32_t *) ptr)[bitno >> 5])) != 0; +} + +static __inline int +x86_variable_test_bit(const volatile void *ptr, int bitno) +{ + int result; + + __asm __volatile( + "btl %2,%1 ;" + "sbbl %0,%0" + :"=r" (result) + :"m" (*(volatile uint32_t *)(ptr)), "Ir" (bitno)); + return result; +} + +#define x86_atomic_test_bit(ptr, bitno) \ + (__builtin_constant_p(bitno) ? \ + x86_constant_test_bit((ptr),(bitno)) : \ + x86_variable_test_bit((ptr),(bitno))) + +static __inline void +x86_atomic_set_bit(volatile void *ptr, int bitno) +{ + __asm __volatile(__LOCK_PREFIX + "btsl %1,%0" + :"=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno)); +} + +static __inline void +x86_atomic_clear_bit(volatile void *ptr, int bitno) +{ + __asm __volatile(__LOCK_PREFIX + "btrl %1,%0" + :"=m" (*(volatile uint32_t *)(ptr)) + :"Ir" (bitno)); +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* _OS_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h new file mode 100644 index 0000000000..2df026a922 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h @@ -0,0 +1,135 @@ +/* $NetBSD: xenfunc.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENFUNC_H_ +#define _XEN_XENFUNC_H_ + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> +#include <machine/xenpmap.h> +#include <machine/pte.h> + +#ifdef XENDEBUG_LOW +#define __PRINTK(x) printk x +#else +#define __PRINTK(x) +#endif + +void xen_set_ldt(vaddr_t, uint32_t); +void xen_update_descriptor(union descriptor *, union descriptor *); + +static __inline void +invlpg(u_int addr) +{ + xpq_queue_invlpg(addr); + xpq_flush_queue(); +} + +static __inline void +lldt(u_short sel) +{ + + /* __PRINTK(("ldt %x\n", IDXSELN(sel))); */ + if (sel == GSEL(GLDT_SEL, SEL_KPL)) + xen_set_ldt((vaddr_t)ldt, NLDT); + else + xen_set_ldt(cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_base, + cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_entries); +} + +static __inline void +ltr(u_short sel) +{ + __PRINTK(("XXX ltr not supported\n")); +} + +static __inline void +lcr0(u_int val) +{ + __PRINTK(("XXX lcr0 not supported\n")); +} + +static __inline u_int +rcr0(void) +{ + __PRINTK(("XXX rcr0 not supported\n")); + return 0; +} + +#define lcr3(_v) _lcr3((_v), __FILE__, __LINE__) +static __inline void +_lcr3(u_int val, char *file, int line) +{ +/* __PRINTK(("lcr3 %08x at %s:%d\n", val, file, line)); */ + xpq_queue_pt_switch(xpmap_ptom(val) & PG_FRAME); + xpq_flush_queue(); +} + +static __inline void +tlbflush(void) +{ + xpq_queue_tlb_flush(); + xpq_flush_queue(); +} + +static __inline u_int +rdr6(void) +{ + u_int val; + + val = HYPERVISOR_get_debugreg(6); + return val; +} + +static __inline void +ldr6(u_int val) +{ + + HYPERVISOR_set_debugreg(6, val); +} + +static __inline void +disable_intr(void) +{ + __cli(); +} + +static __inline void +enable_intr(void) +{ + __sti(); +} + +#endif /* _XEN_XENFUNC_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h new file mode 100644 index 0000000000..f3c8c7f2d8 --- /dev/null +++ 
b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h @@ -0,0 +1,193 @@ +/* $NetBSD: xenpmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _XEN_XENPMAP_H_ +#define _XEN_XENPMAP_H_ + +#define INVALID_P2M_ENTRY (~0UL) + +void xpq_queue_invlpg(vaddr_t); +void xpq_queue_pde_update(pd_entry_t *, pd_entry_t); +void xpq_queue_pte_update(pt_entry_t *, pt_entry_t); +void xpq_queue_unchecked_pte_update(pt_entry_t *, pt_entry_t); +void xpq_queue_pt_switch(paddr_t); +void xpq_flush_queue(void); +void xpq_queue_set_ldt(vaddr_t, uint32_t); +void xpq_queue_tlb_flush(void); +void xpq_queue_pin_table(paddr_t, int); +void xpq_queue_unpin_table(paddr_t); + +extern paddr_t *xpmap_phys_to_machine_mapping; + +#define XPQ_PIN_L1_TABLE 1 +#define XPQ_PIN_L2_TABLE 2 + +#ifndef XEN +#define PDE_GET(_pdp) \ + *(_pdp) +#define PDE_SET(_pdp,_mapdp,_npde) \ + *(_mapdp) = (_npde) +#define PDE_CLEAR(_pdp,_mapdp) \ + *(_mapdp) = 0 +#define PTE_SET(_ptp,_maptp,_npte) \ + *(_maptp) = (_npte) +#define PTE_CLEAR(_ptp,_maptp) \ + *(_maptp) = 0 +#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte) \ + (_opte) = x86_atomic_testset_ul((_maptp), (_npte)) +#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte) \ + (_opte) = x86_atomic_testset_ul((_maptp), 0) +#define PDE_CLEARBITS(_pdp,_mapdp,_bits) \ + *(_mapdp) &= ~(_bits) +#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits) \ + x86_atomic_clearbits_l((_maptp), (_bits)) +#define PTE_SETBITS(_ptp,_maptp,_bits) \ + *(_maptp) |= (_bits) +#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits) \ + x86_atomic_setbits_l((_maptp), (_bits)) +#else +paddr_t *xpmap_phys_to_machine_mapping; + +#define PDE_GET(_pdp) \ + (pmap_valid_entry(*(_pdp)) ? xpmap_mtop(*(_pdp)) : *(_pdp)) +#define PDE_SET(_pdp,_mapdp,_npde) do { \ + xpq_queue_pde_update((_mapdp), xpmap_ptom((_npde))); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_CLEAR(_pdp,_mapdp) do { \ + xpq_queue_pde_update((_mapdp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_GET(_ptp) \ + (pmap_valid_entry(*(_ptp)) ? 
xpmap_mtop(*(_ptp)) : *(_ptp)) +#define PTE_GET_MA(_ptp) \ + *(_ptp) +#define PTE_SET(_ptp,_maptp,_npte) do { \ + xpq_queue_pte_update((_maptp), xpmap_ptom((_npte))); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_SET_MA(_ptp,_maptp,_npte) do { \ + xpq_queue_pte_update((_maptp), (_npte)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_SET_MA_UNCHECKED(_ptp,_maptp,_npte) do { \ + xpq_queue_unchecked_pte_update((_maptp), (_npte)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_CLEAR(_ptp,_maptp) do { \ + xpq_queue_pte_update((_maptp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte) do { \ + (_opte) = PTE_GET(_ptp); \ + xpq_queue_pte_update((_maptp), xpmap_ptom((_npte))); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_SET_MA(_ptp,_maptp,_npte,_opte) do { \ + (_opte) = *(_ptp); \ + xpq_queue_pte_update((_maptp), (_npte)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte) do { \ + (_opte) = PTE_GET(_ptp); \ + xpq_queue_pte_update((_maptp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_CLEAR_MA(_ptp,_maptp,_opte) do { \ + (_opte) = *(_ptp); \ + xpq_queue_pte_update((_maptp), 0); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_CLEARBITS(_pdp,_mapdp,_bits) do { \ + xpq_queue_pte_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_CLEARBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_ATOMIC_CLEARBITS(_pdp,_mapdp,_bits) do { \ + xpq_queue_pde_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \ + 
xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_SETBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_ATOMIC_SETBITS(_pdp,_mapdp,_bits) do { \ + xpq_queue_pde_update((_mapdp), *(_pdp) | ((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits) do { \ + xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PDE_COPY(_dpdp,_madpdp,_spdp) do { \ + xpq_queue_pde_update((_madpdp), *(_spdp)); \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PTE_UPDATES_FLUSH() do { \ + xpq_flush_queue(); \ +} while (/*CONSTCOND*/0) + +#endif + +#define XPMAP_OFFSET (KERNTEXTOFF - KERNBASE_LOCORE) +static __inline paddr_t +xpmap_mtop(paddr_t mpa) +{ + return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) + + XPMAP_OFFSET) | (mpa & ~PG_FRAME); +} + +static __inline paddr_t +xpmap_ptom(paddr_t ppa) +{ + return (xpmap_phys_to_machine_mapping[(ppa - + XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT) + | (ppa & ~PG_FRAME); +} + +static __inline paddr_t +xpmap_ptom_masked(paddr_t ppa) +{ + return (xpmap_phys_to_machine_mapping[(ppa - + XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT); +} + +#endif /* _XEN_XENPMAP_H_ */ diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c b/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c new file mode 100644 index 0000000000..dda715fa54 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c @@ -0,0 +1,505 @@ +/* $NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $ */ +/* NetBSD: bus_space.c,v 1.2 2003/03/14 18:47:53 christos Exp */ + +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Charles M. Hannum and by Jason R. 
Thorpe of the Numerical Aerospace + * Simulation Facility, NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $"); + +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/extent.h> + +#include <uvm/uvm_extern.h> + +#include <machine/bus.h> + +#include <dev/isa/isareg.h> +#include <machine/isa_machdep.h> + +#include <machine/hypervisor.h> +#include <machine/xenpmap.h> + +/* + * Extent maps to manage I/O and memory space. Allocate + * storage for 8 regions in each, initially. Later, ioport_malloc_safe + * will indicate that it's safe to use malloc() to dynamically allocate + * region descriptors. + * + * N.B. At least two regions are _always_ allocated from the iomem + * extent map; (0 -> ISA hole) and (end of ISA hole -> end of RAM). + * + * The extent maps are not static! Machine-dependent ISA and EISA + * routines need access to them for bus address space allocation. + */ +static long ioport_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)]; +static long iomem_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)]; +struct extent *ioport_ex; +struct extent *iomem_ex; +static int ioport_malloc_safe; + +int x86_mem_add_mapping __P((bus_addr_t, bus_size_t, + int, bus_space_handle_t *)); + +void +x86_bus_space_init() +{ + /* + * Initialize the I/O port and I/O mem extent maps. + * Note: we don't have to check the return value since + * creation of a fixed extent map will never fail (since + * descriptor storage has already been allocated). + * + * N.B. The iomem extent manages _all_ physical addresses + * on the machine. When the amount of RAM is found, the two + * extents of RAM are allocated from the map (0 -> ISA hole + * and end of ISA hole -> end of RAM). 
+ */ + ioport_ex = extent_create("ioport", 0x0, 0xffff, M_DEVBUF, + (caddr_t)ioport_ex_storage, sizeof(ioport_ex_storage), + EX_NOCOALESCE|EX_NOWAIT); + iomem_ex = extent_create("iomem", 0x0, 0xffffffff, M_DEVBUF, + (caddr_t)iomem_ex_storage, sizeof(iomem_ex_storage), + EX_NOCOALESCE|EX_NOWAIT); + + /* We are privileged guest os - should have IO privileges. */ + if (xen_start_info.flags & SIF_PRIVILEGED) { + dom0_op_t op; + op.cmd = DOM0_IOPL; + op.u.iopl.domain = DOMID_SELF; + op.u.iopl.iopl = 1; + if (HYPERVISOR_dom0_op(&op) != 0) + panic("Unable to obtain IOPL, " + "despite being SIF_PRIVILEGED"); + } +} + +void +x86_bus_space_mallocok() +{ + + ioport_malloc_safe = 1; +} + +int +x86_memio_map(t, bpa, size, flags, bshp) + bus_space_tag_t t; + bus_addr_t bpa; + bus_size_t size; + int flags; + bus_space_handle_t *bshp; +{ + int error; + struct extent *ex; + + /* + * Pick the appropriate extent map. + */ + if (t == X86_BUS_SPACE_IO) { + if (flags & BUS_SPACE_MAP_LINEAR) + return (EOPNOTSUPP); + ex = ioport_ex; + } else if (t == X86_BUS_SPACE_MEM) + ex = iomem_ex; + else + panic("x86_memio_map: bad bus space tag"); + + /* + * Before we go any further, let's make sure that this + * region is available. + */ + error = extent_alloc_region(ex, bpa, size, + EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0)); + if (error) + return (error); + + /* + * For I/O space, that's all she wrote. + */ + if (t == X86_BUS_SPACE_IO) { + *bshp = bpa; + return (0); + } + + /* + * For memory space, map the bus physical address to + * a kernel virtual address. + */ + error = x86_mem_add_mapping(bpa, size, + (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp); + if (error) { + if (extent_free(ex, bpa, size, EX_NOWAIT | + (ioport_malloc_safe ? 
EX_MALLOCOK : 0))) { + printf("x86_memio_map: pa 0x%lx, size 0x%lx\n", + bpa, size); + printf("x86_memio_map: can't free region\n"); + } + } + + return (error); +} + +int +_x86_memio_map(t, bpa, size, flags, bshp) + bus_space_tag_t t; + bus_addr_t bpa; + bus_size_t size; + int flags; + bus_space_handle_t *bshp; +{ + + /* + * For I/O space, just fill in the handle. + */ + if (t == X86_BUS_SPACE_IO) { + if (flags & BUS_SPACE_MAP_LINEAR) + return (EOPNOTSUPP); + *bshp = bpa; + return (0); + } + + /* + * For memory space, map the bus physical address to + * a kernel virtual address. + */ + return (x86_mem_add_mapping(bpa, size, + (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp)); +} + +int +x86_memio_alloc(t, rstart, rend, size, alignment, boundary, flags, + bpap, bshp) + bus_space_tag_t t; + bus_addr_t rstart, rend; + bus_size_t size, alignment, boundary; + int flags; + bus_addr_t *bpap; + bus_space_handle_t *bshp; +{ + struct extent *ex; + u_long bpa; + int error; + + /* + * Pick the appropriate extent map. + */ + if (t == X86_BUS_SPACE_IO) { + if (flags & BUS_SPACE_MAP_LINEAR) + return (EOPNOTSUPP); + ex = ioport_ex; + } else if (t == X86_BUS_SPACE_MEM) + ex = iomem_ex; + else + panic("x86_memio_alloc: bad bus space tag"); + + /* + * Sanity check the allocation against the extent's boundaries. + */ + if (rstart < ex->ex_start || rend > ex->ex_end) + panic("x86_memio_alloc: bad region start/end"); + + /* + * Do the requested allocation. + */ + error = extent_alloc_subregion(ex, rstart, rend, size, alignment, + boundary, + EX_FAST | EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0), + &bpa); + + if (error) + return (error); + + /* + * For I/O space, that's all she wrote. + */ + if (t == X86_BUS_SPACE_IO) { + *bshp = *bpap = bpa; + return (0); + } + + /* + * For memory space, map the bus physical address to + * a kernel virtual address. 
+ */ + error = x86_mem_add_mapping(bpa, size, + (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp); + if (error) { + if (extent_free(iomem_ex, bpa, size, EX_NOWAIT | + (ioport_malloc_safe ? EX_MALLOCOK : 0))) { + printf("x86_memio_alloc: pa 0x%lx, size 0x%lx\n", + bpa, size); + printf("x86_memio_alloc: can't free region\n"); + } + } + + *bpap = bpa; + + return (error); +} + +int +x86_mem_add_mapping(bpa, size, cacheable, bshp) + bus_addr_t bpa; + bus_size_t size; + int cacheable; + bus_space_handle_t *bshp; +{ + u_long pa, endpa; + vaddr_t va; + pt_entry_t *pte; + pt_entry_t *maptp; + int32_t cpumask = 0; + + pa = x86_trunc_page(bpa); + endpa = x86_round_page(bpa + size); + +#ifdef DIAGNOSTIC + if (endpa <= pa) + panic("x86_mem_add_mapping: overflow"); +#endif + + if (bpa >= IOM_BEGIN && (bpa + size) <= IOM_END) { + va = (vaddr_t)ISA_HOLE_VADDR(pa); + } else { + va = uvm_km_valloc(kernel_map, endpa - pa); + if (va == 0) + return (ENOMEM); + } + + *bshp = (bus_space_handle_t)(va + (bpa & PGOFSET)); + + for (; pa < endpa; pa += PAGE_SIZE, va += PAGE_SIZE) { + pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE); + + /* + * PG_N doesn't exist on 386's, so we assume that + * the mainboard has wired up device space non-cacheable + * on those machines. + * + * Note that it's not necessary to use atomic ops to + * fiddle with the PTE here, because we don't care + * about mod/ref information. + * + * XXX should hand this bit to pmap_kenter_pa to + * save the extra invalidate! + * + * XXX extreme paranoia suggests tlb shootdown belongs here. 
+ */ + if (pmap_cpu_has_pg_n()) { + pte = kvtopte(va); + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); + if (cacheable) + PTE_CLEARBITS(pte, maptp, PG_N); + else + PTE_SETBITS(pte, maptp, PG_N); + pmap_tlb_shootdown(pmap_kernel(), va, *pte, + &cpumask); + } + } + + pmap_tlb_shootnow(cpumask); + pmap_update(pmap_kernel()); + + return 0; +} + +/* + * void _x86_memio_unmap(bus_space_tag bst, bus_space_handle bsh, + * bus_size_t size, bus_addr_t *adrp) + * + * This function unmaps memory- or io-space mapped by the function + * _x86_memio_map(). This function works nearly as same as + * x86_memio_unmap(), but this function does not ask kernel + * built-in extents and returns physical address of the bus space, + * for the convenience of the extra extent manager. + */ +void +_x86_memio_unmap(t, bsh, size, adrp) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t size; + bus_addr_t *adrp; +{ + u_long va, endva; + bus_addr_t bpa; + + /* + * Find the correct extent and bus physical address. + */ + if (t == X86_BUS_SPACE_IO) { + bpa = bsh; + } else if (t == X86_BUS_SPACE_MEM) { + if (bsh >= atdevbase && (bsh + size) <= (atdevbase + IOM_SIZE)) { + bpa = (bus_addr_t)ISA_PHYSADDR(bsh); + } else { + + va = x86_trunc_page(bsh); + endva = x86_round_page(bsh + size); + +#ifdef DIAGNOSTIC + if (endva <= va) { + panic("_x86_memio_unmap: overflow"); + } +#endif + +#if __NetBSD_Version__ > 104050000 + if (pmap_extract(pmap_kernel(), va, &bpa) == FALSE) { + panic("_x86_memio_unmap:" + " wrong virtual address"); + } + bpa += (bsh & PGOFSET); +#else + bpa = pmap_extract(pmap_kernel(), va) + (bsh & PGOFSET); +#endif + + pmap_kremove(va, endva - va); + /* + * Free the kernel virtual mapping. 
+ */ + uvm_km_free(kernel_map, va, endva - va); + } + } else { + panic("_x86_memio_unmap: bad bus space tag"); + } + + if (adrp != NULL) { + *adrp = bpa; + } +} + +void +x86_memio_unmap(t, bsh, size) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t size; +{ + struct extent *ex; + u_long va, endva; + bus_addr_t bpa; + + /* + * Find the correct extent and bus physical address. + */ + if (t == X86_BUS_SPACE_IO) { + ex = ioport_ex; + bpa = bsh; + } else if (t == X86_BUS_SPACE_MEM) { + ex = iomem_ex; + + if (bsh >= atdevbase && + (bsh + size) <= (atdevbase + IOM_SIZE)) { + bpa = (bus_addr_t)ISA_PHYSADDR(bsh); + goto ok; + } + + va = x86_trunc_page(bsh); + endva = x86_round_page(bsh + size); + +#ifdef DIAGNOSTIC + if (endva <= va) + panic("x86_memio_unmap: overflow"); +#endif + + (void) pmap_extract(pmap_kernel(), va, &bpa); + bpa += (bsh & PGOFSET); + + pmap_kremove(va, endva - va); + /* + * Free the kernel virtual mapping. + */ + uvm_km_free(kernel_map, va, endva - va); + } else + panic("x86_memio_unmap: bad bus space tag"); + +ok: + if (extent_free(ex, bpa, size, + EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0))) { + printf("x86_memio_unmap: %s 0x%lx, size 0x%lx\n", + (t == X86_BUS_SPACE_IO) ? "port" : "pa", bpa, size); + printf("x86_memio_unmap: can't free region\n"); + } +} + +void +x86_memio_free(t, bsh, size) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t size; +{ + + /* x86_memio_unmap() does all that we need to do. */ + x86_memio_unmap(t, bsh, size); +} + +int +x86_memio_subregion(t, bsh, offset, size, nbshp) + bus_space_tag_t t; + bus_space_handle_t bsh; + bus_size_t offset, size; + bus_space_handle_t *nbshp; +{ + + *nbshp = bsh + offset; + return (0); +} + +paddr_t +x86_memio_mmap(t, addr, off, prot, flags) + bus_space_tag_t t; + bus_addr_t addr; + off_t off; + int prot; + int flags; +{ + + /* Can't mmap I/O space. */ + if (t == X86_BUS_SPACE_IO) + return (-1); + + /* + * "addr" is the base address of the device we're mapping. 
+ * "off" is the offset into that device. + * + * Note we are called for each "page" in the device that + * the upper layers want to map. + */ + return (x86_btop(addr + off)); +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c new file mode 100644 index 0000000000..6783f69363 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c @@ -0,0 +1,234 @@ +/* $NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_xen.h" + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/device.h> + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> +#include <machine/cpu_counter.h> + +#include <dev/clock_subr.h> + +#include "config_time.h" /* for CONFIG_TIME */ + +static int xen_timer_handler(void *, struct trapframe *); + +/* These are peridically updated in shared_info, and then copied here. */ +static unsigned long shadow_tsc_stamp; +static u_int64_t shadow_system_time; +static unsigned long shadow_time_version; +static struct timeval shadow_tv; + +static int timeset; + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called at splclock. 
+ */ +static void +get_time_values_from_xen(void) +{ + do { + shadow_time_version = HYPERVISOR_shared_info->time_version2; + __insn_barrier(); + shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec; + shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec; + shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp; + shadow_system_time = HYPERVISOR_shared_info->system_time; + __insn_barrier(); + } while (shadow_time_version != HYPERVISOR_shared_info->time_version1); +} + +void +inittodr(time_t base) +{ + int s; + + /* + * if the file system time is more than a year older than the + * kernel, warn and then set the base time to the CONFIG_TIME. + */ + if (base && base < (CONFIG_TIME-SECYR)) { + printf("WARNING: preposterous time in file system\n"); + base = CONFIG_TIME; + } + + s = splclock(); + get_time_values_from_xen(); + splx(s); + + time.tv_usec = shadow_tv.tv_usec; + time.tv_sec = shadow_tv.tv_sec + rtc_offset * 60; +#ifdef DEBUG_CLOCK + printf("readclock: %ld (%ld)\n", time.tv_sec, base); +#endif + if (base != 0 && base < time.tv_sec - 5*SECYR) + printf("WARNING: file system time much less than clock time\n"); + else if (base > time.tv_sec + 5*SECYR) { + printf("WARNING: clock time much less than file system time\n"); + printf("WARNING: using file system time\n"); + goto fstime; + } + + timeset = 1; + return; + +fstime: + timeset = 1; + time.tv_sec = base; + printf("WARNING: CHECK AND RESET THE DATE!\n"); +} + +void +resettodr() +{ +#ifdef DOM0OPS + dom0_op_t op; + int s; +#endif +#ifdef DEBUG_CLOCK + struct clock_ymdhms dt; +#endif + + /* + * We might have been called by boot() due to a crash early + * on. Don't reset the clock chip in this case. 
+ */ + if (!timeset) + return; + +#ifdef DEBUG_CLOCK + clock_secs_to_ymdhms(time.tv_sec - rtc_offset * 60, &dt); + + printf("setclock: %d/%d/%d %02d:%02d:%02d\n", dt.dt_year, + dt.dt_mon, dt.dt_day, dt.dt_hour, dt.dt_min, dt.dt_sec); +#endif +#ifdef DOM0OPS + if (xen_start_info.dom_id == 0) { + s = splclock(); + + op.cmd = DOM0_SETTIME; + op.u.settime.secs = time.tv_sec - rtc_offset * 60; + op.u.settime.usecs = time.tv_usec; + op.u.settime.system_time = shadow_system_time; + HYPERVISOR_dom0_op(&op); + + splx(s); + } +#endif +} + +void +startrtclock() +{ + +} + +/* + * Wait approximately `n' microseconds. + */ +void +xen_delay(int n) +{ + u_int64_t when; + + get_time_values_from_xen(); + when = shadow_system_time + n * 1000; + while (shadow_system_time < when) + get_time_values_from_xen(); +} + +void +xen_microtime(struct timeval *tv) +{ + + *tv = time; +} + +void +xen_initclocks() +{ + int irq = bind_virq_to_irq(VIRQ_TIMER); + + event_set_handler(irq, (int (*)(void *))xen_timer_handler, + NULL, IPL_CLOCK); + hypervisor_enable_irq(irq); +} + +static int +xen_timer_handler(void *arg, struct trapframe *regs) +{ +#if defined(I586_CPU) || defined(I686_CPU) + static int microset_iter; /* call cc_microset once/sec */ + struct cpu_info *ci = curcpu(); + + /* + * If we have a cycle counter, do the microset thing. 
+ */ + if (ci->ci_feature_flags & CPUID_TSC) { + if ( +#if defined(MULTIPROCESSOR) + CPU_IS_PRIMARY(ci) && +#endif + (microset_iter--) == 0) { + microset_iter = hz - 1; +#if defined(MULTIPROCESSOR) + x86_broadcast_ipi(X86_IPI_MICROSET); +#endif + cc_microset_time = time; + cc_microset(ci); + } + } +#endif + + get_time_values_from_xen(); + + hardclock((struct clockframe *)regs); + + return 0; +} + +void +setstatclockrate(int arg) +{ +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c new file mode 100644 index 0000000000..0f5a9fe788 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c @@ -0,0 +1,226 @@ +/* $NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/device.h> + +#include "xencons.h" +#include "xennet.h" +#include "xbd.h" +#include "xenkbc.h" +#include "vga_xen.h" +#include "npx.h" + +#include "opt_xen.h" + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> + +#ifdef DOM0OPS +#include <sys/dirent.h> +#include <sys/stat.h> +#include <sys/tree.h> +#include <sys/vnode.h> +#include <miscfs/specfs/specdev.h> +#include <miscfs/kernfs/kernfs.h> +#include <machine/kernfs_machdep.h> +#endif + +#if NXENNET > 0 +#include <net/if.h> +#include <net/if_ether.h> +#include <net/if_media.h> +#include <machine/if_xennetvar.h> +#endif + +#if NXBD > 0 +#include <sys/buf.h> +#include <sys/disk.h> +#include <dev/dkvar.h> +#include <machine/xbdvar.h> +#endif + +#if NXENKBC > 0 +#include <dev/pckbport/pckbportvar.h> +#include <machine/xenkbcvar.h> +#endif + +#if NVGA_XEN > 0 +#include <machine/bus.h> +#include <machine/vga_xenvar.h> +#endif + +int hypervisor_match(struct device *, struct cfdata *, void *); +void hypervisor_attach(struct device *, struct device *, void *); + +CFATTACH_DECL(hypervisor, sizeof(struct device), + hypervisor_match, hypervisor_attach, NULL, NULL); + +int hypervisor_print(void *, const char *); + +union hypervisor_attach_cookie { + const char *hac_device; /* first elem of all */ +#if 
NXENKBC > 0
	struct xenkbc_attach_args hac_xenkbc;
#endif
#if NVGA_XEN > 0
	struct xen_vga_attach_args hac_vga_xen;
#endif
#if NXENCONS > 0
	struct xencons_attach_args hac_xencons;
#endif
#if NXENNET > 0
	struct xennet_attach_args hac_xennet;
#endif
#if NXBD > 0
	struct xbd_attach_args hac_xbd;
#endif
#if NNPX > 0
	struct xen_npx_attach_args hac_xennpx;
#endif
};


/*
 * Probe for the hypervisor; always succeeds.
 */
int
hypervisor_match(parent, match, aux)
	struct device *parent;
	struct cfdata *match;
	void *aux;
{
	struct hypervisor_attach_args *haa = aux;

	/* Match only on the "hypervisor" pseudo-bus name. */
	if (strcmp(haa->haa_busname, "hypervisor") == 0)
		return 1;
	return 0;
}

/*
 * Attach the hypervisor.  Initializes event-channel delivery, then
 * attaches each configured child front-end (keyboard controller, VGA,
 * console, network, block, FPU); in a privileged domain it also sets
 * up the dom0 kernfs/control interfaces.
 */
void
hypervisor_attach(parent, self, aux)
	struct device *parent, *self;
	void *aux;
{
	union hypervisor_attach_cookie hac;

	printf("\n");

	/* Event channels must work before any child device can attach. */
	init_events();

#if NXENKBC > 0
	hac.hac_xenkbc.xa_device = "xenkbc";
	config_found(self, &hac.hac_xenkbc, hypervisor_print);
#endif

#if NVGA_XEN > 0
	hac.hac_vga_xen.xa_device = "vga_xen";
	hac.hac_vga_xen.xa_iot = X86_BUS_SPACE_IO;
	hac.hac_vga_xen.xa_memt = X86_BUS_SPACE_MEM;
	config_found(self, &hac.hac_vga_xen, hypervisor_print);
#endif

#if NXENCONS > 0
	hac.hac_xencons.xa_device = "xencons";
	config_found(self, &hac.hac_xencons, hypervisor_print);
#endif
#if NXENNET > 0
	/* Network and block use their own scan routines, not config_found. */
	hac.hac_xennet.xa_device = "xennet";
	xennet_scan(self, &hac.hac_xennet, hypervisor_print);
#endif
#if NXBD > 0
	hac.hac_xbd.xa_device = "xbd";
	xbd_scan(self, &hac.hac_xbd, hypervisor_print);
#endif
#if NNPX > 0
	hac.hac_xennpx.xa_device = "npx";
	config_found(self, &hac.hac_xennpx, hypervisor_print);
#endif
#ifdef DOM0OPS
	/* Privileged (domain-0) only: kernfs nodes and control devices. */
	if (xen_start_info.flags & SIF_PRIVILEGED) {
		xenkernfs_init();
		xenprivcmd_init();
		xenmachmem_init();
		xenvfr_init();
	}
#endif
}

/*
 * Autoconf cfprint routine for children found on the hypervisor bus.
 */
int
hypervisor_print(aux, parent)
	void *aux;
	const char *parent;
{
	union hypervisor_attach_cookie *hac = aux;

	if (parent)
aprint_normal("%s at %s", hac->hac_device, parent); + return (UNCONF); +} + +void +hypervisor_notify_via_evtchn(unsigned int port) +{ + evtchn_op_t op; + + op.cmd = EVTCHNOP_send; + op.u.send.local_port = port; + (void)HYPERVISOR_event_channel_op(&op); +} + +#ifdef DOM0OPS + +#define DIR_MODE (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) + +kernfs_parentdir_t *kernxen_pkt; + +void +xenkernfs_init() +{ + kernfs_entry_t *dkt; + + KERNFS_ALLOCENTRY(dkt, M_TEMP, M_WAITOK); + KERNFS_INITENTRY(dkt, DT_DIR, "xen", NULL, KFSsubdir, VDIR, DIR_MODE); + kernfs_addentry(NULL, dkt); + kernxen_pkt = KERNFS_ENTOPARENTDIR(dkt); +} +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c new file mode 100644 index 0000000000..51219a980f --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c @@ -0,0 +1,1241 @@ +/* $NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $"); + +#include "opt_inet.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/syslog.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/device.h> +#include <sys/ioctl.h> +#include <sys/errno.h> +#if NRND > 0 +#include <sys/rnd.h> +#endif + +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_dl.h> +#include <net/if_ether.h> + +#ifdef mediacode +#include <net/if_media.h> +#endif + +#ifdef INET +#include <netinet/in.h> +#include <netinet/if_inarp.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#endif + +#include <nfs/rpcv2.h> + +#include <nfs/nfsproto.h> +#include <nfs/nfs.h> +#include <nfs/nfsmount.h> +#include <nfs/nfsdiskless.h> + +#include "bpfilter.h" +#if NBPFILTER > 0 +#include <net/bpf.h> +#include <net/bpfdesc.h> +#endif + +#include <uvm/uvm_extern.h> +#include <uvm/uvm_page.h> + +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/evtchn.h> +#include <machine/ctrl_if.h> + +#include <machine/if_xennetvar.h> + +#ifdef DEBUG +#define XENNET_DEBUG +#endif 
+#if defined(XENNET_DEBUG) && !defined(DEBUG) +#define DEBUG +#endif +/* #define XENNET_DEBUG_DUMP */ + +#ifdef XENNET_DEBUG +#define XEDB_FOLLOW 0x01 +#define XEDB_INIT 0x02 +#define XEDB_EVENT 0x04 +#define XEDB_MBUF 0x08 +#define XEDB_MEM 0x10 +int xennet_debug = 0x0; +#define DPRINTF(x) if (xennet_debug) printf x; +#define DPRINTFN(n,x) if (xennet_debug & (n)) printf x; +#else +#define DPRINTF(x) +#define DPRINTFN(n,x) +#endif +#define PRINTF(x) printf x; + +#ifdef XENNET_DEBUG_DUMP +static void xennet_hex_dump(unsigned char *, size_t, char *, int); +#endif + +int xennet_match (struct device *, struct cfdata *, void *); +void xennet_attach (struct device *, struct device *, void *); +static void xennet_ctrlif_rx(ctrl_msg_t *, unsigned long); +static void xennet_driver_status_change(netif_fe_driver_status_changed_t *); +static void xennet_status_change(netif_fe_interface_status_changed_t *); +static void xennet_tx_mbuf_free(struct mbuf *, caddr_t, size_t, void *); +static void xennet_rx_mbuf_free(struct mbuf *, caddr_t, size_t, void *); +static int xen_network_handler(void *); +static void network_tx_buf_gc(struct xennet_softc *); +static void network_alloc_rx_buffers(struct xennet_softc *); +static void network_alloc_tx_buffers(struct xennet_softc *); +void xennet_init(struct xennet_softc *); +void xennet_reset(struct xennet_softc *); +#ifdef mediacode +static int xennet_mediachange (struct ifnet *); +static void xennet_mediastatus(struct ifnet *, struct ifmediareq *); +#endif + +CFATTACH_DECL(xennet, sizeof(struct xennet_softc), + xennet_match, xennet_attach, NULL, NULL); + +#define TX_MAX_ENTRIES (NETIF_TX_RING_SIZE - 2) +#define RX_MAX_ENTRIES (NETIF_RX_RING_SIZE - 2) +#define TX_ENTRIES 128 +#define RX_ENTRIES 128 + +static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE]; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; + +/** Network interface info. */ +struct xennet_ctrl { + /** Number of interfaces. 
 */
	int xc_interfaces;
	/** Number of connected interfaces. */
	int xc_connected;
	/** Error code. */
	int xc_err;

	cfprint_t xc_cfprint;		/* autoconf print function for children */
	struct device *xc_parent;	/* hypervisor device children attach to */
};

/* xc_interfaces == -1: count not yet reported by the backend. */
static struct xennet_ctrl netctrl = { -1, 0, 0 };

#ifdef mediacode
static int xennet_media[] = {
	IFM_ETHER|IFM_AUTO,
};
static int nxennet_media = (sizeof(xennet_media)/sizeof(xennet_media[0]));
#endif


/*
 * Register the netif frontend with the domain controller and announce
 * driver-UP.  Actual interfaces attach later, when the controller
 * replies with a DRIVER_STATUS_CHANGED message.  Does nothing in
 * domains that host the backend themselves.
 */
int
xennet_scan(struct device *self, struct xennet_attach_args *xneta,
    cfprint_t print)
{
	ctrl_msg_t cmsg;
	netif_fe_driver_status_changed_t st;
	int err = 0;

	/* The init/backend domain does not run the frontend driver. */
	if ((xen_start_info.flags & SIF_INITDOMAIN) ||
	    (xen_start_info.flags & SIF_NET_BE_DOMAIN))
		return 0;

	netctrl.xc_parent = self;
	netctrl.xc_cfprint = print;

	printf("Initialising Xen virtual ethernet frontend driver.\n");

	(void)ctrl_if_register_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx,
	    CALLBACK_IN_BLOCKING_CONTEXT);

	/* Send a driver-UP notification to the domain controller. */
	cmsg.type = CMSG_NETIF_FE;
	cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
	cmsg.length = sizeof(netif_fe_driver_status_changed_t);
	st.status = NETIF_DRIVER_STATUS_UP;
	st.max_handle = 0;
	memcpy(cmsg.msg, &st, sizeof(st));
	ctrl_if_send_message_block(&cmsg, NULL, 0, 0);

#if 0
	err = xennet_wait_for_interfaces();
	if (err)
		ctrl_if_unregister_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx);
#endif

	return err;
}

/*
 * Probe: accept any attach args naming the "xennet" device.
 */
int
xennet_match(struct device *parent, struct cfdata *match, void *aux)
{
	struct xennet_attach_args *xa = (struct xennet_attach_args *)aux;

	if (strcmp(xa->xa_device, "xennet") == 0)
		return 1;
	return 0;
}

/*
 * Attach one virtual network interface: record its handle, fill in
 * the ifnet callbacks and chain the tx/rx buffer free lists.
 */
void
xennet_attach(struct device *parent, struct device *self, void *aux)
{
	struct xennet_attach_args *xneta = (struct xennet_attach_args *)aux;
	struct xennet_softc *sc = (struct xennet_softc *)self;
	struct ifnet *ifp = &sc->sc_ethercom.ec_if;
	int idx;

	aprint_normal(": Xen Virtual Network Interface\n");

	sc->sc_ifno = xneta->xa_handle;

	/* 
Initialize ifnet structure. */ + memcpy(ifp->if_xname, sc->sc_dev.dv_xname, IFNAMSIZ); + ifp->if_softc = sc; + ifp->if_start = xennet_start; + ifp->if_ioctl = xennet_ioctl; + ifp->if_watchdog = xennet_watchdog; + ifp->if_flags = IFF_BROADCAST | IFF_NOTRAILERS; + +#ifdef mediacode + ifmedia_init(&sc->sc_media, 0, xennet_mediachange, + xennet_mediastatus); + for (idx = 0; idx < nxennet_media; idx++) + ifmedia_add(&sc->sc_media, xennet_media[idx], 0, NULL); + ifmedia_set(&sc->sc_media, xennet_media[0]); +#endif + + for (idx = 0; idx < NETIF_TX_RING_SIZE; idx++) + sc->sc_tx_bufa[idx].xb_next = idx + 1; + for (idx = 0; idx < NETIF_RX_RING_SIZE; idx++) + sc->sc_rx_bufa[idx].xb_next = idx + 1; +} + +static struct xennet_softc * +find_device(int handle) +{ + struct device *dv; + struct xennet_softc *xs = NULL; + + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (dv->dv_cfattach == NULL || + dv->dv_cfattach->ca_attach != xennet_attach) + continue; + xs = (struct xennet_softc *)dv; + if (xs->sc_ifno == handle) + break; + } + return xs; +} + +static void +xennet_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + int respond = 1; + + switch (msg->subtype) { + case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED: + if (msg->length != sizeof(netif_fe_interface_status_changed_t)) + goto error; + xennet_status_change( + (netif_fe_interface_status_changed_t *)&msg->msg[0]); + break; + + case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED: + if (msg->length != sizeof(netif_fe_driver_status_changed_t)) + goto error; + xennet_driver_status_change( + (netif_fe_driver_status_changed_t *)&msg->msg[0]); + break; + + error: + default: + msg->length = 0; + break; + } + + if (respond) + ctrl_if_send_response(msg); +} + +static void +xennet_driver_status_change(netif_fe_driver_status_changed_t *status) +{ + struct xennet_attach_args xneta; + int i; + + DPRINTFN(XEDB_EVENT, ("> max_handle=%d\n", status->max_handle)); + + /* XXX FIXME: Abuse of 'max_handle' as interface count. 
*/ + netctrl.xc_interfaces = status->max_handle; + netctrl.xc_connected = 0; + + xneta.xa_device = "xennet"; + + for (i = 0; i < netctrl.xc_interfaces; i++) { + xneta.xa_handle = i; + config_found(netctrl.xc_parent, &xneta, netctrl.xc_cfprint); + } +} + +static void +xennet_status_change(netif_fe_interface_status_changed_t *status) +{ + ctrl_msg_t cmsg; + netif_fe_interface_connect_t up; + struct xennet_softc *sc; + struct ifnet *ifp; + struct vm_page *pg_tx, *pg_rx; + + DPRINTFN(XEDB_EVENT, (">\n")); + DPRINTFN(XEDB_EVENT, ("> status=%d handle=%d mac=%02x:%02x:%02x:%02x:%02x:%02x\n", + status->status, + status->handle, + status->mac[0], status->mac[1], status->mac[2], + status->mac[3], status->mac[4], status->mac[5])); + + if (netctrl.xc_interfaces <= 0) { + printf("Status change: no interfaces\n"); + return; + } + + sc = find_device(status->handle); + if (sc == NULL) { + printf("Status change: invalid netif handle %u\n", + status->handle); + return; + } + ifp = &sc->sc_ethercom.ec_if; + + switch (status->status) { + case NETIF_INTERFACE_STATUS_DESTROYED: + printf("Unexpected netif-DESTROYED message in state %d\n", + sc->sc_backend_state); + break; + + case NETIF_INTERFACE_STATUS_DISCONNECTED: +#if 0 + if (sc->sc_backend_state != BEST_CLOSED) { + printk("Unexpected netif-DISCONNECTED message" + " in state %d\n", sc->sc_backend_state); + printk("Attempting to reconnect network interface\n"); + + /* Begin interface recovery. + * + * NB. Whilst we're recovering, we turn the + * carrier state off. We take measures to + * ensure that this device isn't used for + * anything. We also stop the queue for this + * device. Various different approaches + * (e.g. continuing to buffer packets) have + * been tested but don't appear to improve the + * overall impact on TCP connections. 
+ * + * TODO: (MAW) Change the Xend<->Guest + * protocol so that a recovery is initiated by + * a special "RESET" message - disconnect + * could just mean we're not allowed to use + * this interface any more. + */ + + /* Stop old i/f to prevent errors whilst we + * rebuild the state. */ + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + netif_stop_queue(dev); + np->backend_state = BEST_DISCONNECTED; + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); + + /* Free resources. */ + free_irq(np->irq, dev); + unbind_evtchn_from_irq(np->evtchn); + free_page((unsigned long)np->tx); + free_page((unsigned long)np->rx); + } +#endif + + /* Move from CLOSED to DISCONNECTED state. */ + sc->sc_tx = (netif_tx_interface_t *) + uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE); + if (sc->sc_tx == NULL) + panic("netif: no tx va"); + sc->sc_rx = (netif_rx_interface_t *) + uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE); + if (sc->sc_rx == NULL) + panic("netif: no rx va"); + pg_tx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); + if (pg_tx == NULL) { + panic("netif: no tx pages"); + } + pmap_kenter_pa((vaddr_t)sc->sc_tx, VM_PAGE_TO_PHYS(pg_tx), + VM_PROT_READ | VM_PROT_WRITE); + pg_rx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); + if (pg_rx == NULL) { + panic("netif: no rx pages"); + } + pmap_kenter_pa((vaddr_t)sc->sc_rx, VM_PAGE_TO_PHYS(pg_rx), + VM_PROT_READ | VM_PROT_WRITE); + sc->sc_backend_state = BEST_DISCONNECTED; + + /* Construct an interface-CONNECT message for the + * domain controller. */ + cmsg.type = CMSG_NETIF_FE; + cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(netif_fe_interface_connect_t); + up.handle = status->handle; + up.tx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_tx)) >> PAGE_SHIFT; + up.rx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_rx)) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. 
*/ + ctrl_if_send_message_block(&cmsg, NULL, 0, 0); + break; + + case NETIF_INTERFACE_STATUS_CONNECTED: + if (sc->sc_backend_state == BEST_CLOSED) { + printf("Unexpected netif-CONNECTED message" + " in state %d\n", sc->sc_backend_state); + break; + } + + memcpy(sc->sc_enaddr, status->mac, ETHER_ADDR_LEN); +#if 0 + if (xen_start_info.flags & SIF_PRIVILEGED) { + /* XXX for domain-0 change out ethernet address to be + * different than the physical address since arp + * replies from other domains will report the physical + * address. + */ + if (sc->sc_enaddr[0] != 0xaa) + sc->sc_enaddr[0] = 0xaa; + else + sc->sc_enaddr[0] = 0xab; + } +#endif + + /* Recovery procedure: */ + + /* Step 1: Reinitialise variables. */ + sc->sc_rx_resp_cons = sc->sc_tx_resp_cons = /* sc->sc_tx_full = */ 0; + sc->sc_rx->event = sc->sc_tx->event = 1; + + /* Step 2: Rebuild the RX and TX ring contents. */ + network_alloc_rx_buffers(sc); + SLIST_INIT(&sc->sc_tx_bufs); + network_alloc_tx_buffers(sc); + + /* Step 3: All public and private state should now be + * sane. Get ready to start sending and receiving + * packets and give the driver domain a kick because + * we've probably just requeued some packets. 
+ */ + sc->sc_backend_state = BEST_CONNECTED; + __insn_barrier(); + hypervisor_notify_via_evtchn(status->evtchn); + network_tx_buf_gc(sc); + + if_attach(ifp); + ether_ifattach(ifp, sc->sc_enaddr); + + sc->sc_evtchn = status->evtchn; + sc->sc_irq = bind_evtchn_to_irq(sc->sc_evtchn); + event_set_handler(sc->sc_irq, &xen_network_handler, sc, IPL_NET); + hypervisor_enable_irq(sc->sc_irq); + netctrl.xc_connected++; + + aprint_normal("%s: MAC address %s\n", sc->sc_dev.dv_xname, + ether_sprintf(sc->sc_enaddr)); + +#if NRND > 0 + rnd_attach_source(&sc->rnd_source, sc->sc_dev.dv_xname, + RND_TYPE_NET, 0); +#endif + break; + + default: + printf("Status change to unknown value %d\n", + status->status); + break; + } +} + +static void +xennet_tx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg) +{ + struct xennet_txbuf *txbuf = (struct xennet_txbuf *)arg; + + DPRINTFN(XEDB_MBUF, ("xennet_tx_mbuf_free %p pa %p\n", txbuf, + (void *)txbuf->xt_pa)); + SLIST_INSERT_HEAD(&txbuf->xt_sc->sc_tx_bufs, txbuf, xt_next); + pool_cache_put(&mbpool_cache, m); +} + +static void +xennet_rx_push_buffer(struct xennet_softc *sc, int id) +{ + NETIF_RING_IDX ringidx; + int nr_pfns; + + ringidx = sc->sc_rx->req_prod; + nr_pfns = 0; + + DPRINTFN(XEDB_MEM, ("readding page va %p pa %p ma %p/%p to rx_ring " + "at %d with id %d\n", + (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_va, + (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_pa, + (void *)(PTE_BASE[x86_btop + (sc->sc_rx_bufa[id].xb_rx.xbrx_va)] & + PG_FRAME), + (void *)xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa), + ringidx, id)); + + sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id; + + rx_pfn_array[nr_pfns] = xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa) + >> PAGE_SHIFT; + + /* Remove this page from pseudo phys map before + * passing back to Xen. 
*/ + xpmap_phys_to_machine_mapping[(sc->sc_rx_bufa[id].xb_rx.xbrx_pa - XPMAP_OFFSET) >> PAGE_SHIFT] = + INVALID_P2M_ENTRY; + + rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; + rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[1] = 0; + rx_mcl[nr_pfns].args[2] = 0; + + nr_pfns++; + + sc->sc_rx_bufs_to_notify++; + + ringidx++; + + /* + * We may have allocated buffers which have entries + * outstanding in the page update queue -- make sure we flush + * those first! + */ + xpq_flush_queue(); + + /* After all PTEs have been zapped we blow away stale TLB entries. */ + rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages. */ + rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op; + rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation; + rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array; + rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns; + rx_mcl[nr_pfns].args[3] = 0; + rx_mcl[nr_pfns].args[4] = DOMID_SELF; + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). */ + if ( rx_mcl[nr_pfns].args[5] != nr_pfns ) + panic("Unable to reduce memory reservation\n"); + + /* Above is a suitable barrier to ensure backend will see requests. 
*/ + sc->sc_rx->req_prod = ringidx; +} + +static void +xennet_rx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg) +{ + union xennet_bufarray *xb = (union xennet_bufarray *)arg; + struct xennet_softc *sc = xb->xb_rx.xbrx_sc; + int id = (xb - sc->sc_rx_bufa); + + DPRINTFN(XEDB_MBUF, ("xennet_rx_mbuf_free id %d, mbuf %p, buf %p, " + "size %d\n", id, m, buf, size)); + + xennet_rx_push_buffer(sc, id); + + pool_cache_put(&mbpool_cache, m); +} + +static int +xen_network_handler(void *arg) +{ + struct xennet_softc *sc = arg; + struct ifnet *ifp = &sc->sc_ethercom.ec_if; + netif_rx_response_t *rx; + paddr_t pa; + NETIF_RING_IDX ringidx; + mmu_update_t *mmu = rx_mmu; + multicall_entry_t *mcl = rx_mcl; + struct mbuf *m; + + network_tx_buf_gc(sc); + + again: + for (ringidx = sc->sc_rx_resp_cons; + ringidx != sc->sc_rx->resp_prod; + ringidx++) { + rx = &sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].resp; + + if (rx->status < 0) + panic("rx->status < 0"); + /* XXXcl check rx->status for error */ + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + printf("xennet: rx no mbuf\n"); + break; + } + + pa = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_pa; + + DPRINTFN(XEDB_EVENT, ("rx event %d for id %d, size %d, " + "status %d, ma %08lx, pa %08lx\n", ringidx, + rx->id, rx->status, rx->status, rx->addr, pa)); + + /* Remap the page. */ + mmu->ptr = (rx->addr & PG_FRAME) | MMU_MACHPHYS_UPDATE; + mmu->val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT; + mmu++; + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT; + mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW; + mcl->args[2] = UVMF_FLUSH_TLB; // 0; + mcl++; + + xpmap_phys_to_machine_mapping + [(pa - XPMAP_OFFSET) >> PAGE_SHIFT] = + rx->addr >> PAGE_SHIFT; + + /* Do all the remapping work, and M->P updates, in one + * big hypercall. 
*/ + if ((mcl - rx_mcl) != 0) { + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl++; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + } + if (0) + printf("page mapped at va %08lx -> %08x/%08lx\n", + sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va, + PTE_BASE[x86_btop(sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)], + rx->addr); + mmu = rx_mmu; + mcl = rx_mcl; + + DPRINTFN(XEDB_MBUF, ("rx packet mbuf %p va %p pa %p/%p " + "ma %p\n", m, + (void *)sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va, + (void *)(xpmap_mtop(PTE_BASE[x86_btop + (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME)), (void *)pa, + (void *)(PTE_BASE[x86_btop + (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME))); + + m->m_len = m->m_pkthdr.len = rx->status; + m->m_pkthdr.rcvif = ifp; + if (sc->sc_rx->req_prod != sc->sc_rx->resp_prod) { + MEXTADD(m, (void *)(sc->sc_rx_bufa[rx->id].xb_rx. + xbrx_va + (rx->addr & PAGE_MASK)), rx->status, M_DEVBUF, + xennet_rx_mbuf_free, + &sc->sc_rx_bufa[rx->id]); + } else { + /* + * This was our last receive buffer, allocate + * memory, copy data and push the receive + * buffer back to the hypervisor. + */ + MEXTMALLOC(m, rx->status, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + printf("xennet: rx no mbuf 2\n"); + m_free(m); + break; + } + memcpy(m->m_data, (void *)(sc->sc_rx_bufa[rx->id]. + xb_rx.xbrx_va + (rx->addr & PAGE_MASK)), rx->status); + xennet_rx_push_buffer(sc, rx->id); + } + +#ifdef XENNET_DEBUG_DUMP + xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "r", rx->id); +#endif + +#if NBPFILTER > 0 + /* + * Pass packet to bpf if there is a listener. + */ + if (ifp->if_bpf) + bpf_mtap(ifp->if_bpf, m); +#endif + + ifp->if_ipackets++; + + /* Pass the packet up. 
*/ + (*ifp->if_input)(ifp, m); + } + + sc->sc_rx_resp_cons = ringidx; + sc->sc_rx->event = sc->sc_rx_resp_cons + 1; + + if (sc->sc_rx->resp_prod != ringidx) + goto again; + + return 0; +} + +static inline int +get_bufarray_entry(union xennet_bufarray *a) +{ + int idx; + + idx = a[0].xb_next; + a[0].xb_next = a[idx].xb_next; + return idx; +} + +static inline void +put_bufarray_entry(union xennet_bufarray *a, int idx) +{ + + a[idx].xb_next = a[0].xb_next; + a[0].xb_next = idx; +} + +static void +network_tx_buf_gc(struct xennet_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ethercom.ec_if; + NETIF_RING_IDX idx, prod; + + do { + prod = sc->sc_tx->resp_prod; + + for (idx = sc->sc_tx_resp_cons; idx != prod; idx++) { + DPRINTFN(XEDB_EVENT, ("tx event at pos %d, status: " + "%d, id: %d, mbuf %p, buf %p\n", idx, + sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.status, + sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id, + sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m, + mtod(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m, void *))); + m_freem(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m); + put_bufarray_entry(sc->sc_tx_bufa, + sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id); + sc->sc_tx_entries--; /* atomic */ + } + + sc->sc_tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of + * tx_cons. 
+ */ + sc->sc_tx->event = /* atomic */ + prod + (sc->sc_tx_entries >> 1) + 1; + __insn_barrier(); + } while (prod != sc->sc_tx->resp_prod); + + if (sc->sc_tx->resp_prod == sc->sc_tx->req_prod) + ifp->if_timer = 0; + /* KDASSERT(sc->sc_net_idx->tx_req_prod == */ + /* TX_RING_ADD(sc->sc_net_idx->tx_resp_prod, sc->sc_tx_entries)); */ +} + +static void +network_alloc_rx_buffers(struct xennet_softc *sc) +{ + vaddr_t rxpages, va; + paddr_t pa; + struct vm_page *pg; + int id, nr_pfns; + NETIF_RING_IDX ringidx; + int s; + + ringidx = sc->sc_rx->req_prod; + if (0) printf("network_alloc_rx_buffers prod %d cons %d\n", ringidx, + sc->sc_rx_resp_cons); + if ((ringidx - sc->sc_rx_resp_cons) > (RX_MAX_ENTRIES / 2)) + return; + + nr_pfns = 0; + + rxpages = uvm_km_valloc_align(kernel_map, RX_ENTRIES * PAGE_SIZE, + PAGE_SIZE); + + s = splnet(); + for (va = rxpages; va < rxpages + RX_ENTRIES * PAGE_SIZE; + va += PAGE_SIZE) { + pg = uvm_pagealloc(NULL, 0, NULL, 0); + if (pg == NULL) + panic("network_alloc_rx_buffers: no pages"); + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), + VM_PROT_READ | VM_PROT_WRITE); + + id = get_bufarray_entry(sc->sc_rx_bufa); + sc->sc_rx_bufa[id].xb_rx.xbrx_va = va; + sc->sc_rx_bufa[id].xb_rx.xbrx_sc = sc; + + pa = VM_PAGE_TO_PHYS(pg); + DPRINTFN(XEDB_MEM, ("adding page va %p pa %p/%p " + "ma %p/%p to rx_ring at %d with id %d\n", (void *)va, + (void *)(VM_PAGE_TO_PHYS(pg) & PG_FRAME), (void *)xpmap_mtop(PTE_BASE[x86_btop(va)]), + (void *)(PTE_BASE[x86_btop(va)] & PG_FRAME), + (void *)xpmap_ptom(VM_PAGE_TO_PHYS(pg)), + ringidx, id)); + sc->sc_rx_bufa[id].xb_rx.xbrx_pa = pa; + sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id; + + rx_pfn_array[nr_pfns] = xpmap_ptom(pa) >> PAGE_SHIFT; + + /* Remove this page from pseudo phys map before + * passing back to Xen. 
*/ + xpmap_phys_to_machine_mapping[(pa - XPMAP_OFFSET) >> PAGE_SHIFT] = + INVALID_P2M_ENTRY; + + rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; + rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[1] = 0; + rx_mcl[nr_pfns].args[2] = 0; + + nr_pfns++; + + sc->sc_rx_bufs_to_notify++; + + ringidx++; + if ((ringidx - sc->sc_rx_resp_cons) == RX_MAX_ENTRIES) + break; + } + + if (nr_pfns == 0) { + splx(s); + return; + } + + /* + * We may have allocated buffers which have entries + * outstanding in the page update queue -- make sure we flush + * those first! + */ + xpq_flush_queue(); + + /* After all PTEs have been zapped we blow away stale TLB entries. */ + rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages. */ + rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op; + rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation; + rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array; + rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns; + rx_mcl[nr_pfns].args[3] = 0; + rx_mcl[nr_pfns].args[4] = DOMID_SELF; + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). */ + if (rx_mcl[nr_pfns].args[5] != nr_pfns) + panic("Unable to reduce memory reservation\n"); + + /* Above is a suitable barrier to ensure backend will see requests. 
*/ + sc->sc_rx->req_prod = ringidx; + + splx(s); + +} + +static void +network_alloc_tx_buffers(struct xennet_softc *sc) +{ + vaddr_t txpages, va; + struct vm_page *pg; + struct xennet_txbuf *txbuf; + int i; + + txpages = uvm_km_valloc_align(kernel_map, + (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE, PAGE_SIZE); + for (va = txpages; + va < txpages + (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE; + va += PAGE_SIZE) { + pg = uvm_pagealloc(NULL, 0, NULL, 0); + if (pg == NULL) + panic("network_alloc_tx_buffers: no pages"); + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), + VM_PROT_READ | VM_PROT_WRITE); + + for (i = 0; i < TXBUF_PER_PAGE; i++) { + txbuf = (struct xennet_txbuf *) + (va + i * (PAGE_SIZE / TXBUF_PER_PAGE)); + txbuf->xt_sc = sc; + txbuf->xt_pa = VM_PAGE_TO_PHYS(pg) + + i * (PAGE_SIZE / TXBUF_PER_PAGE) + + sizeof(struct xennet_txbuf); + SLIST_INSERT_HEAD(&sc->sc_tx_bufs, txbuf, xt_next); + } + } +} + +/* + * Called at splnet. + */ +void +xennet_start(struct ifnet *ifp) +{ + struct xennet_softc *sc = ifp->if_softc; + struct mbuf *m, *new_m; + struct xennet_txbuf *txbuf; + netif_tx_request_t *txreq; + NETIF_RING_IDX idx; + paddr_t pa; + int bufid; + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start()\n", sc->sc_dev.dv_xname)); + +#ifdef DIAGNOSTIC + IFQ_POLL(&ifp->if_snd, m); + if (m == 0) + panic("%s: No packet to start", sc->sc_dev.dv_xname); +#endif + + if ((ifp->if_flags & (IFF_RUNNING | IFF_OACTIVE)) != IFF_RUNNING) + return; + + idx = sc->sc_tx->req_prod; + while (/*CONSTCOND*/1) { + + IFQ_POLL(&ifp->if_snd, m); + if (m == NULL) + break; + + switch (m->m_flags & (M_EXT|M_EXT_CLUSTER)) { + case M_EXT|M_EXT_CLUSTER: + pa = m->m_ext.ext_paddr + + (m->m_data - m->m_ext.ext_buf); + break; + default: + case 0: + pa = m->m_paddr + M_BUFOFFSET(m) + + (m->m_data - M_BUFADDR(m)); + break; + } + + if (m->m_pkthdr.len != m->m_len || + (pa ^ (pa + m->m_pkthdr.len)) & PG_FRAME) { + txbuf = SLIST_FIRST(&sc->sc_tx_bufs); + if (txbuf == NULL) { + // printf("xennet: no tx bufs\n"); + break; + 
} + + MGETHDR(new_m, M_DONTWAIT, MT_DATA); + if (new_m == NULL) { + printf("xennet: no mbuf\n"); + break; + } + + SLIST_REMOVE_HEAD(&sc->sc_tx_bufs, xt_next); + IFQ_DEQUEUE(&ifp->if_snd, m); + + KASSERT(m->m_flags & M_PKTHDR); + M_COPY_PKTHDR(new_m, m); + m_copydata(m, 0, m->m_pkthdr.len, txbuf->xt_buf); + MEXTADD(new_m, txbuf->xt_buf, m->m_pkthdr.len, + M_DEVBUF, xennet_tx_mbuf_free, txbuf); + new_m->m_ext.ext_paddr = txbuf->xt_pa; + new_m->m_len = new_m->m_pkthdr.len = m->m_pkthdr.len; + + m_freem(m); + m = new_m; + + pa = m->m_ext.ext_paddr + + (m->m_data - m->m_ext.ext_buf); + } else + IFQ_DEQUEUE(&ifp->if_snd, m); + + bufid = get_bufarray_entry(sc->sc_tx_bufa); + sc->sc_tx_bufa[bufid].xb_tx.xbtx_m = m; + + DPRINTFN(XEDB_MBUF, ("xennet_start id %d, mbuf %p, buf %p/%p, " + "size %d\n", bufid, m, mtod(m, void *), + (void *)pa, m->m_pkthdr.len)); +#ifdef XENNET_DEBUG_DUMP + xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "s", bufid); +#endif + + txreq = &sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].req; + txreq->id = bufid; + txreq->addr = xpmap_ptom(pa); + txreq->size = m->m_pkthdr.len; + + __insn_barrier(); + idx++; + sc->sc_tx->req_prod = idx; + + sc->sc_tx_entries++; /* XXX atomic */ + +#ifdef XENNET_DEBUG + DPRINTFN(XEDB_MEM, ("packet addr %p/%p, physical %p/%p, " + "m_paddr %p, len %d/%d\n", M_BUFADDR(m), mtod(m, void *), + (void *)*kvtopte(mtod(m, vaddr_t)), + (void *)xpmap_mtop(*kvtopte(mtod(m, vaddr_t))), + (void *)m->m_paddr, m->m_pkthdr.len, m->m_len)); +#endif + +#if NBPFILTER > 0 + /* + * Pass packet to bpf if there is a listener. 
+ */ + if (ifp->if_bpf) + bpf_mtap(ifp->if_bpf, m); +#endif + } + + ifp->if_flags &= ~IFF_OACTIVE; + + network_tx_buf_gc(sc); + + __insn_barrier(); + if (sc->sc_tx->resp_prod != idx) + hypervisor_notify_via_evtchn(sc->sc_evtchn); + + ifp->if_timer = 5; + + ifp->if_opackets++; + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start() done\n", + sc->sc_dev.dv_xname)); +} + +int +xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct xennet_softc *sc = ifp->if_softc; + struct ifaddr *ifa = (struct ifaddr *)data; +#ifdef mediacode + struct ifreq *ifr = (struct ifreq *)data; +#endif + int s, error = 0; + + s = splnet(); + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl()\n", sc->sc_dev.dv_xname)); + + switch(cmd) { + case SIOCSIFADDR: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFADDR\n", + sc->sc_dev.dv_xname)); + ifp->if_flags |= IFF_UP; + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + xennet_init(sc); + arp_ifinit(ifp, ifa); + break; +#endif + default: + xennet_init(sc); + break; + } + break; + + case SIOCSIFFLAGS: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFFLAGS\n", + sc->sc_dev.dv_xname)); + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*MULTI\n", + sc->sc_dev.dv_xname)); + break; + +#ifdef mediacode + case SIOCGIFMEDIA: + case SIOCSIFMEDIA: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*IFMEDIA\n", + sc->sc_dev.dv_xname)); + error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); + break; +#endif + + default: + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl(0x%lx) unknown cmd\n", + sc->sc_dev.dv_xname, cmd)); + error = EINVAL; + break; + } + + splx(s); + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() returning %d\n", + sc->sc_dev.dv_xname, error)); + + return error; +} + +void +xennet_watchdog(struct ifnet *ifp) +{ + + panic("xennet_watchdog\n"); +} + +void +xennet_init(struct xennet_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ethercom.ec_if; + + DPRINTFN(XEDB_FOLLOW, ("%s: 
xennet_init()\n", sc->sc_dev.dv_xname)); + + if (ifp->if_flags & IFF_UP) { + if ((ifp->if_flags & IFF_RUNNING) == 0) + xennet_reset(sc); + + ifp->if_flags |= IFF_RUNNING; + ifp->if_flags &= ~IFF_OACTIVE; + ifp->if_timer = 0; + } else { + ifp->if_flags &= ~IFF_RUNNING; + xennet_reset(sc); + } +} + +void +xennet_reset(struct xennet_softc *sc) +{ + + DPRINTFN(XEDB_FOLLOW, ("%s: xennet_reset()\n", sc->sc_dev.dv_xname)); +} + +#ifdef mediacode +/* + * Media change callback. + */ +static int +xennet_mediachange(struct ifnet *ifp) +{ + struct xennet_softc *sc = ifp->if_softc; + + switch IFM_SUBTYPE(sc->sc_media.ifm_media) { + case IFM_AUTO: + break; + default: + return (1); + break; + } + + return (0); +} + +/* + * Media status callback. + */ +static void +xennet_mediastatus(struct ifnet *ifp, struct ifmediareq *ifmr) +{ + struct xennet_softc *sc = ifp->if_softc; + + if (IFM_SUBTYPE(ifmr->ifm_active) == IFM_AUTO) + ifmr->ifm_active = sc->sc_media.ifm_cur->ifm_data; + + ifmr->ifm_status &= ~IFM_AVALID; +} +#endif + +int +xennet_bootstatic_callback(struct nfs_diskless *nd) +{ + struct ifnet *ifp = nd->nd_ifp; + struct xennet_softc *sc = (struct xennet_softc *)ifp->if_softc; + union xen_cmdline_parseinfo xcp; + struct sockaddr_in *sin; + + memset(&xcp, 0, sizeof(xcp.xcp_netinfo)); + xcp.xcp_netinfo.xi_ifno = sc->sc_ifno; + xcp.xcp_netinfo.xi_root = nd->nd_root.ndm_host; + xen_parse_cmdline(XEN_PARSE_NETINFO, &xcp); + + nd->nd_myip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[0]); + nd->nd_gwip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[2]); + nd->nd_mask.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[3]); + + sin = (struct sockaddr_in *) &nd->nd_root.ndm_saddr; + memset((caddr_t)sin, 0, sizeof(*sin)); + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[1]); + + return (NFS_BOOTSTATIC_HAS_MYIP|NFS_BOOTSTATIC_HAS_GWIP| + NFS_BOOTSTATIC_HAS_MASK|NFS_BOOTSTATIC_HAS_SERVADDR| + NFS_BOOTSTATIC_HAS_SERVER); +} + + +#ifdef XENNET_DEBUG_DUMP 
+#define XCHR(x) "0123456789abcdef"[(x) & 0xf] +static void +xennet_hex_dump(unsigned char *pkt, size_t len, char *type, int id) +{ + size_t i, j; + + printf("pkt %p len %d/%x type %s id %d\n", pkt, len, len, type, id); + printf("00000000 "); + for(i=0; i<len; i++) { + printf("%c%c ", XCHR(pkt[i]>>4), XCHR(pkt[i])); + if ((i+1) % 16 == 8) + printf(" "); + if ((i+1) % 16 == 0) { + printf(" %c", '|'); + for(j=0; j<16; j++) + printf("%c", pkt[i-15+j]>=32 && + pkt[i-15+j]<127?pkt[i-15+j]:'.'); + printf("%c\n%c%c%c%c%c%c%c%c ", '|', + XCHR((i+1)>>28), XCHR((i+1)>>24), + XCHR((i+1)>>20), XCHR((i+1)>>16), + XCHR((i+1)>>12), XCHR((i+1)>>8), + XCHR((i+1)>>4), XCHR(i+1)); + } + } + printf("\n"); +} +#undef XCHR +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c new file mode 100644 index 0000000000..b72ffc95a1 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c @@ -0,0 +1,1368 @@ +/* $NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $"); + +#include "xbd.h" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/pool.h> +#include <sys/ioctl.h> +#include <sys/device.h> +#include <sys/disk.h> +#include <sys/disklabel.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/conf.h> +#include <sys/queue.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/kernel.h> +#include <sys/kthread.h> + +#include <uvm/uvm.h> + +#include <dev/dkvar.h> +#include <machine/xbdvar.h> +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs/hypervisor-if.h> +#include <machine/hypervisor-ifs/vbd.h> +#include <machine/evtchn.h> + + +static void xbd_attach(struct device *, struct device *, void *); +static int xbd_detach(struct device *, int); + +#if NXBD > 0 +int xbd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(xbd, sizeof(struct xbd_softc), + xbd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver xbd_cd; +#endif + +#if NWD > 0 +int 
xbd_wd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(wd, sizeof(struct xbd_softc), + xbd_wd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver wd_cd; +#endif + +#if NSD > 0 +int xbd_sd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(sd, sizeof(struct xbd_softc), + xbd_sd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver sd_cd; +#endif + +#if NCD > 0 +int xbd_cd_match(struct device *, struct cfdata *, void *); +CFATTACH_DECL(cd, sizeof(struct xbd_softc), + xbd_cd_match, xbd_attach, xbd_detach, NULL); + +extern struct cfdriver cd_cd; +#endif + + +dev_type_open(xbdopen); +dev_type_close(xbdclose); +dev_type_read(xbdread); +dev_type_write(xbdwrite); +dev_type_ioctl(xbdioctl); +dev_type_ioctl(xbdioctl_cdev); +dev_type_strategy(xbdstrategy); +dev_type_dump(xbddump); +dev_type_size(xbdsize); + +#if NXBD > 0 +const struct bdevsw xbd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct cdevsw xbd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_major; +#endif + +#if NWD > 0 +const struct bdevsw wd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct cdevsw wd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_wd_major; +static dev_t xbd_wd_cdev_major; +#endif + +#if NSD > 0 +const struct bdevsw sd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct cdevsw sd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_sd_major; +static dev_t xbd_sd_cdev_major; +#endif + +#if NCD > 0 +const struct bdevsw cd_bdevsw = { + xbdopen, xbdclose, xbdstrategy, xbdioctl, + xbddump, xbdsize, D_DISK +}; + +const struct 
cdevsw cd_cdevsw = { + xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev, + nostop, notty, nopoll, nommap, nokqfilter, D_DISK +}; + +static dev_t xbd_cd_major; +static dev_t xbd_cd_cdev_major; +#endif + + +static int xbdstart(struct dk_softc *, struct buf *); +static int xbd_response_handler(void *); +static void xbd_update_create_kthread(void *); +static void xbd_update_kthread(void *); +static int xbd_update_handler(void *); + +static int xbdinit(struct xbd_softc *, xen_disk_t *, struct dk_intf *); + +/* Pseudo-disk Interface */ +static struct dk_intf dkintf_esdi = { + DTYPE_ESDI, + "Xen Virtual ESDI", + xbdopen, + xbdclose, + xbdstrategy, + xbdstart, +}; +#if NSD > 0 +static struct dk_intf dkintf_scsi = { + DTYPE_SCSI, + "Xen Virtual SCSI", + xbdopen, + xbdclose, + xbdstrategy, + xbdstart, +}; +#endif + +#if NXBD > 0 +static struct xbd_attach_args xbd_ata = { + .xa_device = "xbd", + .xa_dkintf = &dkintf_esdi, +}; +#endif + +#if NWD > 0 +static struct xbd_attach_args wd_ata = { + .xa_device = "wd", + .xa_dkintf = &dkintf_esdi, +}; +#endif + +#if NSD > 0 +static struct xbd_attach_args sd_ata = { + .xa_device = "sd", + .xa_dkintf = &dkintf_scsi, +}; +#endif + +#if NCD > 0 +static struct xbd_attach_args cd_ata = { + .xa_device = "cd", + .xa_dkintf = &dkintf_esdi, +}; +#endif + +static struct sysctlnode *diskcookies; + + +#if defined(XBDDEBUG) && !defined(DEBUG) +#define DEBUG +#endif + +#ifdef DEBUG +int xbddebug = 0; + +#define XBDB_FOLLOW 0x1 +#define XBDB_IO 0x2 +#define XBDB_SETUP 0x4 +#define XBDB_HOTPLUG 0x8 + +#define IFDEBUG(x,y) if (xbddebug & (x)) y +#define DPRINTF(x,y) IFDEBUG(x, printf y) +#define DPRINTF_FOLLOW(y) DPRINTF(XBDB_FOLLOW, y) +#define DEBUG_MARK_UNUSED(_xr) (_xr)->xr_sc = (void *)0xdeadbeef + +struct xbdreq *xbd_allxr; +#else +#define IFDEBUG(x,y) +#define DPRINTF(x,y) +#define DPRINTF_FOLLOW(y) +#define DEBUG_MARK_UNUSED(_xr) +#endif + +#ifdef DIAGNOSTIC +#define DIAGPANIC(x) panic x +#define DIAGCONDPANIC(x,y) if (x) panic y +#else 
+#define DIAGPANIC(x) +#define DIAGCONDPANIC(x,y) +#endif + + +struct xbdreq { + union { + SLIST_ENTRY(xbdreq) _unused; /* ptr. to next free xbdreq */ + SIMPLEQ_ENTRY(xbdreq) _suspended; + /* link when on suspended queue. */ + } _link; + struct xbdreq *xr_parent; /* ptr. to parent xbdreq */ + struct buf *xr_bp; /* ptr. to original I/O buf */ + daddr_t xr_bn; /* block no. to process */ + long xr_bqueue; /* bytes left to queue */ + long xr_bdone; /* bytes left */ + vaddr_t xr_data; /* ptr. to data to be proc. */ + vaddr_t xr_aligned; /* ptr. to aligned data */ + long xr_breq; /* bytes in this req. */ + struct xbd_softc *xr_sc; /* ptr. to xbd softc */ +}; +#define xr_unused _link._unused +#define xr_suspended _link._suspended + +SLIST_HEAD(,xbdreq) xbdreqs = + SLIST_HEAD_INITIALIZER(xbdreqs); +static SIMPLEQ_HEAD(, xbdreq) xbdr_suspended = + SIMPLEQ_HEAD_INITIALIZER(xbdr_suspended); + +#define CANGET_XBDREQ() (!SLIST_EMPTY(&xbdreqs)) + +#define GET_XBDREQ(_xr) do { \ + (_xr) = SLIST_FIRST(&xbdreqs); \ + if (__predict_true(_xr)) \ + SLIST_REMOVE_HEAD(&xbdreqs, xr_unused); \ +} while (/*CONSTCOND*/0) + +#define PUT_XBDREQ(_xr) do { \ + DEBUG_MARK_UNUSED(_xr); \ + SLIST_INSERT_HEAD(&xbdreqs, _xr, xr_unused); \ +} while (/*CONSTCOND*/0) + +static struct bufq_state bufq; +static int bufq_users = 0; + +#define XEN_MAJOR(_dev) ((_dev) >> 8) +#define XEN_MINOR(_dev) ((_dev) & 0xff) + +#define XEN_SCSI_DISK0_MAJOR 8 +#define XEN_SCSI_DISK1_MAJOR 65 +#define XEN_SCSI_DISK2_MAJOR 66 +#define XEN_SCSI_DISK3_MAJOR 67 +#define XEN_SCSI_DISK4_MAJOR 68 +#define XEN_SCSI_DISK5_MAJOR 69 +#define XEN_SCSI_DISK6_MAJOR 70 +#define XEN_SCSI_DISK7_MAJOR 71 +#define XEN_SCSI_DISK8_MAJOR 128 +#define XEN_SCSI_DISK9_MAJOR 129 +#define XEN_SCSI_DISK10_MAJOR 130 +#define XEN_SCSI_DISK11_MAJOR 131 +#define XEN_SCSI_DISK12_MAJOR 132 +#define XEN_SCSI_DISK13_MAJOR 133 +#define XEN_SCSI_DISK14_MAJOR 134 +#define XEN_SCSI_DISK15_MAJOR 135 +#define XEN_SCSI_CDROM_MAJOR 11 + +#define XEN_IDE0_MAJOR 3 
+#define XEN_IDE1_MAJOR 22 +#define XEN_IDE2_MAJOR 33 +#define XEN_IDE3_MAJOR 34 +#define XEN_IDE4_MAJOR 56 +#define XEN_IDE5_MAJOR 57 +#define XEN_IDE6_MAJOR 88 +#define XEN_IDE7_MAJOR 89 +#define XEN_IDE8_MAJOR 90 +#define XEN_IDE9_MAJOR 91 + +#define XEN_BSHIFT 9 /* log2(XEN_BSIZE) */ +#define XEN_BSIZE (1 << XEN_BSHIFT) + +#define MAX_VBDS 64 +static int nr_vbds; +static xen_disk_t *vbd_info; + +static blk_ring_t *blk_ring = NULL; +static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */ +static BLK_RING_IDX req_prod; /* Private request producer. */ +static BLK_RING_IDX last_req_prod; /* Request producer at last trap. */ + +#define STATE_ACTIVE 0 +#define STATE_SUSPENDED 1 +#define STATE_CLOSED 2 +static unsigned int state = STATE_SUSPENDED; + + +#define XBDUNIT(x) DISKUNIT(x) +#define GETXBD_SOFTC(_xs, x) if (!((_xs) = getxbd_softc(x))) return ENXIO +#define GETXBD_SOFTC_CDEV(_xs, x) do { \ + dev_t bx = devsw_chr2blk((x)); \ + if (bx == NODEV) \ + return ENXIO; \ + if (!((_xs) = getxbd_softc(bx))) \ + return ENXIO; \ +} while (/*CONSTCOND*/0) + +static struct xbd_softc * +getxbd_softc(dev_t dev) +{ + int unit = XBDUNIT(dev); + + DPRINTF_FOLLOW(("getxbd_softc(0x%x): major = %d unit = %d\n", dev, + major(dev), unit)); +#if NXBD > 0 + if (major(dev) == xbd_major) + return device_lookup(&xbd_cd, unit); +#endif +#if NWD > 0 + if (major(dev) == xbd_wd_major || major(dev) == xbd_wd_cdev_major) + return device_lookup(&wd_cd, unit); +#endif +#if NSD > 0 + if (major(dev) == xbd_sd_major || major(dev) == xbd_sd_cdev_major) + return device_lookup(&sd_cd, unit); +#endif +#if NCD > 0 + if (major(dev) == xbd_cd_major || major(dev) == xbd_cd_cdev_major) + return device_lookup(&cd_cd, unit); +#endif + return NULL; +} + +static int +get_vbd_info(xen_disk_t *disk_info) +{ + int err; + block_io_op_t op; + + /* Probe for disk information. 
*/ + memset(&op, 0, sizeof(op)); + op.cmd = BLOCK_IO_OP_VBD_PROBE; + op.u.probe_params.domain = 0; + op.u.probe_params.xdi.max = MAX_VBDS; + op.u.probe_params.xdi.disks = disk_info; + op.u.probe_params.xdi.count = 0; + + err = HYPERVISOR_block_io_op(&op); + if (err) { + printf("WARNING: Could not probe disks (%d)\n", err); + DIAGPANIC(("get_vbd_info: Could not probe disks (%d)", err)); + return -1; + } + + return op.u.probe_params.xdi.count; +} + +static void +reset_interface(void) +{ + block_io_op_t op; + + op.cmd = BLOCK_IO_OP_RESET; + if (HYPERVISOR_block_io_op(&op) != 0) + printf("xbd: Possible blkdev trouble: couldn't reset ring\n"); +} + +static void +init_interface(void) +{ + block_io_op_t op; + + reset_interface(); + + if (blk_ring == NULL) { + op.cmd = BLOCK_IO_OP_RING_ADDRESS; + (void)HYPERVISOR_block_io_op(&op); + + blk_ring = (blk_ring_t *)uvm_km_valloc_align(kernel_map, + PAGE_SIZE, PAGE_SIZE); + pmap_kenter_ma((vaddr_t)blk_ring, op.u.ring_mfn << PAGE_SHIFT, + VM_PROT_READ|VM_PROT_WRITE); + DPRINTF(XBDB_SETUP, ("init_interface: " + "ring va %p and wired to %p\n", + blk_ring, (void *)(op.u.ring_mfn << PAGE_SHIFT))); + + blk_ring->req_prod = blk_ring->resp_prod = + resp_cons = req_prod = last_req_prod = 0; + + event_set_handler(_EVENT_BLKDEV, &xbd_response_handler, + NULL, IPL_BIO); + hypervisor_enable_event(_EVENT_BLKDEV); + } + + __insn_barrier(); + state = STATE_ACTIVE; +} + +static void +enable_update_events(struct device *self) +{ + + kthread_create(xbd_update_create_kthread, self); + event_set_handler(_EVENT_VBD_UPD, &xbd_update_handler, self, IPL_BIO); + hypervisor_enable_event(_EVENT_VBD_UPD); +} + +static void +signal_requests_to_xen(void) +{ + block_io_op_t op; + + DPRINTF(XBDB_IO, ("signal_requests_to_xen: %d -> %d\n", + blk_ring->req_prod, MASK_BLK_IDX(req_prod))); + blk_ring->req_prod = MASK_BLK_IDX(req_prod); + last_req_prod = req_prod; + + op.cmd = BLOCK_IO_OP_SIGNAL; + HYPERVISOR_block_io_op(&op); + return; +} + +static void 
+setup_sysctl(void) +{ + struct sysctlnode *pnode; + + sysctl_createv(NULL, 0, NULL, NULL, + 0, + CTLTYPE_NODE, "machdep", NULL, + NULL, 0, NULL, 0, + CTL_MACHDEP, CTL_EOL); + + sysctl_createv(NULL, 0, NULL, &pnode, + 0, + CTLTYPE_NODE, "domain0", NULL, + NULL, 0, NULL, 0, + CTL_MACHDEP, CTL_CREATE, CTL_EOL); + + if (pnode == NULL) + return; + + sysctl_createv(NULL, 0, &pnode, &pnode, + 0, + CTLTYPE_NODE, "diskcookie", NULL, + NULL, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + + if (pnode) + diskcookies = pnode; +} + +static struct xbd_attach_args * +get_xbda(xen_disk_t *xd) +{ + + switch (XEN_MAJOR(xd->device)) { +#if NSD > 0 + case XEN_SCSI_DISK0_MAJOR: + case XEN_SCSI_DISK1_MAJOR ... XEN_SCSI_DISK7_MAJOR: + case XEN_SCSI_DISK8_MAJOR ... XEN_SCSI_DISK15_MAJOR: + if (xd->capacity == 0) + return NULL; + return &sd_ata; + case XEN_SCSI_CDROM_MAJOR: + return &cd_ata; +#endif +#if NWD > 0 + case XEN_IDE0_MAJOR: + case XEN_IDE1_MAJOR: + case XEN_IDE2_MAJOR: + case XEN_IDE3_MAJOR: + case XEN_IDE4_MAJOR: + case XEN_IDE5_MAJOR: + case XEN_IDE6_MAJOR: + case XEN_IDE7_MAJOR: + case XEN_IDE8_MAJOR: + case XEN_IDE9_MAJOR: + switch (XD_TYPE(xd->info)) { + case XD_TYPE_CDROM: + return &cd_ata; + case XD_TYPE_DISK: + if (xd->capacity == 0) + return NULL; + return &wd_ata; + default: + return NULL; + } + break; +#endif + default: + if (xd->capacity == 0) + return NULL; + return &xbd_ata; + } + return NULL; +} + +int +xbd_scan(struct device *self, struct xbd_attach_args *mainbus_xbda, + cfprint_t print) +{ + struct xbdreq *xr; + struct xbd_attach_args *xbda; + xen_disk_t *xd; + int i; + + init_interface(); + if (xen_start_info.flags & SIF_PRIVILEGED) + setup_sysctl(); + +#if NXBD > 0 + xbd_major = devsw_name2blk("xbd", NULL, 0); +#endif +#if NWD > 0 + xbd_wd_major = devsw_name2blk("wd", NULL, 0); + /* XXX Also handle the cdev majors since stuff like + * read_sector calls strategy on the cdev. This only works if + * all the majors we care about are different. 
+ */ + xbd_wd_cdev_major = major(devsw_blk2chr(makedev(xbd_wd_major, 0))); +#endif +#if NSD > 0 + xbd_sd_major = devsw_name2blk("sd", NULL, 0); + xbd_sd_cdev_major = major(devsw_blk2chr(makedev(xbd_sd_major, 0))); +#endif +#if NCD > 0 + xbd_cd_major = devsw_name2blk("cd", NULL, 0); + xbd_cd_cdev_major = major(devsw_blk2chr(makedev(xbd_cd_major, 0))); +#endif + + MALLOC(xr, struct xbdreq *, BLK_RING_SIZE * sizeof(struct xbdreq), + M_DEVBUF, M_WAITOK | M_ZERO); +#ifdef DEBUG + xbd_allxr = xr; +#endif + + /* XXX Xen1.2: We cannot use BLK_RING_SIZE many slots, since + * Xen 1.2 keeps indexes masked in the ring and the case where + * we queue all slots at once is handled wrong. + */ + for (i = 0; i < BLK_RING_SIZE - 1; i++) + PUT_XBDREQ(&xr[i]); + + MALLOC(vbd_info, xen_disk_t *, MAX_VBDS * sizeof(xen_disk_t), + M_DEVBUF, M_WAITOK); + memset(vbd_info, 0, MAX_VBDS * sizeof(xen_disk_t)); + nr_vbds = get_vbd_info(vbd_info); + if (nr_vbds <= 0) + goto out; + + for (i = 0; i < nr_vbds; i++) { + xd = &vbd_info[i]; + xbda = get_xbda(xd); + if (xbda) { + xbda->xa_xd = xd; + config_found(self, xbda, print); + } + } + + enable_update_events(self); + + return 0; + + out: + FREE(vbd_info, M_DEVBUF); + vbd_info = NULL; + FREE(xr, M_DEVBUF); +#ifdef DEBUG + xbd_allxr = NULL; +#endif + SLIST_INIT(&xbdreqs); + return 0; +} + +#if NXBD > 0 +int +xbd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "xbd") == 0) + return 1; + return 0; +} +#endif + +#if NWD > 0 +int +xbd_wd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "wd") == 0) + return 1; + return 0; +} +#endif + +#if NSD > 0 +int +xbd_sd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "sd") == 0) + return 1; + 
return 0; +} +#endif + +#if NCD > 0 +int +xbd_cd_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xbd_attach_args *xa = (struct xbd_attach_args *)aux; + + if (strcmp(xa->xa_device, "cd") == 0) + return 1; + return 0; +} +#endif + +static void +xbd_attach(struct device *parent, struct device *self, void *aux) +{ + struct xbd_attach_args *xbda = (struct xbd_attach_args *)aux; + struct xbd_softc *xs = (struct xbd_softc *)self; + + aprint_normal(": Xen Virtual Block Device"); + + simple_lock_init(&xs->sc_slock); + dk_sc_init(&xs->sc_dksc, xs, xs->sc_dev.dv_xname); + xbdinit(xs, xbda->xa_xd, xbda->xa_dkintf); + if (diskcookies) { + /* XXX beware that xs->sc_xd_device is a long */ + sysctl_createv(NULL, 0, &diskcookies, NULL, + 0, + CTLTYPE_INT, xs->sc_dev.dv_xname, NULL, + NULL, 0, &xs->sc_xd_device, 0, + CTL_CREATE, CTL_EOL); + } +} + +static int +xbd_detach(struct device *dv, int flags) +{ + struct xbd_softc *xs = (struct xbd_softc *)dv; + + /* + * Mark disk about to be removed (between now and when the xs + * will be freed). + */ + xs->sc_shutdown = 1; + + /* And give it some time to settle if it's busy. */ + if (xs->sc_dksc.sc_dkdev.dk_busy > 0) + tsleep(&xs, PWAIT, "xbdetach", hz); + + /* Detach the disk. */ + disk_detach(&xs->sc_dksc.sc_dkdev); + + /* XXX decrement bufq_users and free? */ + + /* XXX no need to remove sysctl nodes since they only exist + * in domain0 and domain0's devices are never removed. 
+ */ + + return 0; +} + +int +xbdopen(dev_t dev, int flags, int fmt, struct proc *p) +{ + struct xbd_softc *xs; + + DPRINTF_FOLLOW(("xbdopen(0x%04x, %d)\n", dev, flags)); + switch (fmt) { + case S_IFCHR: + GETXBD_SOFTC_CDEV(xs, dev); + break; + case S_IFBLK: + GETXBD_SOFTC(xs, dev); + break; + default: + return ENXIO; + } + return dk_open(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p); +} + +int +xbdclose(dev_t dev, int flags, int fmt, struct proc *p) +{ + struct xbd_softc *xs; + + DPRINTF_FOLLOW(("xbdclose(%d, %d)\n", dev, flags)); + switch (fmt) { + case S_IFCHR: + GETXBD_SOFTC_CDEV(xs, dev); + break; + case S_IFBLK: + GETXBD_SOFTC(xs, dev); + break; + default: + return ENXIO; + } + return dk_close(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p); +} + +void +xbdstrategy(struct buf *bp) +{ + struct xbd_softc *xs = getxbd_softc(bp->b_dev); + + DPRINTF_FOLLOW(("xbdstrategy(%p): b_bcount = %ld\n", bp, + (long)bp->b_bcount)); + + if (xs == NULL || xs->sc_shutdown) { + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + biodone(bp); + return; + } + + dk_strategy(xs->sc_di, &xs->sc_dksc, bp); + return; +} + +int +xbdsize(dev_t dev) +{ + struct xbd_softc *xs = getxbd_softc(dev); + + DPRINTF_FOLLOW(("xbdsize(%d)\n", dev)); + if (xs == NULL || xs->sc_shutdown) + return -1; + return dk_size(xs->sc_di, &xs->sc_dksc, dev); +} + +static void +map_align(struct xbdreq *xr) +{ + int s; + + s = splvm(); + xr->xr_aligned = uvm_km_kmemalloc1(kmem_map, NULL, + xr->xr_bqueue, XEN_BSIZE, UVM_UNKNOWN_OFFSET, + 0/* UVM_KMF_NOWAIT */); + splx(s); + DPRINTF(XBDB_IO, ("map_align(%p): bp %p addr %p align 0x%08lx " + "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data, + xr->xr_aligned, xr->xr_bqueue)); + xr->xr_data = xr->xr_aligned; + if ((xr->xr_bp->b_flags & B_READ) == 0) + memcpy((void *)xr->xr_aligned, xr->xr_bp->b_data, + xr->xr_bqueue); +} + +static void +unmap_align(struct xbdreq *xr) +{ + int s; + + if (xr->xr_bp->b_flags & B_READ) + memcpy(xr->xr_bp->b_data, (void *)xr->xr_aligned, + 
xr->xr_bp->b_bcount); + DPRINTF(XBDB_IO, ("unmap_align(%p): bp %p addr %p align 0x%08lx " + "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data, + xr->xr_aligned, xr->xr_bp->b_bcount)); + s = splvm(); + uvm_km_free(kmem_map, xr->xr_aligned, xr->xr_bp->b_bcount); + splx(s); + xr->xr_aligned = (vaddr_t)0; +} + +static void +fill_ring(struct xbdreq *xr) +{ + struct xbdreq *pxr = xr->xr_parent; + paddr_t pa; + unsigned long ma; + vaddr_t addr, off; + blk_ring_req_entry_t *ring_req; + int breq, nr_sectors; + + /* Fill out a communications ring structure. */ + ring_req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req; + ring_req->id = (unsigned long)xr; + ring_req->operation = pxr->xr_bp->b_flags & B_READ ? XEN_BLOCK_READ : + XEN_BLOCK_WRITE; + ring_req->sector_number = (xen_sector_t)pxr->xr_bn; + ring_req->device = pxr->xr_sc->sc_xd_device; + + DPRINTF(XBDB_IO, ("fill_ring(%d): bp %p sector %llu pxr %p xr %p\n", + MASK_BLK_IDX(req_prod), pxr->xr_bp, (unsigned long long)pxr->xr_bn, + pxr, xr)); + + xr->xr_breq = 0; + ring_req->nr_segments = 0; + addr = trunc_page(pxr->xr_data); + off = pxr->xr_data - addr; + while (pxr->xr_bqueue > 0) { +#if 0 + pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map), + addr, &pa); +#else + pmap_extract(pmap_kernel(), addr, &pa); +#endif + ma = xpmap_ptom_masked(pa) + off; + DIAGCONDPANIC((ma & (XEN_BSIZE - 1)) != 0, + ("xbd request ma not sector aligned")); + + if (pxr->xr_bqueue > PAGE_SIZE - off) + breq = PAGE_SIZE - off; + else + breq = pxr->xr_bqueue; + nr_sectors = breq >> XEN_BSHIFT; + DIAGCONDPANIC(nr_sectors >= XEN_BSIZE, + ("xbd request nr_sectors >= XEN_BSIZE")); + + DPRINTF(XBDB_IO, ("fill_ring(%d): va 0x%08lx pa 0x%08lx " + "ma 0x%08lx, sectors %d, left %ld/%ld\n", + MASK_BLK_IDX(req_prod), addr, pa, ma, nr_sectors, + pxr->xr_bqueue >> XEN_BSHIFT, pxr->xr_bqueue)); + + ring_req->buffer_and_sects[ring_req->nr_segments++] = + ma | nr_sectors; + addr += PAGE_SIZE; + pxr->xr_bqueue -= breq; + pxr->xr_bn += nr_sectors; + xr->xr_breq 
+= breq; + off = 0; + if (ring_req->nr_segments == MAX_BLK_SEGS) + break; + } + pxr->xr_data = addr; + + req_prod++; +} + +static void +xbdresume(void) +{ + struct xbdreq *pxr, *xr; + struct xbd_softc *xs; + struct buf *bp; + + while ((pxr = SIMPLEQ_FIRST(&xbdr_suspended)) != NULL) { + DPRINTF(XBDB_IO, ("xbdstart: resuming xbdreq %p for bp %p\n", + pxr, pxr->xr_bp)); + bp = pxr->xr_bp; + xs = getxbd_softc(bp->b_dev); + if (xs == NULL || xs->sc_shutdown) { + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + } + if (bp->b_flags & B_ERROR) { + pxr->xr_bdone -= pxr->xr_bqueue; + pxr->xr_bqueue = 0; + if (pxr->xr_bdone == 0) { + bp->b_resid = bp->b_bcount; + if (pxr->xr_aligned) + unmap_align(pxr); + PUT_XBDREQ(pxr); + if (xs) + disk_unbusy(&xs->sc_dksc.sc_dkdev, + (bp->b_bcount - bp->b_resid), + (bp->b_flags & B_READ)); + biodone(bp); + } + continue; + } + while (__predict_true(pxr->xr_bqueue > 0)) { + GET_XBDREQ(xr); + if (__predict_false(xr == NULL)) + goto out; + xr->xr_parent = pxr; + fill_ring(xr); + } + DPRINTF(XBDB_IO, ("xbdstart: resumed xbdreq %p for bp %p\n", + pxr, bp)); + SIMPLEQ_REMOVE_HEAD(&xbdr_suspended, xr_suspended); + } + + out: + return; +} + +static int +xbdstart(struct dk_softc *dksc, struct buf *bp) +{ + struct xbd_softc *xs; + struct xbdreq *pxr, *xr; + struct partition *pp; + daddr_t bn; + int ret, runqueue; + + DPRINTF_FOLLOW(("xbdstart(%p, %p)\n", dksc, bp)); + + runqueue = 1; + ret = -1; + + xs = getxbd_softc(bp->b_dev); + if (xs == NULL || xs->sc_shutdown) { + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + biodone(bp); + return 0; + } + dksc = &xs->sc_dksc; + + /* XXXrcd: + * Translate partition relative blocks to absolute blocks, + * this probably belongs (somehow) in dksubr.c, since it + * is independant of the underlying code... This will require + * that the interface be expanded slightly, though. 
+ */ + bn = bp->b_blkno; + if (DISKPART(bp->b_dev) != RAW_PART) { + pp = &xs->sc_dksc.sc_dkdev.dk_label-> + d_partitions[DISKPART(bp->b_dev)]; + bn += pp->p_offset; + } + + DPRINTF(XBDB_IO, ("xbdstart: addr %p, sector %llu, " + "count %ld [%s]\n", bp->b_data, (unsigned long long)bn, + bp->b_bcount, bp->b_flags & B_READ ? "read" : "write")); + + GET_XBDREQ(pxr); + if (__predict_false(pxr == NULL)) + goto out; + + disk_busy(&dksc->sc_dkdev); /* XXX: put in dksubr.c */ + /* + * We have a request slot, return 0 to make dk_start remove + * the bp from the work queue. + */ + ret = 0; + + pxr->xr_bp = bp; + pxr->xr_parent = pxr; + pxr->xr_bn = bn; + pxr->xr_bqueue = bp->b_bcount; + pxr->xr_bdone = bp->b_bcount; + pxr->xr_data = (vaddr_t)bp->b_data; + pxr->xr_sc = xs; + + if (pxr->xr_data & (XEN_BSIZE - 1)) + map_align(pxr); + + fill_ring(pxr); + + while (__predict_false(pxr->xr_bqueue > 0)) { + GET_XBDREQ(xr); + if (__predict_false(xr == NULL)) + break; + xr->xr_parent = pxr; + fill_ring(xr); + } + + if (__predict_false(pxr->xr_bqueue > 0)) { + SIMPLEQ_INSERT_TAIL(&xbdr_suspended, pxr, + xr_suspended); + DPRINTF(XBDB_IO, ("xbdstart: suspended xbdreq %p " + "for bp %p\n", pxr, bp)); + } else if (CANGET_XBDREQ() && BUFQ_PEEK(&bufq) != NULL) { + /* + * We have enough resources to start another bp and + * there are additional bps on the queue, dk_start + * will call us again and we'll run the queue then. 
+ */ + runqueue = 0; + } + + out: + if (runqueue && last_req_prod != req_prod) + signal_requests_to_xen(); + + return ret; +} + +static int +xbd_response_handler(void *arg) +{ + struct buf *bp; + struct xbd_softc *xs; + blk_ring_resp_entry_t *ring_resp; + struct xbdreq *pxr, *xr; + int i; + + for (i = resp_cons; i != blk_ring->resp_prod; i = BLK_RING_INC(i)) { + ring_resp = &blk_ring->ring[MASK_BLK_IDX(i)].resp; + xr = (struct xbdreq *)ring_resp->id; + pxr = xr->xr_parent; + + DPRINTF(XBDB_IO, ("xbd_response_handler(%d): pxr %p xr %p " + "bdone %04lx breq %04lx\n", i, pxr, xr, pxr->xr_bdone, + xr->xr_breq)); + pxr->xr_bdone -= xr->xr_breq; + DIAGCONDPANIC(pxr->xr_bdone < 0, + ("xbd_response_handler: pxr->xr_bdone < 0")); + + if (__predict_false(ring_resp->status)) { + pxr->xr_bp->b_flags |= B_ERROR; + pxr->xr_bp->b_error = EIO; + } + + if (xr != pxr) { + PUT_XBDREQ(xr); + if (!SIMPLEQ_EMPTY(&xbdr_suspended)) + xbdresume(); + } + + if (pxr->xr_bdone == 0) { + bp = pxr->xr_bp; + xs = getxbd_softc(bp->b_dev); + if (xs == NULL) { /* don't fail bp if we're shutdown */ + bp->b_flags |= B_ERROR; + bp->b_error = EIO; + } + DPRINTF(XBDB_IO, ("xbd_response_handler(%d): " + "completed bp %p\n", i, bp)); + if (bp->b_flags & B_ERROR) + bp->b_resid = bp->b_bcount; + else + bp->b_resid = 0; + + if (pxr->xr_aligned) + unmap_align(pxr); + + PUT_XBDREQ(pxr); + if (xs) + disk_unbusy(&xs->sc_dksc.sc_dkdev, + (bp->b_bcount - bp->b_resid), + (bp->b_flags & B_READ)); + biodone(bp); + if (!SIMPLEQ_EMPTY(&xbdr_suspended)) + xbdresume(); + /* XXX possible lockup if this was the only + * active device and requests were held back in + * the queue. 
+ */ + if (xs) + dk_iodone(xs->sc_di, &xs->sc_dksc); + } + } + resp_cons = i; + /* check if xbdresume queued any requests */ + if (last_req_prod != req_prod) + signal_requests_to_xen(); + return 0; +} + +static struct device * +find_device(xen_disk_t *xd) +{ + struct device *dv; + struct xbd_softc *xs; + + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { + if (dv->dv_cfattach == NULL || + dv->dv_cfattach->ca_attach != xbd_attach) + continue; + xs = (struct xbd_softc *)dv; + if (xs->sc_xd_device == xd->device) + break; + } + return dv; +} + +static void +xbd_update_create_kthread(void *arg) +{ + + kthread_create1(xbd_update_kthread, arg, NULL, "xbdupdate"); +} + +static void +xbd_update_kthread(void *arg) +{ + struct device *parent = arg; + struct xbd_attach_args *xbda; + struct device *dev; + xen_disk_t *xd; + xen_disk_t *vbd_info_update, *vbd_info_old; + int i, j, new_nr_vbds; + extern int hypervisor_print(void *, const char *); + + MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS * + sizeof(xen_disk_t), M_DEVBUF, M_WAITOK); + + for (;;) { + memset(vbd_info_update, 0, MAX_VBDS * sizeof(xen_disk_t)); + new_nr_vbds = get_vbd_info(vbd_info_update); + + if (memcmp(vbd_info, vbd_info_update, MAX_VBDS * + sizeof(xen_disk_t)) == 0) { + FREE(vbd_info_update, M_DEVBUF); + tsleep(parent, PWAIT, "xbdupd", 0); + MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS * + sizeof(xen_disk_t), M_DEVBUF, M_WAITOK); + continue; + } + + j = 0; + for (i = 0; i < new_nr_vbds; i++) { + while (j < nr_vbds && + vbd_info[j].device < vbd_info_update[i].device) { + DPRINTF(XBDB_HOTPLUG, + ("delete device %x size %lx\n", + vbd_info[j].device, + vbd_info[j].capacity)); + xd = &vbd_info[j]; + dev = find_device(xd); + if (dev) + config_detach(dev, DETACH_FORCE); + j++; + } + if (j < nr_vbds && + vbd_info[j].device == vbd_info_update[i].device) { + DPRINTF(XBDB_HOTPLUG, + ("update device %x size %lx size %lx\n", + vbd_info_update[i].device, + vbd_info[j].capacity, + 
vbd_info_update[i].capacity)); + j++; + } else { + DPRINTF(XBDB_HOTPLUG, + ("add device %x size %lx\n", + vbd_info_update[i].device, + vbd_info_update[i].capacity)); + xd = &vbd_info_update[i]; + xbda = get_xbda(xd); + if (xbda) { + xbda->xa_xd = xd; + config_found(parent, xbda, hypervisor_print); + } + } + } + + while (j < nr_vbds) { + DPRINTF(XBDB_HOTPLUG, ("delete device %x\n", + vbd_info[j].device)); + xd = &vbd_info[j]; + dev = find_device(xd); + if (dev) + config_detach(dev, DETACH_FORCE); + j++; + } + + nr_vbds = new_nr_vbds; + + vbd_info_old = vbd_info; + vbd_info = vbd_info_update; + vbd_info_update = vbd_info_old; + } +} + +static int +xbd_update_handler(void *arg) +{ + + wakeup(arg); + + return 0; +} + +/* XXX: we should probably put these into dksubr.c, mostly */ +int +xbdread(dev_t dev, struct uio *uio, int flags) +{ + struct xbd_softc *xs; + struct dk_softc *dksc; + + DPRINTF_FOLLOW(("xbdread(%d, %p, %d)\n", dev, uio, flags)); + GETXBD_SOFTC_CDEV(xs, dev); + dksc = &xs->sc_dksc; + if ((dksc->sc_flags & DKF_INITED) == 0) + return ENXIO; + /* XXX see the comments about minphys in ccd.c */ + return physio(xbdstrategy, NULL, dev, B_READ, minphys, uio); +} + +/* XXX: we should probably put these into dksubr.c, mostly */ +int +xbdwrite(dev_t dev, struct uio *uio, int flags) +{ + struct xbd_softc *xs; + struct dk_softc *dksc; + + DPRINTF_FOLLOW(("xbdwrite(%d, %p, %d)\n", dev, uio, flags)); + GETXBD_SOFTC_CDEV(xs, dev); + dksc = &xs->sc_dksc; + if ((dksc->sc_flags & DKF_INITED) == 0) + return ENXIO; + /* XXX see the comments about minphys in ccd.c */ + return physio(xbdstrategy, NULL, dev, B_WRITE, minphys, uio); +} + +int +xbdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct xbd_softc *xs; + struct dk_softc *dksc; + int ret; + + DPRINTF_FOLLOW(("xbdioctl(%d, %08lx, %p, %d, %p)\n", + dev, cmd, data, flag, p)); + GETXBD_SOFTC(xs, dev); + dksc = &xs->sc_dksc; + + if ((ret = lockmgr(&dksc->sc_lock, LK_EXCLUSIVE, NULL)) != 0) + 
return ret; + + switch (cmd) { + default: + ret = dk_ioctl(xs->sc_di, dksc, dev, cmd, data, flag, p); + break; + } + + lockmgr(&dksc->sc_lock, LK_RELEASE, NULL); + return ret; +} + +int +xbdioctl_cdev(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + dev_t bdev; + + bdev = devsw_chr2blk(dev); + if (bdev == NODEV) + return ENXIO; + return xbdioctl(bdev, cmd, data, flag, p); +} + +int +xbddump(dev_t dev, daddr_t blkno, caddr_t va, size_t size) +{ + struct xbd_softc *xs; + + DPRINTF_FOLLOW(("xbddump(%d, %" PRId64 ", %p, %lu)\n", dev, blkno, va, + (unsigned long)size)); + GETXBD_SOFTC(xs, dev); + return dk_dump(xs->sc_di, &xs->sc_dksc, dev, blkno, va, size); +} + +static int +xbdinit(struct xbd_softc *xs, xen_disk_t *xd, struct dk_intf *dkintf) +{ + struct dk_geom *pdg; + char buf[9]; + int ret; + + ret = 0; + + xs->sc_dksc.sc_size = xd->capacity; + xs->sc_xd_device = xd->device; + xs->sc_di = dkintf; + xs->sc_shutdown = 0; + + /* + * XXX here we should probe the underlying device. If we + * are accessing a partition of type RAW_PART, then + * we should populate our initial geometry with the + * geometry that we discover from the device. + */ + pdg = &xs->sc_dksc.sc_geom; + pdg->pdg_secsize = DEV_BSIZE; + pdg->pdg_ntracks = 1; + pdg->pdg_nsectors = 1024 * (1024 / pdg->pdg_secsize); + pdg->pdg_ncylinders = xs->sc_dksc.sc_size / pdg->pdg_nsectors; + + /* + * We have one shared bufq for all devices because otherwise + * requests can stall if there were no free request slots + * available in xbdstart and this device had no requests + * in-flight which would trigger a dk_start from the interrupt + * handler. + * XXX this assumes that we can just memcpy struct bufq_state + * to share it between devices. + * XXX we reference count the usage in case so we can de-alloc + * the bufq if all devices are deconfigured. 
+ */ + if (bufq_users == 0) { + bufq_alloc(&bufq, BUFQ_FCFS); + bufq_users = 1; + } + memcpy(&xs->sc_dksc.sc_bufq, &bufq, sizeof(struct bufq_state)); + + xs->sc_dksc.sc_flags |= DKF_INITED; + + /* Attach the disk. */ + disk_attach(&xs->sc_dksc.sc_dkdev); + + /* Try and read the disklabel. */ + dk_getdisklabel(xs->sc_di, &xs->sc_dksc, 0 /* XXX ? */); + + format_bytes(buf, sizeof(buf), (uint64_t)xs->sc_dksc.sc_size * + pdg->pdg_secsize); + printf(" %s\n", buf); + +/* out: */ + return ret; +} diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c new file mode 100644 index 0000000000..8181f2b9b3 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c @@ -0,0 +1,444 @@ +/* $NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * + * Copyright (c) 2002-2003, K A Fraser & R Neugebauer + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $");
+
+#define XENDEBUG
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#ifdef XENDEBUG
+
+#define PRINTK_BUFSIZE 1024
+void
+printk(const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	static char buf[PRINTK_BUFSIZE];	/* static: not reentrant, kernel console only */
+
+	va_start(ap, fmt);
+	ret = vsnprintf(buf, sizeof(buf), fmt, ap);	/* NUL-terminates within sizeof(buf) */
+	va_end(ap);
+	if (ret < 0) ret = 0; else if (ret >= (int)sizeof(buf)) ret = sizeof(buf) - 1;	/* C99: ret is would-be length; clamp so we never index/emit past buf */
+	(void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+void
+vprintk(const char *fmt, va_list ap)
+{
+	int ret;
+	static char buf[PRINTK_BUFSIZE];	/* static: not reentrant, kernel console only */
+
+	ret = vsnprintf(buf, sizeof(buf), fmt, ap);	/* NUL-terminates within sizeof(buf) */
+	if (ret < 0) ret = 0; else if (ret >= (int)sizeof(buf)) ret = sizeof(buf) - 1;	/* clamp truncated/would-be length before passing to hypervisor */
+	(void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+#endif
+
+#ifdef XENDEBUG_LOW
+
+int xen_once = 0;
+
+void hypervisor_callback(void);
+void failsafe_callback(void);
+
+void xen_dbglow_init(void);
+void
+xen_dbglow_init()
+{
+	start_info_t *si;
+#if 0
+	int i;
+#endif
+
+	si = &xen_start_info;
+
+	HYPERVISOR_set_callbacks(
+	    __KERNEL_CS, (unsigned long)hypervisor_callback,
+	    __KERNEL_CS, (unsigned long)failsafe_callback);
+
+	trap_init();
+
+	/* __sti(); */
+
+	/* print out some useful information */
+	printk("%s", version);	/* never pass a non-literal as the format string */
+	printk("start_info: %p\n", si);
+	printk(" nr_pages: %lu", si->nr_pages);
+	printk(" shared_inf: %p (was %p)\n", HYPERVISOR_shared_info,
+	    si->shared_info);
+	printk(" pt_base: %p", (void *)si->pt_base);
+	printk(" mod_start: 0x%lx\n", si->mod_start);
+	printk(" mod_len: %lu\n", si->mod_len);
+#if 0
+	printk(" net_rings: ");
+	for (i = 0; i < MAX_DOMAIN_VIFS; i++) {
+		if (si->net_rings[i] == 0)
+			break;
+		printk(" %lx", si->net_rings[i]);
+	};
+	printk("\n");
+	printk(" blk_ring: 0x%lx\n", si->blk_ring);
+#endif
+	printk(" dom_id: %d\n", si->dom_id);
+	printk(" flags: 0x%lx\n", si->flags);
+	printk(" cmd_line: %s\n", si->cmd_line ?
+ (const char *)si->cmd_line : "NULL"); +} + + +void xen_dbg0(char *); +void +xen_dbg0(char *end) +{ + struct cpu_info *ci; + + ci = &cpu_info_primary; + if (xen_once) + printk("xencpu level %d ipending %08x master %08x\n", + ci->ci_ilevel, ci->ci_ipending, + HYPERVISOR_shared_info->events_mask); + /* ipending %08x imask %08x iunmask %08x */ + /* ci->ci_imask[IPL_NET], ci->ci_iunmask[IPL_NET]); */ +} + +void xen_dbg1(void *esp, int ss); +void +xen_dbg1(void *esp, int ss) +{ +#if 1 + struct cpu_info *ci; + + ci = &cpu_info_primary; + if (xen_once) + printk("xenhighlevel %d ipending %08x master %08x events %08x\n", + ci->ci_ilevel, ci->ci_ipending, + HYPERVISOR_shared_info->events_mask, HYPERVISOR_shared_info->events); +#else + printk("stack switch %p %d/%d, sp %p\n", esp, ss, IDXSEL(ss), &ss); +#endif +} + +void xen_dbg2(void); +void +xen_dbg2(void) +{ + if (xen_once) + printk("xen_dbg2\n"); +} + +void xen_dbg3(void *, void *); +void +xen_dbg3(void *ss, void *esp) +{ + if (xen_once) + printk("xen_dbg3 %p %p\n", ss, esp); +} + +void xen_dbg4(void *); +void +xen_dbg4(void *esi) +{ + + printk("xen_dbg4 %p\n", esi); + for(;;); +} + + + + +static void do_exit(void); + +/* + * These are assembler stubs in vector.S. + * They are the actual entry points for virtual exceptions. 
+ */ +void divide_error(void); +void debug(void); +void int3(void); +void overflow(void); +void bounds(void); +void invalid_op(void); +void device_not_available(void); +void double_fault(void); +void coprocessor_segment_overrun(void); +void invalid_TSS(void); +void segment_not_present(void); +void stack_segment(void); +void general_protection(void); +void page_fault(void); +void coprocessor_error(void); +void simd_coprocessor_error(void); +void alignment_check(void); +void spurious_interrupt_bug(void); +void machine_check(void); + +static void +dump_regs(struct pt_regs *regs) +{ + int in_kernel = 1; + unsigned long esp; + unsigned short ss; + + esp = (unsigned long) (®s->esp); + ss = __KERNEL_DS; + if (regs->xcs & 2) { + in_kernel = 0; + esp = regs->esp; + ss = regs->xss & 0xffff; + } + printf("EIP: %04x:[<%08lx>]\n", + 0xffff & regs->xcs, regs->eip); + printf("EFLAGS: %08lx\n",regs->eflags); + printf("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->eax, regs->ebx, regs->ecx, regs->edx); + printf("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printf("ds: %04x es: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss); + printf("\n"); +} + + +static inline void +dump_code(unsigned eip) +{ + unsigned *ptr = (unsigned *)eip; + int x; + + printk("Bytes at eip:\n"); + for (x = -4; x < 5; x++) + printf("%x", ptr[x]); +} + + +/* + * C handlers here have their parameter-list constructed by the + * assembler stubs above. Each one gets a pointer to a list + * of register values (to be restored at end of exception). + * Some will also receive an error code -- this is the code that + * was generated by the processor for the underlying real exception. + * + * Note that the page-fault exception is special. It also receives + * the faulting linear address. Normally this would be found in + * register CR2, but that is not accessible in a virtualised OS. 
+ */ + +static void inline +do_trap(int trapnr, char *str, struct pt_regs *regs, long error_code) +{ + + printk("FATAL: Unhandled Trap (see mini-os:traps.c)"); + printf("%d %s", trapnr, str); + dump_regs(regs); + dump_code(regs->eip); + + do_exit(); +} + +#define DO_ERROR(trapnr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code); \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + do_trap(trapnr, str, regs, error_code); \ +} + +#define DO_ERROR_INFO(trapnr, str, name, sicode, siaddr) \ +void do_##name(struct pt_regs *regs, long error_code); \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + do_trap(trapnr, str, regs, error_code); \ +} + +DO_ERROR_INFO( 0, "divide error", divide_error, FPE_INTDIV, regs->eip) +DO_ERROR( 3, "int3", int3) +DO_ERROR( 4, "overflow", overflow) +DO_ERROR( 5, "bounds", bounds) +DO_ERROR_INFO( 6, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR( 7, "device not available", device_not_available) +DO_ERROR( 8, "double fault", double_fault) +DO_ERROR( 9, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, "invalid TSS", invalid_TSS) +DO_ERROR(11, "segment not present", segment_not_present) +DO_ERROR(12, "stack segment", stack_segment) +DO_ERROR_INFO(17, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR(18, "machine check", machine_check) + +void do_page_fault(struct pt_regs *, long, unsigned long); +void +do_page_fault(struct pt_regs *regs, long error_code, unsigned long address) +{ + + printk("Page fault\n"); + printk("Address: 0x%lx", address); + printk("Error Code: 0x%lx", error_code); + printk("eip: \t 0x%lx", regs->eip); + do_exit(); +} + +void do_general_protection(struct pt_regs *, long); +void +do_general_protection(struct pt_regs *regs, long error_code) +{ + + HYPERVISOR_shared_info->events_mask = 0; + printk("GPF\n"); + printk("Error Code: 0x%lx", error_code); + dump_regs(regs); + dump_code(regs->eip); + do_exit(); +} + + +void 
do_debug(struct pt_regs *, long); +void +do_debug(struct pt_regs *regs, long error_code) +{ + + printk("Debug exception\n"); +#define TF_MASK 0x100 + regs->eflags &= ~TF_MASK; + dump_regs(regs); + do_exit(); +} + + + +void do_coprocessor_error(struct pt_regs *, long); +void +do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + + printk("Copro error\n"); + dump_regs(regs); + dump_code(regs->eip); + do_exit(); +} + +void simd_math_error(void *); +void +simd_math_error(void *eip) +{ + + printk("SIMD error\n"); +} + +void do_simd_coprocessor_error(struct pt_regs *, long); +void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + + printk("SIMD copro error\n"); +} + +void do_spurious_interrupt_bug(struct pt_regs *, long); +void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +{ +} + +static void +do_exit(void) +{ + + HYPERVISOR_exit(); +} + +/* + * Submit a virtual IDT to teh hypervisor. This consists of tuples + * (interrupt vector, privilege ring, CS:EIP of handler). + * The 'privilege ring' field specifies the least-privileged ring that + * can trap to that vector using a software-interrupt instruction (INT). 
+ */ +static trap_info_t trap_table[] = { + { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, + { 1, 0, __KERNEL_CS, (unsigned long)debug }, + { 3, 3, __KERNEL_CS, (unsigned long)int3 }, + { 4, 3, __KERNEL_CS, (unsigned long)overflow }, + { 5, 3, __KERNEL_CS, (unsigned long)bounds }, + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, + { 7, 0, __KERNEL_CS, (unsigned long)device_not_available }, + { 8, 0, __KERNEL_CS, (unsigned long)double_fault }, + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0, __KERNEL_CS, (unsigned long)page_fault }, + { 15, 0, __KERNEL_CS, (unsigned long)spurious_interrupt_bug }, + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, + { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, + { 0, 0, 0, 0 } +}; + +void +trap_init(void) +{ + + HYPERVISOR_set_trap_table(trap_table); +} +#endif diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c new file mode 100644 index 0000000000..a151e3dd83 --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c @@ -0,0 +1,352 @@ +/* $NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $"); + +#include <sys/param.h> +#include <sys/ioctl.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/systm.h> +#include <sys/device.h> +#include <sys/conf.h> + +#include <machine/stdarg.h> +#include <machine/xen.h> +#include <machine/hypervisor.h> + +#include <dev/cons.h> + +#include <ddb/db_output.h> /* XXX for db_max_line */ + +static int xencons_isconsole = 0; + +#define XENCONS_UNIT(x) (minor(x)) +#define XENCONS_BURST 128 + +int xencons_match (struct device *, struct cfdata *, void *); +void xencons_attach (struct device *, struct device *, void *); +/* int xencons_intr (void *); */ +void xencons_init (void); + +struct xencons_softc { + struct device sc_dev; + struct tty *sc_tty; +}; + +CFATTACH_DECL(xencons, sizeof(struct xencons_softc), + xencons_match, xencons_attach, NULL, NULL); + +extern struct cfdriver xencons_cd; + +dev_type_open(xencons_open); +dev_type_close(xencons_close); +dev_type_read(xencons_read); +dev_type_write(xencons_write); +dev_type_ioctl(xencons_ioctl); +dev_type_stop(xencons_stop); +dev_type_tty(xencons_tty); +dev_type_poll(xencons_poll); + +const struct cdevsw xencons_cdevsw = { + xencons_open, xencons_close, xencons_read, xencons_write, + xencons_ioctl, xencons_stop, xencons_tty, xencons_poll, + NULL, ttykqfilter, D_TTY +}; + + +void xenconscn_attach(void); +int xenconscn_getc(dev_t); +void xenconscn_putc(dev_t, int); +void xenconscn_pollc(dev_t, int); + +static struct consdev xencons = { + NULL, NULL, xenconscn_getc, xenconscn_putc, xenconscn_pollc, + NULL, NULL, NULL, NODEV, CN_NORMAL +}; + +void xencons_start (struct tty *); +int xencons_param (struct tty *, struct termios *); + +int +xencons_match(struct device *parent, struct cfdata *match, void *aux) +{ + struct xencons_attach_args *xa = (struct xencons_attach_args *)aux; + + if (strcmp(xa->xa_device, "xencons") == 0) + return 1; + return 0; +} + +void 
+xencons_attach(struct device *parent, struct device *self, void *aux) +{ + struct xencons_softc *sc = (void *)self; + + aprint_normal(": Xen Virtual Console Driver\n"); + + if (xencons_isconsole) { + int maj; + + /* Locate the major number. */ + maj = cdevsw_lookup_major(&xencons_cdevsw); + + /* There can be only one, but it can have any unit number. */ + cn_tab->cn_dev = makedev(maj, sc->sc_dev.dv_unit); + + aprint_verbose("%s: console major %d, unit %d\n", + sc->sc_dev.dv_xname, maj, sc->sc_dev.dv_unit); + + /* Set db_max_line to avoid paging. */ + db_max_line = 0x7fffffff; + } +} + +int +xencons_open(dev_t dev, int flag, int mode, struct proc *p) +{ + struct xencons_softc *sc; + int unit = XENCONS_UNIT(dev); + struct tty *tp; + + sc = device_lookup(&xencons_cd, unit); + if (sc == NULL) + return (ENXIO); + + if (!sc->sc_tty) { + tp = sc->sc_tty = ttymalloc(); + tty_attach(tp); + } else + tp = sc->sc_tty; + + tp->t_oproc = xencons_start; + tp->t_param = xencons_param; + tp->t_dev = dev; + if ((tp->t_state & TS_ISOPEN) == 0) { + ttychars(tp); + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + xencons_param(tp, &tp->t_termios); + ttsetwater(tp); + } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) + return (EBUSY); + tp->t_state |= TS_CARR_ON; + + return ((*tp->t_linesw->l_open)(dev, tp)); +} + +int +xencons_close(dev_t dev, int flag, int mode, struct proc *p) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + if (tp == NULL) + return (0); + (*tp->t_linesw->l_close)(tp, flag); + ttyclose(tp); +#ifdef notyet /* XXX */ + ttyfree(tp); +#endif + return (0); +} + +int +xencons_read(dev_t dev, struct uio *uio, int flag) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return ((*tp->t_linesw->l_read)(tp, uio, flag)); 
+} + +int +xencons_write(dev_t dev, struct uio *uio, int flag) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return ((*tp->t_linesw->l_write)(tp, uio, flag)); +} + +int +xencons_poll(dev_t dev, int events, struct proc *p) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return ((*tp->t_linesw->l_poll)(tp, events, p)); +} + +struct tty * +xencons_tty(dev_t dev) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + + return (tp); +} + +int +xencons_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct xencons_softc *sc = device_lookup(&xencons_cd, + XENCONS_UNIT(dev)); + struct tty *tp = sc->sc_tty; + int error; + + error = (*tp->t_linesw->l_ioctl)(tp, cmd, data, flag, p); + if (error != EPASSTHROUGH) + return (error); + + error = ttioctl(tp, cmd, data, flag, p); + if (error != EPASSTHROUGH) + return (error); + + switch (cmd) { + default: + return (EPASSTHROUGH); + } + +#ifdef DIAGNOSTIC + panic("xencons_ioctl: impossible"); +#endif +} + +void +xencons_start(struct tty *tp) +{ + struct clist *cl; + int s, len; + u_char buf[XENCONS_BURST+1]; + + s = spltty(); + if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP)) + goto out; + tp->t_state |= TS_BUSY; + splx(s); + + /* + * We need to do this outside spl since it could be fairly + * expensive and we don't want our serial ports to overflow. 
+	 */
+	cl = &tp->t_outq;
+	len = q_to_b(cl, buf, XENCONS_BURST);
+	(void)HYPERVISOR_console_io(CONSOLEIO_write, len, buf);
+
+	s = spltty();
+	tp->t_state &= ~TS_BUSY;
+	if (cl->c_cc) {
+		tp->t_state |= TS_TIMEOUT;
+		callout_reset(&tp->t_rstrt_ch, 1, ttrstrt, tp);
+	}
+	if (cl->c_cc <= tp->t_lowat) {
+		if (tp->t_state & TS_ASLEEP) {
+			tp->t_state &= ~TS_ASLEEP;
+			wakeup(cl);
+		}
+		selwakeup(&tp->t_wsel);
+	}
+out:
+	splx(s);
+}
+
+void
+xencons_stop(struct tty *tp, int flag)
+{
+
+}
+
+
+
+void
+xenconscn_attach()
+{
+
+	cn_tab = &xencons;
+
+	xencons_isconsole = 1;
+}
+
+int
+xenconscn_getc(dev_t dev)
+{
+
+	printf("\n");
+	for (;;);	/* XXX no console input support; spin forever */
+}
+
+#define MAXLINELEN 1024
+void
+xenconscn_putc(dev_t dev, int c)
+{
+	static char buf[MAXLINELEN+1];	/* +1 for NUL terminator */
+	static int bufpos = 0;
+
+	buf[bufpos++] = c;
+	if (c == '\n' || bufpos == MAXLINELEN) {	/* flush on newline OR full buffer, else long lines overflow buf */
+		buf[bufpos] = 0;
+		(void)HYPERVISOR_console_io(CONSOLEIO_write, bufpos, buf);
+		bufpos = 0;
+	}
+}
+
+void
+xenconscn_pollc(dev_t dev, int on)
+{
+
+}
+
+/*
+ * Set line parameters.
+ */
+int
+xencons_param(struct tty *tp, struct termios *t)
+{
+
+	tp->t_ispeed = t->c_ispeed;
+	tp->t_ospeed = t->c_ospeed;
+	tp->t_cflag = t->c_cflag;
+	return (0);
+}
+
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
new file mode 100644
index 0000000000..e54615567b
--- /dev/null
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
@@ -0,0 +1,600 @@
+/* $NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2004 Ben Harris.
+ * Copyright (c) 1998
+ *	Matthias Drochner.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $"); + +#include <sys/param.h> +#include <sys/device.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <dev/pckbport/pckbportvar.h> +#include <dev/ic/i8042reg.h> + +#include <machine/intr.h> + +#include <machine/xenkbcvar.h> +#include <machine/xen.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs/kbd.h> +#include <machine/evtchn.h> + +#define KBC_DELAY DELAY(1000) +#define KBC_TIMEOUT 250 + +#define XENKBC_NSLOTS 2 + +/* data per slave device */ +struct xenkbc_slotdata { + int xsd_polling; /* don't process data in interrupt handler */ + int xsd_poll_data; /* data read from inr handler if polling */ + int xsd_poll_stat; /* status read from inr handler if polling */ +#if NRND > 0 + rndsource_element_t xsd_rnd_source; +#endif +}; + +struct xenkbc_internal { + struct xenkbc_softc *xi_sc; + struct pckbport_tag *xi_pt; + struct xenkbc_slotdata *xi_slotdata[XENKBC_NSLOTS]; + int xi_flags; + int xi_data; + int xi_8042cmdbyte; +}; + +#define XI_CONSOLE_FLAG 0x01 +#define XI_HASAUX_FLAG 0x02 + +#define XI_CONSOLE(xi) ((xi)->xi_flags & XI_CONSOLE_FLAG) +#define XI_HASAUX(xi) ((xi)->xi_flags & 
XI_HASAUX_FLAG) + +#define XI_SETCONSOLE(xi,on) \ + ((on) ? ((xi)->xi_flags |= XI_CONSOLE_FLAG) : \ + ((xi)->xi_flags &= ~XI_CONSOLE_FLAG)) +#define XI_SETHASAUX(xi,on) \ + ((on) ? ((xi)->xi_flags |= XI_HASAUX_FLAG) : \ + ((xi)->xi_flags &= ~XI_HASAUX_FLAG)) + +static int xenkbc_match(struct device *, struct cfdata *, void *); +static void xenkbc_attach(struct device *, struct device *, void *); + +static int xenkbc_xt_translation(void *, pckbport_slot_t, int); +static void xenkbc_init_slotdata(struct xenkbc_slotdata *); + +static int xenkbc_get8042cmd (struct xenkbc_internal *); +static int xenkbc_put8042cmd (struct xenkbc_internal *); +static int xenkbc_send_devcmd(void *, pckbport_slot_t, u_char); +static int xenkbc_send_cmd(void *, u_char); +static int xenkbc_send_data(void *, u_char); +static int xenkbc_poll_data1(void *, pckbport_slot_t); + +static void xenkbc_slot_enable(void *, pckbport_slot_t, int); +static void xenkbc_intr_establish(void *, pckbport_slot_t); +static void xenkbc_set_poll(void *, pckbport_slot_t, int); + +static int xenkbc_intr(void *); + +CFATTACH_DECL(xenkbc, sizeof(struct xenkbc_softc), + xenkbc_match, xenkbc_attach, NULL, NULL); + +static struct pckbport_accessops const xenkbc_ops = { + xenkbc_xt_translation, + xenkbc_send_devcmd, + xenkbc_poll_data1, + xenkbc_slot_enable, + xenkbc_intr_establish, + xenkbc_set_poll +}; + +static struct xenkbc_internal xenkbc_consdata; +static struct xenkbc_slotdata xenkbc_cons_slotdata; + +/* #define XENKBCDEBUG */ +#ifdef XENKBCDEBUG +#define DPRINTF(x) printf x +#else +#define DPRINTF(x) +#endif + + +static int +xenkbc_getstatus(struct xenkbc_internal *xi) +{ + long res; + + res = HYPERVISOR_kbd_op(KBD_OP_READ, 0); + if (res < 0) { + xi->xi_data = 0; + return 0; + } + xi->xi_data = KBD_CODE_SCANCODE(res); + return KBD_CODE_STATUS(res); +} + +static int +xenkbc_wait_output(struct xenkbc_internal *xi) +{ + u_int i; + + for (i = KBC_TIMEOUT; i; i--) { + if ((xenkbc_getstatus(xi) & KBS_IBF) == 0) + return 
(1); + KBC_DELAY; + } + return (0); +} + +static int +xenkbc_match(struct device *parent, struct cfdata *cf, void *aux) +{ + struct xenkbc_attach_args *xa = aux; + + if ((xen_start_info.flags & SIF_PRIVILEGED) == 0) + return 0; + + if (strcmp(xa->xa_device, "xenkbc")) + return 0; + + return 1; +} + +static int +xenkbc_attach_slot(struct xenkbc_softc *xs, pckbport_slot_t slot) +{ + struct xenkbc_internal *xi = xs->sc_xi; + struct device *child; + int alloced = 0; + + if (xi->xi_slotdata[slot] == NULL) { + xi->xi_slotdata[slot] = malloc(sizeof(struct xenkbc_slotdata), + M_DEVBUF, M_NOWAIT); + if (xi->xi_slotdata[slot] == NULL) { + printf("%s: no memory\n", xs->sc_dev.dv_xname); + return 0; + } + xenkbc_init_slotdata(xi->xi_slotdata[slot]); + alloced++; + } + + child = pckbport_attach_slot(&xs->sc_dev, xi->xi_pt, slot); + + if (child == NULL && alloced) { + free(xi->xi_slotdata[slot], M_DEVBUF); + xi->xi_slotdata[slot] = NULL; + } + +#if NRND > 0 + if (child != NULL && xi->xi_slotdata[slot] != NULL) + rnd_attach_source(&xi->xi_slotdata[slot]->xsd_rnd_source, + child->dv_xname, RND_TYPE_TTY, 0); +#endif + + return child != NULL; +} + +static void +xenkbc_attach(struct device *parent, struct device *self, void *aux) +{ + /* struct xenkbc_attach_args *xa = aux; */ + struct xenkbc_softc *xs = (struct xenkbc_softc *)self; + struct xenkbc_internal *xi; + int res; + u_char cmdbits = 0; + + if (XI_CONSOLE(&xenkbc_consdata)) + xi = &xenkbc_consdata; + else { + xi = malloc(sizeof(struct xenkbc_internal), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (xi == NULL) { + aprint_error(": no memory\n"); + return; + } + xi->xi_8042cmdbyte = KC8_CPU; + } + + aprint_normal(": Xen Keyboard/Mouse Device\n"); + + xs->sc_xi = xi; + xi->xi_sc = xs; + + event_set_handler(_EVENT_PS2, &xenkbc_intr, xi, IPL_TTY); + hypervisor_enable_event(_EVENT_PS2); + + xi->xi_pt = pckbport_attach(xi, &xenkbc_ops); + + /* flush */ + xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT); + + /* set initial cmd byte */ + if 
(!xenkbc_put8042cmd(xi)) { + printf("kbc: cmd word write error\n"); + return; + } + + if (xenkbc_attach_slot(xs, PCKBPORT_KBD_SLOT)) + cmdbits |= KC8_KENABLE; + + /* + * Check aux port ok. + */ + if (!xenkbc_send_cmd(xi, KBC_AUXECHO)) { + printf("kbc: aux echo error 1\n"); + goto nomouse; + } + if (!xenkbc_wait_output(xi)) { + printf("kbc: aux echo error 2\n"); + goto nomouse; + } + XI_SETHASAUX(xi, 1); + xenkbc_send_data(xi, 0x5a); /* a random value */ + res = xenkbc_poll_data1(xi, PCKBPORT_AUX_SLOT); + if (res != -1) { + /* + * In most cases, the 0x5a gets echoed. + * Some older controllers (Gateway 2000 circa 1993) + * return 0xfe here. + * We are satisfied if there is anything in the + * aux output buffer. + */ + if (xenkbc_attach_slot(xs, PCKBPORT_AUX_SLOT)) + cmdbits |= KC8_MENABLE; + } else { +#ifdef XENKBCDEBUG + printf("kbc: aux echo test failed\n"); +#endif + XI_SETHASAUX(xi, 0); + } + + nomouse: + /* enable needed interrupts */ + xi->xi_8042cmdbyte |= cmdbits; + if (!xenkbc_put8042cmd(xi)) + printf("kbc: cmd word write error\n"); +} + +static void +xenkbc_init_slotdata(struct xenkbc_slotdata *xsd) +{ + + xsd->xsd_polling = 0; +} + +/* + * Get the current command byte. + */ +static int +xenkbc_get8042cmd(struct xenkbc_internal *xi) +{ + int data; + + if (!xenkbc_send_cmd(xi, K_RDCMDBYTE)) + return 0; + data = xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT); + if (data == -1) + return 0; + xi->xi_8042cmdbyte = data; + return 1; +} + +/* + * Pass command byte to keyboard controller (8042). 
+ */ +static int +xenkbc_put8042cmd(struct xenkbc_internal *xi) +{ + + if (!xenkbc_send_cmd(xi, K_LDCMDBYTE)) + return 0; + if (!xenkbc_wait_output(xi)) + return 0; + return xenkbc_send_data(xi, xi->xi_8042cmdbyte); +} + +static int +xenkbc_send_devcmd(void *cookie, pckbport_slot_t slot, u_char devcmd) +{ + + DPRINTF(("send_devcmd %x\n", devcmd)); + + if (slot == PCKBPORT_AUX_SLOT) { + if (!xenkbc_send_cmd(cookie, KBC_AUXWRITE)) { + DPRINTF(("xenkbc_send_devcmd: KBC_AUXWRITE failed\n")); + return 0; + } + } + if (!xenkbc_wait_output(cookie)) { + DPRINTF(("xenkbc_send_devcmd: wait_output failed\n")); + return 0; + } + return xenkbc_send_data(cookie, devcmd); +} + +static int +xenkbc_send_cmd(void *cookie, u_char cmd) +{ + struct xenkbc_internal *xi = cookie; + + DPRINTF(("send_cmd %x\n", cmd)); + xenkbc_wait_output(xi); + return !HYPERVISOR_kbd_op(KBD_OP_WRITECOMMAND, cmd); +} + +static int +xenkbc_send_data(void *cookie, u_char output) +{ + struct xenkbc_internal *xi = cookie; + + DPRINTF(("send_data %x\n", output)); + xenkbc_wait_output(xi); + return !HYPERVISOR_kbd_op(KBD_OP_WRITEOUTPUT, output); +} + +static int +xenkbc_poll_data1(void *cookie, pckbport_slot_t slot) +{ + struct xenkbc_internal *xi = cookie; + struct xenkbc_slotdata *xsd = xi->xi_slotdata[slot]; + int s; + u_char stat, c; + int i = 1000; + + s = splhigh(); + + if (xsd && xsd->xsd_polling && xsd->xsd_poll_data != -1 && + xsd->xsd_poll_stat != -1) { + stat = xsd->xsd_poll_stat; + c = xsd->xsd_poll_data; + xsd->xsd_poll_data = -1; + xsd->xsd_poll_stat = -1; + goto process; + } + + DELAY(10); + for (; i; i--) { + stat = xenkbc_getstatus(xi); + if (stat & KBS_DIB) { + c = xi->xi_data; + DELAY(10); + process: + if (XI_HASAUX(xi) && (stat & 0x20)) { /* aux data */ + if (slot != PCKBPORT_AUX_SLOT) { +#ifdef XENKBCDEBUG + printf("lost aux 0x%x\n", c); +#endif + continue; + } + } else { + if (slot == PCKBPORT_AUX_SLOT) { +#ifdef XENKBCDEBUG + printf("lost kbd 0x%x\n", c); +#endif + continue; + } + } + 
splx(s); + DPRINTF(("poll -> %x stat %x\n", c, stat)); + return c; + } + } + + DPRINTF(("poll failed -> -1\n")); + splx(s); + return -1; +} + +/* + * switch scancode translation on / off + * return nonzero on success + */ +static int +xenkbc_xt_translation(void *cookie, pckbport_slot_t slot, int on) +{ + struct xenkbc_internal *xi = cookie; + int ison; + + if (slot != PCKBPORT_KBD_SLOT) { + /* translation only for kbd slot */ + if (on) + return 0; + else + return 1; + } + + ison = xi->xi_8042cmdbyte & KC8_TRANS; + if ((on && ison) || (!on && !ison)) + return 1; + + xi->xi_8042cmdbyte ^= KC8_TRANS; + if (!xenkbc_put8042cmd(xi)) + return 0; + + /* read back to be sure */ + if (!xenkbc_get8042cmd(xi)) + return 0; + + ison = xi->xi_8042cmdbyte & KC8_TRANS; + if ((on && ison) || (!on && !ison)) + return 1; + return 0; +} + +static const struct xenkbc_portcmd { + u_char cmd_en, cmd_dis; +} xenkbc_portcmd[2] = { + { + KBC_KBDENABLE, KBC_KBDDISABLE, + }, { + KBC_AUXENABLE, KBC_AUXDISABLE, + } +}; + +static void +xenkbc_slot_enable(void *cookie, pckbport_slot_t slot, int on) +{ + struct xenkbc_internal *xi = cookie; + const struct xenkbc_portcmd *cmd; + + cmd = &xenkbc_portcmd[slot]; + + DPRINTF(("slot enable %d -> %d\n", slot, on)); + xenkbc_send_cmd(xi, on ? cmd->cmd_en : cmd->cmd_dis); +} + + +static void +xenkbc_intr_establish(void *cookie, pckbport_slot_t slot) +{ + +} + +static void +xenkbc_set_poll(void *cookie, pckbport_slot_t slot, int on) +{ + struct xenkbc_internal *xi = cookie; + + DPRINTF(("xenkbc_set_poll %d -> %d\n", slot, on)); + + xi->xi_slotdata[slot]->xsd_polling = on; + + if (on) { + xi->xi_slotdata[slot]->xsd_poll_data = -1; + xi->xi_slotdata[slot]->xsd_poll_stat = -1; + } else { + int s; + + /* + * If disabling polling on a device that's been configured, + * make sure there are no bytes left in the FIFO, holding up + * the interrupt line. Otherwise we won't get any further + * interrupts. 
+ */ + s = spltty(); + xenkbc_intr(xi); + splx(s); + } +} + +static int +xenkbc_intr(void *self) +{ + struct xenkbc_internal *xi = self; + u_char stat; + pckbport_slot_t slot; + struct xenkbc_slotdata *xsd; + int served = 0; + + for (;;) { + stat = xenkbc_getstatus(xi); + if (!(stat & KBS_DIB)) + break; + + served = 1; + + slot = (XI_HASAUX(xi) && (stat & 0x20)) ? + PCKBPORT_AUX_SLOT : PCKBPORT_KBD_SLOT; + xsd = xi->xi_slotdata[slot]; + + if (xsd == NULL) + continue; + +#if NRND > 0 + rnd_add_uint32(&xsd->xsd_rnd_source, + (stat << 8) | xi->xi_data); +#endif + + if (xsd->xsd_polling) { + xsd->xsd_poll_data = xi->xi_data; + xsd->xsd_poll_stat = stat; + break; /* xenkbc_poll_data() will get it */ + } + + pckbportintr(xi->xi_pt, slot, xi->xi_data); + } + + return served; +} + +int +xenkbc_cnattach(pckbport_slot_t slot) +{ + struct xenkbc_internal *xi = &xenkbc_consdata; + int ret; + + /* flush */ + (void) xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT); + + /* init cmd byte, enable ports */ + xenkbc_consdata.xi_8042cmdbyte = KC8_CPU; + if (!xenkbc_put8042cmd(xi)) { + printf("kbc: cmd word write error\n"); + return EIO; + } + + ret = pckbport_cnattach(xi, &xenkbc_ops, slot); + + xi->xi_slotdata[slot] = &xenkbc_cons_slotdata; + xenkbc_init_slotdata(xi->xi_slotdata[slot]); + XI_SETCONSOLE(xi, 1); + + return ret; +} diff --git a/netbsd-2.0-xen-sparse/sys/nfs/files.nfs b/netbsd-2.0-xen-sparse/sys/nfs/files.nfs new file mode 100644 index 0000000000..228c0c890f --- /dev/null +++ b/netbsd-2.0-xen-sparse/sys/nfs/files.nfs @@ -0,0 +1,34 @@ +# $NetBSD: files.nfs,v 1.3 2004/03/11 21:48:43 cl Exp $ + +deffs fs_nfs.h NFS + +defflag opt_nfs_boot.h NFS_BOOT_BOOTP NFS_BOOT_BOOTPARAM NFS_BOOT_DHCP + NFS_BOOT_GATEWAY NFS_BOOT_TCP + NFS_BOOT_BOOTSTATIC + +defparam opt_nfs_boot.h NFS_BOOT_BOOTP_REQFILE NFS_BOOT_OPTIONS + NFS_BOOT_RWSIZE + NFS_BOOTSTATIC_MYIP NFS_BOOTSTATIC_GWIP + NFS_BOOTSTATIC_MASK NFS_BOOTSTATIC_SERVADDR + NFS_BOOTSTATIC_SERVER + +defflag opt_nfs.h NFS_V2_ONLY + +defflag 
NFSSERVER + +file nfs/krpc_subr.c nfs +file nfs/nfs_bio.c nfs +file nfs/nfs_boot.c nfs +file nfs/nfs_bootdhcp.c nfs & (nfs_boot_bootp | nfs_boot_dhcp) +file nfs/nfs_bootparam.c nfs & nfs_boot_bootparam +file nfs/nfs_bootstatic.c nfs & nfs_boot_bootstatic +file nfs/nfs_kq.c nfs +file nfs/nfs_node.c nfs +file nfs/nfs_nqlease.c nfsserver | nfs +file nfs/nfs_serv.c nfsserver +file nfs/nfs_socket.c nfsserver | nfs +file nfs/nfs_srvcache.c nfsserver +file nfs/nfs_subs.c nfsserver | nfs +file nfs/nfs_syscalls.c nfsserver | nfs +file nfs/nfs_vfsops.c nfs +file nfs/nfs_vnops.c nfs |