diff options
author | iap10@freefall.cl.cam.ac.uk <iap10@freefall.cl.cam.ac.uk> | 2005-03-21 07:58:08 +0000 |
---|---|---|
committer | iap10@freefall.cl.cam.ac.uk <iap10@freefall.cl.cam.ac.uk> | 2005-03-21 07:58:08 +0000 |
commit | d73b5730fbb7f3d0fd7fcd9a9b6e36d71d33ade0 (patch) | |
tree | bc7051351d4d09c13c29b247ee34c3f22ec28a3f /freebsd-5.3-xen-sparse | |
parent | a280a68e6317b8d274296935eee67d12788beeb4 (diff) | |
download | xen-d73b5730fbb7f3d0fd7fcd9a9b6e36d71d33ade0.tar.gz xen-d73b5730fbb7f3d0fd7fcd9a9b6e36d71d33ade0.tar.bz2 xen-d73b5730fbb7f3d0fd7fcd9a9b6e36d71d33ade0.zip |
bitkeeper revision 1.1159.272.3 (423e7e90uxPqdRoA4EvOUikif-yhXA)
Check-in of the sparse tree for FreeBSD 5.3 (version 050317)
This currently supports running as a domU.
- to create freebsd-5.3-xenU run fbsdxensetup from anywhere in the tree
- once created go to freebsd-5.3-xenU on a FreeBSD 5.3 machine, run
xenfbsd_kernel_build - you'll find kernel and kernel.debug under
i386-xen/compile/XENCONF
See http://www.fsmware.com/xenofreebsd/5.3/xenbsdsetup.txt
Thanks to NetApp for their contributions in support of the FreeBSD port to Xen
.
Signed-off-by: Kip Macy <kip.macy@gmail.com>
Signed-off-by: ian.pratt@cl.cam.ac.uk
Diffstat (limited to 'freebsd-5.3-xen-sparse')
72 files changed, 32094 insertions, 0 deletions
diff --git a/freebsd-5.3-xen-sparse/conf/Makefile.i386-xen b/freebsd-5.3-xen-sparse/conf/Makefile.i386-xen new file mode 100644 index 0000000000..80e1cdd35c --- /dev/null +++ b/freebsd-5.3-xen-sparse/conf/Makefile.i386-xen @@ -0,0 +1,51 @@ +# Makefile.i386 -- with config changes. +# Copyright 1990 W. Jolitz +# from: @(#)Makefile.i386 7.1 5/10/91 +# $FreeBSD: src/sys/conf/Makefile.i386,v 1.259 2003/04/15 21:29:11 phk Exp $ +# +# Makefile for FreeBSD +# +# This makefile is constructed from a machine description: +# config machineid +# Most changes should be made in the machine description +# /sys/i386/conf/``machineid'' +# after which you should do +# config machineid +# Generic makefile changes should be made in +# /sys/conf/Makefile.i386 +# after which config should be rerun for all machines. +# + +# Which version of config(8) is required. +%VERSREQ= 500013 + +STD8X16FONT?= iso + + + +.if !defined(S) +.if exists(./@/.) +S= ./@ +.else +S= ../../.. +.endif +.endif +.include "$S/conf/kern.pre.mk" +M= i386-xen +MKMODULESENV+= MACHINE=i386-xen +INCLUDES+= -I../../include/xen-public +%BEFORE_DEPEND + +%OBJS + +%FILES.c + +%FILES.s + +%FILES.m + +%CLEAN + +%RULES + +.include "$S/conf/kern.post.mk" diff --git a/freebsd-5.3-xen-sparse/conf/files.i386-xen b/freebsd-5.3-xen-sparse/conf/files.i386-xen new file mode 100644 index 0000000000..189378d469 --- /dev/null +++ b/freebsd-5.3-xen-sparse/conf/files.i386-xen @@ -0,0 +1,294 @@ +# This file tells config what files go into building a kernel, +# files marked standard are always included. +# +# $FreeBSD: src/sys/conf/files.i386,v 1.457 2003/12/03 23:06:30 imp Exp $ +# +# The long compile-with and dependency lines are required because of +# limitations in config: backslash-newline doesn't work in strings, and +# dependency lines other than the first are silently ignored. 
+# +linux_genassym.o optional compat_linux \ + dependency "$S/i386/linux/linux_genassym.c" \ + compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ + no-obj no-implicit-rule \ + clean "linux_genassym.o" +# +linux_assym.h optional compat_linux \ + dependency "$S/kern/genassym.sh linux_genassym.o" \ + compile-with "sh $S/kern/genassym.sh linux_genassym.o > ${.TARGET}" \ + no-obj no-implicit-rule before-depend \ + clean "linux_assym.h" +# +svr4_genassym.o optional compat_svr4 \ + dependency "$S/i386/svr4/svr4_genassym.c" \ + compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ + no-obj no-implicit-rule \ + clean "svr4_genassym.o" +# +svr4_assym.h optional compat_svr4 \ + dependency "$S/kern/genassym.sh svr4_genassym.o" \ + compile-with "sh $S/kern/genassym.sh svr4_genassym.o > ${.TARGET}" \ + no-obj no-implicit-rule before-depend \ + clean "svr4_assym.h" +# +font.h optional sc_dflt_font \ + compile-with "uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x16.fnt && file2c 'static u_char dflt_font_16[16*256] = {' '};' < ${SC_DFLT_FONT}-8x16 > font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x14.fnt && file2c 'static u_char dflt_font_14[14*256] = {' '};' < ${SC_DFLT_FONT}-8x14 >> font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x8.fnt && file2c 'static u_char dflt_font_8[8*256] = {' '};' < ${SC_DFLT_FONT}-8x8 >> font.h" \ + no-obj no-implicit-rule before-depend \ + clean "font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8" +# +atkbdmap.h optional atkbd_dflt_keymap \ + compile-with "/usr/sbin/kbdcontrol -L ${ATKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > atkbdmap.h" \ + no-obj no-implicit-rule before-depend \ + clean "atkbdmap.h" +# +ukbdmap.h optional ukbd_dflt_keymap \ + compile-with "/usr/sbin/kbdcontrol -L ${UKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static 
accentmap_t.* = /static accentmap_t accent_map = /' > ukbdmap.h" \ + no-obj no-implicit-rule before-depend \ + clean "ukbdmap.h" +# +msysosak.o optional fla \ + dependency "$S/contrib/dev/fla/i386/msysosak.o.uu" \ + compile-with "uudecode < $S/contrib/dev/fla/i386/msysosak.o.uu" \ + no-implicit-rule +# +trlld.o optional oltr \ + dependency "$S/contrib/dev/oltr/i386-elf.trlld.o.uu" \ + compile-with "uudecode < $S/contrib/dev/oltr/i386-elf.trlld.o.uu" \ + no-implicit-rule +# +hal.o optional ath_hal \ + dependency "$S/contrib/dev/ath/freebsd/i386-elf.hal.o.uu" \ + compile-with "uudecode < $S/contrib/dev/ath/freebsd/i386-elf.hal.o.uu" \ + no-implicit-rule +# +# +compat/linux/linux_file.c optional compat_linux +compat/linux/linux_getcwd.c optional compat_linux +compat/linux/linux_ioctl.c optional compat_linux +compat/linux/linux_ipc.c optional compat_linux +compat/linux/linux_mib.c optional compat_linux +compat/linux/linux_misc.c optional compat_linux +compat/linux/linux_signal.c optional compat_linux +compat/linux/linux_socket.c optional compat_linux +compat/linux/linux_stats.c optional compat_linux +compat/linux/linux_sysctl.c optional compat_linux +compat/linux/linux_uid16.c optional compat_linux +compat/linux/linux_util.c optional compat_linux +compat/pecoff/imgact_pecoff.c optional pecoff_support +compat/svr4/imgact_svr4.c optional compat_svr4 +compat/svr4/svr4_fcntl.c optional compat_svr4 +compat/svr4/svr4_filio.c optional compat_svr4 +compat/svr4/svr4_ioctl.c optional compat_svr4 +compat/svr4/svr4_ipc.c optional compat_svr4 +compat/svr4/svr4_misc.c optional compat_svr4 +compat/svr4/svr4_resource.c optional compat_svr4 +compat/svr4/svr4_signal.c optional compat_svr4 +compat/svr4/svr4_socket.c optional compat_svr4 +compat/svr4/svr4_sockio.c optional compat_svr4 +compat/svr4/svr4_stat.c optional compat_svr4 +compat/svr4/svr4_stream.c optional compat_svr4 +compat/svr4/svr4_syscallnames.c optional compat_svr4 +compat/svr4/svr4_sysent.c optional compat_svr4 
+compat/svr4/svr4_sysvec.c optional compat_svr4 +compat/svr4/svr4_termios.c optional compat_svr4 +compat/svr4/svr4_ttold.c optional compat_svr4 +contrib/dev/fla/fla.c optional fla +contrib/dev/oltr/if_oltr.c optional oltr +contrib/dev/oltr/trlldbm.c optional oltr +contrib/dev/oltr/trlldhm.c optional oltr +contrib/dev/oltr/trlldmac.c optional oltr +bf_enc.o optional ipsec ipsec_esp \ + dependency "$S/crypto/blowfish/arch/i386/bf_enc.S $S/crypto/blowfish/arch/i386/bf_enc_586.S $S/crypto/blowfish/arch/i386/bf_enc_686.S" \ + compile-with "${CC} -c -I$S/crypto/blowfish/arch/i386 ${ASM_CFLAGS} ${WERROR} ${.IMPSRC}" \ + no-implicit-rule +crypto/des/arch/i386/des_enc.S optional ipsec ipsec_esp +crypto/des/des_ecb.c optional netsmbcrypto +crypto/des/arch/i386/des_enc.S optional netsmbcrypto +crypto/des/des_setkey.c optional netsmbcrypto +bf_enc.o optional crypto \ + dependency "$S/crypto/blowfish/arch/i386/bf_enc.S $S/crypto/blowfish/arch/i386/bf_enc_586.S $S/crypto/blowfish/arch/i386/bf_enc_686.S" \ + compile-with "${CC} -c -I$S/crypto/blowfish/arch/i386 ${ASM_CFLAGS} ${WERROR} ${.IMPSRC}" \ + no-implicit-rule +crypto/des/arch/i386/des_enc.S optional crypto +crypto/des/des_ecb.c optional crypto +crypto/des/des_setkey.c optional crypto +dev/ar/if_ar.c optional ar +dev/ar/if_ar_pci.c optional ar pci +dev/cx/csigma.c optional cx +dev/cx/cxddk.c optional cx +dev/cx/if_cx.c optional cx +dev/dgb/dgb.c count dgb +dev/fb/fb.c optional fb +dev/fb/fb.c optional vga +dev/fb/splash.c optional splash +dev/fb/vga.c optional vga +dev/kbd/atkbd.c optional atkbd +dev/kbd/atkbdc.c optional atkbdc +dev/kbd/kbd.c optional atkbd +dev/kbd/kbd.c optional kbd +dev/kbd/kbd.c optional sc +dev/kbd/kbd.c optional ukbd +dev/kbd/kbd.c optional vt +dev/mem/memutil.c standard +dev/random/nehemiah.c standard +dev/ppc/ppc.c optional ppc +dev/ppc/ppc_puc.c optional ppc puc pci +dev/sio/sio.c optional sio +dev/sio/sio_isa.c optional sio isa +dev/syscons/schistory.c optional sc +dev/syscons/scmouse.c optional 
sc +dev/syscons/scterm.c optional sc +dev/syscons/scterm-dumb.c optional sc +dev/syscons/scterm-sc.c optional sc +dev/syscons/scvesactl.c optional sc vga vesa +dev/syscons/scvgarndr.c optional sc vga +dev/syscons/scvidctl.c optional sc +dev/syscons/scvtb.c optional sc +dev/syscons/syscons.c optional sc +dev/syscons/sysmouse.c optional sc +dev/uart/uart_cpu_i386.c optional uart +geom/geom_bsd.c standard +geom/geom_bsd_enc.c standard +geom/geom_mbr.c standard +geom/geom_mbr_enc.c standard +i386/acpica/OsdEnvironment.c optional acpi +i386/acpica/acpi_machdep.c optional acpi +i386/acpica/acpi_wakeup.c optional acpi +acpi_wakecode.h optional acpi \ + dependency "$S/i386/acpica/acpi_wakecode.S" \ + compile-with "${MAKE} -f $S/i386/acpica/Makefile MAKESRCPATH=$S/i386/acpica" \ + no-obj no-implicit-rule before-depend \ + clean "acpi_wakecode.h acpi_wakecode.o acpi_wakecode.bin" +# +i386/acpica/madt.c optional acpi apic +i386/bios/mca_machdep.c optional mca +i386/bios/smapi.c optional smapi +i386/bios/smapi_bios.S optional smapi +i386/bios/smbios.c optional smbios +i386/bios/vpd.c optional vpd +i386/i386/apic_vector.s optional apic +i386/i386/atomic.c standard \ + compile-with "${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" +i386/i386/autoconf.c standard +i386/i386/busdma_machdep.c standard +i386-xen/i386-xen/critical.c standard +i386/i386/db_disasm.c optional ddb +i386-xen/i386-xen/db_interface.c optional ddb +i386/i386/db_trace.c optional ddb +i386/i386/i386-gdbstub.c optional ddb +i386/i386/dump_machdep.c standard +i386/i386/elf_machdep.c standard +i386-xen/i386-xen/exception.s standard +i386-xen/i386-xen/i686_mem.c standard +i386/i386/identcpu.c standard +i386/i386/in_cksum.c optional inet +i386-xen/i386-xen/initcpu.c standard +i386-xen/i386-xen/intr_machdep.c standard +i386-xen/i386-xen/io_apic.c optional apic +i386/i386/legacy.c standard +i386-xen/i386-xen/locore.s standard no-obj +i386-xen/i386-xen/machdep.c standard +i386/i386/mem.c 
standard +i386-xen/i386-xen/mp_clock.c optional smp +i386-xen/i386-xen/mp_machdep.c optional smp +i386/i386/mpboot.s optional smp +i386-xen/i386-xen/mptable.c optional apic +i386-xen/i386-xen/local_apic.c optional apic +i386/i386/mptable_pci.c optional apic pci +i386/i386/nexus.c standard +i386/i386/uio_machdep.c standard +i386/i386/perfmon.c optional perfmon +i386/i386/perfmon.c optional perfmon profiling-routine +i386-xen/i386-xen/pmap.c standard +i386-xen/i386-xen/support.s standard +i386-xen/i386-xen/swtch.s standard +i386-xen/i386-xen/sys_machdep.c standard +i386-xen/i386-xen/trap.c standard +i386/i386/tsc.c standard +i386-xen/i386-xen/vm_machdep.c standard +i386-xen/i386-xen/clock.c standard + +# xen specific arch-dep files +i386-xen/i386-xen/hypervisor.c standard +i386-xen/i386-xen/xen_machdep.c standard +i386-xen/i386-xen/xen_bus.c standard +i386-xen/i386-xen/evtchn.c standard +i386-xen/i386-xen/ctrl_if.c standard + + +i386/isa/asc.c count asc +i386/isa/ctx.c optional ctx +i386/isa/cy.c count cy +i386/isa/elink.c optional ep +i386/isa/elink.c optional ie +i386/isa/gpib.c optional gp +i386/isa/gsc.c count gsc +i386/isa/istallion.c optional stli nowerror +i386/isa/loran.c optional loran +i386/isa/mse.c optional mse +i386/isa/nmi.c standard + +# drivers +i386-xen/xen/misc/npx.c optional npx +i386-xen/xen/misc/evtchn_dev.c standard +i386-xen/xen/char/console.c standard +i386-xen/xen/netfront/xn_netfront.c standard +i386-xen/xen/blkfront/xb_blkfront.c standard + + + +i386/isa/pcf.c optional pcf +i386/isa/pcvt/pcvt_drv.c optional vt +i386/isa/pcvt/pcvt_ext.c optional vt +i386/isa/pcvt/pcvt_kbd.c optional vt +i386/isa/pcvt/pcvt_out.c optional vt +i386/isa/pcvt/pcvt_sup.c optional vt +i386/isa/pcvt/pcvt_vtf.c optional vt +i386/isa/pmtimer.c optional pmtimer +i386/isa/prof_machdep.c optional profiling-routine +i386/isa/spic.c optional spic +i386/isa/spigot.c count spigot +i386/isa/spkr.c optional speaker +i386/isa/stallion.c optional stl nowerror +i386/isa/vesa.c 
optional vga vesa +i386/isa/wt.c count wt +i386/linux/imgact_linux.c optional compat_linux +i386/linux/linux_dummy.c optional compat_linux +i386/linux/linux_locore.s optional compat_linux \ + dependency "linux_assym.h" +i386/linux/linux_machdep.c optional compat_linux +i386/linux/linux_ptrace.c optional compat_linux +i386/linux/linux_sysent.c optional compat_linux +i386/linux/linux_sysvec.c optional compat_linux +i386/pci/pci_cfgreg.c optional pci +i386/pci/pci_bus.c optional pci +i386/svr4/svr4_locore.s optional compat_svr4 \ + dependency "svr4_assym.h" \ + warning "COMPAT_SVR4 is broken and should be avoided" +i386/svr4/svr4_machdep.c optional compat_svr4 +isa/atkbd_isa.c optional atkbd +isa/atkbdc_isa.c optional atkbdc +isa/fd.c optional fdc +isa/psm.c optional psm +isa/syscons_isa.c optional sc +isa/vga_isa.c optional vga +kern/imgact_aout.c optional compat_aout +kern/imgact_gzip.c optional gzip +libkern/divdi3.c standard +libkern/moddi3.c standard +libkern/qdivrem.c standard +libkern/ucmpdi2.c standard +libkern/udivdi3.c standard +libkern/umoddi3.c standard +libkern/flsl.c standard +libkern/ffsl.c standard + +pci/cy_pci.c optional cy pci +pci/agp_intel.c optional agp +pci/agp_via.c optional agp +pci/agp_sis.c optional agp +pci/agp_ali.c optional agp +pci/agp_amd.c optional agp +pci/agp_i810.c optional agp +pci/agp_nvidia.c optional agp + diff --git a/freebsd-5.3-xen-sparse/conf/ldscript.i386-xen b/freebsd-5.3-xen-sparse/conf/ldscript.i386-xen new file mode 100644 index 0000000000..65cbc852da --- /dev/null +++ b/freebsd-5.3-xen-sparse/conf/ldscript.i386-xen @@ -0,0 +1,134 @@ +/* $FreeBSD: src/sys/conf/ldscript.i386,v 1.9 2003/12/03 07:40:03 phk Exp $ */ +OUTPUT_FORMAT("elf32-i386-freebsd", "elf32-i386-freebsd", "elf32-i386-freebsd") +OUTPUT_ARCH(i386) +ENTRY(btext) +SEARCH_DIR(/usr/lib); +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + . 
= kernbase + SIZEOF_HEADERS; + .interp : { *(.interp) } + .hash : { *(.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .rel.text : + { *(.rel.text) *(.rel.gnu.linkonce.t*) } + .rela.text : + { *(.rela.text) *(.rela.gnu.linkonce.t*) } + .rel.data : + { *(.rel.data) *(.rel.gnu.linkonce.d*) } + .rela.data : + { *(.rela.data) *(.rela.gnu.linkonce.d*) } + .rel.rodata : + { *(.rel.rodata) *(.rel.gnu.linkonce.r*) } + .rela.rodata : + { *(.rela.rodata) *(.rela.gnu.linkonce.r*) } + .rel.got : { *(.rel.got) } + .rela.got : { *(.rela.got) } + .rel.ctors : { *(.rel.ctors) } + .rela.ctors : { *(.rela.ctors) } + .rel.dtors : { *(.rel.dtors) } + .rela.dtors : { *(.rela.dtors) } + .rel.init : { *(.rel.init) } + .rela.init : { *(.rela.init) } + .rel.fini : { *(.rel.fini) } + .rela.fini : { *(.rela.fini) } + .rel.bss : { *(.rel.bss) } + .rela.bss : { *(.rela.bss) } + .rel.plt : { *(.rel.plt) } + .rela.plt : { *(.rela.plt) } + .init : { *(.init) } =0x9090 + .plt : { *(.plt) } + .text : + { + *(.text) + *(.stub) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.gnu.linkonce.t*) + } =0x9090 + _etext = .; + PROVIDE (etext = .); + .fini : { *(.fini) } =0x9090 + .rodata : { *(.rodata) *(.gnu.linkonce.r*) } + .rodata1 : { *(.rodata1) } + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN(0x1000) + (. & (0x1000 - 1)) ; + .data : + { + *(.data) + *(.gnu.linkonce.d*) + CONSTRUCTORS + } + .data1 : { *(.data1) } + . 
= ALIGN(32 / 8); + _start_ctors = .; + PROVIDE (start_ctors = .); + .ctors : + { + *(.ctors) + } + _stop_ctors = .; + PROVIDE (stop_ctors = .); + .dtors : + { + *(.dtors) + } + .got : { *(.got.plt) *(.got) } + .dynamic : { *(.dynamic) } + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : { *(.sdata) } + _edata = .; + PROVIDE (edata = .); + __bss_start = .; + .sbss : { *(.sbss) *(.scommon) } + .bss : + { + *(.dynbss) + *(.bss) + *(COMMON) + } + . = ALIGN(32 / 8); + _end = . ; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* These must appear regardless of . 
*/ +} diff --git a/freebsd-5.3-xen-sparse/conf/options.i386-xen b/freebsd-5.3-xen-sparse/conf/options.i386-xen new file mode 100644 index 0000000000..6bbc509087 --- /dev/null +++ b/freebsd-5.3-xen-sparse/conf/options.i386-xen @@ -0,0 +1,162 @@ +# $FreeBSD: src/sys/conf/options.i386,v 1.204 2003/12/03 23:06:30 imp Exp $ +# Options specific to the i386 platform kernels + +AUTO_EOI_1 opt_auto_eoi.h +AUTO_EOI_2 opt_auto_eoi.h +BROKEN_KEYBOARD_RESET opt_reset.h +COMPAT_OLDISA +I586_PMC_GUPROF opt_i586_guprof.h +MAXMEM +MPTABLE_FORCE_HTT +NO_MIXED_MODE +PERFMON +DISABLE_PSE opt_pmap.h +DISABLE_PG_G opt_pmap.h +PMAP_SHPGPERPROC opt_pmap.h +PPC_PROBE_CHIPSET opt_ppc.h +PPC_DEBUG opt_ppc.h +POWERFAIL_NMI opt_trap.h +MP_WATCHDOG opt_mp_watchdog.h + + + +# Options for emulators. These should only be used at config time, so +# they are handled like options for static filesystems +# (see src/sys/conf/options), except for broken debugging options. +COMPAT_AOUT opt_dontuse.h +IBCS2 opt_dontuse.h +COMPAT_LINUX opt_dontuse.h +COMPAT_SVR4 opt_dontuse.h +DEBUG_SVR4 opt_svr4.h +PECOFF_SUPPORT opt_dontuse.h +PECOFF_DEBUG opt_pecoff.h + +# Change KVM size. Changes things all over the kernel. +KVA_PAGES opt_global.h +XEN opt_global.h +XENDEV opt_xen.h +NOXENDEBUG opt_xen.h +# Physical address extensions and support for >4G ram. As above. 
+PAE opt_global.h + +CLK_CALIBRATION_LOOP opt_clock.h +CLK_USE_I8254_CALIBRATION opt_clock.h +CLK_USE_TSC_CALIBRATION opt_clock.h +TIMER_FREQ opt_clock.h + +CPU_ATHLON_SSE_HACK opt_cpu.h +CPU_BLUELIGHTNING_3X opt_cpu.h +CPU_BLUELIGHTNING_FPU_OP_CACHE opt_cpu.h +CPU_BTB_EN opt_cpu.h +CPU_CYRIX_NO_LOCK opt_cpu.h +CPU_DIRECT_MAPPED_CACHE opt_cpu.h +CPU_DISABLE_5X86_LSSER opt_cpu.h +CPU_DISABLE_CMPXCHG opt_global.h # XXX global, unlike other CPU_* +CPU_DISABLE_SSE opt_cpu.h +CPU_ELAN opt_cpu.h +CPU_ELAN_XTAL opt_cpu.h +CPU_ELAN_PPS opt_cpu.h +CPU_ENABLE_SSE opt_cpu.h +CPU_FASTER_5X86_FPU opt_cpu.h +CPU_GEODE opt_cpu.h +CPU_I486_ON_386 opt_cpu.h +CPU_IORT opt_cpu.h +CPU_L2_LATENCY opt_cpu.h +CPU_LOOP_EN opt_cpu.h +CPU_PPRO2CELERON opt_cpu.h +CPU_RSTK_EN opt_cpu.h +CPU_SOEKRIS opt_cpu.h +CPU_SUSP_HLT opt_cpu.h +CPU_UPGRADE_HW_CACHE opt_cpu.h +CPU_WT_ALLOC opt_cpu.h +CYRIX_CACHE_REALLY_WORKS opt_cpu.h +CYRIX_CACHE_WORKS opt_cpu.h +NO_F00F_HACK opt_cpu.h +NO_MEMORY_HOLE opt_cpu.h + +# The CPU type affects the endian conversion functions all over the kernel. 
+I386_CPU opt_global.h +I486_CPU opt_global.h +I586_CPU opt_global.h +I686_CPU opt_global.h + +VGA_ALT_SEQACCESS opt_vga.h +VGA_DEBUG opt_vga.h +VGA_NO_FONT_LOADING opt_vga.h +VGA_NO_MODE_CHANGE opt_vga.h +VGA_SLOW_IOACCESS opt_vga.h +VGA_WIDTH90 opt_vga.h + +VESA +VESA_DEBUG opt_vesa.h + +PSM_HOOKRESUME opt_psm.h +PSM_RESETAFTERSUSPEND opt_psm.h +PSM_DEBUG opt_psm.h + +ATKBD_DFLT_KEYMAP opt_atkbd.h + +# pcvt(4) has a bunch of options +FAT_CURSOR opt_pcvt.h +XSERVER opt_pcvt.h +PCVT_24LINESDEF opt_pcvt.h +PCVT_CTRL_ALT_DEL opt_pcvt.h +PCVT_META_ESC opt_pcvt.h +PCVT_NSCREENS opt_pcvt.h +PCVT_PRETTYSCRNS opt_pcvt.h +PCVT_SCANSET opt_pcvt.h +PCVT_SCREENSAVER opt_pcvt.h +PCVT_USEKBDSEC opt_pcvt.h +PCVT_VT220KEYB opt_pcvt.h +PCVT_GREENSAVER opt_pcvt.h + +# Video spigot +SPIGOT_UNSECURE opt_spigot.h + +# Enables NETGRAPH support for Cronyx adapters +NETGRAPH_CRONYX opt_ng_cronyx.h + +# ------------------------------- +# isdn4bsd: passive ISA cards +# ------------------------------- +TEL_S0_8 opt_i4b.h +TEL_S0_16 opt_i4b.h +TEL_S0_16_3 opt_i4b.h +AVM_A1 opt_i4b.h +USR_STI opt_i4b.h +ITKIX1 opt_i4b.h +ELSA_PCC16 opt_i4b.h +# ------------------------------- +# isdn4bsd: passive ISA PnP cards +# ------------------------------- +CRTX_S0_P opt_i4b.h +DRN_NGO opt_i4b.h +TEL_S0_16_3_P opt_i4b.h +SEDLBAUER opt_i4b.h +DYNALINK opt_i4b.h +ASUSCOM_IPAC opt_i4b.h +ELSA_QS1ISA opt_i4b.h +SIEMENS_ISURF2 opt_i4b.h +EICON_DIVA opt_i4b.h +COMPAQ_M610 opt_i4b.h +# ------------------------------- +# isdn4bsd: passive PCI cards +# ------------------------------- +ELSA_QS1PCI opt_i4b.h +# ------------------------------- +# isdn4bsd: misc options +# ------------------------------- +# temporary workaround for SMP machines +I4B_SMP_WORKAROUND opt_i4b.h +# enable VJ compression code for ipr i/f +IPR_VJ opt_i4b.h +IPR_LOG opt_i4b.h + +# Device options +DEV_ACPI opt_acpi.h +DEV_APIC opt_apic.h +DEV_NPX opt_npx.h + +# ------------------------------- +# EOF +# ------------------------------- diff 
--git a/freebsd-5.3-xen-sparse/fbsdxensetup b/freebsd-5.3-xen-sparse/fbsdxensetup new file mode 100644 index 0000000000..3d024c370e --- /dev/null +++ b/freebsd-5.3-xen-sparse/fbsdxensetup @@ -0,0 +1,39 @@ +#!/bin/csh -f + +setenv XENROOT `bk root` +rm -rf $XENROOT/fbsdtmp $XENROOT/freebsd-5.3-xenU +mkdir -p $XENROOT/fbsdtmp +cd $XENROOT/fbsdtmp +echo "step 1" +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.aa +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ab +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ac +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ad +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ae +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.af +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ag +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ah +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ai +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.aj +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.ak +wget ftp://ftp.freebsd.org/pub/FreeBSD/releases/i386/5.3-RELEASE/src/ssys.al +mkdir -p foo +cat ssys.?? 
| tar --unlink -xpzf - -C foo/ +mkdir -p $XENROOT/freebsd-5.3-xenU +mv foo/sys/* $XENROOT/freebsd-5.3-xenU +cd $XENROOT +rm -rf $XENROOT/fbsdtmp +echo "step 2" +mkdir -p $XENROOT/freebsd-5.3-xenU/i386-xen/include +cd $XENROOT/freebsd-5.3-xenU/i386-xen/include/ +foreach file (../../i386/include/*) + ln -s $file +end +echo "step 3" +cd $XENROOT/freebsd-5.3-xen-sparse +echo "step 4" +./mkbuildtree ../freebsd-5.3-xenU +echo "step 5" +cd $XENROOT/freebsd-5.3-xenU/i386-xen/include +ln -s $XENROOT/xen/include/public xen-public +echo "done" diff --git a/freebsd-5.3-xen-sparse/i386-xen/Makefile b/freebsd-5.3-xen-sparse/i386-xen/Makefile new file mode 100644 index 0000000000..f33c7a5af6 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/Makefile @@ -0,0 +1,40 @@ +# $FreeBSD: src/sys/i386/Makefile,v 1.11 2002/06/21 06:18:02 mckusick Exp $ +# @(#)Makefile 8.1 (Berkeley) 6/11/93 + +# Makefile for i386 links, tags file + +# SYS is normally set in Make.tags.inc +# SYS=/sys +SYS=/nsys + +TAGDIR= i386 + +.include "../kern/Make.tags.inc" + +all: + @echo "make links or tags only" + +# Directories in which to place i386 tags links +DI386= apm i386 ibcs2 include isa linux + +links:: + -for i in ${COMMDIR1}; do \ + (cd $$i && { rm -f tags; ln -s ../${TAGDIR}/tags tags; }) done + -for i in ${COMMDIR2}; do \ + (cd $$i && { rm -f tags; ln -s ../../${TAGDIR}/tags tags; }) done + -for i in ${DI386}; do \ + (cd $$i && { rm -f tags; ln -s ../tags tags; }) done + +SI386= ${SYS}/i386/apm/*.[ch] \ + ${SYS}/i386/i386/*.[ch] ${SYS}/i386/ibcs2/*.[ch] \ + ${SYS}/i386/include/*.[ch] ${SYS}/i386/isa/*.[ch] \ + ${SYS}/i386/linux/*.[ch] +AI386= ${SYS}/i386/i386/*.s + +tags:: + -ctags -wdt ${COMM} ${SI386} + egrep "^ENTRY(.*)|^ALTENTRY(.*)" ${AI386} | \ + sed "s;\([^:]*\):\([^(]*\)(\([^, )]*\)\(.*\);\3 \1 /^\2(\3\4$$/;" \ + >> tags + sort -o tags tags + chmod 444 tags diff --git a/freebsd-5.3-xen-sparse/i386-xen/compile/.cvsignore b/freebsd-5.3-xen-sparse/i386-xen/compile/.cvsignore new file mode 100644 
index 0000000000..232298edb1 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/compile/.cvsignore @@ -0,0 +1 @@ +[A-Za-z0-9]* diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/GENERIC b/freebsd-5.3-xen-sparse/i386-xen/conf/GENERIC new file mode 100644 index 0000000000..6a70639bda --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/GENERIC @@ -0,0 +1,273 @@ +# +# GENERIC -- Generic kernel configuration file for FreeBSD/i386 +# +# For more information on this file, please read the handbook section on +# Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD: src/sys/i386/conf/GENERIC,v 1.394.2.3 2004/01/26 19:42:11 nectar Exp $ + +machine i386 +cpu I486_CPU +cpu I586_CPU +cpu I686_CPU +ident GENERIC + +#To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" #Default places to look for devices. 
+ +#makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols + +options SCHED_4BSD #4BSD scheduler +options INET #InterNETworking +options INET6 #IPv6 communications protocols +options FFS #Berkeley Fast Filesystem +options SOFTUPDATES #Enable FFS soft updates support +options UFS_ACL #Support for access control lists +options UFS_DIRHASH #Improve performance on big directories +options MD_ROOT #MD is a potential root device +options NFSCLIENT #Network Filesystem Client +options NFSSERVER #Network Filesystem Server +options NFS_ROOT #NFS usable as /, requires NFSCLIENT +options MSDOSFS #MSDOS Filesystem +options CD9660 #ISO 9660 Filesystem +options PROCFS #Process filesystem (requires PSEUDOFS) +options PSEUDOFS #Pseudo-filesystem framework +options COMPAT_43 #Compatible with BSD 4.3 [KEEP THIS!] +options COMPAT_FREEBSD4 #Compatible with FreeBSD4 +options SCSI_DELAY=15000 #Delay (in ms) before probing SCSI +options KTRACE #ktrace(1) support +options SYSVSHM #SYSV-style shared memory +options SYSVMSG #SYSV-style message queues +options SYSVSEM #SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING #Posix P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options AHC_REG_PRETTY_PRINT # Print register bitfields in debug + # output. Adds ~128k to driver. +options AHD_REG_PRETTY_PRINT # Print register bitfields in debug + # output. Adds ~215k to driver. 
+options PFIL_HOOKS # pfil(9) framework + +# Debugging for use in -current +#options DDB #Enable the kernel debugger +#options INVARIANTS #Enable calls of extra sanity checking +options INVARIANT_SUPPORT #Extra sanity checks of internal structures, required by INVARIANTS +#options WITNESS #Enable checks to detect deadlocks and cycles +#options WITNESS_SKIPSPIN #Don't run witness on spinlocks for speed + +# To make an SMP kernel, the next two are needed +options SMP # Symmetric MultiProcessor Kernel +device apic # I/O APIC + +device isa +device eisa +device pci + +# Floppy drives +device fdc + +# ATA and ATAPI devices +device ata +device atadisk # ATA disk drives +device ataraid # ATA RAID drives +device atapicd # ATAPI CDROM drives +device atapifd # ATAPI floppy drives +device atapist # ATAPI tape drives +options ATA_STATIC_ID #Static device numbering + +# SCSI Controllers +device ahb # EISA AHA1742 family +device ahc # AHA2940 and onboard AIC7xxx devices +device ahd # AHA39320/29320 and onboard AIC79xx devices +device amd # AMD 53C974 (Tekram DC-390(T)) +device isp # Qlogic family +device mpt # LSI-Logic MPT-Fusion +#device ncr # NCR/Symbios Logic +device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') +device trm # Tekram DC395U/UW/F DC315U adapters + +device adv # Advansys SCSI adapters +device adw # Advansys wide SCSI adapters +device aha # Adaptec 154x SCSI adapters +device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. 
+device bt # Buslogic/Mylex MultiMaster SCSI adapters + +device ncv # NCR 53C500 +device nsp # Workbit Ninja SCSI-3 +device stg # TMC 18C30/18C50 + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device ch # SCSI media changers +device da # Direct Access (disks) +device sa # Sequential Access (tape etc) +device cd # CD +device pass # Passthrough device (direct SCSI access) +device ses # SCSI Environmental Services (and SAF-TE) + +# RAID controllers interfaced to the SCSI subsystem +device amr # AMI MegaRAID +device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID +device ciss # Compaq Smart RAID 5* +device dpt # DPT Smartcache III, IV - See NOTES for options +device iir # Intel Integrated RAID +device ips # IBM (Adaptec) ServeRAID +device mly # Mylex AcceleRAID/eXtremeRAID + +# RAID controllers +device aac # Adaptec FSA RAID +device aacp # SCSI passthrough for aac (requires CAM) +device ida # Compaq Smart RAID +device mlx # Mylex DAC960 family +device pst # Promise Supertrak SX6000 +device twe # 3ware ATA RAID + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse + +device vga # VGA video card driver + +device splash # Splash screen and screen saver support + +# syscons is the default console driver, resembling an SCO console +device sc + +# Enable this for the pcvt (VT220 compatible) console driver +#device vt +#options XSERVER # support for X server on a vt console +#options FAT_CURSOR # start with block cursor + +device agp # support several AGP chipsets + +# Floating point support - do not disable. +device npx + +# Power management support (see NOTES for more options) +#device apm +# Add suspend/resume support for the i8254. 
+device pmtimer + +# PCCARD (PCMCIA) support +# Pcmcia and cardbus bridge support +device cbb # cardbus (yenta) bridge +#device pcic # ExCA ISA and PCI bridges +device pccard # PC Card (16-bit) bus +device cardbus # CardBus (32-bit) bus + +# Serial (COM) ports +device sio # 8250, 16[45]50 based serial ports + +# Parallel port +device ppc +device ppbus # Parallel port bus (required) +device lpt # Printer +device plip # TCP/IP over parallel +device ppi # Parallel port interface device +#device vpo # Requires scbus and da + +# If you've got a "dumb" serial or parallel PCI card that is +# supported by the puc(4) glue driver, uncomment the following +# line to enable it (connects to the sio and/or ppc drivers): +#device puc + +# PCI Ethernet NICs. +device de # DEC/Intel DC21x4x (``Tulip'') +device em # Intel PRO/1000 adapter Gigabit Ethernet Card +device txp # 3Com 3cR990 (``Typhoon'') +device vx # 3Com 3c590, 3c595 (``Vortex'') + +# PCI Ethernet NICs that use the common MII bus controller code. +# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! +device miibus # MII bus support +device bfe # Broadcom BCM440x 10/100 ethernet +device bge # Broadcom BCM570xx Gigabit Ethernet +device dc # DEC/Intel 21143 and various workalikes +device fxp # Intel EtherExpress PRO/100B (82557, 82558) +device pcn # AMD Am79C97x PCI 10/100 (precedence over 'lnc') +device re # RealTek 8139C+/8169/8169S/8110S +device rl # RealTek 8129/8139 +device sf # Adaptec AIC-6915 (``Starfire'') +device sis # Silicon Integrated Systems SiS 900/SiS 7016 +device sk # SysKonnect SK-984x and SK-982x gigabit ethernet +device ste # Sundance ST201 (D-Link DFE-550TX) +device ti # Alteon Networks Tigon I/II gigabit ethernet +device tl # Texas Instruments ThunderLAN +device tx # SMC EtherPower II (83c170 ``EPIC'') +device vr # VIA Rhine, Rhine II +device wb # Winbond W89C840F +device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') + +# ISA Ethernet NICs. pccard nics included. 
+device cs # Crystal Semiconductor CS89x0 NIC +# 'device ed' requires 'device miibus' +device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards +device ex # Intel EtherExpress Pro/10 and Pro/10+ +device ep # Etherlink III based cards +device fe # Fujitsu MB8696x based cards +device ie # EtherExpress 8/16, 3C507, StarLAN 10 etc. +device lnc # NE2100, NE32-VL Lance Ethernet cards +device sn # SMC's 9000 series of ethernet chips +device xe # Xircom pccard ethernet + +# ISA devices that use the old ISA shims +#device le + +# Wireless NIC cards +device wlan # 802.11 support +device an # Aironet 4500/4800 802.11 wireless NICs. +device awi # BayStack 660 and others +device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. +#device wl # Older non 802.11 Wavelan wireless NIC. + +# Pseudo devices - the number indicates how many units to allocate. +device random # Entropy device +device loop # Network loopback +device ether # Ethernet support +device sl # Kernel SLIP +device ppp # Kernel PPP +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device gif # IPv6 and IPv4 tunneling +device faith # IPv6-to-IPv4 relaying (translation) + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! 
+device bpf # Berkeley packet filter + +# USB support +device uhci # UHCI PCI->USB interface +device ohci # OHCI PCI->USB interface +device usb # USB Bus (required) +#device udbp # USB Double Bulk Pipe devices +device ugen # Generic +device uhid # "Human Interface Devices" +device ukbd # Keyboard +device ulpt # Printer +device umass # Disks/Mass storage - Requires scbus and da +device ums # Mouse +device urio # Diamond Rio 500 MP3 player +device uscanner # Scanners +# USB Ethernet, requires mii +device aue # ADMtek USB ethernet +device axe # ASIX Electronics USB ethernet +device cue # CATC USB ethernet +device kue # Kawasaki LSI USB ethernet + +# FireWire support +device firewire # FireWire bus code +device sbp # SCSI over FireWire (Requires scbus and da) +device fwe # Ethernet over FireWire (non-standard!) diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/GENERIC.hints b/freebsd-5.3-xen-sparse/i386-xen/conf/GENERIC.hints new file mode 100644 index 0000000000..c02274871b --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/GENERIC.hints @@ -0,0 +1,93 @@ +# $FreeBSD: src/sys/i386/conf/GENERIC.hints,v 1.11 2002/12/05 22:49:47 jhb Exp $ +hint.fdc.0.at="isa" +hint.fdc.0.port="0x3F0" +hint.fdc.0.irq="6" +hint.fdc.0.drq="2" +hint.fd.0.at="fdc0" +hint.fd.0.drive="0" +hint.fd.1.at="fdc0" +hint.fd.1.drive="1" +hint.ata.0.at="isa" +hint.ata.0.port="0x1F0" +hint.ata.0.irq="14" +hint.ata.1.at="isa" +hint.ata.1.port="0x170" +hint.ata.1.irq="15" +hint.adv.0.at="isa" +hint.adv.0.disabled="1" +hint.bt.0.at="isa" +hint.bt.0.disabled="1" +hint.aha.0.at="isa" +hint.aha.0.disabled="1" +hint.aic.0.at="isa" +hint.aic.0.disabled="1" +hint.atkbdc.0.at="isa" +hint.atkbdc.0.port="0x060" +hint.atkbd.0.at="atkbdc" +hint.atkbd.0.irq="1" +hint.atkbd.0.flags="0x1" +hint.psm.0.at="atkbdc" +hint.psm.0.irq="12" +hint.vga.0.at="isa" +hint.sc.0.at="isa" +hint.sc.0.flags="0x100" +hint.vt.0.at="isa" +hint.vt.0.disabled="1" +hint.apm.0.disabled="1" +hint.apm.0.flags="0x20" +hint.pcic.0.at="isa" +# 
hint.pcic.0.irq="10" # Default to polling +hint.pcic.0.port="0x3e0" +hint.pcic.0.maddr="0xd0000" +hint.pcic.1.at="isa" +hint.pcic.1.irq="11" +hint.pcic.1.port="0x3e2" +hint.pcic.1.maddr="0xd4000" +hint.pcic.1.disabled="1" +hint.sio.0.at="isa" +hint.sio.0.port="0x3F8" +hint.sio.0.flags="0x10" +hint.sio.0.irq="4" +hint.sio.1.at="isa" +hint.sio.1.port="0x2F8" +hint.sio.1.irq="3" +hint.sio.2.at="isa" +hint.sio.2.disabled="1" +hint.sio.2.port="0x3E8" +hint.sio.2.irq="5" +hint.sio.3.at="isa" +hint.sio.3.disabled="1" +hint.sio.3.port="0x2E8" +hint.sio.3.irq="9" +hint.ppc.0.at="isa" +hint.ppc.0.irq="7" +hint.ed.0.at="isa" +hint.ed.0.disabled="1" +hint.ed.0.port="0x280" +hint.ed.0.irq="10" +hint.ed.0.maddr="0xd8000" +hint.cs.0.at="isa" +hint.cs.0.disabled="1" +hint.cs.0.port="0x300" +hint.sn.0.at="isa" +hint.sn.0.disabled="1" +hint.sn.0.port="0x300" +hint.sn.0.irq="10" +hint.ie.0.at="isa" +hint.ie.0.disabled="1" +hint.ie.0.port="0x300" +hint.ie.0.irq="10" +hint.ie.0.maddr="0xd0000" +hint.fe.0.at="isa" +hint.fe.0.disabled="1" +hint.fe.0.port="0x300" +hint.le.0.at="isa" +hint.le.0.disabled="1" +hint.le.0.port="0x300" +hint.le.0.irq="5" +hint.le.0.maddr="0xd0000" +hint.lnc.0.at="isa" +hint.lnc.0.disabled="1" +hint.lnc.0.port="0x280" +hint.lnc.0.irq="10" +hint.lnc.0.drq="0" diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/Makefile b/freebsd-5.3-xen-sparse/i386-xen/conf/Makefile new file mode 100644 index 0000000000..0284f84e82 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/Makefile @@ -0,0 +1,3 @@ +# $FreeBSD: src/sys/i386/conf/Makefile,v 1.9 2003/02/26 23:36:58 ru Exp $ + +.include "${.CURDIR}/../../conf/makeLINT.mk" diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/NOTES b/freebsd-5.3-xen-sparse/i386-xen/conf/NOTES new file mode 100644 index 0000000000..b01c607dfa --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/NOTES @@ -0,0 +1,1115 @@ +# +# NOTES -- Lines that can be cut/pasted into kernel and hints configs. 
+# +# This file contains machine dependent kernel configuration notes. For +# machine independent notes, look in /sys/conf/NOTES. +# +# $FreeBSD: src/sys/i386/conf/NOTES,v 1.1108 2003/12/04 19:57:56 phk Exp $ +# + +# +# This directive is mandatory; it defines the architecture to be +# configured for; in this case, the 386 family based IBM-PC and +# compatibles. +# +machine i386 + +# +# We want LINT to cover profiling as well +profile 2 + + +##################################################################### +# SMP OPTIONS: +# +# The apic device enables the use of the I/O APIC for interrupt delivery. +# The apic device can be used in both UP and SMP kernels, but is required +# for SMP kernels. Thus, the apic device is not strictly an SMP option, +# but it is a prerequisite for SMP. +# +# Notes: +# +# Be sure to disable 'cpu I386_CPU' for SMP kernels. +# +# By default, mixed mode is used to route IRQ0 from the AT timer via +# the 8259A master PIC through the ExtINT pin on the first I/O APIC. +# This can be disabled via the NO_MIXED_MODE option. In that case, +# IRQ0 will be routed via an intpin on the first I/O APIC. Not all +# motherboards hook IRQ0 up to the first I/O APIC even though their +# MP table or MADT may claim to do so. That is why mixed mode is +# enabled by default. +# +# HTT CPUs should only be used if they are enabled in the BIOS. For +# the ACPI case, ACPI only correctly tells us about any HTT CPUs if +# they are enabled. However, most HTT systems do not list HTT CPUs +# in the MP Table if they are enabled, thus we guess at the HTT CPUs +# for the MP Table case. However, we shouldn't try to guess and use +# these CPUs if HTTT is disabled. Thus, HTT guessing is only enabled +# for the MP Table if the user explicitly asks for it via the +# MPTABLE_FORCE_HTT option. Do NOT use this option if you have HTT +# disabled in your BIOS. 
+# + +# Mandatory: +device apic # I/O apic + +# Optional: +options MPTABLE_FORCE_HTT # Enable HTT CPUs with the MP Table +options NO_MIXED_MODE # Disable use of mixed mode + + +##################################################################### +# CPU OPTIONS + +# +# You must specify at least one CPU (the one you intend to run on); +# deleting the specification for CPUs you don't need to use may make +# parts of the system run faster. +# I386_CPU is mutually exclusive with the other CPU types. +# +#cpu I386_CPU +cpu I486_CPU +cpu I586_CPU # aka Pentium(tm) +cpu I686_CPU # aka Pentium Pro(tm) + +# +# Options for CPU features. +# +# CPU_ATHLON_SSE_HACK tries to enable SSE instructions when the BIOS has +# forgotten to enable them. +# +# CPU_BLUELIGHTNING_FPU_OP_CACHE enables FPU operand cache on IBM +# BlueLightning CPU. It works only with Cyrix FPU, and this option +# should not be used with Intel FPU. +# +# CPU_BLUELIGHTNING_3X enables triple-clock mode on IBM Blue Lightning +# CPU if CPU supports it. The default is double-clock mode on +# BlueLightning CPU box. +# +# CPU_BTB_EN enables branch target buffer on Cyrix 5x86 (NOTE 1). +# +# CPU_DIRECT_MAPPED_CACHE sets L1 cache of Cyrix 486DLC CPU in direct +# mapped mode. Default is 2-way set associative mode. +# +# CPU_CYRIX_NO_LOCK enables weak locking for the entire address space +# of Cyrix 6x86 and 6x86MX CPUs by setting the NO_LOCK bit of CCR1. +# Otherwise, the NO_LOCK bit of CCR1 is cleared. (NOTE 3) +# +# CPU_DISABLE_5X86_LSSER disables load store serialize (i.e. enables +# reorder). This option should not be used if you use memory mapped +# I/O device(s). +# +# CPU_ELAN enables support for AMDs ElanSC520 CPU. +# CPU_ELAN_XTAL sets the clock crystal frequency in Hz +# CPU_ELAN_PPS enables precision timestamp code. +# +# CPU_SOEKRIS enables support www.soekris.com hardware. +# +# CPU_ENABLE_SSE enables SSE/MMX2 instructions support. This is default +# on I686_CPU and above. 
+# CPU_DISABLE_SSE explicitly prevent I686_CPU from turning on SSE. +# +# CPU_FASTER_5X86_FPU enables faster FPU exception handler. +# +# CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products +# for i386 machines. +# +# CPU_IORT defines I/O clock delay time (NOTE 1). Default values of +# I/O clock delay time on Cyrix 5x86 and 6x86 are 0 and 7,respectively +# (no clock delay). +# +# CPU_L2_LATENCY specifed the L2 cache latency value. This option is used +# only when CPU_PPRO2CELERON is defined and Mendocino Celeron is detected. +# The default value is 5. +# +# CPU_LOOP_EN prevents flushing the prefetch buffer if the destination +# of a jump is already present in the prefetch buffer on Cyrix 5x86(NOTE +# 1). +# +# CPU_PPRO2CELERON enables L2 cache of Mendocino Celeron CPUs. This option +# is useful when you use Socket 8 to Socket 370 converter, because most Pentium +# Pro BIOSs do not enable L2 cache of Mendocino Celeron CPUs. +# +# CPU_RSTK_EN enables return stack on Cyrix 5x86 (NOTE 1). +# +# CPU_SUSP_HLT enables suspend on HALT. If this option is set, CPU +# enters suspend mode following execution of HALT instruction. +# +# CPU_UPGRADE_HW_CACHE eliminates unneeded cache flush instruction(s). +# +# CPU_WT_ALLOC enables write allocation on Cyrix 6x86/6x86MX and AMD +# K5/K6/K6-2 cpus. +# +# CYRIX_CACHE_WORKS enables CPU cache on Cyrix 486 CPUs with cache +# flush at hold state. +# +# CYRIX_CACHE_REALLY_WORKS enables (1) CPU cache on Cyrix 486 CPUs +# without cache flush at hold state, and (2) write-back CPU cache on +# Cyrix 6x86 whose revision < 2.7 (NOTE 2). +# +# NO_F00F_HACK disables the hack that prevents Pentiums (and ONLY +# Pentiums) from locking up when a LOCK CMPXCHG8B instruction is +# executed. This option is only needed if I586_CPU is also defined, +# and should be included for any non-Pentium CPU that defines it. 
+# +# NO_MEMORY_HOLE is an optimisation for systems with AMD K6 processors +# which indicates that the 15-16MB range is *definitely* not being +# occupied by an ISA memory hole. +# +# CPU_DISABLE_CMPXCHG disables the CMPXCHG instruction on > i386 IA32 +# machines. VmWare seems to emulate this instruction poorly, causing +# the guest OS to run very slowly. Enabling this with a SMP kernel +# will cause the kernel to be unusable. +# +# NOTE 1: The options, CPU_BTB_EN, CPU_LOOP_EN, CPU_IORT, +# CPU_LOOP_EN and CPU_RSTK_EN should not be used because of CPU bugs. +# These options may crash your system. +# +# NOTE 2: If CYRIX_CACHE_REALLY_WORKS is not set, CPU cache is enabled +# in write-through mode when revision < 2.7. If revision of Cyrix +# 6x86 >= 2.7, CPU cache is always enabled in write-back mode. +# +# NOTE 3: This option may cause failures for software that requires +# locked cycles in order to operate correctly. +# +options CPU_ATHLON_SSE_HACK +options CPU_BLUELIGHTNING_FPU_OP_CACHE +options CPU_BLUELIGHTNING_3X +options CPU_BTB_EN +options CPU_DIRECT_MAPPED_CACHE +options CPU_DISABLE_5X86_LSSER +options CPU_ELAN +options CPU_SOEKRIS +options CPU_ELAN_XTAL=32768000 +options CPU_ELAN_PPS +options CPU_ENABLE_SSE +#options CPU_DISABLE_SSE +options CPU_FASTER_5X86_FPU +options CPU_I486_ON_386 +options CPU_IORT +options CPU_L2_LATENCY=5 +options CPU_LOOP_EN +options CPU_PPRO2CELERON +options CPU_RSTK_EN +options CPU_SUSP_HLT +options CPU_UPGRADE_HW_CACHE +options CPU_WT_ALLOC +options CYRIX_CACHE_WORKS +options CYRIX_CACHE_REALLY_WORKS +#options NO_F00F_HACK +options CPU_DISABLE_CMPXCHG + +# Debug options +options NPX_DEBUG # enable npx debugging (FPU/math emu) + #new math emulator + +# +# PERFMON causes the driver for Pentium/Pentium Pro performance counters +# to be compiled. See perfmon(4) for more information. 
+# +options PERFMON + + +##################################################################### +# NETWORKING OPTIONS + +# +# DEVICE_POLLING adds support for mixed interrupt-polling handling +# of network device drivers, which has significant benefits in terms +# of robustness to overloads and responsivity, as well as permitting +# accurate scheduling of the CPU time between kernel network processing +# and other activities. The drawback is a moderate (up to 1/HZ seconds) +# potential increase in response times. +# It is strongly recommended to use HZ=1000 or 2000 with DEVICE_POLLING +# to achieve smoother behaviour. +# Additionally, you can enable/disable polling at runtime with the +# sysctl variable kern.polling.enable (defaults off), and select +# the CPU fraction reserved to userland with the sysctl variable +# kern.polling.user_frac (default 50, range 0..100). +# +# Only the "dc" "fxp" and "sis" devices support this mode of operation at +# the time of this writing. + +options DEVICE_POLLING + + +##################################################################### +# CLOCK OPTIONS + +# The following options are used for debugging clock behavior only, and +# should not be used for production systems. +# +# CLK_CALIBRATION_LOOP will run the clock calibration loop at startup +# until the user presses a key. + +options CLK_CALIBRATION_LOOP + +# The following two options measure the frequency of the corresponding +# clock relative to the RTC (onboard mc146818a). + +options CLK_USE_I8254_CALIBRATION +options CLK_USE_TSC_CALIBRATION + + +##################################################################### +# MISCELLANEOUS DEVICES AND OPTIONS + +device speaker #Play IBM BASIC-style noises out your speaker +hint.speaker.0.at="isa" +hint.speaker.0.port="0x61" +device gzip #Exec gzipped a.out's. REQUIRES COMPAT_AOUT! 
+device apm_saver # Requires APM
+
+
+#####################################################################
+# HARDWARE BUS CONFIGURATION
+
+#
+# ISA bus
+#
+device isa
+
+#
+# Options for `isa':
+#
+# AUTO_EOI_1 enables the `automatic EOI' feature for the master 8259A
+# interrupt controller. This saves about 0.7-1.25 usec for each interrupt.
+# This option breaks suspend/resume on some portables.
+#
+# AUTO_EOI_2 enables the `automatic EOI' feature for the slave 8259A
+# interrupt controller. This saves about 0.7-1.25 usec for each interrupt.
+# Automatic EOI is documented not to work for the slave with the
+# original i8259A, but it works for some clones and some integrated
+# versions.
+#
+# MAXMEM specifies the amount of RAM on the machine; if this is not
+# specified, FreeBSD will first read the amount of memory from the CMOS
+# RAM, so the amount of memory will initially be limited to 64MB or 16MB
+# depending on the BIOS. If the BIOS reports 64MB, a memory probe will
+# then attempt to detect the installed amount of RAM. If this probe
+# fails to detect >64MB RAM you will have to use the MAXMEM option.
+# The amount is in kilobytes, so for a machine with 128MB of RAM, it would
+# be 131072 (128 * 1024).
+#
+# BROKEN_KEYBOARD_RESET disables the use of the keyboard controller to
+# reset the CPU for reboot. This is needed on some systems with broken
+# keyboard controllers.
+
+options COMPAT_OLDISA #Use ISA shims and glue for old drivers
+options AUTO_EOI_1
+#options AUTO_EOI_2
+
+options MAXMEM=(128*1024)
+#options BROKEN_KEYBOARD_RESET
+
+#
+# EISA bus
+#
+# The EISA bus device is `eisa'. It provides auto-detection and
+# configuration support for all devices on the EISA bus.
+
+device eisa
+
+# By default, only 10 EISA slots are probed, since the slot numbers
+# above clash with the configuration address space of the PCI subsystem,
+# and the EISA probe is not very smart about this. 
This is sufficient +# for most machines, but in particular the HP NetServer LC series comes +# with an onboard AIC7770 dual-channel SCSI controller on EISA slot #11, +# thus you need to bump this figure to 12 for them. +options EISA_SLOTS=12 + +# +# MCA bus: +# +# The MCA bus device is `mca'. It provides auto-detection and +# configuration support for all devices on the MCA bus. +# No hints are required for MCA. + +device mca + +# +# PCI bus & PCI options: +# +device pci + +# +# AGP GART support +device agp + + +##################################################################### +# HARDWARE DEVICE CONFIGURATION + +# +# Mandatory devices: +# + +# To include support for VGA VESA video modes +options VESA + +# Turn on extra debugging checks and output for VESA support. +options VESA_DEBUG + +# The pcvt console driver (vt220 compatible). +device vt +hint.vt.0.at="isa" +options XSERVER # support for running an X server on vt +options FAT_CURSOR # start with block cursor +# This PCVT option is for keyboards such as those used on really old ThinkPads +options PCVT_SCANSET=2 +# Other PCVT options are documented in pcvt(4). +options PCVT_24LINESDEF +options PCVT_CTRL_ALT_DEL +options PCVT_META_ESC +options PCVT_NSCREENS=9 +options PCVT_PRETTYSCRNS +options PCVT_SCREENSAVER +options PCVT_USEKBDSEC +options PCVT_VT220KEYB +options PCVT_GREENSAVER + +# +# The Numeric Processing eXtension driver. In addition to this, you +# may configure a math emulator (see above). If your machine has a +# hardware FPU and the kernel configuration includes the npx device +# *and* a math emulator compiled into the kernel, the hardware FPU +# will be used, unless it is found to be broken or unless "flags" to +# npx0 includes "0x08", which requests preference for the emulator. +device npx +hint.npx.0.flags="0x0" +hint.npx.0.irq="13" + +# +# `flags' for npx0: +# 0x01 don't use the npx registers to optimize bcopy. +# 0x02 don't use the npx registers to optimize bzero. 
+# 0x04 don't use the npx registers to optimize copyin or copyout. +# 0x08 use emulator even if hardware FPU is available. +# The npx registers are normally used to optimize copying and zeroing when +# all of the following conditions are satisfied: +# I586_CPU is an option +# the cpu is an i586 (perhaps not a Pentium) +# the probe for npx0 succeeds +# INT 16 exception handling works. +# Then copying and zeroing using the npx registers is normally 30-100% faster. +# The flags can be used to control cases where it doesn't work or is slower. +# Setting them at boot time using userconfig works right (the optimizations +# are not used until later in the bootstrap when npx0 is attached). +# Flag 0x08 automatically disables the i586 optimized routines. +# + +# +# Optional devices: +# + +# 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create +# the /dev/3dfx0 device to work with glide implementations. This should get +# linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as +# the tdfx DRI module from XFree86 and is completely unrelated. +# +# To enable Linuxulator support, one must also include COMPAT_LINUX in the +# config as well, or you will not have the dependencies. The other option +# is to load both as modules. + +device tdfx # Enable 3Dfx Voodoo support +options TDFX_LINUX # Enable Linuxulator support + +# +# ACPI support using the Intel ACPI Component Architecture reference +# implementation. +# +# ACPI_DEBUG enables the use of the debug.acpi.level and debug.acpi.layer +# kernel environment variables to select initial debugging levels for the +# Intel ACPICA code. (Note that the Intel code must also have USE_DEBUGGER +# defined when it is built). +# +# ACPI_MAX_THREADS sets the number of task threads started. +# +# ACPI_NO_SEMAPHORES makes the AcpiOs*Semaphore routines a no-op. +# +# ACPICA_PEDANTIC enables strict checking of AML. 
Our default is to +# relax these checks to allow code generated by the Microsoft compiler +# to still execute. +# +# Note that building ACPI into the kernel is deprecated; the module is +# normally loaded automatically by the loader. +# +device acpi +options ACPI_DEBUG +options ACPI_MAX_THREADS=1 +#!options ACPI_NO_SEMAPHORES +#!options ACPICA_PEDANTIC + +# DRM options: +# mgadrm: AGP Matrox G200, G400, G450, G550 +# r128drm: ATI Rage 128 +# radeondrm: ATI Radeon up to 9000/9100 +# sisdrm: SiS 300/305,540,630 +# tdfxdrm: 3dfx Voodoo 3/4/5 and Banshee +# DRM_DEBUG: include debug printfs, very slow +# +# mga requires AGP in the kernel, and it is recommended +# for AGP r128 and radeon cards. + +device mgadrm +device "r128drm" +device radeondrm +device sisdrm +device tdfxdrm + +options DRM_DEBUG + +# M-systems DiskOnchip products see src/sys/contrib/dev/fla/README +device fla +hint.fla.0.at="isa" + +# +# mse: Logitech and ATI InPort bus mouse ports + +device mse +hint.mse.0.at="isa" +hint.mse.0.port="0x23c" +hint.mse.0.irq="5" + +# +# Network interfaces: +# + +# ar: Arnet SYNC/570i hdlc sync 2/4 port V.35/X.21 serial driver +# (requires sppp) +# ath: Atheros a/b/g WiFi adapters (requires ath_hal and wlan) +# cx: Cronyx/Sigma multiport sync/async (with Cisco or PPP framing) +# ed: Western Digital and SMC 80xx; Novell NE1000 and NE2000; 3Com 3C503 +# HP PC Lan+, various PC Card devices (refer to etc/defauls/pccard.conf) +# (requires miibus) +# el: 3Com 3C501 (slow!) +# ie: AT&T StarLAN 10 and EN100; 3Com 3C507; unknown NI5210; +# Intel EtherExpress +# le: Digital Equipment EtherWorks 2 and EtherWorks 3 (DEPCA, DE100, +# DE101, DE200, DE201, DE202, DE203, DE204, DE205, DE422) +# lnc: Lance/PCnet cards (Isolan, Novell NE2100, NE32-VL, AMD Am7990 and +# Am79C960) +# oltr: Olicom ISA token-ring adapters OC-3115, OC-3117, OC-3118 and OC-3133 +# (no hints needed). 
+# Olicom PCI token-ring adapters OC-3136, OC-3137, OC-3139, OC-3140, +# OC-3141, OC-3540, OC-3250 +# rdp: RealTek RTL 8002-based pocket ethernet adapters +# sbni: Granch SBNI12-xx ISA and PCI adapters +# sr: RISCom/N2 hdlc sync 1/2 port V.35/X.21 serial driver (requires sppp) +# wl: Lucent Wavelan (ISA card only). + +# Order for ISA/EISA devices is important here + +device ar +hint.ar.0.at="isa" +hint.ar.0.port="0x300" +hint.ar.0.irq="10" +hint.ar.0.maddr="0xd0000" +device cx +hint.cx.0.at="isa" +hint.cx.0.port="0x240" +hint.cx.0.irq="15" +hint.cx.0.drq="7" +device ed +#options ED_NO_MIIBUS # Disable ed miibus support +hint.ed.0.at="isa" +hint.ed.0.port="0x280" +hint.ed.0.irq="5" +hint.ed.0.maddr="0xd8000" +device el 1 +hint.el.0.at="isa" +hint.el.0.port="0x300" +hint.el.0.irq="9" +device ie # Hints only required for Starlan +hint.ie.2.at="isa" +hint.ie.2.port="0x300" +hint.ie.2.irq="5" +hint.ie.2.maddr="0xd0000" +device le 1 +hint.le.0.at="isa" +hint.le.0.port="0x300" +hint.le.0.irq="5" +hint.le.0.maddr="0xd0000" +device lnc +hint.lnc.0.at="isa" +hint.lnc.0.port="0x280" +hint.lnc.0.irq="10" +hint.lnc.0.drq="0" +device rdp 1 +hint.rdp.0.at="isa" +hint.rdp.0.port="0x378" +hint.rdp.0.irq="7" +hint.rdp.0.flags="2" +device sbni +hint.sbni.0.at="isa" +hint.sbni.0.port="0x210" +hint.sbni.0.irq="0xefdead" +hint.sbni.0.flags="0" +device sr +hint.sr.0.at="isa" +hint.sr.0.port="0x300" +hint.sr.0.irq="5" +hint.sr.0.maddr="0xd0000" +device oltr +hint.oltr.0.at="isa" +device wl +hint.wl.0.at="isa" +hint.wl.0.port="0x300" +options WLCACHE # enables the signal-strength cache +options WLDEBUG # enables verbose debugging output + +device ath +device ath_hal # Atheros HAL (includes binary component) +#device wlan # 802.11 layer + +# +# ATA raid adapters +# +device pst + +# +# SCSI host adapters: +# +# ncv: NCR 53C500 based SCSI host adapters. +# nsp: Workbit Ninja SCSI-3 based PC Card SCSI host adapters. +# stg: TMC 18C30, 18C50 based SCSI host adapters. 
+
+device ncv
+device nsp
+device stg
+hint.stg.0.at="isa"
+hint.stg.0.port="0x140"
+hint.stg.0.irq="11"
+
+#
+# Adaptec FSA RAID controllers, including integrated DELL controllers,
+# the Dell PERC 2/QC and the HP NetRAID-4M
+device aac
+device aacp # SCSI Passthrough interface (optional, CAM required)
+
+#
+# IBM (now Adaptec) ServeRAID controllers
+device ips
+
+#
+# SafeNet crypto driver: can be moved to the MI NOTES as soon as
+# it's tested on a big-endian machine
+#
+device safe # SafeNet 1141
+options SAFE_DEBUG # enable debugging support: hw.safe.debug
+options SAFE_RNDTEST # enable rndtest support
+
+#####################################################################
+
+#
+# Miscellaneous hardware:
+#
+# wt: Wangtek and Archive QIC-02/QIC-36 tape drives
+# ctx: Cortex-I frame grabber
+# apm: Laptop Advanced Power Management (experimental)
+# pmtimer: Timer device driver for power management events (APM or ACPI)
+# spigot: The Creative Labs Video Spigot video-acquisition board
+# dgb: Digiboard PC/Xi and PC/Xe series driver (ALPHA QUALITY!)
+# digi: Digiboard driver
+# gp: National Instruments AT-GPIB and AT-GPIB/TNT board, PCMCIA-GPIB
+# asc: GI1904-based hand scanners, e.g. the Trust Amiscan Grey
+# gsc: Genius GS-4500 hand scanner.
+# spic: Sony Programmable I/O controller (VAIO notebooks)
+# stl: Stallion EasyIO and EasyConnection 8/32 (cd1400 based)
+# stli: Stallion EasyConnection 8/64, ONboard, Brumby (intelligent)
+
+# Notes on APM
+# The flags takes the following meaning for apm0:
+# 0x0020 Statclock is broken.
+# If apm is omitted, some systems require sysctl kern.timecounter.method=1
+# for correct timekeeping.
+
+# Notes on the spigot:
+# The video spigot is at 0xad6. This port address can not be changed.
+# The irq values may only be 10, 11, or 15
+# I/O memory is an 8kb region. Possible values are:
+# 0a0000, 0a2000, ..., 0fffff, f00000, f02000, ..., ffffff
+# The start address must be on an even boundary. 
+# Add the following option if you want to allow non-root users to be able +# to access the spigot. This option is not secure because it allows users +# direct access to the I/O page. +# options SPIGOT_UNSECURE + +# Notes on the Specialix SI/XIO driver: +# The host card is memory, not IO mapped. +# The Rev 1 host cards use a 64K chunk, on a 32K boundary. +# The Rev 2 host cards use a 32K chunk, on a 32K boundary. +# The cards can use an IRQ of 11, 12 or 15. + +# Notes on the Sony Programmable I/O controller +# This is a temporary driver that should someday be replaced by something +# that hooks into the ACPI layer. The device is hooked to the PIIX4's +# General Device 10 decoder, which means you have to fiddle with PCI +# registers to map it in, even though it is otherwise treated here as +# an ISA device. At the moment, the driver polls, although the device +# is capable of generating interrupts. It largely undocumented. +# The port location in the hint is where you WANT the device to be +# mapped. 0x10a0 seems to be traditional. At the moment the jogdial +# is the only thing truly supported, but aparently a fair percentage +# of the Vaio extra features are controlled by this device. + +# Notes on the Stallion stl and stli drivers: +# See src/i386/isa/README.stl for complete instructions. +# This is version 0.0.5alpha, unsupported by Stallion. +# The stl driver has a secondary IO port hard coded at 0x280. You need +# to change src/i386/isa/stallion.c if you reconfigure this on the boards. 
+# The "flags" and "msize" settings on the stli driver depend on the board: +# EasyConnection 8/64 ISA: flags 23 msize 0x1000 +# EasyConnection 8/64 EISA: flags 24 msize 0x10000 +# EasyConnection 8/64 MCA: flags 25 msize 0x1000 +# ONboard ISA: flags 4 msize 0x10000 +# ONboard EISA: flags 7 msize 0x10000 +# ONboard MCA: flags 3 msize 0x10000 +# Brumby: flags 2 msize 0x4000 +# Stallion: flags 1 msize 0x10000 + +# Notes on the Digiboard PC/Xi and PC/Xe series driver +# +# The NDGBPORTS option specifies the number of ports controlled by the +# dgb(4) driver. The default value is 16 ports per device. +# +# The following flag values have special meanings in dgb: +# 0x01 - alternate layout of pins +# 0x02 - use the windowed PC/Xe in 64K mode + +device wt 1 +hint.wt.0.at="isa" +hint.wt.0.port="0x300" +hint.wt.0.irq="5" +hint.wt.0.drq="1" +device ctx +hint.ctx.0.at="isa" +hint.ctx.0.port="0x230" +hint.ctx.0.maddr="0xd0000" +device spigot 1 +hint.spigot.0.at="isa" +hint.spigot.0.port="0xad6" +hint.spigot.0.irq="15" +hint.spigot.0.maddr="0xee000" +device apm +hint.apm.0.flags="0x20" +device pmtimer # Adjust system timer at wakeup time +device gp +hint.gp.0.at="isa" +hint.gp.0.port="0x2c0" +device gsc 1 +hint.gsc.0.at="isa" +hint.gsc.0.port="0x270" +hint.gsc.0.drq="3" +device dgb 1 +options NDGBPORTS=17 +hint.dgb.0.at="isa" +hint.dgb.0.port="0x220" +hint.dgb.0.maddr="0xfc000" +device digi +hint.digi.0.at="isa" +hint.digi.0.port="0x104" +hint.digi.0.maddr="0xd0000" +# BIOS & FEP/OS components of device digi. 
+device digi_CX +device digi_CX_PCI +device digi_EPCX +device digi_EPCX_PCI +device digi_Xe +device digi_Xem +device digi_Xr +device asc 1 +hint.asc.0.at="isa" +hint.asc.0.port="0x3EB" +hint.asc.0.drq="3" +hint.asc.0.irq="10" +device spic +hint.spic.0.at="isa" +hint.spic.0.port="0x10a0" +device stl +hint.stl.0.at="isa" +hint.stl.0.port="0x2a0" +hint.stl.0.irq="10" +device stli +hint.stli.0.at="isa" +hint.stli.0.port="0x2a0" +hint.stli.0.maddr="0xcc000" +hint.stli.0.flags="23" +hint.stli.0.msize="0x1000" +# You are unlikely to have the hardware for loran <phk@FreeBSD.org> +device loran +hint.loran.0.at="isa" +hint.loran.0.irq="5" +# HOT1 Xilinx 6200 card (http://www.vcc.com/) +device xrpu + +# +# Laptop/Notebook options: +# +# See also: +# apm under `Miscellaneous hardware' +# above. + +# For older notebooks that signal a powerfail condition (external +# power supply dropped, or battery state low) by issuing an NMI: + +options POWERFAIL_NMI # make it beep instead of panicing + +# +# I2C Bus +# +# Philips i2c bus support is provided by the `iicbus' device. +# +# Supported interfaces: +# pcf Philips PCF8584 ISA-bus controller +# +device pcf +hint.pcf.0.at="isa" +hint.pcf.0.port="0x320" +hint.pcf.0.irq="5" + +#--------------------------------------------------------------------------- +# ISDN4BSD +# +# See /usr/share/examples/isdn/ROADMAP for an introduction to isdn4bsd. 
+# +# i4b passive ISDN cards support contains the following hardware drivers: +# +# isic - Siemens/Infineon ISDN ISAC/HSCX/IPAC chipset driver +# iwic - Winbond W6692 PCI bus ISDN S/T interface controller +# ifpi - AVM Fritz!Card PCI driver +# ifpi2 - AVM Fritz!Card PCI version 2 driver +# ihfc - Cologne Chip HFC ISA/ISA-PnP chipset driver +# ifpnp - AVM Fritz!Card PnP driver +# itjc - Siemens ISAC / TJNet Tiger300/320 chipset +# +# i4b active ISDN cards support contains the following hardware drivers: +# +# iavc - AVM B1 PCI, AVM B1 ISA, AVM T1 +# +# Note that the ``options'' (if given) and ``device'' lines must BOTH +# be uncommented to enable support for a given card ! +# +# In addition to a hardware driver (and probably an option) the mandatory +# ISDN protocol stack devices and the mandatory support device must be +# enabled as well as one or more devices from the optional devices section. +# +#--------------------------------------------------------------------------- +# isic driver (Siemens/Infineon chipsets) +# +device isic +# +# ISA bus non-PnP Cards: +# ---------------------- +# +# Teles S0/8 or Niccy 1008 +options TEL_S0_8 +hint.isic.0.at="isa" +hint.isic.0.maddr="0xd0000" +hint.isic.0.irq="5" +hint.isic.0.flags="1" +# +# Teles S0/16 or Creatix ISDN-S0 or Niccy 1016 +options TEL_S0_16 +hint.isic.0.at="isa" +hint.isic.0.port="0xd80" +hint.isic.0.maddr="0xd0000" +hint.isic.0.irq="5" +hint.isic.0.flags="2" +# +# Teles S0/16.3 +options TEL_S0_16_3 +hint.isic.0.at="isa" +hint.isic.0.port="0xd80" +hint.isic.0.irq="5" +hint.isic.0.flags="3" +# +# AVM A1 or AVM Fritz!Card +options AVM_A1 +hint.isic.0.at="isa" +hint.isic.0.port="0x340" +hint.isic.0.irq="5" +hint.isic.0.flags="4" +# +# USRobotics Sportster ISDN TA intern +options USR_STI +hint.isic.0.at="isa" +hint.isic.0.port="0x268" +hint.isic.0.irq="5" +hint.isic.0.flags="7" +# +# ITK ix1 Micro ( < V.3, non-PnP version ) +options ITKIX1 +hint.isic.0.at="isa" +hint.isic.0.port="0x398" +hint.isic.0.irq="10" 
+hint.isic.0.flags="18" +# +# ELSA PCC-16 +options ELSA_PCC16 +hint.isic.0.at="isa" +hint.isic.0.port="0x360" +hint.isic.0.irq="10" +hint.isic.0.flags="20" +# +# ISA bus PnP Cards: +# ------------------ +# +# Teles S0/16.3 PnP +options TEL_S0_16_3_P +# +# Creatix ISDN-S0 P&P +options CRTX_S0_P +# +# Dr. Neuhaus Niccy Go@ +options DRN_NGO +# +# Sedlbauer Win Speed +options SEDLBAUER +# +# Dynalink IS64PH +options DYNALINK +# +# ELSA QuickStep 1000pro ISA +options ELSA_QS1ISA +# +# Siemens I-Surf 2.0 +options SIEMENS_ISURF2 +# +# Asuscom ISDNlink 128K ISA +options ASUSCOM_IPAC +# +# Eicon Diehl DIVA 2.0 and 2.02 +options EICON_DIVA +# +# Compaq Microcom 610 ISDN card (Compaq series PSB2222I) +options COMPAQ_M610 +# +# PCI bus Cards: +# -------------- +# +# Cyclades Cyclom-Y PCI serial driver +device cy 1 +options CY_PCI_FASTINTR # Use with cy_pci unless irq is shared +hint.cy.0.at="isa" +hint.cy.0.irq="10" +hint.cy.0.maddr="0xd4000" +hint.cy.0.msize="0x2000" +# +#--------------------------------------------------------------------------- +# ELSA MicroLink ISDN/PCI (same as ELSA QuickStep 1000pro PCI) +options ELSA_QS1PCI +# +# +#--------------------------------------------------------------------------- +# ifpnp driver for AVM Fritz!Card PnP +# +# AVM Fritz!Card PnP +device ifpnp +# +#--------------------------------------------------------------------------- +# ihfc driver for Cologne Chip ISA chipsets (experimental!) 
+# +# Teles 16.3c ISA PnP +# AcerISDN P10 ISA PnP +# TELEINT ISDN SPEED No.1 +device ihfc +# +#--------------------------------------------------------------------------- +# ifpi driver for AVM Fritz!Card PCI +# +# AVM Fritz!Card PCI +device ifpi +# +#--------------------------------------------------------------------------- +# ifpi2 driver for AVM Fritz!Card PCI version 2 +# +# AVM Fritz!Card PCI version 2 +device "ifpi2" +# +#--------------------------------------------------------------------------- +# iwic driver for Winbond W6692 chipset +# +# ASUSCOM P-IN100-ST-D (and other Winbond W6692 based cards) +device iwic +# +#--------------------------------------------------------------------------- +# itjc driver for Simens ISAC / TJNet Tiger300/320 chipset +# +# Traverse Technologies NETjet-S +# Teles PCI-TJ +device itjc +# +#--------------------------------------------------------------------------- +# iavc driver (AVM active cards, needs i4bcapi driver!) +# +device iavc +# +# AVM B1 ISA bus (PnP mode not supported!) 
+# ---------------------------------------- +hint.iavc.0.at="isa" +hint.iavc.0.port="0x150" +hint.iavc.0.irq="5" +# +#--------------------------------------------------------------------------- +# ISDN Protocol Stack - mandatory for all hardware drivers +# +# Q.921 / layer 2 - i4b passive cards D channel handling +device "i4bq921" +# +# Q.931 / layer 3 - i4b passive cards D channel handling +device "i4bq931" +# +# layer 4 - i4b common passive and active card handling +device "i4b" +# +#--------------------------------------------------------------------------- +# ISDN devices - mandatory for all hardware drivers +# +# userland driver to do ISDN tracing (for passive cards only) +device "i4btrc" 4 +# +# userland driver to control the whole thing +device "i4bctl" +# +#--------------------------------------------------------------------------- +# ISDN devices - optional +# +# userland driver for access to raw B channel +device "i4brbch" 4 +# +# userland driver for telephony +device "i4btel" 2 +# +# network driver for IP over raw HDLC ISDN +device "i4bipr" 4 +# enable VJ header compression detection for ipr i/f +options IPR_VJ +# enable logging of the first n IP packets to isdnd (n=32 here) +options IPR_LOG=32 +# +# network driver for sync PPP over ISDN; requires an equivalent +# number of sppp device to be configured +device "i4bisppp" 4 +# +# B-channel interface to the netgraph subsystem +device "i4bing" 2 +# +# CAPI driver needed for active ISDN cards (see iavc driver above) +device "i4bcapi" +# +#--------------------------------------------------------------------------- + +# +# Set the number of PV entries per process. Increasing this can +# stop panics related to heavy use of shared memory. However, that can +# (combined with large amounts of physical memory) cause panics at +# boot time due the kernel running out of VM space. 
+# +# If you're tweaking this, you might also want to increase the sysctls +# "vm.v_free_min", "vm.v_free_reserved", and "vm.v_free_target". +# +# The value below is the one more than the default. +# +options PMAP_SHPGPERPROC=201 + +# +# Change the size of the kernel virtual address space. Due to +# constraints in loader(8) on i386, this must be a multiple of 4. +# 256 = 1 GB of kernel address space. Increasing this also causes +# a reduction of the address space in user processes. 512 splits +# the 4GB cpu address space in half (2GB user, 2GB kernel). +# +options KVA_PAGES=260 + + +##################################################################### +# ABI Emulation + +# Enable iBCS2 runtime support for SCO and ISC binaries +options IBCS2 + +# Emulate spx device for client side of SVR3 local X interface +options SPX_HACK + +# Enable Linux ABI emulation +options COMPAT_LINUX + +# Enable i386 a.out binary support +options COMPAT_AOUT + +# Enable the linux-like proc filesystem support (requires COMPAT_LINUX +# and PSEUDOFS) +options LINPROCFS + +# +# SysVR4 ABI emulation +# +# The svr4 ABI emulator can be statically compiled into the kernel or loaded as +# a KLD module. +# The STREAMS network emulation code can also be compiled statically or as a +# module. If loaded as a module, it must be loaded before the svr4 module +# (the /usr/sbin/svr4 script does this for you). If compiling statically, +# the `streams' device must be configured into any kernel which also +# specifies COMPAT_SVR4. It is possible to have a statically-configured +# STREAMS device and a dynamically loadable svr4 emulator; the /usr/sbin/svr4 +# script understands that it doesn't need to load the `streams' module under +# those circumstances. +# Caveat: At this time, `options KTRACE' is required for the svr4 emulator +# (whether static or dynamic). 
+# +options COMPAT_SVR4 # build emulator statically +options DEBUG_SVR4 # enable verbose debugging +device streams # STREAMS network driver (required for svr4). + + +##################################################################### +# VM OPTIONS + +# Disable the 4 MByte page PSE CPU feature. The PSE feature allows the +# kernel to use a 4 MByte pages to map the kernel instead of 4k pages. +# This saves on the amount of memory needed for page tables needed to +# map the kernel. You should only disable this feature as a temporary +# workaround if you are having problems with it enabled. +# +#options DISABLE_PSE + +# Disable the global pages PGE CPU feature. The PGE feature allows pages +# to be marked with the PG_G bit. TLB entries for these pages are not +# flushed from the cache when %cr3 is reloaded. This can make context +# switches less expensive. You should only disable this feature as a +# temporary workaround if you are having problems with it enabled. +# +#options DISABLE_PG_G + +# KSTACK_PAGES is the number of memory pages to assign to the kernel +# stack of each thread. + +options KSTACK_PAGES=3 + +##################################################################### + +# More undocumented options for linting. +# Note that documenting these are not considered an affront. 
+ +options FB_INSTALL_CDEV # install a CDEV entry in /dev + +# PECOFF module (Win32 Execution Format) +options PECOFF_SUPPORT +options PECOFF_DEBUG + +options ENABLE_ALART +options I4B_SMP_WORKAROUND +options I586_PMC_GUPROF=0x70000 +options KBDIO_DEBUG=2 +options KBD_MAXRETRY=4 +options KBD_MAXWAIT=6 +options KBD_RESETDELAY=201 + +options PSM_DEBUG=1 + +options TIMER_FREQ=((14318182+6)/12) + +options VM_KMEM_SIZE +options VM_KMEM_SIZE_MAX +options VM_KMEM_SIZE_SCALE diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/OLDCARD b/freebsd-5.3-xen-sparse/i386-xen/conf/OLDCARD new file mode 100644 index 0000000000..2d13fbe2b5 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/OLDCARD @@ -0,0 +1,17 @@ +# +# OLDCARD -- Generic kernel configuration file for FreeBSD/i386 +# using the OLDCARD pccard system. +# +# $FreeBSD: src/sys/i386/conf/OLDCARD,v 1.18 2003/02/15 02:39:13 ru Exp $ + +include GENERIC + +ident OLDCARD + +# PCCARD (PCMCIA) support +nodevice cbb # cardbus (yenta) bridge +#nodevice pcic # ExCA ISA and PCI bridges +nodevice pccard # PC Card (16-bit) bus +nodevice cardbus # CardBus (32-bit) bus +device card 1 # pccard bus +device pcic # PCMCIA bridge diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/PAE b/freebsd-5.3-xen-sparse/i386-xen/conf/PAE new file mode 100644 index 0000000000..98d4f2c252 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/PAE @@ -0,0 +1,99 @@ +# +# PAE -- Generic kernel configuration file for FreeBSD/i386 PAE +# +# $FreeBSD: src/sys/i386/conf/PAE,v 1.8 2003/11/03 22:49:19 jhb Exp $ + +include GENERIC + +ident PAE-GENERIC + +# To make a PAE kernel, the next option is needed +options PAE # Physical Address Extensions Kernel + +# Compile acpi in statically since the module isn't built properly. Most +# machines which support large amounts of memory require acpi. +device acpi + +# Don't build modules with this kernel config, since they are not built with +# the correct options headers. 
+makeoptions NO_MODULES=yes + +# What follows is a list of drivers that are normally in GENERIC, but either +# don't work or are untested with PAE. Be very careful before enabling any +# of these drivers. Drivers which use DMA and don't handle 64 bit physical +# address properly may cause data corruption when used in a machine with more +# than 4 gigabytes of memory. + +nodevice ahb +nodevice amd +nodevice isp +nodevice sym +nodevice trm + +nodevice adv +nodevice adw +nodevice aha +nodevice aic +nodevice bt + +nodevice ncv +nodevice nsp +nodevice stg + +nodevice asr +nodevice dpt +nodevice iir +nodevice mly + +nodevice amr +nodevice ida +nodevice mlx +nodevice pst + +nodevice agp + +nodevice de +nodevice txp +nodevice vx + +nodevice dc +nodevice pcn +nodevice rl +nodevice sf +nodevice sis +nodevice ste +nodevice tl +nodevice tx +nodevice vr +nodevice wb + +nodevice cs +nodevice ed +nodevice ex +nodevice ep +nodevice fe +nodevice ie +nodevice lnc +nodevice sn +nodevice xe + +nodevice wlan +nodevice an +nodevice awi +nodevice wi + +nodevice uhci +nodevice ohci +nodevice usb +nodevice ugen +nodevice uhid +nodevice ukbd +nodevice ulpt +nodevice umass +nodevice ums +nodevice urio +nodevice uscanner +nodevice aue +nodevice axe +nodevice cue +nodevice kue diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/XENCONF b/freebsd-5.3-xen-sparse/i386-xen/conf/XENCONF new file mode 100644 index 0000000000..4214b1c59b --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/XENCONF @@ -0,0 +1,137 @@ +# +# GENERIC -- Generic kernel configuration file for FreeBSD/i386 +# +# For more information on this file, please read the handbook section on +# Kernel Configuration Files: +# +# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html +# +# The handbook is also available locally in /usr/share/doc/handbook +# if you've installed the doc distribution, otherwise always see the +# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the +# latest 
information. +# +# An exhaustive list of options and more detailed explanations of the +# device lines is also present in the ../../conf/NOTES and NOTES files. +# If you are in doubt as to the purpose or necessity of a line, check first +# in NOTES. +# +# $FreeBSD: src/sys/i386/conf/GENERIC,v 1.394.2.3 2004/01/26 19:42:11 nectar Exp $ + +machine i386-xen +cpu I686_CPU +ident XEN + +#To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" #Default places to look for devices. + +makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols + +options SCHED_4BSD #4BSD scheduler +options INET #InterNETworking +options INET6 #IPv6 communications protocols +options FFS #Berkeley Fast Filesystem +options SOFTUPDATES #Enable FFS soft updates support +options UFS_ACL #Support for access control lists +options UFS_DIRHASH #Improve performance on big directories +options MD_ROOT #MD is a potential root device +options NFSCLIENT #Network Filesystem Client +options NFSSERVER #Network Filesystem Server +# options NFS_ROOT #NFS usable as /, requires NFSCLIENT +#options MSDOSFS #MSDOS Filesystem +#options CD9660 #ISO 9660 Filesystem +options PROCFS #Process filesystem (requires PSEUDOFS) +options PSEUDOFS #Pseudo-filesystem framework +options COMPAT_43 #Compatible with BSD 4.3 [KEEP THIS!] 
+options COMPAT_FREEBSD4 #Compatible with FreeBSD4 +options SCSI_DELAY=15000 #Delay (in ms) before probing SCSI +options KTRACE #ktrace(1) support +options SYSVSHM #SYSV-style shared memory +options SYSVMSG #SYSV-style message queues +options SYSVSEM #SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING #Posix P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options CPU_DISABLE_SSE # don't turn on SSE framework with Xen +#options PFIL_HOOKS # pfil(9) framework + +# Debugging for use in -current +options KDB #Enable the kernel debugger +options INVARIANTS #Enable calls of extra sanity checking +options INVARIANT_SUPPORT #Extra sanity checks of internal structures, required by INVARIANTS +#options WITNESS #Enable checks to detect deadlocks and cycles +#options WITNESS_SKIPSPIN #Don't run witness on spinlocks for speed + +# To make an SMP kernel, the next two are needed +#options SMP # Symmetric MultiProcessor Kernel +#device apic # I/O APIC + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +#device ch # SCSI media changers +device da # Direct Access (disks) +#device sa # Sequential Access (tape etc) +#device cd # CD +device pass # Passthrough device (direct SCSI access) +#device ses # SCSI Environmental Services (and SAF-TE) + +# atkbdc0 controls both the keyboard and the PS/2 mouse +#device atkbdc # AT keyboard controller +#device atkbd # AT keyboard +#device psm # PS/2 mouse + +# device vga # VGA video card driver + +#device splash # Splash screen and screen saver support + +# syscons is the default console driver, resembling an SCO console +#device sc + +# Enable this for the pcvt (VT220 compatible) console driver +#device vt +#options XSERVER # support for X server on a vt console +#options FAT_CURSOR # start with block cursor + +#device agp # support several AGP chipsets + +# Floating point support - do not disable. 
+device npx + +# Serial (COM) ports +#device sio # 8250, 16[45]50 based serial ports + +# Parallel port +#device ppc +#device ppbus # Parallel port bus (required) +#device lpt # Printer +#device plip # TCP/IP over parallel +#device ppi # Parallel port interface device +#device vpo # Requires scbus and da + +# If you've got a "dumb" serial or parallel PCI card that is +# supported by the puc(4) glue driver, uncomment the following +# line to enable it (connects to the sio and/or ppc drivers): +#device puc + + +# Pseudo devices - the number indicates how many units to allocate. +device random # Entropy device +device loop # Network loopback +device ether # Ethernet support +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device gif # IPv6 and IPv4 tunneling +device faith # IPv6-to-IPv4 relaying (translation) + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +device bpf # Berkeley packet filter + +#options BOOTP +options XEN +options MCLSHIFT=12 # this has to be enabled for Xen as we can only have one cluster per page +options MSIZE=256 +options DIAGNOSTIC +options MAXMEM=(256*1024) +options NOXENDEBUG=1 # Turn off Debugging printfs + diff --git a/freebsd-5.3-xen-sparse/i386-xen/conf/gethints.awk b/freebsd-5.3-xen-sparse/i386-xen/conf/gethints.awk new file mode 100644 index 0000000000..e8cc6b67de --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/conf/gethints.awk @@ -0,0 +1,116 @@ +#! /usr/bin/awk -f +# +# This is a transition aid. It extracts old-style configuration information +# from a config file and writes an equivalent device.hints file to stdout. +# You can use that with loader(8) or statically compile it in with the +# 'hints' directive. See how GENERIC and GENERIC.hints fit together for +# a static example. You should use loader(8) if at all possible. 
+# +# $FreeBSD: src/sys/i386/conf/gethints.awk,v 1.2 2002/07/26 03:52:30 peter Exp $ + +# skip commented lines, empty lines and not "device" lines +/^[ \t]*#/ || /^[ \t]*$/ || !/[ \t]*device/ { next; } + +# input format : +# device <name><unit> at <controler>[?] [key [val]]... +# possible keys are : +# disable, port #, irq #, drq #, drive #, iomem #, iosiz #, +# flags #, bus #, target #, unit #. +# output format : +# hint.<name>.<unit>.<key>=<val> +# mapped keys are : +# iomem -> maddr, iosiz -> msize. +{ + gsub ("#.*", ""); # delete comments + gsub ("\"", ""); # and double-quotes + nameunit = $2; # <name><unit> + at = $3; # at + controler = $4; # <controler>[?] + rest = 5; # optional keys begin at indice 5 + if (at != "at" || controler == "") + next; # skip devices w/o controlers + name = nameunit; + sub ("[0-9]*$", "", name); # get the name + unit = nameunit; + sub ("^" name, "", unit); # and the unit + sub ("\?$", "", controler); + printf "hint.%s.%s.at=\"%s\"\n", name, unit, controler; + # for each keys, if any ? + for (key = $rest; rest <= NF; key = $(++rest)) { + # skip auto-detect keys (the one w/ a ?) + if (key == "port?" || key == "drq?" || key == "irq?" || \ + key == "iomem?" 
|| key == "iosiz?") + continue; + # disable has no value, so, give it one + if (key == "disable") { + printf "hint.%s.%s.disabled=\"1\"\n", name, unit; + continue; + } + # recognized keys + if (key == "port" || key == "irq" || key == "drq" || \ + key == "drive" || key == "iomem" || key == "iosiz" || \ + key == "flags" || key == "bus" || key == "target" || \ + key == "unit") { + val = $(++rest); + if (val == "?") # has above + continue; + if (key == "port") { + # map port macros to static values + sub ("IO_AHA0", "0x330", val); + sub ("IO_AHA1", "0x334", val); + sub ("IO_ASC1", "0x3EB", val); + sub ("IO_ASC2", "0x22B", val); + sub ("IO_ASC3", "0x26B", val); + sub ("IO_ASC4", "0x2AB", val); + sub ("IO_ASC5", "0x2EB", val); + sub ("IO_ASC6", "0x32B", val); + sub ("IO_ASC7", "0x36B", val); + sub ("IO_ASC8", "0x3AB", val); + sub ("IO_BT0", "0x330", val); + sub ("IO_BT1", "0x334", val); + sub ("IO_CGA", "0x3D0", val); + sub ("IO_COM1", "0x3F8", val); + sub ("IO_COM2", "0x2F8", val); + sub ("IO_COM3", "0x3E8", val); + sub ("IO_COM4", "0x2E8", val); + sub ("IO_DMA1", "0x000", val); + sub ("IO_DMA2", "0x0C0", val); + sub ("IO_DMAPG", "0x080", val); + sub ("IO_FD1", "0x3F0", val); + sub ("IO_FD2", "0x370", val); + sub ("IO_GAME", "0x201", val); + sub ("IO_GSC1", "0x270", val); + sub ("IO_GSC2", "0x2E0", val); + sub ("IO_GSC3", "0x370", val); + sub ("IO_GSC4", "0x3E0", val); + sub ("IO_ICU1", "0x020", val); + sub ("IO_ICU2", "0x0A0", val); + sub ("IO_KBD", "0x060", val); + sub ("IO_LPT1", "0x378", val); + sub ("IO_LPT2", "0x278", val); + sub ("IO_LPT3", "0x3BC", val); + sub ("IO_MDA", "0x3B0", val); + sub ("IO_NMI", "0x070", val); + sub ("IO_NPX", "0x0F0", val); + sub ("IO_PMP1", "0x026", val); + sub ("IO_PMP2", "0x178", val); + sub ("IO_PPI", "0x061", val); + sub ("IO_RTC", "0x070", val); + sub ("IO_TIMER1", "0x040", val); + sub ("IO_TIMER2", "0x048", val); + sub ("IO_UHA0", "0x330", val); + sub ("IO_VGA", "0x3C0", val); + sub ("IO_WD1", "0x1F0", val); + sub ("IO_WD2", 
"0x170", val); + } else { + # map key names + sub ("iomem", "maddr", key); + sub ("iosiz", "msize", key); + } + printf "hint.%s.%s.%s=\"%s\"\n", name, unit, key, val; + continue; + } + printf ("unrecognized config token '%s:%s' on line %s\n", + rest, key, NR); # > "/dev/stderr"; + } +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/clock.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/clock.c new file mode 100644 index 0000000000..393e091986 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/clock.c @@ -0,0 +1,511 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz and Don Ahn. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/isa/clock.c,v 1.207 2003/11/13 10:02:12 phk Exp $"); + +/* #define DELAYDEBUG */ +/* + * Routines to handle clock hardware. + */ + +/* + * inittodr, settodr and support routines written + * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at> + * + * reintroduced and updated by Chris Stenton <chris@gnome.co.uk> 8/10/94 + */ + +#include "opt_clock.h" +#include "opt_isa.h" +#include "opt_mca.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/timetc.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/sysctl.h> +#include <sys/cons.h> +#include <sys/power.h> + +#include <machine/clock.h> +#include <machine/cputypes.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#if defined(SMP) +#include <machine/smp.h> +#endif +#include <machine/specialreg.h> + +#include <i386/isa/icu.h> +#include <i386/isa/isa.h> +#include <isa/rtc.h> +#include <i386/isa/timerreg.h> + +/* XEN specific defines */ +#include <machine/xen_intr.h> + +/* + * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we + * can use a simple formula for leap years. + */ +#define LEAPYEAR(y) (((u_int)(y) % 4 == 0) ? 
1 : 0) +#define DAYSPERYEAR (31+28+31+30+31+30+31+31+30+31+30+31) + +int adjkerntz; /* local offset from GMT in seconds */ +int clkintr_pending; +int disable_rtc_set = 1; /* disable resettodr() if != 0 */ +int pscnt = 1; +int psdiv = 1; +int statclock_disable; +#ifndef TIMER_FREQ +#define TIMER_FREQ 1193182 +#endif +u_int timer_freq = TIMER_FREQ; + +static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; + +/* Values for timerX_state: */ +#define RELEASED 0 +#define RELEASE_PENDING 1 +#define ACQUIRED 2 +#define ACQUIRE_PENDING 3 + +/* Cached *multiplier* to convert TSC counts to microseconds. + * (see the equation below). + * Equal to 2^32 * (1 / (clocks per usec) ). + * Initialized in time_init. + */ +static unsigned long fast_gettimeoffset_quotient; + +/* These are peridically updated in shared_info, and then copied here. */ +static uint32_t shadow_tsc_stamp; +static uint64_t shadow_system_time; +static uint32_t shadow_time_version; +static struct timeval shadow_tv; + +static uint64_t processed_system_time;/* System time (ns) at last processing. */ + +#define NS_PER_TICK (1000000000ULL/hz) + +/* convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_mhz * 10^6)) + * ns = cycles * (10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
+ */ +static unsigned long cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called with the xtime_lock held for writing. + */ +static void __get_time_values_from_xen(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + do { + shadow_time_version = s->time_version2; + rmb(); + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_usec = s->wc_usec; + shadow_tsc_stamp = (uint32_t)s->tsc_timestamp; + shadow_system_time = s->system_time; + rmb(); + } + while (shadow_time_version != s->time_version1); +} + +#define TIME_VALUES_UP_TO_DATE \ + (shadow_time_version == HYPERVISOR_shared_info->time_version2) + +static void (*timer_func)(struct clockframe *frame) = hardclock; + +static unsigned xen_get_offset(void); +static unsigned xen_get_timecount(struct timecounter *tc); + +static struct timecounter xen_timecounter = { + xen_get_timecount, /* get_timecount */ + 0, /* no poll_pps */ + ~0u, /* counter_mask */ + 0, /* frequency */ + "ixen", /* name */ + 0 /* quality */ +}; + + +static void +clkintr(struct clockframe *frame) +{ + int64_t delta; + long ticks = 0; + + + do { + __get_time_values_from_xen(); + delta = (int64_t)(shadow_system_time + + xen_get_offset() * 1000 - + processed_system_time); + } while (!TIME_VALUES_UP_TO_DATE); + + if (unlikely(delta < 0)) { + printk("Timer ISR: Time went backwards: %lld\n", delta); + return; + } + + /* Process elapsed ticks since last call. 
*/ + while ( delta >= NS_PER_TICK ) + { + ticks++; + delta -= NS_PER_TICK; + processed_system_time += NS_PER_TICK; + } + + if (ticks > 0) { + if (frame) + timer_func(frame); +#ifdef SMP + if (timer_func == hardclock && frame) + forward_hardclock(); +#endif + } +} + +#include "opt_ddb.h" +static uint32_t +getit(void) +{ + __get_time_values_from_xen(); + return shadow_tsc_stamp; +} + +/* + * Wait "n" microseconds. + * Relies on timer 1 counting down from (timer_freq / hz) + * Note: timer had better have been programmed before this is first used! + */ +void +DELAY(int n) +{ + int delta, ticks_left; + uint32_t tick, prev_tick; +#ifdef DELAYDEBUG + int getit_calls = 1; + int n1; + static int state = 0; + + if (state == 0) { + state = 1; + for (n1 = 1; n1 <= 10000000; n1 *= 10) + DELAY(n1); + state = 2; + } + if (state == 1) + printf("DELAY(%d)...", n); +#endif + /* + * Read the counter first, so that the rest of the setup overhead is + * counted. Guess the initial overhead is 20 usec (on most systems it + * takes about 1.5 usec for each of the i/o's in getit(). The loop + * takes about 6 usec on a 486/33 and 13 usec on a 386/20. The + * multiplications and divisions to scale the count take a while). + * + * However, if ddb is active then use a fake counter since reading + * the i8254 counter involves acquiring a lock. ddb must not go + * locking for many reasons, but it calls here for at least atkbd + * input. + */ + prev_tick = getit(); + + n -= 0; /* XXX actually guess no initial overhead */ + /* + * Calculate (n * (timer_freq / 1e6)) without using floating point + * and without any avoidable overflows. + */ + if (n <= 0) + ticks_left = 0; + else if (n < 256) + /* + * Use fixed point to avoid a slow division by 1000000. + * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest. + * 2^15 is the first power of 2 that gives exact results + * for n between 0 and 256. 
+ */ + ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15; + else + /* + * Don't bother using fixed point, although gcc-2.7.2 + * generates particularly poor code for the long long + * division, since even the slow way will complete long + * before the delay is up (unless we're interrupted). + */ + ticks_left = ((u_int)n * (long long)timer_freq + 999999) + / 1000000; + + while (ticks_left > 0) { + tick = getit(); +#ifdef DELAYDEBUG + ++getit_calls; +#endif + delta = tick - prev_tick; + prev_tick = tick; + if (delta < 0) { + /* + * Guard against timer0_max_count being wrong. + * This shouldn't happen in normal operation, + * but it may happen if set_timer_freq() is + * traced. + */ + /* delta += timer0_max_count; ??? */ + if (delta < 0) + delta = 0; + } + ticks_left -= delta; + } +#ifdef DELAYDEBUG + if (state == 1) + printf(" %d calls to getit() at %d usec each\n", + getit_calls, (n + 5) / getit_calls); +#endif +} + + +int +sysbeep(int pitch, int period) +{ + return (0); +} + +/* + * Restore all the timers non-atomically (XXX: should be atomically). + * + * This function is called from pmtimer_resume() to restore all the timers. + * This should not be necessary, but there are broken laptops that do not + * restore all the timers on resume. + */ +void +timer_restore(void) +{ + /* Get timebases for new environment. */ + __get_time_values_from_xen(); + + /* Reset our own concept of passage of system time. 
*/ + processed_system_time = shadow_system_time; +} + +void +startrtclock() +{ + unsigned long long alarm; + uint64_t __cpu_khz; + uint32_t cpu_khz; + + __cpu_khz = HYPERVISOR_shared_info->cpu_freq; + __cpu_khz /= 1000; + cpu_khz = (uint32_t)__cpu_khz; + printk("Xen reported: %lu.%03lu MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + + /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = + (2^32 * 1 / (clocks/us)) */ + { + unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (fast_gettimeoffset_quotient), "=d" (edx) + :"r" (cpu_khz), + "0" (eax), "1" (edx)); + } + + set_cyc2ns_scale(cpu_khz/1000); + timer_freq = tsc_freq = xen_timecounter.tc_frequency = cpu_khz * 1000; + tc_init(&xen_timecounter); + + + rdtscll(alarm); +} + +/* + * Initialize the time of day register, based on the time base which is, e.g. + * from a filesystem. + */ +void +inittodr(time_t base) +{ + int s, y; + struct timespec ts; + + s = splclock(); + if (base) { + ts.tv_sec = base; + ts.tv_nsec = 0; + tc_setclock(&ts); + } + + y = time_second - shadow_tv.tv_sec; + if (y <= -2 || y >= 2) { + /* badly off, adjust it */ + ts.tv_sec = shadow_tv.tv_sec; + ts.tv_nsec = shadow_tv.tv_usec * 1000; + tc_setclock(&ts); + } + splx(s); +} + +/* + * Write system time back to RTC. Not supported for guest domains. + */ +void +resettodr() +{ +} + + +/* + * Start clocks running. + */ +void +cpu_initclocks() +{ + int diag; + int time_irq = bind_virq_to_irq(VIRQ_TIMER); + + if ((diag = intr_add_handler("clk", time_irq, + (driver_intr_t *)clkintr, NULL, + INTR_TYPE_CLK | INTR_FAST, NULL))) { + panic("failed to register clock interrupt: %d\n", diag); + } + + /* should fast clock be enabled ? 
*/ + + /* initialize xen values */ + __get_time_values_from_xen(); + processed_system_time = shadow_system_time; +} + +void +cpu_startprofclock(void) +{ + + printf("cpu_startprofclock: profiling clock is not supported\n"); +} + +void +cpu_stopprofclock(void) +{ + + printf("cpu_stopprofclock: profiling clock is not supported\n"); +} + +static uint32_t +xen_get_timecount(struct timecounter *tc) +{ + __get_time_values_from_xen(); + return shadow_tsc_stamp; +} + +/* + * Track behavior of cur_timer->get_offset() functionality in timer_tsc.c + */ +#undef rdtsc +#define rdtsc(low,high) \ + __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) + +static uint32_t +xen_get_offset(void) +{ + register unsigned long eax, edx; + + /* Read the Time Stamp Counter */ + + rdtsc(eax,edx); + + /* .. relative to previous jiffy (32 bits is enough) */ + eax -= shadow_tsc_stamp; + + /* + * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient + * = (tsc_low delta) * (usecs_per_clock) + * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) + * + * Using a mull instead of a divl saves up to 31 clock cycles + * in the critical path. + */ + + __asm__("mull %2" + :"=a" (eax), "=d" (edx) + :"rm" (fast_gettimeoffset_quotient), + "0" (eax)); + + /* our adjusted time offset in microseconds */ + return edx; +} + +void +idle_block(void) +{ + if (HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK) == 0) + HYPERVISOR_block(); +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/critical.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/critical.c new file mode 100644 index 0000000000..ce388fa048 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/critical.c @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2002 Matthew Dillon. All Rights Reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/critical.c,v 1.12 2003/11/03 21:06:54 jhb Exp $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <machine/critical.h> +#include <machine/psl.h> + +/* + * cpu_critical_fork_exit() - cleanup after fork + * + * Enable interrupts in the saved copy of eflags. 
+ */ +void +cpu_critical_fork_exit(void) +{ + curthread->td_md.md_savecrit = 0; +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/ctrl_if.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/ctrl_if.c new file mode 100644 index 0000000000..8e8ce9fde7 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/ctrl_if.c @@ -0,0 +1,476 @@ +/****************************************************************************** + * ctrl_if.c + * + * Management functions for special interface to the domain controller. + * + * Copyright (c) 2004, K A Fraser + * Copyright (c) 2004, K M Macy + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/uio.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/selinfo.h> +#include <sys/poll.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/ioccom.h> +#include <sys/taskqueue.h> + + +#include <machine/cpufunc.h> +#include <machine/intr_machdep.h> +#include <machine/xen-os.h> +#include <machine/xen_intr.h> +#include <machine/bus.h> +#include <sys/rman.h> +#include <machine/resource.h> +#include <machine/synch_bitops.h> + + +#include <machine/hypervisor-ifs.h> + +#include <machine/ctrl_if.h> +#include <machine/evtchn.h> + +/* + * Only used by initial domain which must create its own control-interface + * event channel. This value is picked up by the user-space domain controller + * via an ioctl. + */ +int initdom_ctrlif_domcontroller_port = -1; + +static int ctrl_if_evtchn; +static int ctrl_if_irq; +static struct mtx ctrl_if_lock; +static int * ctrl_if_wchan = &ctrl_if_evtchn; + + +static CONTROL_RING_IDX ctrl_if_tx_resp_cons; +static CONTROL_RING_IDX ctrl_if_rx_req_cons; + +/* Incoming message requests. */ + /* Primary message type -> message handler. */ +static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256]; + /* Primary message type -> callback in process context? 
*/ +static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)]; + /* Queue up messages to be handled in process context. */ +static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE]; +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod; +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons; + +/* Incoming message responses: message identifier -> message handler/id. */ +static struct { + ctrl_msg_handler_t fn; + unsigned long id; +} ctrl_if_txmsg_id_mapping[CONTROL_RING_SIZE]; + +/* + * FreeBSD task queues don't allow you to requeue an already executing task. + * Since ctrl_if_interrupt clears the TX_FULL condition and schedules any + * waiting tasks, which themselves may need to schedule a new task + * (due to new a TX_FULL condition), we ping-pong between these A/B task queues. + * The interrupt runs anything on the current queue and moves the index so that + * future schedulings occur on the next queue. We should never get into a + * situation where there is a task scheduleded on both the A & B queues. + */ +TASKQUEUE_DECLARE(ctrl_if_txA); +TASKQUEUE_DEFINE(ctrl_if_txA, NULL, NULL, {}); +TASKQUEUE_DECLARE(ctrl_if_txB); +TASKQUEUE_DEFINE(ctrl_if_txB, NULL, NULL, {}); +struct taskqueue **taskqueue_ctrl_if_tx[2] = { &taskqueue_ctrl_if_txA, + &taskqueue_ctrl_if_txB }; +int ctrl_if_idx; + +static struct task ctrl_if_rx_tasklet; +static struct task ctrl_if_tx_tasklet; + /* Passed to schedule_task(). 
*/ +static struct task ctrl_if_rxmsg_deferred_task; + + + +#define get_ctrl_if() ((control_if_t *)((char *)HYPERVISOR_shared_info + 2048)) +#define TX_FULL(_c) \ + (((_c)->tx_req_prod - ctrl_if_tx_resp_cons) == CONTROL_RING_SIZE) + +static void +ctrl_if_notify_controller(void) +{ + notify_via_evtchn(ctrl_if_evtchn); +} + +static void +ctrl_if_rxmsg_default_handler(ctrl_msg_t *msg, unsigned long id) +{ + msg->length = 0; + ctrl_if_send_response(msg); +} + +static void +__ctrl_if_tx_tasklet(void *context __unused, int pending __unused) +{ + control_if_t *ctrl_if = get_ctrl_if(); + ctrl_msg_t *msg; + int was_full = TX_FULL(ctrl_if); + + while ( ctrl_if_tx_resp_cons != ctrl_if->tx_resp_prod ) + { + msg = &ctrl_if->tx_ring[MASK_CONTROL_IDX(ctrl_if_tx_resp_cons)]; + + /* Execute the callback handler, if one was specified. */ + if ( msg->id != 0xFF ) + { + (*ctrl_if_txmsg_id_mapping[msg->id].fn)( + msg, ctrl_if_txmsg_id_mapping[msg->id].id); + smp_mb(); /* Execute, /then/ free. */ + ctrl_if_txmsg_id_mapping[msg->id].fn = NULL; + } + + /* + * Step over the message in the ring /after/ finishing reading it. As + * soon as the index is updated then the message may get blown away. 
+ */ + smp_mb(); + ctrl_if_tx_resp_cons++; + } + + if ( was_full && !TX_FULL(ctrl_if) ) + { + wakeup(ctrl_if_wchan); + + /* bump idx so future enqueues will occur on the next taskq + * process any currently pending tasks + */ + ctrl_if_idx++; + taskqueue_run(*taskqueue_ctrl_if_tx[(ctrl_if_idx-1) & 1]); + } +} + +static void +__ctrl_if_rxmsg_deferred_task(void *context __unused, int pending __unused) +{ + ctrl_msg_t *msg; + + while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod ) + { + msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( + ctrl_if_rxmsg_deferred_cons++)]; + (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); + } +} + +static void +__ctrl_if_rx_tasklet(void *context __unused, int pending __unused) +{ + control_if_t *ctrl_if = get_ctrl_if(); + ctrl_msg_t msg, *pmsg; + + while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod ) + { + pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)]; + memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg)); + if ( msg.length != 0 ) + memcpy(msg.msg, pmsg->msg, msg.length); + if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) ) + { + pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( + ctrl_if_rxmsg_deferred_prod++)]; + memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length); + taskqueue_enqueue(taskqueue_thread, &ctrl_if_rxmsg_deferred_task); + } + else + { + (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0); + } + } +} + +static void +ctrl_if_interrupt(void *ctrl_sc) +/* (int irq, void *dev_id, struct pt_regs *regs) */ +{ + control_if_t *ctrl_if = get_ctrl_if(); + + if ( ctrl_if_tx_resp_cons != ctrl_if->tx_resp_prod ) + taskqueue_enqueue(taskqueue_swi, &ctrl_if_tx_tasklet); + + + if ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod ) + taskqueue_enqueue(taskqueue_swi, &ctrl_if_rx_tasklet); +} + +int +ctrl_if_send_message_noblock( + ctrl_msg_t *msg, + ctrl_msg_handler_t hnd, + unsigned long id) +{ + control_if_t *ctrl_if = get_ctrl_if(); + unsigned long flags; + int i; + + mtx_lock_irqsave(&ctrl_if_lock, flags); + + 
if ( TX_FULL(ctrl_if) ) + { + mtx_unlock_irqrestore(&ctrl_if_lock, flags); + return EAGAIN; + } + + msg->id = 0xFF; + if ( hnd != NULL ) + { + for ( i = 0; ctrl_if_txmsg_id_mapping[i].fn != NULL; i++ ) + continue; + ctrl_if_txmsg_id_mapping[i].fn = hnd; + ctrl_if_txmsg_id_mapping[i].id = id; + msg->id = i; + } + + memcpy(&ctrl_if->tx_ring[MASK_CONTROL_IDX(ctrl_if->tx_req_prod)], + msg, sizeof(*msg)); + wmb(); /* Write the message before letting the controller peek at it. */ + ctrl_if->tx_req_prod++; + + mtx_unlock_irqrestore(&ctrl_if_lock, flags); + + ctrl_if_notify_controller(); + + return 0; +} + +int +ctrl_if_send_message_block( + ctrl_msg_t *msg, + ctrl_msg_handler_t hnd, + unsigned long id, + long wait_state) +{ + int rc, sst = 0; + + /* Fast path. */ + if ( (rc = ctrl_if_send_message_noblock(msg, hnd, id)) != EAGAIN ) + return rc; + + + for ( ; ; ) + { + + if ( (rc = ctrl_if_send_message_noblock(msg, hnd, id)) != EAGAIN ) + break; + + if ( sst != 0) + return EINTR; + + sst = tsleep(ctrl_if_wchan, PWAIT|PCATCH, "ctlrwt", 10); + } + + return rc; +} + +int +ctrl_if_enqueue_space_callback(struct task *task) +{ + control_if_t *ctrl_if = get_ctrl_if(); + + /* Fast path. */ + if ( !TX_FULL(ctrl_if) ) + return 0; + + (void)taskqueue_enqueue(*taskqueue_ctrl_if_tx[(ctrl_if_idx & 1)], task); + + /* + * We may race execution of the task queue, so return re-checked status. If + * the task is not executed despite the ring being non-full then we will + * certainly return 'not full'. + */ + smp_mb(); + return TX_FULL(ctrl_if); +} + +void +ctrl_if_send_response(ctrl_msg_t *msg) +{ + control_if_t *ctrl_if = get_ctrl_if(); + unsigned long flags; + ctrl_msg_t *dmsg; + + /* + * NB. The response may the original request message, modified in-place. + * In this situation we may have src==dst, so no copying is required. 
+ */ + mtx_lock_irqsave(&ctrl_if_lock, flags); + dmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if->rx_resp_prod)]; + if ( dmsg != msg ) + memcpy(dmsg, msg, sizeof(*msg)); + wmb(); /* Write the message before letting the controller peek at it. */ + ctrl_if->rx_resp_prod++; + mtx_unlock_irqrestore(&ctrl_if_lock, flags); + + ctrl_if_notify_controller(); +} + +int +ctrl_if_register_receiver( + uint8_t type, + ctrl_msg_handler_t hnd, + unsigned int flags) +{ + unsigned long _flags; + int inuse; + + mtx_lock_irqsave(&ctrl_if_lock, _flags); + + inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler); + + if ( inuse ) + { + printk("Receiver %p already established for control " + "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type); + } + else + { + ctrl_if_rxmsg_handler[type] = hnd; + clear_bit(type, &ctrl_if_rxmsg_blocking_context); + if ( flags == CALLBACK_IN_BLOCKING_CONTEXT ) + { + set_bit(type, &ctrl_if_rxmsg_blocking_context); + } + } + + mtx_unlock_irqrestore(&ctrl_if_lock, _flags); + + return !inuse; +} + +void +ctrl_if_unregister_receiver(uint8_t type, ctrl_msg_handler_t hnd) +{ + unsigned long flags; + + mtx_lock_irqsave(&ctrl_if_lock, flags); + + if ( ctrl_if_rxmsg_handler[type] != hnd ) + printk("Receiver %p is not registered for control " + "messages of type %d.\n", hnd, type); + else + ctrl_if_rxmsg_handler[type] = ctrl_if_rxmsg_default_handler; + + mtx_unlock_irqrestore(&ctrl_if_lock, flags); + + /* Ensure that @hnd will not be executed after this function returns. */ + /* XXX need rx_tasklet_lock -- can cheat for now?*/ +#ifdef notyet + tasklet_unlock_wait(&ctrl_if_rx_tasklet); +#endif +} + +void +ctrl_if_suspend(void) +{ + /* I'm not sure what the equivalent is - we aren't going to support suspend + * yet anyway + */ +#ifdef notyet + free_irq(ctrl_if_irq, NULL); +#endif + unbind_evtchn_from_irq(ctrl_if_evtchn); +} + +/** Reset the control interface progress pointers. + * Marks the queues empty if 'clear' non-zero. 
+ */ +static void +ctrl_if_reset(int clear) +{ + control_if_t *ctrl_if = get_ctrl_if(); + + if (clear) { + *ctrl_if = (control_if_t){}; + } + + ctrl_if_tx_resp_cons = ctrl_if->tx_resp_prod; + ctrl_if_rx_req_cons = ctrl_if->rx_resp_prod; +} + + +void +ctrl_if_resume(void) +{ + if ( xen_start_info->flags & SIF_INITDOMAIN ) + { + /* + * The initial domain must create its own domain-controller link. + * The controller is probably not running at this point, but will + * pick up its end of the event channel from + */ + evtchn_op_t op; + op.cmd = EVTCHNOP_bind_interdomain; + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = DOMID_SELF; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = 0; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("event_channel_op failed\n"); + xen_start_info->domain_controller_evtchn = op.u.bind_interdomain.port1; + initdom_ctrlif_domcontroller_port = op.u.bind_interdomain.port2; + } + + ctrl_if_reset(0); + + ctrl_if_evtchn = xen_start_info->domain_controller_evtchn; + ctrl_if_irq = bind_evtchn_to_irq(ctrl_if_evtchn); + + /* + * I have not taken the time to determine what the interrupt thread priorities + * correspond to - this interface is used for network and disk, network would + * seem higher priority, hence I'm using it + */ + + intr_add_handler("ctrl-if", ctrl_if_irq, (driver_intr_t*)ctrl_if_interrupt, + NULL, INTR_TYPE_NET | INTR_MPSAFE, NULL); +} + +static void +ctrl_if_init(void *dummy __unused) +{ + int i; + + for ( i = 0; i < 256; i++ ) + ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler; + + mtx_init(&ctrl_if_lock, "ctrlif", NULL, MTX_SPIN | MTX_NOWITNESS); + + TASK_INIT(&ctrl_if_tx_tasklet, 0, __ctrl_if_tx_tasklet, NULL); + + TASK_INIT(&ctrl_if_rx_tasklet, 0, __ctrl_if_rx_tasklet, NULL); + + TASK_INIT(&ctrl_if_rxmsg_deferred_task, 0, __ctrl_if_rxmsg_deferred_task, NULL); + + ctrl_if_reset(1); + ctrl_if_resume(); +} + +/* + * !! The following are DANGEROUS FUNCTIONS !! 
+ * Use with care [for example, see xencons_force_flush()]. + */ + +int +ctrl_if_transmitter_empty(void) +{ + return (get_ctrl_if()->tx_req_prod == ctrl_if_tx_resp_cons); +} + +void +ctrl_if_discard_responses(void) +{ + ctrl_if_tx_resp_cons = get_ctrl_if()->tx_resp_prod; +} + +SYSINIT(ctrl_if_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, ctrl_if_init, NULL); diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/db_interface.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/db_interface.c new file mode 100644 index 0000000000..57aa4e2ef4 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/db_interface.c @@ -0,0 +1,209 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/db_interface.c,v 1.77 2003/11/08 03:01:26 alc Exp $"); + +/* + * Interface to new debugger. 
+ */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/reboot.h> +#include <sys/cons.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/smp.h> + +#include <machine/cpu.h> +#ifdef SMP +#include <machine/smptests.h> /** CPUSTOP_ON_DDBBREAK */ +#endif + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <ddb/ddb.h> + +#include <machine/setjmp.h> +#include <machine/xenfunc.h> + + +static jmp_buf *db_nofault = 0; +extern jmp_buf db_jmpbuf; + +extern void gdb_handle_exception(db_regs_t *, int, int); + +int db_active; +db_regs_t ddb_regs; + +static __inline u_short +rss(void) +{ + u_short ss; +#ifdef __GNUC__ + __asm __volatile("mov %%ss,%0" : "=r" (ss)); +#else + ss = 0; /* XXXX Fix for other compilers. */ +#endif + return ss; +} + +/* + * kdb_trap - field a TRACE or BPT trap + */ +int +kdb_trap(int type, int code, struct i386_saved_state *regs) +{ + volatile int ddb_mode = !(boothowto & RB_GDB); + + disable_intr(); + + if (ddb_mode) { + /* we can't do much as a guest domain except print a + * backtrace and die gracefuly. The reason is that we + * can't get character input to make this work. + */ + db_active = 1; + db_print_backtrace(); + db_printf("************ Domain shutting down ************\n"); + HYPERVISOR_shutdown(); + } else { + Debugger("kdb_trap"); + } + return (1); +} + +/* + * Read bytes from kernel address space for debugger. + */ +void +db_read_bytes(vm_offset_t addr, size_t size, char *data) +{ + char *src; + + db_nofault = &db_jmpbuf; + + src = (char *)addr; + while (size-- > 0) + *data++ = *src++; + + db_nofault = 0; +} + +/* + * Write bytes to kernel address space for debugger. 
+ */ +void +db_write_bytes(vm_offset_t addr, size_t size, char *data) +{ + char *dst; + + pt_entry_t *ptep0 = NULL; + pt_entry_t oldmap0 = 0; + vm_offset_t addr1; + pt_entry_t *ptep1 = NULL; + pt_entry_t oldmap1 = 0; + + db_nofault = &db_jmpbuf; + + if (addr > trunc_page((vm_offset_t)btext) - size && + addr < round_page((vm_offset_t)etext)) { + + ptep0 = pmap_pte(kernel_pmap, addr); + oldmap0 = *ptep0; + *ptep0 |= PG_RW; + + /* Map another page if the data crosses a page boundary. */ + if ((*ptep0 & PG_PS) == 0) { + addr1 = trunc_page(addr + size - 1); + if (trunc_page(addr) != addr1) { + ptep1 = pmap_pte(kernel_pmap, addr1); + oldmap1 = *ptep1; + *ptep1 |= PG_RW; + } + } else { + addr1 = trunc_4mpage(addr + size - 1); + if (trunc_4mpage(addr) != addr1) { + ptep1 = pmap_pte(kernel_pmap, addr1); + oldmap1 = *ptep1; + *ptep1 |= PG_RW; + } + } + + invltlb(); + } + + dst = (char *)addr; + + while (size-- > 0) + *dst++ = *data++; + + db_nofault = 0; + + if (ptep0) { + *ptep0 = oldmap0; + + if (ptep1) + *ptep1 = oldmap1; + + invltlb(); + } +} + +/* + * XXX + * Move this to machdep.c and allow it to be called if any debugger is + * installed. + */ +void +Debugger(const char *msg) +{ + static volatile u_int in_Debugger; + + /* + * XXX + * Do nothing if the console is in graphics mode. This is + * OK if the call is for the debugger hotkey but not if the call + * is a weak form of panicing. 
+ */ + if (cons_unavail && !(boothowto & RB_GDB)) + return; + + if (atomic_cmpset_acq_int(&in_Debugger, 0, 1)) { + db_printf("Debugger(\"%s\")\n", msg); + breakpoint(); + atomic_store_rel_int(&in_Debugger, 0); + } +} + +void +db_show_mdpcpu(struct pcpu *pc) +{ + + db_printf("APIC ID = %d\n", pc->pc_apic_id); + db_printf("currentldt = 0x%x\n", pc->pc_currentldt); +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/evtchn.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/evtchn.c new file mode 100644 index 0000000000..635a3bfe4e --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/evtchn.c @@ -0,0 +1,580 @@ +/****************************************************************************** + * evtchn.c + * + * Communication via Xen event channels. + * + * Copyright (c) 2002-2004, K A Fraser + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <machine/cpufunc.h> +#include <machine/intr_machdep.h> +#include <machine/xen-os.h> +#include <machine/xen_intr.h> +#include <machine/synch_bitops.h> +#include <machine/evtchn.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs.h> + + +static struct mtx irq_mapping_update_lock; + +#define TODO printf("%s: not implemented!\n", __func__) + +/* IRQ <-> event-channel mappings. */ +static int evtchn_to_irq[NR_EVENT_CHANNELS]; +static int irq_to_evtchn[NR_IRQS]; + +/* IRQ <-> VIRQ mapping. */ +static int virq_to_irq[NR_VIRQS]; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +#define VALID_EVTCHN(_chn) ((_chn) != -1) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. 
+ */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0); +} + +void +evtchn_do_upcall(struct intrframe *frame) +{ + unsigned long l1, l2; + unsigned int l1i, l2i, port; + int irq, owned; + unsigned long flags; + shared_info_t *s = HYPERVISOR_shared_info; + + local_irq_save(flags); + + while ( s->vcpu_data[0].evtchn_upcall_pending ) + { + s->vcpu_data[0].evtchn_upcall_pending = 0; + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + l1 = xen_xchg(&s->evtchn_pending_sel, 0); + while ( (l1i = ffs(l1)) != 0 ) + { + l1i--; + l1 &= ~(1 << l1i); + + l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i]; + while ( (l2i = ffs(l2)) != 0 ) + { + l2i--; + l2 &= ~(1 << l2i); + + port = (l1i << 5) + l2i; + if ((owned = mtx_owned(&sched_lock)) != 0) + mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + if ( (irq = evtchn_to_irq[port]) != -1 ) { + struct intsrc *isrc = intr_lookup_source(irq); + intr_execute_handlers(isrc, frame); + + } else { + evtchn_device_upcall(port); + } + if (owned) + mtx_lock_spin_flags(&sched_lock, MTX_QUIET); + } + } + } + + local_irq_restore(flags); + +} + + +static int +find_unbound_irq(void) +{ + int irq; + + for ( irq = 0; irq < NR_IRQS; irq++ ) + if ( irq_bindcount[irq] == 0 ) + break; + + if ( irq == NR_IRQS ) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +int +bind_virq_to_irq(int virq) +{ + evtchn_op_t op; + int evtchn, irq; + + mtx_lock(&irq_mapping_update_lock); + + if ( (irq = virq_to_irq[virq]) == -1 ) + { + op.cmd = EVTCHNOP_bind_virq; + op.u.bind_virq.virq = virq; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to bind virtual IRQ %d\n", virq); + evtchn = op.u.bind_virq.port; + + irq = find_unbound_irq(); + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + virq_to_irq[virq] = irq; + } + + irq_bindcount[irq]++; + + mtx_unlock(&irq_mapping_update_lock); + + return irq; +} + +void +unbind_virq_from_irq(int virq) +{ + evtchn_op_t op; + int irq = 
virq_to_irq[virq]; + int evtchn = irq_to_evtchn[irq]; + + mtx_lock(&irq_mapping_update_lock); + + if ( --irq_bindcount[irq] == 0 ) + { + op.cmd = EVTCHNOP_close; + op.u.close.dom = DOMID_SELF; + op.u.close.port = evtchn; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to unbind virtual IRQ %d\n", virq); + + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; + virq_to_irq[virq] = -1; + } + + mtx_unlock(&irq_mapping_update_lock); +} + +int +bind_evtchn_to_irq(int evtchn) +{ + int irq; + + mtx_lock(&irq_mapping_update_lock); + + if ( (irq = evtchn_to_irq[evtchn]) == -1 ) + { + irq = find_unbound_irq(); + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + } + + irq_bindcount[irq]++; + + mtx_unlock(&irq_mapping_update_lock); + + return irq; +} + +void +unbind_evtchn_from_irq(int evtchn) +{ + int irq = evtchn_to_irq[evtchn]; + + mtx_lock(&irq_mapping_update_lock); + + if ( --irq_bindcount[irq] == 0 ) + { + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; + } + + mtx_unlock(&irq_mapping_update_lock); +} + + +/* + * Interface to generic handling in intr_machdep.c + */ + + +/*------------ interrupt handling --------------------------------------*/ +#define TODO printf("%s: not implemented!\n", __func__) + + struct mtx xenpic_lock; + +struct xenpic_intsrc { + struct intsrc xp_intsrc; + uint8_t xp_vector; + boolean_t xp_masked; +}; + +struct xenpic { + struct pic xp_pic; /* this MUST be first */ + uint16_t xp_numintr; + struct xenpic_intsrc xp_pins[0]; +}; + +static void xenpic_enable_dynirq_source(struct intsrc *isrc); +static void xenpic_disable_dynirq_source(struct intsrc *isrc, int); +static void xenpic_eoi_source(struct intsrc *isrc); +static void xenpic_enable_dynirq_intr(struct intsrc *isrc); +static int xenpic_vector(struct intsrc *isrc); +static int xenpic_source_pending(struct intsrc *isrc); +static void xenpic_suspend(struct intsrc *isrc); +static void xenpic_resume(struct intsrc *isrc); + + +struct pic xenpic_template = { + 
xenpic_enable_dynirq_source, + xenpic_disable_dynirq_source, + xenpic_eoi_source, + xenpic_enable_dynirq_intr, + xenpic_vector, + xenpic_source_pending, + xenpic_suspend, + xenpic_resume +}; + + +void +xenpic_enable_dynirq_source(struct intsrc *isrc) +{ + unsigned int irq; + struct xenpic_intsrc *xp; + + xp = (struct xenpic_intsrc *)isrc; + + if (xp->xp_masked) { + irq = xenpic_vector(isrc); + unmask_evtchn(irq_to_evtchn[irq]); + xp->xp_masked = FALSE; + } +} + +static void +xenpic_disable_dynirq_source(struct intsrc *isrc, int foo) +{ + unsigned int irq; + struct xenpic_intsrc *xp; + + xp = (struct xenpic_intsrc *)isrc; + + if (!xp->xp_masked) { + irq = xenpic_vector(isrc); + mask_evtchn(irq_to_evtchn[irq]); + xp->xp_masked = TRUE; + } + +} + +static void +xenpic_enable_dynirq_intr(struct intsrc *isrc) +{ + unsigned int irq; + + irq = xenpic_vector(isrc); + unmask_evtchn(irq_to_evtchn[irq]); +} + +static void +xenpic_eoi_source(struct intsrc *isrc) +{ + unsigned int irq = xenpic_vector(isrc); + clear_evtchn(irq_to_evtchn[irq]); +} + +static int +xenpic_vector(struct intsrc *isrc) +{ + struct xenpic_intsrc *pin = (struct xenpic_intsrc *)isrc; + return (pin->xp_vector); +} + +static int +xenpic_source_pending(struct intsrc *isrc) +{ + TODO; + return 0; +} + +static void +xenpic_suspend(struct intsrc *isrc) +{ + TODO; +} + +static void +xenpic_resume(struct intsrc *isrc) +{ + TODO; +} + +#ifdef CONFIG_PHYSDEV +/* required for support of physical devices */ +static inline void +pirq_unmask_notify(int pirq) +{ + physdev_op_t op; + if ( unlikely(test_bit(pirq, &pirq_needs_unmask_notify[0])) ) + { + op.cmd = PHYSDEVOP_IRQ_UNMASK_NOTIFY; + (void)HYPERVISOR_physdev_op(&op); + } +} + +static inline void +pirq_query_unmask(int pirq) +{ + physdev_op_t op; + op.cmd = PHYSDEVOP_IRQ_STATUS_QUERY; + op.u.irq_status_query.irq = pirq; + (void)HYPERVISOR_physdev_op(&op); + clear_bit(pirq, &pirq_needs_unmask_notify[0]); + if ( op.u.irq_status_query.flags & 
PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY ) + set_bit(pirq, &pirq_needs_unmask_notify[0]); +} + +/* + * On startup, if there is no action associated with the IRQ then we are + * probing. In this case we should not share with others as it will confuse us. + */ +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL) + +static unsigned int startup_pirq(unsigned int irq) +{ + evtchn_op_t op; + int evtchn; + + op.cmd = EVTCHNOP_bind_pirq; + op.u.bind_pirq.pirq = irq; + /* NB. We are happy to share unless we are probing. */ + op.u.bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + { + if ( !probing_irq(irq) ) /* Some failures are expected when probing. */ + printk(KERN_INFO "Failed to obtain physical IRQ %d\n", irq); + return 0; + } + evtchn = op.u.bind_pirq.port; + + pirq_query_unmask(irq_to_pirq(irq)); + + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + unmask_evtchn(evtchn); + pirq_unmask_notify(irq_to_pirq(irq)); + + return 0; +} + +static void shutdown_pirq(unsigned int irq) +{ + evtchn_op_t op; + int evtchn = irq_to_evtchn[irq]; + + if ( !VALID_EVTCHN(evtchn) ) + return; + + mask_evtchn(evtchn); + + op.cmd = EVTCHNOP_close; + op.u.close.dom = DOMID_SELF; + op.u.close.port = evtchn; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to unbind physical IRQ %d\n", irq); + + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; +} + +static void enable_pirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + unmask_evtchn(evtchn); + pirq_unmask_notify(irq_to_pirq(irq)); +} + +static void disable_pirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + mask_evtchn(evtchn); +} + +static void ack_pirq(unsigned int irq) +{ + int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + mask_evtchn(evtchn); + clear_evtchn(evtchn); +} + +static void end_pirq(unsigned int irq) +{ + 
int evtchn = irq_to_evtchn[irq]; + if ( !VALID_EVTCHN(evtchn) ) + return; + if ( !(irq_desc[irq].status & IRQ_DISABLED) ) + { + unmask_evtchn(evtchn); + pirq_unmask_notify(irq_to_pirq(irq)); + } +} + +static struct hw_interrupt_type pirq_type = { + "Phys-irq", + startup_pirq, + shutdown_pirq, + enable_pirq, + disable_pirq, + ack_pirq, + end_pirq, + NULL +}; +#endif + + +static void +misdirect_interrupt(void *sc) +{ +} + +void irq_suspend(void) +{ + int virq, irq, evtchn; + + /* Unbind VIRQs from event channels. */ + for ( virq = 0; virq < NR_VIRQS; virq++ ) + { + if ( (irq = virq_to_irq[virq]) == -1 ) + continue; + evtchn = irq_to_evtchn[irq]; + + /* Mark the event channel as unused in our table. */ + evtchn_to_irq[evtchn] = -1; + irq_to_evtchn[irq] = -1; + } + + /* + * We should now be unbound from all event channels. Stale bindings to + * PIRQs and/or inter-domain event channels will cause us to barf here. + */ + for ( evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++ ) + if ( evtchn_to_irq[evtchn] != -1 ) + panic("Suspend attempted while bound to evtchn %d.\n", evtchn); +} + + +void irq_resume(void) +{ + evtchn_op_t op; + int virq, irq, evtchn; + + for ( evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++ ) + mask_evtchn(evtchn); /* New event-channel space is not 'live' yet. */ + + for ( virq = 0; virq < NR_VIRQS; virq++ ) + { + if ( (irq = virq_to_irq[virq]) == -1 ) + continue; + + /* Get a new binding from Xen. */ + op.cmd = EVTCHNOP_bind_virq; + op.u.bind_virq.virq = virq; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to bind virtual IRQ %d\n", virq); + evtchn = op.u.bind_virq.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + /* Ready for use. 
*/ + unmask_evtchn(evtchn); + } +} + +static void +evtchn_init(void *dummy __unused) +{ + int i; + struct xenpic *xp; + struct xenpic_intsrc *pin; + + /* + * xenpic_lock: in order to allow an interrupt to occur in a critical + * section, to set pcpu->ipending (etc...) properly, we + * must be able to get the icu lock, so it can't be + * under witness. + */ + mtx_init(&irq_mapping_update_lock, "xp", NULL, MTX_DEF); + + /* No VIRQ -> IRQ mappings. */ + for ( i = 0; i < NR_VIRQS; i++ ) + virq_to_irq[i] = -1; + + /* No event-channel -> IRQ mappings. */ + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + { + evtchn_to_irq[i] = -1; + mask_evtchn(i); /* No event channels are 'live' right now. */ + } + + /* No IRQ -> event-channel mappings. */ + for ( i = 0; i < NR_IRQS; i++ ) + irq_to_evtchn[i] = -1; + + xp = malloc(sizeof(struct xenpic) + NR_DYNIRQS*sizeof(struct xenpic_intsrc), M_DEVBUF, M_WAITOK); + xp->xp_pic = xenpic_template; + xp->xp_numintr = NR_DYNIRQS; + bzero(xp->xp_pins, sizeof(struct xenpic_intsrc) * NR_DYNIRQS); + + for ( i = 0, pin = xp->xp_pins; i < NR_DYNIRQS; i++, pin++ ) + { + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + irq_bindcount[dynirq_to_irq(i)] = 0; + + pin->xp_intsrc.is_pic = (struct pic *)xp; + pin->xp_vector = i; + intr_register_source(&pin->xp_intsrc); + } + /* We don't currently have any support for physical devices in XenoFreeBSD + * so leaving this out for the moment for the sake of expediency. + */ +#ifdef notyet + for ( i = 0; i < NR_PIRQS; i++ ) + { + /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. 
*/ + irq_bindcount[pirq_to_irq(i)] = 1; + + irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED; + irq_desc[pirq_to_irq(i)].action = 0; + irq_desc[pirq_to_irq(i)].depth = 1; + irq_desc[pirq_to_irq(i)].handler = &pirq_type; + } + +#endif + (void) intr_add_handler("xb_mis", bind_virq_to_irq(VIRQ_MISDIRECT), + (driver_intr_t *)misdirect_interrupt, + NULL, INTR_TYPE_MISC, NULL); +} + +SYSINIT(evtchn_init, SI_SUB_INTR, SI_ORDER_ANY, evtchn_init, NULL); diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/exception.s b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/exception.s new file mode 100644 index 0000000000..4adb61a350 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/exception.s @@ -0,0 +1,428 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz. + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/i386/i386/exception.s,v 1.106 2003/11/03 22:08:52 jhb Exp $ + */ + +#include "opt_npx.h" + +#include <machine/asmacros.h> +#include <machine/psl.h> +#include <machine/trap.h> + +#include "assym.s" + +#define SEL_RPL_MASK 0x0002 +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 +#define XEN_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) +#define XEN_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) +#define XEN_TEST_PENDING(reg) testb $0x1,evtchn_upcall_pending(reg) + + +#define POPA \ + popl %edi; \ + popl %esi; \ + popl %ebp; \ + popl %ebx; \ + popl %ebx; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + + .text + +/*****************************************************************************/ +/* Trap handling */ +/*****************************************************************************/ +/* + * Trap and fault vector routines. + * + * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on + * the stack that mostly looks like an interrupt, but does not disable + * interrupts. A few of the traps we are use are interrupt gates, + * SDT_SYS386IGT, which are nearly the same thing except interrupts are + * disabled on entry. + * + * The cpu will push a certain amount of state onto the kernel stack for + * the current process. The amount of state depends on the type of trap + * and whether the trap crossed rings or not. 
See i386/include/frame.h. + * At the very least the current EFLAGS (status register, which includes + * the interrupt disable state prior to the trap), the code segment register, + * and the return instruction pointer are pushed by the cpu. The cpu + * will also push an 'error' code for certain traps. We push a dummy + * error code for those traps where the cpu doesn't in order to maintain + * a consistent frame. We also push a contrived 'trap number'. + * + * The cpu does not push the general registers, we must do that, and we + * must restore them prior to calling 'iret'. The cpu adjusts the %cs and + * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we + * must load them with appropriate values for supervisor mode operation. + */ + +MCOUNT_LABEL(user) +MCOUNT_LABEL(btrap) + +IDTVEC(div) + pushl $0; pushl $0; TRAP(T_DIVIDE) +IDTVEC(dbg) + pushl $0; pushl $0; TRAP(T_TRCTRAP) +IDTVEC(nmi) + pushl $0; pushl $0; TRAP(T_NMI) +IDTVEC(bpt) + pushl $0; pushl $0; TRAP(T_BPTFLT) +IDTVEC(ofl) + pushl $0; pushl $0; TRAP(T_OFLOW) +IDTVEC(bnd) + pushl $0; pushl $0; TRAP(T_BOUND) +IDTVEC(ill) + pushl $0; pushl $0; TRAP(T_PRIVINFLT) +IDTVEC(dna) + pushl $0; pushl $0; TRAP(T_DNA) +IDTVEC(fpusegm) + pushl $0; pushl $0; TRAP(T_FPOPFLT) +IDTVEC(tss) + pushl $0; TRAP(T_TSSFLT) +IDTVEC(missing) + pushl $0; TRAP(T_SEGNPFLT) +IDTVEC(stk) + pushl $0; TRAP(T_STKFLT) +IDTVEC(prot) + pushl $0; TRAP(T_PROTFLT) +IDTVEC(page) + TRAP(T_PAGEFLT) +IDTVEC(mchk) + pushl $0; pushl $0; TRAP(T_MCHK) +IDTVEC(rsvd) + pushl $0; pushl $0; TRAP(T_RESERVED) +IDTVEC(fpu) + pushl $0; pushl $0; TRAP(T_ARITHTRAP) +IDTVEC(align) + pushl $0; TRAP(T_ALIGNFLT) + +IDTVEC(xmm) + pushl $0; pushl $0; TRAP(T_XMMFLT) + +IDTVEC(hypervisor_callback) + pushl $T_HYPCALLBACK; pushl %eax; TRAP(T_HYPCALLBACK) + +hypervisor_callback_pending: + movl $T_HYPCALLBACK,TF_TRAPNO(%esp) + movl $T_HYPCALLBACK,TF_ERR(%esp) + jmp 11f + + /* + * alltraps entry point. 
Interrupts are enabled if this was a trap + * gate (TGT), else disabled if this was an interrupt gate (IGT). + * Note that int0x80_syscall is a trap gate. Only page faults + * use an interrupt gate. + */ + + SUPERALIGN_TEXT + .globl alltraps + .type alltraps,@function +alltraps: + cld + pushal + pushl %ds + pushl %es + pushl %fs +alltraps_with_regs_pushed: + movl $KDSEL,%eax + movl %eax,%ds + movl %eax,%es + movl $KPSEL,%eax + movl %eax,%fs + FAKE_MCOUNT(TF_EIP(%esp)) +calltrap: + movl TF_EIP(%esp),%eax + cmpl $scrit,%eax + jb 11f + cmpl $ecrit,%eax + jb critical_region_fixup +11: call trap + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + +/* + * SYSCALL CALL GATE (old entry point for a.out binaries) + * + * The intersegment call has been set up to specify one dummy parameter. + * + * This leaves a place to put eflags so that the call frame can be + * converted to a trap frame. Note that the eflags is (semi-)bogusly + * pushed into (what will be) tf_err and then copied later into the + * final spot. It has to be done this way because esp can't be just + * temporarily altered for the pushfl - an interrupt might come in + * and clobber the saved cs/eip. + */ + SUPERALIGN_TEXT +IDTVEC(lcall_syscall) + pushfl /* save eflags */ + popl 8(%esp) /* shuffle into tf_eflags */ + pushl $7 /* sizeof "lcall 7,0" */ + subl $4,%esp /* skip over tf_trapno */ + pushal + pushl %ds + pushl %es + pushl %fs + movl $KDSEL,%eax /* switch to kernel segments */ + movl %eax,%ds + movl %eax,%es + movl $KPSEL,%eax + movl %eax,%fs + FAKE_MCOUNT(TF_EIP(%esp)) + call syscall + MEXITCOUNT + jmp doreti + +/* + * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) + * + * Even though the name says 'int0x80', this is actually a TGT (trap gate) + * rather then an IGT (interrupt gate). Thus interrupts are enabled on + * entry just as they are for a normal syscall. 
+ */ + SUPERALIGN_TEXT +IDTVEC(int0x80_syscall) + pushl $2 /* sizeof "int 0x80" */ + pushl $0xCAFE + pushl $0xDEAD + pushal + pushl %ds + pushl %es + pushl %fs + movl $KDSEL,%eax /* switch to kernel segments */ + movl %eax,%ds + movl %eax,%es + movl $KPSEL,%eax + movl %eax,%fs + FAKE_MCOUNT(TF_EIP(%esp)) + call syscall + MEXITCOUNT + jmp doreti + +ENTRY(fork_trampoline) + pushl %esp /* trapframe pointer */ + pushl %ebx /* arg1 */ + pushl %esi /* function */ + call fork_exit + addl $12,%esp + /* cut from syscall */ + + /* + * Return via doreti to handle ASTs. + */ + MEXITCOUNT + jmp doreti + + +/* +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until weve done all processing. HOWEVER, we must enable events before +# popping the stack frame (cant be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so wed +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +*/ + + +/* + * void doreti(struct trapframe) + * + * Handle return from interrupts, traps and syscalls. + */ + .text + SUPERALIGN_TEXT + .globl doreti + .type doreti,@function +doreti: + FAKE_MCOUNT(bintr) /* init "from" bintr -> doreti */ +doreti_next: + testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ + jz doreti_exit /* #can't handle ASTs now if not */ + +doreti_ast: + /* + * Check for ASTs atomically with returning. Disabling CPU + * interrupts provides sufficient locking even in the SMP case, + * since we will be informed of any new ASTs by an IPI. 
+ */ + + movl HYPERVISOR_shared_info,%esi + XEN_BLOCK_EVENTS(%esi) + movl PCPU(CURTHREAD),%eax + testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) + je doreti_exit + XEN_UNBLOCK_EVENTS(%esi) + pushl %esp /* pass a pointer to the trapframe */ + call ast + add $4,%esp + jmp doreti_ast + +doreti_exit: + /* + * doreti_exit: pop registers, iret. + * + * The segment register pop is a special case, since it may + * fault if (for example) a sigreturn specifies bad segment + * registers. The fault is handled in trap.c. + */ + + movl HYPERVISOR_shared_info,%esi + XEN_UNBLOCK_EVENTS(%esi) # reenable event callbacks (sti) + + .globl scrit +scrit: + XEN_TEST_PENDING(%esi) + jnz hypervisor_callback_pending /* More to go */ + MEXITCOUNT + + .globl doreti_popl_fs +doreti_popl_fs: + popl %fs + .globl doreti_popl_es +doreti_popl_es: + popl %es + .globl doreti_popl_ds +doreti_popl_ds: + popl %ds + POPA + addl $12,%esp + .globl doreti_iret +doreti_iret: + iret + .globl ecrit +ecrit: + + /* + * doreti_iret_fault and friends. Alternative return code for + * the case where we get a fault in the doreti_exit code + * above. trap() (i386/i386/trap.c) catches this specific + * case, sends the process a signal and continues in the + * corresponding place in the code below. + */ + ALIGN_TEXT + .globl doreti_iret_fault +doreti_iret_fault: + subl $12,%esp + pushal + pushl %ds + .globl doreti_popl_ds_fault +doreti_popl_ds_fault: + pushl %es + .globl doreti_popl_es_fault +doreti_popl_es_fault: + pushl %fs + .globl doreti_popl_fs_fault +doreti_popl_fs_fault: + movl $0,TF_ERR(%esp) /* XXX should be the error code */ + movl $T_PROTFLT,TF_TRAPNO(%esp) + jmp alltraps_with_regs_pushed + + + + +/* +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. 
We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +*/ + +.globl critical_region_fixup +critical_region_fixup: + addl $critical_fixup_table-scrit,%eax + movzbl (%eax),%eax # %eax contains num bytes popped + movl %esp,%esi + add %eax,%esi # %esi points at end of src region + movl %esp,%edi + add $0x44,%edi # %edi points at end of dst region + movl %eax,%ecx + shr $2,%ecx # convert bytes to words + je 16f # skip loop if nothing to copy +15: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 15b +16: movl %edi,%esp # final %edi is top of merged stack + jmp hypervisor_callback_pending + + +critical_fixup_table: +.byte 0x0,0x0,0x0 #testb $0x1,(%esi) +.byte 0x0,0x0,0x0,0x0,0x0,0x0 #jne ea +.byte 0x0,0x0 #pop %fs +.byte 0x04 #pop %es +.byte 0x08 #pop %ds +.byte 0x0c #pop %edi +.byte 0x10 #pop %esi +.byte 0x14 #pop %ebp +.byte 0x18 #pop %ebx +.byte 0x1c #pop %ebx +.byte 0x20 #pop %edx +.byte 0x24 #pop %ecx +.byte 0x28 #pop %eax +.byte 0x2c,0x2c,0x2c #add $0xc,%esp +.byte 0x38 #iret + + +/* # Hypervisor uses this for application faults while it executes.*/ +ENTRY(failsafe_callback) + pushal + call xen_failsafe_handler +/*# call install_safe_pf_handler */ + movl 32(%esp),%ebx +1: movl %ebx,%ds + movl 36(%esp),%ebx +2: movl %ebx,%es + movl 40(%esp),%ebx +3: movl %ebx,%fs + movl 44(%esp),%ebx +4: movl %ebx,%gs +/*# call install_normal_pf_handler */ + popal + addl $16,%esp + iret + + diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/genassym.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/genassym.c new file mode 100644 index 0000000000..1e9df732c7 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/genassym.c @@ -0,0 +1,234 @@ +/*- + * Copyright (c) 1982, 1990 The Regents of the University of California. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.146 2003/11/12 18:14:34 jhb Exp $"); + +#include "opt_apic.h" +#include "opt_compat.h" +#include "opt_kstack_pages.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/assym.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/errno.h> +#include <sys/mount.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/resourcevar.h> +#include <sys/ucontext.h> +#include <sys/user.h> +#include <machine/bootinfo.h> +#include <machine/tss.h> +#include <sys/vmmeter.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#include <sys/proc.h> +#include <net/if.h> +#include <netinet/in.h> +#include <nfs/nfsproto.h> +#include <nfs/rpcv2.h> +#include <nfsclient/nfs.h> +#include <nfsclient/nfsdiskless.h> +#ifdef DEV_APIC +#include <machine/apicreg.h> +#endif +#include <machine/cpu.h> +#include <machine/sigframe.h> +#include <machine/proc.h> + +ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); +ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); +ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); +ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); +ASSYM(P_UAREA, offsetof(struct proc, p_uarea)); + +ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); +ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); +ASSYM(TD_PROC, offsetof(struct thread, td_proc)); +ASSYM(TD_MD, offsetof(struct thread, td_md)); + +ASSYM(P_MD, offsetof(struct proc, p_md)); +ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); + +ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); +ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); + +ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); +ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); +ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); +/* ASSYM(UPAGES, UPAGES);*/ +ASSYM(UAREA_PAGES, UAREA_PAGES); +ASSYM(KSTACK_PAGES, KSTACK_PAGES); 
+ASSYM(PAGE_SIZE, PAGE_SIZE); +ASSYM(NPTEPG, NPTEPG); +ASSYM(NPDEPG, NPDEPG); +ASSYM(NPDEPTD, NPDEPTD); +ASSYM(NPGPTD, NPGPTD); +ASSYM(PDESIZE, sizeof(pd_entry_t)); +ASSYM(PTESIZE, sizeof(pt_entry_t)); +ASSYM(PDESHIFT, PDESHIFT); +ASSYM(PTESHIFT, PTESHIFT); +ASSYM(PAGE_SHIFT, PAGE_SHIFT); +ASSYM(PAGE_MASK, PAGE_MASK); +ASSYM(PDRSHIFT, PDRSHIFT); +ASSYM(PDRMASK, PDRMASK); +ASSYM(USRSTACK, USRSTACK); +ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); +ASSYM(KERNBASE, KERNBASE); +ASSYM(KERNLOAD, KERNLOAD); +ASSYM(MCLBYTES, MCLBYTES); +ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); +ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); +ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); +ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); +ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); +ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); +ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); +ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); + +ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); +ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); +ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); +ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); +ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); +ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); +ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); +ASSYM(PCB_PSL, offsetof(struct pcb, pcb_psl)); +ASSYM(PCB_DBREGS, PCB_DBREGS); +ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); + +ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); +ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); +ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); +ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); +ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); +ASSYM(PCB_SWITCHOUT, offsetof(struct pcb, pcb_switchout)); + +ASSYM(PCB_SIZE, sizeof(struct pcb)); + +ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); +ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); +ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); +ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); +ASSYM(TF_EIP, 
offsetof(struct trapframe, tf_eip)); +ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); +#ifdef COMPAT_43 +ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); +#endif +ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); +#ifdef COMPAT_FREEBSD4 +ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc)); +#endif +#ifdef COMPAT_43 +ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); +ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); +ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); +ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); +#endif +#ifdef COMPAT_FREEBSD4 +ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags)); +ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs)); +#endif +ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); +ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); +ASSYM(ENOENT, ENOENT); +ASSYM(EFAULT, EFAULT); +ASSYM(ENAMETOOLONG, ENAMETOOLONG); +ASSYM(MAXCOMLEN, MAXCOMLEN); +ASSYM(MAXPATHLEN, MAXPATHLEN); +ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); +ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); +ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); +ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); +ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); +ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); +ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); +ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); +ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); +ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); +ASSYM(PC_SIZEOF, sizeof(struct pcpu)); +ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); +ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); +ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); +ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); +ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); +ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss)); 
+ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd)); +ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt)); +ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt)); +ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); +ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); +ASSYM(PC_TRAP_NESTING, offsetof(struct pcpu, pc_trap_nesting)); + +ASSYM(PC_CR3, offsetof(struct pcpu, pc_pdir)); + +#ifdef DEV_APIC +ASSYM(LA_VER, offsetof(struct LAPIC, version)); +ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); +ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); +ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); +ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); +ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); +ASSYM(LA_ISR, offsetof(struct LAPIC, isr0)); +#endif + +ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); +ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); +ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); + +ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); +ASSYM(GPROC0_SEL, GPROC0_SEL); + +ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); +ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); + +#ifdef PC98 +#include <machine/bus.h> + +ASSYM(BUS_SPACE_HANDLE_BASE, offsetof(struct bus_space_handle, bsh_base)); +ASSYM(BUS_SPACE_HANDLE_IAT, offsetof(struct bus_space_handle, bsh_iat)); +#endif + +ASSYM(HYPERVISOR_STACK_SWITCH, __HYPERVISOR_stack_switch); diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/hypervisor.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/hypervisor.c new file mode 100644 index 0000000000..df9568c7d1 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/hypervisor.c @@ -0,0 +1,107 @@ +/****************************************************************************** + * hypervisor.c + * + * Communication to/from hypervisor. 
+ * + * Copyright (c) 2002-2003, K A Fraser + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIEAS OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/xenvar.h> +#include <machine/multicall.h> + +/* XXX need to verify what the caller save registers are on x86 KMM */ +#define CALLER_SAVE __asm__("pushal; ") +#define CALLER_RESTORE __asm__("popal;") + + +/* ni == non-inline - these are only intended for use from assembler + * no reason to have them in a header - + * + */ +void ni_queue_multicall0(unsigned long op); +void ni_queue_multicall1(unsigned long op, unsigned long arg1); +void ni_queue_multicall2(unsigned long op, unsigned long arg1, + unsigned long arg2); +void ni_queue_multicall3(unsigned long op, unsigned long arg1, + unsigned long arg2, unsigned long arg3); +void ni_queue_multicall4(unsigned long op, unsigned long arg1, + unsigned long arg2, unsigned long arg4, + unsigned long arg5); + +void ni_execute_multicall_list(void); + +multicall_entry_t multicall_list[MAX_MULTICALL_ENTS]; +int nr_multicall_ents = 0; + + +void +ni_queue_multicall0(unsigned long op) +{ + CALLER_SAVE; + queue_multicall0(op); + CALLER_RESTORE; +} + +void +ni_queue_multicall1(unsigned long op, unsigned long arg1) +{ + CALLER_SAVE; + queue_multicall1(op, arg1); + CALLER_RESTORE; +} + +void +ni_queue_multicall2(unsigned long op, unsigned long arg1, + unsigned long arg2) +{ + CALLER_SAVE; + queue_multicall2(op, arg1, arg2); + CALLER_RESTORE; +} + +void +ni_queue_multicall3(unsigned long op, unsigned long arg1, + unsigned long arg2, unsigned long arg3) +{ + CALLER_SAVE; + queue_multicall3(op, arg1, arg2, arg3); + CALLER_RESTORE; +} + +void +ni_queue_multicall4(unsigned long op, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4) +{ + CALLER_SAVE; + queue_multicall4(op, arg1, arg2, arg3, arg4); + CALLER_RESTORE; +} + +void +ni_execute_multicall_list(void) +{ + CALLER_SAVE; + execute_multicall_list(); + CALLER_RESTORE; +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/i686_mem.c 
b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/i686_mem.c new file mode 100644 index 0000000000..fe21232f7a --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/i686_mem.c @@ -0,0 +1,626 @@ +/*- + * Copyright (c) 1999 Michael Smith <msmith@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/i686_mem.c,v 1.23 2003/10/21 18:28:34 silby Exp $"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <machine/md_var.h> +#include <machine/specialreg.h> + +/* + * i686 memory range operations + * + * This code will probably be impenetrable without reference to the + * Intel Pentium Pro documentation. + */ + +static char *mem_owner_bios = "BIOS"; + +#define MR686_FIXMTRR (1<<0) + +#define mrwithin(mr, a) \ + (((a) >= (mr)->mr_base) && ((a) < ((mr)->mr_base + (mr)->mr_len))) +#define mroverlap(mra, mrb) \ + (mrwithin(mra, mrb->mr_base) || mrwithin(mrb, mra->mr_base)) + +#define mrvalid(base, len) \ + ((!(base & ((1 << 12) - 1))) && /* base is multiple of 4k */ \ + ((len) >= (1 << 12)) && /* length is >= 4k */ \ + powerof2((len)) && /* ... and power of two */ \ + !((base) & ((len) - 1))) /* range is not discontiuous */ + +#define mrcopyflags(curr, new) (((curr) & ~MDF_ATTRMASK) | ((new) & MDF_ATTRMASK)) + +static int mtrrs_disabled; +TUNABLE_INT("machdep.disable_mtrrs", &mtrrs_disabled); +SYSCTL_INT(_machdep, OID_AUTO, disable_mtrrs, CTLFLAG_RDTUN, + &mtrrs_disabled, 0, "Disable i686 MTRRs."); + +static void i686_mrinit(struct mem_range_softc *sc); +static int i686_mrset(struct mem_range_softc *sc, + struct mem_range_desc *mrd, + int *arg); +static void i686_mrAPinit(struct mem_range_softc *sc); + +static struct mem_range_ops i686_mrops = { + i686_mrinit, + i686_mrset, + i686_mrAPinit +}; + +/* XXX for AP startup hook */ +static u_int64_t mtrrcap, mtrrdef; + +static struct mem_range_desc *mem_range_match(struct mem_range_softc *sc, + struct mem_range_desc *mrd); +static void i686_mrfetch(struct mem_range_softc *sc); +static int i686_mtrrtype(int flags); +#if 0 +static int i686_mrt2mtrr(int flags, int oldval); +#endif +static int i686_mtrrconflict(int flag1, int 
flag2); +static void i686_mrstore(struct mem_range_softc *sc); +static void i686_mrstoreone(void *arg); +static struct mem_range_desc *i686_mtrrfixsearch(struct mem_range_softc *sc, + u_int64_t addr); +static int i686_mrsetlow(struct mem_range_softc *sc, + struct mem_range_desc *mrd, + int *arg); +static int i686_mrsetvariable(struct mem_range_softc *sc, + struct mem_range_desc *mrd, + int *arg); + +/* i686 MTRR type to memory range type conversion */ +static int i686_mtrrtomrt[] = { + MDF_UNCACHEABLE, + MDF_WRITECOMBINE, + MDF_UNKNOWN, + MDF_UNKNOWN, + MDF_WRITETHROUGH, + MDF_WRITEPROTECT, + MDF_WRITEBACK +}; + +#define MTRRTOMRTLEN (sizeof(i686_mtrrtomrt) / sizeof(i686_mtrrtomrt[0])) + +static int +i686_mtrr2mrt(int val) { + if (val < 0 || val >= MTRRTOMRTLEN) + return MDF_UNKNOWN; + return i686_mtrrtomrt[val]; +} + +/* + * i686 MTRR conflicts. Writeback and uncachable may overlap. + */ +static int +i686_mtrrconflict(int flag1, int flag2) { + flag1 &= MDF_ATTRMASK; + flag2 &= MDF_ATTRMASK; + if (flag1 == flag2 || + (flag1 == MDF_WRITEBACK && flag2 == MDF_UNCACHEABLE) || + (flag2 == MDF_WRITEBACK && flag1 == MDF_UNCACHEABLE)) + return 0; + return 1; +} + +/* + * Look for an exactly-matching range. + */ +static struct mem_range_desc * +mem_range_match(struct mem_range_softc *sc, struct mem_range_desc *mrd) +{ + struct mem_range_desc *cand; + int i; + + for (i = 0, cand = sc->mr_desc; i < sc->mr_ndesc; i++, cand++) + if ((cand->mr_base == mrd->mr_base) && + (cand->mr_len == mrd->mr_len)) + return(cand); + return(NULL); +} + +/* + * Fetch the current mtrr settings from the current CPU (assumed to all + * be in sync in the SMP case). Note that if we are here, we assume + * that MTRRs are enabled, and we may or may not have fixed MTRRs. 
+ */ +static void +i686_mrfetch(struct mem_range_softc *sc) +{ + struct mem_range_desc *mrd; + u_int64_t msrv; + int i, j, msr; + + mrd = sc->mr_desc; + + /* Get fixed-range MTRRs */ + if (sc->mr_cap & MR686_FIXMTRR) { + msr = MSR_MTRR64kBase; + for (i = 0; i < (MTRR_N64K / 8); i++, msr++) { + msrv = rdmsr(msr); + for (j = 0; j < 8; j++, mrd++) { + mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) | + i686_mtrr2mrt(msrv & 0xff) | + MDF_ACTIVE; + if (mrd->mr_owner[0] == 0) + strcpy(mrd->mr_owner, mem_owner_bios); + msrv = msrv >> 8; + } + } + msr = MSR_MTRR16kBase; + for (i = 0; i < (MTRR_N16K / 8); i++, msr++) { + msrv = rdmsr(msr); + for (j = 0; j < 8; j++, mrd++) { + mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) | + i686_mtrr2mrt(msrv & 0xff) | + MDF_ACTIVE; + if (mrd->mr_owner[0] == 0) + strcpy(mrd->mr_owner, mem_owner_bios); + msrv = msrv >> 8; + } + } + msr = MSR_MTRR4kBase; + for (i = 0; i < (MTRR_N4K / 8); i++, msr++) { + msrv = rdmsr(msr); + for (j = 0; j < 8; j++, mrd++) { + mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) | + i686_mtrr2mrt(msrv & 0xff) | + MDF_ACTIVE; + if (mrd->mr_owner[0] == 0) + strcpy(mrd->mr_owner, mem_owner_bios); + msrv = msrv >> 8; + } + } + } + + /* Get remainder which must be variable MTRRs */ + msr = MSR_MTRRVarBase; + for (; (mrd - sc->mr_desc) < sc->mr_ndesc; msr += 2, mrd++) { + msrv = rdmsr(msr); + mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) | + i686_mtrr2mrt(msrv & 0xff); + mrd->mr_base = msrv & 0x0000000ffffff000LL; + msrv = rdmsr(msr + 1); + mrd->mr_flags = (msrv & 0x800) ? + (mrd->mr_flags | MDF_ACTIVE) : + (mrd->mr_flags & ~MDF_ACTIVE); + /* Compute the range from the mask. Ick. 
*/ + mrd->mr_len = (~(msrv & 0x0000000ffffff000LL) & 0x0000000fffffffffLL) + 1; + if (!mrvalid(mrd->mr_base, mrd->mr_len)) + mrd->mr_flags |= MDF_BOGUS; + /* If unclaimed and active, must be the BIOS */ + if ((mrd->mr_flags & MDF_ACTIVE) && (mrd->mr_owner[0] == 0)) + strcpy(mrd->mr_owner, mem_owner_bios); + } +} + +/* + * Return the MTRR memory type matching a region's flags + */ +static int +i686_mtrrtype(int flags) +{ + int i; + + flags &= MDF_ATTRMASK; + + for (i = 0; i < MTRRTOMRTLEN; i++) { + if (i686_mtrrtomrt[i] == MDF_UNKNOWN) + continue; + if (flags == i686_mtrrtomrt[i]) + return(i); + } + return(-1); +} +#if 0 +static int +i686_mrt2mtrr(int flags, int oldval) +{ + int val; + + if ((val = i686_mtrrtype(flags)) == -1) + return oldval & 0xff; + return val & 0xff; +} +#endif +/* + * Update running CPU(s) MTRRs to match the ranges in the descriptor + * list. + * + * XXX Must be called with interrupts enabled. + */ +static void +i686_mrstore(struct mem_range_softc *sc) +{ +#ifdef SMP + /* + * We should use ipi_all_but_self() to call other CPUs into a + * locking gate, then call a target function to do this work. + * The "proper" solution involves a generalised locking gate + * implementation, not ready yet. + */ + smp_rendezvous(NULL, i686_mrstoreone, NULL, (void *)sc); +#else + disable_intr(); /* disable interrupts */ + i686_mrstoreone((void *)sc); + enable_intr(); +#endif +} + +/* + * Update the current CPU's MTRRs with those represented in the + * descriptor list. Note that we do this wholesale rather than + * just stuffing one entry; this is simpler (but slower, of course). 
+ */ +static void +i686_mrstoreone(void *arg) +{ +#if 0 + struct mem_range_softc *sc = (struct mem_range_softc *)arg; + struct mem_range_desc *mrd; + u_int64_t omsrv, msrv; + int i, j, msr; + u_int cr4save; + + mrd = sc->mr_desc; + + cr4save = rcr4(); /* save cr4 */ + if (cr4save & CR4_PGE) + load_cr4(cr4save & ~CR4_PGE); + load_cr0((rcr0() & ~CR0_NW) | CR0_CD); /* disable caches (CD = 1, NW = 0) */ + wbinvd(); /* flush caches, TLBs */ + wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) & ~0x800); /* disable MTRRs (E = 0) */ + + /* Set fixed-range MTRRs */ + if (sc->mr_cap & MR686_FIXMTRR) { + msr = MSR_MTRR64kBase; + for (i = 0; i < (MTRR_N64K / 8); i++, msr++) { + msrv = 0; + omsrv = rdmsr(msr); + for (j = 7; j >= 0; j--) { + msrv = msrv << 8; + msrv |= i686_mrt2mtrr((mrd + j)->mr_flags, omsrv >> (j*8)); + } + wrmsr(msr, msrv); + mrd += 8; + } + msr = MSR_MTRR16kBase; + for (i = 0; i < (MTRR_N16K / 8); i++, msr++) { + msrv = 0; + omsrv = rdmsr(msr); + for (j = 7; j >= 0; j--) { + msrv = msrv << 8; + msrv |= i686_mrt2mtrr((mrd + j)->mr_flags, omsrv >> (j*8)); + } + wrmsr(msr, msrv); + mrd += 8; + } + msr = MSR_MTRR4kBase; + for (i = 0; i < (MTRR_N4K / 8); i++, msr++) { + msrv = 0; + omsrv = rdmsr(msr); + for (j = 7; j >= 0; j--) { + msrv = msrv << 8; + msrv |= i686_mrt2mtrr((mrd + j)->mr_flags, omsrv >> (j*8)); + } + wrmsr(msr, msrv); + mrd += 8; + } + } + + /* Set remainder which must be variable MTRRs */ + msr = MSR_MTRRVarBase; + for (; (mrd - sc->mr_desc) < sc->mr_ndesc; msr += 2, mrd++) { + /* base/type register */ + omsrv = rdmsr(msr); + if (mrd->mr_flags & MDF_ACTIVE) { + msrv = mrd->mr_base & 0x0000000ffffff000LL; + msrv |= i686_mrt2mtrr(mrd->mr_flags, omsrv); + } else { + msrv = 0; + } + wrmsr(msr, msrv); + + /* mask/active register */ + if (mrd->mr_flags & MDF_ACTIVE) { + msrv = 0x800 | (~(mrd->mr_len - 1) & 0x0000000ffffff000LL); + } else { + msrv = 0; + } + wrmsr(msr + 1, msrv); + } + wbinvd(); /* flush caches, TLBs */ + wrmsr(MSR_MTRRdefType, 
rdmsr(MSR_MTRRdefType) | 0x800); /* restore MTRR state */ + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* enable caches CD = 0 and NW = 0 */ + load_cr4(cr4save); /* restore cr4 */ +#endif +} + +/* + * Hunt for the fixed MTRR referencing (addr) + */ +static struct mem_range_desc * +i686_mtrrfixsearch(struct mem_range_softc *sc, u_int64_t addr) +{ + struct mem_range_desc *mrd; + int i; + + for (i = 0, mrd = sc->mr_desc; i < (MTRR_N64K + MTRR_N16K + MTRR_N4K); i++, mrd++) + if ((addr >= mrd->mr_base) && (addr < (mrd->mr_base + mrd->mr_len))) + return(mrd); + return(NULL); +} + +/* + * Try to satisfy the given range request by manipulating the fixed MTRRs that + * cover low memory. + * + * Note that we try to be generous here; we'll bloat the range out to the + * next higher/lower boundary to avoid the consumer having to know too much + * about the mechanisms here. + * + * XXX note that this will have to be updated when we start supporting "busy" ranges. + */ +static int +i686_mrsetlow(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg) +{ + struct mem_range_desc *first_md, *last_md, *curr_md; + + /* range check */ + if (((first_md = i686_mtrrfixsearch(sc, mrd->mr_base)) == NULL) || + ((last_md = i686_mtrrfixsearch(sc, mrd->mr_base + mrd->mr_len - 1)) == NULL)) + return(EINVAL); + + /* check we aren't doing something risky */ + if (!(mrd->mr_flags & MDF_FORCE)) + for (curr_md = first_md; curr_md <= last_md; curr_md++) { + if ((curr_md->mr_flags & MDF_ATTRMASK) == MDF_UNKNOWN) + return (EACCES); + } + + /* set flags, clear set-by-firmware flag */ + for (curr_md = first_md; curr_md <= last_md; curr_md++) { + curr_md->mr_flags = mrcopyflags(curr_md->mr_flags & ~MDF_FIRMWARE, mrd->mr_flags); + bcopy(mrd->mr_owner, curr_md->mr_owner, sizeof(mrd->mr_owner)); + } + + return(0); +} + + +/* + * Modify/add a variable MTRR to satisfy the request. + * + * XXX needs to be updated to properly support "busy" ranges. 
+ */ +static int +i686_mrsetvariable(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg) +{ + struct mem_range_desc *curr_md, *free_md; + int i; + + /* + * Scan the currently active variable descriptors, look for + * one we exactly match (straight takeover) and for possible + * accidental overlaps. + * Keep track of the first empty variable descriptor in case we + * can't perform a takeover. + */ + i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0; + curr_md = sc->mr_desc + i; + free_md = NULL; + for (; i < sc->mr_ndesc; i++, curr_md++) { + if (curr_md->mr_flags & MDF_ACTIVE) { + /* exact match? */ + if ((curr_md->mr_base == mrd->mr_base) && + (curr_md->mr_len == mrd->mr_len)) { + /* whoops, owned by someone */ + if (curr_md->mr_flags & MDF_BUSY) + return(EBUSY); + /* check we aren't doing something risky */ + if (!(mrd->mr_flags & MDF_FORCE) && + ((curr_md->mr_flags & MDF_ATTRMASK) == MDF_UNKNOWN)) + return (EACCES); + /* Ok, just hijack this entry */ + free_md = curr_md; + break; + } + /* non-exact overlap ? */ + if (mroverlap(curr_md, mrd)) { + /* between conflicting region types? */ + if (i686_mtrrconflict(curr_md->mr_flags, mrd->mr_flags)) + return(EINVAL); + } + } else if (free_md == NULL) { + free_md = curr_md; + } + } + /* got somewhere to put it? */ + if (free_md == NULL) + return(ENOSPC); + + /* Set up new descriptor */ + free_md->mr_base = mrd->mr_base; + free_md->mr_len = mrd->mr_len; + free_md->mr_flags = mrcopyflags(MDF_ACTIVE, mrd->mr_flags); + bcopy(mrd->mr_owner, free_md->mr_owner, sizeof(mrd->mr_owner)); + return(0); +} + +/* + * Handle requests to set memory range attributes by manipulating MTRRs. 
+ * + */ +static int +i686_mrset(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg) +{ + struct mem_range_desc *targ; + int error = 0; + + switch(*arg) { + case MEMRANGE_SET_UPDATE: + /* make sure that what's being asked for is even possible at all */ + if (!mrvalid(mrd->mr_base, mrd->mr_len) || + i686_mtrrtype(mrd->mr_flags) == -1) + return(EINVAL); + +#define FIXTOP ((MTRR_N64K * 0x10000) + (MTRR_N16K * 0x4000) + (MTRR_N4K * 0x1000)) + + /* are the "low memory" conditions applicable? */ + if ((sc->mr_cap & MR686_FIXMTRR) && + ((mrd->mr_base + mrd->mr_len) <= FIXTOP)) { + if ((error = i686_mrsetlow(sc, mrd, arg)) != 0) + return(error); + } else { + /* it's time to play with variable MTRRs */ + if ((error = i686_mrsetvariable(sc, mrd, arg)) != 0) + return(error); + } + break; + + case MEMRANGE_SET_REMOVE: + if ((targ = mem_range_match(sc, mrd)) == NULL) + return(ENOENT); + if (targ->mr_flags & MDF_FIXACTIVE) + return(EPERM); + if (targ->mr_flags & MDF_BUSY) + return(EBUSY); + targ->mr_flags &= ~MDF_ACTIVE; + targ->mr_owner[0] = 0; + break; + + default: + return(EOPNOTSUPP); + } + + /* update the hardware */ + i686_mrstore(sc); + i686_mrfetch(sc); /* refetch to see where we're at */ + return(0); +} + +/* + * Work out how many ranges we support, initialise storage for them, + * fetch the initial settings. 
+ */ +static void +i686_mrinit(struct mem_range_softc *sc) +{ + struct mem_range_desc *mrd; + int nmdesc = 0; + int i; + + /* XXX */ + return; + + mtrrcap = rdmsr(MSR_MTRRcap); + mtrrdef = rdmsr(MSR_MTRRdefType); + + /* For now, bail out if MTRRs are not enabled */ + if (!(mtrrdef & 0x800)) { + if (bootverbose) + printf("CPU supports MTRRs but not enabled\n"); + return; + } + nmdesc = mtrrcap & 0xff; + printf("Pentium Pro MTRR support enabled\n"); + + /* If fixed MTRRs supported and enabled */ + if ((mtrrcap & 0x100) && (mtrrdef & 0x400)) { + sc->mr_cap = MR686_FIXMTRR; + nmdesc += MTRR_N64K + MTRR_N16K + MTRR_N4K; + } + + sc->mr_desc = + (struct mem_range_desc *)malloc(nmdesc * sizeof(struct mem_range_desc), + M_MEMDESC, M_WAITOK | M_ZERO); + sc->mr_ndesc = nmdesc; + + mrd = sc->mr_desc; + + /* Populate the fixed MTRR entries' base/length */ + if (sc->mr_cap & MR686_FIXMTRR) { + for (i = 0; i < MTRR_N64K; i++, mrd++) { + mrd->mr_base = i * 0x10000; + mrd->mr_len = 0x10000; + mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN | MDF_FIXACTIVE; + } + for (i = 0; i < MTRR_N16K; i++, mrd++) { + mrd->mr_base = i * 0x4000 + 0x80000; + mrd->mr_len = 0x4000; + mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN | MDF_FIXACTIVE; + } + for (i = 0; i < MTRR_N4K; i++, mrd++) { + mrd->mr_base = i * 0x1000 + 0xc0000; + mrd->mr_len = 0x1000; + mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN | MDF_FIXACTIVE; + } + } + + /* + * Get current settings, anything set now is considered to have + * been set by the firmware. (XXX has something already played here?) + */ + i686_mrfetch(sc); + mrd = sc->mr_desc; + for (i = 0; i < sc->mr_ndesc; i++, mrd++) { + if (mrd->mr_flags & MDF_ACTIVE) + mrd->mr_flags |= MDF_FIRMWARE; + } +} + +/* + * Initialise MTRRs on an AP after the BSP has run the init code. 
+ */ +static void +i686_mrAPinit(struct mem_range_softc *sc) +{ + i686_mrstoreone((void *)sc); /* set MTRRs to match BSP */ + wrmsr(MSR_MTRRdefType, mtrrdef); /* set MTRR behaviour to match BSP */ +} + +static void +i686_mem_drvinit(void *unused) +{ + /* Try for i686 MTRRs */ + if (!mtrrs_disabled && (cpu_feature & CPUID_MTRR) && + ((cpu_id & 0xf00) == 0x600 || (cpu_id & 0xf00) == 0xf00) && + ((strcmp(cpu_vendor, "GenuineIntel") == 0) || + (strcmp(cpu_vendor, "AuthenticAMD") == 0))) { + mem_range_softc.mr_op = &i686_mrops; + } +} + +SYSINIT(i686memdev,SI_SUB_DRIVERS,SI_ORDER_FIRST,i686_mem_drvinit,NULL) diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/initcpu.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/initcpu.c new file mode 100644 index 0000000000..0852fb98aa --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/initcpu.c @@ -0,0 +1,889 @@ +/*- + * Copyright (c) KATO Takenori, 1997, 1998. + * + * All rights reserved. Unpublished rights reserved under the copyright + * laws of Japan. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer as + * the first lines of this file unmodified. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/initcpu.c,v 1.49 2003/11/10 15:48:30 jhb Exp $"); + +#include "opt_cpu.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> + +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> + +#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif +#if defined(CPU_DISABLE_SSE) +#undef CPU_ENABLE_SSE +#endif + +void initializecpu(void); +#if defined(I586_CPU) && defined(CPU_WT_ALLOC) +void enable_K5_wt_alloc(void); +void enable_K6_wt_alloc(void); +void enable_K6_2_wt_alloc(void); +#endif + +#ifdef I486_CPU +static void init_5x86(void); +static void init_bluelightning(void); +static void init_486dlc(void); +static void init_cy486dx(void); +#ifdef CPU_I486_ON_386 +static void init_i486_on_386(void); +#endif +static void init_6x86(void); +#endif /* I486_CPU */ + +#ifdef I686_CPU +static void init_6x86MX(void); +static void init_ppro(void); +static void init_mendocino(void); +#endif + +static int hw_instruction_sse; +SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, + &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); + +/* Must *NOT* be BSS or locore will bzero these after setting them */ +int cpu = 0; /* Are we 386, 386sx, 486, etc? 
*/ +u_int cpu_feature = 0; /* Feature flags */ +u_int cpu_high = 0; /* Highest arg to CPUID */ +u_int cpu_id = 0; /* Stepping ID */ +u_int cpu_procinfo = 0; /* HyperThreading Info / Brand Index / CLFUSH */ +char cpu_vendor[20] = ""; /* CPU Origin code */ + +#ifdef CPU_ENABLE_SSE +u_int cpu_fxsr; /* SSE enabled */ +#endif + +#ifdef I486_CPU +/* + * IBM Blue Lightning + */ +static void +init_bluelightning(void) +{ +#if 0 + u_long eflags; + +#if defined(PC98) && !defined(CPU_UPGRADE_HW_CACHE) + need_post_dma_flush = 1; +#endif + + eflags = read_eflags(); + disable_intr(); + + load_cr0(rcr0() | CR0_CD | CR0_NW); + invd(); + +#ifdef CPU_BLUELIGHTNING_FPU_OP_CACHE + wrmsr(0x1000, 0x9c92LL); /* FP operand can be cacheable on Cyrix FPU */ +#else + wrmsr(0x1000, 0x1c92LL); /* Intel FPU */ +#endif + /* Enables 13MB and 0-640KB cache. */ + wrmsr(0x1001, (0xd0LL << 32) | 0x3ff); +#ifdef CPU_BLUELIGHTNING_3X + wrmsr(0x1002, 0x04000000LL); /* Enables triple-clock mode. */ +#else + wrmsr(0x1002, 0x03000000LL); /* Enables double-clock mode. */ +#endif + + /* Enable caching in CR0. */ + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ + invd(); + write_eflags(eflags); +#endif +} + +/* + * Cyrix 486SLC/DLC/SR/DR series + */ +static void +init_486dlc(void) +{ + u_long eflags; + u_char ccr0; + + eflags = read_eflags(); + disable_intr(); + invd(); + + ccr0 = read_cyrix_reg(CCR0); +#ifndef CYRIX_CACHE_WORKS + ccr0 |= CCR0_NC1 | CCR0_BARB; + write_cyrix_reg(CCR0, ccr0); + invd(); +#else + ccr0 &= ~CCR0_NC0; +#ifndef CYRIX_CACHE_REALLY_WORKS + ccr0 |= CCR0_NC1 | CCR0_BARB; +#else + ccr0 |= CCR0_NC1; +#endif +#ifdef CPU_DIRECT_MAPPED_CACHE + ccr0 |= CCR0_CO; /* Direct mapped mode. */ +#endif + write_cyrix_reg(CCR0, ccr0); + + /* Clear non-cacheable region. 
*/ + write_cyrix_reg(NCR1+2, NCR_SIZE_0K); + write_cyrix_reg(NCR2+2, NCR_SIZE_0K); + write_cyrix_reg(NCR3+2, NCR_SIZE_0K); + write_cyrix_reg(NCR4+2, NCR_SIZE_0K); + + write_cyrix_reg(0, 0); /* dummy write */ + + /* Enable caching in CR0. */ + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ + invd(); +#endif /* !CYRIX_CACHE_WORKS */ + write_eflags(eflags); +} + + +/* + * Cyrix 486S/DX series + */ +static void +init_cy486dx(void) +{ + u_long eflags; + u_char ccr2; + + eflags = read_eflags(); + disable_intr(); + invd(); + + ccr2 = read_cyrix_reg(CCR2); +#ifdef CPU_SUSP_HLT + ccr2 |= CCR2_SUSP_HLT; +#endif + +#ifdef PC98 + /* Enables WB cache interface pin and Lock NW bit in CR0. */ + ccr2 |= CCR2_WB | CCR2_LOCK_NW; + /* Unlock NW bit in CR0. */ + write_cyrix_reg(CCR2, ccr2 & ~CCR2_LOCK_NW); + load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0, NW = 1 */ +#endif + + write_cyrix_reg(CCR2, ccr2); + write_eflags(eflags); +} + + +/* + * Cyrix 5x86 + */ +static void +init_5x86(void) +{ + u_long eflags; + u_char ccr2, ccr3, ccr4, pcr0; + + eflags = read_eflags(); + disable_intr(); + + load_cr0(rcr0() | CR0_CD | CR0_NW); + wbinvd(); + + (void)read_cyrix_reg(CCR3); /* dummy */ + + /* Initialize CCR2. */ + ccr2 = read_cyrix_reg(CCR2); + ccr2 |= CCR2_WB; +#ifdef CPU_SUSP_HLT + ccr2 |= CCR2_SUSP_HLT; +#else + ccr2 &= ~CCR2_SUSP_HLT; +#endif + ccr2 |= CCR2_WT1; + write_cyrix_reg(CCR2, ccr2); + + /* Initialize CCR4. */ + ccr3 = read_cyrix_reg(CCR3); + write_cyrix_reg(CCR3, CCR3_MAPEN0); + + ccr4 = read_cyrix_reg(CCR4); + ccr4 |= CCR4_DTE; + ccr4 |= CCR4_MEM; +#ifdef CPU_FASTER_5X86_FPU + ccr4 |= CCR4_FASTFPE; +#else + ccr4 &= ~CCR4_FASTFPE; +#endif + ccr4 &= ~CCR4_IOMASK; + /******************************************************************** + * WARNING: The "BIOS Writers Guide" mentions that I/O recovery time + * should be 0 for errata fix. 
+ ********************************************************************/ +#ifdef CPU_IORT + ccr4 |= CPU_IORT & CCR4_IOMASK; +#endif + write_cyrix_reg(CCR4, ccr4); + + /* Initialize PCR0. */ + /**************************************************************** + * WARNING: RSTK_EN and LOOP_EN could make your system unstable. + * BTB_EN might make your system unstable. + ****************************************************************/ + pcr0 = read_cyrix_reg(PCR0); +#ifdef CPU_RSTK_EN + pcr0 |= PCR0_RSTK; +#else + pcr0 &= ~PCR0_RSTK; +#endif +#ifdef CPU_BTB_EN + pcr0 |= PCR0_BTB; +#else + pcr0 &= ~PCR0_BTB; +#endif +#ifdef CPU_LOOP_EN + pcr0 |= PCR0_LOOP; +#else + pcr0 &= ~PCR0_LOOP; +#endif + + /**************************************************************** + * WARNING: if you use a memory mapped I/O device, don't use + * DISABLE_5X86_LSSER option, which may reorder memory mapped + * I/O access. + * IF YOUR MOTHERBOARD HAS PCI BUS, DON'T DISABLE LSSER. + ****************************************************************/ +#ifdef CPU_DISABLE_5X86_LSSER + pcr0 &= ~PCR0_LSSER; +#else + pcr0 |= PCR0_LSSER; +#endif + write_cyrix_reg(PCR0, pcr0); + + /* Restore CCR3. */ + write_cyrix_reg(CCR3, ccr3); + + (void)read_cyrix_reg(0x80); /* dummy */ + + /* Unlock NW bit in CR0. */ + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); + load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0, NW = 1 */ + /* Lock NW bit in CR0. */ + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); + + write_eflags(eflags); +} + +#ifdef CPU_I486_ON_386 +/* + * There are i486 based upgrade products for i386 machines. + * In this case, BIOS doesn't enables CPU cache. 
+ */ +static void +init_i486_on_386(void) +{ + u_long eflags; + +#if defined(PC98) && !defined(CPU_UPGRADE_HW_CACHE) + need_post_dma_flush = 1; +#endif + + eflags = read_eflags(); + disable_intr(); + + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0, NW = 0 */ + + write_eflags(eflags); +} +#endif + +/* + * Cyrix 6x86 + * + * XXX - What should I do here? Please let me know. + */ +static void +init_6x86(void) +{ + u_long eflags; + u_char ccr3, ccr4; + + eflags = read_eflags(); + disable_intr(); + + load_cr0(rcr0() | CR0_CD | CR0_NW); + wbinvd(); + + /* Initialize CCR0. */ + write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); + + /* Initialize CCR1. */ +#ifdef CPU_CYRIX_NO_LOCK + write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); +#else + write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); +#endif + + /* Initialize CCR2. */ +#ifdef CPU_SUSP_HLT + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); +#else + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); +#endif + + ccr3 = read_cyrix_reg(CCR3); + write_cyrix_reg(CCR3, CCR3_MAPEN0); + + /* Initialize CCR4. */ + ccr4 = read_cyrix_reg(CCR4); + ccr4 |= CCR4_DTE; + ccr4 &= ~CCR4_IOMASK; +#ifdef CPU_IORT + write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); +#else + write_cyrix_reg(CCR4, ccr4 | 7); +#endif + + /* Initialize CCR5. */ +#ifdef CPU_WT_ALLOC + write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); +#endif + + /* Restore CCR3. */ + write_cyrix_reg(CCR3, ccr3); + + /* Unlock NW bit in CR0. */ + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); + + /* + * Earlier revision of the 6x86 CPU could crash the system if + * L1 cache is in write-back mode. + */ + if ((cyrix_did & 0xff00) > 0x1600) + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ + else { + /* Revision 2.6 and lower. 
*/ +#ifdef CYRIX_CACHE_REALLY_WORKS + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ +#else + load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0 and NW = 1 */ +#endif + } + + /* Lock NW bit in CR0. */ + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); + + write_eflags(eflags); +} +#endif /* I486_CPU */ + +#ifdef I686_CPU +/* + * Cyrix 6x86MX (code-named M2) + * + * XXX - What should I do here? Please let me know. + */ +static void +init_6x86MX(void) +{ +#if 0 + u_long eflags; + u_char ccr3, ccr4; + + eflags = read_eflags(); + disable_intr(); + + load_cr0(rcr0() | CR0_CD | CR0_NW); + wbinvd(); + + /* Initialize CCR0. */ + write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); + + /* Initialize CCR1. */ +#ifdef CPU_CYRIX_NO_LOCK + write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); +#else + write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); +#endif + + /* Initialize CCR2. */ +#ifdef CPU_SUSP_HLT + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); +#else + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); +#endif + + ccr3 = read_cyrix_reg(CCR3); + write_cyrix_reg(CCR3, CCR3_MAPEN0); + + /* Initialize CCR4. */ + ccr4 = read_cyrix_reg(CCR4); + ccr4 &= ~CCR4_IOMASK; +#ifdef CPU_IORT + write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); +#else + write_cyrix_reg(CCR4, ccr4 | 7); +#endif + + /* Initialize CCR5. */ +#ifdef CPU_WT_ALLOC + write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); +#endif + + /* Restore CCR3. */ + write_cyrix_reg(CCR3, ccr3); + + /* Unlock NW bit in CR0. */ + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); + + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ + + /* Lock NW bit in CR0. */ + write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); + + write_eflags(eflags); +#endif +} + +static void +init_ppro(void) +{ + u_int64_t apicbase; + + /* + * Local APIC should be disabled if it is not going to be used. 
+ */ + apicbase = rdmsr(MSR_APICBASE); + apicbase &= ~APICBASE_ENABLED; + wrmsr(MSR_APICBASE, apicbase); +} + +/* + * Initialize BBL_CR_CTL3 (Control register 3: used to configure the + * L2 cache). + */ +static void +init_mendocino(void) +{ +#ifdef CPU_PPRO2CELERON + u_long eflags; + u_int64_t bbl_cr_ctl3; + + eflags = read_eflags(); + disable_intr(); + + load_cr0(rcr0() | CR0_CD | CR0_NW); + wbinvd(); + + bbl_cr_ctl3 = rdmsr(MSR_BBL_CR_CTL3); + + /* If the L2 cache is configured, do nothing. */ + if (!(bbl_cr_ctl3 & 1)) { + bbl_cr_ctl3 = 0x134052bLL; + + /* Set L2 Cache Latency (Default: 5). */ +#ifdef CPU_CELERON_L2_LATENCY +#if CPU_L2_LATENCY > 15 +#error invalid CPU_L2_LATENCY. +#endif + bbl_cr_ctl3 |= CPU_L2_LATENCY << 1; +#else + bbl_cr_ctl3 |= 5 << 1; +#endif + wrmsr(MSR_BBL_CR_CTL3, bbl_cr_ctl3); + } + + load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); + write_eflags(eflags); +#endif /* CPU_PPRO2CELERON */ +} + +#endif /* I686_CPU */ + +/* + * Initialize CR4 (Control register 4) to enable SSE instructions. 
+ */ +void +enable_sse(void) +{ +#ifdef XEN + return; +#endif +#if defined(CPU_ENABLE_SSE) + if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { + load_cr4(rcr4() | CR4_FXSR | CR4_XMM); + cpu_fxsr = hw_instruction_sse = 1; + } +#endif +} + +void +initializecpu(void) +{ + + switch (cpu) { +#ifdef I486_CPU + case CPU_BLUE: + init_bluelightning(); + break; + case CPU_486DLC: + init_486dlc(); + break; + case CPU_CY486DX: + init_cy486dx(); + break; + case CPU_M1SC: + init_5x86(); + break; +#ifdef CPU_I486_ON_386 + case CPU_486: + init_i486_on_386(); + break; +#endif + case CPU_M1: + init_6x86(); + break; +#endif /* I486_CPU */ +#ifdef I686_CPU + case CPU_M2: + init_6x86MX(); + break; + case CPU_686: + if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + switch (cpu_id & 0xff0) { + case 0x610: + init_ppro(); + break; + case 0x660: + init_mendocino(); + break; + } + } else if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { +#if defined(I686_CPU) && defined(CPU_ATHLON_SSE_HACK) + /* + * Sometimes the BIOS doesn't enable SSE instructions. + * According to AMD document 20734, the mobile + * Duron, the (mobile) Athlon 4 and the Athlon MP + * support SSE. These correspond to cpu_id 0x66X + * or 0x67X. + */ + if ((cpu_feature & CPUID_XMM) == 0 && + ((cpu_id & ~0xf) == 0x660 || + (cpu_id & ~0xf) == 0x670 || + (cpu_id & ~0xf) == 0x680)) { + u_int regs[4]; + wrmsr(0xC0010015, rdmsr(0xC0010015) & ~0x08000); + do_cpuid(1, regs); + cpu_feature = regs[3]; + } +#endif + } + break; +#endif + default: + break; + } + enable_sse(); + +#if defined(PC98) && !defined(CPU_UPGRADE_HW_CACHE) + /* + * OS should flush L1 cache by itself because no PC-98 supports + * non-Intel CPUs. Use wbinvd instruction before DMA transfer + * when need_pre_dma_flush = 1, use invd instruction after DMA + * transfer when need_post_dma_flush = 1. If your CPU upgrade + * product supports hardware cache control, you can add the + * CPU_UPGRADE_HW_CACHE option in your kernel configuration file. 
+ * This option eliminates unneeded cache flush instruction(s). + */ + if (strcmp(cpu_vendor, "CyrixInstead") == 0) { + switch (cpu) { +#ifdef I486_CPU + case CPU_486DLC: + need_post_dma_flush = 1; + break; + case CPU_M1SC: + need_pre_dma_flush = 1; + break; + case CPU_CY486DX: + need_pre_dma_flush = 1; +#ifdef CPU_I486_ON_386 + need_post_dma_flush = 1; +#endif + break; +#endif + default: + break; + } + } else if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + switch (cpu_id & 0xFF0) { + case 0x470: /* Enhanced Am486DX2 WB */ + case 0x490: /* Enhanced Am486DX4 WB */ + case 0x4F0: /* Am5x86 WB */ + need_pre_dma_flush = 1; + break; + } + } else if (strcmp(cpu_vendor, "IBM") == 0) { + need_post_dma_flush = 1; + } else { +#ifdef CPU_I486_ON_386 + need_pre_dma_flush = 1; +#endif + } +#endif /* PC98 && !CPU_UPGRADE_HW_CACHE */ +} + +#if defined(I586_CPU) && defined(CPU_WT_ALLOC) +/* + * Enable write allocate feature of AMD processors. + * Following two functions require the Maxmem variable being set. + */ +void +enable_K5_wt_alloc(void) +{ + u_int64_t msr; + register_t savecrit; + + /* + * Write allocate is supported only on models 1, 2, and 3, with + * a stepping of 4 or greater. + */ + if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) { + savecrit = intr_disable(); + msr = rdmsr(0x83); /* HWCR */ + wrmsr(0x83, msr & !(0x10)); + + /* + * We have to tell the chip where the top of memory is, + * since video cards could have frame bufferes there, + * memory-mapped I/O could be there, etc. + */ + if(Maxmem > 0) + msr = Maxmem / 16; + else + msr = 0; + msr |= AMD_WT_ALLOC_TME | AMD_WT_ALLOC_FRE; +#ifdef PC98 + if (!(inb(0x43b) & 4)) { + wrmsr(0x86, 0x0ff00f0); + msr |= AMD_WT_ALLOC_PRE; + } +#else + /* + * There is no way to know wheter 15-16M hole exists or not. + * Therefore, we disable write allocate for this range. 
+ */ + wrmsr(0x86, 0x0ff00f0); + msr |= AMD_WT_ALLOC_PRE; +#endif + wrmsr(0x85, msr); + + msr=rdmsr(0x83); + wrmsr(0x83, msr|0x10); /* enable write allocate */ + intr_restore(savecrit); + } +} + +void +enable_K6_wt_alloc(void) +{ + quad_t size; + u_int64_t whcr; + u_long eflags; + + eflags = read_eflags(); + disable_intr(); + wbinvd(); + +#ifdef CPU_DISABLE_CACHE + /* + * Certain K6-2 box becomes unstable when write allocation is + * enabled. + */ + /* + * The AMD-K6 processer provides the 64-bit Test Register 12(TR12), + * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported. + * All other bits in TR12 have no effect on the processer's operation. + * The I/O Trap Restart function (bit 9 of TR12) is always enabled + * on the AMD-K6. + */ + wrmsr(0x0000000e, (u_int64_t)0x0008); +#endif + /* Don't assume that memory size is aligned with 4M. */ + if (Maxmem > 0) + size = ((Maxmem >> 8) + 3) >> 2; + else + size = 0; + + /* Limit is 508M bytes. */ + if (size > 0x7f) + size = 0x7f; + whcr = (rdmsr(0xc0000082) & ~(0x7fLL << 1)) | (size << 1); + +#if defined(PC98) || defined(NO_MEMORY_HOLE) + if (whcr & (0x7fLL << 1)) { +#ifdef PC98 + /* + * If bit 2 of port 0x43b is 0, disable wrte allocate for the + * 15-16M range. + */ + if (!(inb(0x43b) & 4)) + whcr &= ~0x0001LL; + else +#endif + whcr |= 0x0001LL; + } +#else + /* + * There is no way to know wheter 15-16M hole exists or not. + * Therefore, we disable write allocate for this range. + */ + whcr &= ~0x0001LL; +#endif + wrmsr(0x0c0000082, whcr); + + write_eflags(eflags); +} + +void +enable_K6_2_wt_alloc(void) +{ + quad_t size; + u_int64_t whcr; + u_long eflags; + + eflags = read_eflags(); + disable_intr(); + wbinvd(); + +#ifdef CPU_DISABLE_CACHE + /* + * Certain K6-2 box becomes unstable when write allocation is + * enabled. + */ + /* + * The AMD-K6 processer provides the 64-bit Test Register 12(TR12), + * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported. 
+ * All other bits in TR12 have no effect on the processer's operation. + * The I/O Trap Restart function (bit 9 of TR12) is always enabled + * on the AMD-K6. + */ + wrmsr(0x0000000e, (u_int64_t)0x0008); +#endif + /* Don't assume that memory size is aligned with 4M. */ + if (Maxmem > 0) + size = ((Maxmem >> 8) + 3) >> 2; + else + size = 0; + + /* Limit is 4092M bytes. */ + if (size > 0x3fff) + size = 0x3ff; + whcr = (rdmsr(0xc0000082) & ~(0x3ffLL << 22)) | (size << 22); + +#if defined(PC98) || defined(NO_MEMORY_HOLE) + if (whcr & (0x3ffLL << 22)) { +#ifdef PC98 + /* + * If bit 2 of port 0x43b is 0, disable wrte allocate for the + * 15-16M range. + */ + if (!(inb(0x43b) & 4)) + whcr &= ~(1LL << 16); + else +#endif + whcr |= 1LL << 16; + } +#else + /* + * There is no way to know wheter 15-16M hole exists or not. + * Therefore, we disable write allocate for this range. + */ + whcr &= ~(1LL << 16); +#endif + wrmsr(0x0c0000082, whcr); + + write_eflags(eflags); +} +#endif /* I585_CPU && CPU_WT_ALLOC */ + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> +#if 0 +DB_SHOW_COMMAND(cyrixreg, cyrixreg) +{ + u_long eflags; + u_int cr0; + u_char ccr1, ccr2, ccr3; + u_char ccr0 = 0, ccr4 = 0, ccr5 = 0, pcr0 = 0; + + cr0 = rcr0(); + if (strcmp(cpu_vendor,"CyrixInstead") == 0) { + eflags = read_eflags(); + disable_intr(); + + + if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) { + ccr0 = read_cyrix_reg(CCR0); + } + ccr1 = read_cyrix_reg(CCR1); + ccr2 = read_cyrix_reg(CCR2); + ccr3 = read_cyrix_reg(CCR3); + if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { + write_cyrix_reg(CCR3, CCR3_MAPEN0); + ccr4 = read_cyrix_reg(CCR4); + if ((cpu == CPU_M1) || (cpu == CPU_M2)) + ccr5 = read_cyrix_reg(CCR5); + else + pcr0 = read_cyrix_reg(PCR0); + write_cyrix_reg(CCR3, ccr3); /* Restore CCR3. 
*/ + } + write_eflags(eflags); + + if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) + printf("CCR0=%x, ", (u_int)ccr0); + + printf("CCR1=%x, CCR2=%x, CCR3=%x", + (u_int)ccr1, (u_int)ccr2, (u_int)ccr3); + if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { + printf(", CCR4=%x, ", (u_int)ccr4); + if (cpu == CPU_M1SC) + printf("PCR0=%x\n", pcr0); + else + printf("CCR5=%x\n", ccr5); + } + } + printf("CR0=%x\n", cr0); +} +#endif +#endif /* DDB */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/intr_machdep.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/intr_machdep.c new file mode 100644 index 0000000000..6ab354a00c --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/intr_machdep.c @@ -0,0 +1,326 @@ +/*- + * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/i386/i386/intr_machdep.c,v 1.4 2003/11/17 06:10:14 peter Exp $ + */ + +/* + * Machine dependent interrupt code for i386. For the i386, we have to + * deal with different PICs. Thus, we use the passed in vector to lookup + * an interrupt source associated with that vector. The interrupt source + * describes which PIC the source belongs to and includes methods to handle + * that source. + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> +#include <sys/lock.h> +#include <sys/ktr.h> +#include <sys/kernel.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <machine/clock.h> +#include <machine/intr_machdep.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif + +#define MAX_STRAY_LOG 5 + +typedef void (*mask_fn)(uintptr_t vector); + +static int intrcnt_index; +static struct intsrc *interrupt_sources[NUM_IO_INTS]; +static struct mtx intr_table_lock; + +static void intr_init(void *__dummy); +static void intrcnt_setname(const char *name, int index); +static void intrcnt_updatename(struct intsrc *is); +static void intrcnt_register(struct intsrc *is); + +/* + * Register a new interrupt source with the global interrupt system. + * The global interrupts need to be disabled when this function is + * called. 
+ */ +int +intr_register_source(struct intsrc *isrc) +{ + int error, vector; + + vector = isrc->is_pic->pic_vector(isrc); + if (interrupt_sources[vector] != NULL) + return (EEXIST); + error = ithread_create(&isrc->is_ithread, (uintptr_t)isrc, 0, + (mask_fn)isrc->is_pic->pic_disable_source, + (mask_fn)isrc->is_pic->pic_enable_source, "irq%d:", vector); + if (error) + return (error); + mtx_lock_spin(&intr_table_lock); + if (interrupt_sources[vector] != NULL) { + mtx_unlock_spin(&intr_table_lock); + ithread_destroy(isrc->is_ithread); + return (EEXIST); + } + intrcnt_register(isrc); + interrupt_sources[vector] = isrc; + mtx_unlock_spin(&intr_table_lock); + return (0); +} + +struct intsrc * +intr_lookup_source(int vector) +{ + + return (interrupt_sources[vector]); +} + +int +intr_add_handler(const char *name, int vector, driver_intr_t handler, + void *arg, enum intr_type flags, void **cookiep) +{ + struct intsrc *isrc; + int error; + + isrc = intr_lookup_source(vector); + if (isrc == NULL) + return (EINVAL); + + error = ithread_add_handler(isrc->is_ithread, name, handler, arg, + ithread_priority(flags), flags, cookiep); + if (error == 0) { + intrcnt_updatename(isrc); + isrc->is_pic->pic_enable_intr(isrc); + isrc->is_pic->pic_enable_source(isrc); + } + return (error); +} + +int +intr_remove_handler(void *cookie) +{ + int error; + + error = ithread_remove_handler(cookie); +#ifdef XXX + if (error == 0) + intrcnt_updatename(/* XXX */); +#endif + return (error); +} + +int +intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol) +{ + struct intsrc *isrc; + + isrc = intr_lookup_source(vector); + if (isrc == NULL) + return (EINVAL); + return (isrc->is_pic->pic_config_intr(isrc, trig, pol)); +} + +void +intr_execute_handlers(struct intsrc *isrc, struct intrframe *iframe) +{ + struct thread *td; + struct ithd *it; + struct intrhand *ih; + int error, vector; + + td = curthread; + td->td_intr_nesting_level++; + + /* + * We count software interrupts when we 
process them. The + * code here follows previous practice, but there's an + * argument for counting hardware interrupts when they're + * processed too. + */ + atomic_add_long(isrc->is_count, 1); + atomic_add_int(&cnt.v_intr, 1); + + it = isrc->is_ithread; + if (it == NULL) + ih = NULL; + else + ih = TAILQ_FIRST(&it->it_handlers); + + /* + * XXX: We assume that IRQ 0 is only used for the ISA timer + * device (clk). + */ + vector = isrc->is_pic->pic_vector(isrc); + if (vector == 0) + clkintr_pending = 1; + + + if (ih != NULL && ih->ih_flags & IH_FAST) { + /* + * Execute fast interrupt handlers directly. + * To support clock handlers, if a handler registers + * with a NULL argument, then we pass it a pointer to + * a trapframe as its argument. + */ + critical_enter(); + TAILQ_FOREACH(ih, &it->it_handlers, ih_next) { + MPASS(ih->ih_flags & IH_FAST); + CTR3(KTR_INTR, "%s: executing handler %p(%p)", + __func__, ih->ih_handler, + ih->ih_argument == NULL ? iframe : + ih->ih_argument); + if (ih->ih_argument == NULL) + ih->ih_handler(iframe); + else + ih->ih_handler(ih->ih_argument); + } + isrc->is_pic->pic_eoi_source(isrc); + error = 0; + /* XXX */ + td->td_pflags &= ~TDP_OWEPREEMPT; + critical_exit(); + } else { + /* + * For stray and threaded interrupts, we mask and EOI the + * source. 
+ */ + isrc->is_pic->pic_disable_source(isrc, PIC_EOI); + if (ih == NULL) + error = EINVAL; + else + error = ithread_schedule(it); + isrc->is_pic->pic_eoi_source(isrc); + } + + if (error == EINVAL) { + atomic_add_long(isrc->is_straycount, 1); + if (*isrc->is_straycount < MAX_STRAY_LOG) + log(LOG_ERR, "stray irq%d\n", vector); + else if (*isrc->is_straycount == MAX_STRAY_LOG) + log(LOG_CRIT, + "too many stray irq %d's: not logging anymore\n", + vector); + } + td->td_intr_nesting_level--; + +} + +void +intr_resume(void) +{ + struct intsrc **isrc; + int i; + + mtx_lock_spin(&intr_table_lock); + for (i = 0, isrc = interrupt_sources; i < NUM_IO_INTS; i++, isrc++) + if (*isrc != NULL && (*isrc)->is_pic->pic_resume != NULL) + (*isrc)->is_pic->pic_resume(*isrc); + mtx_unlock_spin(&intr_table_lock); +} + +void +intr_suspend(void) +{ + struct intsrc **isrc; + int i; + + mtx_lock_spin(&intr_table_lock); + for (i = 0, isrc = interrupt_sources; i < NUM_IO_INTS; i++, isrc++) + if (*isrc != NULL && (*isrc)->is_pic->pic_suspend != NULL) + (*isrc)->is_pic->pic_suspend(*isrc); + mtx_unlock_spin(&intr_table_lock); +} + +static void +intrcnt_setname(const char *name, int index) +{ + + snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", + MAXCOMLEN, name); +} + +static void +intrcnt_updatename(struct intsrc *is) +{ + + intrcnt_setname(is->is_ithread->it_td->td_proc->p_comm, is->is_index); +} + +static void +intrcnt_register(struct intsrc *is) +{ + char straystr[MAXCOMLEN + 1]; + + /* mtx_assert(&intr_table_lock, MA_OWNED); */ + KASSERT(is->is_ithread != NULL, ("%s: isrc with no ithread", __func__)); + is->is_index = intrcnt_index; + intrcnt_index += 2; + snprintf(straystr, MAXCOMLEN + 1, "stray irq%d", + is->is_pic->pic_vector(is)); + intrcnt_updatename(is); + is->is_count = &intrcnt[is->is_index]; + intrcnt_setname(straystr, is->is_index + 1); + is->is_straycount = &intrcnt[is->is_index + 1]; +} + +static void +intr_init(void *dummy __unused) +{ + + 
intrcnt_setname("???", 0); + intrcnt_index = 1; + mtx_init(&intr_table_lock, "intr table", NULL, MTX_SPIN); +} +SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL) + +#ifdef DDB +/* + * Dump data about interrupt handlers + */ +DB_SHOW_COMMAND(irqs, db_show_irqs) +{ + struct intsrc **isrc; + int i, quit, verbose; + + quit = 0; + if (strcmp(modif, "v") == 0) + verbose = 1; + else + verbose = 0; + isrc = interrupt_sources; + db_setup_paging(db_simple_pager, &quit, DB_LINES_PER_PAGE); + for (i = 0; i < NUM_IO_INTS && !quit; i++, isrc++) + if (*isrc != NULL) + db_dump_ithread((*isrc)->is_ithread, verbose); +} +#endif diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/io_apic.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/io_apic.c new file mode 100644 index 0000000000..9892a998b2 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/io_apic.c @@ -0,0 +1,850 @@ +/*- + * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/io_apic.c,v 1.14 2004/08/02 15:31:10 scottl Exp $"); + +#include "opt_isa.h" +#include "opt_no_mixed_mode.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/apicreg.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> +#include <machine/segments.h> + +#define IOAPIC_ISA_INTS 16 +#define IOAPIC_MEM_REGION 32 +#define IOAPIC_REDTBL_LO(i) (IOAPIC_REDTBL + (i) * 2) +#define IOAPIC_REDTBL_HI(i) (IOAPIC_REDTBL_LO(i) + 1) + +#define VECTOR_EXTINT 252 +#define VECTOR_NMI 253 +#define VECTOR_SMI 254 +#define VECTOR_DISABLED 255 + +#define DEST_NONE -1 +#define DEST_EXTINT -2 + +#define TODO printf("%s: not implemented!\n", __func__) + +MALLOC_DEFINE(M_IOAPIC, "I/O APIC", "I/O APIC structures"); + +/* + * New interrupt support code.. + * + * XXX: we really should have the interrupt cookie passed up from new-bus + * just be a int pin, and not map 1:1 to interrupt vector number but should + * use INTR_TYPE_FOO to set priority bands for device classes and do all the + * magic remapping of intpin to vector in here. For now we just cheat as on + * ia64 and map intpin X to vector NRSVIDT + X. 
Note that we assume that the + * first IO APIC has ISA interrupts on pins 1-15. Not sure how you are + * really supposed to figure out which IO APIC in a system with multiple IO + * APIC's actually has the ISA interrupts routed to it. As far as interrupt + * pin numbers, we use the ACPI System Interrupt number model where each + * IO APIC has a contiguous chunk of the System Interrupt address space. + */ + +/* + * Direct the ExtINT pin on the first I/O APIC to a logical cluster of + * CPUs rather than a physical destination of just the BSP. + * + * Note: This is disabled by default as test systems seem to croak with it + * enabled. +#define ENABLE_EXTINT_LOGICAL_DESTINATION + */ + +struct ioapic_intsrc { + struct intsrc io_intsrc; + u_int io_intpin:8; + u_int io_vector:8; + u_int io_activehi:1; + u_int io_edgetrigger:1; + u_int io_masked:1; + int io_dest:5; + int io_bus:4; +}; + +struct ioapic { + struct pic io_pic; + u_int io_id:8; /* logical ID */ + u_int io_apic_id:4; + u_int io_intbase:8; /* System Interrupt base */ + u_int io_numintr:8; + volatile ioapic_t *io_addr; /* XXX: should use bus_space */ + STAILQ_ENTRY(ioapic) io_next; + struct ioapic_intsrc io_pins[0]; +}; + +static u_int ioapic_read(volatile ioapic_t *apic, int reg); +static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val); +static const char *ioapic_bus_string(int bus_type); +static void ioapic_print_vector(struct ioapic_intsrc *intpin); +static void ioapic_enable_source(struct intsrc *isrc); +static void ioapic_disable_source(struct intsrc *isrc, int eoi); +static void ioapic_eoi_source(struct intsrc *isrc); +static void ioapic_enable_intr(struct intsrc *isrc); +static int ioapic_vector(struct intsrc *isrc); +static int ioapic_source_pending(struct intsrc *isrc); +static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, + enum intr_polarity pol); +static void ioapic_suspend(struct intsrc *isrc); +static void ioapic_resume(struct intsrc *isrc); +static void 
ioapic_program_destination(struct ioapic_intsrc *intpin); +static void ioapic_program_intpin(struct ioapic_intsrc *intpin); +static void ioapic_setup_mixed_mode(struct ioapic_intsrc *intpin); + +static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list); +struct pic ioapic_template = { ioapic_enable_source, ioapic_disable_source, + ioapic_eoi_source, ioapic_enable_intr, + ioapic_vector, ioapic_source_pending, + ioapic_suspend, ioapic_resume, + ioapic_config_intr }; + +static int bsp_id, current_cluster, logical_clusters, next_ioapic_base; +static u_int mixed_mode_enabled, next_id, program_logical_dest; +#ifdef NO_MIXED_MODE +static int mixed_mode_active = 0; +#else +static int mixed_mode_active = 1; +#endif +TUNABLE_INT("hw.apic.mixed_mode", &mixed_mode_active); + +static __inline void +_ioapic_eoi_source(struct intsrc *isrc) +{ + lapic_eoi(); +} + +static u_int +ioapic_read(volatile ioapic_t *apic, int reg) +{ + + mtx_assert(&icu_lock, MA_OWNED); + apic->ioregsel = reg; + return (apic->iowin); +} + +static void +ioapic_write(volatile ioapic_t *apic, int reg, u_int val) +{ + + mtx_assert(&icu_lock, MA_OWNED); + apic->ioregsel = reg; + apic->iowin = val; +} + +static const char * +ioapic_bus_string(int bus_type) +{ + + switch (bus_type) { + case APIC_BUS_ISA: + return ("ISA"); + case APIC_BUS_EISA: + return ("EISA"); + case APIC_BUS_PCI: + return ("PCI"); + default: + return ("unknown"); + } +} + +static void +ioapic_print_vector(struct ioapic_intsrc *intpin) +{ + + switch (intpin->io_vector) { + case VECTOR_DISABLED: + printf("disabled"); + break; + case VECTOR_EXTINT: + printf("ExtINT"); + break; + case VECTOR_NMI: + printf("NMI"); + break; + case VECTOR_SMI: + printf("SMI"); + break; + default: + printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus), + intpin->io_vector); + } +} + +static void +ioapic_enable_source(struct intsrc *isrc) +{ + struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; + struct ioapic *io = (struct ioapic 
*)isrc->is_pic; + uint32_t flags; + + mtx_lock_spin(&icu_lock); + if (intpin->io_masked) { + flags = ioapic_read(io->io_addr, + IOAPIC_REDTBL_LO(intpin->io_intpin)); + flags &= ~(IOART_INTMASK); + ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), + flags); + intpin->io_masked = 0; + } + mtx_unlock_spin(&icu_lock); +} + +static void +ioapic_disable_source(struct intsrc *isrc, int eoi) +{ + struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; + struct ioapic *io = (struct ioapic *)isrc->is_pic; + uint32_t flags; + + mtx_lock_spin(&icu_lock); + if (!intpin->io_masked && !intpin->io_edgetrigger) { + flags = ioapic_read(io->io_addr, + IOAPIC_REDTBL_LO(intpin->io_intpin)); + flags |= IOART_INTMSET; + ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), + flags); + intpin->io_masked = 1; + } + + if (eoi == PIC_EOI) + _ioapic_eoi_source(isrc); + + mtx_unlock_spin(&icu_lock); +} + +static void +ioapic_eoi_source(struct intsrc *isrc) +{ + + _ioapic_eoi_source(isrc); +} + +/* + * Completely program an intpin based on the data in its interrupt source + * structure. + */ +static void +ioapic_program_intpin(struct ioapic_intsrc *intpin) +{ + struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic; + uint32_t low, high, value; + + /* + * For pins routed via mixed mode or disabled, just ensure that + * they are masked. + */ + if (intpin->io_dest == DEST_EXTINT || + intpin->io_vector == VECTOR_DISABLED) { + low = ioapic_read(io->io_addr, + IOAPIC_REDTBL_LO(intpin->io_intpin)); + if ((low & IOART_INTMASK) == IOART_INTMCLR) + ioapic_write(io->io_addr, + IOAPIC_REDTBL_LO(intpin->io_intpin), + low | IOART_INTMSET); + return; + } + + /* Set the destination. */ + if (intpin->io_dest == DEST_NONE) { + low = IOART_DESTPHY; + high = bsp_id << APIC_ID_SHIFT; + } else { + low = IOART_DESTLOG; + high = (intpin->io_dest << APIC_ID_CLUSTER_SHIFT | + APIC_ID_CLUSTER_ID) << APIC_ID_SHIFT; + } + + /* Program the rest of the low word. 
*/ + if (intpin->io_edgetrigger) + low |= IOART_TRGREDG; + else + low |= IOART_TRGRLVL; + if (intpin->io_activehi) + low |= IOART_INTAHI; + else + low |= IOART_INTALO; + if (intpin->io_masked) + low |= IOART_INTMSET; + switch (intpin->io_vector) { + case VECTOR_EXTINT: + KASSERT(intpin->io_edgetrigger, + ("EXTINT not edge triggered")); + low |= IOART_DELEXINT; + break; + case VECTOR_NMI: + KASSERT(intpin->io_edgetrigger, + ("NMI not edge triggered")); + low |= IOART_DELNMI; + break; + case VECTOR_SMI: + KASSERT(intpin->io_edgetrigger, + ("SMI not edge triggered")); + low |= IOART_DELSMI; + break; + default: + low |= IOART_DELLOPRI | apic_irq_to_idt(intpin->io_vector); + } + + /* Write the values to the APIC. */ + mtx_lock_spin(&icu_lock); + ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); + value = ioapic_read(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin)); + value &= ~IOART_DEST; + value |= high; + ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), value); + mtx_unlock_spin(&icu_lock); +} + +/* + * Program an individual intpin's logical destination. + */ +static void +ioapic_program_destination(struct ioapic_intsrc *intpin) +{ + struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic; + + KASSERT(intpin->io_dest != DEST_NONE, + ("intpin not assigned to a cluster")); + KASSERT(intpin->io_dest != DEST_EXTINT, + ("intpin routed via ExtINT")); + if (bootverbose) { + printf("ioapic%u: routing intpin %u (", io->io_id, + intpin->io_intpin); + ioapic_print_vector(intpin); + printf(") to cluster %u\n", intpin->io_dest); + } + ioapic_program_intpin(intpin); +} + +static void +ioapic_assign_cluster(struct ioapic_intsrc *intpin) +{ + + /* + * Assign this intpin to a logical APIC cluster in a + * round-robin fashion. We don't actually use the logical + * destination for this intpin until after all the CPU's + * have been started so that we don't end up with interrupts + * that don't go anywhere. 
Another alternative might be to + * start up the CPU's earlier so that they can handle interrupts + * sooner. + */ + intpin->io_dest = current_cluster; + current_cluster++; + if (current_cluster >= logical_clusters) + current_cluster = 0; + if (program_logical_dest) + ioapic_program_destination(intpin); +} + +static void +ioapic_enable_intr(struct intsrc *isrc) +{ + struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; + + KASSERT(intpin->io_dest != DEST_EXTINT, + ("ExtINT pin trying to use ioapic enable_intr method")); + if (intpin->io_dest == DEST_NONE) { + ioapic_assign_cluster(intpin); + lapic_enable_intr(intpin->io_vector); + } +} + +static int +ioapic_vector(struct intsrc *isrc) +{ + struct ioapic_intsrc *pin; + + pin = (struct ioapic_intsrc *)isrc; + return (pin->io_vector); +} + +static int +ioapic_source_pending(struct intsrc *isrc) +{ + struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; + + return (lapic_intr_pending(intpin->io_vector)); +} + +static int +ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, + enum intr_polarity pol) +{ + struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; + struct ioapic *io = (struct ioapic *)isrc->is_pic; + int changed; + + KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), + ("%s: Conforming trigger or polarity\n", __func__)); + + /* + * EISA interrupts always use active high polarity, so don't allow + * them to be set to active low. + * + * XXX: Should we write to the ELCR if the trigger mode changes for + * an EISA IRQ? + */ + if (intpin->io_bus == APIC_BUS_EISA) + pol = INTR_POLARITY_HIGH; + changed = 0; + if (intpin->io_edgetrigger != (trig == INTR_TRIGGER_EDGE)) { + if (bootverbose) + printf("ioapic%u: Changing trigger for pin %u to %s\n", + io->io_id, intpin->io_intpin, + trig == INTR_TRIGGER_EDGE ? 
"edge" : "level"); + intpin->io_edgetrigger = (trig == INTR_TRIGGER_EDGE); + changed++; + } + if (intpin->io_activehi != (pol == INTR_POLARITY_HIGH)) { + if (bootverbose) + printf("ioapic%u: Changing polarity for pin %u to %s\n", + io->io_id, intpin->io_intpin, + pol == INTR_POLARITY_HIGH ? "high" : "low"); + intpin->io_activehi = (pol == INTR_POLARITY_HIGH); + changed++; + } + if (changed) + ioapic_program_intpin(intpin); + return (0); +} + +static void +ioapic_suspend(struct intsrc *isrc) +{ + + TODO; +} + +static void +ioapic_resume(struct intsrc *isrc) +{ + + ioapic_program_intpin((struct ioapic_intsrc *)isrc); +} + +/* + * APIC enumerators call this function to indicate that the 8259A AT PICs + * are available and that mixed mode can be used. + */ +void +ioapic_enable_mixed_mode(void) +{ + + mixed_mode_enabled = 1; +} + +/* + * Allocate and return a logical cluster ID. Note that the first time + * this is called, it returns cluster 0. ioapic_enable_intr() treats + * the two cases of logical_clusters == 0 and logical_clusters == 1 the + * same: one cluster of ID 0 exists. The logical_clusters == 0 case is + * for UP kernels, which should never call this function. + */ +int +ioapic_next_logical_cluster(void) +{ + + if (logical_clusters >= APIC_MAX_CLUSTER) + panic("WARNING: Local APIC cluster IDs exhausted!"); + return (logical_clusters++); +} + +/* + * Create a plain I/O APIC object. 
+ */ +void * +ioapic_create(uintptr_t addr, int32_t apic_id, int intbase) +{ + struct ioapic *io; + struct ioapic_intsrc *intpin; + volatile ioapic_t *apic; + u_int numintr, i; + uint32_t value; + + apic = (ioapic_t *)pmap_mapdev(addr, IOAPIC_MEM_REGION); + mtx_lock_spin(&icu_lock); + numintr = ((ioapic_read(apic, IOAPIC_VER) & IOART_VER_MAXREDIR) >> + MAXREDIRSHIFT) + 1; + mtx_unlock_spin(&icu_lock); + io = malloc(sizeof(struct ioapic) + + numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK); + io->io_pic = ioapic_template; + mtx_lock_spin(&icu_lock); + io->io_id = next_id++; + io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; + if (apic_id != -1 && io->io_apic_id != apic_id) { + ioapic_write(apic, IOAPIC_ID, apic_id << APIC_ID_SHIFT); + mtx_unlock_spin(&icu_lock); + io->io_apic_id = apic_id; + printf("ioapic%u: Changing APIC ID to %d\n", io->io_id, + apic_id); + } else + mtx_unlock_spin(&icu_lock); + if (intbase == -1) { + intbase = next_ioapic_base; + printf("ioapic%u: Assuming intbase of %d\n", io->io_id, + intbase); + } else if (intbase != next_ioapic_base) + printf("ioapic%u: WARNING: intbase %d != expected base %d\n", + io->io_id, intbase, next_ioapic_base); + io->io_intbase = intbase; + next_ioapic_base = intbase + numintr; + io->io_numintr = numintr; + io->io_addr = apic; + + /* + * Initialize pins. Start off with interrupts disabled. Default + * to active-hi and edge-triggered for ISA interrupts and active-lo + * and level-triggered for all others. + */ + bzero(io->io_pins, sizeof(struct ioapic_intsrc) * numintr); + mtx_lock_spin(&icu_lock); + for (i = 0, intpin = io->io_pins; i < numintr; i++, intpin++) { + intpin->io_intsrc.is_pic = (struct pic *)io; + intpin->io_intpin = i; + intpin->io_vector = intbase + i; + + /* + * Assume that pin 0 on the first I/O APIC is an ExtINT pin + * and that pins 1-15 are ISA interrupts. Assume that all + * other pins are PCI interrupts. 
+ */ + if (intpin->io_vector == 0) + ioapic_set_extint(io, i); + else if (intpin->io_vector < IOAPIC_ISA_INTS) { + intpin->io_bus = APIC_BUS_ISA; + intpin->io_activehi = 1; + intpin->io_edgetrigger = 1; + intpin->io_masked = 1; + } else { + intpin->io_bus = APIC_BUS_PCI; + intpin->io_activehi = 0; + intpin->io_edgetrigger = 0; + intpin->io_masked = 1; + } + + /* + * Route interrupts to the BSP by default using physical + * addressing. Vectored interrupts get readdressed using + * logical IDs to CPU clusters when they are enabled. + */ + intpin->io_dest = DEST_NONE; + if (bootverbose && intpin->io_vector != VECTOR_DISABLED) { + printf("ioapic%u: intpin %d -> ", io->io_id, i); + ioapic_print_vector(intpin); + printf(" (%s, %s)\n", intpin->io_edgetrigger ? + "edge" : "level", intpin->io_activehi ? "high" : + "low"); + } + value = ioapic_read(apic, IOAPIC_REDTBL_LO(i)); + ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET); + } + mtx_unlock_spin(&icu_lock); + + return (io); +} + +int +ioapic_get_vector(void *cookie, u_int pin) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr) + return (-1); + return (io->io_pins[pin].io_vector); +} + +int +ioapic_disable_pin(void *cookie, u_int pin) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr) + return (EINVAL); + if (io->io_pins[pin].io_vector == VECTOR_DISABLED) + return (EINVAL); + io->io_pins[pin].io_vector = VECTOR_DISABLED; + if (bootverbose) + printf("ioapic%u: intpin %d disabled\n", io->io_id, pin); + return (0); +} + +int +ioapic_remap_vector(void *cookie, u_int pin, int vector) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr || vector < 0) + return (EINVAL); + if (io->io_pins[pin].io_vector >= NUM_IO_INTS) + return (EINVAL); + io->io_pins[pin].io_vector = vector; + if (bootverbose) + printf("ioapic%u: Routing IRQ %d -> intpin %d\n", io->io_id, + vector, pin); + return (0); +} + +int 
+ioapic_set_bus(void *cookie, u_int pin, int bus_type) +{ + struct ioapic *io; + + if (bus_type < 0 || bus_type > APIC_BUS_MAX) + return (EINVAL); + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr) + return (EINVAL); + if (io->io_pins[pin].io_vector >= NUM_IO_INTS) + return (EINVAL); + io->io_pins[pin].io_bus = bus_type; + if (bootverbose) + printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin, + ioapic_bus_string(bus_type)); + return (0); +} + +int +ioapic_set_nmi(void *cookie, u_int pin) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr) + return (EINVAL); + if (io->io_pins[pin].io_vector == VECTOR_NMI) + return (0); + if (io->io_pins[pin].io_vector >= NUM_IO_INTS) + return (EINVAL); + io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; + io->io_pins[pin].io_vector = VECTOR_NMI; + io->io_pins[pin].io_masked = 0; + io->io_pins[pin].io_edgetrigger = 1; + io->io_pins[pin].io_activehi = 1; + if (bootverbose) + printf("ioapic%u: Routing NMI -> intpin %d\n", + io->io_id, pin); + return (0); +} + +int +ioapic_set_smi(void *cookie, u_int pin) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr) + return (EINVAL); + if (io->io_pins[pin].io_vector == VECTOR_SMI) + return (0); + if (io->io_pins[pin].io_vector >= NUM_IO_INTS) + return (EINVAL); + io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; + io->io_pins[pin].io_vector = VECTOR_SMI; + io->io_pins[pin].io_masked = 0; + io->io_pins[pin].io_edgetrigger = 1; + io->io_pins[pin].io_activehi = 1; + if (bootverbose) + printf("ioapic%u: Routing SMI -> intpin %d\n", + io->io_id, pin); + return (0); +} + +int +ioapic_set_extint(void *cookie, u_int pin) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr) + return (EINVAL); + if (io->io_pins[pin].io_vector == VECTOR_EXTINT) + return (0); + if (io->io_pins[pin].io_vector >= NUM_IO_INTS) + return (EINVAL); + io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; + io->io_pins[pin].io_vector = 
VECTOR_EXTINT; + + /* Enable this pin if mixed mode is available and active. */ + if (mixed_mode_enabled && mixed_mode_active) + io->io_pins[pin].io_masked = 0; + else + io->io_pins[pin].io_masked = 1; + io->io_pins[pin].io_edgetrigger = 1; + io->io_pins[pin].io_activehi = 1; + if (bootverbose) + printf("ioapic%u: Routing external 8259A's -> intpin %d\n", + io->io_id, pin); + return (0); +} + +int +ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM) + return (EINVAL); + if (io->io_pins[pin].io_vector >= NUM_IO_INTS) + return (EINVAL); + io->io_pins[pin].io_activehi = (pol == INTR_POLARITY_HIGH); + if (bootverbose) + printf("ioapic%u: intpin %d polarity: %s\n", io->io_id, pin, + pol == INTR_POLARITY_HIGH ? "high" : "low"); + return (0); +} + +int +ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) +{ + struct ioapic *io; + + io = (struct ioapic *)cookie; + if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM) + return (EINVAL); + if (io->io_pins[pin].io_vector >= NUM_IO_INTS) + return (EINVAL); + io->io_pins[pin].io_edgetrigger = (trigger == INTR_TRIGGER_EDGE); + if (bootverbose) + printf("ioapic%u: intpin %d trigger: %s\n", io->io_id, pin, + trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); + return (0); +} + +/* + * Register a complete I/O APIC object with the interrupt subsystem. 
+ */ +void +ioapic_register(void *cookie) +{ + struct ioapic_intsrc *pin; + struct ioapic *io; + volatile ioapic_t *apic; + uint32_t flags; + int i; + + io = (struct ioapic *)cookie; + apic = io->io_addr; + mtx_lock_spin(&icu_lock); + flags = ioapic_read(apic, IOAPIC_VER) & IOART_VER_VERSION; + STAILQ_INSERT_TAIL(&ioapic_list, io, io_next); + mtx_unlock_spin(&icu_lock); + printf("ioapic%u <Version %u.%u> irqs %u-%u on motherboard\n", + io->io_id, flags >> 4, flags & 0xf, io->io_intbase, + io->io_intbase + io->io_numintr - 1); + bsp_id = PCPU_GET(apic_id); + for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) { + /* + * Finish initializing the pins by programming the vectors + * and delivery mode. + */ + if (pin->io_vector == VECTOR_DISABLED) + continue; + ioapic_program_intpin(pin); + if (pin->io_vector >= NUM_IO_INTS) + continue; + /* + * Route IRQ0 via the 8259A using mixed mode if mixed mode + * is available and turned on. + */ + if (pin->io_vector == 0 && mixed_mode_active && + mixed_mode_enabled) + ioapic_setup_mixed_mode(pin); + else + intr_register_source(&pin->io_intsrc); + } +} + +/* + * Program all the intpins to use logical destinations once the AP's + * have been launched. + */ +static void +ioapic_set_logical_destinations(void *arg __unused) +{ + struct ioapic *io; + int i; + + program_logical_dest = 1; + STAILQ_FOREACH(io, &ioapic_list, io_next) + for (i = 0; i < io->io_numintr; i++) + if (io->io_pins[i].io_dest != DEST_NONE && + io->io_pins[i].io_dest != DEST_EXTINT) + ioapic_program_destination(&io->io_pins[i]); +} +SYSINIT(ioapic_destinations, SI_SUB_SMP, SI_ORDER_SECOND, + ioapic_set_logical_destinations, NULL) + +/* + * Support for mixed-mode interrupt sources. These sources route an ISA + * IRQ through the 8259A's via the ExtINT on pin 0 of the I/O APIC that + * routes the ISA interrupts. We just ignore the intpins that use this + * mode and allow the atpic driver to register its interrupt source for + * that IRQ instead. 
+ */ + +static void +ioapic_setup_mixed_mode(struct ioapic_intsrc *intpin) +{ + struct ioapic_intsrc *extint; + struct ioapic *io; + + /* + * Mark the associated I/O APIC intpin as being delivered via + * ExtINT and enable the ExtINT pin on the I/O APIC if needed. + */ + intpin->io_dest = DEST_EXTINT; + io = (struct ioapic *)intpin->io_intsrc.is_pic; + extint = &io->io_pins[0]; + if (extint->io_vector != VECTOR_EXTINT) + panic("Can't find ExtINT pin to route through!"); +#ifdef ENABLE_EXTINT_LOGICAL_DESTINATION + if (extint->io_dest == DEST_NONE) + ioapic_assign_cluster(extint); +#endif +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/local_apic.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/local_apic.c new file mode 100644 index 0000000000..8fb7f9f12e --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/local_apic.c @@ -0,0 +1,762 @@ +/*- + * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org> + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Local APIC support on Pentium and later processors. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/local_apic.c,v 1.9 2004/07/14 18:12:15 jhb Exp $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/pcpu.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/apicreg.h> +#include <machine/cputypes.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> +#include <machine/md_var.h> +#include <machine/smp.h> +#include <machine/specialreg.h> + +/* + * We can handle up to 60 APICs via our logical cluster IDs, but currently + * the physical IDs on Intel processors up to the Pentium 4 are limited to + * 16. + */ +#define MAX_APICID 16 + +/* Sanity checks on IDT vectors. */ +CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS <= APIC_LOCAL_INTS); +CTASSERT(IPI_STOP < APIC_SPURIOUS_INT); + +/* + * Support for local APICs. Local APICs manage interrupts on each + * individual processor as opposed to I/O APICs which receive interrupts + * from I/O devices and then forward them on to the local APICs. + * + * Local APICs can also send interrupts to each other thus providing the + * mechanism for IPIs. 
+ */ + +struct lvt { + u_int lvt_edgetrigger:1; + u_int lvt_activehi:1; + u_int lvt_masked:1; + u_int lvt_active:1; + u_int lvt_mode:16; + u_int lvt_vector:8; +}; + +struct lapic { + struct lvt la_lvts[LVT_MAX + 1]; + u_int la_id:8; + u_int la_cluster:4; + u_int la_cluster_id:2; + u_int la_present:1; +} static lapics[MAX_APICID]; + +/* XXX: should thermal be an NMI? */ + +/* Global defaults for local APIC LVT entries. */ +static struct lvt lvts[LVT_MAX + 1] = { + { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ + { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ + { 1, 1, 1, 1, APIC_LVT_DM_FIXED, 0 }, /* Timer: needs a vector */ + { 1, 1, 1, 1, APIC_LVT_DM_FIXED, 0 }, /* Error: needs a vector */ + { 1, 1, 1, 1, APIC_LVT_DM_FIXED, 0 }, /* PMC */ + { 1, 1, 1, 1, APIC_LVT_DM_FIXED, 0 }, /* Thermal: needs a vector */ +}; + +static inthand_t *ioint_handlers[] = { + NULL, /* 0 - 31 */ + IDTVEC(apic_isr1), /* 32 - 63 */ + IDTVEC(apic_isr2), /* 64 - 95 */ + IDTVEC(apic_isr3), /* 96 - 127 */ + IDTVEC(apic_isr4), /* 128 - 159 */ + IDTVEC(apic_isr5), /* 160 - 191 */ + IDTVEC(apic_isr6), /* 192 - 223 */ + IDTVEC(apic_isr7), /* 224 - 255 */ +}; + +volatile lapic_t *lapic; + +static uint32_t +lvt_mode(struct lapic *la, u_int pin, uint32_t value) +{ + struct lvt *lvt; + + KASSERT(pin <= LVT_MAX, ("%s: pin %u out of range", __func__, pin)); + if (la->la_lvts[pin].lvt_active) + lvt = &la->la_lvts[pin]; + else + lvt = &lvts[pin]; + + value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM | + APIC_LVT_VECTOR); + if (lvt->lvt_edgetrigger == 0) + value |= APIC_LVT_TM; + if (lvt->lvt_activehi == 0) + value |= APIC_LVT_IIPP_INTALO; + if (lvt->lvt_masked) + value |= APIC_LVT_M; + value |= lvt->lvt_mode; + switch (lvt->lvt_mode) { + case APIC_LVT_DM_NMI: + case APIC_LVT_DM_SMI: + case APIC_LVT_DM_INIT: + case APIC_LVT_DM_EXTINT: + if (!lvt->lvt_edgetrigger) { + printf("lapic%u: Forcing LINT%u to edge trigger\n", + la->la_id, pin); + value |= APIC_LVT_TM; + } 
+ /* Use a vector of 0. */ + break; + case APIC_LVT_DM_FIXED: +#if 0 + value |= lvt->lvt_vector; +#else + panic("Fixed LINT pins not supported"); +#endif + break; + default: + panic("bad APIC LVT delivery mode: %#x\n", value); + } + return (value); +} + +/* + * Map the local APIC and setup necessary interrupt vectors. + */ +void +lapic_init(uintptr_t addr) +{ + u_int32_t value; + + /* Map the local APIC and setup the spurious interrupt handler. */ + KASSERT(trunc_page(addr) == addr, + ("local APIC not aligned on a page boundary")); + lapic = (lapic_t *)pmap_mapdev(addr, sizeof(lapic_t)); + setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + + /* Perform basic initialization of the BSP's local APIC. */ + value = lapic->svr; + value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); + value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT); + lapic->svr = value; + + /* Set BSP's per-CPU local APIC ID. */ + PCPU_SET(apic_id, lapic_id()); + + /* XXX: timer/error/thermal interrupts */ +} + +/* + * Create a local APIC instance. + */ +void +lapic_create(u_int apic_id, int boot_cpu) +{ + int i; + + if (apic_id >= MAX_APICID) { + printf("APIC: Ignoring local APIC with ID %d\n", apic_id); + if (boot_cpu) + panic("Can't ignore BSP"); + return; + } + KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u", + apic_id)); + + /* + * Assume no local LVT overrides and a cluster of 0 and + * intra-cluster ID of 0. 
+ */ + lapics[apic_id].la_present = 1; + lapics[apic_id].la_id = apic_id; + for (i = 0; i < LVT_MAX; i++) { + lapics[apic_id].la_lvts[i] = lvts[i]; + lapics[apic_id].la_lvts[i].lvt_active = 0; + } + +#ifdef SMP + cpu_add(apic_id, boot_cpu); +#endif +} + +/* + * Dump contents of local APIC registers + */ +void +lapic_dump(const char* str) +{ + + printf("cpu%d %s:\n", PCPU_GET(cpuid), str); + printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n", + lapic->id, lapic->version, lapic->ldr, lapic->dfr); + printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", + lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr); +} + +void +lapic_enable_intr(u_int irq) +{ + u_int vector; + + vector = apic_irq_to_idt(irq); + KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); + KASSERT(ioint_handlers[vector / 32] != NULL, + ("No ISR handler for IRQ %u", irq)); + setidt(vector, ioint_handlers[vector / 32], SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); +} + +void +lapic_setup(void) +{ + struct lapic *la; + u_int32_t value, maxlvt; + register_t eflags; + + la = &lapics[lapic_id()]; + KASSERT(la->la_present, ("missing APIC structure")); + eflags = intr_disable(); + maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; + + /* Program LINT[01] LVT entries. */ + lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0); + lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1); + + /* XXX: more LVT entries */ + + /* Clear the TPR. */ + value = lapic->tpr; + value &= ~APIC_TPR_PRIO; + lapic->tpr = value; + + /* Use the cluster model for logical IDs. */ + value = lapic->dfr; + value &= ~APIC_DFR_MODEL_MASK; + value |= APIC_DFR_MODEL_CLUSTER; + lapic->dfr = value; + + /* Set this APIC's logical ID. */ + value = lapic->ldr; + value &= ~APIC_ID_MASK; + value |= (la->la_cluster << APIC_ID_CLUSTER_SHIFT | + 1 << la->la_cluster_id) << APIC_ID_SHIFT; + lapic->ldr = value; + + /* Setup spurious vector and enable the local APIC. 
*/ + value = lapic->svr; + value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); + value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT); + lapic->svr = value; + intr_restore(eflags); +} + +void +lapic_disable(void) +{ + uint32_t value; + + /* Software disable the local APIC. */ + value = lapic->svr; + value &= ~APIC_SVR_SWEN; + lapic->svr = value; +} + +int +lapic_id(void) +{ + + KASSERT(lapic != NULL, ("local APIC is not mapped")); + return (lapic->id >> APIC_ID_SHIFT); +} + +int +lapic_intr_pending(u_int vector) +{ + volatile u_int32_t *irr; + + /* + * The IRR registers are an array of 128-bit registers each of + * which only describes 32 interrupts in the low 32 bits.. Thus, + * we divide the vector by 32 to get the 128-bit index. We then + * multiply that index by 4 to get the equivalent index from + * treating the IRR as an array of 32-bit registers. Finally, we + * modulus the vector by 32 to determine the individual bit to + * test. + */ + irr = &lapic->irr0; + return (irr[(vector / 32) * 4] & 1 << (vector % 32)); +} + +void +lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) +{ + struct lapic *la; + + KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist", + __func__, apic_id)); + KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big", + __func__, cluster)); + KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID, + ("%s: intra cluster id %u too big", __func__, cluster_id)); + la = &lapics[apic_id]; + la->la_cluster = cluster; + la->la_cluster_id = cluster_id; +} + +int +lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) +{ + + if (pin > LVT_MAX) + return (EINVAL); + if (apic_id == APIC_ID_ALL) { + lvts[pin].lvt_masked = masked; + if (bootverbose) + printf("lapic:"); + } else { + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + lapics[apic_id].la_lvts[pin].lvt_masked = masked; + lapics[apic_id].la_lvts[pin].lvt_active = 1; + if (bootverbose) + printf("lapic%u:", apic_id); + } + if 
(bootverbose) + printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked"); + return (0); +} + +int +lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) +{ + struct lvt *lvt; + + if (pin > LVT_MAX) + return (EINVAL); + if (apic_id == APIC_ID_ALL) { + lvt = &lvts[pin]; + if (bootverbose) + printf("lapic:"); + } else { + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + lvt = &lapics[apic_id].la_lvts[pin]; + lvt->lvt_active = 1; + if (bootverbose) + printf("lapic%u:", apic_id); + } + lvt->lvt_mode = mode; + switch (mode) { + case APIC_LVT_DM_NMI: + case APIC_LVT_DM_SMI: + case APIC_LVT_DM_INIT: + case APIC_LVT_DM_EXTINT: + lvt->lvt_edgetrigger = 1; + lvt->lvt_activehi = 1; + if (mode == APIC_LVT_DM_EXTINT) + lvt->lvt_masked = 1; + else + lvt->lvt_masked = 0; + break; + default: + panic("Unsupported delivery mode: 0x%x\n", mode); + } + if (bootverbose) { + printf(" Routing "); + switch (mode) { + case APIC_LVT_DM_NMI: + printf("NMI"); + break; + case APIC_LVT_DM_SMI: + printf("SMI"); + break; + case APIC_LVT_DM_INIT: + printf("INIT"); + break; + case APIC_LVT_DM_EXTINT: + printf("ExtINT"); + break; + } + printf(" -> LINT%u\n", pin); + } + return (0); +} + +int +lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) +{ + + if (pin > LVT_MAX || pol == INTR_POLARITY_CONFORM) + return (EINVAL); + if (apic_id == APIC_ID_ALL) { + lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); + if (bootverbose) + printf("lapic:"); + } else { + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + lapics[apic_id].la_lvts[pin].lvt_active = 1; + lapics[apic_id].la_lvts[pin].lvt_activehi = + (pol == INTR_POLARITY_HIGH); + if (bootverbose) + printf("lapic%u:", apic_id); + } + if (bootverbose) + printf(" LINT%u polarity: active-%s\n", pin, + pol == INTR_POLARITY_HIGH ? 
"high" : "low"); + return (0); +} + +int +lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger) +{ + + if (pin > LVT_MAX || trigger == INTR_TRIGGER_CONFORM) + return (EINVAL); + if (apic_id == APIC_ID_ALL) { + lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); + if (bootverbose) + printf("lapic:"); + } else { + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + lapics[apic_id].la_lvts[pin].lvt_edgetrigger = + (trigger == INTR_TRIGGER_EDGE); + lapics[apic_id].la_lvts[pin].lvt_active = 1; + if (bootverbose) + printf("lapic%u:", apic_id); + } + if (bootverbose) + printf(" LINT%u trigger: %s\n", pin, + trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); + return (0); +} + +void +lapic_eoi(void) +{ + + lapic->eoi = 0; +} + +void +lapic_handle_intr(struct intrframe frame) +{ + struct intsrc *isrc; + + if (frame.if_vec == -1) + panic("Couldn't get vector from ISR!"); + isrc = intr_lookup_source(apic_idt_to_irq(frame.if_vec)); + intr_execute_handlers(isrc, &frame); +} + +/* Translate between IDT vectors and IRQ vectors. */ +u_int +apic_irq_to_idt(u_int irq) +{ + u_int vector; + + KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); + vector = irq + APIC_IO_INTS; + if (vector >= IDT_SYSCALL) + vector++; + return (vector); +} + +u_int +apic_idt_to_irq(u_int vector) +{ + + KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && + vector <= APIC_IO_INTS + NUM_IO_INTS, + ("Vector %u does not map to an IRQ line", vector)); + if (vector > IDT_SYSCALL) + vector--; + return (vector - APIC_IO_INTS); +} + +/* + * APIC probing support code. This includes code to manage enumerators. 
+ */ + +static SLIST_HEAD(, apic_enumerator) enumerators = + SLIST_HEAD_INITIALIZER(enumerators); +static struct apic_enumerator *best_enum; + +void +apic_register_enumerator(struct apic_enumerator *enumerator) +{ +#ifdef INVARIANTS + struct apic_enumerator *apic_enum; + + SLIST_FOREACH(apic_enum, &enumerators, apic_next) { + if (apic_enum == enumerator) + panic("%s: Duplicate register of %s", __func__, + enumerator->apic_name); + } +#endif + SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next); +} + +/* + * Probe the APIC enumerators, enumerate CPUs, and initialize the + * local APIC. + */ +static void +apic_init(void *dummy __unused) +{ + struct apic_enumerator *enumerator; + uint64_t apic_base; + int retval, best; + + /* We only support built in local APICs. */ + if (!(cpu_feature & CPUID_APIC)) + return; + + /* Don't probe if APIC mode is disabled. */ + if (resource_disabled("apic", 0)) + return; + + /* First, probe all the enumerators to find the best match. */ + best_enum = NULL; + best = 0; + SLIST_FOREACH(enumerator, &enumerators, apic_next) { + retval = enumerator->apic_probe(); + if (retval > 0) + continue; + if (best_enum == NULL || best < retval) { + best_enum = enumerator; + best = retval; + } + } + if (best_enum == NULL) { + if (bootverbose) + printf("APIC: Could not find any APICs.\n"); + return; + } + + if (bootverbose) + printf("APIC: Using the %s enumerator.\n", + best_enum->apic_name); + + /* + * To work around an errata, we disable the local APIC on some + * CPUs during early startup. We need to turn the local APIC back + * on on such CPUs now. + */ + if (cpu == CPU_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && + (cpu_id & 0xff0) == 0x610) { + apic_base = rdmsr(MSR_APICBASE); + apic_base |= APICBASE_ENABLED; + wrmsr(MSR_APICBASE, apic_base); + } + + /* Second, probe the CPU's in the system. 
*/ + retval = best_enum->apic_probe_cpus(); + if (retval != 0) + printf("%s: Failed to probe CPUs: returned %d\n", + best_enum->apic_name, retval); + + /* Third, initialize the local APIC. */ + retval = best_enum->apic_setup_local(); + if (retval != 0) + printf("%s: Failed to setup the local APIC: returned %d\n", + best_enum->apic_name, retval); +#ifdef SMP + /* Last, setup the cpu topology now that we have probed CPUs */ + mp_topology(); +#endif +} +SYSINIT(apic_init, SI_SUB_CPU, SI_ORDER_FIRST, apic_init, NULL) + +/* + * Setup the I/O APICs. + */ +static void +apic_setup_io(void *dummy __unused) +{ + int retval; + + if (best_enum == NULL) + return; + retval = best_enum->apic_setup_io(); + if (retval != 0) + printf("%s: Failed to setup I/O APICs: returned %d\n", + best_enum->apic_name, retval); + + /* + * Finish setting up the local APIC on the BSP once we know how to + * properly program the LINT pins. + */ + lapic_setup(); + if (bootverbose) + lapic_dump("BSP"); +} +SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL) + +#ifdef SMP +/* + * Inter Processor Interrupt functions. The lapic_ipi_*() functions are + * private the sys/i386 code. The public interface for the rest of the + * kernel is defined in mp_machdep.c. + */ + +int +lapic_ipi_wait(int delay) +{ + int x, incr; + + /* + * Wait delay loops for IPI to be sent. This is highly bogus + * since this is sensitive to CPU clock speed. If delay is + * -1, we wait forever. + */ + if (delay == -1) { + incr = 0; + delay = 1; + } else + incr = 1; + for (x = 0; x < delay; x += incr) { + if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) + return (1); + ia32_pause(); + } + return (0); +} + +void +lapic_ipi_raw(register_t icrlo, u_int dest) +{ + register_t value, eflags; + + /* XXX: Need more sanity checking of icrlo? 
*/ + KASSERT(lapic != NULL, ("%s called too early", __func__)); + KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, + ("%s: invalid dest field", __func__)); + KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, + ("%s: reserved bits set in ICR LO register", __func__)); + + /* Set destination in ICR HI register if it is being used. */ + eflags = intr_disable(); + if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { + value = lapic->icr_hi; + value &= ~APIC_ID_MASK; + value |= dest << APIC_ID_SHIFT; + lapic->icr_hi = value; + } + + /* Program the contents of the IPI and dispatch it. */ + value = lapic->icr_lo; + value &= APIC_ICRLO_RESV_MASK; + value |= icrlo; + lapic->icr_lo = value; + intr_restore(eflags); +} + +#define BEFORE_SPIN 1000000 +#ifdef DETECT_DEADLOCK +#define AFTER_SPIN 1000 +#endif + +void +lapic_ipi_vectored(u_int vector, int dest) +{ + register_t icrlo, destfield; + + KASSERT((vector & ~APIC_VECTOR_MASK) == 0, + ("%s: invalid vector %d", __func__, vector)); + + icrlo = vector | APIC_DELMODE_FIXED | APIC_DESTMODE_PHY | + APIC_LEVEL_DEASSERT | APIC_TRIGMOD_EDGE; + destfield = 0; + switch (dest) { + case APIC_IPI_DEST_SELF: + icrlo |= APIC_DEST_SELF; + break; + case APIC_IPI_DEST_ALL: + icrlo |= APIC_DEST_ALLISELF; + break; + case APIC_IPI_DEST_OTHERS: + icrlo |= APIC_DEST_ALLESELF; + break; + default: + KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, + ("%s: invalid destination 0x%x", __func__, dest)); + destfield = dest; + } + + /* Wait for an earlier IPI to finish. */ + if (!lapic_ipi_wait(BEFORE_SPIN)) + panic("APIC: Previous IPI is stuck"); + + lapic_ipi_raw(icrlo, destfield); + +#ifdef DETECT_DEADLOCK + /* Wait for IPI to be delivered. */ + if (!lapic_ipi_wait(AFTER_SPIN)) { +#ifdef needsattention + /* + * XXX FIXME: + * + * The above function waits for the message to actually be + * delivered. 
It breaks out after an arbitrary timeout + * since the message should eventually be delivered (at + * least in theory) and that if it wasn't we would catch + * the failure with the check above when the next IPI is + * sent. + * + * We could skiip this wait entirely, EXCEPT it probably + * protects us from other routines that assume that the + * message was delivered and acted upon when this function + * returns. + */ + printf("APIC: IPI might be stuck\n"); +#else /* !needsattention */ + /* Wait until mesage is sent without a timeout. */ + while (lapic->icr_lo & APIC_DELSTAT_PEND) + ia32_pause(); +#endif /* needsattention */ + } +#endif /* DETECT_DEADLOCK */ +} +#endif /* SMP */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/locore.s b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/locore.s new file mode 100644 index 0000000000..5146169162 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/locore.s @@ -0,0 +1,949 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 + * $FreeBSD: src/sys/i386/i386/locore.s,v 1.181 2003/11/03 21:53:37 jhb Exp $ + * + * originally from: locore.s, by William F. Jolitz + * + * Substantially rewritten by David Greenman, Rod Grimes, + * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp + * and many others. + */ + +#include "opt_bootp.h" +#include "opt_compat.h" +#include "opt_nfsroot.h" +#include "opt_pmap.h" + +#include <sys/syscall.h> +#include <sys/reboot.h> + +#include <machine/asmacros.h> +#include <machine/cputypes.h> +#include <machine/psl.h> +#include <machine/pmap.h> +#include <machine/specialreg.h> + +#include "assym.s" + +.section __xen_guest + .asciz "LOADER=generic,GUEST_VER=5.2.1,XEN_VER=2.0,BSD_SYMTAB" + + +/* + * XXX + * + * Note: This version greatly munged to avoid various assembler errors + * that may be fixed in newer versions of gas. Perhaps newer versions + * will have more pleasant appearance. + */ + +/* + * PTmap is recursive pagemap at top of virtual address space. + * Within PTmap, the page directory can be found (third indirection). 
+ */ + .globl PTmap,PTD,PTDpde + .set PTmap,(PTDPTDI << PDRSHIFT) + .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) + .set PTDpde,PTD + (PTDPTDI * PDESIZE) + +#ifdef SMP +/* + * Define layout of per-cpu address space. + * This is "constructed" in locore.s on the BSP and in mp_machdep.c + * for each AP. DO NOT REORDER THESE WITHOUT UPDATING THE REST! + */ + .globl SMP_prvspace + .set SMP_prvspace,(MPPTDI << PDRSHIFT) +#endif /* SMP */ + +/* + * Compiled KERNBASE location and the kernel load address + */ + .globl kernbase + .set kernbase,KERNBASE + .globl kernload + .set kernload,KERNLOAD + +/* + * Globals + */ + .data + ALIGN_DATA /* just to be sure */ + + .space 0x2000 /* space for tmpstk - temporary stack */ +tmpstk: + + .globl bootinfo +bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ + + .globl KERNend +KERNend: .long 0 /* phys addr end of kernel (just after bss) */ +physfree: .long 0 /* phys addr of next free page */ + +#ifdef SMP + .globl cpu0prvpage +cpu0pp: .long 0 /* phys addr cpu0 private pg */ +cpu0prvpage: .long 0 /* relocated version */ + + .globl SMPpt +SMPptpa: .long 0 /* phys addr SMP page table */ +SMPpt: .long 0 /* relocated version */ +#endif /* SMP */ + + .globl IdlePTD +IdlePTD: .long 0 /* phys addr of kernel PTD */ + + + .globl KPTphys +KPTphys: .long 0 /* phys addr of kernel page tables */ + + .globl proc0uarea, proc0kstack +proc0uarea: .long 0 /* address of proc 0 uarea space */ +proc0kstack: .long 0 /* address of proc 0 kstack space */ +p0upa: .long 0 /* phys addr of proc0's UAREA */ +p0kpa: .long 0 /* phys addr of proc0's STACK */ + +#ifdef PC98 + .globl pc98_system_parameter +pc98_system_parameter: + .space 0x240 +#endif + +/********************************************************************** + * + * Some handy macros + * + */ + +#define R(foo) ((foo)) + +#define ALLOCPAGES(foo) \ + movl R(physfree), %esi ; \ + movl $((foo)*PAGE_SIZE), %eax ; \ + addl %esi, %eax ; \ + movl %eax, R(physfree) ; \ + movl %esi, %edi ; \ + movl 
$((foo)*PAGE_SIZE),%ecx ; \ + xorl %eax,%eax ; \ + cld ; \ + rep ; \ + stosb + +/* + * fillkpt + * eax = page frame address + * ebx = index into page table + * ecx = how many pages to map + * base = base address of page dir/table + * prot = protection bits + */ +#define fillkpt(base, prot) \ + shll $PTESHIFT,%ebx ; \ + addl base,%ebx ; \ + orl $PG_V,%eax ; \ + orl prot,%eax ; \ +1: movl %eax,(%ebx) ; \ + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ + addl $PTESIZE,%ebx ; /* next pte */ \ + loop 1b + +/* + * fillkptphys(prot) + * eax = physical address + * ecx = how many pages to map + * prot = protection bits + */ +#define fillkptphys(prot) \ + movl %eax, %ebx ; \ + shrl $PAGE_SHIFT, %ebx ; \ + fillkpt(R(KPTphys), prot) + + .text +/********************************************************************** + * + * This is where the bootblocks start us, set the ball rolling... + * + */ +NON_GPROF_ENTRY(btext) + pushl %esi + call initvalues + popl %esi + call identify_cpu + movl proc0kstack,%eax + leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp + xorl %ebp,%ebp /* mark end of frames */ + movl IdlePTD,%esi + movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) + call init386 + call mi_startup + int $3 + + +#ifdef PC98 + /* save SYSTEM PARAMETER for resume (NS/T or other) */ + movl $0xa1400,%esi + movl $R(pc98_system_parameter),%edi + movl $0x0240,%ecx + cld + rep + movsb +#else /* IBM-PC */ +/* Tell the bios to warmboot next time */ + movw $0x1234,0x472 +#endif /* PC98 */ + +/* Set up a real frame in case the double return in newboot is executed. */ + pushl %ebp + movl %esp, %ebp + +/* Don't trust what the BIOS gives for eflags. */ + pushl $PSL_KERNEL + popfl + +/* + * Don't trust what the BIOS gives for %fs and %gs. Trust the bootstrap + * to set %cs, %ds, %es and %ss. + */ + mov %ds, %ax + mov %ax, %fs + mov %ax, %gs + +/* + * Clear the bss. Not all boot programs do it, and it is our job anyway. 
+ * + * XXX we don't check that there is memory for our bss and page tables + * before using it. + * + * Note: we must be careful to not overwrite an active gdt or idt. They + * inactive from now until we switch to new ones, since we don't load any + * more segment registers or permit interrupts until after the switch. + */ + movl $R(end),%ecx + movl $R(edata),%edi + subl %edi,%ecx + xorl %eax,%eax + cld + rep + stosb + + call recover_bootinfo + +/* Get onto a stack that we can trust. */ +/* + * XXX this step is delayed in case recover_bootinfo needs to return via + * the old stack, but it need not be, since recover_bootinfo actually + * returns via the old frame. + */ + movl $R(tmpstk),%esp + +#ifdef PC98 + /* pc98_machine_type & M_EPSON_PC98 */ + testb $0x02,R(pc98_system_parameter)+220 + jz 3f + /* epson_machine_id <= 0x0b */ + cmpb $0x0b,R(pc98_system_parameter)+224 + ja 3f + + /* count up memory */ + movl $0x100000,%eax /* next, talley remaining memory */ + movl $0xFFF-0x100,%ecx +1: movl 0(%eax),%ebx /* save location to check */ + movl $0xa55a5aa5,0(%eax) /* write test pattern */ + cmpl $0xa55a5aa5,0(%eax) /* does not check yet for rollover */ + jne 2f + movl %ebx,0(%eax) /* restore memory */ + addl $PAGE_SIZE,%eax + loop 1b +2: subl $0x100000,%eax + shrl $17,%eax + movb %al,R(pc98_system_parameter)+1 +3: + + movw R(pc98_system_parameter+0x86),%ax + movw %ax,R(cpu_id) +#endif + + call identify_cpu + call create_pagetables + +/* + * If the CPU has support for VME, turn it on. + */ + testl $CPUID_VME, R(cpu_feature) + jz 1f + movl %cr4, %eax + orl $CR4_VME, %eax + movl %eax, %cr4 +1: + +/* Now enable paging */ + movl R(IdlePTD), %eax + movl %eax,%cr3 /* load ptd addr into mmu */ + movl %cr0,%eax /* get control word */ + orl $CR0_PE|CR0_PG,%eax /* enable paging */ + movl %eax,%cr0 /* and let's page NOW! 
*/ + + pushl $begin /* jump to high virtualized address */ + ret + +/* now running relocated at KERNBASE where the system is linked to run */ +begin: + /* set up bootstrap stack */ + movl proc0kstack,%eax /* location of in-kernel stack */ + /* bootstrap stack end location */ + leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp + + xorl %ebp,%ebp /* mark end of frames */ + +#ifdef PAE + movl IdlePDPT,%esi +#else + movl IdlePTD,%esi +#endif + movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) + + pushl physfree /* value of first for init386(first) */ + call init386 /* wire 386 chip for unix operation */ + + /* + * Clean up the stack in a way that db_numargs() understands, so + * that backtraces in ddb don't underrun the stack. Traps for + * inaccessible memory are more fatal than usual this early. + */ + addl $4,%esp + + call mi_startup /* autoconfiguration, mountroot etc */ + /* NOTREACHED */ + addl $0,%esp /* for db_numargs() again */ + +/* + * Signal trampoline, copied to top of user stack + */ +NON_GPROF_ENTRY(sigcode) + calll *SIGF_HANDLER(%esp) + leal SIGF_UC(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC_EFLAGS(%eax) + jne 1f + movl UC_GS(%eax),%gs /* restore %gs */ +1: + movl $SYS_sigreturn,%eax + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b + +#ifdef COMPAT_FREEBSD4 + ALIGN_TEXT +freebsd4_sigcode: + calll *SIGF_HANDLER(%esp) + leal SIGF_UC4(%esp),%eax /* get ucontext */ + pushl %eax + testl $PSL_VM,UC4_EFLAGS(%eax) + jne 1f + movl UC4_GS(%eax),%gs /* restore %gs */ +1: + movl $344,%eax /* 4.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. 
*/ + int $0x80 /* enter kernel with args */ + /* on stack */ +1: + jmp 1b +#endif + +#ifdef COMPAT_43 + ALIGN_TEXT +osigcode: + call *SIGF_HANDLER(%esp) /* call signal handler */ + lea SIGF_SC(%esp),%eax /* get sigcontext */ + pushl %eax + testl $PSL_VM,SC_PS(%eax) + jne 9f + movl SC_GS(%eax),%gs /* restore %gs */ +9: + movl $103,%eax /* 3.x SYS_sigreturn */ + pushl %eax /* junk to fake return addr. */ + int $0x80 /* enter kernel with args */ +0: jmp 0b +#endif /* COMPAT_43 */ + + ALIGN_TEXT +esigcode: + + .data + .globl szsigcode +szsigcode: + .long esigcode-sigcode +#ifdef COMPAT_FREEBSD4 + .globl szfreebsd4_sigcode +szfreebsd4_sigcode: + .long esigcode-freebsd4_sigcode +#endif +#ifdef COMPAT_43 + .globl szosigcode +szosigcode: + .long esigcode-osigcode +#endif + .text + +/********************************************************************** + * + * Recover the bootinfo passed to us from the boot program + * + */ +recover_bootinfo: + /* + * This code is called in different ways depending on what loaded + * and started the kernel. This is used to detect how we get the + * arguments from the other code and what we do with them. + * + * Old disk boot blocks: + * (*btext)(howto, bootdev, cyloffset, esym); + * [return address == 0, and can NOT be returned to] + * [cyloffset was not supported by the FreeBSD boot code + * and always passed in as 0] + * [esym is also known as total in the boot code, and + * was never properly supported by the FreeBSD boot code] + * + * Old diskless netboot code: + * (*btext)(0,0,0,0,&nfsdiskless,0,0,0); + * [return address != 0, and can NOT be returned to] + * If we are being booted by this code it will NOT work, + * so we are just going to halt if we find this case. 
+ * + * New uniform boot code: + * (*btext)(howto, bootdev, 0, 0, 0, &bootinfo) + * [return address != 0, and can be returned to] + * + * There may seem to be a lot of wasted arguments in here, but + * that is so the newer boot code can still load very old kernels + * and old boot code can load new kernels. + */ + + /* + * The old style disk boot blocks fake a frame on the stack and + * did an lret to get here. The frame on the stack has a return + * address of 0. + */ + cmpl $0,4(%ebp) + je olddiskboot + + /* + * We have some form of return address, so this is either the + * old diskless netboot code, or the new uniform code. That can + * be detected by looking at the 5th argument, if it is 0 + * we are being booted by the new uniform boot code. + */ + cmpl $0,24(%ebp) + je newboot + + /* + * Seems we have been loaded by the old diskless boot code, we + * don't stand a chance of running as the diskless structure + * changed considerably between the two, so just halt. + */ + hlt + + /* + * We have been loaded by the new uniform boot code. + * Let's check the bootinfo version, and if we do not understand + * it we return to the loader with a status of 1 to indicate this error + */ +newboot: + movl 28(%ebp),%ebx /* &bootinfo.version */ + movl BI_VERSION(%ebx),%eax + cmpl $1,%eax /* We only understand version 1 */ + je 1f + movl $1,%eax /* Return status */ + leave + /* + * XXX this returns to our caller's caller (as is required) since + * we didn't set up a frame and our caller did. + */ + ret + +1: + /* + * If we have a kernelname copy it in + */ + movl BI_KERNELNAME(%ebx),%esi + cmpl $0,%esi + je 2f /* No kernelname */ + movl $MAXPATHLEN,%ecx /* Brute force!!! */ + movl $R(kernelname),%edi + cmpb $'/',(%esi) /* Make sure it starts with a slash */ + je 1f + movb $'/',(%edi) + incl %edi + decl %ecx +1: + cld + rep + movsb + +2: + /* + * Determine the size of the boot loader's copy of the bootinfo + * struct. 
This is impossible to do properly because old versions + * of the struct don't contain a size field and there are 2 old + * versions with the same version number. + */ + movl $BI_ENDCOMMON,%ecx /* prepare for sizeless version */ + testl $RB_BOOTINFO,8(%ebp) /* bi_size (and bootinfo) valid? */ + je got_bi_size /* no, sizeless version */ + movl BI_SIZE(%ebx),%ecx +got_bi_size: + + /* + * Copy the common part of the bootinfo struct + */ + movl %ebx,%esi + movl $R(bootinfo),%edi + cmpl $BOOTINFO_SIZE,%ecx + jbe got_common_bi_size + movl $BOOTINFO_SIZE,%ecx +got_common_bi_size: + cld + rep + movsb + +#ifdef NFS_ROOT +#ifndef BOOTP_NFSV3 + /* + * If we have a nfs_diskless structure copy it in + */ + movl BI_NFS_DISKLESS(%ebx),%esi + cmpl $0,%esi + je olddiskboot + movl $R(nfs_diskless),%edi + movl $NFSDISKLESS_SIZE,%ecx + cld + rep + movsb + movl $R(nfs_diskless_valid),%edi + movl $1,(%edi) +#endif +#endif + + /* + * The old style disk boot. + * (*btext)(howto, bootdev, cyloffset, esym); + * Note that the newer boot code just falls into here to pick + * up howto and bootdev, cyloffset and esym are no longer used + */ +olddiskboot: + movl 8(%ebp),%eax + movl %eax,R(boothowto) + movl 12(%ebp),%eax + movl %eax,R(bootdev) + + ret + + +/********************************************************************** + * + * Identify the CPU and initialize anything special about it + * + */ +identify_cpu: + + /* Try to toggle alignment check flag ; does not exist on 386. */ + pushfl + popl %eax + movl %eax,%ecx + orl $PSL_AC,%eax + pushl %eax + popfl + pushfl + popl %eax + xorl %ecx,%eax + andl $PSL_AC,%eax + pushl %ecx + popfl + + testl %eax,%eax + jnz try486 + + /* NexGen CPU does not have aligment check flag. 
*/ + pushfl + movl $0x5555, %eax + xorl %edx, %edx + movl $2, %ecx + clc + divl %ecx + jz trynexgen + popfl + movl $CPU_386,R(cpu) + jmp 3f + +trynexgen: + popfl + movl $CPU_NX586,R(cpu) + movl $0x4778654e,R(cpu_vendor) # store vendor string + movl $0x72446e65,R(cpu_vendor+4) + movl $0x6e657669,R(cpu_vendor+8) + movl $0,R(cpu_vendor+12) + jmp 3f + +try486: /* Try to toggle identification flag ; does not exist on early 486s. */ + pushfl + popl %eax + movl %eax,%ecx + xorl $PSL_ID,%eax + pushl %eax + popfl + pushfl + popl %eax + xorl %ecx,%eax + andl $PSL_ID,%eax + pushl %ecx + popfl + + testl %eax,%eax + jnz trycpuid + movl $CPU_486,R(cpu) + + /* + * Check Cyrix CPU + * Cyrix CPUs do not change the undefined flags following + * execution of the divide instruction which divides 5 by 2. + * + * Note: CPUID is enabled on M2, so it passes another way. + */ + pushfl + movl $0x5555, %eax + xorl %edx, %edx + movl $2, %ecx + clc + divl %ecx + jnc trycyrix + popfl + jmp 3f /* You may use Intel CPU. */ + +trycyrix: + popfl + /* + * IBM Bluelighting CPU also doesn't change the undefined flags. + * Because IBM doesn't disclose the information for Bluelighting + * CPU, we couldn't distinguish it from Cyrix's (including IBM + * brand of Cyrix CPUs). + */ + movl $0x69727943,R(cpu_vendor) # store vendor string + movl $0x736e4978,R(cpu_vendor+4) + movl $0x64616574,R(cpu_vendor+8) + jmp 3f + +trycpuid: /* Use the `cpuid' instruction. */ + xorl %eax,%eax + cpuid # cpuid 0 + movl %eax,R(cpu_high) # highest capability + movl %ebx,R(cpu_vendor) # store vendor string + movl %edx,R(cpu_vendor+4) + movl %ecx,R(cpu_vendor+8) + movb $0,R(cpu_vendor+12) + + movl $1,%eax + cpuid # cpuid 1 + movl %eax,R(cpu_id) # store cpu_id + movl %ebx,R(cpu_procinfo) # store cpu_procinfo + movl %edx,R(cpu_feature) # store cpu_feature + rorl $8,%eax # extract family type + andl $15,%eax + cmpl $5,%eax + jae 1f + + /* less than Pentium ; must be 486 */ + movl $CPU_486,R(cpu) + jmp 3f +1: + /* a Pentium? 
*/ + cmpl $5,%eax + jne 2f + movl $CPU_586,R(cpu) + jmp 3f +2: + /* Greater than Pentium...call it a Pentium Pro */ + movl $CPU_686,R(cpu) +3: + ret + +/********************************************************************** + * + * Create the first page directory and its page tables. + * + */ + +create_pagetables: + +/* Find end of kernel image (rounded up to a page boundary). */ + movl $R(_end),%esi + +/* Include symbols, if any. */ + movl R(bootinfo+BI_ESYMTAB),%edi + testl %edi,%edi + je over_symalloc + movl %edi,%esi + movl $KERNBASE,%edi + addl %edi,R(bootinfo+BI_SYMTAB) + addl %edi,R(bootinfo+BI_ESYMTAB) +over_symalloc: + +/* If we are told where the end of the kernel space is, believe it. */ + movl R(bootinfo+BI_KERNEND),%edi + testl %edi,%edi + je no_kernend + movl %edi,%esi +no_kernend: + + addl $PDRMASK,%esi /* Play conservative for now, and */ + andl $~PDRMASK,%esi /* ... wrap to next 4M. */ + movl %esi,R(KERNend) /* save end of kernel */ + movl %esi,R(physfree) /* next free page is at end of kernel */ + +/* Allocate Kernel Page Tables */ + ALLOCPAGES(NKPT) + movl %esi,R(KPTphys) + +/* Allocate Page Table Directory */ +#ifdef PAE + /* XXX only need 32 bytes (easier for now) */ + ALLOCPAGES(1) + movl %esi,R(IdlePDPT) +#endif + ALLOCPAGES(NPGPTD) + movl %esi,R(IdlePTD) + +/* Allocate UPAGES */ + ALLOCPAGES(UAREA_PAGES) + movl %esi,R(p0upa) + addl $KERNBASE, %esi + movl %esi, R(proc0uarea) + + ALLOCPAGES(KSTACK_PAGES) + movl %esi,R(p0kpa) + addl $KERNBASE, %esi + movl %esi, R(proc0kstack) +#if 0 + ALLOCPAGES(1) /* vm86/bios stack */ + movl %esi,R(vm86phystk) + + ALLOCPAGES(3) /* pgtable + ext + IOPAGES */ + movl %esi,R(vm86pa) + addl $KERNBASE, %esi + movl %esi, R(vm86paddr) +#endif +#ifdef SMP +/* Allocate cpu0's private data page */ + ALLOCPAGES(1) + movl %esi,R(cpu0pp) + addl $KERNBASE, %esi + movl %esi, R(cpu0prvpage) /* relocated to KVM space */ + +/* Allocate SMP page table page */ + ALLOCPAGES(1) + movl %esi,R(SMPptpa) + addl $KERNBASE, %esi + movl 
%esi, R(SMPpt) /* relocated to KVM space */ +#endif /* SMP */ + +/* Map page zero read-write so bios32 calls can use it */ + xorl %eax, %eax + movl $PG_RW,%edx + movl $1,%ecx + fillkptphys(%edx) + +/* Map read-only from page 1 to the beginning of the kernel text section */ + movl $PAGE_SIZE, %eax + xorl %edx,%edx + movl $R(btext),%ecx + addl $PAGE_MASK,%ecx + subl %eax,%ecx + shrl $PAGE_SHIFT,%ecx + fillkptphys(%edx) + +/* + * Enable PSE and PGE. + */ +#ifndef DISABLE_PSE + testl $CPUID_PSE, R(cpu_feature) + jz 1f + movl $PG_PS, R(pseflag) + movl %cr4, %eax + orl $CR4_PSE, %eax + movl %eax, %cr4 +1: +#endif +#ifndef DISABLE_PG_G + testl $CPUID_PGE, R(cpu_feature) + jz 2f + movl $PG_G, R(pgeflag) + movl %cr4, %eax + orl $CR4_PGE, %eax + movl %eax, %cr4 +2: +#endif + +/* + * Write page tables for the kernel starting at btext and + * until the end. Make sure to map read+write. We do this even + * if we've enabled PSE above, we'll just switch the corresponding kernel + * PDEs before we turn on paging. + * + * XXX: We waste some pages here in the PSE case! DON'T BLINDLY REMOVE + * THIS! SMP needs the page table to be there to map the kernel P==V. + */ + movl $R(btext),%eax + addl $PAGE_MASK, %eax + andl $~PAGE_MASK, %eax + movl $PG_RW,%edx + movl R(KERNend),%ecx + subl %eax,%ecx + shrl $PAGE_SHIFT,%ecx + fillkptphys(%edx) + +/* Map page directory. */ + movl R(IdlePTD), %eax + movl $NPGPTD, %ecx + fillkptphys($PG_RW) + +/* Map proc0's UPAGES in the physical way ... */ + movl R(p0upa), %eax + movl $(UAREA_PAGES), %ecx + fillkptphys($PG_RW) + +/* Map proc0's KSTACK in the physical way ... 
*/ + movl R(p0kpa), %eax + movl $(KSTACK_PAGES), %ecx + fillkptphys($PG_RW) + +/* Map ISA hole */ + movl $ISA_HOLE_START, %eax + movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx + fillkptphys($PG_RW) +#if 0 +/* Map space for the vm86 region */ + movl R(vm86phystk), %eax + movl $4, %ecx + fillkptphys($PG_RW) + +/* Map page 0 into the vm86 page table */ + movl $0, %eax + movl $0, %ebx + movl $1, %ecx + fillkpt(R(vm86pa), $PG_RW|PG_U) + +/* ...likewise for the ISA hole */ + movl $ISA_HOLE_START, %eax + movl $ISA_HOLE_START>>PAGE_SHIFT, %ebx + movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx + fillkpt(R(vm86pa), $PG_RW|PG_U) +#endif +#ifdef SMP +/* Map cpu0's private page into global kmem (4K @ cpu0prvpage) */ + movl R(cpu0pp), %eax + movl $1, %ecx + fillkptphys($PG_RW) + +/* Map SMP page table page into global kmem FWIW */ + movl R(SMPptpa), %eax + movl $1, %ecx + fillkptphys($PG_RW) + +/* Map the private page into the SMP page table */ + movl R(cpu0pp), %eax + movl $0, %ebx /* pte offset = 0 */ + movl $1, %ecx /* one private page coming right up */ + fillkpt(R(SMPptpa), $PG_RW) + +/* ... and put the page table table in the pde. */ + movl R(SMPptpa), %eax + movl $MPPTDI, %ebx + movl $1, %ecx + fillkpt(R(IdlePTD), $PG_RW) + +/* Fakeup VA for the local apic to allow early traps. */ + ALLOCPAGES(1) + movl %esi, %eax + movl $(NPTEPG-1), %ebx /* pte offset = NTEPG-1 */ + movl $1, %ecx /* one private pt coming right up */ + fillkpt(R(SMPptpa), $PG_RW) +#endif /* SMP */ + +/* install a pde for temporary double map of bottom of VA */ + movl R(KPTphys), %eax + xorl %ebx, %ebx + movl $NKPT, %ecx + fillkpt(R(IdlePTD), $PG_RW) + +/* + * For the non-PSE case, install PDEs for PTs covering the kernel. + * For the PSE case, do the same, but clobber the ones corresponding + * to the kernel (from btext to KERNend) with 4M ('PS') PDEs immediately + * after. 
+ */ + movl R(KPTphys), %eax + movl $KPTDI, %ebx + movl $NKPT, %ecx + fillkpt(R(IdlePTD), $PG_RW) + cmpl $0,R(pseflag) + je done_pde + + movl R(KERNend), %ecx + movl $KERNLOAD, %eax + subl %eax, %ecx + shrl $PDRSHIFT, %ecx + movl $(KPTDI+(KERNLOAD/(1 << PDRSHIFT))), %ebx + shll $PDESHIFT, %ebx + addl R(IdlePTD), %ebx + orl $(PG_V|PG_RW|PG_PS), %eax +1: movl %eax, (%ebx) + addl $(1 << PDRSHIFT), %eax + addl $PDESIZE, %ebx + loop 1b + +done_pde: +/* install a pde recursively mapping page directory as a page table */ + movl R(IdlePTD), %eax + movl $PTDPTDI, %ebx + movl $NPGPTD,%ecx + fillkpt(R(IdlePTD), $PG_RW) + +#ifdef PAE + movl R(IdlePTD), %eax + xorl %ebx, %ebx + movl $NPGPTD, %ecx + fillkpt(R(IdlePDPT), $0x0) +#endif + + ret diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/machdep.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/machdep.c new file mode 100644 index 0000000000..ea813b897c --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/machdep.c @@ -0,0 +1,2396 @@ +/*- + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/machdep.c,v 1.584 2003/12/03 21:12:09 jhb Exp $"); + +#include "opt_apic.h" +#include "opt_atalk.h" +#include "opt_compat.h" +#include "opt_cpu.h" +#include "opt_ddb.h" +#include "opt_inet.h" +#include "opt_ipx.h" +#include "opt_isa.h" +#include "opt_kstack_pages.h" +#include "opt_maxmem.h" +#include "opt_msgbuf.h" +#include "opt_npx.h" +#include "opt_perfmon.h" +#include "opt_xen.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/imgact.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/linker.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/reboot.h> +#include <sys/callout.h> +#include <sys/msgbuf.h> +#include <sys/sched.h> 
+#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/ucontext.h> +#include <sys/vmmeter.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_pager.h> +#include <vm/vm_extern.h> + +#include <sys/user.h> +#include <sys/exec.h> +#include <sys/cons.h> + +#ifdef DDB +#ifndef KDB +#error KDB must be enabled in order for DDB to work! +#endif +#include <ddb/ddb.h> +#include <ddb/db_sym.h> +#endif + +#include <net/netisr.h> + +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/reg.h> +#include <machine/clock.h> +#include <machine/specialreg.h> +#include <machine/bootinfo.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/pc/bios.h> +#include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */ +#include <machine/proc.h> +#ifdef PERFMON +#include <machine/perfmon.h> +#endif +#ifdef SMP +#include <machine/privatespace.h> +#include <machine/smp.h> +#endif + +#ifdef DEV_ISA +#include <i386/isa/icu.h> +#endif + +#include <isa/rtc.h> +#include <sys/ptrace.h> +#include <machine/sigframe.h> + + +/* XEN includes */ +#include <machine/hypervisor-ifs.h> +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/xenfunc.h> +#include <machine/xenvar.h> +#include <machine/xen_intr.h> + +void Xhypervisor_callback(void); +void failsafe_callback(void); + +/***************/ + + +/* Sanity check for __curthread() */ +CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); + +extern void init386(void); +extern void dblfault_handler(void); + +extern void printcpuinfo(void); /* XXX header file */ +extern void finishidentcpu(void); +extern void panicifcpuunsupported(void); +extern void initializecpu(void); +void initvalues(start_info_t *startinfo); + +#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) +#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) 
& ~PSL_USERCHANGE) == 0) + +#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif +#if defined(CPU_DISABLE_SSE) +#undef CPU_ENABLE_SSE +#endif + +static void cpu_startup(void *); +static void fpstate_drop(struct thread *td); +static void get_fpcontext(struct thread *td, mcontext_t *mcp); +static int set_fpcontext(struct thread *td, const mcontext_t *mcp); +#ifdef CPU_ENABLE_SSE +static void set_fpregs_xmm(struct save87 *, struct savexmm *); +static void fill_fpregs_xmm(struct savexmm *, struct save87 *); +#endif /* CPU_ENABLE_SSE */ +SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) + +#ifdef DDB +extern vm_offset_t ksym_start, ksym_end; +#endif + +int _udatasel, _ucodesel; +u_int basemem; + +start_info_t *xen_start_info; +unsigned long *xen_phys_machine; +int xendebug_flags; +int init_first = 0; +int cold = 1; + +#ifdef COMPAT_43 +static void osendsig(sig_t catcher, int sig, sigset_t *mask, u_long code); +#endif +#ifdef COMPAT_FREEBSD4 +static void freebsd4_sendsig(sig_t catcher, int sig, sigset_t *mask, + u_long code); +#endif + +long Maxmem = 0; + +vm_paddr_t phys_avail[10]; + +/* must be 2 less so 0 0 can signal end of chunks */ +#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) + +struct kva_md_info kmi; + +static struct trapframe proc0_tf; +#ifndef SMP +static struct pcpu __pcpu; +#endif + +static void +map_range(void *physptr, unsigned long physptrindex, + unsigned long physindex, int count, unsigned int flags) { + int i; + unsigned long pte, ppa; + for (i = 0; i < count; i++) { + pte = ((unsigned long)physptr) + (physptrindex << 2) + (i << 2); + ppa = (PTOM(physindex + i) << PAGE_SHIFT) | flags | PG_V | PG_A; + xpq_queue_pt_update((pt_entry_t *)pte, ppa); + } + mcl_flush_queue(); +} + +struct mem_range_softc mem_range_softc; + +static void +cpu_startup(void *dummy) +{ + /* + * Good {morning,afternoon,evening,night}. 
+ */ + /* XXX need to write clock driver */ + startrtclock(); + + printcpuinfo(); + panicifcpuunsupported(); +#ifdef PERFMON + perfmon_init(); +#endif + printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), + ptoa((uintmax_t)Maxmem) / 1048576); + /* + * Display any holes after the first chunk of extended memory. + */ + if (bootverbose) { + int indx; + + printf("Physical memory chunk(s):\n"); + for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { + vm_paddr_t size; + + size = phys_avail[indx + 1] - phys_avail[indx]; + printf( + "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", + (uintmax_t)phys_avail[indx], + (uintmax_t)phys_avail[indx + 1] - 1, + (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); + } + } + + vm_ksubmap_init(&kmi); + + printf("avail memory = %ju (%ju MB)\n", + ptoa((uintmax_t)cnt.v_free_count), + ptoa((uintmax_t)cnt.v_free_count) / 1048576); + + /* + * Set up buffers, so they can be used to read disk labels. + */ + bufinit(); + vm_pager_bufferinit(); + + cpu_setregs(); + +} + +/* + * Send an interrupt to process. + * + * Stack is set up to allow sigcode stored + * at top to call routine, followed by kcall + * to sigreturn routine below. After sigreturn + * resets the signal mask, the stack, and the + * frame pointer, it returns to the user + * specified pc, psl. + */ +#ifdef COMPAT_43 +static void +osendsig(catcher, sig, mask, code) + sig_t catcher; + int sig; + sigset_t *mask; + u_long code; +{ + struct osigframe sf, *fp; + struct proc *p; + struct thread *td; + struct sigacts *psp; + struct trapframe *regs; + int oonstack; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + psp = p->p_sigacts; + mtx_assert(&psp->ps_mtx, MA_OWNED); + regs = td->td_frame; + oonstack = sigonstack(regs->tf_esp); + + /* Allocate space for the signal handler context. 
*/ + if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && + SIGISMEMBER(psp->ps_sigonstack, sig)) { + fp = (struct osigframe *)(td->td_sigstk.ss_sp + + td->td_sigstk.ss_size - sizeof(struct osigframe)); +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + td->td_sigstk.ss_flags |= SS_ONSTACK; +#endif + } else + fp = (struct osigframe *)regs->tf_esp - 1; + + /* Translate the signal if appropriate. */ + if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) + sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; + + /* Build the argument list for the signal handler. */ + sf.sf_signum = sig; + sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; + if (SIGISMEMBER(psp->ps_siginfo, sig)) { + /* Signal handler installed with SA_SIGINFO. */ + sf.sf_arg2 = (register_t)&fp->sf_siginfo; + sf.sf_siginfo.si_signo = sig; + sf.sf_siginfo.si_code = code; + sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; + } else { + /* Old FreeBSD-style arguments. */ + sf.sf_arg2 = code; + sf.sf_addr = regs->tf_err; + sf.sf_ahu.sf_handler = catcher; + } + mtx_unlock(&psp->ps_mtx); + PROC_UNLOCK(p); + + /* Save most if not all of trap frame. */ + sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; + sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; + sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; + sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; + sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; + sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; + sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; + sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; + sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; + sf.sf_siginfo.si_sc.sc_es = regs->tf_es; + sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; + sf.sf_siginfo.si_sc.sc_gs = rgs(); + sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; + + /* Build the signal context to be used by osigreturn(). */ + sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 
1 : 0; + SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); + sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; + sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; + sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; + sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; + sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; + sf.sf_siginfo.si_sc.sc_err = regs->tf_err; + + /* + * Copy the sigframe out to the user's stack. + */ + if (copyout(&sf, fp, sizeof(*fp)) != 0) { +#ifdef DEBUG + printf("process %ld has trashed its stack\n", (long)p->p_pid); +#endif + PROC_LOCK(p); + sigexit(td, SIGILL); + } + + regs->tf_esp = (int)fp; + regs->tf_eip = PS_STRINGS - szosigcode; + regs->tf_eflags &= ~PSL_T; + regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _udatasel; + load_gs(_udatasel); + regs->tf_ss = _udatasel; + PROC_LOCK(p); + mtx_lock(&psp->ps_mtx); +} +#endif /* COMPAT_43 */ + +#ifdef COMPAT_FREEBSD4 +static void +freebsd4_sendsig(catcher, sig, mask, code) + sig_t catcher; + int sig; + sigset_t *mask; + u_long code; +{ + struct sigframe4 sf, *sfp; + struct proc *p; + struct thread *td; + struct sigacts *psp; + struct trapframe *regs; + int oonstack; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + psp = p->p_sigacts; + mtx_assert(&psp->ps_mtx, MA_OWNED); + regs = td->td_frame; + oonstack = sigonstack(regs->tf_esp); + + /* Save user context. */ + bzero(&sf, sizeof(sf)); + sf.sf_uc.uc_sigmask = *mask; + sf.sf_uc.uc_stack = td->td_sigstk; + sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) + ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; + sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; + sf.sf_uc.uc_mcontext.mc_gs = rgs(); + bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); + + /* Allocate space for the signal handler context. 
*/ + if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && + SIGISMEMBER(psp->ps_sigonstack, sig)) { + sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + + td->td_sigstk.ss_size - sizeof(struct sigframe4)); +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + td->td_sigstk.ss_flags |= SS_ONSTACK; +#endif + } else + sfp = (struct sigframe4 *)regs->tf_esp - 1; + + /* Translate the signal if appropriate. */ + if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) + sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; + + /* Build the argument list for the signal handler. */ + sf.sf_signum = sig; + sf.sf_ucontext = (register_t)&sfp->sf_uc; + if (SIGISMEMBER(psp->ps_siginfo, sig)) { + /* Signal handler installed with SA_SIGINFO. */ + sf.sf_siginfo = (register_t)&sfp->sf_si; + sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; + + /* Fill in POSIX parts */ + sf.sf_si.si_signo = sig; + sf.sf_si.si_code = code; + sf.sf_si.si_addr = (void *)regs->tf_err; + } else { + /* Old FreeBSD-style arguments. */ + sf.sf_siginfo = code; + sf.sf_addr = regs->tf_err; + sf.sf_ahu.sf_handler = catcher; + } + mtx_unlock(&psp->ps_mtx); + PROC_UNLOCK(p); + + /* + * Copy the sigframe out to the user's stack. 
+ */ + if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { +#ifdef DEBUG + printf("process %ld has trashed its stack\n", (long)p->p_pid); +#endif + PROC_LOCK(p); + sigexit(td, SIGILL); + } + + regs->tf_esp = (int)sfp; + regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; + regs->tf_eflags &= ~PSL_T; + regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _udatasel; + regs->tf_ss = _udatasel; + PROC_LOCK(p); + mtx_lock(&psp->ps_mtx); +} +#endif /* COMPAT_FREEBSD4 */ + +void +sendsig(catcher, sig, mask, code) + sig_t catcher; + int sig; + sigset_t *mask; + u_long code; +{ + struct sigframe sf, *sfp; + struct proc *p; + struct thread *td; + struct sigacts *psp; + char *sp; + struct trapframe *regs; + int oonstack; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + psp = p->p_sigacts; + mtx_assert(&psp->ps_mtx, MA_OWNED); +#ifdef COMPAT_FREEBSD4 + if (SIGISMEMBER(psp->ps_freebsd4, sig)) { + freebsd4_sendsig(catcher, sig, mask, code); + return; + } +#endif +#ifdef COMPAT_43 + if (SIGISMEMBER(psp->ps_osigset, sig)) { + osendsig(catcher, sig, mask, code); + return; + } +#endif + regs = td->td_frame; + oonstack = sigonstack(regs->tf_esp); + + /* Save user context. */ + bzero(&sf, sizeof(sf)); + sf.sf_uc.uc_sigmask = *mask; + sf.sf_uc.uc_stack = td->td_sigstk; + sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) + ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; + sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; + sf.sf_uc.uc_mcontext.mc_gs = rgs(); + bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); + sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ + get_fpcontext(td, &sf.sf_uc.uc_mcontext); + fpstate_drop(td); + + /* Allocate space for the signal handler context. 
*/ + if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && + SIGISMEMBER(psp->ps_sigonstack, sig)) { + sp = td->td_sigstk.ss_sp + + td->td_sigstk.ss_size - sizeof(struct sigframe); +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + td->td_sigstk.ss_flags |= SS_ONSTACK; +#endif + } else + sp = (char *)regs->tf_esp - sizeof(struct sigframe); + /* Align to 16 bytes. */ + sfp = (struct sigframe *)((unsigned int)sp & ~0xF); + + /* Translate the signal if appropriate. */ + if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) + sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; + + /* Build the argument list for the signal handler. */ + sf.sf_signum = sig; + sf.sf_ucontext = (register_t)&sfp->sf_uc; + if (SIGISMEMBER(psp->ps_siginfo, sig)) { + /* Signal handler installed with SA_SIGINFO. */ + sf.sf_siginfo = (register_t)&sfp->sf_si; + sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; + + /* Fill in POSIX parts */ + sf.sf_si.si_signo = sig; + sf.sf_si.si_code = code; + sf.sf_si.si_addr = (void *)regs->tf_err; + } else { + /* Old FreeBSD-style arguments. */ + sf.sf_siginfo = code; + sf.sf_addr = regs->tf_err; + sf.sf_ahu.sf_handler = catcher; + } + mtx_unlock(&psp->ps_mtx); + PROC_UNLOCK(p); + /* + * Copy the sigframe out to the user's stack. 
+ */ + if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { +#ifdef DEBUG + printf("process %ld has trashed its stack\n", (long)p->p_pid); +#endif + PROC_LOCK(p); + sigexit(td, SIGILL); + } + + regs->tf_esp = (int)sfp; + regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); + regs->tf_eflags &= ~PSL_T; + regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _udatasel; + regs->tf_ss = _udatasel; + PROC_LOCK(p); + mtx_lock(&psp->ps_mtx); +} + +/* + * Build siginfo_t for SA thread + */ +void +cpu_thread_siginfo(int sig, u_long code, siginfo_t *si) +{ + struct proc *p; + struct thread *td; + + td = curthread; + p = td->td_proc; + PROC_LOCK_ASSERT(p, MA_OWNED); + + bzero(si, sizeof(*si)); + si->si_signo = sig; + si->si_code = code; + si->si_addr = (void *)td->td_frame->tf_err; + /* XXXKSE fill other fields */ +} + +/* + * System call to cleanup state after a signal + * has been taken. Reset signal mask and + * stack state from context left by sendsig (above). + * Return to previous pc and psl as specified by + * context left by sendsig. Check carefully to + * make sure that the user has not modified the + * state to gain improper privileges. + * + * MPSAFE + */ +#ifdef COMPAT_43 +int +osigreturn(td, uap) + struct thread *td; + struct osigreturn_args /* { + struct osigcontext *sigcntxp; + } */ *uap; +{ + struct osigcontext sc; + struct trapframe *regs; + struct osigcontext *scp; + struct proc *p = td->td_proc; + int eflags, error; + + regs = td->td_frame; + error = copyin(uap->sigcntxp, &sc, sizeof(sc)); + if (error != 0) + return (error); + scp = ≻ + eflags = scp->sc_ps; + /* + * Don't allow users to change privileged or reserved flags. + */ + /* + * XXX do allow users to change the privileged flag PSL_RF. + * The cpu sets PSL_RF in tf_eflags for faults. Debuggers + * should sometimes set it there too. 
tf_eflags is kept in + * the signal context during signal handling and there is no + * other place to remember it, so the PSL_RF bit may be + * corrupted by the signal handler without us knowing. + * Corruption of the PSL_RF bit at worst causes one more or + * one less debugger trap, so allowing it is fairly harmless. + */ + if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { + return (EINVAL); + } + + /* + * Don't allow users to load a valid privileged %cs. Let the + * hardware check for invalid selectors, excess privilege in + * other selectors, invalid %eip's and invalid %esp's. + */ + if (!CS_SECURE(scp->sc_cs)) { + trapsignal(td, SIGBUS, T_PROTFLT); + return (EINVAL); + } + regs->tf_ds = scp->sc_ds; + regs->tf_es = scp->sc_es; + regs->tf_fs = scp->sc_fs; + + /* Restore remaining registers. */ + regs->tf_eax = scp->sc_eax; + regs->tf_ebx = scp->sc_ebx; + regs->tf_ecx = scp->sc_ecx; + regs->tf_edx = scp->sc_edx; + regs->tf_esi = scp->sc_esi; + regs->tf_edi = scp->sc_edi; + regs->tf_cs = scp->sc_cs; + regs->tf_ss = scp->sc_ss; + regs->tf_isp = scp->sc_isp; + regs->tf_ebp = scp->sc_fp; + regs->tf_esp = scp->sc_sp; + regs->tf_eip = scp->sc_pc; + regs->tf_eflags = eflags; + + PROC_LOCK(p); +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + if (scp->sc_onstack & 1) + td->td_sigstk.ss_flags |= SS_ONSTACK; + else + td->td_sigstk.ss_flags &= ~SS_ONSTACK; +#endif + SIGSETOLD(td->td_sigmask, scp->sc_mask); + SIG_CANTMASK(td->td_sigmask); + signotify(td); + PROC_UNLOCK(p); + return (EJUSTRETURN); +} +#endif /* COMPAT_43 */ + +#ifdef COMPAT_FREEBSD4 +/* + * MPSAFE + */ +int +freebsd4_sigreturn(td, uap) + struct thread *td; + struct freebsd4_sigreturn_args /* { + const ucontext4 *sigcntxp; + } */ *uap; +{ + struct ucontext4 uc; + struct proc *p = td->td_proc; + struct trapframe *regs; + const struct ucontext4 *ucp; + int cs, eflags, error; + + error = copyin(uap->sigcntxp, &uc, sizeof(uc)); + if (error != 0) + return (error); + ucp = &uc; + regs = td->td_frame; + 
eflags = ucp->uc_mcontext.mc_eflags; + /* + * Don't allow users to change privileged or reserved flags. + */ + /* + * XXX do allow users to change the privileged flag PSL_RF. + * The cpu sets PSL_RF in tf_eflags for faults. Debuggers + * should sometimes set it there too. tf_eflags is kept in + * the signal context during signal handling and there is no + * other place to remember it, so the PSL_RF bit may be + * corrupted by the signal handler without us knowing. + * Corruption of the PSL_RF bit at worst causes one more or + * one less debugger trap, so allowing it is fairly harmless. + */ + if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { + printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags); + return (EINVAL); + } + + /* + * Don't allow users to load a valid privileged %cs. Let the + * hardware check for invalid selectors, excess privilege in + * other selectors, invalid %eip's and invalid %esp's. + */ + cs = ucp->uc_mcontext.mc_cs; + if (!CS_SECURE(cs)) { + printf("freebsd4_sigreturn: cs = 0x%x\n", cs); + trapsignal(td, SIGBUS, T_PROTFLT); + return (EINVAL); + } + + bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); + + PROC_LOCK(p); +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + if (ucp->uc_mcontext.mc_onstack & 1) + td->td_sigstk.ss_flags |= SS_ONSTACK; + else + td->td_sigstk.ss_flags &= ~SS_ONSTACK; +#endif + + td->td_sigmask = ucp->uc_sigmask; + SIG_CANTMASK(td->td_sigmask); + signotify(td); + PROC_UNLOCK(p); + return (EJUSTRETURN); +} +#endif /* COMPAT_FREEBSD4 */ + +/* + * MPSAFE + */ +int +sigreturn(td, uap) + struct thread *td; + struct sigreturn_args /* { + const __ucontext *sigcntxp; + } */ *uap; +{ + ucontext_t uc; + struct proc *p = td->td_proc; + struct trapframe *regs; + const ucontext_t *ucp; + int cs, eflags, error, ret; + + error = copyin(uap->sigcntxp, &uc, sizeof(uc)); + if (error != 0) + return (error); + ucp = &uc; + regs = td->td_frame; + eflags = ucp->uc_mcontext.mc_eflags; + /* + * Don't allow users to change 
privileged or reserved flags. + */ + /* + * XXX do allow users to change the privileged flag PSL_RF. + * The cpu sets PSL_RF in tf_eflags for faults. Debuggers + * should sometimes set it there too. tf_eflags is kept in + * the signal context during signal handling and there is no + * other place to remember it, so the PSL_RF bit may be + * corrupted by the signal handler without us knowing. + * Corruption of the PSL_RF bit at worst causes one more or + * one less debugger trap, so allowing it is fairly harmless. + */ +#if 0 + if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { + __asm__("int $0x3"); + printf("sigreturn: eflags = 0x%x\n", eflags); + return (EINVAL); + } +#endif + /* + * Don't allow users to load a valid privileged %cs. Let the + * hardware check for invalid selectors, excess privilege in + * other selectors, invalid %eip's and invalid %esp's. + */ + cs = ucp->uc_mcontext.mc_cs; + if (!CS_SECURE(cs)) { + __asm__("int $0x3"); + printf("sigreturn: cs = 0x%x\n", cs); + trapsignal(td, SIGBUS, T_PROTFLT); + return (EINVAL); + } + + ret = set_fpcontext(td, &ucp->uc_mcontext); + if (ret != 0) + return (ret); + bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); + PROC_LOCK(p); +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + if (ucp->uc_mcontext.mc_onstack & 1) + td->td_sigstk.ss_flags |= SS_ONSTACK; + else + td->td_sigstk.ss_flags &= ~SS_ONSTACK; +#endif + + td->td_sigmask = ucp->uc_sigmask; + SIG_CANTMASK(td->td_sigmask); + signotify(td); + PROC_UNLOCK(p); + return (EJUSTRETURN); +} + +/* + * Machine dependent boot() routine + * + * I haven't seen anything to put here yet + * Possibly some stuff might be grafted back here from boot() + */ +void +cpu_boot(int howto) +{ +} + +/* + * Shutdown the CPU as much as possible + */ +void +cpu_halt(void) +{ + HYPERVISOR_shutdown(); +} + +/* + * Hook to idle the CPU when possible. 
In the SMP case we default to + * off because a halted cpu will not currently pick up a new thread in the + * run queue until the next timer tick. If turned on this will result in + * approximately a 4.2% loss in real time performance in buildworld tests + * (but improves user and sys times oddly enough), and saves approximately + * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). + * + * XXX we need to have a cpu mask of idle cpus and generate an IPI or + * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. + * Then we can have our cake and eat it too. + * + * XXX I'm turning it on for SMP as well by default for now. It seems to + * help lock contention somewhat, and this is critical for HTT. -Peter + */ +static int cpu_idle_hlt = 1; +SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, + &cpu_idle_hlt, 0, "Idle loop HLT enable"); + +static void +cpu_idle_default(void) +{ +#if 0 + /* + * we must absolutely guarentee that hlt is the + * absolute next instruction after sti or we + * introduce a timing window. + */ + __asm __volatile("sti; hlt"); +#endif + idle_block(); + enable_intr(); +} + +/* + * Note that we have to be careful here to avoid a race between checking + * sched_runnable() and actually halting. If we don't do this, we may waste + * the time between calling hlt and the next interrupt even though there + * is a runnable process. + */ +void +cpu_idle(void) +{ + +#ifdef SMP + if (mp_grab_cpu_hlt()) + return; +#endif + + if (cpu_idle_hlt) { + disable_intr(); + if (sched_runnable()) + enable_intr(); + else + (*cpu_idle_hook)(); + } +} + +/* Other subsystems (e.g., ACPI) can hook this later. 
*/ +void (*cpu_idle_hook)(void) = cpu_idle_default; + +/* + * Clear registers on exec + */ +void +exec_setregs(td, entry, stack, ps_strings) + struct thread *td; + u_long entry; + u_long stack; + u_long ps_strings; +{ + struct trapframe *regs = td->td_frame; + struct pcb *pcb = td->td_pcb; + + /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ + pcb->pcb_gs = _udatasel; + load_gs(_udatasel); + + if (td->td_proc->p_md.md_ldt) + user_ldt_free(td); + + bzero((char *)regs, sizeof(struct trapframe)); + regs->tf_eip = entry; + regs->tf_esp = stack; + regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); + regs->tf_ss = _udatasel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _udatasel; + regs->tf_cs = _ucodesel; + + /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ + regs->tf_ebx = ps_strings; + + /* + * Reset the hardware debug registers if they were in use. + * They won't have any meaning for the newly exec'd process. + */ + if (pcb->pcb_flags & PCB_DBREGS) { + pcb->pcb_dr0 = 0; + pcb->pcb_dr1 = 0; + pcb->pcb_dr2 = 0; + pcb->pcb_dr3 = 0; + pcb->pcb_dr6 = 0; + pcb->pcb_dr7 = 0; + if (pcb == PCPU_GET(curpcb)) { + /* + * Clear the debug registers on the running + * CPU, otherwise they will end up affecting + * the next process we switch to. + */ + reset_dbregs(); + } + pcb->pcb_flags &= ~PCB_DBREGS; + } + + /* + * Initialize the math emulator (if any) for the current process. + * Actually, just clear the bit that says that the emulator has + * been initialized. Initialization is delayed until the process + * traps to the emulator (if it is done at all) mainly because + * emulators don't provide an entry point for initialization. + */ + td->td_pcb->pcb_flags &= ~FP_SOFTFP; + + /* Initialize the npx (if any) for the current process. */ + /* + * XXX the above load_cr0() also initializes it and is a layering + * violation if NPX is configured. 
It drops the npx partially + * and this would be fatal if we were interrupted now, and decided + * to force the state to the pcb, and checked the invariant + * (CR0_TS clear) if and only if PCPU_GET(fpcurthread) != NULL). + * ALL of this can happen except the check. The check used to + * happen and be fatal later when we didn't complete the drop + * before returning to user mode. This should be fixed properly + * soon. + */ + fpstate_drop(td); + + /* + * XXX - Linux emulator + * Make sure sure edx is 0x0 on entry. Linux binaries depend + * on it. + */ + td->td_retval[1] = 0; +} + +void +cpu_setregs(void) +{ + /* nothing for Xen to do */ +} + +static int +sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) +{ + int error; + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, + req); + if (!error && req->newptr) + resettodr(); + return (error); +} + +SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, + &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); + +SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, + CTLFLAG_RW, &disable_rtc_set, 0, ""); + +SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, + CTLFLAG_RD, &bootinfo, bootinfo, ""); + +u_long bootdev; /* not a dev_t - encoding is different */ +SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, + CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); + +/* + * Initialize 386 and configure to run kernel + */ + +/* + * Initialize segments & interrupt table + */ + +int _default_ldt; +union descriptor *gdt; /* global descriptor table */ +static struct gate_descriptor idt0[NIDT]; +struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ +union descriptor *ldt; /* local descriptor table */ +struct region_descriptor r_idt; /* table descriptors */ + +int private_tss; /* flag indicating private tss */ + +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +extern int has_f00f_bug; +#endif + +static struct i386tss dblfault_tss; +static char 
dblfault_stack[PAGE_SIZE]; + +extern struct user *proc0uarea; +extern vm_offset_t proc0kstack; + + +/* software prototypes -- in more palatable form */ +struct soft_segment_descriptor gdt_segs[] = { +/* GNULL_SEL 0 Null Descriptor */ +{ 0x0, /* segment base address */ + 0x0, /* length */ + 0, /* segment type */ + SEL_KPL, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GCODE_SEL 1 Code Descriptor for kernel */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + +/* GDATA_SEL 2 Data Descriptor for kernel */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + +/* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_KPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +#if 0 +/* GPROC0_SEL 4 Proc 0 Tss Descriptor */ +{ + 0x0, /* segment base address */ + sizeof(struct i386tss)-1,/* length */ + SDT_SYS386TSS, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GLDT_SEL 5 LDT Descriptor */ +{ (int) ldt, /* segment base address */ + sizeof(ldt)-1, /* length - all address space */ + SDT_SYSLDT, /* segment type */ 
+ SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GUSERLDT_SEL 6 User LDT Descriptor per process */ +{ (int) ldt, /* segment base address */ + (512 * sizeof(union descriptor)-1), /* length */ + SDT_SYSLDT, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GTGATE_SEL 7 Null Descriptor - Placeholder */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ +{ 0x400, /* segment base address */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GPANIC_SEL 9 Panic Tss Descriptor */ +{ (int) &dblfault_tss, /* segment base address */ + sizeof(struct i386tss)-1,/* length - all address space */ + SDT_SYS386TSS, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* unused - default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, +/* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMERA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit 
Code) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMERA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +/* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */ +{ 0, /* segment base address (overwritten) */ + 0xfffff, /* length */ + SDT_MEMRWA, /* segment type */ + 0, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +#endif +}; + +static struct soft_segment_descriptor ldt_segs[] = { + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity 
(byte/page units)*/ }, + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + /* Code Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMERA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, + /* Null Descriptor - overwritten by call gate */ +{ 0x0, /* segment base address */ + 0x0, /* length - all address space */ + 0, /* segment type */ + 0, /* segment descriptor priority level */ + 0, /* segment descriptor present */ + 0, 0, + 0, /* default 32 vs 16 bit size */ + 0 /* limit granularity (byte/page units)*/ }, + /* Data Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, 0, + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, +}; + +struct proc_ldt default_proc_ldt; + +void +setidt(idx, func, typ, dpl, selec) + int idx; + inthand_t *func; + int typ; + int dpl; + int selec; +{ + struct gate_descriptor *ip; + + ip = idt + idx; + ip->gd_looffset = (int)func; + ip->gd_selector = selec; + ip->gd_stkcpy = 0; + ip->gd_xx = 0; + ip->gd_type = typ; + ip->gd_dpl = dpl; + ip->gd_p = 1; + ip->gd_hioffset = ((int)func)>>16 ; +} + +#define IDTVEC(name) __CONCAT(X,name) + +extern inthand_t + IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), + IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), + IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), + IDTVEC(page), 
IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), + IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + +#ifdef DDB +/* + * Display the index and function name of any IDT entries that don't use + * the default 'rsvd' entry point. + */ +DB_SHOW_COMMAND(idt, db_show_idt) +{ + struct gate_descriptor *ip; + int idx, quit; + uintptr_t func; + + ip = idt; + db_setup_paging(db_simple_pager, &quit, DB_LINES_PER_PAGE); + for (idx = 0, quit = 0; idx < NIDT; idx++) { + func = (ip->gd_hioffset << 16 | ip->gd_looffset); + if (func != (uintptr_t)&IDTVEC(rsvd)) { + db_printf("%3d\t", idx); + db_printsym(func, DB_STGY_PROC); + db_printf("\n"); + } + ip++; + } +} +#endif + +void +sdtossd(sd, ssd) + struct segment_descriptor *sd; + struct soft_segment_descriptor *ssd; +{ + ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; + ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; + ssd->ssd_type = sd->sd_type; + ssd->ssd_dpl = sd->sd_dpl; + ssd->ssd_p = sd->sd_p; + ssd->ssd_def32 = sd->sd_def32; + ssd->ssd_gran = sd->sd_gran; +} + +#define PHYSMAP_SIZE (2 * 8) + +/* + * Populate the (physmap) array with base/bound pairs describing the + * available physical memory in the system, then test this memory and + * build the phys_avail array describing the actually-available memory. + * + * If we cannot accurately determine the physical memory map, then use + * value from the 0xE801 call, and failing that, the RTC. + * + * Total memory size may be set by the kernel environment variable + * hw.physmem or the compile-time define MAXMEM. + * + * XXX first should be vm_paddr_t. 
+ */ +static void +getmemsize(void) +{ + int i; + printf("start_info %p\n", xen_start_info); + printf("start_info->nr_pages %ld\n", xen_start_info->nr_pages); + Maxmem = xen_start_info->nr_pages - init_first; + /* call pmap initialization to make new kernel address space */ + pmap_bootstrap((init_first)<< PAGE_SHIFT, 0); + for (i = 0; i < 10; i++) + phys_avail[i] = 0; +#ifdef MAXMEM + if (MAXMEM/4 < Maxmem) + Maxmem = MAXMEM/4; +#endif + physmem = Maxmem; + avail_end = ptoa(Maxmem) - round_page(MSGBUF_SIZE); + phys_avail[0] = init_first << PAGE_SHIFT; + phys_avail[1] = avail_end; +} + +extern pt_entry_t *KPTphys; +extern int kernbase; +pteinfo_t *pteinfo_list; +unsigned long *xen_machine_phys = ((unsigned long *)VADDR(1008, 0)); + +/* Linux infection */ +#define PAGE_OFFSET KERNBASE +#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) +void +initvalues(start_info_t *startinfo) +{ + int i; + xen_start_info = startinfo; + xen_phys_machine = (unsigned long *)startinfo->mfn_list; + unsigned long tmpindex = ((__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + xen_start_info->nr_pt_frames) + 3 /* number of pages allocated after the pts + 1*/; + xendebug_flags = 0xffffffff; + /* pre-zero unused mapped pages */ + bzero((char *)(KERNBASE + (tmpindex << PAGE_SHIFT)), (1024 - tmpindex)*PAGE_SIZE); + + KPTphys = (pt_entry_t *)xpmap_ptom(__pa(startinfo->pt_base + PAGE_SIZE)); + IdlePTD = (pd_entry_t *)xpmap_ptom(__pa(startinfo->pt_base)); + XENPRINTF("IdlePTD %p\n", IdlePTD); + XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx " + "mod_start: 0x%lx mod_len: 0x%lx\n", + xen_start_info->nr_pages, xen_start_info->shared_info, + xen_start_info->flags, xen_start_info->pt_base, + xen_start_info->mod_start, xen_start_info->mod_len); + + /* Map proc0's UPAGES */ + proc0uarea = (struct user *)(KERNBASE + (tmpindex << PAGE_SHIFT)); + tmpindex += UAREA_PAGES; + + /* Map proc0's KSTACK */ + proc0kstack = KERNBASE + 
(tmpindex << PAGE_SHIFT); + tmpindex += KSTACK_PAGES; + + /* allocate page for gdt */ + gdt = (union descriptor *)(KERNBASE + (tmpindex << PAGE_SHIFT)); + tmpindex++; + + /* allocate page for ldt */ + ldt = (union descriptor *)(KERNBASE + (tmpindex << PAGE_SHIFT)); + tmpindex++; + +#ifdef PMAP_DEBUG + pteinfo_list = (pteinfo_t *)(KERNBASE + (tmpindex << PAGE_SHIFT)); + tmpindex += ((xen_start_info->nr_pages >> 10) + 1)*(1 + XPQ_CALL_DEPTH*XPQ_CALL_COUNT); + + if (tmpindex > 980) + __asm__("int3"); +#endif + /* unmap remaining pages from initial 4MB chunk */ + for (i = tmpindex; i%1024 != 0; i++) + PT_CLEAR(KERNBASE + (i << PAGE_SHIFT), TRUE); + + /* allocate remainder of NKPT pages */ + map_range(IdlePTD, KPTDI + 1, tmpindex, NKPT-1, PG_U | PG_M | PG_RW); + tmpindex += NKPT-1; + map_range(IdlePTD, PTDPTDI, __pa(xen_start_info->pt_base) >> PAGE_SHIFT, 1, 0); + + xpq_queue_pt_update(KPTphys + tmpindex, xen_start_info->shared_info | PG_A | PG_V | PG_RW); + HYPERVISOR_shared_info = (shared_info_t *)(KERNBASE + (tmpindex << PAGE_SHIFT)); + tmpindex++; + + mcl_flush_queue(); + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = (unsigned long)xen_phys_machine; + HYPERVISOR_shared_info->arch.mfn_to_pfn_start = (unsigned long)xen_machine_phys; + + init_first = tmpindex; + +} + +void +init386(void) +{ + int gsel_tss, metadata_missing, off, x, error; + struct pcpu *pc; + trap_info_t trap_table[] = { + { 0, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)}, + { 1, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)}, + { 3, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)}, + { 4, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)}, + /* This is UPL on Linux and KPL on BSD */ + { 5, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)}, + { 6, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)}, + { 7, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)}, + /* + * { 8, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) 
&IDTVEC(XXX)}, + * no handler for double fault + */ + { 9, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)}, + {10, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)}, + {11, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)}, + {12, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)}, + {13, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)}, + {14, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)}, + {15, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)}, + {16, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)}, + {17, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)}, + {18, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)}, + {19, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)}, + {0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)}, + { 0, 0, 0, 0 } + }; + proc0.p_uarea = proc0uarea; + thread0.td_kstack = proc0kstack; + thread0.td_pcb = (struct pcb *) + (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; + + /* + * This may be done better later if it gets more high level + * components in it. If so just link td->td_proc here. 
+ */ + proc_linkup(&proc0, &ksegrp0, &thread0); + + metadata_missing = 0; + if (xen_start_info->mod_start) + preload_metadata = (caddr_t)xen_start_info->mod_start; + else + metadata_missing = 1; + + /* XXX - temporary hack */ + preload_metadata = (caddr_t)0; + /* XXX */ + + if (envmode == 1) + kern_envp = static_env; + else if ((caddr_t)xen_start_info->cmd_line) + kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); + + boothowto |= xen_boothowto(kern_envp); + + if (boothowto & RB_GDB_PAUSE) + __asm__("int $0x3;"); + + /* Init basic tunables, hz etc */ + init_param1(); + /* + * make gdt memory segments, the code segment goes up to end of the + * page with etext in it, the data segment goes to the end of + * the address space + */ +#if 0 + /* + * XEN occupies the upper 64MB of virtual address space + * At its base it manages an array mapping machine page frames + * to physical page frames - hence we need to be able to + * access 4GB - (64MB - 4MB + 64k) + */ + gdt_segs[GCODE_SEL].ssd_limit = atop(0 - ((1 << 26) - (1 << 22) + (1 << 16))); + gdt_segs[GDATA_SEL].ssd_limit = atop(0 - ((1 << 26) - (1 << 22) + (1 << 16))); +#endif +#ifdef SMP + pc = &SMP_prvspace[0].pcpu; + gdt_segs[GPRIV_SEL].ssd_limit = + atop(sizeof(struct privatespace) - 1); +#else + pc = &__pcpu; + gdt_segs[GPRIV_SEL].ssd_limit = + atop(sizeof(struct pcpu) - 1); +#endif + gdt_segs[GPRIV_SEL].ssd_base = (int) pc; + gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + for (x = 0; x < NGDT; x++) + ssdtosd(&gdt_segs[x], &gdt[x].sd); + /* re-map GDT read-only */ + { + unsigned long gdtindex = (((unsigned long)gdt - KERNBASE) >> PAGE_SHIFT); + unsigned long gdtphys = PTOM(gdtindex); + map_range(KPTphys, gdtindex, gdtindex, 1, 0); + mcl_flush_queue(); + if (HYPERVISOR_set_gdt(&gdtphys, LAST_RESERVED_GDT_ENTRY + 1)) { + panic("set_gdt failed\n"); + } + lgdt_finish(); + } + + if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { + panic("set_trap_table failed - error %d\n", error); + } 
+ if ((error = HYPERVISOR_set_fast_trap(0x80)) != 0) { + panic("set_fast_trap failed - error %d\n", error); + } + HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback, + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); + + + + pcpu_init(pc, 0, sizeof(struct pcpu)); + PCPU_SET(prvspace, pc); + PCPU_SET(curthread, &thread0); + PCPU_SET(curpcb, thread0.td_pcb); + PCPU_SET(trap_nesting, 0); + PCPU_SET(pdir, (unsigned long)IdlePTD); + /* + * Initialize mutexes. + * + */ + mutex_init(); + + /* make ldt memory segments */ + /* + * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it + * should be spelled ...MAX_USER... + */ + ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); + ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); + for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) + ssdtosd(&ldt_segs[x], &ldt[x].sd); + default_proc_ldt.ldt_base = (caddr_t)ldt; + default_proc_ldt.ldt_len = 6; + _default_ldt = (int)&default_proc_ldt; + PCPU_SET(currentldt, _default_ldt); + { + unsigned long ldtindex = (((unsigned long)ldt - KERNBASE) >> PAGE_SHIFT); + map_range(KPTphys, ldtindex, ldtindex, 1, 0); + mcl_flush_queue(); + xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); + } + + /* + * Initialize the console before we print anything out. + */ + cninit(); + if (metadata_missing) + printf("WARNING: loader(8) metadata is missing!\n"); + +#ifdef DDB + ksym_start = bootinfo.bi_symtab; + ksym_end = bootinfo.bi_esymtab; +#endif + kdb_init(); +#ifdef KDB + if (boothowto & RB_KDB) + kdb_enter("Boot flags requested debugger"); +#endif + + finishidentcpu(); /* Final stage of CPU initialization */ + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + initializecpu(); /* Initialize CPU registers */ + + /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ + /* Note: -16 is so we can grow the trapframe if we came from vm86 */ + PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); + private_tss = 0; + PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); + PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); + HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), PCPU_GET(common_tss.tss_esp0)); + + dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = + dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; + dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = + dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); + + dblfault_tss.tss_cr3 = (int)IdlePTD; + dblfault_tss.tss_eip = (int)dblfault_handler; + dblfault_tss.tss_eflags = PSL_KERNEL; + dblfault_tss.tss_ds = dblfault_tss.tss_es = + dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); + dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); + dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); + dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + + getmemsize(); + init_param2(physmem); + /* now running on new page tables, configured,and u/iom is accessible */ + /* Map the message buffer. */ + for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) + pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); + PT_UPDATES_FLUSH(); + + /* safe to enable xen page queue locking */ + xpq_init(); + + msgbufinit(msgbufp, MSGBUF_SIZE); + /* XXX KMM I don't think we need call gates */ +#if 0 + printf("modify ldt\n"); + /* make a call gate to reenter kernel with */ + gdp = &ldt[LSYS5CALLS_SEL].gd; + + x = (int) &IDTVEC(lcall_syscall); + gdp->gd_looffset = x; + gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); + gdp->gd_stkcpy = 1; + gdp->gd_type = SDT_SYS386CGT; + gdp->gd_dpl = SEL_UPL; + gdp->gd_p = 1; + gdp->gd_hioffset = x >> 16; + + /* XXX does this work? 
*/ + ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; + ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; +#endif + /* transfer to user mode */ + + _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); + _udatasel = LSEL(LUDATA_SEL, SEL_UPL); + + /* setup proc 0's pcb */ + thread0.td_pcb->pcb_flags = 0; /* XXXKSE */ + thread0.td_pcb->pcb_cr3 = (int)IdlePTD; + thread0.td_pcb->pcb_ext = 0; + thread0.td_frame = &proc0_tf; +} + +void +cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) +{ + + pcpu->pc_acpi_id = 0xffffffff; +} + +/* + * Construct a PCB from a trapframe. This is called from kdb_trap() where + * we want to start a backtrace from the function that caused us to enter + * the debugger. We have the context in the trapframe, but base the trace + * on the PCB. The PCB doesn't have to be perfect, as long as it contains + * enough for a backtrace. + */ +void +makectx(struct trapframe *tf, struct pcb *pcb) +{ + + pcb->pcb_edi = tf->tf_edi; + pcb->pcb_esi = tf->tf_esi; + pcb->pcb_ebp = tf->tf_ebp; + pcb->pcb_ebx = tf->tf_ebx; + pcb->pcb_eip = tf->tf_eip; + pcb->pcb_esp = (ISPL(tf->tf_cs)) ? 
tf->tf_esp : (int)(tf + 1) - 8; +} + +int +ptrace_set_pc(struct thread *td, u_long addr) +{ + + td->td_frame->tf_eip = addr; + return (0); +} + +int +ptrace_single_step(struct thread *td) +{ + td->td_frame->tf_eflags |= PSL_T; + return (0); +} + +int +ptrace_clear_single_step(struct thread *td) +{ + td->td_frame->tf_eflags &= ~PSL_T; + return (0); +} + +int +fill_regs(struct thread *td, struct reg *regs) +{ + struct pcb *pcb; + struct trapframe *tp; + + tp = td->td_frame; + regs->r_fs = tp->tf_fs; + regs->r_es = tp->tf_es; + regs->r_ds = tp->tf_ds; + regs->r_edi = tp->tf_edi; + regs->r_esi = tp->tf_esi; + regs->r_ebp = tp->tf_ebp; + regs->r_ebx = tp->tf_ebx; + regs->r_edx = tp->tf_edx; + regs->r_ecx = tp->tf_ecx; + regs->r_eax = tp->tf_eax; + regs->r_eip = tp->tf_eip; + regs->r_cs = tp->tf_cs; + regs->r_eflags = tp->tf_eflags; + regs->r_esp = tp->tf_esp; + regs->r_ss = tp->tf_ss; + pcb = td->td_pcb; + regs->r_gs = pcb->pcb_gs; + return (0); +} + +int +set_regs(struct thread *td, struct reg *regs) +{ + struct pcb *pcb; + struct trapframe *tp; + + tp = td->td_frame; + if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || + !CS_SECURE(regs->r_cs)) + return (EINVAL); + tp->tf_fs = regs->r_fs; + tp->tf_es = regs->r_es; + tp->tf_ds = regs->r_ds; + tp->tf_edi = regs->r_edi; + tp->tf_esi = regs->r_esi; + tp->tf_ebp = regs->r_ebp; + tp->tf_ebx = regs->r_ebx; + tp->tf_edx = regs->r_edx; + tp->tf_ecx = regs->r_ecx; + tp->tf_eax = regs->r_eax; + tp->tf_eip = regs->r_eip; + tp->tf_cs = regs->r_cs; + tp->tf_eflags = regs->r_eflags; + tp->tf_esp = regs->r_esp; + tp->tf_ss = regs->r_ss; + pcb = td->td_pcb; + pcb->pcb_gs = regs->r_gs; + return (0); +} + +#ifdef CPU_ENABLE_SSE +static void +fill_fpregs_xmm(sv_xmm, sv_87) + struct savexmm *sv_xmm; + struct save87 *sv_87; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + bzero(sv_87, sizeof(*sv_87)); + + /* FPU control/status */ + penv_87->en_cw = penv_xmm->en_cw; + 
penv_87->en_sw = penv_xmm->en_sw; + penv_87->en_tw = penv_xmm->en_tw; + penv_87->en_fip = penv_xmm->en_fip; + penv_87->en_fcs = penv_xmm->en_fcs; + penv_87->en_opcode = penv_xmm->en_opcode; + penv_87->en_foo = penv_xmm->en_foo; + penv_87->en_fos = penv_xmm->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; +} + +static void +set_fpregs_xmm(sv_87, sv_xmm) + struct save87 *sv_87; + struct savexmm *sv_xmm; +{ + register struct env87 *penv_87 = &sv_87->sv_env; + register struct envxmm *penv_xmm = &sv_xmm->sv_env; + int i; + + /* FPU control/status */ + penv_xmm->en_cw = penv_87->en_cw; + penv_xmm->en_sw = penv_87->en_sw; + penv_xmm->en_tw = penv_87->en_tw; + penv_xmm->en_fip = penv_87->en_fip; + penv_xmm->en_fcs = penv_87->en_fcs; + penv_xmm->en_opcode = penv_87->en_opcode; + penv_xmm->en_foo = penv_87->en_foo; + penv_xmm->en_fos = penv_87->en_fos; + + /* FPU registers */ + for (i = 0; i < 8; ++i) + sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; +} +#endif /* CPU_ENABLE_SSE */ + +int +fill_fpregs(struct thread *td, struct fpreg *fpregs) +{ +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm, + (struct save87 *)fpregs); + return (0); + } +#endif /* CPU_ENABLE_SSE */ + bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); + return (0); +} + +int +set_fpregs(struct thread *td, struct fpreg *fpregs) +{ +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + set_fpregs_xmm((struct save87 *)fpregs, + &td->td_pcb->pcb_save.sv_xmm); + return (0); + } +#endif /* CPU_ENABLE_SSE */ + bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs); + return (0); +} + +/* + * Get machine context. 
+ */ +int +get_mcontext(struct thread *td, mcontext_t *mcp, int flags) +{ + struct trapframe *tp; + + tp = td->td_frame; + + PROC_LOCK(curthread->td_proc); + mcp->mc_onstack = sigonstack(tp->tf_esp); + PROC_UNLOCK(curthread->td_proc); + mcp->mc_gs = td->td_pcb->pcb_gs; + mcp->mc_fs = tp->tf_fs; + mcp->mc_es = tp->tf_es; + mcp->mc_ds = tp->tf_ds; + mcp->mc_edi = tp->tf_edi; + mcp->mc_esi = tp->tf_esi; + mcp->mc_ebp = tp->tf_ebp; + mcp->mc_isp = tp->tf_isp; + if (flags & GET_MC_CLEAR_RET) { + mcp->mc_eax = 0; + mcp->mc_edx = 0; + } else { + mcp->mc_eax = tp->tf_eax; + mcp->mc_edx = tp->tf_edx; + } + mcp->mc_ebx = tp->tf_ebx; + mcp->mc_ecx = tp->tf_ecx; + mcp->mc_eip = tp->tf_eip; + mcp->mc_cs = tp->tf_cs; + mcp->mc_eflags = tp->tf_eflags; + mcp->mc_esp = tp->tf_esp; + mcp->mc_ss = tp->tf_ss; + mcp->mc_len = sizeof(*mcp); + get_fpcontext(td, mcp); + return (0); +} + +/* + * Set machine context. + * + * However, we don't set any but the user modifiable flags, and we won't + * touch the cs selector. 
+ */ +int +set_mcontext(struct thread *td, const mcontext_t *mcp) +{ + struct trapframe *tp; + int eflags, ret; + + tp = td->td_frame; + if (mcp->mc_len != sizeof(*mcp)) + return (EINVAL); + eflags = (mcp->mc_eflags & PSL_USERCHANGE) | + (tp->tf_eflags & ~PSL_USERCHANGE); + if ((ret = set_fpcontext(td, mcp)) == 0) { + tp->tf_fs = mcp->mc_fs; + tp->tf_es = mcp->mc_es; + tp->tf_ds = mcp->mc_ds; + tp->tf_edi = mcp->mc_edi; + tp->tf_esi = mcp->mc_esi; + tp->tf_ebp = mcp->mc_ebp; + tp->tf_ebx = mcp->mc_ebx; + tp->tf_edx = mcp->mc_edx; + tp->tf_ecx = mcp->mc_ecx; + tp->tf_eax = mcp->mc_eax; + tp->tf_eip = mcp->mc_eip; + tp->tf_eflags = eflags; + tp->tf_esp = mcp->mc_esp; + tp->tf_ss = mcp->mc_ss; + td->td_pcb->pcb_gs = mcp->mc_gs; + ret = 0; + } + return (ret); +} + +static void +get_fpcontext(struct thread *td, mcontext_t *mcp) +{ +#ifndef DEV_NPX + mcp->mc_fpformat = _MC_FPFMT_NODEV; + mcp->mc_ownedfp = _MC_FPOWNED_NONE; +#else + union savefpu *addr; + + /* + * XXX mc_fpstate might be misaligned, since its declaration is not + * unportabilized using __attribute__((aligned(16))) like the + * declaration of struct savemm, and anyway, alignment doesn't work + * for auto variables since we don't use gcc's pessimal stack + * alignment. Work around this by abusing the spare fields after + * mcp->mc_fpstate. + * + * XXX unpessimize most cases by only aligning when fxsave might be + * called, although this requires knowing too much about + * npxgetregs()'s internals. 
+ */ + addr = (union savefpu *)&mcp->mc_fpstate; + if (td == PCPU_GET(fpcurthread) && +#ifdef CPU_ENABLE_SSE + cpu_fxsr && +#endif + ((uintptr_t)(void *)addr & 0xF)) { + do + addr = (void *)((char *)addr + 4); + while ((uintptr_t)(void *)addr & 0xF); + } + mcp->mc_ownedfp = npxgetregs(td, addr); + if (addr != (union savefpu *)&mcp->mc_fpstate) { + bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); + bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2)); + } + mcp->mc_fpformat = npxformat(); +#endif +} + +static int +set_fpcontext(struct thread *td, const mcontext_t *mcp) +{ + union savefpu *addr; + + if (mcp->mc_fpformat == _MC_FPFMT_NODEV) + return (0); + else if (mcp->mc_fpformat != _MC_FPFMT_387 && + mcp->mc_fpformat != _MC_FPFMT_XMM) + return (EINVAL); + else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) + /* We don't care what state is left in the FPU or PCB. */ + fpstate_drop(td); + else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || + mcp->mc_ownedfp == _MC_FPOWNED_PCB) { + /* XXX align as above. */ + addr = (union savefpu *)&mcp->mc_fpstate; + if (td == PCPU_GET(fpcurthread) && +#ifdef CPU_ENABLE_SSE + cpu_fxsr && +#endif + ((uintptr_t)(void *)addr & 0xF)) { + do + addr = (void *)((char *)addr + 4); + while ((uintptr_t)(void *)addr & 0xF); + bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate)); + } +#ifdef DEV_NPX + /* + * XXX we violate the dubious requirement that npxsetregs() + * be called with interrupts disabled. + */ + npxsetregs(td, addr); +#endif + /* + * Don't bother putting things back where they were in the + * misaligned case, since we know that the caller won't use + * them again. + */ + } else + return (EINVAL); + return (0); +} + +static void +fpstate_drop(struct thread *td) +{ + register_t s; + + s = intr_disable(); +#ifdef DEV_NPX + if (PCPU_GET(fpcurthread) == td) + npxdrop(); +#endif + /* + * XXX force a full drop of the npx. The above only drops it if we + * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 
+ * + * XXX I don't much like npxgetregs()'s semantics of doing a full + * drop. Dropping only to the pcb matches fnsave's behaviour. + * We only need to drop to !PCB_INITDONE in sendsig(). But + * sendsig() is the only caller of npxgetregs()... perhaps we just + * have too many layers. + */ + curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; + intr_restore(s); +} + +int +fill_dbregs(struct thread *td, struct dbreg *dbregs) +{ + struct pcb *pcb; + + if (td == NULL) { + dbregs->dr[0] = rdr0(); + dbregs->dr[1] = rdr1(); + dbregs->dr[2] = rdr2(); + dbregs->dr[3] = rdr3(); + dbregs->dr[4] = rdr4(); + dbregs->dr[5] = rdr5(); + dbregs->dr[6] = rdr6(); + dbregs->dr[7] = rdr7(); + } else { + pcb = td->td_pcb; + dbregs->dr[0] = pcb->pcb_dr0; + dbregs->dr[1] = pcb->pcb_dr1; + dbregs->dr[2] = pcb->pcb_dr2; + dbregs->dr[3] = pcb->pcb_dr3; + dbregs->dr[4] = 0; + dbregs->dr[5] = 0; + dbregs->dr[6] = pcb->pcb_dr6; + dbregs->dr[7] = pcb->pcb_dr7; + } + return (0); +} + +int +set_dbregs(struct thread *td, struct dbreg *dbregs) +{ + struct pcb *pcb; + int i; + u_int32_t mask1, mask2; + + if (td == NULL) { + load_dr0(dbregs->dr[0]); + load_dr1(dbregs->dr[1]); + load_dr2(dbregs->dr[2]); + load_dr3(dbregs->dr[3]); + load_dr4(dbregs->dr[4]); + load_dr5(dbregs->dr[5]); + load_dr6(dbregs->dr[6]); + load_dr7(dbregs->dr[7]); + } else { + /* + * Don't let an illegal value for dr7 get set. Specifically, + * check for undefined settings. Setting these bit patterns + * result in undefined behaviour and can lead to an unexpected + * TRCTRAP. + */ + for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; + i++, mask1 <<= 2, mask2 <<= 2) + if ((dbregs->dr[7] & mask1) == mask2) + return (EINVAL); + + pcb = td->td_pcb; + + /* + * Don't let a process set a breakpoint that is not within the + * process's address space. If a process could do this, it + * could halt the system by setting a breakpoint in the kernel + * (if ddb was enabled). 
Thus, we need to check to make sure + * that no breakpoints are being enabled for addresses outside + * process's address space, unless, perhaps, we were called by + * uid 0. + * + * XXX - what about when the watched area of the user's + * address space is written into from within the kernel + * ... wouldn't that still cause a breakpoint to be generated + * from within kernel mode? + */ + + if (suser(td) != 0) { + if (dbregs->dr[7] & 0x3) { + /* dr0 is enabled */ + if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) + return (EINVAL); + } + + if (dbregs->dr[7] & (0x3<<2)) { + /* dr1 is enabled */ + if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) + return (EINVAL); + } + + if (dbregs->dr[7] & (0x3<<4)) { + /* dr2 is enabled */ + if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) + return (EINVAL); + } + + if (dbregs->dr[7] & (0x3<<6)) { + /* dr3 is enabled */ + if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) + return (EINVAL); + } + } + + pcb->pcb_dr0 = dbregs->dr[0]; + pcb->pcb_dr1 = dbregs->dr[1]; + pcb->pcb_dr2 = dbregs->dr[2]; + pcb->pcb_dr3 = dbregs->dr[3]; + pcb->pcb_dr6 = dbregs->dr[6]; + pcb->pcb_dr7 = dbregs->dr[7]; + + pcb->pcb_flags |= PCB_DBREGS; + } + + return (0); +} + +/* + * Return > 0 if a hardware breakpoint has been hit, and the + * breakpoint was in user space. Return 0, otherwise. 
+ */ +int +user_dbreg_trap(void) +{ + u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ + u_int32_t bp; /* breakpoint bits extracted from dr6 */ + int nbp; /* number of breakpoints that triggered */ + caddr_t addr[4]; /* breakpoint addresses */ + int i; + + dr7 = rdr7(); + if ((dr7 & 0x000000ff) == 0) { + /* + * all GE and LE bits in the dr7 register are zero, + * thus the trap couldn't have been caused by the + * hardware debug registers + */ + return 0; + } + + nbp = 0; + dr6 = rdr6(); + bp = dr6 & 0x0000000f; + + if (!bp) { + /* + * None of the breakpoint bits are set meaning this + * trap was not caused by any of the debug registers + */ + return 0; + } + + /* + * at least one of the breakpoints were hit, check to see + * which ones and if any of them are user space addresses + */ + + if (bp & 0x01) { + addr[nbp++] = (caddr_t)rdr0(); + } + if (bp & 0x02) { + addr[nbp++] = (caddr_t)rdr1(); + } + if (bp & 0x04) { + addr[nbp++] = (caddr_t)rdr2(); + } + if (bp & 0x08) { + addr[nbp++] = (caddr_t)rdr3(); + } + + for (i=0; i<nbp; i++) { + if (addr[i] < + (caddr_t)VM_MAXUSER_ADDRESS) { + /* + * addr[i] is in user space + */ + return nbp; + } + } + + /* + * None of the breakpoints are in user space. + */ + return 0; +} + +#ifndef DEV_APIC +#include <machine/apicvar.h> + +/* + * Provide stub functions so that the MADT APIC enumerator in the acpi + * kernel module will link against a kernel without 'device apic'. + * + * XXX - This is a gross hack. 
+ */ +void +apic_register_enumerator(struct apic_enumerator *enumerator) +{ +} + +void * +ioapic_create(uintptr_t addr, int32_t id, int intbase) +{ + return (NULL); +} + +int +ioapic_disable_pin(void *cookie, u_int pin) +{ + return (ENXIO); +} + +int +ioapic_get_vector(void *cookie, u_int pin) +{ + return (-1); +} + +void +ioapic_register(void *cookie) +{ +} + +int +ioapic_remap_vector(void *cookie, u_int pin, int vector) +{ + return (ENXIO); +} + +int +ioapic_set_extint(void *cookie, u_int pin) +{ + return (ENXIO); +} + +int +ioapic_set_nmi(void *cookie, u_int pin) +{ + return (ENXIO); +} + +int +ioapic_set_polarity(void *cookie, u_int pin,enum intr_polarity pol ) +{ + return (ENXIO); +} + +int +ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger ) +{ + return (ENXIO); +} + +void +lapic_create(u_int apic_id, int boot_cpu) +{ +} + +void +lapic_init(uintptr_t addr) +{ +} + +int +lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode) +{ + return (ENXIO); +} + +int +lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) +{ + return (ENXIO); +} + +int +lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) +{ + return (ENXIO); +} +#endif + +#ifdef KDB + +/* + * Provide inb() and outb() as functions. They are normally only + * available as macros calling inlined functions, thus cannot be + * called from the debugger. + * + * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. + */ + +#undef inb +#undef outb + +/* silence compiler warnings */ +u_char inb(u_int); +void outb(u_int, u_char); + +u_char +inb(u_int port) +{ + u_char data; + /* + * We use %%dx and not %1 here because i/o is done at %dx and not at + * %edx, while gcc generates inferior code (movw instead of movl) + * if we tell it to load (u_short) port. 
+ */ + __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); + return (data); +} + +void +outb(u_int port, u_char data) +{ + u_char al; + /* + * Use an unnecessary assignment to help gcc's register allocator. + * This make a large difference for gcc-1.40 and a tiny difference + * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for + * best results. gcc-2.6.0 can't handle this. + */ + al = data; + __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); +} + +#endif /* KDB */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mp_clock.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mp_clock.c new file mode 100644 index 0000000000..af07002ebb --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mp_clock.c @@ -0,0 +1,150 @@ +/*- + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/mp_clock.c,v 1.19 2004/05/30 20:34:57 phk Exp $"); + +/*- + * Just when we thought life were beautiful, reality pops its grim face over + * the edge again: + * + * ] 20. ACPI Timer Errata + * ] + * ] Problem: The power management timer may return improper result when + * ] read. Although the timer value settles properly after incrementing, + * ] while incrementing there is a 3nS window every 69.8nS where the + * ] timer value is indeterminate (a 4.2% chance that the data will be + * ] incorrect when read). As a result, the ACPI free running count up + * ] timer specification is violated due to erroneous reads. 
Implication: + * ] System hangs due to the "inaccuracy" of the timer when used by + * ] software for time critical events and delays. + * ] + * ] Workaround: Read the register twice and compare. + * ] Status: This will not be fixed in the PIIX4 or PIIX4E. + * + * The counter is in other words not latched to the PCI bus clock when + * read. Notice the workaround isn't: We need to read until we have + * three monotonic samples and then use the middle one, otherwise we are + * not protected against the fact that the bits can be wrong in two + * directions. If we only cared about monosity two reads would be enough. + */ + +/* #include "opt_bus.h" */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/timetc.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/bus.h> + +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> + +static unsigned piix_get_timecount(struct timecounter *tc); + +static u_int32_t piix_timecounter_address; +static u_int piix_freq = 14318182/4; + +static struct timecounter piix_timecounter = { + piix_get_timecount, /* get_timecount */ + 0, /* no poll_pps */ + 0xffffff, /* counter_mask */ + 0, /* frequency */ + "PIIX" /* name */ +}; + + +static int +sysctl_machdep_piix_freq(SYSCTL_HANDLER_ARGS) +{ + int error; + u_int freq; + + if (piix_timecounter.tc_frequency == 0) + return (EOPNOTSUPP); + freq = piix_freq; + error = sysctl_handle_int(oidp, &freq, sizeof(freq), req); + if (error == 0 && req->newptr != NULL) { + piix_freq = freq; + piix_timecounter.tc_frequency = piix_freq; + } + return (error); +} + +SYSCTL_PROC(_machdep, OID_AUTO, piix_freq, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(u_int), sysctl_machdep_piix_freq, "I", ""); + +static unsigned +piix_get_timecount(struct timecounter *tc) +{ + unsigned u1, u2, u3; + + u2 = inl(piix_timecounter_address); + u3 = inl(piix_timecounter_address); + do { + u1 = u2; + u2 = u3; + u3 = inl(piix_timecounter_address); + } while (u1 > u2 || u2 > u3); + 
return (u2); +} + +static int +piix_probe(device_t dev) +{ + u_int32_t d; + + if (devclass_get_device(devclass_find("acpi"), 0) != NULL) + return (ENXIO); + switch (pci_get_devid(dev)) { + case 0x71138086: + device_set_desc(dev, "PIIX Timecounter"); + break; + default: + return (ENXIO); + } + + d = pci_read_config(dev, PCIR_COMMAND, 2); + if (!(d & PCIM_CMD_PORTEN)) { + device_printf(dev, "PIIX I/O space not mapped\n"); + return (ENXIO); + } + return (0); +} + +static int +piix_attach(device_t dev) +{ + u_int32_t d; + + d = pci_read_config(dev, 0x40, 4); + piix_timecounter_address = (d & 0xffc0) + 8; + piix_timecounter.tc_frequency = piix_freq; + tc_init(&piix_timecounter); + return (0); +} + +static device_method_t piix_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, piix_probe), + DEVMETHOD(device_attach, piix_attach), + { 0, 0 } +}; + +static driver_t piix_driver = { + "piix", + piix_methods, + 1, +}; + +static devclass_t piix_devclass; + +DRIVER_MODULE(piix, pci, piix_driver, piix_devclass, 0, 0); diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mp_machdep.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mp_machdep.c new file mode 100644 index 0000000000..b975c9e491 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mp_machdep.c @@ -0,0 +1,1315 @@ +/*- + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/mp_machdep.c,v 1.235.2.3 2004/09/24 15:02:33 rik Exp $"); + +#include "opt_apic.h" +#include "opt_cpu.h" +#include "opt_kstack_pages.h" +#include "opt_mp_watchdog.h" + +#if !defined(lint) +#if !defined(SMP) +#error How did you get here? +#endif + +#if defined(I386_CPU) && !defined(COMPILING_LINT) +#error SMP not supported with I386_CPU +#endif +#if 0 +#ifndef DEV_APIC +#error The apic device is required for SMP, add "device apic" to your config file. 
+#endif +#endif +#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) +#error SMP not supported with CPU_DISABLE_CMPXCHG +#endif +#endif /* not lint */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/cons.h> /* cngetc() */ +#ifdef GPROF +#include <sys/gmon.h> +#endif +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/memrange.h> +#include <sys/mutex.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +#include <machine/apicreg.h> +#include <machine/clock.h> +#include <machine/md_var.h> +#include <machine/mp_watchdog.h> +#include <machine/pcb.h> +#include <machine/smp.h> +#include <machine/smptests.h> /** COUNT_XINVLTLB_HITS */ +#include <machine/specialreg.h> +#include <machine/privatespace.h> + +#include <machine/xenfunc.h> + +#define WARMBOOT_TARGET 0 +#define WARMBOOT_OFF (KERNBASE + 0x0467) +#define WARMBOOT_SEG (KERNBASE + 0x0469) + +#define CMOS_REG (0x70) +#define CMOS_DATA (0x71) +#define BIOS_RESET (0x0f) +#define BIOS_WARM (0x0a) + +/* + * this code MUST be enabled here and in mpboot.s. + * it follows the very early stages of AP boot by placing values in CMOS ram. + * it NORMALLY will never be needed and thus the primitive method for enabling. 
+ * +#define CHECK_POINTS + */ + +#if defined(CHECK_POINTS) && !defined(PC98) +#define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) +#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) + +#define CHECK_INIT(D); \ + CHECK_WRITE(0x34, (D)); \ + CHECK_WRITE(0x35, (D)); \ + CHECK_WRITE(0x36, (D)); \ + CHECK_WRITE(0x37, (D)); \ + CHECK_WRITE(0x38, (D)); \ + CHECK_WRITE(0x39, (D)); + +#define CHECK_PRINT(S); \ + printf("%s: %d, %d, %d, %d, %d, %d\n", \ + (S), \ + CHECK_READ(0x34), \ + CHECK_READ(0x35), \ + CHECK_READ(0x36), \ + CHECK_READ(0x37), \ + CHECK_READ(0x38), \ + CHECK_READ(0x39)); + +#else /* CHECK_POINTS */ + +#define CHECK_INIT(D) +#define CHECK_PRINT(S) +#define CHECK_WRITE(A, D) + +#endif /* CHECK_POINTS */ + +/* + * Values to send to the POST hardware. + */ +#define MP_BOOTADDRESS_POST 0x10 +#define MP_PROBE_POST 0x11 +#define MPTABLE_PASS1_POST 0x12 + +#define MP_START_POST 0x13 +#define MP_ENABLE_POST 0x14 +#define MPTABLE_PASS2_POST 0x15 + +#define START_ALL_APS_POST 0x16 +#define INSTALL_AP_TRAMP_POST 0x17 +#define START_AP_POST 0x18 + +#define MP_ANNOUNCE_POST 0x19 + +/* lock region used by kernel profiling */ +int mcount_lock; + +/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ +int current_postcode; + +int mp_naps; /* # of Applications processors */ +int boot_cpu_id = -1; /* designated BSP */ +extern int nkpt; + +/* + * CPU topology map datastructures for HTT. + */ +static struct cpu_group mp_groups[MAXCPU]; +static struct cpu_top mp_top; + +/* AP uses this during bootstrap. Do not staticize. */ +char *bootSTK; +static int bootAP; + +/* Hotwire a 0->4MB V==P mapping */ +extern pt_entry_t *KPTphys; + +/* SMP page table page */ +extern pt_entry_t *SMPpt; + +struct pcb stoppcbs[MAXCPU]; + +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1; +vm_offset_t smp_tlb_addr2; +volatile int smp_tlb_wait; + +/* + * Local data and functions. 
+ */ + +static u_int logical_cpus; + +/* used to hold the AP's until we are ready to release them */ +static struct mtx ap_boot_mtx; + +/* Set to 1 once we're ready to let the APs out of the pen. */ +static volatile int aps_ready = 0; + +/* + * Store data from cpu_add() until later in the boot when we actually setup + * the APs. + */ +struct cpu_info { + int cpu_present:1; + int cpu_bsp:1; +} static cpu_info[MAXCPU]; +static int cpu_apic_ids[MAXCPU]; + +static u_int boot_address; + +static void set_logical_apic_ids(void); +static int start_all_aps(void); +static void install_ap_tramp(void); +static int start_ap(int apic_id); +static void release_aps(void *dummy); + +static int hlt_logical_cpus; +static struct sysctl_ctx_list logical_cpu_clist; + +static void +mem_range_AP_init(void) +{ + if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) + mem_range_softc.mr_op->initAP(&mem_range_softc); +} + +void +mp_topology(void) +{ + struct cpu_group *group; + int logical_cpus; + int apic_id; + int groups; + int cpu; + + /* Build the smp_topology map. */ + /* Nothing to do if there is no HTT support. */ + if ((cpu_feature & CPUID_HTT) == 0) + return; + logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; + if (logical_cpus <= 1) + return; + group = &mp_groups[0]; + groups = 1; + for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) { + if (!cpu_info[apic_id].cpu_present) + continue; + /* + * If the current group has members and we're not a logical + * cpu, create a new group. + */ + if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) { + group++; + groups++; + } + group->cg_count++; + group->cg_mask |= 1 << cpu; + cpu++; + } + + mp_top.ct_count = groups; + mp_top.ct_group = mp_groups; + smp_topology = &mp_top; +} + + +/* + * Calculate usable address in base memory for AP trampoline code. 
+ */ +u_int +mp_bootaddress(u_int basemem) +{ + POSTCODE(MP_BOOTADDRESS_POST); + + boot_address = trunc_page(basemem); /* round down to 4k boundary */ + if ((basemem - boot_address) < bootMP_size) + boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ + + return boot_address; +} + +void +cpu_add(u_int apic_id, char boot_cpu) +{ + + if (apic_id >= MAXCPU) { + printf("SMP: CPU %d exceeds maximum CPU %d, ignoring\n", + apic_id, MAXCPU - 1); + return; + } + KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", + apic_id)); + cpu_info[apic_id].cpu_present = 1; + if (boot_cpu) { + KASSERT(boot_cpu_id == -1, + ("CPU %d claims to be BSP, but CPU %d already is", apic_id, + boot_cpu_id)); + boot_cpu_id = apic_id; + cpu_info[apic_id].cpu_bsp = 1; + } + mp_ncpus++; + if (bootverbose) + printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : + "AP"); + +} + +void +cpu_mp_setmaxid(void) +{ + + mp_maxid = MAXCPU - 1; +} + +int +cpu_mp_probe(void) +{ + + /* + * Always record BSP in CPU map so that the mbuf init code works + * correctly. + */ + all_cpus = 1; + if (mp_ncpus == 0) { + /* + * No CPUs were found, so this must be a UP system. Setup + * the variables to represent a system with a single CPU + * with an id of 0. + */ + mp_ncpus = 1; + return (0); + } + + /* At least one CPU was found. */ + if (mp_ncpus == 1) { + /* + * One CPU was found, so this must be a UP system with + * an I/O APIC. + */ + return (0); + } + + /* At least two CPUs were found. */ + return (1); +} + +/* + * Initialize the IPI handlers and start up the AP's. + */ +void +cpu_mp_start(void) +{ + int i; + + POSTCODE(MP_START_POST); + + /* Initialize the logical ID to APIC ID table. 
*/ + for (i = 0; i < MAXCPU; i++) + cpu_apic_ids[i] = -1; + + /* Install an inter-CPU IPI for TLB invalidation */ + setidt(IPI_INVLTLB, IDTVEC(invltlb), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_INVLPG, IDTVEC(invlpg), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IPI_INVLRNG, IDTVEC(invlrng), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for forwarding hardclock() */ + setidt(IPI_HARDCLOCK, IDTVEC(hardclock), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for forwarding statclock() */ + setidt(IPI_STATCLOCK, IDTVEC(statclock), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for lazy pmap release */ + setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for all-CPU rendezvous */ + setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for forcing an additional software trap */ + setidt(IPI_AST, IDTVEC(cpuast), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* Install an inter-CPU IPI for CPU stop/restart */ + setidt(IPI_STOP, IDTVEC(cpustop), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + + /* Set boot_cpu_id if needed. */ + if (boot_cpu_id == -1) { + boot_cpu_id = PCPU_GET(apic_id); + cpu_info[boot_cpu_id].cpu_bsp = 1; + } else + KASSERT(boot_cpu_id == PCPU_GET(apic_id), + ("BSP's APIC ID doesn't match boot_cpu_id")); + cpu_apic_ids[0] = boot_cpu_id; + + /* Start each Application Processor */ + start_all_aps(); + + /* Setup the initial logical CPUs info. */ + logical_cpus = logical_cpus_mask = 0; + if (cpu_feature & CPUID_HTT) + logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; + + set_logical_apic_ids(); +} + + +/* + * Print various information about the SMP system hardware and setup. 
+ */ +void +cpu_mp_announce(void) +{ + int i, x; + + POSTCODE(MP_ANNOUNCE_POST); + + /* List CPUs */ + printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); + for (i = 1, x = 0; x < MAXCPU; x++) { + if (cpu_info[x].cpu_present && !cpu_info[x].cpu_bsp) { + KASSERT(i < mp_ncpus, + ("mp_ncpus and actual cpus are out of whack")); + printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); + } + } +} + +/* + * AP CPU's call this to initialize themselves. + */ +void +init_secondary(void) +{ + int gsel_tss; + int x, myid; +#if 0 + u_int cr0; +#endif + /* bootAP is set in start_ap() to our ID. */ + myid = bootAP; + gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid]; + gdt_segs[GPROC0_SEL].ssd_base = + (int) &SMP_prvspace[myid].pcpu.pc_common_tss; + SMP_prvspace[myid].pcpu.pc_prvspace = + &SMP_prvspace[myid].pcpu; + + for (x = 0; x < NGDT; x++) { + ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); + } + +#if 0 + r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; + r_gdt.rd_base = (int) &gdt[myid * NGDT]; + lgdt(&r_gdt); /* does magic intra-segment return */ + + lidt(&r_idt); + lldt(_default_ldt); +#endif + PCPU_SET(currentldt, _default_ldt); + + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); + gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; + PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); + PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); +#if 0 + ltr(gsel_tss); + + /* + * Set to a known state: + * Set by mpboot.s: CR0_PG, CR0_PE + * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM + */ + cr0 = rcr0(); + cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); + load_cr0(cr0); +#endif + CHECK_WRITE(0x38, 5); + + /* Disable local APIC just to be sure. */ + lapic_disable(); + + /* signal our startup to the BSP. */ + mp_naps++; + CHECK_WRITE(0x39, 6); + + /* Spin until the BSP releases the AP's. 
*/ + while (!aps_ready) + ia32_pause(); + + /* BSP may have changed PTD while we were waiting */ + invltlb(); + pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); + +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + lidt(&r_idt); +#endif + + /* set up CPU registers and state */ + cpu_setregs(); + + /* set up FPU state on the AP */ + npxinit(__INITIAL_NPXCW__); + + /* set up SSE registers */ + enable_sse(); + + /* A quick check from sanity claus */ + if (PCPU_GET(apic_id) != lapic_id()) { + printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); + printf("SMP: actual apic_id = %d\n", lapic_id()); + printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); + printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]); + panic("cpuid mismatch! boom!!"); + } + + mtx_lock_spin(&ap_boot_mtx); + + /* Init local apic for irq's */ + lapic_setup(); + + /* Set memory range attributes for this CPU to match the BSP */ + mem_range_AP_init(); + + smp_cpus++; + + CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); + printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); + + /* Determine if we are a logical CPU. */ + if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) + logical_cpus_mask |= PCPU_GET(cpumask); + + /* Build our map of 'other' CPUs. 
*/ + PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + + if (bootverbose) + lapic_dump("AP"); + + if (smp_cpus == mp_ncpus) { + /* enable IPI's, tlb shootdown, freezes etc */ + atomic_store_rel_int(&smp_started, 1); + smp_active = 1; /* historic */ + } + + mtx_unlock_spin(&ap_boot_mtx); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ia32_pause(); + + /* ok, now grab sched_lock and enter the scheduler */ + mtx_lock_spin(&sched_lock); + + binuptime(PCPU_PTR(switchtime)); + PCPU_SET(switchticks, ticks); + + cpu_throw(NULL, choosethread()); /* doesn't return */ + + panic("scheduler returned us to %s", __func__); + /* NOTREACHED */ +} + +/******************************************************************* + * local functions and data + */ + +/* + * Set the APIC logical IDs. + * + * We want to cluster logical CPU's within the same APIC ID cluster. + * Since logical CPU's are aligned simply filling in the clusters in + * APIC ID order works fine. Note that this does not try to balance + * the number of CPU's in each cluster. (XXX?) + */ +static void +set_logical_apic_ids(void) +{ + u_int apic_id, cluster, cluster_id; + + /* Force us to allocate cluster 0 at the start. 
*/ + cluster = -1; + cluster_id = APIC_MAX_INTRACLUSTER_ID; + for (apic_id = 0; apic_id < MAXCPU; apic_id++) { + if (!cpu_info[apic_id].cpu_present) + continue; + if (cluster_id == APIC_MAX_INTRACLUSTER_ID) { + cluster = ioapic_next_logical_cluster(); + cluster_id = 0; + } else + cluster_id++; + if (bootverbose) + printf("APIC ID: physical %u, logical %u:%u\n", + apic_id, cluster, cluster_id); + lapic_set_logical_id(apic_id, cluster, cluster_id); + } +} + +/* + * start each AP in our list + */ +static int +start_all_aps(void) +{ +#ifndef PC98 + u_char mpbiosreason; +#endif + u_long mpbioswarmvec; + struct pcpu *pc; + char *stack; + uintptr_t kptbase; + int i, pg, apic_id, cpu; + + POSTCODE(START_ALL_APS_POST); + + mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); + + /* install the AP 1st level boot code */ + install_ap_tramp(); + + /* save the current value of the warm-start vector */ + mpbioswarmvec = *((u_long *) WARMBOOT_OFF); +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + mpbiosreason = inb(CMOS_DATA); +#endif + + /* set up temporary P==V mapping for AP boot */ + /* XXX this is a hack, we should boot the AP on its own stack/PTD */ + kptbase = (uintptr_t)(void *)KPTphys; + for (i = 0; i < NKPT; i++) + PTD[i] = (pd_entry_t)(PG_V | PG_RW | + ((kptbase + i * PAGE_SIZE) & PG_FRAME)); + invltlb(); + + /* start each AP */ + for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) { + if (!cpu_info[apic_id].cpu_present || + cpu_info[apic_id].cpu_bsp) + continue; + cpu++; + + /* save APIC ID for this logical ID */ + cpu_apic_ids[cpu] = apic_id; + + /* first page of AP's private space */ + pg = cpu * i386_btop(sizeof(struct privatespace)); + + /* allocate a new private data page */ + pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE); + + /* wire it into the private page table page */ + SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc)); + + /* allocate and set up an idle stack data page */ + stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); /* 
XXXKSE */ + for (i = 0; i < KSTACK_PAGES; i++) + SMPpt[pg + 1 + i] = (pt_entry_t) + (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); + + /* prime data page for it to use */ + pcpu_init(pc, cpu, sizeof(struct pcpu)); + pc->pc_apic_id = apic_id; + + /* setup a vector to our boot code */ + *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ +#endif + + bootSTK = &SMP_prvspace[cpu].idlekstack[KSTACK_PAGES * + PAGE_SIZE]; + bootAP = cpu; + + /* attempt to start the Application Processor */ + CHECK_INIT(99); /* setup checkpoints */ + if (!start_ap(apic_id)) { + printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); + CHECK_PRINT("trace"); /* show checkpoints */ + /* better panic as the AP may be running loose */ + printf("panic y/n? [y] "); + if (cngetc() != 'n') + panic("bye-bye"); + } + CHECK_PRINT("trace"); /* show checkpoints */ + + all_cpus |= (1 << cpu); /* record AP in CPU map */ + } + + /* build our map of 'other' CPUs */ + PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); + + /* restore the warmstart vector */ + *(u_long *) WARMBOOT_OFF = mpbioswarmvec; +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, mpbiosreason); +#endif + + /* + * Set up the idle context for the BSP. Similar to above except + * that some was done by locore, some by pmap.c and some is implicit + * because the BSP is cpu#0 and the page is initially zero and also + * because we can refer to variables by name on the BSP.. 
+ */ + + /* Allocate and setup BSP idle stack */ + stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); + for (i = 0; i < KSTACK_PAGES; i++) + SMPpt[1 + i] = (pt_entry_t) + (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); + + for (i = 0; i < NKPT; i++) + PTD[i] = 0; + pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); + + /* number of APs actually started */ + return mp_naps; +} + +/* + * load the 1st level AP boot code into base memory. + */ + +/* targets for relocation */ +extern void bigJump(void); +extern void bootCodeSeg(void); +extern void bootDataSeg(void); +extern void MPentry(void); +extern u_int MP_GDT; +extern u_int mp_gdtbase; + +static void +install_ap_tramp(void) +{ + int x; + int size = *(int *) ((u_long) & bootMP_size); + vm_offset_t va = boot_address + KERNBASE; + u_char *src = (u_char *) ((u_long) bootMP); + u_char *dst = (u_char *) va; + u_int boot_base = (u_int) bootMP; + u_int8_t *dst8; + u_int16_t *dst16; + u_int32_t *dst32; + + POSTCODE(INSTALL_AP_TRAMP_POST); + + KASSERT (size <= PAGE_SIZE, + ("'size' do not fit into PAGE_SIZE, as expected.")); + pmap_kenter(va, boot_address); + pmap_invalidate_page (kernel_pmap, va); + for (x = 0; x < size; ++x) + *dst++ = *src++; + + /* + * modify addresses in code we just moved to basemem. unfortunately we + * need fairly detailed info about mpboot.s for this to work. changes + * to mpboot.s might require changes here. 
+ */ + + /* boot code is located in KERNEL space */ + dst = (u_char *) va; + + /* modify the lgdt arg */ + dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); + *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); + + /* modify the ljmp target for MPentry() */ + dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); + *dst32 = ((u_int) MPentry - KERNBASE); + + /* modify the target for boot code segment */ + dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); + dst8 = (u_int8_t *) (dst16 + 1); + *dst16 = (u_int) boot_address & 0xffff; + *dst8 = ((u_int) boot_address >> 16) & 0xff; + + /* modify the target for boot data segment */ + dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); + dst8 = (u_int8_t *) (dst16 + 1); + *dst16 = (u_int) boot_address & 0xffff; + *dst8 = ((u_int) boot_address >> 16) & 0xff; +} + +/* + * This function starts the AP (application processor) identified + * by the APIC ID 'physicalCpu'. It does quite a "song and dance" + * to accomplish this. This is necessary because of the nuances + * of the different hardware we might encounter. It isn't pretty, + * but it seems to work. + */ +static int +start_ap(int apic_id) +{ + int vector, ms; + int cpus; + + POSTCODE(START_AP_POST); + + /* calculate the vector */ + vector = (boot_address >> 12) & 0xff; + + /* used as a watchpoint to signal AP startup */ + cpus = mp_naps; + + /* + * first we do an INIT/RESET IPI this INIT IPI might be run, reseting + * and running the target CPU. OR this INIT IPI might be latched (P5 + * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be + * ignored. 
+ */ + + /* do an INIT IPI: assert RESET */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); + + /* wait for pending status end */ + lapic_ipi_wait(-1); + + /* do an INIT IPI: deassert RESET */ + lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); + + /* wait for pending status end */ + DELAY(10000); /* wait ~10mS */ + lapic_ipi_wait(-1); + + /* + * next we do a STARTUP IPI: the previous INIT IPI might still be + * latched, (P5 bug) this 1st STARTUP would then terminate + * immediately, and the previously started INIT IPI would continue. OR + * the previous INIT IPI has already run. and this STARTUP IPI will + * run. OR the previous INIT IPI was ignored. and this STARTUP IPI + * will run. + */ + + /* do a STARTUP IPI */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + lapic_ipi_wait(-1); + DELAY(200); /* wait ~200uS */ + + /* + * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF + * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR + * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is + * recognized after hardware RESET or INIT IPI. + */ + + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + lapic_ipi_wait(-1); + DELAY(200); /* wait ~200uS */ + + /* Wait up to 5 seconds for it to start. 
*/ + for (ms = 0; ms < 5000; ms++) { + if (mp_naps > cpus) + return 1; /* return SUCCESS */ + DELAY(1000); + } + return 0; /* return FAILURE */ +} + +#ifdef COUNT_XINVLTLB_HITS +u_int xhits_gbl[MAXCPU]; +u_int xhits_pg[MAXCPU]; +u_int xhits_rng[MAXCPU]; +SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, + sizeof(xhits_gbl), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, + sizeof(xhits_pg), "IU", ""); +SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, + sizeof(xhits_rng), "IU", ""); + +u_int ipi_global; +u_int ipi_page; +u_int ipi_range; +u_int ipi_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, + 0, ""); + +u_int ipi_masked_global; +u_int ipi_masked_page; +u_int ipi_masked_range; +u_int ipi_masked_range_size; +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, + &ipi_masked_global, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, + &ipi_masked_page, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, + &ipi_masked_range, 0, ""); +SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, + &ipi_masked_range_size, 0, ""); +#endif /* COUNT_XINVLTLB_HITS */ + +/* + * Flush the TLB on all other CPU's + */ +static void +smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + u_int ncpu; + + ncpu = mp_ncpus - 1; /* does not shootdown self */ + if (ncpu < 1) + return; /* no other cpus */ + mtx_assert(&smp_rv_mtx, MA_OWNED); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + ipi_all_but_self(vector); + while (smp_tlb_wait < ncpu) + 
ia32_pause(); +} + +/* + * This is about as magic as it gets. fortune(1) has got similar code + * for reversing bits in a word. Who thinks up this stuff?? + * + * Yes, it does appear to be consistently faster than: + * while (i = ffs(m)) { + * m >>= i; + * bits++; + * } + * and + * while (lsb = (m & -m)) { // This is magic too + * m &= ~lsb; // or: m ^= lsb + * bits++; + * } + * Both of these latter forms do some very strange things on gcc-3.1 with + * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2. + * There is probably an SSE or MMX popcnt instruction. + * + * I wonder if this should be in libkern? + * + * XXX Stop the presses! Another one: + * static __inline u_int32_t + * popcnt1(u_int32_t v) + * { + * v -= ((v >> 1) & 0x55555555); + * v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + * v = (v + (v >> 4)) & 0x0F0F0F0F; + * return (v * 0x01010101) >> 24; + * } + * The downside is that it has a multiply. With a pentium3 with + * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use + * an imull, and in that case it is faster. In most other cases + * it appears slightly slower. 
+ * + * Another variant (also from fortune): + * #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255) + * #define BX_(x) ((x) - (((x)>>1)&0x77777777) \ + * - (((x)>>2)&0x33333333) \ + * - (((x)>>3)&0x11111111)) + */ +static __inline u_int32_t +popcnt(u_int32_t m) +{ + + m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1); + m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2); + m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4); + m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8); + m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16); + return m; +} + +static void +smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) +{ + int ncpu, othercpus; + + othercpus = mp_ncpus - 1; + if (mask == (u_int)-1) { + ncpu = othercpus; + if (ncpu < 1) + return; + } else { + mask &= ~PCPU_GET(cpumask); + if (mask == 0) + return; + ncpu = popcnt(mask); + if (ncpu > othercpus) { + /* XXX this should be a panic offence */ + printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", + ncpu, othercpus); + ncpu = othercpus; + } + /* XXX should be a panic, implied by mask == 0 above */ + if (ncpu < 1) + return; + } + mtx_assert(&smp_rv_mtx, MA_OWNED); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + atomic_store_rel_int(&smp_tlb_wait, 0); + if (mask == (u_int)-1) + ipi_all_but_self(vector); + else + ipi_selected(mask, vector); + while (smp_tlb_wait < ncpu) + ia32_pause(); +} + +void +smp_invltlb(void) +{ + if (smp_started) { + smp_tlb_shootdown(IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif + } +} + +void +smp_invlpg(vm_offset_t addr) +{ + if (smp_started) { + smp_tlb_shootdown(IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif + } +} + +void +smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) +{ + if (smp_started) { + smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +} + +void +smp_masked_invltlb(u_int 
mask) +{ + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_global++; +#endif + } +} + +void +smp_masked_invlpg(u_int mask, vm_offset_t addr) +{ + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_page++; +#endif + } +} + +void +smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) +{ + if (smp_started) { + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); +#ifdef COUNT_XINVLTLB_HITS + ipi_masked_range++; + ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif + } +} + + +/* + * For statclock, we send an IPI to all CPU's to have them call this + * function. + */ +void +forwarded_statclock(struct clockframe frame) +{ + struct thread *td; + + CTR0(KTR_SMP, "forwarded_statclock"); + td = curthread; + td->td_intr_nesting_level++; + if (profprocs != 0) + profclock(&frame); + if (pscnt == psdiv) + statclock(&frame); + td->td_intr_nesting_level--; +} + +void +forward_statclock(void) +{ + int map; + + CTR0(KTR_SMP, "forward_statclock"); + + if (!smp_started || cold || panicstr) + return; + + map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask); + if (map != 0) + ipi_selected(map, IPI_STATCLOCK); +} + +/* + * For each hardclock(), we send an IPI to all other CPU's to have them + * execute this function. It would be nice to reduce contention on + * sched_lock if we could simply peek at the CPU to determine the user/kernel + * state and call hardclock_process() on the CPU receiving the clock interrupt + * and then just use a simple IPI to handle any ast's if needed. 
+ */ +void +forwarded_hardclock(struct clockframe frame) +{ + struct thread *td; + + CTR0(KTR_SMP, "forwarded_hardclock"); + td = curthread; + td->td_intr_nesting_level++; + hardclock_process(&frame); + td->td_intr_nesting_level--; +} + +void +forward_hardclock(void) +{ + u_int map; + + CTR0(KTR_SMP, "forward_hardclock"); + + if (!smp_started || cold || panicstr) + return; + + map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask); + if (map != 0) + ipi_selected(map, IPI_HARDCLOCK); +} + +/* + * send an IPI to a set of cpus. + */ +void +ipi_selected(u_int32_t cpus, u_int ipi) +{ + int cpu; + + CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); + while ((cpu = ffs(cpus)) != 0) { + cpu--; + KASSERT(cpu_apic_ids[cpu] != -1, + ("IPI to non-existent CPU %d", cpu)); + lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); + cpus &= ~(1 << cpu); + } +} + +/* + * send an IPI INTerrupt containing 'vector' to all CPUs, including myself + */ +void +ipi_all(u_int ipi) +{ + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL); +} + +/* + * send an IPI to all CPUs EXCEPT myself + */ +void +ipi_all_but_self(u_int ipi) +{ + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); +} + +/* + * send an IPI to myself + */ +void +ipi_self(u_int ipi) +{ + + CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); + lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF); +} + +/* + * This is called once the rest of the system is up and running and we're + * ready to let the AP's out of the pen. 
+ */ +static void +release_aps(void *dummy __unused) +{ + + if (mp_ncpus == 1) + return; + mtx_lock_spin(&sched_lock); + atomic_store_rel_int(&aps_ready, 1); + while (smp_started == 0) + ia32_pause(); + mtx_unlock_spin(&sched_lock); +} +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); + +static int +sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS) +{ + u_int mask; + int error; + + mask = hlt_cpus_mask; + error = sysctl_handle_int(oidp, &mask, 0, req); + if (error || !req->newptr) + return (error); + + if (logical_cpus_mask != 0 && + (mask & logical_cpus_mask) == logical_cpus_mask) + hlt_logical_cpus = 1; + else + hlt_logical_cpus = 0; + + if ((mask & all_cpus) == all_cpus) + mask &= ~(1<<0); + hlt_cpus_mask = mask; + return (error); +} +SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_hlt_cpus, "IU", + "Bitmap of CPUs to halt. 101 (binary) will halt CPUs 0 and 2."); + +static int +sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS) +{ + int disable, error; + + disable = hlt_logical_cpus; + error = sysctl_handle_int(oidp, &disable, 0, req); + if (error || !req->newptr) + return (error); + + if (disable) + hlt_cpus_mask |= logical_cpus_mask; + else + hlt_cpus_mask &= ~logical_cpus_mask; + + if ((hlt_cpus_mask & all_cpus) == all_cpus) + hlt_cpus_mask &= ~(1<<0); + + hlt_logical_cpus = disable; + return (error); +} + +static void +cpu_hlt_setup(void *dummy __unused) +{ + + if (logical_cpus_mask != 0) { + TUNABLE_INT_FETCH("machdep.hlt_logical_cpus", + &hlt_logical_cpus); + sysctl_ctx_init(&logical_cpu_clist); + SYSCTL_ADD_PROC(&logical_cpu_clist, + SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, + "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0, + sysctl_hlt_logical_cpus, "IU", ""); + SYSCTL_ADD_UINT(&logical_cpu_clist, + SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, + "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD, + &logical_cpus_mask, 0, ""); + + if (hlt_logical_cpus) + hlt_cpus_mask |= logical_cpus_mask; + } +} +SYSINIT(cpu_hlt, SI_SUB_SMP, 
SI_ORDER_ANY, cpu_hlt_setup, NULL); + +int +mp_grab_cpu_hlt(void) +{ + u_int mask = PCPU_GET(cpumask); +#ifdef MP_WATCHDOG + u_int cpuid = PCPU_GET(cpuid); +#endif + int retval; + +#ifdef MP_WATCHDOG + ap_watchdog(cpuid); +#endif + + retval = mask & hlt_cpus_mask; + while (mask & hlt_cpus_mask) + __asm __volatile("sti; hlt" : : : "memory"); + return (retval); +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mptable.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mptable.c new file mode 100644 index 0000000000..2f0aff0055 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/mptable.c @@ -0,0 +1,974 @@ +/*- + * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org> + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/mptable.c,v 1.235.2.1 2004/09/28 16:24:09 jhb Exp $"); + +#include "opt_mptable_force_htt.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include <machine/apicreg.h> +#include <machine/frame.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> +#include <machine/md_var.h> +#include <machine/mptable.h> +#include <machine/specialreg.h> + +#include <dev/pci/pcivar.h> + +/* string defined by the Intel MP Spec as identifying the MP table */ +#define MP_SIG 0x5f504d5f /* _MP_ */ + +#define NAPICID 32 /* Max number of APIC's */ + +#ifdef PC98 +#define BIOS_BASE (0xe8000) +#define BIOS_SIZE (0x18000) +#else +#define BIOS_BASE (0xf0000) +#define BIOS_SIZE (0x10000) +#endif +#define BIOS_COUNT (BIOS_SIZE/4) + +typedef void mptable_entry_handler(u_char *entry, void *arg); + +static basetable_entry basetable_entry_types[] = +{ + {0, 20, "Processor"}, + {1, 8, "Bus"}, + {2, 8, "I/O APIC"}, + {3, 8, "I/O INT"}, + {4, 8, "Local INT"} +}; + +typedef struct BUSDATA { + u_char bus_id; + enum busTypes bus_type; +} bus_datum; + +typedef struct INTDATA { + u_char int_type; + u_short int_flags; + u_char src_bus_id; + u_char src_bus_irq; + u_char dst_apic_id; + u_char dst_apic_int; + u_char int_vector; +} io_int, local_int; + +typedef struct BUSTYPENAME { + u_char type; + char name[7]; +} bus_type_name; + +/* From MP spec v1.4, table 4-8. 
*/ +static bus_type_name bus_type_table[] = +{ + {UNKNOWN_BUSTYPE, "CBUS "}, + {UNKNOWN_BUSTYPE, "CBUSII"}, + {EISA, "EISA "}, + {UNKNOWN_BUSTYPE, "FUTURE"}, + {UNKNOWN_BUSTYPE, "INTERN"}, + {ISA, "ISA "}, + {UNKNOWN_BUSTYPE, "MBI "}, + {UNKNOWN_BUSTYPE, "MBII "}, + {MCA, "MCA "}, + {UNKNOWN_BUSTYPE, "MPI "}, + {UNKNOWN_BUSTYPE, "MPSA "}, + {UNKNOWN_BUSTYPE, "NUBUS "}, + {PCI, "PCI "}, + {UNKNOWN_BUSTYPE, "PCMCIA"}, + {UNKNOWN_BUSTYPE, "TC "}, + {UNKNOWN_BUSTYPE, "VL "}, + {UNKNOWN_BUSTYPE, "VME "}, + {UNKNOWN_BUSTYPE, "XPRESS"} +}; + +/* From MP spec v1.4, table 5-1. */ +static int default_data[7][5] = +{ +/* nbus, id0, type0, id1, type1 */ + {1, 0, ISA, 255, NOBUS}, + {1, 0, EISA, 255, NOBUS}, + {1, 0, EISA, 255, NOBUS}, + {1, 0, MCA, 255, NOBUS}, + {2, 0, ISA, 1, PCI}, + {2, 0, EISA, 1, PCI}, + {2, 0, MCA, 1, PCI} +}; + +struct pci_probe_table_args { + u_char bus; + u_char found; +}; + +struct pci_route_interrupt_args { + u_char bus; /* Source bus. */ + u_char irq; /* Source slot:pin. */ + int vector; /* Return value. 
*/ +}; + +static mpfps_t mpfps; +static mpcth_t mpct; +static void *ioapics[NAPICID]; +static bus_datum *busses; +static int mptable_nioapics, mptable_nbusses, mptable_maxbusid; +static int pci0 = -1; + +MALLOC_DEFINE(M_MPTABLE, "MP Table", "MP Table Items"); + +static enum intr_polarity conforming_polarity(u_char src_bus, + u_char src_bus_irq); +static enum intr_trigger conforming_trigger(u_char src_bus, u_char src_bus_irq); +static enum intr_polarity intentry_polarity(int_entry_ptr intr); +static enum intr_trigger intentry_trigger(int_entry_ptr intr); +static int lookup_bus_type(char *name); +static void mptable_count_items(void); +static void mptable_count_items_handler(u_char *entry, void *arg); +#ifdef MPTABLE_FORCE_HTT +static void mptable_hyperthread_fixup(u_int id_mask); +#endif +static void mptable_parse_apics_and_busses(void); +static void mptable_parse_apics_and_busses_handler(u_char *entry, + void *arg); +static void mptable_parse_ints(void); +static void mptable_parse_ints_handler(u_char *entry, void *arg); +static void mptable_parse_io_int(int_entry_ptr intr); +static void mptable_parse_local_int(int_entry_ptr intr); +static void mptable_pci_probe_table_handler(u_char *entry, void *arg); +static void mptable_pci_route_interrupt_handler(u_char *entry, void *arg); +static void mptable_pci_setup(void); +static int mptable_probe(void); +static int mptable_probe_cpus(void); +static void mptable_probe_cpus_handler(u_char *entry, void *arg __unused); +static void mptable_register(void *dummy); +static int mptable_setup_local(void); +static int mptable_setup_io(void); +static void mptable_walk_table(mptable_entry_handler *handler, void *arg); +static int search_for_sig(u_int32_t target, int count); + +static struct apic_enumerator mptable_enumerator = { + "MPTable", + mptable_probe, + mptable_probe_cpus, + mptable_setup_local, + mptable_setup_io +}; + +/* + * look for the MP spec signature + */ + +static int +search_for_sig(u_int32_t target, int count) +{ + 
int x; + u_int32_t *addr = (u_int32_t *) (KERNBASE + target); + + for (x = 0; x < count; x += 4) + if (addr[x] == MP_SIG) + /* make array index a byte index */ + return (target + (x * sizeof(u_int32_t))); + return (-1); +} + +static int +lookup_bus_type(char *name) +{ + int x; + + for (x = 0; x < MAX_BUSTYPE; ++x) + if (strncmp(bus_type_table[x].name, name, 6) == 0) + return (bus_type_table[x].type); + + return (UNKNOWN_BUSTYPE); +} + +/* + * Look for an Intel MP spec table (ie, SMP capable hardware). + */ +static int +mptable_probe(void) +{ + int x; + u_long segment; + u_int32_t target; + + /* see if EBDA exists */ + if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { + /* search first 1K of EBDA */ + target = (u_int32_t) (segment << 4); + if ((x = search_for_sig(target, 1024 / 4)) >= 0) + goto found; + } else { + /* last 1K of base memory, effective 'top of base' passed in */ + target = (u_int32_t) ((basemem * 1024) - 0x400); + if ((x = search_for_sig(target, 1024 / 4)) >= 0) + goto found; + } + + /* search the BIOS */ + target = (u_int32_t) BIOS_BASE; + if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) + goto found; + + /* nothing found */ + return (ENXIO); + +found: + mpfps = (mpfps_t)(KERNBASE + x); + + /* Map in the configuration table if it exists. 
*/ + if (mpfps->config_type != 0) + mpct = NULL; + else { + if ((uintptr_t)mpfps->pap >= 1024 * 1024) { + printf("%s: Unable to map MP Configuration Table\n", + __func__); + return (ENXIO); + } + mpct = (mpcth_t)(KERNBASE + (uintptr_t)mpfps->pap); + if (mpct->base_table_length + (uintptr_t)mpfps->pap >= + 1024 * 1024) { + printf("%s: Unable to map end of MP Config Table\n", + __func__); + return (ENXIO); + } + if (mpct->signature[0] != 'P' || mpct->signature[1] != 'C' || + mpct->signature[2] != 'M' || mpct->signature[3] != 'P') { + printf("%s: MP Config Table has bad signature: %c%c%c%c\n", + __func__, mpct->signature[0], mpct->signature[1], + mpct->signature[2], mpct->signature[3]); + return (ENXIO); + } + if (bootverbose) + printf( + "MP Configuration Table version 1.%d found at %p\n", + mpct->spec_rev, mpct); + } + + return (-100); +} + +/* + * Run through the MP table enumerating CPUs. + */ +static int +mptable_probe_cpus(void) +{ + u_int cpu_mask; + + /* Is this a pre-defined config? */ + if (mpfps->config_type != 0) { + lapic_create(0, 1); + lapic_create(1, 0); + } else { + cpu_mask = 0; + mptable_walk_table(mptable_probe_cpus_handler, &cpu_mask); +#ifdef MPTABLE_FORCE_HTT + mptable_hyperthread_fixup(cpu_mask); +#endif + } + return (0); +} + +/* + * Initialize the local APIC on the BSP. + */ +static int +mptable_setup_local(void) +{ + + /* Is this a pre-defined config? */ + printf("MPTable: <"); + if (mpfps->config_type != 0) { + lapic_init(DEFAULT_APIC_BASE); + printf("Preset Config %d", mpfps->config_type); + } else { + lapic_init((uintptr_t)mpct->apic_address); + printf("%.*s %.*s", (int)sizeof(mpct->oem_id), mpct->oem_id, + (int)sizeof(mpct->product_id), mpct->product_id); + } + printf(">\n"); + return (0); +} + +/* + * Run through the MP table enumerating I/O APICs. + */ +static int +mptable_setup_io(void) +{ + int i; + u_char byte; + + /* First, we count individual items and allocate arrays. 
*/ + mptable_count_items(); + busses = malloc((mptable_maxbusid + 1) * sizeof(bus_datum), M_MPTABLE, + M_WAITOK); + for (i = 0; i <= mptable_maxbusid; i++) + busses[i].bus_type = NOBUS; + + /* Second, we run through adding I/O APIC's and busses. */ + ioapic_enable_mixed_mode(); + mptable_parse_apics_and_busses(); + + /* Third, we run through the table tweaking interrupt sources. */ + mptable_parse_ints(); + + /* Fourth, we register all the I/O APIC's. */ + for (i = 0; i < NAPICID; i++) + if (ioapics[i] != NULL) + ioapic_register(ioapics[i]); + + /* Fifth, we setup data structures to handle PCI interrupt routing. */ + mptable_pci_setup(); + + /* Finally, we throw the switch to enable the I/O APIC's. */ + if (mpfps->mpfb2 & MPFB2_IMCR_PRESENT) { + outb(0x22, 0x70); /* select IMCR */ + byte = inb(0x23); /* current contents */ + byte |= 0x01; /* mask external INTR */ + outb(0x23, byte); /* disconnect 8259s/NMI */ + } + + return (0); +} + +static void +mptable_register(void *dummy __unused) +{ + + apic_register_enumerator(&mptable_enumerator); +} +SYSINIT(mptable_register, SI_SUB_CPU - 1, SI_ORDER_FIRST, mptable_register, + NULL) + +/* + * Call the handler routine for each entry in the MP config table. 
+ */ +static void +mptable_walk_table(mptable_entry_handler *handler, void *arg) +{ + u_int i; + u_char *entry; + + entry = (u_char *)(mpct + 1); + for (i = 0; i < mpct->entry_count; i++) { + switch (*entry) { + case MPCT_ENTRY_PROCESSOR: + case MPCT_ENTRY_IOAPIC: + case MPCT_ENTRY_BUS: + case MPCT_ENTRY_INT: + case MPCT_ENTRY_LOCAL_INT: + break; + default: + panic("%s: Unknown MP Config Entry %d\n", __func__, + (int)*entry); + } + handler(entry, arg); + entry += basetable_entry_types[*entry].length; + } +} + +static void +mptable_probe_cpus_handler(u_char *entry, void *arg) +{ + proc_entry_ptr proc; + u_int *cpu_mask; + + switch (*entry) { + case MPCT_ENTRY_PROCESSOR: + proc = (proc_entry_ptr)entry; + if (proc->cpu_flags & PROCENTRY_FLAG_EN) { + lapic_create(proc->apic_id, proc->cpu_flags & + PROCENTRY_FLAG_BP); + cpu_mask = (u_int *)arg; + *cpu_mask |= (1 << proc->apic_id); + } + break; + } +} + +static void +mptable_count_items_handler(u_char *entry, void *arg __unused) +{ + io_apic_entry_ptr apic; + bus_entry_ptr bus; + + switch (*entry) { + case MPCT_ENTRY_BUS: + bus = (bus_entry_ptr)entry; + mptable_nbusses++; + if (bus->bus_id > mptable_maxbusid) + mptable_maxbusid = bus->bus_id; + break; + case MPCT_ENTRY_IOAPIC: + apic = (io_apic_entry_ptr)entry; + if (apic->apic_flags & IOAPICENTRY_FLAG_EN) + mptable_nioapics++; + break; + } +} + +/* + * Count items in the table. + */ +static void +mptable_count_items(void) +{ + + /* Is this a pre-defined config? */ + if (mpfps->config_type != 0) { + mptable_nioapics = 1; + switch (mpfps->config_type) { + case 1: + case 2: + case 3: + case 4: + mptable_nbusses = 1; + break; + case 5: + case 6: + case 7: + mptable_nbusses = 2; + break; + default: + panic("Unknown pre-defined MP Table config type %d", + mpfps->config_type); + } + mptable_maxbusid = mptable_nbusses - 1; + } else + mptable_walk_table(mptable_count_items_handler, NULL); +} + +/* + * Add a bus or I/O APIC from an entry in the table. 
+ */ +static void +mptable_parse_apics_and_busses_handler(u_char *entry, void *arg __unused) +{ + io_apic_entry_ptr apic; + bus_entry_ptr bus; + enum busTypes bus_type; + int i; + + + switch (*entry) { + case MPCT_ENTRY_BUS: + bus = (bus_entry_ptr)entry; + bus_type = lookup_bus_type(bus->bus_type); + if (bus_type == UNKNOWN_BUSTYPE) { + printf("MPTable: Unknown bus %d type \"", bus->bus_id); + for (i = 0; i < 6; i++) + printf("%c", bus->bus_type[i]); + printf("\"\n"); + } + busses[bus->bus_id].bus_id = bus->bus_id; + busses[bus->bus_id].bus_type = bus_type; + break; + case MPCT_ENTRY_IOAPIC: + apic = (io_apic_entry_ptr)entry; + if (!(apic->apic_flags & IOAPICENTRY_FLAG_EN)) + break; + if (apic->apic_id >= NAPICID) + panic("%s: I/O APIC ID %d too high", __func__, + apic->apic_id); + if (ioapics[apic->apic_id] != NULL) + panic("%s: Double APIC ID %d", __func__, + apic->apic_id); + ioapics[apic->apic_id] = ioapic_create( + (uintptr_t)apic->apic_address, apic->apic_id, -1); + break; + default: + break; + } +} + +/* + * Enumerate I/O APIC's and busses. + */ +static void +mptable_parse_apics_and_busses(void) +{ + + /* Is this a pre-defined config? */ + if (mpfps->config_type != 0) { + ioapics[0] = ioapic_create(DEFAULT_IO_APIC_BASE, 2, 0); + busses[0].bus_id = 0; + busses[0].bus_type = default_data[mpfps->config_type][2]; + if (mptable_nbusses > 1) { + busses[1].bus_id = 1; + busses[1].bus_type = + default_data[mpfps->config_type][4]; + } + } else + mptable_walk_table(mptable_parse_apics_and_busses_handler, + NULL); +} + +/* + * Determine conforming polarity for a given bus type. 
+ */ +static enum intr_polarity +conforming_polarity(u_char src_bus, u_char src_bus_irq) +{ + + KASSERT(src_bus <= mptable_maxbusid, ("bus id %d too large", src_bus)); + switch (busses[src_bus].bus_type) { + case ISA: + case EISA: + return (INTR_POLARITY_HIGH); + case PCI: + return (INTR_POLARITY_LOW); + default: + panic("%s: unknown bus type %d", __func__, + busses[src_bus].bus_type); + } +} + +/* + * Determine conforming trigger for a given bus type. + */ +static enum intr_trigger +conforming_trigger(u_char src_bus, u_char src_bus_irq) +{ + + KASSERT(src_bus <= mptable_maxbusid, ("bus id %d too large", src_bus)); + switch (busses[src_bus].bus_type) { + case ISA: + return (INTR_TRIGGER_EDGE); + case PCI: + return (INTR_TRIGGER_LEVEL); +#if !defined(PC98) && !defined(XEN) + case EISA: + KASSERT(src_bus_irq < 16, ("Invalid EISA IRQ %d", src_bus_irq)); + return (elcr_read_trigger(src_bus_irq)); +#endif + default: + panic("%s: unknown bus type %d", __func__, + busses[src_bus].bus_type); + } +} + +static enum intr_polarity +intentry_polarity(int_entry_ptr intr) +{ + + switch (intr->int_flags & INTENTRY_FLAGS_POLARITY) { + case INTENTRY_FLAGS_POLARITY_CONFORM: + return (conforming_polarity(intr->src_bus_id, + intr->src_bus_irq)); + case INTENTRY_FLAGS_POLARITY_ACTIVEHI: + return (INTR_POLARITY_HIGH); + case INTENTRY_FLAGS_POLARITY_ACTIVELO: + return (INTR_POLARITY_LOW); + default: + panic("Bogus interrupt flags"); + } +} + +static enum intr_trigger +intentry_trigger(int_entry_ptr intr) +{ + + switch (intr->int_flags & INTENTRY_FLAGS_TRIGGER) { + case INTENTRY_FLAGS_TRIGGER_CONFORM: + return (conforming_trigger(intr->src_bus_id, + intr->src_bus_irq)); + case INTENTRY_FLAGS_TRIGGER_EDGE: + return (INTR_TRIGGER_EDGE); + case INTENTRY_FLAGS_TRIGGER_LEVEL: + return (INTR_TRIGGER_LEVEL); + default: + panic("Bogus interrupt flags"); + } +} + +/* + * Parse an interrupt entry for an I/O interrupt routed to a pin on an I/O APIC. 
+ */ +static void +mptable_parse_io_int(int_entry_ptr intr) +{ + void *ioapic; + u_int pin; + + if (intr->dst_apic_id == 0xff) { + printf("MPTable: Ignoring global interrupt entry for pin %d\n", + intr->dst_apic_int); + return; + } + if (intr->dst_apic_id >= NAPICID) { + printf("MPTable: Ignoring interrupt entry for ioapic%d\n", + intr->dst_apic_id); + return; + } + ioapic = ioapics[intr->dst_apic_id]; + if (ioapic == NULL) { + printf( + "MPTable: Ignoring interrupt entry for missing ioapic%d\n", + intr->dst_apic_id); + return; + } + pin = intr->dst_apic_int; + switch (intr->int_type) { + case INTENTRY_TYPE_INT: + switch (busses[intr->src_bus_id].bus_type) { + case NOBUS: + panic("interrupt from missing bus"); + case ISA: + case EISA: + if (busses[intr->src_bus_id].bus_type == ISA) + ioapic_set_bus(ioapic, pin, APIC_BUS_ISA); + else + ioapic_set_bus(ioapic, pin, APIC_BUS_EISA); + if (intr->src_bus_irq == pin) + break; + ioapic_remap_vector(ioapic, pin, intr->src_bus_irq); + if (ioapic_get_vector(ioapic, intr->src_bus_irq) == + intr->src_bus_irq) + ioapic_disable_pin(ioapic, intr->src_bus_irq); + break; + case PCI: + ioapic_set_bus(ioapic, pin, APIC_BUS_PCI); + break; + default: + ioapic_set_bus(ioapic, pin, APIC_BUS_UNKNOWN); + break; + } + break; + case INTENTRY_TYPE_NMI: + ioapic_set_nmi(ioapic, pin); + break; + case INTENTRY_TYPE_SMI: + ioapic_set_smi(ioapic, pin); + break; + case INTENTRY_TYPE_EXTINT: + ioapic_set_extint(ioapic, pin); + break; + default: + panic("%s: invalid interrupt entry type %d\n", __func__, + intr->int_type); + } + if (intr->int_type == INTENTRY_TYPE_INT || + (intr->int_flags & INTENTRY_FLAGS_TRIGGER) != + INTENTRY_FLAGS_TRIGGER_CONFORM) + ioapic_set_triggermode(ioapic, pin, intentry_trigger(intr)); + if (intr->int_type == INTENTRY_TYPE_INT || + (intr->int_flags & INTENTRY_FLAGS_POLARITY) != + INTENTRY_FLAGS_POLARITY_CONFORM) + ioapic_set_polarity(ioapic, pin, intentry_polarity(intr)); +} + +/* + * Parse an interrupt entry for a local APIC 
LVT pin. + */ +static void +mptable_parse_local_int(int_entry_ptr intr) +{ + u_int apic_id, pin; + + if (intr->dst_apic_id == 0xff) + apic_id = APIC_ID_ALL; + else + apic_id = intr->dst_apic_id; + if (intr->dst_apic_int == 0) + pin = LVT_LINT0; + else + pin = LVT_LINT1; + switch (intr->int_type) { + case INTENTRY_TYPE_INT: +#if 1 + printf( + "MPTable: Ignoring vectored local interrupt for LINTIN%d vector %d\n", + intr->dst_apic_int, intr->src_bus_irq); + return; +#else + lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_FIXED); + break; +#endif + case INTENTRY_TYPE_NMI: + lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_NMI); + break; + case INTENTRY_TYPE_SMI: + lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_SMI); + break; + case INTENTRY_TYPE_EXTINT: + lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_EXTINT); + break; + default: + panic("%s: invalid interrupt entry type %d\n", __func__, + intr->int_type); + } + if ((intr->int_flags & INTENTRY_FLAGS_TRIGGER) != + INTENTRY_FLAGS_TRIGGER_CONFORM) + lapic_set_lvt_triggermode(apic_id, pin, + intentry_trigger(intr)); + if ((intr->int_flags & INTENTRY_FLAGS_POLARITY) != + INTENTRY_FLAGS_POLARITY_CONFORM) + lapic_set_lvt_polarity(apic_id, pin, intentry_polarity(intr)); +} + +/* + * Parse interrupt entries. + */ +static void +mptable_parse_ints_handler(u_char *entry, void *arg __unused) +{ + int_entry_ptr intr; + + intr = (int_entry_ptr)entry; + switch (*entry) { + case MPCT_ENTRY_INT: + mptable_parse_io_int(intr); + break; + case MPCT_ENTRY_LOCAL_INT: + mptable_parse_local_int(intr); + break; + } +} + +/* + * Configure the interrupt pins + */ +static void +mptable_parse_ints(void) +{ + + /* Is this a pre-defined config? */ + if (mpfps->config_type != 0) { + /* Configure LINT pins. */ + lapic_set_lvt_mode(APIC_ID_ALL, LVT_LINT0, APIC_LVT_DM_EXTINT); + lapic_set_lvt_mode(APIC_ID_ALL, LVT_LINT1, APIC_LVT_DM_NMI); + + /* Configure I/O APIC pins. 
*/ + if (mpfps->config_type != 7) + ioapic_set_extint(ioapics[0], 0); + else + ioapic_disable_pin(ioapics[0], 0); + if (mpfps->config_type != 2) + ioapic_remap_vector(ioapics[0], 2, 0); + else + ioapic_disable_pin(ioapics[0], 2); + if (mpfps->config_type == 2) + ioapic_disable_pin(ioapics[0], 13); + } else + mptable_walk_table(mptable_parse_ints_handler, NULL); +} + +#ifdef MPTABLE_FORCE_HTT +/* + * Perform a hyperthreading "fix-up" to enumerate any logical CPU's + * that aren't already listed in the table. + * + * XXX: We assume that all of the physical CPUs in the + * system have the same number of logical CPUs. + * + * XXX: We assume that APIC ID's are allocated such that + * the APIC ID's for a physical processor are aligned + * with the number of logical CPU's in the processor. + */ +static void +mptable_hyperthread_fixup(u_int id_mask) +{ + u_int i, id, logical_cpus; + + /* Nothing to do if there is no HTT support. */ + if ((cpu_feature & CPUID_HTT) == 0) + return; + logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; + if (logical_cpus <= 1) + return; + + /* + * For each APIC ID of a CPU that is set in the mask, + * scan the other candidate APIC ID's for this + * physical processor. If any of those ID's are + * already in the table, then kill the fixup. + */ + for (id = 0; id < NAPICID; id++) { + if ((id_mask & 1 << id) == 0) + continue; + /* First, make sure we are on a logical_cpus boundary. */ + if (id % logical_cpus != 0) + return; + for (i = id + 1; i < id + logical_cpus; i++) + if ((id_mask & 1 << i) != 0) + return; + } + + /* + * Ok, the ID's checked out, so perform the fixup by + * adding the logical CPUs. 
+ */ + while ((id = ffs(id_mask)) != 0) { + id--; + for (i = id + 1; i < id + logical_cpus; i++) { + if (bootverbose) + printf( + "MPTable: Adding logical CPU %d from main CPU %d\n", + i, id); + lapic_create(i, 0); + } + id_mask &= ~(1 << id); + } +} +#endif /* MPTABLE_FORCE_HTT */ + +/* + * Support code for routing PCI interrupts using the MP Table. + */ +static void +mptable_pci_setup(void) +{ + int i; + + /* + * Find the first pci bus and call it 0. Panic if pci0 is not + * bus zero and there are multiple PCI busses. + */ + for (i = 0; i <= mptable_maxbusid; i++) + if (busses[i].bus_type == PCI) { + if (pci0 == -1) + pci0 = i; + else if (pci0 != 0) + panic( + "MPTable contains multiple PCI busses but no PCI bus 0"); + } +} + +static void +mptable_pci_probe_table_handler(u_char *entry, void *arg) +{ + struct pci_probe_table_args *args; + int_entry_ptr intr; + + if (*entry != MPCT_ENTRY_INT) + return; + intr = (int_entry_ptr)entry; + args = (struct pci_probe_table_args *)arg; + KASSERT(args->bus <= mptable_maxbusid, + ("bus %d is too big", args->bus)); + KASSERT(busses[args->bus].bus_type == PCI, ("probing for non-PCI bus")); + if (intr->src_bus_id == args->bus) + args->found = 1; +} + +int +mptable_pci_probe_table(int bus) +{ + struct pci_probe_table_args args; + + if (bus < 0) + return (EINVAL); + if (pci0 == -1 || pci0 + bus > mptable_maxbusid) + return (ENXIO); + if (busses[pci0 + bus].bus_type != PCI) + return (ENXIO); + args.bus = pci0 + bus; + args.found = 0; + mptable_walk_table(mptable_pci_probe_table_handler, &args); + if (args.found == 0) + return (ENXIO); + return (0); +} + +static void +mptable_pci_route_interrupt_handler(u_char *entry, void *arg) +{ + struct pci_route_interrupt_args *args; + int_entry_ptr intr; + int vector; + + if (*entry != MPCT_ENTRY_INT) + return; + intr = (int_entry_ptr)entry; + args = (struct pci_route_interrupt_args *)arg; + if (intr->src_bus_id != args->bus || intr->src_bus_irq != args->irq) + return; + + /* Make sure the 
APIC maps to a known APIC. */ + KASSERT(ioapics[intr->dst_apic_id] != NULL, + ("No I/O APIC %d to route interrupt to", intr->dst_apic_id)); + + /* + * Look up the vector for this APIC / pin combination. If we + * have previously matched an entry for this PCI IRQ but it + * has the same vector as this entry, just return. Otherwise, + * we use the vector for this APIC / pin combination. + */ + vector = ioapic_get_vector(ioapics[intr->dst_apic_id], + intr->dst_apic_int); + if (args->vector == vector) + return; + KASSERT(args->vector == -1, + ("Multiple IRQs for PCI interrupt %d.%d.INT%c: %d and %d\n", + args->bus, args->irq >> 2, 'A' + (args->irq & 0x3), args->vector, + vector)); + args->vector = vector; +} + +int +mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin) +{ + struct pci_route_interrupt_args args; + int slot; + + /* Like ACPI, pin numbers are 0-3, not 1-4. */ + pin--; + KASSERT(pci0 != -1, ("do not know how to route PCI interrupts")); + args.bus = pci_get_bus(dev) + pci0; + slot = pci_get_slot(dev); + + /* + * PCI interrupt entries in the MP Table encode both the slot and + * pin into the IRQ with the pin being the two least significant + * bits, the slot being the next five bits, and the most significant + * bit being reserved. + */ + args.irq = slot << 2 | pin; + args.vector = -1; + mptable_walk_table(mptable_pci_route_interrupt_handler, &args); + if (args.vector < 0) { + device_printf(pcib, "unable to route slot %d INT%c\n", slot, + 'A' + pin); + return (PCI_INVALID_IRQ); + } + if (bootverbose) + device_printf(pcib, "slot %d INT%c routed to irq %d\n", slot, + 'A' + pin, args.vector); + return (args.vector); +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/pmap.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/pmap.c new file mode 100644 index 0000000000..ee61e80ed9 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/pmap.c @@ -0,0 +1,3381 @@ +/*- + * Copyright (c) 1991 Regents of the University of California. 
+ * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + */ +/*- + * Copyright (c) 2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jake Burkholder, + * Safeport Network Services, and Network Associates Laboratories, the + * Security Research Division of Network Associates, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA + * CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/pmap.c,v 1.494.2.6 2004/10/10 19:08:00 alc Exp $"); + +/* + * Manages physical address maps. + * XEN NOTES: page table entries (pt_entry_t) and + * page directory entries (pd_entry_t) contain machine + * addresses and not physical addresses. Use PT_GET() before + * dereferencing these structures to convert them into a + * physical address. Use the PT_SET_VA operations to commit + * page changes back to XEN. PT_SET_VA_MA should be used with + * great care! + * + * + * In addition to hardware address maps, this + * module is called upon to provide software-use-only + * maps which may or may not be stored in the same + * form as hardware maps. These pseudo-maps are + * used to store intermediate results from copy + * operations to and from address spaces. + * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. + * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. 
This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. + */ + +#include "opt_cpu.h" +#include "opt_pmap.h" +#include "opt_msgbuf.h" +#include "opt_kstack_pages.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sx.h> +#include <sys/user.h> +#include <sys/vmmeter.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#ifdef SMP +#include <sys/smp.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> + +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#ifdef SMP +#include <machine/smp.h> +#endif + +#include <machine/xenfunc.h> + +#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif +#if defined(CPU_DISABLE_SSE) +#undef CPU_ENABLE_SSE +#endif + +#ifndef PMAP_SHPGPERPROC +#define PMAP_SHPGPERPROC 200 +#endif + +#if defined(DIAGNOSTIC) +#define PMAP_DIAGNOSTIC +#endif + +#define MINPV 2048 + +#if !defined(PMAP_DIAGNOSTIC) +#define PMAP_INLINE __inline +#else +#define PMAP_INLINE +#endif + +/* + * Get PDEs and PTEs for user/kernel address space + */ +#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) +#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) + +#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) +#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) +#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) +#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) +#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) + +#if 0 +#define pmap_pte_set_w(pte, v) ((v) ? 
atomic_set_int((u_int *)(pte), PG_W) : \ + atomic_clear_int((u_int *)(pte), PG_W)) +#else +#define pmap_pte_set_w(pte, v) { \ + if (v) \ + PT_SET_VA_MA(pte, *pte | PG_W, TRUE); \ + else \ + PT_SET_VA_MA(pte, *pte & ~PG_W, TRUE); \ +} +#endif + +struct pmap kernel_pmap_store; +LIST_HEAD(pmaplist, pmap); +static struct pmaplist allpmaps; +static struct mtx allpmaps_lock; + +vm_paddr_t avail_end; /* PA of last available physical page */ +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ +static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ +int pgeflag = 0; /* PG_G or-in */ +int pseflag = 0; /* PG_PS or-in */ + +static int nkpt; +vm_offset_t kernel_vm_end; +extern u_int32_t KERNend; + +#ifdef PAE +static uma_zone_t pdptzone; +#endif + +/* + * Data for the pv entry allocation mechanism + */ +static uma_zone_t pvzone; +static struct vm_object pvzone_obj; +static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +int pmap_pagedaemon_waken; + +/* + * All those kernel PT submaps that BSD is so fond of + */ +pt_entry_t *CMAP1 = 0; +static pt_entry_t *CMAP2, *CMAP3; +caddr_t CADDR1 = 0, ptvmmap = 0; +static caddr_t CADDR2, CADDR3; +static struct mtx CMAPCADDR12_lock; +struct msgbuf *msgbufp = 0; + +/* + * Crashdump maps. 
+ */ +static caddr_t crashdumpmap; + +#ifdef SMP +extern pt_entry_t *SMPpt; +#endif +static pt_entry_t *PMAP1 = 0, *PMAP2; +static pt_entry_t *PADDR1 = 0, *PADDR2; +#ifdef SMP +static int PMAP1cpu; +static int PMAP1changedcpu; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, + &PMAP1changedcpu, 0, + "Number of times pmap_pte_quick changed CPU with same PMAP1"); +#endif +static int PMAP1changed; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, + &PMAP1changed, 0, + "Number of times pmap_pte_quick changed PMAP1"); +static int PMAP1unchanged; +SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, + &PMAP1unchanged, 0, + "Number of times pmap_pte_quick didn't change PMAP1"); +static struct mtx PMAP2mutex; + +static PMAP_INLINE void free_pv_entry(pv_entry_t pv); +static pv_entry_t get_pv_entry(void); +static void pmap_clear_ptes(vm_page_t m, int bit); + +static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); +static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); +static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, + vm_offset_t va); +static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); + +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); + +static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); +static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m); +static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); +static void pmap_pte_release(pt_entry_t *pte); +static int pmap_unuse_pt(pmap_t, vm_offset_t); +static vm_offset_t pmap_kmem_choose(vm_offset_t addr); +#ifdef PAE +static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); +#endif + +CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); +CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); + +#ifndef DEBUG +#define DEBUG +#endif +#ifdef PMAP_DEBUG +static void pmap_dec_ref(unsigned long ma); +static void pmap_mark_privileged(unsigned long pa); +static void 
pmap_mark_unprivileged(unsigned long pa); +static void pmap_dec_ref_page(vm_page_t m); +int pmap_pid_dump(int pid); +#endif +/* + * Move the kernel virtual free pointer to the next + * 4MB. This is used to help improve performance + * by using a large (4MB) page for much of the kernel + * (.text, .data, .bss) + */ +static vm_offset_t +pmap_kmem_choose(vm_offset_t addr) +{ + vm_offset_t newaddr = addr; + +#ifndef DISABLE_PSE + if (cpu_feature & CPUID_PSE) + newaddr = (addr + PDRMASK) & ~PDRMASK; +#endif + return newaddr; +} + +/* + * Bootstrap the system enough to run with virtual memory. + * + * On the i386 this is called after mapping has already been enabled + * and just syncs the pmap module with what has already been done. + * [We can't call it easily with mapping off since the kernel is not + * mapped with PA == VA, hence we would have to relocate every address + * from the linked base (virtual) address "KERNBASE" to the actual + * (physical) address starting relative to 0] + */ +void +pmap_bootstrap(firstaddr, loadaddr) + vm_paddr_t firstaddr; + vm_paddr_t loadaddr; +{ + vm_offset_t va; + pt_entry_t *pte, *unused; + int i; + + /* + * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too + * large. It should instead be correctly calculated in locore.s and + * not based on 'first' (which is a physical address, not a virtual + * address, for the start of unused physical memory). The kernel + * page tables are NOT double mapped and thus should not be included + * in this calculation. + */ + virtual_avail = (vm_offset_t) KERNBASE + firstaddr; + virtual_avail = pmap_kmem_choose(virtual_avail); + + virtual_end = VM_MAX_KERNEL_ADDRESS; + + /* + * Initialize the kernel pmap (which is statically allocated). 
+ */ + PMAP_LOCK_INIT(kernel_pmap); + kernel_pmap->pm_pdir = (pd_entry_t *) xen_start_info->pt_base; +#ifdef PAE + kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); +#endif + kernel_pmap->pm_active = -1; /* don't allow deactivation */ + TAILQ_INIT(&kernel_pmap->pm_pvlist); + LIST_INIT(&allpmaps); + mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + nkpt = NKPT; + + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +#define SYSMAP(c, p, v, n) \ + v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); + + va = virtual_avail; + pte = vtopte(va); + + /* + * CMAP1/CMAP2 are used for zeroing and copying pages. + * CMAP3 is used for the idle process page zeroing. + */ + SYSMAP(caddr_t, CMAP1, CADDR1, 1) + SYSMAP(caddr_t, CMAP2, CADDR2, 1) + SYSMAP(caddr_t, CMAP3, CADDR3, 1) + PT_CLEAR_VA(CMAP3, TRUE); + + mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF); + + /* + * Crashdump maps. + */ + SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) + + /* + * ptvmmap is used for reading arbitrary physical pages via /dev/mem. + */ + SYSMAP(caddr_t, unused, ptvmmap, 1) + + /* + * msgbufp is used to map the system message buffer. + */ + SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) + + /* + * ptemap is used for pmap_pte_quick + */ + SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); + SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); + + mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); + + virtual_avail = va; + PT_CLEAR_VA(CMAP1, FALSE); + PT_CLEAR_VA(CMAP2, FALSE); + + for (i = 0; i < NKPT; i++) + PT_CLEAR_VA(&PTD[i], FALSE); + PT_UPDATES_FLUSH(); +#ifdef XEN_UNNEEDED + /* Turn on PG_G on kernel page(s) */ + pmap_set_pg(); +#endif +} + +/* + * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 
+ */ +void +pmap_set_pg(void) +{ + pd_entry_t pdir; + pt_entry_t *pte; + vm_offset_t va, endva; + int i; + + if (pgeflag == 0) + return; + panic("this won't work"); + i = KERNLOAD/NBPDR; + endva = KERNBASE + KERNend; + + if (pseflag) { + va = KERNBASE + KERNLOAD; + while (va < endva) { + pdir = kernel_pmap->pm_pdir[KPTDI+i]; + pdir |= pgeflag; + kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; + invltlb(); /* Play it safe, invltlb() every time */ + i++; + va += NBPDR; + } + } else { + va = (vm_offset_t)btext; + while (va < endva) { + pte = vtopte(va); + if (*pte) + *pte |= pgeflag; + invltlb(); /* Play it safe, invltlb() every time */ + va += PAGE_SIZE; + } + } +} + +#ifdef PAE + +static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); + +static void * +pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) +{ + *flags = UMA_SLAB_PRIV; + return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, + 1, 0)); +} +#endif + +/* + * Initialize the pmap module. + * Called by vm_init, to initialize any structures that the pmap + * system needs to map virtual memory. + * pmap_init has been enhanced to support in a fairly consistant + * way, discontiguous physical memory. + */ +void +pmap_init(void) +{ + int i; + + /* + * Allocate memory for random pmap data structures. Includes the + * pv_head_table. + */ + + for(i = 0; i < vm_page_array_size; i++) { + vm_page_t m; + + m = &vm_page_array[i]; + TAILQ_INIT(&m->md.pv_list); + m->md.pv_list_count = 0; + } + + /* + * init the pv free list + */ + pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); + uma_prealloc(pvzone, MINPV); + +#ifdef PAE + pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, + NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); +#endif + + /* + * Now it is safe to enable pv_table recording. 
+ */ + pmap_initialized = TRUE; +} + +/* + * Initialize the address space (zone) for the pv_entries. Set a + * high water mark so that the system can recover from excessive + * numbers of pv entries. + */ +void +pmap_init2() +{ + int shpgperproc = PMAP_SHPGPERPROC; + + TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); + pv_entry_max = shpgperproc * maxproc + vm_page_array_size; + TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); + pv_entry_high_water = 9 * (pv_entry_max / 10); + uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); +} + + +/*************************************************** + * Low level helper routines..... + ***************************************************/ + +#if defined(PMAP_DIAGNOSTIC) + +/* + * This code checks for non-writeable/modified pages. + * This should be an invalid condition. + */ +static int +pmap_nw_modified(pt_entry_t ptea) +{ + int pte; + + pte = (int) ptea; + + if ((pte & (PG_M|PG_RW)) == PG_M) + return 1; + else + return 0; +} +#endif + + +/* + * this routine defines the region(s) of memory that should + * not be tested for the modified bit. + */ +static PMAP_INLINE int +pmap_track_modified(vm_offset_t va) +{ + if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) + return 1; + else + return 0; +} + +#ifdef I386_CPU +/* + * i386 only has "invalidate everything" and no SMP to worry about. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} +#else /* !I386_CPU */ +#ifdef SMP +/* + * For SMP, these functions have to use the IPI mechanism for coherence. 
+ */ +void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + u_int cpumask; + u_int other_cpus; + + if (smp_started) { + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_rv_mtx); + } else + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. + * XXX we may need to hold schedlock to get a coherent pm_active + * XXX critical sections disable interrupts again + */ + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + invlpg(va); + smp_invlpg(va); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invlpg(va); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg(pmap->pm_active & other_cpus, va); + } + if (smp_started) + mtx_unlock_spin(&smp_rv_mtx); + else + critical_exit(); +} + +void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + u_int cpumask; + u_int other_cpus; + vm_offset_t addr; + + if (smp_started) { + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_rv_mtx); + } else + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. 
+ * XXX we may need to hold schedlock to get a coherent pm_active + * XXX critical sections disable interrupts again + */ + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + smp_invlpg_range(sva, eva); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + if (pmap->pm_active & other_cpus) + smp_masked_invlpg_range(pmap->pm_active & other_cpus, + sva, eva); + } + if (smp_started) + mtx_unlock_spin(&smp_rv_mtx); + else + critical_exit(); +} + +void +pmap_invalidate_all(pmap_t pmap) +{ + u_int cpumask; + u_int other_cpus; + + if (smp_started) { + if (!(read_eflags() & PSL_I)) + panic("%s: interrupts disabled", __func__); + mtx_lock_spin(&smp_rv_mtx); + } else + critical_enter(); + /* + * We need to disable interrupt preemption but MUST NOT have + * interrupts disabled here. + * XXX we may need to hold schedlock to get a coherent pm_active + * XXX critical sections disable interrupts again + */ + if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { + invltlb(); + smp_invltlb(); + } else { + cpumask = PCPU_GET(cpumask); + other_cpus = PCPU_GET(other_cpus); + if (pmap->pm_active & cpumask) + invltlb(); + if (pmap->pm_active & other_cpus) + smp_masked_invltlb(pmap->pm_active & other_cpus); + } + if (smp_started) + mtx_unlock_spin(&smp_rv_mtx); + else + critical_exit(); +} +#else /* !SMP */ +/* + * Normal, non-SMP, 486+ invalidation functions. + * We inline these within pmap.c for speed. 
+ */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invlpg(va); + PT_UPDATES_FLUSH(); + +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + + if (pmap == kernel_pmap || pmap->pm_active) + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + PT_UPDATES_FLUSH(); + +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + if (pmap == kernel_pmap || pmap->pm_active) + invltlb(); +} +#endif /* !SMP */ +#endif /* !I386_CPU */ + +/* + * Are we current address space or kernel? N.B. We return FALSE when + * a pmap's page table is in use because a kernel thread is borrowing + * it. The borrowed page table can change spontaneously, making any + * dependence on its continued use subject to a race condition. + */ +static __inline int +pmap_is_current(pmap_t pmap) +{ + + return (pmap == kernel_pmap || + (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && + (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); +} + +/* + * If the given pmap is not the current or kernel pmap, the returned pte must + * be released by passing it to pmap_pte_release(). + */ +pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t tmppf, newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) + return (pde); + if (*pde != 0) { + /* are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_lock(&PMAP2mutex); + newpf = PT_GET(pde) & PG_FRAME; + tmppf = PT_GET(PMAP2) & PG_FRAME; + if (tmppf != newpf) { + PT_SET_VA(PMAP2, newpf | PG_V | PG_A, FALSE); + pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); + } + return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); + } + return (0); +} + +/* + * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte + * being NULL. 
+ */ +static __inline void +pmap_pte_release(pt_entry_t *pte) +{ + + if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) + mtx_unlock(&PMAP2mutex); +} + +static __inline void +invlcaddr(void *caddr) +{ +#ifdef I386_CPU + invltlb(); +#else + invlpg((u_int)caddr); +#endif + PT_UPDATES_FLUSH(); +} + +/* + * Super fast pmap_pte routine best used when scanning + * the pv lists. This eliminates many coarse-grained + * invltlb calls. Note that many of the pv list + * scans are across different pmaps. It is very wasteful + * to do an entire invltlb for checking a single mapping. + * + * If the given pmap is not the current pmap, vm_page_queue_mtx + * must be held and curthread pinned to a CPU. + */ +static pt_entry_t * +pmap_pte_quick(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t tmppf, newpf; + pd_entry_t *pde; + + pde = pmap_pde(pmap, va); + if (*pde & PG_PS) + return (pde); + if (*pde != 0) { + /* are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (vtopte(va)); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); + newpf = PT_GET(pde) & PG_FRAME; + tmppf = PT_GET(PMAP1) & PG_FRAME; + if (tmppf != newpf) { + PT_SET_VA(PMAP1, newpf | PG_V | PG_A, TRUE); +#ifdef SMP + PMAP1cpu = PCPU_GET(cpuid); +#endif + invlcaddr(PADDR1); + PMAP1changed++; + } else +#ifdef SMP + if (PMAP1cpu != PCPU_GET(cpuid)) { + PMAP1cpu = PCPU_GET(cpuid); + invlcaddr(PADDR1); + PMAP1changedcpu++; + } else +#endif + PMAP1unchanged++; + return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); + } + return (0); +} + +/* + * Routine: pmap_extract + * Function: + * Extract the physical page address associated + * with the given map/virtual_address pair. 
+ */ +vm_paddr_t +pmap_extract(pmap_t pmap, vm_offset_t va) +{ + vm_paddr_t rtval; + pt_entry_t *pte; + pd_entry_t pde; + + rtval = 0; + PMAP_LOCK(pmap); + pde = PT_GET(&pmap->pm_pdir[va >> PDRSHIFT]); + if (pde != 0) { + if ((pde & PG_PS) != 0) { + rtval = (pde & ~PDRMASK) | (va & PDRMASK); + PMAP_UNLOCK(pmap); + return rtval; + } + pte = pmap_pte(pmap, va); + rtval = (PT_GET(pte) & PG_FRAME) | (va & PAGE_MASK); + pmap_pte_release(pte); + } + PMAP_UNLOCK(pmap); + return (rtval); +} + +/* + * Routine: pmap_extract_and_hold + * Function: + * Atomically extract and hold the physical page + * with the given pmap and virtual address pair + * if that mapping permits the given protection. + */ +vm_page_t +pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pd_entry_t pde; + pt_entry_t pte; + vm_page_t m; + + m = NULL; + vm_page_lock_queues(); + PMAP_LOCK(pmap); + pde = PT_GET(pmap_pde(pmap, va)); + if (pde != 0) { + if (pde & PG_PS) { + panic("4MB pages not currently supported"); + if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { + m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) | + (va & PDRMASK)); + vm_page_hold(m); + } + } else { + sched_pin(); + pte = PT_GET(pmap_pte_quick(pmap, va)); + if (pte != 0 && + ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { + m = PHYS_TO_VM_PAGE(pte & PG_FRAME); + vm_page_hold(m); + } + sched_unpin(); + } + } + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); + return (m); +} + +/*************************************************** + * Low level mapping routines..... + ***************************************************/ + +/* + * Add a wired page to the kva. + * Note: not SMP coherent. + */ +PMAP_INLINE void +pmap_kenter(vm_offset_t va, vm_paddr_t pa) +{ + PT_SET(va, pa | PG_RW | PG_V | pgeflag, TRUE); +} + +/* + * Remove a page from the kernel pagetables. + * Note: not SMP coherent. 
+ */ +PMAP_INLINE void +pmap_kremove(vm_offset_t va) +{ + PT_CLEAR(va, TRUE); +} + +/* + * Used to map a range of physical addresses into kernel + * virtual address space. + * + * The value passed in '*virt' is a suggested virtual address for + * the mapping. Architectures which can support a direct-mapped + * physical to virtual region can return the appropriate address + * within that region, leaving '*virt' unchanged. Other + * architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped + * region. + */ +vm_offset_t +pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) +{ + vm_offset_t va, sva; + pt_entry_t *pte; + + va = sva = *virt; + while (start < end) { + pte = vtopte(va); + PT_SET_VA(pte, start | PG_RW | PG_V | pgeflag, FALSE); + va += PAGE_SIZE; + start += PAGE_SIZE; + } + /* invalidate will flush the update queue */ + pmap_invalidate_range(kernel_pmap, sva, va); + *virt = va; + return (sva); +} + + +/* + * Add a list of wired pages to the kva + * this routine is only used for temporary + * kernel mappings that do not need to have + * page modification or references recorded. + * Note that old mappings are simply written + * over. The page *must* be wired. + * Note: SMP coherent. Uses a ranged shootdown IPI. + */ +void +pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) +{ + vm_offset_t va; + + va = sva; + while (count-- > 0) { + PT_SET(va, VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag, + FALSE); + va += PAGE_SIZE; + m++; + } + /* invalidate will flush the update queue */ + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/* + * This routine tears out page mappings from the + * kernel -- it is meant only for temporary mappings. + * Note: SMP coherent. Uses a ranged shootdown IPI. 
+ */ +void +pmap_qremove(vm_offset_t sva, int count) +{ + vm_offset_t va; + + va = sva; + while (count-- > 0) { + PT_CLEAR(va, FALSE); + va += PAGE_SIZE; + } + /* invalidate will flush the update queue */ + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/*************************************************** + * Page table page management routines..... + ***************************************************/ + +/* + * This routine unholds page table pages, and if the hold count + * drops to zero, then it decrements the wire count. + */ +static PMAP_INLINE int +pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) +{ + + --m->wire_count; + if (m->wire_count == 0) + return _pmap_unwire_pte_hold(pmap, m); + else + return 0; +} + +static int +_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) +{ + vm_offset_t pteva; + /* + * unmap the page table page + */ + xpq_queue_unpin_table(pmap->pm_pdir[m->pindex]); + PT_CLEAR_VA(&pmap->pm_pdir[m->pindex], TRUE); + --pmap->pm_stats.resident_count; + + /* + * Do an invltlb to make the invalidated mapping + * take effect immediately. + */ + pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); + pmap_invalidate_page(pmap, pteva); + + vm_page_free_zero(m); + atomic_subtract_int(&cnt.v_wire_count, 1); + return 1; +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. 
+ */ +static int +pmap_unuse_pt(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t ptepde; + vm_page_t mpte; + + if (va >= VM_MAXUSER_ADDRESS) + return 0; + ptepde = PT_GET(pmap_pde(pmap, va)); + mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); + return pmap_unwire_pte_hold(pmap, mpte); +} + +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + + PMAP_LOCK_INIT(pmap); + pmap->pm_pdir = (pd_entry_t *)(xen_start_info->pt_base); +#ifdef PAE + pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); +#endif + pmap->pm_active = 0; + PCPU_SET(curpmap, pmap); + TAILQ_INIT(&pmap->pm_pvlist); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); +} + +/* + * Initialize a preallocated and zeroed pmap structure, + * such as one in a vmspace structure. + */ +void +pmap_pinit(struct pmap *pmap) +{ + vm_page_t m, ptdpg[NPGPTD]; + vm_paddr_t ma; + static int color; + int i; + + PMAP_LOCK_INIT(pmap); + + /* + * No need to allocate page table space yet but we do need a valid + * page directory table. 
+ */ + if (pmap->pm_pdir == NULL) { + pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, + NBPTD); +#ifdef PAE + pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); + KASSERT(((vm_offset_t)pmap->pm_pdpt & + ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, + ("pmap_pinit: pdpt misaligned")); + KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), + ("pmap_pinit: pdpt above 4g")); +#endif + } + + /* + * allocate the page directory page(s) + */ + for (i = 0; i < NPGPTD;) { + m = vm_page_alloc(NULL, color++, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (m == NULL) + VM_WAIT; + else { + pmap_zero_page(m); + ptdpg[i++] = m; + } + } + + pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); + + for (i = 0; i < NPGPTD; i++) { + if ((ptdpg[i]->flags & PG_ZERO) == 0) + bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); + } + + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + /* Wire in kernel global address entries. */ + /* XXX copies current process, does not fill in MPPTDI */ + bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); +#ifdef SMP + pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; +#endif + + /* install self-referential address mapping entry(s) */ + for (i = 0; i < NPGPTD; i++) { + ma = xpmap_ptom(VM_PAGE_TO_PHYS(ptdpg[i])); + pmap->pm_pdir[PTDPTDI + i] = ma | PG_V | PG_A; +#ifdef PAE + pmap->pm_pdpt[i] = ma | PG_V; +#endif +#ifndef PAE + PT_SET_MA(pmap->pm_pdir, ma | PG_V | PG_A, TRUE); +#else + panic("FIX ME!"); +#endif + xpq_queue_pin_table(ma, XPQ_PIN_L2_TABLE); + } + + pmap->pm_active = 0; + TAILQ_INIT(&pmap->pm_pvlist); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); +} + +/* + * this routine is called if the page table page is not + * mapped correctly. 
+ */ +static vm_page_t +_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags) +{ + vm_paddr_t ptepa; + vm_page_t m; + + KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || + (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, + ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); + + /* + * Allocate a page table page. + */ + if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if (flags & M_WAITOK) { + PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); + VM_WAIT; + vm_page_lock_queues(); + PMAP_LOCK(pmap); + } + + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. + */ + return (NULL); + } + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + + /* + * Map the pagetable page into the process address space, if + * it isn't already there. + */ + + pmap->pm_stats.resident_count++; + + ptepa = VM_PAGE_TO_PHYS(m); + xpq_queue_pin_table(xpmap_ptom(ptepa), XPQ_PIN_L1_TABLE); + PT_SET_VA(&pmap->pm_pdir[ptepindex], + (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE); + + return m; +} + +static vm_page_t +pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) +{ + unsigned ptepindex; + pd_entry_t ptepa; + vm_page_t m; + + KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || + (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, + ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); + + /* + * Calculate pagetable page index + */ + ptepindex = va >> PDRSHIFT; +retry: + /* + * Get the page directory entry + */ + ptepa = PT_GET(&pmap->pm_pdir[ptepindex]); + + /* + * This supports switching from a 4MB page to a + * normal 4K page. + */ + if (ptepa & PG_PS) { + pmap->pm_pdir[ptepindex] = 0; + ptepa = 0; + pmap_invalidate_all(kernel_pmap); + } + + /* + * If the page table page is mapped, we just increment the + * hold count, and activate it. 
+ */ + if (ptepa) { + m = PHYS_TO_VM_PAGE(ptepa); + m->wire_count++; + } else { + /* + * Here if the pte page isn't mapped, or if it has + * been deallocated. + */ + m = _pmap_allocpte(pmap, ptepindex, flags); + if (m == NULL && (flags & M_WAITOK)) + goto retry; + } + return (m); +} + + +/*************************************************** +* Pmap allocation/deallocation routines. + ***************************************************/ + +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. + */ +static u_int *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + + if (PCPU_GET(curpcb)->pcb_cr3 == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (PCPU_GET(curpcb)->pcb_cr3 == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask = PCPU_GET(cpumask); + u_int mask; + register u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&smp_rv_mtx); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } + } + mtx_unlock_spin(&smp_rv_mtx); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. 
For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == PCPU_GET(curpcb)->pcb_cr3) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); + } +} +#endif /* SMP */ + +/* + * Release any resources held by the given physical map. + * Called when a pmap initialized by pmap_pinit is being released. + * Should only be called if the map contains no valid mappings. + */ +void +pmap_release(pmap_t pmap) +{ + vm_page_t m, ptdpg[NPGPTD]; + vm_paddr_t ma; + int i; + + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); + + pmap_lazyfix(pmap); + mtx_lock_spin(&allpmaps_lock); + LIST_REMOVE(pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + + for (i = 0; i < NPGPTD; i++) + ptdpg[i] = PHYS_TO_VM_PAGE(PT_GET(&pmap->pm_pdir[PTDPTDI + i])); + + for (i = 0; i < nkpt + NPGPTD; i++) + PT_CLEAR_VA(&pmap->pm_pdir[PTDPTDI + i], FALSE); + + bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * + sizeof(*pmap->pm_pdir)); +#ifdef SMP + PT_CLEAR_VA(&pmap->pm_pdir[MPPTDI], FALSE); +#endif + PT_UPDATES_FLUSH(); + pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); + + vm_page_lock_queues(); + for (i = 0; i < NPGPTD; i++) { + m = ptdpg[i]; + + ma = xpmap_ptom(VM_PAGE_TO_PHYS(m)); + xpq_queue_unpin_table(ma); + pmap_zero_page(m); +#ifdef PAE + KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), + ("pmap_release: got wrong ptd page")); +#endif + m->wire_count--; + atomic_subtract_int(&cnt.v_wire_count, 1); + + vm_page_free_zero(m); + } + vm_page_unlock_queues(); + PMAP_LOCK_DESTROY(pmap); +} + +static int +kvm_size(SYSCTL_HANDLER_ARGS) +{ + unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; + + return 
sysctl_handle_long(oidp, &ksize, 0, req); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_size, "IU", "Size of KVM"); + +static int +kvm_free(SYSCTL_HANDLER_ARGS) +{ + unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; + + return sysctl_handle_long(oidp, &kfree, 0, req); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_free, "IU", "Amount of KVM free"); + +/* + * grow the number of kernel page table entries, if needed + */ +void +pmap_growkernel(vm_offset_t addr) +{ + struct pmap *pmap; + vm_paddr_t ptppaddr; + vm_page_t nkpg; + pd_entry_t newpdir; + pt_entry_t *pde; + + mtx_assert(&kernel_map->system_mtx, MA_OWNED); + if (kernel_vm_end == 0) { + kernel_vm_end = KERNBASE; + nkpt = 0; + while (pdir_pde(PTD, kernel_vm_end)) { + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + nkpt++; + } + } + addr = roundup2(addr, PAGE_SIZE * NPTEPG); + while (kernel_vm_end < addr) { + if (pdir_pde(PTD, kernel_vm_end)) { + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + continue; + } + + /* + * This index is bogus, but out of the way + */ + nkpg = vm_page_alloc(NULL, nkpt, + VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); + if (!nkpg) + panic("pmap_growkernel: no memory to grow kernel"); + + nkpt++; + + pmap_zero_page(nkpg); + ptppaddr = VM_PAGE_TO_PHYS(nkpg); + newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); + PT_SET_VA(&pdir_pde(PTD, kernel_vm_end), newpdir, TRUE); + + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(pmap, &allpmaps, pm_list) { + pde = pmap_pde(pmap, kernel_vm_end); + PT_SET_VA(pde, newpdir, FALSE); + } + PT_UPDATES_FLUSH(); + mtx_unlock_spin(&allpmaps_lock); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); + } +} + + +/*************************************************** + * page management routines. 
+ ***************************************************/ + +/* + * free the pv_entry back to the free list + */ +static PMAP_INLINE void +free_pv_entry(pv_entry_t pv) +{ + pv_entry_count--; + uma_zfree(pvzone, pv); +} + +/* + * get a new pv_entry, allocating a block from the system + * when needed. + * the memory allocation is performed bypassing the malloc code + * because of the possibility of allocations at interrupt time. + */ +static pv_entry_t +get_pv_entry(void) +{ + pv_entry_count++; + if (pv_entry_high_water && + (pv_entry_count > pv_entry_high_water) && + (pmap_pagedaemon_waken == 0)) { + pmap_pagedaemon_waken = 1; + wakeup (&vm_pages_needed); + } + return uma_zalloc(pvzone, M_NOWAIT); +} + + +static int +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + pv_entry_t pv; + int rtval; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if (m->md.pv_list_count < pmap->pm_stats.resident_count) { + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pmap == pv->pv_pmap && va == pv->pv_va) + break; + } + } else { + TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { + if (va == pv->pv_va) + break; + } + } + + rtval = 0; + if (pv) { + rtval = pmap_unuse_pt(pmap, va); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + m->md.pv_list_count--; + if (TAILQ_FIRST(&m->md.pv_list) == NULL) + vm_page_flag_clear(m, PG_WRITEABLE); + + TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); + free_pv_entry(pv); + } + + return rtval; +} + +/* + * Create a pv entry for page at pa for + * (pmap, va). 
+ */ +static void +pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + pv_entry_t pv; + pv = get_pv_entry(); + pv->pv_va = va; + pv->pv_pmap = pmap; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + m->md.pv_list_count++; +} + +/* + * pmap_remove_pte: do the things to unmap a page in a process + */ +static int +pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) +{ + pt_entry_t oldpte; + vm_page_t m; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpte = pte_load_clear(ptq); + if (oldpte & PG_W) + pmap->pm_stats.wired_count -= 1; + /* + * Machines that don't support invlpg, also don't support + * PG_G. + */ + if (oldpte & PG_G) + pmap_invalidate_page(kernel_pmap, va); + pmap->pm_stats.resident_count -= 1; + if (oldpte & PG_MANAGED) { + m = PHYS_TO_VM_PAGE(oldpte); + if (oldpte & PG_M) { +#if defined(PMAP_DIAGNOSTIC) + if (pmap_nw_modified((pt_entry_t) oldpte)) { + printf( + "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", + va, oldpte); + } +#endif + if (pmap_track_modified(va)) + vm_page_dirty(m); + } + if (oldpte & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + return pmap_remove_entry(pmap, m, va); + } else { + return pmap_unuse_pt(pmap, va); + } +} + +/* + * Remove a single page from a process address space + */ +static void +pmap_remove_page(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *pte; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) + return; + pmap_remove_pte(pmap, pte, va); + pmap_invalidate_page(pmap, va); +} + +/* + * Remove the given range of addresses from the specified map. + * + * It is assumed that the start and end are properly + * rounded to the page size. 
+ */ +void +pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t pdnxt; + pd_entry_t ptpaddr; + pt_entry_t *pte; + int anyvalid; + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = 0; + + vm_page_lock_queues(); + sched_pin(); + PMAP_LOCK(pmap); + + /* + * special handling of removing one page. a very + * common operation and easy to short circuit some + * code. + */ + if ((sva + PAGE_SIZE == eva) && + ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { + pmap_remove_page(pmap, sva); + goto out; + } + + for (; sva < eva; sva = pdnxt) { + unsigned pdirindex; + + /* + * Calculate index for next page table. + */ + pdnxt = (sva + NBPDR) & ~PDRMASK; + if (pmap->pm_stats.resident_count == 0) + break; + + pdirindex = sva >> PDRSHIFT; + ptpaddr = PT_GET(&pmap->pm_pdir[pdirindex]); + + /* + * Weed out invalid mappings. Note: we assume that the page + * directory table is always allocated, and in kernel virtual. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & PG_PS) != 0) { + PT_CLEAR_VA(pmap->pm_pdir[pdirindex], TRUE); + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + anyvalid = 1; + continue; + } + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being removed. + */ + if (pdnxt > eva) + pdnxt = eva; + + for (; sva != pdnxt; sva += PAGE_SIZE) { + if ((pte = pmap_pte_quick(pmap, sva)) == NULL || + *pte == 0) + continue; + anyvalid = 1; + if (pmap_remove_pte(pmap, pte, sva)) + break; + } + } +out: + sched_unpin(); + vm_page_unlock_queues(); + if (anyvalid) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +/* + * Routine: pmap_remove_all + * Function: + * Removes this physical page from + * all physical maps in which it resides. + * Reflects back modify bits to the pager. 
+ * + * Notes: + * Original versions of this routine were very + * inefficient because they iteratively called + * pmap_remove (slow...) + */ + +void +pmap_remove_all(vm_page_t m) +{ + pv_entry_t pv; + pt_entry_t *pte, tpte; + +#if defined(PMAP_DIAGNOSTIC) + /* + * XXX This makes pmap_remove_all() illegal for non-managed pages! + */ + if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { + panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", + VM_PAGE_TO_PHYS(m)); + } +#endif + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + sched_pin(); + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + PMAP_LOCK(pv->pv_pmap); + pv->pv_pmap->pm_stats.resident_count--; + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + tpte = pte_load_clear(pte); + if (tpte & PG_W) + pv->pv_pmap->pm_stats.wired_count--; + if (tpte & PG_A) + vm_page_flag_set(m, PG_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if (tpte & PG_M) { +#if defined(PMAP_DIAGNOSTIC) + if (pmap_nw_modified((pt_entry_t) tpte)) { + printf( + "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", + pv->pv_va, tpte); + } +#endif + if (pmap_track_modified(pv->pv_va)) + vm_page_dirty(m); + } + pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + m->md.pv_list_count--; + pmap_unuse_pt(pv->pv_pmap, pv->pv_va); + PMAP_UNLOCK(pv->pv_pmap); + free_pv_entry(pv); + } + vm_page_flag_clear(m, PG_WRITEABLE); + sched_unpin(); +} + +/* + * Set the physical protection on the + * specified range of this map as requested. 
+ */ +void +pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) +{ + vm_offset_t pdnxt; + pd_entry_t ptpaddr; + int anychanged; + + if ((prot & VM_PROT_READ) == VM_PROT_NONE) { + pmap_remove(pmap, sva, eva); + return; + } + + if (prot & VM_PROT_WRITE) + return; + + anychanged = 0; + + vm_page_lock_queues(); + sched_pin(); + PMAP_LOCK(pmap); + for (; sva < eva; sva = pdnxt) { + unsigned obits, pbits, pdirindex; + + pdnxt = (sva + NBPDR) & ~PDRMASK; + + pdirindex = sva >> PDRSHIFT; + ptpaddr = PT_GET(&pmap->pm_pdir[pdirindex]); + + /* + * Weed out invalid mappings. Note: we assume that the page + * directory table is always allocated, and in kernel virtual. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & PG_PS) != 0) { + pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); + pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; + anychanged = 1; + continue; + } + + if (pdnxt > eva) + pdnxt = eva; + + for (; sva != pdnxt; sva += PAGE_SIZE) { + pt_entry_t *pte; + vm_page_t m; + + if ((pte = pmap_pte_quick(pmap, sva)) == NULL) + continue; +#ifdef notyet +retry: +#endif + /* + * Regardless of whether a pte is 32 or 64 bits in + * size, PG_RW, PG_A, and PG_M are among the least + * significant 32 bits. 
+ */ + obits = pbits = PT_GET(pte); + if (pbits & PG_MANAGED) { + m = NULL; + if (pbits & PG_A) { + m = PHYS_TO_VM_PAGE(pbits); + vm_page_flag_set(m, PG_REFERENCED); + pbits &= ~PG_A; + } + if ((pbits & PG_M) != 0 && + pmap_track_modified(sva)) { + if (m == NULL) + m = PHYS_TO_VM_PAGE(pbits); + vm_page_dirty(m); + } + } + + pbits &= ~(PG_RW | PG_M); + + if (pbits != obits) { +#ifdef notyet + if (!atomic_cmpset_int((u_int *)pte, obits, + pbits)) + goto retry; +#endif + PT_SET_VA(pte, pbits, FALSE); + anychanged = 1; + } + } + } + sched_unpin(); + vm_page_unlock_queues(); + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +/* + * Insert the given physical page (p) at + * the specified virtual address (v) in the + * target physical map with the protection requested. + * + * If specified, the page will be wired down, meaning + * that the related pte can not be reclaimed. + * + * NB: This is the only routine which MAY NOT lazy-evaluate + * or lose information. That is, this routine must actually + * insert this page into the given map NOW. + */ +void +pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + boolean_t wired) +{ + vm_paddr_t pa; + register pt_entry_t *pte; + vm_paddr_t opa; + pt_entry_t origpte, newpte; + vm_page_t mpte, om; + + va &= PG_FRAME; +#ifdef PMAP_DIAGNOSTIC + if (va > VM_MAX_KERNEL_ADDRESS) + panic("pmap_enter: toobig"); + if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) + panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); +#endif + + mpte = NULL; + + vm_page_lock_queues(); + PMAP_LOCK(pmap); + sched_pin(); + + /* + * In the case that a page table page is not + * resident, we are creating it here. 
+ */ + if (va < VM_MAXUSER_ADDRESS) { + mpte = pmap_allocpte(pmap, va, M_WAITOK); + } +#if 0 && defined(PMAP_DIAGNOSTIC) + else { + pd_entry_t *pdeaddr = pmap_pde(pmap, va); + origpte = PT_GET(pdeaddr); + if ((origpte & PG_V) == 0) { + panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", + pmap->pm_pdir[PTDPTDI], origpte, va); + } + } +#endif + + pte = pmap_pte_quick(pmap, va); + + /* + * Page Directory table entry not valid, we need a new PT page + */ + if (pte == NULL) { + panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", + (uintmax_t)pmap->pm_pdir[PTDPTDI], va); + } + + pa = VM_PAGE_TO_PHYS(m); + om = NULL; + origpte = PT_GET(pte); + opa = origpte & PG_FRAME; + + if (origpte & PG_PS) { + /* + * Yes, I know this will truncate upper address bits for PAE, + * but I'm actually more interested in the lower bits + */ + printf("pmap_enter: va %p, pte %p, origpte %p\n", + (void *)va, (void *)pte, (void *)(uintptr_t)origpte); + panic("pmap_enter: attempted pmap_enter on 4MB page"); + } + + /* + * Mapping has not changed, must be protection or wiring change. + */ + if (origpte && (opa == pa)) { + /* + * Wiring change, just update stats. We don't worry about + * wiring PT pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is wired, + * the PT page will be also. + */ + if (wired && ((origpte & PG_W) == 0)) + pmap->pm_stats.wired_count++; + else if (!wired && (origpte & PG_W)) + pmap->pm_stats.wired_count--; + +#if defined(PMAP_DIAGNOSTIC) + if (pmap_nw_modified((pt_entry_t) origpte)) { + printf( + "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", + va, origpte); + } +#endif + + /* + * Remove extra pte reference + */ + if (mpte) + mpte->wire_count--; + + /* + * We might be turning off write access to the page, + * so we go ahead and sense modify status. 
+ */ + if (origpte & PG_MANAGED) { + om = m; + pa |= PG_MANAGED; + } + goto validate; + } + /* + * Mapping has changed, invalidate old range and fall through to + * handle validating new mapping. + */ + if (opa) { + int err; + if (origpte & PG_W) + pmap->pm_stats.wired_count--; + if (origpte & PG_MANAGED) { + om = PHYS_TO_VM_PAGE(opa); + err = pmap_remove_entry(pmap, om, va); + } else + err = pmap_unuse_pt(pmap, va); + if (err) + panic("pmap_enter: pte vanished, va: 0x%x", va); + } else + pmap->pm_stats.resident_count++; + + /* + * Enter on the PV list if part of our managed memory. Note that we + * raise IPL while manipulating pv_table since pmap_enter can be + * called at interrupt time. + */ + if (pmap_initialized && + (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { + pmap_insert_entry(pmap, va, m); + pa |= PG_MANAGED; + } + + /* + * Increment counters + */ + if (wired) + pmap->pm_stats.wired_count++; + +validate: + /* + * Now validate mapping with desired protection/wiring. + */ + newpte = (pt_entry_t)(pa | PG_V); + if ((prot & VM_PROT_WRITE) != 0) + newpte |= PG_RW; + if (wired) + newpte |= PG_W; + if (va < VM_MAXUSER_ADDRESS) + newpte |= PG_U; + if (pmap == kernel_pmap) + newpte |= pgeflag; + + /* + * if the mapping or permission bits are different, we need + * to update the pte. + */ + if ((origpte & ~(PG_M|PG_A)) != newpte) { + if (origpte & PG_MANAGED) { + origpte = PT_GET(pte); + PT_SET_VA(pte, newpte | PG_A, TRUE); + if ((origpte & PG_M) && pmap_track_modified(va)) + vm_page_dirty(om); + if (origpte & PG_A) + vm_page_flag_set(om, PG_REFERENCED); + } else + PT_SET_VA(pte, newpte | PG_A, TRUE); + if (origpte) { + pmap_invalidate_page(pmap, va); + } + } + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); +} + +/* + * this code makes some *MAJOR* assumptions: + * 1. Current pmap & pmap exists. + * 2. Not wired. + * 3. Read access. + * 4. No page table pages. + * 5. Tlbflush is deferred to calling procedure. + * 6. Page IS managed. 
+ * but is *MUCH* faster than pmap_enter... + */ + +vm_page_t +pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) +{ + pt_entry_t *pte; + vm_paddr_t pa; + + vm_page_lock_queues(); + PMAP_LOCK(pmap); + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + unsigned ptepindex; + pd_entry_t ptepa; + + /* + * Calculate pagetable page index + */ + ptepindex = va >> PDRSHIFT; + if (mpte && (mpte->pindex == ptepindex)) { + mpte->wire_count++; + } else { +retry: + /* + * Get the page directory entry + */ + ptepa = PT_GET(&pmap->pm_pdir[ptepindex]); + + /* + * If the page table page is mapped, we just increment + * the hold count, and activate it. + */ + if (ptepa) { + if (ptepa & PG_PS) + panic("pmap_enter_quick: unexpected mapping into 4MB page"); + mpte = PHYS_TO_VM_PAGE(ptepa); + mpte->wire_count++; + } else { + mpte = _pmap_allocpte(pmap, ptepindex, + M_WAITOK); + if (mpte == NULL) + goto retry; + } + } + } else { + mpte = NULL; + } + + /* + * This call to vtopte makes the assumption that we are + * entering the page into the current pmap. In order to support + * quick entry into any pmap, one would likely use pmap_pte_quick. + * But that isn't as quick as vtopte. + */ + pte = vtopte(va); + if (PT_GET(pte)) { + if (mpte != NULL) { + pmap_unwire_pte_hold(pmap, mpte); + mpte = NULL; + } + goto out; + } + + /* + * Enter on the PV list if part of our managed memory. Note that we + * raise IPL while manipulating pv_table since pmap_enter can be + * called at interrupt time. 
+ */ + if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) + pmap_insert_entry(pmap, va, m); + + /* + * Increment counters + */ + pmap->pm_stats.resident_count++; + + pa = VM_PAGE_TO_PHYS(m); + + /* + * Now validate mapping with RO protection + */ + if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) + PT_SET(va, pa | PG_V | PG_U, TRUE); + else + PT_SET(va, pa | PG_V | PG_U | PG_MANAGED, TRUE); +out: + vm_page_unlock_queues(); + PMAP_UNLOCK(pmap); + return mpte; +} + +/* + * Make a temporary mapping for a physical address. This is only intended + * to be used for panic dumps. + */ +void * +pmap_kenter_temporary(vm_paddr_t pa, int i) +{ + vm_offset_t va; + + va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); + pmap_kenter(va, pa); +#ifndef I386_CPU + invlpg(va); +#else + invltlb(); +#endif + return ((void *)crashdumpmap); +} + +/* + * This code maps large physical mmap regions into the + * processor address space. Note that some shortcuts + * are taken, but the code works. + */ +void +pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, + vm_object_t object, vm_pindex_t pindex, + vm_size_t size) +{ + vm_page_t p; + + VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); + KASSERT(object->type == OBJT_DEVICE, + ("pmap_object_init_pt: non-device object")); + if (pseflag && + ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { + int i; + vm_page_t m[1]; + unsigned int ptepindex; + int npdes; + pd_entry_t ptepa; + + PMAP_LOCK(pmap); + if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) + goto out; + PMAP_UNLOCK(pmap); +retry: + p = vm_page_lookup(object, pindex); + if (p != NULL) { + vm_page_lock_queues(); + if (vm_page_sleep_if_busy(p, FALSE, "init4p")) + goto retry; + } else { + p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); + if (p == NULL) + return; + m[0] = p; + + if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + return; + } + + p = vm_page_lookup(object, pindex); + vm_page_lock_queues(); + 
vm_page_wakeup(p); + } + vm_page_unlock_queues(); + + ptepa = VM_PAGE_TO_PHYS(p); + if (ptepa & (NBPDR - 1)) + return; + + p->valid = VM_PAGE_BITS_ALL; + + PMAP_LOCK(pmap); + pmap->pm_stats.resident_count += size >> PAGE_SHIFT; + npdes = size >> PDRSHIFT; + for(i = 0; i < npdes; i++) { + PT_SET_VA(&pmap->pm_pdir[ptepindex], + ptepa | PG_U | PG_RW | PG_V | PG_PS, FALSE); + ptepa += NBPDR; + ptepindex += 1; + } + pmap_invalidate_all(pmap); +out: + PMAP_UNLOCK(pmap); + } +} + +void +pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len) +{ + int i, npages = round_page(len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + pt_entry_t *pte; + pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); + PT_SET_MA(va + i*PAGE_SIZE, *pte & ~(PG_RW|PG_M), FALSE); + PMAP_MARK_PRIV(xpmap_mtop(*pte)); + pmap_pte_release(pte); + } + PT_UPDATES_FLUSH(); +} + +void +pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len) +{ + int i, npages = round_page(len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + pt_entry_t *pte; + pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); + PMAP_MARK_UNPRIV(xpmap_mtop(*pte)); + PT_SET_MA(va + i*PAGE_SIZE, *pte | (PG_RW|PG_M), FALSE); + pmap_pte_release(pte); + } + PT_UPDATES_FLUSH(); +} + +/* + * Routine: pmap_change_wiring + * Function: Change the wiring attribute for a map/virtual-address + * pair. + * In/out conditions: + * The mapping must already exist in the pmap. + */ +void +pmap_change_wiring(pmap, va, wired) + register pmap_t pmap; + vm_offset_t va; + boolean_t wired; +{ + register pt_entry_t *pte; + + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, va); + + if (wired && !pmap_pte_w(pte)) + pmap->pm_stats.wired_count++; + else if (!wired && pmap_pte_w(pte)) + pmap->pm_stats.wired_count--; + + /* + * Wiring is not a hardware characteristic so there is no need to + * invalidate TLB. 
+ */ + pmap_pte_set_w(pte, wired); + pmap_pte_release(pte); + PMAP_UNLOCK(pmap); +} + + + +/* + * Copy the range specified by src_addr/len + * from the source map to the range dst_addr/len + * in the destination map. + * + * This routine is only advisory and need not do anything. + */ + +void +pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, + vm_offset_t src_addr) +{ + vm_offset_t addr; + vm_offset_t end_addr = src_addr + len; + vm_offset_t pdnxt; + vm_page_t m; + + if (dst_addr != src_addr) + return; + + if (!pmap_is_current(src_pmap)) + return; + + vm_page_lock_queues(); + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + sched_pin(); + for (addr = src_addr; addr < end_addr; addr = pdnxt) { + pt_entry_t *src_pte, *dst_pte; + vm_page_t dstmpte, srcmpte; + pd_entry_t srcptepaddr; + unsigned ptepindex; + + if (addr >= UPT_MIN_ADDRESS) + panic("pmap_copy: invalid to pmap_copy page tables"); + + /* + * Don't let optional prefaulting of pages make us go + * way below the low water mark of free pages or way + * above high water mark of used pv entries. 
+ */ + if (cnt.v_free_count < cnt.v_free_reserved || + pv_entry_count > pv_entry_high_water) + break; + + pdnxt = (addr + NBPDR) & ~PDRMASK; + ptepindex = addr >> PDRSHIFT; + + srcptepaddr = PT_GET(&src_pmap->pm_pdir[ptepindex]); + if (srcptepaddr == 0) + continue; + + if (srcptepaddr & PG_PS) { + if (dst_pmap->pm_pdir[ptepindex] == 0) { + PT_SET_VA(&dst_pmap->pm_pdir[ptepindex], srcptepaddr, TRUE); + dst_pmap->pm_stats.resident_count += + NBPDR / PAGE_SIZE; + } + continue; + } + + srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); + if (srcmpte->wire_count == 0) + panic("pmap_copy: source page table page is unused"); + + if (pdnxt > end_addr) + pdnxt = end_addr; + + src_pte = vtopte(addr); + while (addr < pdnxt) { + pt_entry_t ptetemp; + ptetemp = PT_GET(src_pte); + /* + * we only virtual copy managed pages + */ + if ((ptetemp & PG_MANAGED) != 0) { + /* + * We have to check after allocpte for the + * pte still being around... allocpte can + * block. + */ + dstmpte = pmap_allocpte(dst_pmap, addr, + M_NOWAIT); + if (dstmpte == NULL) + break; + dst_pte = pmap_pte_quick(dst_pmap, addr); + if (*dst_pte == 0) { + /* + * Clear the modified and + * accessed (referenced) bits + * during the copy. 
+ */ + m = PHYS_TO_VM_PAGE(ptetemp); + PT_SET_VA(dst_pte, ptetemp & ~(PG_M | PG_A), FALSE); + dst_pmap->pm_stats.resident_count++; + pmap_insert_entry(dst_pmap, addr, m); + } else + pmap_unwire_pte_hold(dst_pmap, dstmpte); + if (dstmpte->wire_count >= srcmpte->wire_count) + break; + } + addr += PAGE_SIZE; + src_pte++; + } + } + PT_UPDATES_FLUSH(); + sched_unpin(); + vm_page_unlock_queues(); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); +} + +static __inline void +pagezero(void *page) +{ +#if defined(I686_CPU) + if (cpu_class == CPUCLASS_686) { +#if defined(CPU_ENABLE_SSE) + if (cpu_feature & CPUID_SSE2) + sse2_pagezero(page); + else +#endif + i686_pagezero(page); + } else +#endif + bzero(page, PAGE_SIZE); +} + +/* + * pmap_zero_page zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. + */ +void +pmap_zero_page(vm_page_t m) +{ + + mtx_lock(&CMAPCADDR12_lock); + if (*CMAP2) + panic("pmap_zero_page: CMAP2 busy"); + sched_pin(); + PT_SET_VA(CMAP2, PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M, FALSE); + invlcaddr(CADDR2); + pagezero(CADDR2); + PT_CLEAR_VA(CMAP2, TRUE); + sched_unpin(); + mtx_unlock(&CMAPCADDR12_lock); +} + +/* + * pmap_zero_page_area zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. + * + * off and size may not cover an area beyond a single hardware page. + */ +void +pmap_zero_page_area(vm_page_t m, int off, int size) +{ + + mtx_lock(&CMAPCADDR12_lock); + if (*CMAP2) + panic("pmap_zero_page: CMAP2 busy"); + sched_pin(); + PT_SET_VA(CMAP2, PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M, FALSE); + invlcaddr(CADDR2); + if (off == 0 && size == PAGE_SIZE) + pagezero(CADDR2); + else + bzero((char *)CADDR2 + off, size); + PT_CLEAR_VA(CMAP2, TRUE); + sched_unpin(); + mtx_unlock(&CMAPCADDR12_lock); +} + +/* + * pmap_zero_page_idle zeros the specified hardware page by mapping + * the page into KVM and using bzero to clear its contents. 
This + * is intended to be called from the vm_pagezero process only and + * outside of Giant. + */ +void +pmap_zero_page_idle(vm_page_t m) +{ + + if (*CMAP3) + panic("pmap_zero_page: CMAP3 busy"); + sched_pin(); + PT_SET_VA(CMAP3, PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M, TRUE); + invlcaddr(CADDR3); + pagezero(CADDR3); + PT_CLEAR_VA(CMAP3, TRUE); + sched_unpin(); +} + +/* + * pmap_copy_page copies the specified (machine independent) + * page by mapping the page into virtual memory and using + * bcopy to copy the page, one machine dependent page at a + * time. + */ +void +pmap_copy_page(vm_page_t src, vm_page_t dst) +{ + + mtx_lock(&CMAPCADDR12_lock); + if (*CMAP1) + panic("pmap_copy_page: CMAP1 busy"); + if (*CMAP2) + panic("pmap_copy_page: CMAP2 busy"); + sched_pin(); +#ifdef I386_CPU + invltlb(); +#else + invlpg((u_int)CADDR1); + invlpg((u_int)CADDR2); +#endif + PT_SET_VA(CMAP1, PG_V | VM_PAGE_TO_PHYS(src) | PG_A, FALSE); + PT_SET_VA(CMAP2, PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M, TRUE); + + bcopy(CADDR1, CADDR2, PAGE_SIZE); + PT_CLEAR_VA(CMAP1, FALSE); + PT_CLEAR_VA(CMAP2, TRUE); + sched_unpin(); + mtx_unlock(&CMAPCADDR12_lock); +} + +/* + * Returns true if the pmap's pv is one of the first + * 16 pvs linked to from this page. This count may + * be changed upwards or downwards in the future; it + * is only necessary that true be returned for a small + * subset of pmaps for proper page aging. + */ +boolean_t +pmap_page_exists_quick(pmap, m) + pmap_t pmap; + vm_page_t m; +{ + pv_entry_t pv; + int loops = 0; + + if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) + return FALSE; + + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pv->pv_pmap == pmap) { + return TRUE; + } + loops++; + if (loops >= 16) + break; + } + return (FALSE); +} + +#define PMAP_REMOVE_PAGES_CURPROC_ONLY +/* + * Remove all pages from specified address space + * this aids process exit speeds. 
Also, this code + * is special cased for current process only, but + * can have the more generic (and slightly slower) + * mode enabled. This is much faster than pmap_remove + * in the case of running down an entire address space. + */ +void +pmap_remove_pages(pmap, sva, eva) + pmap_t pmap; + vm_offset_t sva, eva; +{ + pt_entry_t *pte, tpte; + vm_page_t m; + pv_entry_t pv, npv; + +#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY + if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { + printf("warning: pmap_remove_pages called with non-current pmap\n"); + return; + } +#endif + vm_page_lock_queues(); + PMAP_LOCK(pmap); + sched_pin(); + + for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { + if (pv->pv_va >= eva || pv->pv_va < sva) { + npv = TAILQ_NEXT(pv, pv_plist); + continue; + } + +#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY + pte = vtopte(pv->pv_va); +#else + pte = pmap_pte_quick(pmap, pv->pv_va); +#endif + tpte = PT_GET(pte); + + if (tpte == 0) { + printf("TPTE at %p IS ZERO @ VA %08x\n", + pte, pv->pv_va); + panic("bad pte"); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & PG_W) { + npv = TAILQ_NEXT(pv, pv_plist); + continue; + } + + m = PHYS_TO_VM_PAGE(tpte); + KASSERT(m->phys_addr == (tpte & PG_FRAME), + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); + + KASSERT(m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); + + pmap->pm_stats.resident_count--; + + pte_clear(pte); + + /* + * Update the vm_page_t clean and reference bits. 
+ */ + if (tpte & PG_M) { + vm_page_dirty(m); + } + + npv = TAILQ_NEXT(pv, pv_plist); + TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); + + m->md.pv_list_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + + pmap_unuse_pt(pmap, pv->pv_va); + free_pv_entry(pv); + } + sched_unpin(); + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); +} + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. + */ +boolean_t +pmap_is_modified(vm_page_t m) +{ + pv_entry_t pv; + pt_entry_t *pte; + boolean_t rv; + + rv = FALSE; + if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) + return (rv); + + sched_pin(); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + /* + * if the bit being tested is the modified bit, then + * mark clean_map and ptes as never + * modified. + */ + if (!pmap_track_modified(pv->pv_va)) + continue; +#if defined(PMAP_DIAGNOSTIC) + if (!pv->pv_pmap) { + printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); + continue; + } +#endif + PMAP_LOCK(pv->pv_pmap); + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + rv = (*pte & PG_M) != 0; + PMAP_UNLOCK(pv->pv_pmap); + if (rv) + break; + } + sched_unpin(); + return (rv); +} + +/* + * pmap_is_prefaultable: + * + * Return whether or not the specified virtual address is elgible + * for prefault. + */ +boolean_t +pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) +{ + pt_entry_t *pte; + boolean_t rv; + + rv = FALSE; + + return (rv); + PMAP_LOCK(pmap); + if (pmap_pde(pmap, addr)) { + pte = vtopte(addr); + rv = *pte == 0; + } + PMAP_UNLOCK(pmap); + return (rv); +} + +/* + * Clear the given bit in each of the given page's ptes. The bit is + * expressed as a 32-bit mask. Consequently, if the pte is 64 bits in + * size, only a bit within the least significant 32 can be cleared. 
+ */ +static __inline void +pmap_clear_ptes(vm_page_t m, int bit) +{ + register pv_entry_t pv; + pt_entry_t pbits, *pte; + + if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || + (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) + return; + + sched_pin(); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + /* + * Loop over all current mappings setting/clearing as appropos If + * setting RO do we need to clear the VAC? + */ + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + /* + * don't write protect pager mappings + */ + if (bit == PG_RW) { + if (!pmap_track_modified(pv->pv_va)) + continue; + } + +#if defined(PMAP_DIAGNOSTIC) + if (!pv->pv_pmap) { + printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); + continue; + } +#endif + + PMAP_LOCK(pv->pv_pmap); + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); +#ifdef notyet +retry: +#endif + pbits = PT_GET(pte); + if (pbits & bit) { + if (bit == PG_RW) { + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_RW and PG_M are among the least + * significant 32 bits. + */ +#ifdef notyet + if (!atomic_cmpset_int((u_int *)pte, pbits, + pbits & ~(PG_RW | PG_M))) + goto retry; +#endif + PT_SET_VA(pte, pbits & ~(PG_M|PG_RW), TRUE); + + + if (pbits & PG_M) { + vm_page_dirty(m); + } + } else { +#ifdef notyet + atomic_clear_int((u_int *)pte, bit); +#endif + /* XXX */ + PT_SET_VA(pte, pbits & ~bit, TRUE); + } + pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + } + PMAP_UNLOCK(pv->pv_pmap); + } + if (bit == PG_RW) + vm_page_flag_clear(m, PG_WRITEABLE); + sched_unpin(); +} + +/* + * pmap_page_protect: + * + * Lower the permission for all mappings to a given page. + */ +void +pmap_page_protect(vm_page_t m, vm_prot_t prot) +{ + if ((prot & VM_PROT_WRITE) == 0) { + if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { + pmap_clear_ptes(m, PG_RW); + } else { + pmap_remove_all(m); + } + } +} + +/* + * pmap_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. 
+ * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * XXX: The exact number of bits to check and clear is a matter that + * should be tested and standardized at some point in the future for + * optimal aging of shared pages. + */ +int +pmap_ts_referenced(vm_page_t m) +{ + register pv_entry_t pv, pvf, pvn; + pt_entry_t *pte; + pt_entry_t v; + int rtval = 0; + + if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) + return (rtval); + + sched_pin(); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + + pvf = pv; + + do { + pvn = TAILQ_NEXT(pv, pv_list); + + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + + if (!pmap_track_modified(pv->pv_va)) + continue; + + PMAP_LOCK(pv->pv_pmap); + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + + if (pte && ((v = PT_GET(pte)) & PG_A) != 0) { +#ifdef notyet + atomic_clear_int((u_int *)pte, PG_A); +#endif + PT_SET_VA(pte, v & ~PG_A, FALSE); + pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + + rtval++; + if (rtval > 4) { + PMAP_UNLOCK(pv->pv_pmap); + break; + } + } + PMAP_UNLOCK(pv->pv_pmap); + } while ((pv = pvn) != NULL && pv != pvf); + } + sched_unpin(); + + return (rtval); +} + +/* + * Clear the modify bits on the specified physical page. + */ +void +pmap_clear_modify(vm_page_t m) +{ + pmap_clear_ptes(m, PG_M); +} + +/* + * pmap_clear_reference: + * + * Clear the reference bit on the specified physical page. + */ +void +pmap_clear_reference(vm_page_t m) +{ + pmap_clear_ptes(m, PG_A); +} + +/* + * Miscellaneous support routines follow + */ + +/* + * Map a set of physical memory pages into the kernel virtual + * address space. Return a pointer to where it is mapped. This + * routine is intended to be used for mapping device memory, + * NOT real memory. 
+ */ +void * +pmap_mapdev(pa, size) + vm_paddr_t pa; + vm_size_t size; +{ + vm_offset_t va, tmpva, offset; + + offset = pa & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + pa = pa & PG_FRAME; + + if (pa < KERNLOAD && pa + size <= KERNLOAD) + va = KERNBASE + pa; + else + va = kmem_alloc_nofault(kernel_map, size); + if (!va) + panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); + + for (tmpva = va; size > 0; ) { + PT_SET(tmpva, pa | PG_RW | PG_V | pgeflag, FALSE); + size -= PAGE_SIZE; + tmpva += PAGE_SIZE; + pa += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, va, tmpva); + return ((void *)(va + offset)); +} + +void +pmap_unmapdev(va, size) + vm_offset_t va; + vm_size_t size; +{ + vm_offset_t base, offset, tmpva; + panic("unused"); + if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) + return; + base = va & PG_FRAME; + offset = va & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) + PT_CLEAR(tmpva, FALSE); + pmap_invalidate_range(kernel_pmap, va, tmpva); + kmem_free(kernel_map, base, size); +} + +/* + * perform the pmap work for mincore + */ +int +pmap_mincore(pmap, addr) + pmap_t pmap; + vm_offset_t addr; +{ + pt_entry_t *ptep, pte; + vm_page_t m; + int val = 0; + + PMAP_LOCK(pmap); + ptep = pmap_pte(pmap, addr); + pte = (ptep != NULL) ? 
PT_GET(ptep) : 0; + pmap_pte_release(ptep); + PMAP_UNLOCK(pmap); + + if (pte != 0) { + vm_paddr_t pa; + + val = MINCORE_INCORE; + if ((pte & PG_MANAGED) == 0) + return val; + + pa = pte & PG_FRAME; + + m = PHYS_TO_VM_PAGE(pa); + + /* + * Modified by us + */ + if (pte & PG_M) + val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; + else { + /* + * Modified by someone else + */ + vm_page_lock_queues(); + if (m->dirty || pmap_is_modified(m)) + val |= MINCORE_MODIFIED_OTHER; + vm_page_unlock_queues(); + } + /* + * Referenced by us + */ + if (pte & PG_A) + val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; + else { + /* + * Referenced by someone else + */ + vm_page_lock_queues(); + if ((m->flags & PG_REFERENCED) || + pmap_ts_referenced(m)) { + val |= MINCORE_REFERENCED_OTHER; + vm_page_flag_set(m, PG_REFERENCED); + } + vm_page_unlock_queues(); + } + } + return val; +} + +void +pmap_activate(struct thread *td) +{ + struct proc *p = td->td_proc; + pmap_t pmap, oldpmap; + u_int32_t cr3; + + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + oldpmap = PCPU_GET(curpmap); +#if defined(SMP) + atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); +#else + oldpmap->pm_active &= ~1; + pmap->pm_active |= 1; +#endif +#ifdef PAE + cr3 = vtophys(pmap->pm_pdpt); +#else + cr3 = vtophys(pmap->pm_pdir); +#endif + /* XXXKSE this is wrong. + * pmap_activate is for the current thread on the current cpu + */ + if (p->p_flag & P_SA) { + /* Make sure all other cr3 entries are updated. */ + /* what if they are running? 
XXXKSE (maybe abort them) */ + FOREACH_THREAD_IN_PROC(p, td) { + td->td_pcb->pcb_cr3 = cr3; + } + } else { + td->td_pcb->pcb_cr3 = cr3; + } + load_cr3(cr3); + PCPU_SET(curpmap, pmap); + critical_exit(); +} + +vm_offset_t +pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) +{ + + if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { + return addr; + } + + addr = (addr + PDRMASK) & ~PDRMASK; + return addr; +} + + +#if defined(PMAP_DEBUG) +extern int init_first; +void +pmap_ref(pt_entry_t *pte, unsigned long ma) +{ + int ind, i, count; + unsigned long ebp_prev, eip_prev, oma = 0; + unsigned long pa = xpmap_mtop(ma); + + /* are we to the point where mappings are set up? */ + if (!init_first) + return; + + ind = pa >> PAGE_SHIFT; + /* privileged? */ + if ((pa & PG_RW) && pteinfo_list[ind].pt_ref & (1 << 31)) + BKPT; + + /* is MA already mapped ? */ + oma = *pte; + + /* old reference being lost */ + if (oma && (oma & PG_RW) && ((oma & PG_FRAME) != (ma & PG_FRAME))) + pmap_dec_ref(oma); + + /* ignore RO mappings - unless were downgrading */ + if (!(ma & PG_RW)) { + /* downgrading mapping - lose reference */ + if (((oma & PG_FRAME) == (ma & PG_FRAME)) && + (oma & PG_RW)) + pmap_dec_ref(ma); + return; + } + + if (pteinfo_list[ind].pt_ref < 0) + BKPT; + + + /* same address and not upgrading the mapping */ + if (((oma & PG_FRAME) == (ma & PG_FRAME)) && + (oma & PG_RW)) + return; + + count = pteinfo_list[ind].pt_ref; + __asm__("movl %%ebp, %0" : "=r" (ebp_prev)); + for (i = 0; i < XPQ_CALL_DEPTH && ebp_prev > KERNBASE; i++) { + __asm__("movl 4(%1), %0" : "=r" (eip_prev) : "r" (ebp_prev)); + pteinfo_list[ind].pt_eip[count%XPQ_CALL_COUNT][i] = eip_prev; + __asm__("movl (%1), %0" : "=r" (ebp_prev) : "r" (ebp_prev)); + } + + pteinfo_list[ind].pt_ref++; + +} + +void +pmap_dec_ref(unsigned long ma) +{ + unsigned long pa; + int ind, count; + + if (!ma) BKPT; + + pa = xpmap_mtop(ma); + + ind = pa >> PAGE_SHIFT; + if (pteinfo_list[ind].pt_ref & (1 << 31)) 
BKPT; + + count = pteinfo_list[ind].pt_ref & ~(1 << 31); + if (count < 1) { + printk("ma: %lx has ref count of 0\n", ma); + BKPT; + } + pteinfo_list[ind].pt_ref = (--count | (pteinfo_list[ind].pt_ref & (1 << 31))); + +} + +void +pmap_dec_ref_page(vm_page_t m) +{ + unsigned long *pt; + int i; + mtx_lock(&CMAPCADDR12_lock); + if (*CMAP2) + panic("pmap_zero_page: CMAP2 busy"); + sched_pin(); + PT_SET_VA(CMAP2, PG_V | VM_PAGE_TO_PHYS(m) | PG_A | PG_M, FALSE); + invlcaddr(CADDR2); + pt = (unsigned long *)CADDR2; + for (i = 0; i < 1024; i++) + if (pt[i] & PG_RW) + pmap_dec_ref(xpmap_ptom(pt[i])); + PT_CLEAR_VA(CMAP2, TRUE); + sched_unpin(); + mtx_unlock(&CMAPCADDR12_lock); +} + +void +pmap_mark_privileged(unsigned long pa) +{ + int ind = pa >> PAGE_SHIFT; + + if (pteinfo_list[ind].pt_ref & (1 << 31)) BKPT; + if ((pteinfo_list[ind].pt_ref & ~(1 << 31)) > 0) BKPT; + + pteinfo_list[ind].pt_ref |= (1 << 31); + +} + +void +pmap_mark_unprivileged(unsigned long pa) +{ + int ind = pa >> PAGE_SHIFT; + + if (pteinfo_list[ind].pt_ref != (1 << 31)) BKPT; + + pteinfo_list[ind].pt_ref &= ~(1 << 31); + +} + + +int +pmap_pid_dump(int pid) +{ + pmap_t pmap; + struct proc *p; + int npte = 0; + int index; + + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + if (p->p_pid != pid) + continue; + + if (p->p_vmspace) { + int i,j; + index = 0; + pmap = vmspace_pmap(p->p_vmspace); + for (i = 0; i < NPDEPTD; i++) { + pd_entry_t *pde; + pt_entry_t *pte; + vm_offset_t base = i << PDRSHIFT; + + pde = &pmap->pm_pdir[i]; + if (pde && pmap_pde_v(pde)) { + for (j = 0; j < NPTEPG; j++) { + vm_offset_t va = base + (j << PAGE_SHIFT); + if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { + if (index) { + index = 0; + printf("\n"); + } + sx_sunlock(&allproc_lock); + return npte; + } + pte = pmap_pte(pmap, va); + if (pte && pmap_pte_v(pte)) { + pt_entry_t pa; + vm_page_t m; + pa = PT_GET(pte); + m = PHYS_TO_VM_PAGE(pa); + printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", + va, pa, m->hold_count, 
m->wire_count, m->flags); + npte++; + index++; + if (index >= 2) { + index = 0; + printf("\n"); + } else { + printf(" "); + } + } + } + } + } + } + } + sx_sunlock(&allproc_lock); + return npte; +} +#endif /* PMAP_DEBUG */ + +#if defined(DEBUG) + +static void pads(pmap_t pm); +void pmap_pvdump(vm_offset_t pa); + +/* print address space of pmap*/ +static void +pads(pm) + pmap_t pm; +{ + int i, j; + vm_paddr_t va; + pt_entry_t *ptep; + + if (pm == kernel_pmap) + return; + for (i = 0; i < NPDEPTD; i++) + if (pm->pm_pdir[i]) + for (j = 0; j < NPTEPG; j++) { + va = (i << PDRSHIFT) + (j << PAGE_SHIFT); + if (pm == kernel_pmap && va < KERNBASE) + continue; + if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) + continue; + ptep = pmap_pte(pm, va); + if (pmap_pte_v(ptep)) + printf("%x:%x ", va, *ptep); + }; + +} + +void +pmap_pvdump(pa) + vm_paddr_t pa; +{ + pv_entry_t pv; + vm_page_t m; + + printf("pa %x", pa); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); + pads(pv->pv_pmap); + } + printf(" "); +} +#endif diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/support.s b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/support.s new file mode 100644 index 0000000000..deb4a94859 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/support.s @@ -0,0 +1,1553 @@ +/*- + * Copyright (c) 1993 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: src/sys/i386/i386/support.s,v 1.100 2003/11/03 21:28:54 jhb Exp $ + */ + +#include "opt_npx.h" + +#include <machine/asmacros.h> +#include <machine/cputypes.h> +#include <machine/intr_machdep.h> +#include <machine/pmap.h> +#include <machine/specialreg.h> + +#include "assym.s" + +#define IDXSHIFT 10 + + .data + .globl bcopy_vector +bcopy_vector: + .long generic_bcopy + .globl bzero_vector +bzero_vector: + .long generic_bzero + .globl copyin_vector +copyin_vector: + .long generic_copyin + .globl copyout_vector +copyout_vector: + .long generic_copyout +#if defined(I586_CPU) && defined(DEV_NPX) +kernel_fpu_lock: + .byte 0xfe + .space 3 +#endif + ALIGN_DATA + .globl intrcnt, eintrcnt +intrcnt: + .space INTRCNT_COUNT * 4 +eintrcnt: + + .globl intrnames, eintrnames +intrnames: + .space INTRCNT_COUNT * (MAXCOMLEN + 1) +eintrnames: + + .text + +/* + * bcopy family + * void bzero(void *buf, u_int len) + */ + +ENTRY(bzero) + MEXITCOUNT + jmp *bzero_vector + +ENTRY(generic_bzero) + pushl %edi + movl 8(%esp),%edi + movl 12(%esp),%ecx + xorl %eax,%eax + shrl $2,%ecx + cld + rep + stosl + movl 12(%esp),%ecx + andl $3,%ecx + rep + stosb + popl %edi + ret + +#ifdef I486_CPU +ENTRY(i486_bzero) + movl 4(%esp),%edx + movl 8(%esp),%ecx + xorl %eax,%eax +/* + * do 64 byte chunks first + * + * XXX this is probably over-unrolled at least for DX2's + */ +2: + cmpl $64,%ecx + jb 3f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + movl %eax,16(%edx) + movl %eax,20(%edx) + movl %eax,24(%edx) + movl %eax,28(%edx) + movl %eax,32(%edx) + movl %eax,36(%edx) + movl %eax,40(%edx) + movl %eax,44(%edx) + movl %eax,48(%edx) + movl %eax,52(%edx) + movl %eax,56(%edx) + movl %eax,60(%edx) + addl $64,%edx + subl $64,%ecx + jnz 2b + ret + +/* + * do 16 byte chunks + */ + SUPERALIGN_TEXT +3: + cmpl $16,%ecx + jb 4f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + addl $16,%edx + subl $16,%ecx + jnz 3b + ret + +/* + * do 4 
byte chunks + */ + SUPERALIGN_TEXT +4: + cmpl $4,%ecx + jb 5f + movl %eax,(%edx) + addl $4,%edx + subl $4,%ecx + jnz 4b + ret + +/* + * do 1 byte chunks + * a jump table seems to be faster than a loop or more range reductions + * + * XXX need a const section for non-text + */ + .data +jtab: + .long do0 + .long do1 + .long do2 + .long do3 + + .text + SUPERALIGN_TEXT +5: + jmp *jtab(,%ecx,4) + + SUPERALIGN_TEXT +do3: + movw %ax,(%edx) + movb %al,2(%edx) + ret + + SUPERALIGN_TEXT +do2: + movw %ax,(%edx) + ret + + SUPERALIGN_TEXT +do1: + movb %al,(%edx) + ret + + SUPERALIGN_TEXT +do0: + ret +#endif + +#if defined(I586_CPU) && defined(DEV_NPX) +ENTRY(i586_bzero) + movl 4(%esp),%edx + movl 8(%esp),%ecx + + /* + * The FPU register method is twice as fast as the integer register + * method unless the target is in the L1 cache and we pre-allocate a + * cache line for it (then the integer register method is 4-5 times + * faster). However, we never pre-allocate cache lines, since that + * would make the integer method 25% or more slower for the common + * case when the target isn't in either the L1 cache or the L2 cache. + * Thus we normally use the FPU register method unless the overhead + * would be too large. + */ + cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ + jb intreg_i586_bzero + + /* + * The FPU registers may belong to an application or to fastmove() + * or to another invocation of bcopy() or ourself in a higher level + * interrupt or trap handler. Preserving the registers is + * complicated since we avoid it if possible at all levels. We + * want to localize the complications even when that increases them. + * Here the extra work involves preserving CR0_TS in TS. + * `fpcurthread != NULL' is supposed to be the condition that all the + * FPU resources belong to an application, but fpcurthread and CR0_TS + * aren't set atomically enough for this condition to work in + * interrupt handlers. 
+ * + * Case 1: FPU registers belong to the application: we must preserve + * the registers if we use them, so we only use the FPU register + * method if the target size is large enough to amortize the extra + * overhead for preserving them. CR0_TS must be preserved although + * it is very likely to end up as set. + * + * Case 2: FPU registers belong to fastmove(): fastmove() currently + * makes the registers look like they belong to an application so + * that cpu_switch() and savectx() don't have to know about it, so + * this case reduces to case 1. + * + * Case 3: FPU registers belong to the kernel: don't use the FPU + * register method. This case is unlikely, and supporting it would + * be more complicated and might take too much stack. + * + * Case 4: FPU registers don't belong to anyone: the FPU registers + * don't need to be preserved, so we always use the FPU register + * method. CR0_TS must be preserved although it is very likely to + * always end up as clear. + */ + cmpl $0,PCPU(FPCURTHREAD) + je i586_bz1 + + /* + * XXX don't use the FPU for cases 1 and 2, since preemptive + * scheduling of ithreads broke these cases. Note that we can + * no longer get here from an interrupt handler, since the + * context sitch to the interrupt handler will have saved the + * FPU state. + */ + jmp intreg_i586_bzero + + cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ + jb intreg_i586_bzero + sarb $1,kernel_fpu_lock + jc intreg_i586_bzero + smsw %ax + clts + subl $108,%esp + fnsave 0(%esp) + jmp i586_bz2 + +i586_bz1: + sarb $1,kernel_fpu_lock + jc intreg_i586_bzero + smsw %ax + clts + fninit /* XXX should avoid needing this */ +i586_bz2: + fldz + + /* + * Align to an 8 byte boundary (misalignment in the main loop would + * cost a factor of >= 2). Avoid jumps (at little cost if it is + * already aligned) by always zeroing 8 bytes and using the part up + * to the _next_ alignment position. 
+ */ + fstl 0(%edx) + addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ + addl $8,%edx + andl $~7,%edx + subl %edx,%ecx + + /* + * Similarly align `len' to a multiple of 8. + */ + fstl -8(%edx,%ecx) + decl %ecx + andl $~7,%ecx + + /* + * This wouldn't be any faster if it were unrolled, since the loop + * control instructions are much faster than the fstl and/or done + * in parallel with it so their overhead is insignificant. + */ +fpureg_i586_bzero_loop: + fstl 0(%edx) + addl $8,%edx + subl $8,%ecx + cmpl $8,%ecx + jae fpureg_i586_bzero_loop + + cmpl $0,PCPU(FPCURTHREAD) + je i586_bz3 + + /* XXX check that the condition for cases 1-2 stayed false. */ +i586_bzero_oops: + int $3 + jmp i586_bzero_oops + + frstor 0(%esp) + addl $108,%esp + lmsw %ax + movb $0xfe,kernel_fpu_lock + ret + +i586_bz3: + fstp %st(0) + lmsw %ax + movb $0xfe,kernel_fpu_lock + ret + +intreg_i586_bzero: + /* + * `rep stos' seems to be the best method in practice for small + * counts. Fancy methods usually take too long to start up due + * to cache and BTB misses. 
+ */ + pushl %edi + movl %edx,%edi + xorl %eax,%eax + shrl $2,%ecx + cld + rep + stosl + movl 12(%esp),%ecx + andl $3,%ecx + jne 1f + popl %edi + ret + +1: + rep + stosb + popl %edi + ret +#endif /* I586_CPU && defined(DEV_NPX) */ + +ENTRY(sse2_pagezero) + pushl %ebx + movl 8(%esp),%ecx + movl %ecx,%eax + addl $4096,%eax + xor %ebx,%ebx +1: + movnti %ebx,(%ecx) + addl $4,%ecx + cmpl %ecx,%eax + jne 1b + sfence + popl %ebx + ret + +ENTRY(i686_pagezero) + pushl %edi + pushl %ebx + + movl 12(%esp), %edi + movl $1024, %ecx + cld + + ALIGN_TEXT +1: + xorl %eax, %eax + repe + scasl + jnz 2f + + popl %ebx + popl %edi + ret + + ALIGN_TEXT + +2: + incl %ecx + subl $4, %edi + + movl %ecx, %edx + cmpl $16, %ecx + + jge 3f + + movl %edi, %ebx + andl $0x3f, %ebx + shrl %ebx + shrl %ebx + movl $16, %ecx + subl %ebx, %ecx + +3: + subl %ecx, %edx + rep + stosl + + movl %edx, %ecx + testl %edx, %edx + jnz 1b + + popl %ebx + popl %edi + ret + +/* fillw(pat, base, cnt) */ +ENTRY(fillw) + pushl %edi + movl 8(%esp),%eax + movl 12(%esp),%edi + movl 16(%esp),%ecx + cld + rep + stosw + popl %edi + ret + +ENTRY(bcopyb) + pushl %esi + pushl %edi + movl 12(%esp),%esi + movl 16(%esp),%edi + movl 20(%esp),%ecx + movl %edi,%eax + subl %esi,%eax + cmpl %ecx,%eax /* overlapping && src < dst? */ + jb 1f + cld /* nope, copy forwards */ + rep + movsb + popl %edi + popl %esi + ret + + ALIGN_TEXT +1: + addl %ecx,%edi /* copy backwards. */ + addl %ecx,%esi + decl %edi + decl %esi + std + rep + movsb + popl %edi + popl %esi + cld + ret + +ENTRY(bcopy) + MEXITCOUNT + jmp *bcopy_vector + +/* + * generic_bcopy(src, dst, cnt) + * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 + */ +ENTRY(generic_bcopy) + pushl %esi + pushl %edi + movl 12(%esp),%esi + movl 16(%esp),%edi + movl 20(%esp),%ecx + + movl %edi,%eax + subl %esi,%eax + cmpl %ecx,%eax /* overlapping && src < dst? 
*/ + jb 1f + + shrl $2,%ecx /* copy by 32-bit words */ + cld /* nope, copy forwards */ + rep + movsl + movl 20(%esp),%ecx + andl $3,%ecx /* any bytes left? */ + rep + movsb + popl %edi + popl %esi + ret + + ALIGN_TEXT +1: + addl %ecx,%edi /* copy backwards */ + addl %ecx,%esi + decl %edi + decl %esi + andl $3,%ecx /* any fractional bytes? */ + std + rep + movsb + movl 20(%esp),%ecx /* copy remainder by 32-bit words */ + shrl $2,%ecx + subl $3,%esi + subl $3,%edi + rep + movsl + popl %edi + popl %esi + cld + ret + +#if defined(I586_CPU) && defined(DEV_NPX) +ENTRY(i586_bcopy) + pushl %esi + pushl %edi + movl 12(%esp),%esi + movl 16(%esp),%edi + movl 20(%esp),%ecx + + movl %edi,%eax + subl %esi,%eax + cmpl %ecx,%eax /* overlapping && src < dst? */ + jb 1f + + cmpl $1024,%ecx + jb small_i586_bcopy + + sarb $1,kernel_fpu_lock + jc small_i586_bcopy + cmpl $0,PCPU(FPCURTHREAD) + je i586_bc1 + + /* XXX turn off handling of cases 1-2, as above. */ + movb $0xfe,kernel_fpu_lock + jmp small_i586_bcopy + + smsw %dx + clts + subl $108,%esp + fnsave 0(%esp) + jmp 4f + +i586_bc1: + smsw %dx + clts + fninit /* XXX should avoid needing this */ + + ALIGN_TEXT +4: + pushl %ecx +#define DCACHE_SIZE 8192 + cmpl $(DCACHE_SIZE-512)/2,%ecx + jbe 2f + movl $(DCACHE_SIZE-512)/2,%ecx +2: + subl %ecx,0(%esp) + cmpl $256,%ecx + jb 5f /* XXX should prefetch if %ecx >= 32 */ + pushl %esi + pushl %ecx + ALIGN_TEXT +3: + movl 0(%esi),%eax + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + addl $256,%esi + subl $256,%ecx + cmpl $256,%ecx + jae 3b + popl %ecx + popl %esi +5: + ALIGN_TEXT +large_i586_bcopy_loop: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fistpq 56(%edi) + fistpq 48(%edi) + fistpq 40(%edi) + fistpq 32(%edi) + fistpq 24(%edi) + fistpq 16(%edi) + fistpq 8(%edi) + fistpq 0(%edi) + addl $64,%esi + 
addl $64,%edi + subl $64,%ecx + cmpl $64,%ecx + jae large_i586_bcopy_loop + popl %eax + addl %eax,%ecx + cmpl $64,%ecx + jae 4b + + cmpl $0,PCPU(FPCURTHREAD) + je i586_bc2 + + /* XXX check that the condition for cases 1-2 stayed false. */ +i586_bcopy_oops: + int $3 + jmp i586_bcopy_oops + + frstor 0(%esp) + addl $108,%esp +i586_bc2: + lmsw %dx + movb $0xfe,kernel_fpu_lock + +/* + * This is a duplicate of the main part of generic_bcopy. See the comments + * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and + * would mess up high resolution profiling. + */ + ALIGN_TEXT +small_i586_bcopy: + shrl $2,%ecx + cld + rep + movsl + movl 20(%esp),%ecx + andl $3,%ecx + rep + movsb + popl %edi + popl %esi + ret + + ALIGN_TEXT +1: + addl %ecx,%edi + addl %ecx,%esi + decl %edi + decl %esi + andl $3,%ecx + std + rep + movsb + movl 20(%esp),%ecx + shrl $2,%ecx + subl $3,%esi + subl $3,%edi + rep + movsl + popl %edi + popl %esi + cld + ret +#endif /* I586_CPU && defined(DEV_NPX) */ + +/* + * Note: memcpy does not support overlapping copies + */ +ENTRY(memcpy) + pushl %edi + pushl %esi + movl 12(%esp),%edi + movl 16(%esp),%esi + movl 20(%esp),%ecx + movl %edi,%eax + shrl $2,%ecx /* copy by 32-bit words */ + cld /* nope, copy forwards */ + rep + movsl + movl 20(%esp),%ecx + andl $3,%ecx /* any bytes left? */ + rep + movsb + popl %esi + popl %edi + ret + + +/*****************************************************************************/ +/* copyout and fubyte family */ +/*****************************************************************************/ +/* + * Access user memory from inside the kernel. These routines and possibly + * the math- and DOS emulators should be the only places that do this. + * + * We have to access the memory with user's permissions, so use a segment + * selector with RPL 3. 
For writes to user space we have to additionally + * check the PTE for write permission, because the 386 does not check + * write permissions when we are executing with EPL 0. The 486 does check + * this if the WP bit is set in CR0, so we can use a simpler version here. + * + * These routines set curpcb->onfault for the time they execute. When a + * protection violation occurs inside the functions, the trap handler + * returns to *curpcb->onfault instead of the function. + */ + +/* + * copyout(from_kernel, to_user, len) - MP SAFE (if not I386_CPU) + */ +ENTRY(copyout) + MEXITCOUNT + jmp *copyout_vector + +ENTRY(generic_copyout) + movl PCPU(CURPCB),%eax + movl $copyout_fault,PCB_ONFAULT(%eax) + pushl %esi + pushl %edi + pushl %ebx + movl 16(%esp),%esi + movl 20(%esp),%edi + movl 24(%esp),%ebx + testl %ebx,%ebx /* anything to do? */ + jz done_copyout + + /* + * Check explicitly for non-user addresses. If 486 write protection + * is being used, this check is essential because we are in kernel + * mode so the h/w does not provide any protection against writing + * kernel addresses. + */ + + /* + * First, prevent address wrapping. + */ + movl %edi,%eax + addl %ebx,%eax + jc copyout_fault +/* + * XXX STOP USING VM_MAXUSER_ADDRESS. + * It is an end address, not a max, so every time it is used correctly it + * looks like there is an off by one error, and of course it caused an off + * by one error in several places. 
+ */ + cmpl $VM_MAXUSER_ADDRESS,%eax + ja copyout_fault + + /* bcopy(%esi, %edi, %ebx) */ + movl %ebx,%ecx + +#if defined(I586_CPU) && defined(DEV_NPX) + ALIGN_TEXT +slow_copyout: +#endif + shrl $2,%ecx + cld + rep + movsl + movb %bl,%cl + andb $3,%cl + rep + movsb + +done_copyout: + popl %ebx + popl %edi + popl %esi + xorl %eax,%eax + movl PCPU(CURPCB),%edx + movl %eax,PCB_ONFAULT(%edx) + ret + + ALIGN_TEXT +copyout_fault: + popl %ebx + popl %edi + popl %esi + movl PCPU(CURPCB),%edx + movl $0,PCB_ONFAULT(%edx) + movl $EFAULT,%eax + ret + +#if defined(I586_CPU) && defined(DEV_NPX) +ENTRY(i586_copyout) + /* + * Duplicated from generic_copyout. Could be done a bit better. + */ + movl PCPU(CURPCB),%eax + movl $copyout_fault,PCB_ONFAULT(%eax) + pushl %esi + pushl %edi + pushl %ebx + movl 16(%esp),%esi + movl 20(%esp),%edi + movl 24(%esp),%ebx + testl %ebx,%ebx /* anything to do? */ + jz done_copyout + + /* + * Check explicitly for non-user addresses. If 486 write protection + * is being used, this check is essential because we are in kernel + * mode so the h/w does not provide any protection against writing + * kernel addresses. + */ + + /* + * First, prevent address wrapping. + */ + movl %edi,%eax + addl %ebx,%eax + jc copyout_fault +/* + * XXX STOP USING VM_MAXUSER_ADDRESS. + * It is an end address, not a max, so every time it is used correctly it + * looks like there is an off by one error, and of course it caused an off + * by one error in several places. + */ + cmpl $VM_MAXUSER_ADDRESS,%eax + ja copyout_fault + + /* bcopy(%esi, %edi, %ebx) */ +3: + movl %ebx,%ecx + /* + * End of duplicated code. 
+ */ + + cmpl $1024,%ecx + jb slow_copyout + + pushl %ecx + call fastmove + addl $4,%esp + jmp done_copyout +#endif /* I586_CPU && defined(DEV_NPX) */ + +/* + * copyin(from_user, to_kernel, len) - MP SAFE + */ +ENTRY(copyin) + MEXITCOUNT + jmp *copyin_vector + +ENTRY(generic_copyin) + movl PCPU(CURPCB),%eax + movl $copyin_fault,PCB_ONFAULT(%eax) + pushl %esi + pushl %edi + movl 12(%esp),%esi /* caddr_t from */ + movl 16(%esp),%edi /* caddr_t to */ + movl 20(%esp),%ecx /* size_t len */ + + /* + * make sure address is valid + */ + movl %esi,%edx + addl %ecx,%edx + jc copyin_fault + cmpl $VM_MAXUSER_ADDRESS,%edx + ja copyin_fault + +#if defined(I586_CPU) && defined(DEV_NPX) + ALIGN_TEXT +slow_copyin: +#endif + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + +#if defined(I586_CPU) && defined(DEV_NPX) + ALIGN_TEXT +done_copyin: +#endif + popl %edi + popl %esi + xorl %eax,%eax + movl PCPU(CURPCB),%edx + movl %eax,PCB_ONFAULT(%edx) + ret + + ALIGN_TEXT +copyin_fault: + popl %edi + popl %esi + movl PCPU(CURPCB),%edx + movl $0,PCB_ONFAULT(%edx) + movl $EFAULT,%eax + ret + +#if defined(I586_CPU) && defined(DEV_NPX) +ENTRY(i586_copyin) + /* + * Duplicated from generic_copyin. Could be done a bit better. + */ + movl PCPU(CURPCB),%eax + movl $copyin_fault,PCB_ONFAULT(%eax) + pushl %esi + pushl %edi + movl 12(%esp),%esi /* caddr_t from */ + movl 16(%esp),%edi /* caddr_t to */ + movl 20(%esp),%ecx /* size_t len */ + + /* + * make sure address is valid + */ + movl %esi,%edx + addl %ecx,%edx + jc copyin_fault + cmpl $VM_MAXUSER_ADDRESS,%edx + ja copyin_fault + /* + * End of duplicated code. 
+ */ + + cmpl $1024,%ecx + jb slow_copyin + + pushl %ebx /* XXX prepare for fastmove_fault */ + pushl %ecx + call fastmove + addl $8,%esp + jmp done_copyin +#endif /* I586_CPU && defined(DEV_NPX) */ + +#if defined(I586_CPU) && defined(DEV_NPX) +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx XXX changed to on stack for profiling + uses %eax and %edx for tmp. storage + */ +/* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ +ENTRY(fastmove) + pushl %ebp + movl %esp,%ebp + subl $PCB_SAVEFPU_SIZE+3*4,%esp + + movl 8(%ebp),%ecx + cmpl $63,%ecx + jbe fastmove_tail + + testl $7,%esi /* check if src addr is multiple of 8 */ + jnz fastmove_tail + + testl $7,%edi /* check if dst addr is multiple of 8 */ + jnz fastmove_tail + + /* XXX grab FPU context atomically. */ + call ni_cli + +/* if (fpcurthread != NULL) { */ + cmpl $0,PCPU(FPCURTHREAD) + je 6f +/* fnsave(&curpcb->pcb_savefpu); */ + movl PCPU(CURPCB),%eax + fnsave PCB_SAVEFPU(%eax) +/* FPCURTHREAD = NULL; */ + movl $0,PCPU(FPCURTHREAD) +/* } */ +6: +/* now we own the FPU. */ + +/* + * The process' FP state is saved in the pcb, but if we get + * switched, the cpu_switch() will store our FP state in the + * pcb. It should be possible to avoid all the copying for + * this, e.g., by setting a flag to tell cpu_switch() to + * save the state somewhere else. + */ +/* tmp = curpcb->pcb_savefpu; */ + movl %ecx,-12(%ebp) + movl %esi,-8(%ebp) + movl %edi,-4(%ebp) + movl %esp,%edi + movl PCPU(CURPCB),%esi + addl $PCB_SAVEFPU,%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + movl -12(%ebp),%ecx + movl -8(%ebp),%esi + movl -4(%ebp),%edi +/* stop_emulating(); */ + clts +/* fpcurthread = curthread; */ + movl PCPU(CURTHREAD),%eax + movl %eax,PCPU(FPCURTHREAD) + movl PCPU(CURPCB),%eax + + /* XXX end of atomic FPU context grab. 
*/ + call ni_sti + + movl $fastmove_fault,PCB_ONFAULT(%eax) +4: + movl %ecx,-12(%ebp) + cmpl $1792,%ecx + jbe 2f + movl $1792,%ecx +2: + subl %ecx,-12(%ebp) + cmpl $256,%ecx + jb 5f + movl %ecx,-8(%ebp) + movl %esi,-4(%ebp) + ALIGN_TEXT +3: + movl 0(%esi),%eax + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + addl $256,%esi + subl $256,%ecx + cmpl $256,%ecx + jae 3b + movl -8(%ebp),%ecx + movl -4(%ebp),%esi +5: + ALIGN_TEXT +fastmove_loop: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fistpq 56(%edi) + fistpq 48(%edi) + fistpq 40(%edi) + fistpq 32(%edi) + fistpq 24(%edi) + fistpq 16(%edi) + fistpq 8(%edi) + fistpq 0(%edi) + addl $-64,%ecx + addl $64,%esi + addl $64,%edi + cmpl $63,%ecx + ja fastmove_loop + movl -12(%ebp),%eax + addl %eax,%ecx + cmpl $64,%ecx + jae 4b + + /* XXX ungrab FPU context atomically. */ + call ni_cli + +/* curpcb->pcb_savefpu = tmp; */ + movl %ecx,-12(%ebp) + movl %esi,-8(%ebp) + movl %edi,-4(%ebp) + movl PCPU(CURPCB),%edi + addl $PCB_SAVEFPU,%edi + movl %esp,%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + movl -12(%ebp),%ecx + movl -8(%ebp),%esi + movl -4(%ebp),%edi + +/* start_emulating(); */ + smsw %ax + orb $CR0_TS,%al + lmsw %ax +/* fpcurthread = NULL; */ + movl $0,PCPU(FPCURTHREAD) + + /* XXX end of atomic FPU context ungrab. */ + call ni_sti + + ALIGN_TEXT +fastmove_tail: + movl PCPU(CURPCB),%eax + movl $fastmove_tail_fault,PCB_ONFAULT(%eax) + + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + movl %ebp,%esp + popl %ebp + ret + + ALIGN_TEXT +fastmove_fault: + /* XXX ungrab FPU context atomically. 
*/ + call ni_cli + + movl PCPU(CURPCB),%edi + addl $PCB_SAVEFPU,%edi + movl %esp,%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + + smsw %ax + orb $CR0_TS,%al + lmsw %ax + movl $0,PCPU(FPCURTHREAD) + + /* XXX end of atomic FPU context ungrab. */ + call ni_sti + +fastmove_tail_fault: + movl %ebp,%esp + popl %ebp + addl $8,%esp + popl %ebx + popl %edi + popl %esi + movl PCPU(CURPCB),%edx + movl $0,PCB_ONFAULT(%edx) + movl $EFAULT,%eax + ret +#endif /* I586_CPU && defined(DEV_NPX) */ + +/* + * casuptr. Compare and set user pointer. Returns -1 or the current value. + */ +ENTRY(casuptr) + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl 4(%esp),%edx /* dst */ + movl 8(%esp),%eax /* old */ + movl 12(%esp),%ecx /* new */ + + cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ + ja fusufault + +#ifdef SMP + lock +#endif + cmpxchgl %ecx, (%edx) /* Compare and set. */ + + /* + * The old value is in %eax. If the store succeeded it will be the + * value we expected (old) from before the store, otherwise it will + * be the current value. + */ + + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * fu{byte,sword,word} - MP SAFE + * + * Fetch a byte (sword, word) from user memory + */ +ENTRY(fuword) + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl 4(%esp),%edx /* from */ + + cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ + ja fusufault + + movl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +ENTRY(fuword32) + jmp fuword + +/* + * These two routines are called from the profiling code, potentially + * at interrupt time. If they fail, that's okay, good things will + * happen later. Fail all the time for now - until the trap code is + * able to deal with this. 
+ */ +ALTENTRY(suswintr) +ENTRY(fuswintr) + movl $-1,%eax + ret + +/* + * fuword16 - MP SAFE + */ +ENTRY(fuword16) + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl 4(%esp),%edx + + cmpl $VM_MAXUSER_ADDRESS-2,%edx + ja fusufault + + movzwl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + +/* + * fubyte - MP SAFE + */ +ENTRY(fubyte) + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl 4(%esp),%edx + + cmpl $VM_MAXUSER_ADDRESS-1,%edx + ja fusufault + + movzbl (%edx),%eax + movl $0,PCB_ONFAULT(%ecx) + ret + + ALIGN_TEXT +fusufault: + movl PCPU(CURPCB),%ecx + xorl %eax,%eax + movl %eax,PCB_ONFAULT(%ecx) + decl %eax + ret + +/* + * su{byte,sword,word} - MP SAFE (if not I386_CPU) + * + * Write a byte (word, longword) to user memory + */ +ENTRY(suword) + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl 4(%esp),%edx + + cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ + ja fusufault + + movl 8(%esp),%eax + movl %eax,(%edx) + xorl %eax,%eax + movl PCPU(CURPCB),%ecx + movl %eax,PCB_ONFAULT(%ecx) + ret + +ENTRY(suword32) + jmp suword + +/* + * suword16 - MP SAFE (if not I386_CPU) + */ +ENTRY(suword16) + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl 4(%esp),%edx + + cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ + ja fusufault + + movw 8(%esp),%ax + movw %ax,(%edx) + xorl %eax,%eax + movl PCPU(CURPCB),%ecx /* restore trashed register */ + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * subyte - MP SAFE (if not I386_CPU) + */ +ENTRY(subyte) + movl PCPU(CURPCB),%ecx + movl $fusufault,PCB_ONFAULT(%ecx) + movl 4(%esp),%edx + + cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ + ja fusufault + + movb 8(%esp),%al + movb %al,(%edx) + xorl %eax,%eax + movl PCPU(CURPCB),%ecx /* restore trashed register */ + movl %eax,PCB_ONFAULT(%ecx) + ret + +/* + * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE + * + * copy a string from from to to, stop when a 0 character is reached. 
+ * return ENAMETOOLONG if string is longer than maxlen, and + * EFAULT on protection violations. If lencopied is non-zero, + * return the actual length in *lencopied. + */ +ENTRY(copyinstr) + pushl %esi + pushl %edi + movl PCPU(CURPCB),%ecx + movl $cpystrflt,PCB_ONFAULT(%ecx) + + movl 12(%esp),%esi /* %esi = from */ + movl 16(%esp),%edi /* %edi = to */ + movl 20(%esp),%edx /* %edx = maxlen */ + + movl $VM_MAXUSER_ADDRESS,%eax + + /* make sure 'from' is within bounds */ + subl %esi,%eax + jbe cpystrflt + + /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ + cmpl %edx,%eax + jae 1f + movl %eax,%edx + movl %eax,20(%esp) +1: + incl %edx + cld + +2: + decl %edx + jz 3f + + lodsb + stosb + orb %al,%al + jnz 2b + + /* Success -- 0 byte reached */ + decl %edx + xorl %eax,%eax + jmp cpystrflt_x +3: + /* edx is zero - return ENAMETOOLONG or EFAULT */ + cmpl $VM_MAXUSER_ADDRESS,%esi + jae cpystrflt +4: + movl $ENAMETOOLONG,%eax + jmp cpystrflt_x + +cpystrflt: + movl $EFAULT,%eax + +cpystrflt_x: + /* set *lencopied and return %eax */ + movl PCPU(CURPCB),%ecx + movl $0,PCB_ONFAULT(%ecx) + movl 20(%esp),%ecx + subl %edx,%ecx + movl 24(%esp),%edx + testl %edx,%edx + jz 1f + movl %ecx,(%edx) +1: + popl %edi + popl %esi + ret + + +/* + * copystr(from, to, maxlen, int *lencopied) - MP SAFE + */ +ENTRY(copystr) + pushl %esi + pushl %edi + + movl 12(%esp),%esi /* %esi = from */ + movl 16(%esp),%edi /* %edi = to */ + movl 20(%esp),%edx /* %edx = maxlen */ + incl %edx + cld +1: + decl %edx + jz 4f + lodsb + stosb + orb %al,%al + jnz 1b + + /* Success -- 0 byte reached */ + decl %edx + xorl %eax,%eax + jmp 6f +4: + /* edx is zero -- return ENAMETOOLONG */ + movl $ENAMETOOLONG,%eax + +6: + /* set *lencopied and return %eax */ + movl 20(%esp),%ecx + subl %edx,%ecx + movl 24(%esp),%edx + testl %edx,%edx + jz 7f + movl %ecx,(%edx) +7: + popl %edi + popl %esi + ret + +ENTRY(bcmp) + pushl %edi + pushl %esi + movl 12(%esp),%edi + movl 16(%esp),%esi + movl 20(%esp),%edx + xorl %eax,%eax + + 
movl %edx,%ecx + shrl $2,%ecx + cld /* compare forwards */ + repe + cmpsl + jne 1f + + movl %edx,%ecx + andl $3,%ecx + repe + cmpsb + je 2f +1: + incl %eax +2: + popl %esi + popl %edi + ret + + +/* + * Handling of special 386 registers and descriptor tables etc + */ +/* void lgdt(struct region_descriptor *rdp); */ +ENTRY(lgdt_finish) +#if 0 + /* reload the descriptor table */ + movl 4(%esp),%eax + lgdt (%eax) +#endif + /* flush the prefetch q */ + jmp 1f + nop +1: + /* reload "stale" selectors */ + movl $KDSEL,%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%gs + movl %eax,%ss + movl $KPSEL,%eax + movl %eax,%fs + + /* reload code selector by turning return into intersegmental return */ + movl (%esp),%eax + pushl %eax + movl $KCSEL,4(%esp) + lret + +/* ssdtosd(*ssdp,*sdp) */ +ENTRY(ssdtosd) + pushl %ebx + movl 8(%esp),%ecx + movl 8(%ecx),%ebx + shll $16,%ebx + movl (%ecx),%edx + roll $16,%edx + movb %dh,%bl + movb %dl,%bh + rorl $8,%ebx + movl 4(%ecx),%eax + movw %ax,%dx + andl $0xf0000,%eax + orl %eax,%ebx + movl 12(%esp),%ecx + movl %edx,(%ecx) + movl %ebx,4(%ecx) + popl %ebx + ret + +/* void reset_dbregs() */ +ENTRY(reset_dbregs) + movl $0,%eax + movl %eax,%dr7 /* disable all breapoints first */ + movl %eax,%dr0 + movl %eax,%dr1 + movl %eax,%dr2 + movl %eax,%dr3 + movl %eax,%dr6 + ret + +/*****************************************************************************/ +/* setjump, longjump */ +/*****************************************************************************/ + +ENTRY(setjmp) + movl 4(%esp),%eax + movl %ebx,(%eax) /* save ebx */ + movl %esp,4(%eax) /* save esp */ + movl %ebp,8(%eax) /* save ebp */ + movl %esi,12(%eax) /* save esi */ + movl %edi,16(%eax) /* save edi */ + movl (%esp),%edx /* get rta */ + movl %edx,20(%eax) /* save eip */ + xorl %eax,%eax /* return(0); */ + ret + +ENTRY(longjmp) + movl 4(%esp),%eax + movl (%eax),%ebx /* restore ebx */ + movl 4(%eax),%esp /* restore esp */ + movl 8(%eax),%ebp /* restore ebp */ + movl 12(%eax),%esi /* 
restore esi */ + movl 16(%eax),%edi /* restore edi */ + movl 20(%eax),%edx /* get rta */ + movl %edx,(%esp) /* put in return frame */ + xorl %eax,%eax /* return(1); */ + incl %eax + ret + +/* + * Support for BB-profiling (gcc -a). The kernbb program will extract + * the data from the kernel. + */ + + .data + ALIGN_DATA + .globl bbhead +bbhead: + .long 0 + + .text +NON_GPROF_ENTRY(__bb_init_func) + movl 4(%esp),%eax + movl $1,(%eax) + movl bbhead,%edx + movl %edx,16(%eax) + movl %eax,bbhead + NON_GPROF_RET diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/swtch.s b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/swtch.s new file mode 100644 index 0000000000..f468c429bd --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/swtch.s @@ -0,0 +1,445 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.143 2003/09/30 08:11:35 jeff Exp $ + */ + +#include "opt_npx.h" + +#include <machine/asmacros.h> + +#include "assym.s" + + +/*****************************************************************************/ +/* Scheduling */ +/*****************************************************************************/ + + .text + +/* + * cpu_throw() + * + * This is the second half of cpu_swtch(). It is used when the current + * thread is either a dummy or slated to die, and we no longer care + * about its state. This is only a slight optimization and is probably + * not worth it anymore. Note that we need to clear the pm_active bits so + * we do need the old proc if it still exists. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd + */ +ENTRY(cpu_throw) + movl PCPU(CPUID), %esi + movl 4(%esp),%ecx /* Old thread */ + testl %ecx,%ecx /* no thread? 
*/ + jz 1f + /* release bit from old pm_active */ + movl PCPU(CURPMAP), %ebx +#ifdef SMP + lock +#endif + btrl %esi, PM_ACTIVE(%ebx) /* clear old */ +1: + movl 8(%esp),%ecx /* New thread */ + movl TD_PCB(%ecx),%edx + movl PCB_CR3(%edx),%eax + + movl %eax,PCPU(CR3) /* new address space */ + + pushl %ecx + pushl %edx + pushl %esi + pushl %eax + call load_cr3 + addl $4,%esp + popl %esi + popl %edx + popl %ecx + + /* set bit in new pm_active */ + movl TD_PROC(%ecx),%eax + movl P_VMSPACE(%eax), %ebx + addl $VM_PMAP, %ebx + movl %ebx, PCPU(CURPMAP) +#ifdef SMP + lock +#endif + btsl %esi, PM_ACTIVE(%ebx) /* set new */ + jmp sw1 + +/* + * cpu_switch(old, new) + * + * Save the current thread state, then select the next thread to run + * and load its state. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd + */ +ENTRY(cpu_switch) + + /* Switch to new thread. First, save context. */ + movl 4(%esp),%ecx + +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? */ + jz badsw2 /* no, panic */ +#endif + + movl TD_PCB(%ecx),%edx + + movl (%esp),%eax /* Hardware registers */ + movl %eax,PCB_EIP(%edx) + movl %ebx,PCB_EBX(%edx) + movl %esp,PCB_ESP(%edx) + movl %ebp,PCB_EBP(%edx) + movl %esi,PCB_ESI(%edx) + movl %edi,PCB_EDI(%edx) + movl %gs,PCB_GS(%edx) +#if 0 + pushfl /* PSL */ + popl PCB_PSL(%edx) +#endif + /* Check to see if we need to call a switchout function. */ + movl PCB_SWITCHOUT(%edx),%eax + cmpl $0, %eax + je 1f + call *%eax +1: + /* Test if debug registers should be saved. */ + testl $PCB_DBREGS,PCB_FLAGS(%edx) + jz 1f /* no, skip over */ + movl %dr7,%eax /* yes, do the save */ + movl %eax,PCB_DR7(%edx) + andl $0x0000fc00, %eax /* disable all watchpoints */ + movl %eax,%dr7 + movl %dr6,%eax + movl %eax,PCB_DR6(%edx) + movl %dr3,%eax + movl %eax,PCB_DR3(%edx) + movl %dr2,%eax + movl %eax,PCB_DR2(%edx) + movl %dr1,%eax + movl %eax,PCB_DR1(%edx) + movl %dr0,%eax + movl %eax,PCB_DR0(%edx) +1: + +#ifdef DEV_NPX + /* have we used fp, and need a save? 
*/ + cmpl %ecx,PCPU(FPCURTHREAD) + jne 1f + addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ + pushl %edx + call npxsave /* do it in a big C function */ + popl %eax +1: +#endif + + + /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl %ecx,%edi + movl 8(%esp),%ecx /* New thread */ +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? */ + jz badsw3 /* no, panic */ +#endif + movl TD_PCB(%ecx),%edx + movl PCPU(CPUID), %esi + + /* switch address space */ + movl PCB_CR3(%edx),%eax + + cmpl %eax,IdlePTD /* Kernel address space? */ + + je sw1 + /* XXX optimize later KMM */ +#if 0 + movl %cr3,%ebx /* The same address space? */ +#else + movl PCPU(CR3),%ebx +#endif + cmpl %ebx,%eax + je sw1 + + movl %eax,PCPU(CR3) /* new address space */ + + pushl %edx + pushl %ecx + pushl %esi + pushl %eax + call load_cr3 /* inform xen of the switch */ + addl $4,%esp + popl %esi + popl %ecx + popl %edx + + /* Release bit from old pmap->pm_active */ + movl PCPU(CURPMAP), %ebx + +#ifdef SMP + lock +#endif + btrl %esi, PM_ACTIVE(%ebx) /* clear old */ + /* Set bit in new pmap->pm_active */ + movl TD_PROC(%ecx),%eax /* newproc */ + movl P_VMSPACE(%eax), %ebx + addl $VM_PMAP, %ebx + movl %ebx, PCPU(CURPMAP) +#ifdef SMP + lock +#endif + btsl %esi, PM_ACTIVE(%ebx) /* set new */ +sw1: + +#if 0 + + /* only one task selector under Xen */ + /* + * At this point, we've switched address spaces and are ready + * to load up the rest of the next context. + */ + cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ + je 1f /* If not, use the default */ + btsl %esi, private_tss /* mark use of private tss */ + movl PCB_EXT(%edx), %edi /* new tss descriptor */ + jmp 2f /* Load it up */ + +1: /* + * Use the common default TSS instead of our own. + * Set our stack pointer into the TSS, it's set to just + * below the PCB. 
In C, common_tss.tss_esp0 = &pcb - 16; + */ + leal -16(%edx), %ebx /* leave space for vm86 */ + movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 + + /* + * Test this CPU's bit in the bitmap to see if this + * CPU was using a private TSS. + */ + btrl %esi, private_tss /* Already using the common? */ + jae 3f /* if so, skip reloading */ + PCPU_ADDR(COMMON_TSSD, %edi) +2: + /* Move correct tss descriptor into GDT slot, then reload tr. */ + movl PCPU(TSS_GDT), %ebx /* entry in GDT */ + movl 0(%edi), %eax + movl %eax, 0(%ebx) + movl 4(%edi), %eax + movl %eax, 4(%ebx) + + movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ + ltr %si +#endif /* !XEN */ +3: + /* notify Xen of task switch */ + pushl %edx /* &pcb is the new stack base */ + pushl $KDSEL + pushl $HYPERVISOR_STACK_SWITCH + call ni_queue_multicall2 + addl $12,%esp + /* XXX handle DOM0 IOPL case here (KMM) */ + /* we currently don't support running FreeBSD */ + /* in DOM0 so we can skip for now */ + + call ni_execute_multicall_list + + /* Restore context. */ + movl PCB_EBX(%edx),%ebx + movl PCB_ESP(%edx),%esp + movl PCB_EBP(%edx),%ebp + movl PCB_ESI(%edx),%esi + movl PCB_EDI(%edx),%edi + movl PCB_EIP(%edx),%eax + movl %eax,(%esp) +#if 0 + pushl PCB_PSL(%edx) + popfl +#endif + movl %edx, PCPU(CURPCB) + movl %ecx, PCPU(CURTHREAD) /* into next thread */ + + /* + * Determine the LDT to use and load it if is the default one and + * that is not the current one. + */ + movl TD_PROC(%ecx),%eax + cmpl $0,P_MD+MD_LDT(%eax) + jnz 1f + movl _default_ldt,%eax + cmpl PCPU(CURRENTLDT),%eax + je 2f + pushl %edx + pushl %eax + xorl %eax,%eax + movl %eax,%gs + call i386_reset_ldt + popl %eax + popl %edx + + movl %eax,PCPU(CURRENTLDT) + jmp 2f +1: + /* Load the LDT when it is not the default one. */ + pushl %edx /* Preserve pointer to pcb. */ + addl $P_MD,%eax /* Pointer to mdproc is arg. */ + pushl %eax + call set_user_ldt + addl $4,%esp + popl %edx +2: + /* This must be done after loading the user LDT. 
*/ + .globl cpu_switch_load_gs +cpu_switch_load_gs: + movl PCB_GS(%edx),%gs + + /* XXX evidently setting debug registers needs to be + * routed through Xen - this appears to work - so I + * am leaving it as it is for now - (KMM) + */ + + /* Test if debug registers should be restored. */ + testl $PCB_DBREGS,PCB_FLAGS(%edx) + jz 1f + + /* + * Restore debug registers. The special code for dr7 is to + * preserve the current values of its reserved bits. + */ + movl PCB_DR6(%edx),%eax + movl %eax,%dr6 + movl PCB_DR3(%edx),%eax + movl %eax,%dr3 + movl PCB_DR2(%edx),%eax + movl %eax,%dr2 + movl PCB_DR1(%edx),%eax + movl %eax,%dr1 + movl PCB_DR0(%edx),%eax + movl %eax,%dr0 + movl %dr7,%eax + andl $0x0000fc00,%eax + movl PCB_DR7(%edx),%ecx + andl $~0x0000fc00,%ecx + orl %ecx,%eax + movl %eax,%dr7 +1: + ret + +#ifdef INVARIANTS +badsw1: + pushal + pushl $sw0_1 + call panic +sw0_1: .asciz "cpu_throw: no newthread supplied" + +badsw2: + pushal + pushl $sw0_2 + call panic +sw0_2: .asciz "cpu_switch: no curthread supplied" + +badsw3: + pushal + pushl $sw0_3 + call panic +sw0_3: .asciz "cpu_switch: no newthread supplied" +#endif + +/* + * savectx(pcb) + * Update pcb, saving current processor state. + */ +ENTRY(savectx) + /* Fetch PCB. */ + movl 4(%esp),%ecx + + /* Save caller's return address. Child won't execute this routine. */ + movl (%esp),%eax + movl %eax,PCB_EIP(%ecx) + +#if 0 + movl %cr3,%eax +#else + movl PCPU(CR3),%eax +#endif + movl %eax,PCB_CR3(%ecx) + + movl %ebx,PCB_EBX(%ecx) + movl %esp,PCB_ESP(%ecx) + movl %ebp,PCB_EBP(%ecx) + movl %esi,PCB_ESI(%ecx) + movl %edi,PCB_EDI(%ecx) + movl %gs,PCB_GS(%ecx) +#if 0 + pushfl + popl PCB_PSL(%ecx) +#endif +#ifdef DEV_NPX + /* + * If fpcurthread == NULL, then the npx h/w state is irrelevant and the + * state had better already be in the pcb. This is true for forks + * but not for dumps (the old book-keeping with FP flags in the pcb + * always lost for dumps because the dump pcb has 0 flags). 
+ * + * If fpcurthread != NULL, then we have to save the npx h/w state to + * fpcurthread's pcb and copy it to the requested pcb, or save to the + * requested pcb and reload. Copying is easier because we would + * have to handle h/w bugs for reloading. We used to lose the + * parent's npx state for forks by forgetting to reload. + */ + pushfl + call ni_cli + movl PCPU(FPCURTHREAD),%eax + testl %eax,%eax + je 1f + + pushl %ecx + movl TD_PCB(%eax),%eax + leal PCB_SAVEFPU(%eax),%eax + pushl %eax + pushl %eax + call npxsave + addl $4,%esp + popl %eax + popl %ecx + + pushl $PCB_SAVEFPU_SIZE + leal PCB_SAVEFPU(%ecx),%ecx + pushl %ecx + pushl %eax + call bcopy + addl $12,%esp +1: + popfl +#endif /* DEV_NPX */ + + ret diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/symbols.raw b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/symbols.raw new file mode 100644 index 0000000000..014c6442ad --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/symbols.raw @@ -0,0 +1,75 @@ +# @(#)symbols.raw 7.6 (Berkeley) 5/8/91 +# +# $FreeBSD: src/sys/i386/i386/symbols.raw,v 1.15 1999/08/28 00:43:51 peter Exp $ +# + + +#gdb + _IdlePTD + _PTD + _panicstr + _atdevbase +# _version +#dmesg + _msgbufp +# _msgbuf +#iostat + _tk_nin + _tk_nout + _cp_time +# _io_info +#ps + _nswap + _maxslp + _ccpu + _fscale + _avail_start + _avail_end +#pstat +# _cons + _nswap + _swapblist +# _swaplist +#vmstat + _cp_time +# _rate +# _total +# _sum +# _rectime +# _pgintime + _boottime +#w + _swapdev + _nswap + _averunnable + _boottime +#netstat + _mbstat + _ipstat + _tcb + _tcpstat + _udb + _udpstat +# _rawcb + _ifnet +# _rthost +# _rtnet + _icmpstat + _filehead + _nfiles +# _rthashsize +# _radix_node_head +#routed + _ifnet +#rwho + _boottime +#savecore + _dumpdev + _dumplo + _time_second + _version + _dumpsize + _panicstr + _dumpmag +#deprecated +# _avenrun diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/sys_machdep.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/sys_machdep.c new file mode 100644 index 
0000000000..8f85c128ba --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/sys_machdep.c @@ -0,0 +1,703 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)sys_machdep.c 5.5 (Berkeley) 1/19/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/sys_machdep.c,v 1.91 2003/09/07 05:23:28 davidxu Exp $"); + +#include "opt_kstack_pages.h" +#include "opt_mac.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/mac.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/sysproto.h> +#include <sys/user.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> + +#include <machine/cpu.h> +#include <machine/pcb_ext.h> /* pcb.h included by sys/user.h */ +#include <machine/proc.h> +#include <machine/sysarch.h> +#include <machine/xenfunc.h> + +#include <vm/vm_kern.h> /* for kernel_map */ + +#define MAX_LD 8192 +#define LD_PER_PAGE 512 +#define NEW_MAX_LD(num) ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1)) +#define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3) + +void i386_reset_ldt(struct proc_ldt *pldt); + +static int i386_get_ldt(struct thread *, char *); +static int i386_set_ldt(struct thread *, char *); +static int i386_set_ldt_data(struct thread *, int start, int num, + union descriptor *descs); +static int i386_ldt_grow(struct thread *td, int len); +static int i386_get_ioperm(struct thread *, char *); +static int i386_set_ioperm(struct thread *, char *); +#ifdef SMP +static void set_user_ldt_rv(struct thread *); +#endif + +#ifndef _SYS_SYSPROTO_H_ +struct sysarch_args { + int op; + char *parms; +}; +#endif + +int +sysarch(td, uap) + struct thread *td; + register struct sysarch_args *uap; +{ + int error; + + mtx_lock(&Giant); + switch(uap->op) { + case I386_GET_LDT: + error = i386_get_ldt(td, uap->parms); + break; + + case I386_SET_LDT: + error = i386_set_ldt(td, uap->parms); + break; + case I386_GET_IOPERM: + error = i386_get_ioperm(td, uap->parms); + break; + case I386_SET_IOPERM: + error = i386_set_ioperm(td, uap->parms); + break; +#if 0 + case I386_VM86: + 
error = vm86_sysarch(td, uap->parms); + break; +#endif + default: + error = EINVAL; + break; + } + mtx_unlock(&Giant); + return (error); +} + +int +i386_extend_pcb(struct thread *td) +{ + int i, offset; + u_long *addr; + struct pcb_ext *ext; + struct soft_segment_descriptor ssd = { + 0, /* segment base address (overwritten) */ + ctob(IOPAGES + 1) - 1, /* length */ + SDT_SYS386TSS, /* segment type */ + 0, /* priority level */ + 1, /* descriptor present */ + 0, 0, + 0, /* default 32 size */ + 0 /* granularity */ + }; + + if (td->td_proc->p_flag & P_SA) + return (EINVAL); /* XXXKSE */ +/* XXXKSE All the code below only works in 1:1 needs changing */ + ext = (struct pcb_ext *)kmem_alloc(kernel_map, ctob(IOPAGES+1)); + if (ext == 0) + return (ENOMEM); + bzero(ext, sizeof(struct pcb_ext)); + /* -16 is so we can convert a trapframe into vm86trapframe inplace */ + ext->ext_tss.tss_esp0 = td->td_kstack + ctob(KSTACK_PAGES) - + sizeof(struct pcb) - 16; + ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); + /* + * The last byte of the i/o map must be followed by an 0xff byte. + * We arbitrarily allocate 16 bytes here, to keep the starting + * address on a doubleword boundary. 
+ */ + offset = PAGE_SIZE - 16; + ext->ext_tss.tss_ioopt = + (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16; + ext->ext_iomap = (caddr_t)ext + offset; + ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32; + + addr = (u_long *)ext->ext_vm86.vm86_intmap; + for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++) + *addr++ = ~0; + + ssd.ssd_base = (unsigned)&ext->ext_tss; + ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext); + ssdtosd(&ssd, &ext->ext_tssd); + + KASSERT(td->td_proc == curthread->td_proc, ("giving TSS to !curproc")); + KASSERT(td->td_pcb->pcb_ext == 0, ("already have a TSS!")); + mtx_lock_spin(&sched_lock); + td->td_pcb->pcb_ext = ext; + + /* switch to the new TSS after syscall completes */ + td->td_flags |= TDF_NEEDRESCHED; + mtx_unlock_spin(&sched_lock); + + return 0; +} + +static int +i386_set_ioperm(td, args) + struct thread *td; + char *args; +{ + int i, error; + struct i386_ioperm_args ua; + char *iomap; + + if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) + return (error); + +#ifdef MAC + if ((error = mac_check_sysarch_ioperm(td->td_ucred)) != 0) + return (error); +#endif + if ((error = suser(td)) != 0) + return (error); + if ((error = securelevel_gt(td->td_ucred, 0)) != 0) + return (error); + /* + * XXX + * While this is restricted to root, we should probably figure out + * whether any other driver is using this i/o address, as so not to + * cause confusion. This probably requires a global 'usage registry'. 
+ */ + + if (td->td_pcb->pcb_ext == 0) + if ((error = i386_extend_pcb(td)) != 0) + return (error); + iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; + + if (ua.start + ua.length > IOPAGES * PAGE_SIZE * NBBY) + return (EINVAL); + + for (i = ua.start; i < ua.start + ua.length; i++) { + if (ua.enable) + iomap[i >> 3] &= ~(1 << (i & 7)); + else + iomap[i >> 3] |= (1 << (i & 7)); + } + return (error); +} + +static int +i386_get_ioperm(td, args) + struct thread *td; + char *args; +{ + int i, state, error; + struct i386_ioperm_args ua; + char *iomap; + + if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) + return (error); + if (ua.start >= IOPAGES * PAGE_SIZE * NBBY) + return (EINVAL); + + if (td->td_pcb->pcb_ext == 0) { + ua.length = 0; + goto done; + } + + iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; + + i = ua.start; + state = (iomap[i >> 3] >> (i & 7)) & 1; + ua.enable = !state; + ua.length = 1; + + for (i = ua.start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) { + if (state != ((iomap[i >> 3] >> (i & 7)) & 1)) + break; + ua.length++; + } + +done: + error = copyout(&ua, args, sizeof(struct i386_ioperm_args)); + return (error); +} + +/* + * Update the GDT entry pointing to the LDT to point to the LDT of the + * current process. + * + * This must be called with sched_lock held. Unfortunately, we can't use a + * mtx_assert() here because cpu_switch() calls this function after changing + * curproc but before sched_lock's owner is updated in mi_switch(). + */ +void +set_user_ldt(struct mdproc *mdp) +{ + struct proc_ldt *pldt; + pldt = mdp->md_ldt; + i386_reset_ldt(pldt); + PCPU_SET(currentldt, (int)pldt); + +} + +#ifdef SMP +static void +set_user_ldt_rv(struct thread *td) +{ + + if (td->td_proc != curthread->td_proc) + return; + + set_user_ldt(&td->td_proc->p_md); +} +#endif + +/* + * Must be called with either sched_lock free or held but not recursed. + * If it does not return NULL, it will return with it owned. 
+ */ +struct proc_ldt * +user_ldt_alloc(struct mdproc *mdp, int len) +{ + struct proc_ldt *pldt,*new_ldt; + + + if (mtx_owned(&sched_lock)) + mtx_unlock_spin(&sched_lock); + mtx_assert(&sched_lock, MA_NOTOWNED); + MALLOC(new_ldt, struct proc_ldt *, sizeof(struct proc_ldt), + M_SUBPROC, M_WAITOK); + + new_ldt->ldt_len = len = NEW_MAX_LD(len); + new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map, + round_page(len * sizeof(union descriptor))); + if (new_ldt->ldt_base == NULL) { + FREE(new_ldt, M_SUBPROC); + return NULL; + } + new_ldt->ldt_refcnt = 1; + new_ldt->ldt_active = 0; + + mtx_lock_spin(&sched_lock); + + if ((pldt = mdp->md_ldt)) { + if (len > pldt->ldt_len) + len = pldt->ldt_len; + bcopy(pldt->ldt_base, new_ldt->ldt_base, + len * sizeof(union descriptor)); + } else { + bcopy(ldt, new_ldt->ldt_base, PAGE_SIZE); + } + pmap_map_readonly(kernel_pmap, (vm_offset_t)new_ldt->ldt_base, + new_ldt->ldt_len*sizeof(union descriptor)); + return new_ldt; +} + +/* + * Must be called either with sched_lock free or held but not recursed. + * If md_ldt is not NULL, it will return with sched_lock released. 
+ */ +void +user_ldt_free(struct thread *td) +{ + struct mdproc *mdp = &td->td_proc->p_md; + struct proc_ldt *pldt = mdp->md_ldt; + if (pldt == NULL) + return; + + if (!mtx_owned(&sched_lock)) + mtx_lock_spin(&sched_lock); + mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + if (td == PCPU_GET(curthread)) { + PCPU_SET(currentldt, _default_ldt); + i386_reset_ldt((struct proc_ldt *)_default_ldt); + } + + mdp->md_ldt = NULL; + if (--pldt->ldt_refcnt == 0) { + mtx_unlock_spin(&sched_lock); + + pmap_map_readwrite(kernel_pmap,(vm_offset_t) pldt->ldt_base, + pldt->ldt_len*sizeof(union descriptor)); + kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base, + pldt->ldt_len * sizeof(union descriptor)); + FREE(pldt, M_SUBPROC); + } else + mtx_unlock_spin(&sched_lock); +} + +void +i386_reset_ldt(struct proc_ldt *pldt) +{ + xen_set_ldt((vm_offset_t)pldt->ldt_base, pldt->ldt_len); +} + +static int +i386_get_ldt(td, args) + struct thread *td; + char *args; +{ + int error = 0; + struct proc_ldt *pldt = td->td_proc->p_md.md_ldt; + int nldt, num; + union descriptor *lp; + struct i386_ldt_args ua, *uap = &ua; + + if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) + return(error); + +#ifdef DEBUG + printf("i386_get_ldt: start=%d num=%d descs=%p\n", + uap->start, uap->num, (void *)uap->descs); +#endif + + /* verify range of LDTs exist */ + if ((uap->start < 0) || (uap->num <= 0)) + return(EINVAL); + + if (pldt) { + nldt = pldt->ldt_len; + num = min(uap->num, nldt); + lp = &((union descriptor *)(pldt->ldt_base))[uap->start]; + } else { + nldt = sizeof(ldt)/sizeof(ldt[0]); + num = min(uap->num, nldt); + lp = &ldt[uap->start]; + } + if (uap->start + num > nldt) + return(EINVAL); + + error = copyout(lp, uap->descs, num * sizeof(union descriptor)); + if (!error) + td->td_retval[0] = num; + + return(error); +} + +static int ldt_warnings; +#define NUM_LDT_WARNINGS 10 + +static int +i386_set_ldt(struct thread *td, char *args) +{ + int error = 0, i; + int largest_ld; + struct 
mdproc *mdp = &td->td_proc->p_md; + struct proc_ldt *pldt = 0; + struct i386_ldt_args ua, *uap = &ua; + union descriptor *descs, *dp; + int descs_size; + + if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) + return(error); +#ifdef DEBUG + printf("i386_set_ldt: start=%d num=%d descs=%p\n", + uap->start, uap->num, (void *)uap->descs); + +#endif + + if (uap->descs == NULL) { + /* Free descriptors */ + if (uap->start == 0 && uap->num == 0) { + /* + * Treat this as a special case, so userland needn't + * know magic number NLDT. + */ + uap->start = NLDT; + uap->num = MAX_LD - NLDT; + } + if (uap->start <= LUDATA_SEL || uap->num <= 0) + return (EINVAL); + mtx_lock_spin(&sched_lock); + pldt = mdp->md_ldt; + if (pldt == NULL || uap->start >= pldt->ldt_len) { + mtx_unlock_spin(&sched_lock); + return (0); + } + largest_ld = uap->start + uap->num; + if (largest_ld > pldt->ldt_len) + largest_ld = pldt->ldt_len; + i = largest_ld - uap->start; + bzero(&((union descriptor *)(pldt->ldt_base))[uap->start], + sizeof(union descriptor) * i); + mtx_unlock_spin(&sched_lock); + return (0); + } + + if (!(uap->start == LDT_AUTO_ALLOC && uap->num == 1)) { + /* complain a for a while if using old methods */ + if (ldt_warnings++ < NUM_LDT_WARNINGS) { + printf("Warning: pid %d used static ldt allocation.\n", + td->td_proc->p_pid); + printf("See the i386_set_ldt man page for more info\n"); + } + /* verify range of descriptors to modify */ + largest_ld = uap->start + uap->num; + if (uap->start >= MAX_LD || + uap->num < 0 || largest_ld > MAX_LD) { + return (EINVAL); + } + } + + descs_size = uap->num * sizeof(union descriptor); + descs = (union descriptor *)kmem_alloc(kernel_map, descs_size); + if (descs == NULL) + return (ENOMEM); + error = copyin(uap->descs, descs, descs_size); + if (error) { + kmem_free(kernel_map, (vm_offset_t)descs, descs_size); + return (error); + } + + /* Check descriptors for access violations */ + for (i = 0; i < uap->num; i++) { + dp = &descs[i]; + + 
switch (dp->sd.sd_type) { + case SDT_SYSNULL: /* system null */ + dp->sd.sd_p = 0; + break; + case SDT_SYS286TSS: /* system 286 TSS available */ + case SDT_SYSLDT: /* system local descriptor table */ + case SDT_SYS286BSY: /* system 286 TSS busy */ + case SDT_SYSTASKGT: /* system task gate */ + case SDT_SYS286IGT: /* system 286 interrupt gate */ + case SDT_SYS286TGT: /* system 286 trap gate */ + case SDT_SYSNULL2: /* undefined by Intel */ + case SDT_SYS386TSS: /* system 386 TSS available */ + case SDT_SYSNULL3: /* undefined by Intel */ + case SDT_SYS386BSY: /* system 386 TSS busy */ + case SDT_SYSNULL4: /* undefined by Intel */ + case SDT_SYS386IGT: /* system 386 interrupt gate */ + case SDT_SYS386TGT: /* system 386 trap gate */ + case SDT_SYS286CGT: /* system 286 call gate */ + case SDT_SYS386CGT: /* system 386 call gate */ + /* I can't think of any reason to allow a user proc + * to create a segment of these types. They are + * for OS use only. + */ + kmem_free(kernel_map, (vm_offset_t)descs, descs_size); + return (EACCES); + /*NOTREACHED*/ + + /* memory segment types */ + case SDT_MEMEC: /* memory execute only conforming */ + case SDT_MEMEAC: /* memory execute only accessed conforming */ + case SDT_MEMERC: /* memory execute read conforming */ + case SDT_MEMERAC: /* memory execute read accessed conforming */ + /* Must be "present" if executable and conforming. 
*/ + if (dp->sd.sd_p == 0) { + kmem_free(kernel_map, (vm_offset_t)descs, + descs_size); + return (EACCES); + } + break; + case SDT_MEMRO: /* memory read only */ + case SDT_MEMROA: /* memory read only accessed */ + case SDT_MEMRW: /* memory read write */ + case SDT_MEMRWA: /* memory read write accessed */ + case SDT_MEMROD: /* memory read only expand dwn limit */ + case SDT_MEMRODA: /* memory read only expand dwn lim accessed */ + case SDT_MEMRWD: /* memory read write expand dwn limit */ + case SDT_MEMRWDA: /* memory read write expand dwn lim acessed */ + case SDT_MEME: /* memory execute only */ + case SDT_MEMEA: /* memory execute only accessed */ + case SDT_MEMER: /* memory execute read */ + case SDT_MEMERA: /* memory execute read accessed */ + break; + default: + kmem_free(kernel_map, (vm_offset_t)descs, descs_size); + return(EINVAL); + /*NOTREACHED*/ + } + + /* Only user (ring-3) descriptors may be present. */ + if ((dp->sd.sd_p != 0) && (dp->sd.sd_dpl != SEL_UPL)) { + kmem_free(kernel_map, (vm_offset_t)descs, descs_size); + return (EACCES); + } + } + + if (uap->start == LDT_AUTO_ALLOC && uap->num == 1) { + /* Allocate a free slot */ + pldt = mdp->md_ldt; + if (pldt == NULL) { + load_gs(0); + error = i386_ldt_grow(td, NLDT+1); + if (error) { + kmem_free(kernel_map, (vm_offset_t)descs, + descs_size); + return (error); + } + pldt = mdp->md_ldt; + } +again: + mtx_lock_spin(&sched_lock); + /* + * start scanning a bit up to leave room for NVidia and + * Wine, which still user the "Blat" method of allocation. 
+ */ + dp = &((union descriptor *)(pldt->ldt_base))[NLDT]; + for (i = NLDT; i < pldt->ldt_len; ++i) { + if (dp->sd.sd_type == SDT_SYSNULL) + break; + dp++; + } + if (i >= pldt->ldt_len) { + mtx_unlock_spin(&sched_lock); + error = i386_ldt_grow(td, pldt->ldt_len+1); + if (error) { + kmem_free(kernel_map, (vm_offset_t)descs, + descs_size); + return (error); + } + goto again; + } + uap->start = i; + error = i386_set_ldt_data(td, i, 1, descs); + mtx_unlock_spin(&sched_lock); + } else { + largest_ld = uap->start + uap->num; + error = i386_ldt_grow(td, largest_ld); + if (error == 0) { + mtx_lock_spin(&sched_lock); + error = i386_set_ldt_data(td, uap->start, uap->num, + descs); + mtx_unlock_spin(&sched_lock); + } + } + kmem_free(kernel_map, (vm_offset_t)descs, descs_size); + if (error == 0) + td->td_retval[0] = uap->start; + return (error); +} +typedef struct uint64_lohi { + unsigned long lo; + unsigned long hi; +} uint64_lohi; + +static int +i386_set_ldt_data(struct thread *td, int start, int num, + union descriptor *descs) +{ + struct mdproc *mdp = &td->td_proc->p_md; + struct proc_ldt *pldt = mdp->md_ldt; + int i, error; + + mtx_assert(&sched_lock, MA_OWNED); + + /* Fill in range */ + for (i = 0; i < num; i++) { + error = HYPERVISOR_update_descriptor(vtomach(&((union descriptor *)(pldt->ldt_base))[start + i]), ((uint64_lohi *)descs)[i].lo, ((uint64_lohi *)descs)[i].hi); + if (error) + panic("failed to update ldt: %d", error); + } + return (0); +} + +static int +i386_ldt_grow(struct thread *td, int len) +{ + struct mdproc *mdp = &td->td_proc->p_md; + struct proc_ldt *pldt; + caddr_t old_ldt_base; + int old_ldt_len; + + if (len > MAX_LD) + return (ENOMEM); + if (len < NLDT+1) + len = NLDT+1; + pldt = mdp->md_ldt; + /* allocate user ldt */ + if (!pldt || len > pldt->ldt_len) { + struct proc_ldt *new_ldt = user_ldt_alloc(mdp, len); + if (new_ldt == NULL) + return (ENOMEM); + pldt = mdp->md_ldt; + /* sched_lock was held by user_ldt_alloc */ + if (pldt) { + if 
(new_ldt->ldt_len > pldt->ldt_len) { + old_ldt_base = pldt->ldt_base; + old_ldt_len = pldt->ldt_len; + pldt->ldt_sd = new_ldt->ldt_sd; + pldt->ldt_base = new_ldt->ldt_base; + pldt->ldt_len = new_ldt->ldt_len; + mtx_unlock_spin(&sched_lock); + pmap_map_readwrite(kernel_pmap, + (vm_offset_t)old_ldt_base, + old_ldt_len * sizeof(union descriptor)); + kmem_free(kernel_map, (vm_offset_t)old_ldt_base, + old_ldt_len * sizeof(union descriptor)); + FREE(new_ldt, M_SUBPROC); + mtx_lock_spin(&sched_lock); + } else { + /* + * If other threads already did the work, + * do nothing + */ + mtx_unlock_spin(&sched_lock); + pmap_map_readwrite(kernel_pmap, + (vm_offset_t)new_ldt->ldt_base, + new_ldt->ldt_len * sizeof(union descriptor)); + kmem_free(kernel_map, + (vm_offset_t)new_ldt->ldt_base, + new_ldt->ldt_len * sizeof(union descriptor)); + FREE(new_ldt, M_SUBPROC); + return (0); + } + } else { + mdp->md_ldt = pldt = new_ldt; + } +#ifdef SMP + mtx_unlock_spin(&sched_lock); + /* signal other cpus to reload ldt */ + smp_rendezvous(NULL, (void (*)(void *))set_user_ldt_rv, + NULL, td); +#else + set_user_ldt(mdp); + mtx_unlock_spin(&sched_lock); +#endif + } + return (0); +} diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/trap.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/trap.c new file mode 100644 index 0000000000..a74986ed18 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/trap.c @@ -0,0 +1,1006 @@ +/*- + * Copyright (C) 1994, David Greenman + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the University of Utah, and William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/trap.c,v 1.260 2003/11/03 21:53:37 jhb Exp $"); + +/* + * 386 Trap and System call handling + */ + +#include "opt_clock.h" +#include "opt_cpu.h" +#include "opt_isa.h" +#include "opt_ktrace.h" +#include "opt_npx.h" +#include "opt_trap.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/ptrace.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/syscall.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/uio.h> +#include <sys/vmmeter.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_extern.h> + +#include <machine/cpu.h> +#include <machine/intr_machdep.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#ifdef SMP +#include <machine/smp.h> +#endif +#include <machine/tss.h> +#ifdef POWERFAIL_NMI +#include <sys/syslog.h> +#include <machine/clock.h> +#endif + + +#include <machine/xenfunc.h> +#include <machine/hypervisor.h> +#include <machine/xenvar.h> +#include <machine/hypervisor-ifs.h> + + +extern void trap(struct trapframe frame); +extern void syscall(struct trapframe frame); + +static int trap_pfault(struct trapframe *, int, vm_offset_t); +static void trap_fatal(struct trapframe *, vm_offset_t); +void dblfault_handler(void); + +extern inthand_t IDTVEC(lcall_syscall); + +#define MAX_TRAP_MSG 28 +static char *trap_msg[] = { + "", /* 0 unused */ + "privileged instruction fault", /* 1 T_PRIVINFLT */ + "", /* 2 unused */ + "breakpoint instruction fault", /* 3 T_BPTFLT */ + "", /* 4 unused */ + "", /* 5 unused */ + "arithmetic trap", /* 6 T_ARITHTRAP */ + 
"", /* 7 unused */ + "", /* 8 unused */ + "general protection fault", /* 9 T_PROTFLT */ + "trace trap", /* 10 T_TRCTRAP */ + "", /* 11 unused */ + "page fault", /* 12 T_PAGEFLT */ + "", /* 13 unused */ + "alignment fault", /* 14 T_ALIGNFLT */ + "", /* 15 unused */ + "", /* 16 unused */ + "hypervisor callback", /* 17 T_HYPCALLBACK */ + "integer divide fault", /* 18 T_DIVIDE */ + "non-maskable interrupt trap", /* 19 T_NMI */ + "overflow trap", /* 20 T_OFLOW */ + "FPU bounds check fault", /* 21 T_BOUND */ + "FPU device not available", /* 22 T_DNA */ + "double fault", /* 23 T_DOUBLEFLT */ + "FPU operand fetch fault", /* 24 T_FPOPFLT */ + "invalid TSS fault", /* 25 T_TSSFLT */ + "segment not present fault", /* 26 T_SEGNPFLT */ + "stack fault", /* 27 T_STKFLT */ + "machine check trap", /* 28 T_MCHK */ +}; + +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +extern int has_f00f_bug; +#endif + +#ifdef KDB +static int kdb_on_nmi = 1; +SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW, + &kdb_on_nmi, 0, "Go to KDB on NMI"); +#endif +static int panic_on_nmi = 1; +SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, + &panic_on_nmi, 0, "Panic on NMI"); + +#ifdef WITNESS +extern char *syscallnames[]; +#endif + +#ifdef DEVICE_POLLING +extern u_int32_t poll_in_trap; +extern int ether_poll(int count); +#endif /* DEVICE_POLLING */ + + +/* + * Exception, fault, and trap interface to the FreeBSD kernel. + * This common code is called from assembly language IDT gate entry + * routines that prepare a suitable stack frame, and restore this + * frame after the exception has been processed. 
+ */ + +void +trap(struct trapframe frame) +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + u_int sticks = 0; + int i = 0, ucode = 0, type, code; + vm_offset_t eva; +#ifdef STACK_DEBUGGING + int nesting, current_sp; + static int prev_csp = 0, prev_ssp = 0; + nesting = PCPU_GET(trap_nesting); +#endif + +#ifdef POWERFAIL_NMI + static int lastalert = 0; +#endif + + atomic_add_int(&cnt.v_trap, 1); + type = frame.tf_trapno; +#ifdef KDB + if (kdb_active) { + kdb_reenter(); + goto out; + } +#endif + + eva = 0; + code = frame.tf_err; + + if (type == T_HYPCALLBACK) { + evtchn_do_upcall((struct intrframe *)&frame); + if (ISPL(frame.tf_cs) == SEL_KPL) + goto out; + goto userout; + } else if (type == 0) + panic("invalid trap type/code %d/%d\n",type, code); + + + if (type == T_PAGEFLT) { + /* + * For some Cyrix CPUs, %cr2 is clobbered by + * interrupts. This problem is worked around by using + * an interrupt gate for the pagefault handler. We + * are finally ready to read %cr2 and then must + * reenable interrupts. + * + * If we get a page fault while in a critical section, then + * it is most likely a fatal kernel page fault. The kernel + * is already going to panic trying to get a sleep lock to + * do the VM lookup, so just consider it a fatal trap so the + * kernel can print out a useful trap message and even get + * to the debugger. 
+ */ + eva = frame.tf_cr2; + + if (td->td_critnest != 0) + trap_fatal(&frame, eva); + } + +#ifdef DEVICE_POLLING + if (poll_in_trap) + ether_poll(poll_in_trap); +#endif /* DEVICE_POLLING */ + + if ((ISPL(frame.tf_cs) == SEL_UPL) + || ((frame.tf_eflags & PSL_VM) && + !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) { + /* user trap */ + + sticks = td->td_sticks; + td->td_frame = &frame; + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + + switch (type) { + case T_PRIVINFLT: /* privileged instruction fault */ + ucode = type; + i = SIGILL; + break; + + case T_BPTFLT: /* bpt instruction fault */ + case T_TRCTRAP: /* trace trap */ + enable_intr(); + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + break; + + case T_ARITHTRAP: /* arithmetic trap */ +#ifdef DEV_NPX + ucode = npxtrap(); + if (ucode == -1) + goto userout; +#else + ucode = code; +#endif + i = SIGFPE; + break; + + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ + case T_SEGNPFLT: /* segment not present fault */ + case T_TSSFLT: /* invalid TSS fault */ + case T_DOUBLEFLT: /* double fault */ + default: + ucode = code + BUS_SEGM_FAULT ; + printf("unexpected trap type/code %d/%d\n",type, code); /* XXX temporary */ + + i = SIGBUS; + break; + + case T_PAGEFLT: /* page fault */ + if (td->td_pflags & TDP_SA) + thread_user_enter(td); + + i = trap_pfault(&frame, TRUE, eva); +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + if (i == -2) { + /* + * The f00f hack workaround has triggered, so + * treat the fault as an illegal instruction + * (T_PRIVINFLT) instead of a page fault. + */ + type = frame.tf_trapno = T_PRIVINFLT; + + /* Proceed as in that case. 
*/ + ucode = type; + i = SIGILL; + break; + } +#endif + if (i == -1) + goto userout; + if (i == 0) + goto user; + + ucode = T_PAGEFLT; + break; + + case T_DIVIDE: /* integer divide fault */ + ucode = FPE_INTDIV; + i = SIGFPE; + break; + +#ifdef DEV_ISA + case T_NMI: +#ifdef POWERFAIL_NMI +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + mtx_lock(&Giant); + if (time_second - lastalert > 10) { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + mtx_unlock(&Giant); + goto userout; +#else /* !POWERFAIL_NMI */ + /* machine/parity/power fail/"kitchen sink" faults */ + /* XXX Giant */ + if (isa_nmi(code) == 0) { +#ifdef KDB + /* + * NMI can be hooked up to a pushbutton + * for debugging. + */ + if (kdb_on_nmi) { + printf ("NMI ... going to debugger\n"); + kdb_trap (type, 0, &frame); + } +#endif /* KDB */ + goto userout; + } else if (panic_on_nmi) + panic("NMI indicates hardware failure"); + break; +#endif /* POWERFAIL_NMI */ +#endif /* DEV_ISA */ + + case T_OFLOW: /* integer overflow fault */ + ucode = FPE_INTOVF; + i = SIGFPE; + break; + + case T_BOUND: /* bounds check fault */ + ucode = FPE_FLTSUB; + i = SIGFPE; + break; + + case T_DNA: +#ifdef DEV_NPX + /* transparent fault (due to context switch "late") */ + if (npxdna()) + goto userout; +#endif + i = SIGFPE; + ucode = FPE_FPU_NP_TRAP; + break; + + case T_FPOPFLT: /* FPU operand fetch fault */ + ucode = T_FPOPFLT; + i = SIGILL; + break; + + case T_XMMFLT: /* SIMD floating-point exception */ + ucode = 0; /* XXX */ + i = SIGFPE; + break; + } + } else { + /* kernel trap */ + + KASSERT(cold || td->td_ucred != NULL, + ("kernel trap doesn't have ucred")); + switch (type) { + case T_PAGEFLT: /* page fault */ + (void) trap_pfault(&frame, FALSE, eva); + goto out; + + case T_DNA: +#ifdef DEV_NPX + /* + * The kernel is apparently using npx for copying. + * XXX this should be fatal unless the kernel has + * registered such use. 
+ */ + if (npxdna()) + goto out; +#endif + break; + + /* + * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. + */ + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ +#if 0 + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); + goto out; + } +#endif + if (type == T_STKFLT) + break; + + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL) + break; + + /* + * Invalid %fs's and %gs's can be created using + * procfs or PT_SETREGS or by invalidating the + * underlying LDT entry. This causes a fault + * in kernel mode when the kernel attempts to + * switch contexts. Lose the bad context + * (XXX) so that we can continue, and generate + * a signal. + */ + if (frame.tf_eip == (int)cpu_switch_load_gs) { + PCPU_GET(curpcb)->pcb_gs = 0; +#if 0 + PROC_LOCK(p); + psignal(p, SIGBUS); + PROC_UNLOCK(p); +#endif + goto out; + } + + if (td->td_intr_nesting_level != 0) + break; + + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
+ */ + if (frame.tf_eip == (int)doreti_iret) { + frame.tf_eip = (int)doreti_iret_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_ds) { + frame.tf_eip = (int)doreti_popl_ds_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_es) { + frame.tf_eip = (int)doreti_popl_es_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_fs) { + frame.tf_eip = (int)doreti_popl_fs_fault; + goto out; + } + if (PCPU_GET(curpcb)->pcb_onfault != NULL) { + frame.tf_eip = + (int)PCPU_GET(curpcb)->pcb_onfault; + goto out; + } + break; + + case T_TSSFLT: + /* + * PSL_NT can be set in user mode and isn't cleared + * automatically when the kernel is entered. This + * causes a TSS fault when the kernel attempts to + * `iret' because the TSS link is uninitialized. We + * want to get this fault so that we can fix the + * problem here and not every time the kernel is + * entered. + */ + if (frame.tf_eflags & PSL_NT) { + frame.tf_eflags &= ~PSL_NT; + goto out; + } + break; + + case T_TRCTRAP: /* trace trap */ + if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) { + /* + * We've just entered system mode via the + * syscall lcall. Continue single stepping + * silently until the syscall handler has + * saved the flags. + */ + goto out; + } + if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) { + /* + * The syscall handler has now saved the + * flags. Stop single stepping it. + */ + frame.tf_eflags &= ~PSL_T; + goto out; + } + /* + * Ignore debug register trace traps due to + * accesses in the user's address space, which + * can happen under several conditions such as + * if a user sets a watchpoint on a buffer and + * then passes that buffer to a system call. + * We still want to get TRCTRAPS for addresses + * in kernel space because that is useful when + * debugging the kernel. 
+ */ + /* XXX Giant */ + if (user_dbreg_trap() && + !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) { + /* + * Reset breakpoint bits because the + * processor doesn't + */ + load_dr6(rdr6() & 0xfffffff0); + goto out; + } + /* + * FALLTHROUGH (TRCTRAP kernel mode, kernel address) + */ + case T_BPTFLT: + /* + * If KDB is enabled, let it handle the debugger trap. + * Otherwise, debugger traps "can't happen". + */ +#ifdef KDB + /* XXX Giant */ + if (kdb_trap (type, 0, &frame)) + goto out; +#endif + break; + +#ifdef DEV_ISA + case T_NMI: +#ifdef POWERFAIL_NMI + mtx_lock(&Giant); + if (time_second - lastalert > 10) { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + mtx_unlock(&Giant); + goto out; +#else /* !POWERFAIL_NMI */ + /* XXX Giant */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) { +#ifdef KDB + /* + * NMI can be hooked up to a pushbutton + * for debugging. + */ + if (kdb_on_nmi) { + printf ("NMI ... going to debugger\n"); + kdb_trap (type, 0, &frame); + } +#endif /* KDB */ + goto out; + } else if (panic_on_nmi == 0) + goto out; + /* FALLTHROUGH */ +#endif /* POWERFAIL_NMI */ +#endif /* DEV_ISA */ + } + + trap_fatal(&frame, eva); + goto out; + } + + /* Translate fault for emulators (e.g. 
Linux) */ + if (*p->p_sysent->sv_transtrap) + i = (*p->p_sysent->sv_transtrap)(i, type); + + trapsignal(td, i, ucode); + +#if 1 /* DEBUG */ + if (type <= MAX_TRAP_MSG) { + uprintf("fatal process exception: %s", + trap_msg[type]); + if ((type == T_PAGEFLT) || (type == T_PROTFLT)) + uprintf(", fault VA = 0x%lx", (u_long)eva); + uprintf("\n"); + } +#endif + +user: + userret(td, &frame, sticks); + mtx_assert(&Giant, MA_NOTOWNED); +userout: +out: +#ifdef STACK_DEBUGGING + PCPU_SET(trap_nesting, nesting); +#endif + return; +} + +static int +trap_pfault(frame, usermode, eva) + struct trapframe *frame; + int usermode; + vm_offset_t eva; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + struct thread *td = curthread; + struct proc *p = td->td_proc; + + va = trunc_page(eva); + if (va >= KERNBASE) { + /* + * Don't allow user-mode faults in kernel address space. + * An exception: if the faulting address is the invalid + * instruction entry in the IDT, then the Intel Pentium + * F00F bug workaround was triggered, and we need to + * treat it is as an illegal instruction, and not a page + * fault. + */ +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) + return -2; +#endif + if (usermode) + goto nogo; + + map = kernel_map; + } else { + /* + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. + */ + if (p != NULL) + vm = p->p_vmspace; + + if (vm == NULL) + goto nogo; + + map = &vm->vm_map; + } + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + if (map != kernel_map) { + /* + * Keep swapout from messing with us during this + * critical time. + */ + PROC_LOCK(p); + ++p->p_lock; + PROC_UNLOCK(p); + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, + (ftype & VM_PROT_WRITE) ? 
VM_FAULT_DIRTY + : VM_FAULT_NORMAL); + + PROC_LOCK(p); + --p->p_lock; + PROC_UNLOCK(p); + } else { + /* + * Don't have to worry about process locking or stacks in the + * kernel. + */ + rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); + } + if (rv == KERN_SUCCESS) + return (0); +nogo: + if (!usermode) { + if (td->td_intr_nesting_level == 0 && + PCPU_GET(curpcb)->pcb_onfault != NULL) { + frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; + return (0); + } + trap_fatal(frame, eva); + return (-1); + } + + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; + + return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); +} + +static void +trap_fatal(struct trapframe *frame, vm_offset_t eva) +{ + int code, type, ss, esp; + struct soft_segment_descriptor softseg; + + code = frame->tf_err; + type = frame->tf_trapno; +#if 0 + XENPRINTF("trying to read gdt\n"); + sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); + XENPRINTF("read gdt\n"); +#endif + if (type <= MAX_TRAP_MSG) + printf("\n\nFatal trap %d: %s while in %s mode\n", + type, trap_msg[type], + frame->tf_eflags & PSL_VM ? "vm86" : + ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); +#ifdef SMP + /* two separate prints in case of a trap on an unmapped page */ + printf("cpuid = %d; ", PCPU_GET(cpuid)); + printf("apic id = %02x\n", PCPU_GET(apic_id)); +#endif + if (type == T_PAGEFLT) { + printf("fault virtual address = 0x%x\n", eva); + printf("fault code = %s %s, %s\n", + code & PGEX_U ? "user" : "supervisor", + code & PGEX_W ? "write" : "read", + code & PGEX_P ? 
"protection violation" : "page not present"); + } + printf("instruction pointer = 0x%x:0x%x\n", + frame->tf_cs & 0xffff, frame->tf_eip); + if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { + ss = frame->tf_ss & 0xffff; + esp = frame->tf_esp; + } else { + ss = GSEL(GDATA_SEL, SEL_KPL); + esp = (int)&frame->tf_esp; + } + printf("stack pointer = 0x%x:0x%x\n", ss, esp); + printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); + printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", + softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); + printf(" = DPL %d, pres %d, def32 %d, gran %d\n", + softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, + softseg.ssd_gran); + printf("processor eflags = "); + if (frame->tf_eflags & PSL_T) + printf("trace trap, "); + if (frame->tf_eflags & PSL_I) + printf("interrupt enabled, "); + if (frame->tf_eflags & PSL_NT) + printf("nested task, "); + if (frame->tf_eflags & PSL_RF) + printf("resume, "); + if (frame->tf_eflags & PSL_VM) + printf("vm86, "); + printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); + printf("current process = "); + if (curproc) { + printf("%lu (%s)\n", + (u_long)curproc->p_pid, curproc->p_comm ? + curproc->p_comm : ""); + } else { + printf("Idle\n"); + } + /* XXX */ + +#ifdef KDB + if (kdb_trap(type, 0, frame)) + return; +#endif + printf("trap number = %d\n", type); + if (type <= MAX_TRAP_MSG) + panic("%s", trap_msg[type]); + else + panic("unknown/reserved trap"); +} + +/* + * Double fault handler. Called when a fault occurs while writing + * a frame for a trap/exception onto the stack. This usually occurs + * when the stack overflows (such is the case with infinite recursion, + * for example). + * + * XXX Note that the current PTD gets replaced by IdlePTD when the + * task switch occurs. This means that the stack that was active at + * the time of the double fault is not available at <kstack> unless + * the machine was idle when the double fault occurred. 
The downside + * of this is that "trace <ebp>" in ddb won't work. + */ +void +dblfault_handler() +{ + printf("\nFatal double fault:\n"); + printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); + printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); + printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); +#ifdef SMP + /* two separate prints in case of a trap on an unmapped page */ + printf("cpuid = %d; ", PCPU_GET(cpuid)); + printf("apic id = %02x\n", PCPU_GET(apic_id)); +#endif + panic("double fault"); +} + +/* + * syscall - system call request C handler + * + * A system call is essentially treated as a trap. + */ +void +syscall(frame) + struct trapframe frame; +{ + caddr_t params; + struct sysent *callp; + struct thread *td = curthread; + struct proc *p = td->td_proc; + register_t orig_tf_eflags; + u_int sticks; + int error; + int narg; + int args[8]; + u_int code; + + /* + * note: PCPU_LAZY_INC() can only be used if we can afford + * occassional inaccuracy in the count. + */ + PCPU_LAZY_INC(cnt.v_syscall); + +#ifdef DIAGNOSTIC + if (ISPL(frame.tf_cs) != SEL_UPL) { + mtx_lock(&Giant); /* try to stabilize the system XXX */ + panic("syscall"); + /* NOT REACHED */ + mtx_unlock(&Giant); + } +#endif + + sticks = td->td_sticks; + td->td_frame = &frame; + if (td->td_ucred != p->p_ucred) + cred_update_thread(td); + if (p->p_flag & P_SA) + thread_user_enter(td); + params = (caddr_t)frame.tf_esp + sizeof(int); + code = frame.tf_eax; + orig_tf_eflags = frame.tf_eflags; + + if (p->p_sysent->sv_prepsyscall) { + /* + * The prep code is MP aware. + */ + (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); + } else { + /* + * Need to check if this is a 32 bit or 64 bit syscall. + * fuword is MP aware. + */ + if (code == SYS_syscall) { + /* + * Code is first argument, followed by actual args. 
+ */ + code = fuword(params); + params += sizeof(int); + } else if (code == SYS___syscall) { + /* + * Like syscall, but code is a quad, so as to maintain + * quad alignment for the rest of the arguments. + */ + code = fuword(params); + params += sizeof(quad_t); + } + } + + if (p->p_sysent->sv_mask) + code &= p->p_sysent->sv_mask; + + if (code >= p->p_sysent->sv_size) + callp = &p->p_sysent->sv_table[0]; + else + callp = &p->p_sysent->sv_table[code]; + + narg = callp->sy_narg & SYF_ARGMASK; + + /* + * copyin and the ktrsyscall()/ktrsysret() code is MP-aware + */ + if (params != NULL && narg != 0) + error = copyin(params, (caddr_t)args, + (u_int)(narg * sizeof(int))); + else + error = 0; + +#ifdef KTRACE + if (KTRPOINT(td, KTR_SYSCALL)) + ktrsyscall(code, narg, args); +#endif + CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, + td->td_proc->p_pid, td->td_proc->p_comm, code); + + /* + * Try to run the syscall without Giant if the syscall + * is MP safe. + */ + if ((callp->sy_narg & SYF_MPSAFE) == 0) + mtx_lock(&Giant); + + if (error == 0) { + td->td_retval[0] = 0; + td->td_retval[1] = frame.tf_edx; + + STOPEVENT(p, S_SCE, narg); + + PTRACESTOP_SC(p, td, S_PT_SCE); + + error = (*callp->sy_call)(td, args); + } + + switch (error) { + case 0: + frame.tf_eax = td->td_retval[0]; + frame.tf_edx = td->td_retval[1]; + frame.tf_eflags &= ~PSL_C; + break; + + case ERESTART: + /* + * Reconstruct pc, assuming lcall $X,y is 7 bytes, + * int 0x80 is 2 bytes. We saved this in tf_err. + */ + frame.tf_eip -= frame.tf_err; + break; + + case EJUSTRETURN: + break; + + default: + if (p->p_sysent->sv_errsize) { + if (error >= p->p_sysent->sv_errsize) + error = -1; /* XXX */ + else + error = p->p_sysent->sv_errtbl[error]; + } + frame.tf_eax = error; + frame.tf_eflags |= PSL_C; + break; + } + + /* + * Release Giant if we previously set it. + */ + if ((callp->sy_narg & SYF_MPSAFE) == 0) + mtx_unlock(&Giant); + + /* + * Traced syscall. 
+ */ + if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { + frame.tf_eflags &= ~PSL_T; + trapsignal(td, SIGTRAP, 0); + } + + /* + * Handle reschedule and other end-of-syscall issues + */ + userret(td, &frame, sticks); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_SYSRET)) + ktrsysret(code, error, td->td_retval[0]); +#endif + + /* + * This works because errno is findable through the + * register set. If we ever support an emulation where this + * is not the case, this code will need to be revisited. + */ + STOPEVENT(p, S_SCX, code); + + PTRACESTOP_SC(p, td, S_PT_SCX); + + WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", + (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); + mtx_assert(&sched_lock, MA_NOTOWNED); + mtx_assert(&Giant, MA_NOTOWNED); +} + diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/vm_machdep.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/vm_machdep.c new file mode 100644 index 0000000000..cff67833f7 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/vm_machdep.c @@ -0,0 +1,618 @@ +/*- + * Copyright (c) 1982, 1986 The Regents of the University of California. + * Copyright (c) 1989, 1990 William Jolitz + * Copyright (c) 1994 John Dyson + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department, and William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 + * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.219 2003/11/17 18:22:24 alc Exp $"); + +#include "opt_npx.h" +#ifdef PC98 +#include "opt_pc98.h" +#endif +#include "opt_reset.h" +#include "opt_cpu.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/kse.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sf_buf.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/unistd.h> +#include <sys/user.h> +#include <sys/vnode.h> +#include <sys/vmmeter.h> + +#include <machine/cpu.h> +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/pcb_ext.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_param.h> + +#ifdef PC98 +#include <pc98/pc98/pc98.h> +#else +#include <i386/isa/isa.h> +#endif + +#ifndef NSFBUFS +#define NSFBUFS (512 + maxusers * 16) +#endif + +#include <machine/xenfunc.h> + +#ifdef SMP +static void cpu_reset_proxy(void); +static u_int cpu_reset_proxyid; +static volatile u_int cpu_reset_proxy_active; +#endif +static void sf_buf_init(void *arg); +SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) + +LIST_HEAD(sf_head, sf_buf); + +/* + * A hash table of active sendfile(2) buffers + */ +static TAILQ_HEAD(, sf_buf) sf_buf_freelist; + + +static struct sf_head *sf_buf_active; +static u_long sf_buf_hashmask; + + +#define SF_BUF_HASH(m) (((m) - vm_page_array) & sf_buf_hashmask) + +static u_int sf_buf_alloc_want; + +/* + * A lock used to synchronize access to the hash table and free list + */ +static struct mtx sf_buf_lock; + +extern int _ucodesel, _udatasel; + +/* + * Finish a fork 
operation, with process p2 nearly set up. + * Copy and update the pcb, set up the stack so that the child + * ready to run and return to user mode. + */ +void +cpu_fork(struct thread *td1, + struct proc *p2, + struct thread *td2, + int flags) +{ + register struct proc *p1; + struct pcb *pcb2; + struct mdproc *mdp2; +#ifdef DEV_NPX + register_t savecrit; +#endif + + p1 = td1->td_proc; + if ((flags & RFPROC) == 0) { + if ((flags & RFMEM) == 0) { + /* unshare user LDT */ + struct mdproc *mdp1 = &p1->p_md; + struct proc_ldt *pldt = mdp1->md_ldt; + if (pldt && pldt->ldt_refcnt > 1) { + pldt = user_ldt_alloc(mdp1, pldt->ldt_len); + if (pldt == NULL) + panic("could not copy LDT"); + mdp1->md_ldt = pldt; + set_user_ldt(mdp1); + user_ldt_free(td1); + } + } + return; + } + + /* Ensure that p1's pcb is up to date. */ +#ifdef DEV_NPX + if (td1 == curthread) + td1->td_pcb->pcb_gs = rgs(); + savecrit = intr_disable(); + if (PCPU_GET(fpcurthread) == td1) + npxsave(&td1->td_pcb->pcb_save); + intr_restore(savecrit); +#endif + + /* Point the pcb to the top of the stack */ + pcb2 = (struct pcb *)(td2->td_kstack + td2->td_kstack_pages * PAGE_SIZE) - 1; + td2->td_pcb = pcb2; + + /* Copy p1's pcb */ + bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); + + /* Point mdproc and then copy over td1's contents */ + mdp2 = &p2->p_md; + bcopy(&p1->p_md, mdp2, sizeof(*mdp2)); + + /* + * Create a new fresh stack for the new process. + * Copy the trap frame for the return to user mode as if from a + * syscall. This copies most of the user mode register values. + */ + td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb) - 1; + bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); + + td2->td_frame->tf_eax = 0; /* Child returns zero */ + td2->td_frame->tf_eflags &= ~PSL_C; /* success */ + td2->td_frame->tf_edx = 1; + /* + * Set registers for trampoline to user mode. Leave space for the + * return address on stack. These are the kernel mode register values. 
+ */ + pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdir); + pcb2->pcb_edi = 0; + pcb2->pcb_esi = (int)fork_return; /* fork_trampoline argument */ + pcb2->pcb_ebp = 0; + pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *); + pcb2->pcb_ebx = (int)td2; /* fork_trampoline argument */ + pcb2->pcb_eip = (int)fork_trampoline; + pcb2->pcb_psl = PSL_KERNEL; /* ints disabled */ + pcb2->pcb_gs = rgs(); + /*- + * pcb2->pcb_dr*: cloned above. + * pcb2->pcb_savefpu: cloned above. + * pcb2->pcb_flags: cloned above. + * pcb2->pcb_onfault: cloned above (always NULL here?). + * pcb2->pcb_gs: cloned above. + * pcb2->pcb_ext: cleared below. + */ + + /* + * XXX don't copy the i/o pages. this should probably be fixed. + */ + pcb2->pcb_ext = 0; + + /* Copy the LDT, if necessary. */ + mtx_lock_spin(&sched_lock); + + if (mdp2->md_ldt != 0) { + if (flags & RFMEM) { + mdp2->md_ldt->ldt_refcnt++; + } else { + mdp2->md_ldt = user_ldt_alloc(mdp2, + mdp2->md_ldt->ldt_len); + if (mdp2->md_ldt == NULL) + panic("could not copy LDT"); + } + } + mtx_unlock_spin(&sched_lock); + + /* + * Now, cpu_switch() can schedule the new process. + * pcb_esp is loaded pointing to the cpu_switch() stack frame + * containing the return address when exiting cpu_switch. + * This will normally be to fork_trampoline(), which will have + * %ebx loaded with the new proc's pointer. fork_trampoline() + * will set up a stack to call fork_return(p, frame); to complete + * the return to user-mode. + */ +} + +/* + * Intercept the return address from a freshly forked process that has NOT + * been scheduled yet. + * + * This is needed to make kernel threads stay in kernel mode. 
 */
void
cpu_set_fork_handler(td, func, arg)
	struct thread *td;
	void (*func)(void *);
	void *arg;
{
	/*
	 * Note that the trap frame follows the args, so the function
	 * is really called like this:	func(arg, frame);
	 */
	td->td_pcb->pcb_esi = (int) func;	/* function */
	td->td_pcb->pcb_ebx = (int) arg;	/* first arg */
}

/*
 * Release machine-dependent per-process state (private LDT, hardware
 * debug registers) when a process exits.
 */
void
cpu_exit(struct thread *td)
{
	struct mdproc *mdp;
	struct pcb *pcb = td->td_pcb;

	/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
	mdp = &td->td_proc->p_md;
	if (mdp->md_ldt) {
		td->td_pcb->pcb_gs = _udatasel;
		load_gs(_udatasel);
		user_ldt_free(td);
	}
	if (pcb->pcb_flags & PCB_DBREGS) {
		/* disable all hardware breakpoints */
		reset_dbregs();
		pcb->pcb_flags &= ~PCB_DBREGS;
	}
}

/* Per-thread teardown: drop FPU ownership and hardware breakpoints. */
void
cpu_thread_exit(struct thread *td)
{
	struct pcb *pcb = td->td_pcb;
#ifdef DEV_NPX
	if (td == PCPU_GET(fpcurthread))
		npxdrop();
#endif
	if (pcb->pcb_flags & PCB_DBREGS) {
		/* disable all hardware breakpoints */
		reset_dbregs();
		pcb->pcb_flags &= ~PCB_DBREGS;
	}
}

/* Free the per-thread TSS/io-permission extension pages, if allocated. */
void
cpu_thread_clean(struct thread *td)
{
	struct pcb *pcb;

	pcb = td->td_pcb;
	if (pcb->pcb_ext != 0) {
		/* XXXKSE  XXXSMP  not SMP SAFE.. what locks do we have? */
		/* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */
		/*
		 * XXX do we need to move the TSS off the allocated pages
		 * before freeing them?  (not done here)
		 */
		kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ext,
		    ctob(IOPAGES + 1));
		pcb->pcb_ext = 0;
	}
}

/* Nothing machine-dependent to do when a thread's stack is swapped in. */
void
cpu_thread_swapin(struct thread *td)
{
}

/* Nothing machine-dependent to do when a thread's stack is swapped out. */
void
cpu_thread_swapout(struct thread *td)
{
}

/*
 * Lay out the pcb and trap frame at the top of a newly allocated kernel
 * stack.  The 16-byte gap below the pcb leaves room for the vm86 segment
 * registers pushed by the trap code.
 */
void
cpu_thread_setup(struct thread *td)
{

	td->td_pcb =
	    (struct pcb *)(td->td_kstack + td->td_kstack_pages * PAGE_SIZE) - 1;
	td->td_frame = (struct trapframe *)((caddr_t)td->td_pcb - 16) - 1;
	td->td_pcb->pcb_ext = NULL;
}

/*
 * Initialize machine state (pcb and trap frame) for a new thread about to
 * upcall.
 * Put enough state in the new thread's PCB to get it to go back to
 * userret(), where we can intercept it again to set the return (upcall)
 * address and stack, along with those from upcalls that are from other
 * sources such as those generated in thread_userret() itself.
 */
void
cpu_set_upcall(struct thread *td, struct thread *td0)
{
	struct pcb *pcb2;

	/* Point the pcb to the top of the stack. */
	pcb2 = td->td_pcb;

	/*
	 * Copy the upcall pcb.  This loads kernel regs.
	 * Those not loaded individually below get their default
	 * values here.
	 *
	 * XXXKSE It might be a good idea to simply skip this as
	 * the values of the other registers may be unimportant.
	 * This would remove any requirement for knowing the KSE
	 * at this time (see the matching comment below for
	 * more analysis) (need a good safe default).
	 */
	bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
	pcb2->pcb_flags &= ~(PCB_NPXTRAP|PCB_NPXINITDONE);

	/*
	 * Create a new fresh stack for the new thread.
	 * Don't forget to set this stack value into whatever supplies
	 * the address for the fault handlers.
	 * The contexts are filled in at the time we actually DO the
	 * upcall as only then do we know which KSE we got.
	 */
	bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));

	/*
	 * Set registers for trampoline to user mode.  Leave space for the
	 * return address on stack.  These are the kernel mode register
	 * values.
	 */
#ifdef PAE
	pcb2->pcb_cr3 = vtophys(vmspace_pmap(td->td_proc->p_vmspace)->pm_pdpt);
#else
	pcb2->pcb_cr3 = vtophys(vmspace_pmap(td->td_proc->p_vmspace)->pm_pdir);
#endif
	pcb2->pcb_edi = 0;
	pcb2->pcb_esi = (int)fork_return;		    /* trampoline arg */
	pcb2->pcb_ebp = 0;
	pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */
	pcb2->pcb_ebx = (int)td;			    /* trampoline arg */
	pcb2->pcb_eip = (int)fork_trampoline;
	pcb2->pcb_psl &= ~(PSL_I);	/* interrupts must be disabled */
	pcb2->pcb_gs = rgs();
	/*
	 * If we didn't copy the pcb, we'd need to do the following registers:
	 * pcb2->pcb_dr*:	cloned above.
	 * pcb2->pcb_savefpu:	cloned above.
	 * pcb2->pcb_flags:	cloned above.
	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
	 * pcb2->pcb_gs:	cloned above.  XXXKSE ???
	 * pcb2->pcb_ext:	cleared below.
	 */
	pcb2->pcb_ext = NULL;
}

/*
 * Set that machine state for performing an upcall that has to
 * be done in thread_userret() so that those upcalls generated
 * in thread_userret() itself can be done as well.
 */
void
cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku)
{

	/*
	 * Do any extra cleaning that needs to be done.
	 * The thread may have optional components
	 * that are not present in a fresh thread.
	 * This may be a recycled thread so make it look
	 * as though it's newly allocated.
	 */
	cpu_thread_clean(td);

	/*
	 * Set the trap frame to point at the beginning of the uts
	 * function.
	 */
	td->td_frame->tf_ebp = 0;
	td->td_frame->tf_esp =
	    (int)ku->ku_stack.ss_sp + ku->ku_stack.ss_size - 16;
	td->td_frame->tf_eip = (int)ku->ku_func;

	/*
	 * Pass the address of the mailbox for this kse to the uts
	 * function as a parameter on the stack.
	 */
	suword((void *)(td->td_frame->tf_esp + sizeof(void *)),
	    (int)ku->ku_mailbox);
}

/*
 * Convert kernel VA to physical address.  Panics on an unmapped page
 * rather than returning 0, since callers expect a valid frame.
 */
vm_paddr_t
kvtop(void *addr)
{
	vm_paddr_t pa;

	pa = pmap_kextract((vm_offset_t)addr);
	if (pa == 0)
		panic("kvtop: zero page frame");
	return (pa);
}

/*
 * Force reset the processor by invalidating the entire address space!
 */

#ifdef SMP
static void
cpu_reset_proxy()
{

	cpu_reset_proxy_active = 1;
	while (cpu_reset_proxy_active == 1)
		;	/* Wait for other cpu to see that we've started */
	stop_cpus((1<<cpu_reset_proxyid));
	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
	DELAY(1000000);
	cpu_reset_real();
}
#endif

/* Under Xen a "reset" is simply a shutdown hypercall to the hypervisor. */
void
cpu_reset()
{
	HYPERVISOR_shutdown();
}


/*
 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
 */
static void
sf_buf_init(void *arg)
{
	struct sf_buf *sf_bufs;
	vm_offset_t sf_base;
	int i;

	nsfbufs = NSFBUFS;
	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);

	sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
	TAILQ_INIT(&sf_buf_freelist);
	sf_base = kmem_alloc_nofault(kernel_map, nsfbufs * PAGE_SIZE);
	/* NOTE(review): malloc() result is not checked; M_NOWAIT can fail. */
	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
	    M_NOWAIT | M_ZERO);
	for (i = 0; i < nsfbufs; i++) {
		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
		TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry);
	}
	sf_buf_alloc_want = 0;
	mtx_init(&sf_buf_lock, "sf_buf", NULL, MTX_DEF);
}

/*
 * Get an sf_buf from the freelist.  Will block if none are available.
 */
struct sf_buf *
sf_buf_alloc(struct vm_page *m, int pri)
{
	struct sf_head *hash_list;
	struct sf_buf *sf;
	int error;

	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
	mtx_lock(&sf_buf_lock);
	/* Reuse an existing mapping of this page if one is active. */
	LIST_FOREACH(sf, hash_list, list_entry) {
		if (sf->m == m) {
			sf->ref_count++;
			if (sf->ref_count == 1) {
				TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
				nsfbufsused++;
				nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
			}
			goto done;
		}
	}
	while ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) {
		sf_buf_alloc_want++;
		mbstat.sf_allocwait++;
		error = msleep(&sf_buf_freelist, &sf_buf_lock, PVM | pri,
		    "sfbufa", 0);
		sf_buf_alloc_want--;

		/*
		 * If we got a signal, don't risk going back to sleep.
		 * (sf is NULL here, so the caller gets NULL back.)
		 */
		if (error)
			goto done;
	}
	TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
	if (sf->m != NULL)
		LIST_REMOVE(sf, list_entry);
	LIST_INSERT_HEAD(hash_list, sf, list_entry);
	sf->ref_count = 1;
	sf->m = m;
	nsfbufsused++;
	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
	pmap_qenter(sf->kva, &sf->m, 1);
done:
	mtx_unlock(&sf_buf_lock);
	return (sf);
}

/*
 * Detach mapped page and release resources back to the system.
 */
void
sf_buf_free(struct sf_buf *sf)
{
	mtx_lock(&sf_buf_lock);
	sf->ref_count--;
	if (sf->ref_count == 0) {
		TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry);
		nsfbufsused--;
		/*
		 * XEN only: tear down the mapping eagerly so the page's
		 * machine frame can be returned to the hypervisor.
		 */
		pmap_qremove(sf->kva, 1);
		sf->m = NULL;
		LIST_REMOVE(sf, list_entry);
		/* ----- */
		if (sf_buf_alloc_want > 0)
			wakeup_one(&sf_buf_freelist);
	}
	mtx_unlock(&sf_buf_lock);
}

/*
 * Software interrupt handler for queued VM system processing.
 */
void
swi_vm(void *dummy)
{
	if (busdma_swi_pending != 0)
		busdma_swi();
}

/*
 * Tell whether this address is in some physical memory region.
 * Currently used by the kernel coredump code in order to avoid
 * dumping the ``ISA memory hole'' which could cause indefinite hangs,
 * or other unpredictable behaviour.
 */

int
is_physical_memory(vm_paddr_t addr)
{

#ifdef DEV_ISA
	/* The ISA ``memory hole''. */
	if (addr >= 0xa0000 && addr < 0x100000)
		return 0;
#endif

	/*
	 * stuff other tests for known memory-mapped devices (PCI?)
	 * here
	 */

	return 1;
}
diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/xen_bus.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/xen_bus.c
new file mode 100644
index 0000000000..96f6ca086b
--- /dev/null
+++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/xen_bus.c
@@ -0,0 +1,238 @@

/*
 * xen_bus.c: newbus glue for the Xen virtual "system" bus.  Child
 * devices (virtual disk, network, console) attach below this bus and
 * have their resources tracked in a per-child resource_list.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <sys/lock.h>
#include <sys/mutex.h>

#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <machine/resource.h>

#include <machine/xen-os.h>
#include <machine/hypervisor.h>
#include <machine/xen_intr.h>

static MALLOC_DEFINE(M_XENDEV, "xenintrdrv", "xen system device");

/* Per-child ivars: just the resource list. */
struct xenbus_device {
	struct resource_list xen_resources;
};

#define DEVTOXEN(dev)	((struct xenbus_device *)device_get_ivars(dev))

static void xenbus_identify(driver_t *, device_t);
static int xenbus_probe(device_t);
static int xenbus_attach(device_t);
static int xenbus_print_child(device_t, device_t);
static device_t xenbus_add_child(device_t bus, int order, const char *name,
    int unit);
static struct resource *xenbus_alloc_resource(device_t, device_t, int, int *,
    u_long, u_long, u_long, u_int);
static int xenbus_release_resource(device_t, device_t, int, int,
    struct resource *);
static int xenbus_set_resource(device_t, device_t, int, int, u_long, u_long);
static int xenbus_get_resource(device_t, device_t, int, int, u_long *,
    u_long *);
static void xenbus_delete_resource(device_t, device_t, int, int);


static device_method_t xenbus_methods[] = {
	/* Device interface */
	DEVMETHOD(device_identify,	xenbus_identify),
	DEVMETHOD(device_probe,		xenbus_probe),
	DEVMETHOD(device_attach,	xenbus_attach),
	DEVMETHOD(device_detach,	bus_generic_detach),
	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
	DEVMETHOD(device_suspend,	bus_generic_suspend),
	DEVMETHOD(device_resume,	bus_generic_resume),

	/* Bus interface */
	DEVMETHOD(bus_print_child,	xenbus_print_child),
	DEVMETHOD(bus_add_child,	xenbus_add_child),
	DEVMETHOD(bus_read_ivar,	bus_generic_read_ivar),
	DEVMETHOD(bus_write_ivar,	bus_generic_write_ivar),
	DEVMETHOD(bus_set_resource,	xenbus_set_resource),
	DEVMETHOD(bus_get_resource,	xenbus_get_resource),
	DEVMETHOD(bus_alloc_resource,	xenbus_alloc_resource),
	DEVMETHOD(bus_release_resource,	xenbus_release_resource),
	DEVMETHOD(bus_delete_resource,	xenbus_delete_resource),
	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
	DEVMETHOD(bus_setup_intr,	bus_generic_setup_intr),
	DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),

	{ 0, 0 }
};


static driver_t xenbus_driver = {
	"xenbus",
	xenbus_methods,
	1,		/* no softc */
};
static devclass_t xenbus_devclass;
static device_t xenbus_dev;
static boolean_t xenbus_probe_delay = TRUE; /* delay child probes */

DRIVER_MODULE(xenbus, nexus, xenbus_driver, xenbus_devclass, 0, 0);

static void
xenbus_identify(driver_t *driver, device_t parent)
{

	/*
	 * Add child device with order of 0 so it gets probed
	 * first
	 */
	xenbus_dev = BUS_ADD_CHILD(parent, 0, "xenbus", 0);
	if (xenbus_dev == NULL)
		panic("xenbus: could not attach");
}

static int
xenbus_probe(device_t dev)
{
	device_set_desc(dev, "xen system");
	device_quiet(dev);
	return (0);
}

static int
xenbus_attach(device_t dev)
{
	/*
	 * First, let our child driver's identify any child devices that
	 * they can find.  Once that is done attach any devices that we
	 * found.  (Skipped until xenbus_init() clears xenbus_probe_delay
	 * late in boot.)
	 */
	if (!xenbus_probe_delay) {
		bus_generic_probe(dev);
		bus_generic_attach(dev);
	}

	return 0;
}


static int
xenbus_print_all_resources(device_t dev)
{
	struct xenbus_device *xdev = device_get_ivars(dev);
	struct resource_list *rl = &xdev->xen_resources;
	int retval = 0;

	if (SLIST_FIRST(rl))
		retval += printf(" at");

	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx");
	retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx");
	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld");

	return retval;
}


static int
xenbus_print_child(device_t bus, device_t child)
{
	int retval = 0;

	retval += bus_print_child_header(bus, child);
	retval += xenbus_print_all_resources(child);
	retval += printf(" on motherboard\n");	/* XXX "motherboard", ick */

	return (retval);
}

static device_t
xenbus_add_child(device_t bus, int order, const char *name, int unit)
{
	device_t child;
	struct xenbus_device *xendev;

	xendev = malloc(sizeof(struct xenbus_device), M_XENDEV,
	    M_NOWAIT | M_ZERO);
	if (!xendev)
		return(0);
	resource_list_init(&xendev->xen_resources);

	child = device_add_child_ordered(bus, order, name, unit);

	/* should we free this in xenbus_child_detached? */
	device_set_ivars(child, xendev);

	return(child);
}

static struct resource *
xenbus_alloc_resource(device_t bus, device_t child, int type, int *rid,
    u_long start, u_long end, u_long count, u_int flags)
{
	struct xenbus_device *xendev = DEVTOXEN(child);
	struct resource_list *rl = &xendev->xen_resources;

	return (resource_list_alloc(rl, bus, child, type, rid, start, end,
	    count, flags));
}


static int
xenbus_release_resource(device_t bus, device_t child, int type, int rid,
    struct resource *r)
{
	struct xenbus_device *xendev = DEVTOXEN(child);
	struct resource_list *rl = &xendev->xen_resources;

	return (resource_list_release(rl, bus, child, type, rid, r));
}

static int
xenbus_set_resource(device_t dev, device_t child, int type, int rid,
    u_long start, u_long count)
{
	struct xenbus_device *xendev = DEVTOXEN(child);
	struct resource_list *rl = &xendev->xen_resources;

	resource_list_add(rl, type, rid, start, start + count - 1, count);
	return(0);
}

static int
xenbus_get_resource(device_t dev, device_t child, int type, int rid,
    u_long *startp, u_long *countp)
{
	struct xenbus_device *xendev = DEVTOXEN(child);
	struct resource_list *rl = &xendev->xen_resources;
	struct resource_list_entry *rle;

	rle = resource_list_find(rl, type, rid);
	if (!rle)
		return(ENOENT);
	if (startp)
		*startp = rle->start;
	if (countp)
		*countp = rle->count;
	return(0);
}

static void
xenbus_delete_resource(device_t dev, device_t child, int type, int rid)
{
	struct xenbus_device *xendev = DEVTOXEN(child);
	struct resource_list *rl = &xendev->xen_resources;

	resource_list_delete(rl, type, rid);
}

/*
 * Run at SI_SUB_PSEUDO: child drivers are registered by now, so allow
 * probing and (re)attach the bus to pick them up.
 */
static void
xenbus_init(void *unused)
{
	xenbus_probe_delay = FALSE;
	xenbus_attach(xenbus_dev);
}
SYSINIT(xenbusdev, SI_SUB_PSEUDO, SI_ORDER_FIRST, xenbus_init, NULL);
diff --git a/freebsd-5.3-xen-sparse/i386-xen/i386-xen/xen_machdep.c b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/xen_machdep.c
new file mode 100644
index 0000000000..dd24a206b1 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/i386-xen/xen_machdep.c @@ -0,0 +1,687 @@ +/* $NetBSD:$ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/reboot.h> + + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/stdarg.h> +#include <machine/xenfunc.h> +#include <machine/xenpmap.h> +#include <machine/vmparam.h> +#include <machine/cpu.h> +#include <machine/xenvar.h> + +#include <sys/socket.h> +#include <sys/sockio.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/if_var.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <sys/mbuf.h> +#include <nfs/rpcv2.h> +#include <nfsclient/krpc.h> +#include <nfs/nfsproto.h> + + +shared_info_t *HYPERVISOR_shared_info; + +void ni_cli(void); +void ni_sti(void); +#ifdef NFS_ROOT + +static int +xdr_opaque_decode(struct mbuf **mptr, u_char *buf, int len) +{ + struct mbuf *m; + int alignedlen; + + m = *mptr; + alignedlen = ( len + 3 ) & ~3; + + if (m->m_len < alignedlen) { + m = m_pullup(m, alignedlen); + if (m == NULL) { + *mptr = NULL; + return EBADRPC; + } + } + bcopy(mtod(m, u_char *), buf, len); + m_adj(m, alignedlen); + *mptr = m; + return 0; +} + + +static int +getdec(char **ptr) +{ + char *p; + int ret; + + p = *ptr; + ret = 0; + if ((*p < '0') || (*p > '9')) + return -1; + while ((*p >= '0') && (*p <= '9')) { + ret = ret * 10 + (*p - '0'); + p++; + } + *ptr = p; + return ret; +} + +int +setinaddr(struct sockaddr_in *addr, char *ipstr) +{ + unsigned int ip; + int val; + + ip = 0; + if (((val = getdec(&ipstr)) < 0) || (val > 255)) + return 1; + ip = val << 24; + if (*ipstr != '.') + return 1; + ipstr++; + if (((val = getdec(&ipstr)) < 0) || (val > 255)) + return 1; + ip |= (val << 16); + if (*ipstr != '.') + return 1; + ipstr++; + if (((val = getdec(&ipstr)) < 0) || (val > 255)) + return 1; + ip |= (val << 8); + if (*ipstr != '.') + return 1; + ipstr++; + if (((val = getdec(&ipstr)) < 0) || (val > 255)) + return 1; + ip |= val; + + 
addr->sin_addr.s_addr = htonl(ip); + addr->sin_len = sizeof(struct sockaddr_in); + addr->sin_family = AF_INET; + + return 0; +} + +static int +hwaddr_to_sockaddr(char *ev, struct sockaddr_dl *sa) +{ + char *cp; + u_int32_t a[6]; + int count; + + bzero(sa, sizeof(*sa)); + sa->sdl_len = sizeof(*sa); + sa->sdl_family = AF_LINK; + sa->sdl_type = IFT_ETHER; + sa->sdl_alen = ETHER_ADDR_LEN; + if ((cp = getenv(ev)) == NULL) + return (1); + count = sscanf(cp, "%x:%x:%x:%x:%x:%x", + &a[0], &a[1], &a[2], &a[3], &a[4], &a[5]); + freeenv(cp); + if (count != 6) + return (1); + sa->sdl_data[0] = a[0]; + sa->sdl_data[1] = a[1]; + sa->sdl_data[2] = a[2]; + sa->sdl_data[3] = a[3]; + sa->sdl_data[4] = a[4]; + sa->sdl_data[5] = a[5]; + return (0); +} +extern int in_control(struct socket *so, u_long cmd, + caddr_t data, struct ifnet *ifp, + struct thread *td); + +static int +xen_setnetwork(void) +{ + int error = 0; + struct ifaddr *ifa; + struct ifnet *ifp; + struct sockaddr_dl *sdl, ourdl; + + if (sizeof(struct sockaddr) != sizeof(struct sockaddr_in)) + panic("sizes not equal\n"); + + if (hwaddr_to_sockaddr("boot.netif.hwaddr", &ourdl)) { + printf("nfs_diskless: no hardware address\n"); + return -1; + } + + + ifa = NULL; + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if ((ifa->ifa_addr->sa_family == AF_LINK) && + (sdl = ((struct sockaddr_dl *)ifa->ifa_addr))) { + if ((sdl->sdl_type == ourdl.sdl_type) && + (sdl->sdl_alen == ourdl.sdl_alen) && + !bcmp(sdl->sdl_data + sdl->sdl_nlen, + ourdl.sdl_data + ourdl.sdl_nlen, + sdl->sdl_alen)) { + IFNET_RUNLOCK(); + goto match_done; + } + } + } + } + IFNET_RUNLOCK(); + printf("nfs_diskless: no interface\n"); + return -1; /* no matching interface */ + match_done: + + if (getenv("boot.netif.ip") && getenv("boot.netif.gateway") && + getenv("boot.netif.netmask")) { + struct ifaliasreq ifra; + char *ip; + + bzero(&ifra, sizeof(ifra)); + strcpy(ifra.ifra_name, "xn0"); + ip = 
getenv("boot.netif.ip"); + setinaddr((struct sockaddr_in *)&(ifra.ifra_addr), ip); + printf("setting ip to %s\n", ip); + ip = getenv("boot.netif.netmask"); + setinaddr((struct sockaddr_in *)&ifra.ifra_mask, ip); + setinaddr((struct sockaddr_in *)&ifra.ifra_broadaddr, "255.255.255.255"); + + + if ((error = in_control(NULL, SIOCAIFADDR, (caddr_t) &ifra, ifp, curthread))) + printf("couldn't set interface address %d\n", error); +#if 0 + if ((error = xn_ioctl(ifp, SIOCSIFNETMASK, (caddr_t)&ifa))) + printf("couldn't set interface netmask %d\n", error); +#endif + } + return error; +} + +int +xen_setnfshandle(void) +{ + char *path, *ip; + u_char fhp[NFSX_V2FH]; + int error = 0; + struct sockaddr_in sin_local, *sin ; + struct mbuf *m; + + if ((error = xen_setnetwork())) + return error; + + sin = &sin_local; + + path = getenv("boot.nfsroot.path"); + ip = getenv("boot.nfsroot.server"); + + /* we aren't configured for NFS root */ + if (!path || !ip) + return 0; + + error = setinaddr(sin, ip); + if (error) { + printf("invalid ip address %s\n", ip); + return error; + } + + error = krpc_portmap(sin, RPCPROG_MNT, RPCMNT_VER1, + &sin->sin_port, curthread); + if (error) { + printf("failed to find port number for mountd\n"); + return error; + } + m = xdr_string_encode(path, strlen(path)); + + /* Do RPC to mountd */ + error = krpc_call(sin, RPCPROG_MNT, RPCMNT_VER1, + RPCMNT_MOUNT, &m, NULL, curthread); + if (error) { + printf("call to mountd failed\n"); + return error; + } + + if (xdr_opaque_decode(&m, fhp, NFSX_V2FH) != 0) { + printf("failed to decode nfs file handle\n"); + return error; + } + + setenv("boot.nfsroot.nfshandle", fhp); + + return 0; +} +#endif +void +ni_cli(void) +{ + __asm__("pushl %edx;" + "pushl %eax;" + ); + __cli(); + __asm__("popl %eax;" + "popl %edx;" + ); +} + + +void +ni_sti(void) +{ + __asm__("pushl %edx;" + "pushl %esi;" + "pushl %eax;" + ); + __sti(); + __asm__("popl %eax;" + "popl %esi;" + "popl %edx;" + ); +} + +/* + * Modify the cmd_line by converting 
',' to NULLs so that it is in a format + * suitable for the static env vars. + */ +char * +xen_setbootenv(char *cmd_line) +{ + char *cmd_line_next; + + for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); + return cmd_line; +} + +static struct +{ + const char *ev; + int mask; +} howto_names[] = { + {"boot_askname", RB_ASKNAME}, + {"boot_cdrom", RB_CDROM}, + {"boot_userconfig", RB_CONFIG}, + {"boot_ddb", RB_KDB}, + {"boot_gdb", RB_GDB}, + {"boot_gdb_pause", RB_GDB_PAUSE}, + {"boot_single", RB_SINGLE}, + {"boot_verbose", RB_VERBOSE}, + {"boot_multicons", RB_MULTIPLE}, + {"boot_serial", RB_SERIAL}, + {NULL, 0} +}; + +int +xen_boothowto(char *envp) +{ + int i, howto = 0; + + /* get equivalents from the environment */ + for (i = 0; howto_names[i].ev != NULL; i++) + if (getenv(howto_names[i].ev) != NULL) + howto |= howto_names[i].mask; + return howto; +} + +#define PRINTK_BUFSIZE 1024 +void +printk(const char *fmt, ...) +{ + __va_list ap; + int ret; + static char buf[PRINTK_BUFSIZE]; + + va_start(ap, fmt); + ret = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap); + va_end(ap); + buf[ret] = 0; + (void)HYPERVISOR_console_write(buf, ret); +} + +#define XPQUEUE_SIZE 2048 + +typedef struct xpq_queue { + uint32_t ptr; + uint32_t val; +} xpq_queue_t; + +#define MCLQUEUE_SIZE 512 +static multicall_entry_t mcl_queue[MCLQUEUE_SIZE]; +static int mcl_idx = 0; + +static xpq_queue_t xpq_queue[XPQUEUE_SIZE]; +static boolean_t xpq_initialized; +static struct mtx update_lock; +static int xpq_idx = 0; + +/* + * Don't attempt to lock until after lock & memory initialization + */ +#define XPQ_LOCK(lock, flags) \ + if (likely(xpq_initialized)) \ + mtx_lock_irqsave(lock, flags) +#define XPQ_UNLOCK(lock, flags) \ + if (likely(xpq_initialized)) \ + mtx_unlock_irqrestore(lock, flags) + +void +xpq_init(void) +{ + xpq_initialized = TRUE; + mtx_init(&update_lock, "mmu", "MMU LOCK", MTX_SPIN); +} + +static __inline void +_xpq_flush_queue(void) +{ + int _xpq_idx = xpq_idx; + int error, i; 
+ + xpq_idx = 0; + /* Make sure index is cleared first to avoid double updates. */ + error = HYPERVISOR_mmu_update((mmu_update_t *)xpq_queue, _xpq_idx, + NULL); + + if (__predict_false(error < 0)) { + for (i = 0; i < _xpq_idx; i++) + printk("val: %x ptr: %p\n", xpq_queue[i].val, xpq_queue[i].ptr); + panic("Failed to execute MMU updates: %d", error); + } + +} +static void +xpq_flush_queue(void) +{ + unsigned long flags = 0; + + XPQ_LOCK(&update_lock, flags); + if (xpq_idx != 0) _xpq_flush_queue(); + XPQ_UNLOCK(&update_lock, flags); +} + +static __inline void +_mcl_flush_queue(void) +{ + int _mcl_idx = mcl_idx; + mcl_idx = 0; + (void)HYPERVISOR_multicall(mcl_queue, _mcl_idx); +} + +void +mcl_flush_queue(void) +{ + unsigned long flags = 0; + + XPQ_LOCK(&update_lock, flags); + if (__predict_true(mcl_idx != 0)) _mcl_flush_queue(); + XPQ_UNLOCK(&update_lock, flags); + /* XXX: until we can remove the pervasive + * __HYPERVISOR_update_va_mapping calls, we have 2 queues. In order + * to ensure that they never get out of sync, only 1 flush interface + * is provided. 
	 */
	xpq_flush_queue();
}


/* Bump the MMU-update queue index, flushing when the queue fills. */
static __inline void
xpq_increment_idx(void)
{
	xpq_idx++;
	if (__predict_false(xpq_idx == XPQUEUE_SIZE))
		xpq_flush_queue();
}

/* Bump the multicall queue index, flushing when the queue fills. */
static __inline void
mcl_increment_idx(void)
{
	mcl_idx++;
	if (__predict_false(mcl_idx == MCLQUEUE_SIZE))
		mcl_flush_queue();
}

/* Queue a single-page TLB invalidation for va. */
void
xpq_queue_invlpg(vm_offset_t va)
{
	unsigned long flags = 0;

	XPQ_LOCK(&update_lock, flags);
	xpq_queue[xpq_idx].ptr = (va & ~PAGE_MASK) | MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_INVLPG;
	xpq_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}

/* Switch page directories (the Xen replacement for writing %cr3). */
void
load_cr3(uint32_t val)
{
	xpq_queue_pt_switch(val);
	xpq_flush_queue();
}

/*
 * Install a new LDT via the hypervisor.
 * NOTE(review): this calls _xpq_flush_queue() WITHOUT taking
 * update_lock, unlike xen_machphys_update() below -- confirm whether
 * callers guarantee exclusion here.
 */
void
xen_set_ldt(vm_offset_t base, uint32_t entries)
{
	xpq_queue_set_ldt(base, entries);
	_xpq_flush_queue();
}

/* Record an mfn->pfn translation in the machine-to-physical table. */
void
xen_machphys_update(unsigned long mfn, unsigned long pfn)
{
	unsigned long flags = 0;
	XPQ_LOCK(&update_lock, flags);
	xpq_queue[xpq_idx].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
	xpq_queue[xpq_idx].val = pfn;
	xpq_increment_idx();
	_xpq_flush_queue();
	XPQ_UNLOCK(&update_lock, flags);
}

/* Queue a PTE write (machine address of the PTE, new value). */
void
xpq_queue_pt_update(pt_entry_t *ptr, pt_entry_t val)
{
	unsigned long flags = 0;

	XPQ_LOCK(&update_lock, flags);
	xpq_queue[xpq_idx].ptr = (uint32_t)ptr;
	xpq_queue[xpq_idx].val = val;
	xpq_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}

/* Queue an update_va_mapping multicall for va -> machine address ma. */
void
mcl_queue_pt_update(vm_offset_t va, vm_paddr_t ma)
{
#if 0
	printf("setting va %x to ma %x\n", va, ma);
#endif
	unsigned long flags = 0;
	XPQ_LOCK(&update_lock, flags);
	mcl_queue[mcl_idx].op = __HYPERVISOR_update_va_mapping;
	mcl_queue[mcl_idx].args[0] = (unsigned long)(va >> PAGE_SHIFT);
	mcl_queue[mcl_idx].args[1] = (unsigned long)ma;
	mcl_queue[mcl_idx].args[2] = 0;
	mcl_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}



/* Queue a page-table base switch to the pdir at physical address val. */
void
xpq_queue_pt_switch(uint32_t val)
{
	unsigned long flags = 0;
	vm_paddr_t ma = xpmap_ptom(val) & PG_FRAME;

	XPQ_LOCK(&update_lock, flags);
	xpq_queue[xpq_idx].ptr = ma | MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_NEW_BASEPTR;
	xpq_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}


/* Queue a request to pin pa as an L1 or L2 page table. */
void
xpq_queue_pin_table(uint32_t pa, int type)
{
	unsigned long flags = 0;
	XPQ_LOCK(&update_lock, flags);
	xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
	switch (type) {
	case XPQ_PIN_L1_TABLE:
		xpq_queue[xpq_idx].val = MMUEXT_PIN_L1_TABLE;
		break;
	case XPQ_PIN_L2_TABLE:
		xpq_queue[xpq_idx].val = MMUEXT_PIN_L2_TABLE;
		break;
	}
	xpq_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}

/* Queue a request to unpin the page table at pa. */
void
xpq_queue_unpin_table(uint32_t pa)
{
	unsigned long flags = 0;

	XPQ_LOCK(&update_lock, flags);
	xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_UNPIN_TABLE;
	xpq_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}

/* Queue an LDT base/size change (va must be page aligned). */
void
xpq_queue_set_ldt(vm_offset_t va, uint32_t entries)
{
	unsigned long flags = 0;

	XPQ_LOCK(&update_lock, flags);
	KASSERT(va == (va & PG_FRAME), ("ldt not page aligned"));
	xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND | va;
	xpq_queue[xpq_idx].val = MMUEXT_SET_LDT |
	    (entries << MMUEXT_CMD_SHIFT);
	xpq_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}

/* Queue a full TLB flush. */
void
xpq_queue_tlb_flush()
{
	unsigned long flags = 0;

	XPQ_LOCK(&update_lock, flags);

	xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND;
	xpq_queue[xpq_idx].val = MMUEXT_TLB_FLUSH;
	xpq_increment_idx();
	XPQ_UNLOCK(&update_lock, flags);
}


/********** CODE WORTH KEEPING ABOVE HERE *****************/

void xen_failsafe_handler(void);

/* Entry point Xen invokes on a failsafe callback; treat as fatal. */
void
xen_failsafe_handler(void)
{

	panic("xen_failsafe_handler called!\n");
}


/*
 * Install a GDT/LDT descriptor through the hypervisor (descriptor
 * tables are read-only to the guest under Xen).
 */
void
xen_update_descriptor(union descriptor *table, union descriptor *entry)
{
	vm_paddr_t pa;
	pt_entry_t *ptp;
	uint32_t raw[2];

	bcopy(entry, raw, 2*sizeof(int32_t));
	ptp = vtopte((vm_offset_t)table);
	pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK);
	if (HYPERVISOR_update_descriptor(pa, raw[0],
	    raw[1]))
		panic("HYPERVISOR_update_descriptor failed\n");
}



#if defined(XENDEBUG)
/* Debug-only: dump the valid entries of one page-table page. */
static void
xpmap_dump_pt(pt_entry_t *ptp, int p)
{
	pt_entry_t pte;
	int j;
	int bufpos;

	pte = xpmap_ptom((uint32_t)ptp - KERNTEXTOFF);
	PRINTK(("%03x: %p(%p) %08x\n", p, ptp, (void *)pte, p << PDRSHIFT));

	bufpos = 0;
	for (j = 0; j < PTES_PER_PTP; j++) {
		if ((ptp[j] & PG_V) == 0)
			continue;
		pte = ptp[j] /* & PG_FRAME */;
		bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ",
		    p, j, pte);
		if (bufpos > 70) {
			int k;
			sprintf(XBUF + bufpos, "\n");
			PRINTK((XBUF));
			bufpos = 0;
			/* crude busy-wait so console output can drain */
			for (k = 0; k < 1000000; k++);
		}
	}
	if (bufpos) {
		PRINTK((XBUF));
		bufpos = 0;
	}
}
#endif


diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/cpufunc.h b/freebsd-5.3-xen-sparse/i386-xen/include/cpufunc.h
new file mode 100644
index 0000000000..fadc3a4a26
--- /dev/null
+++ b/freebsd-5.3-xen-sparse/i386-xen/include/cpufunc.h
@@ -0,0 +1,601 @@
/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/i386/include/cpufunc.h,v 1.135 2003/08/06 18:21:27 bde Exp $ + */ + +/* + * Functions to provide access to special i386 instructions. + * This in included in sys/systm.h, and that file should be + * used in preference to this. 
+ */ + +#ifndef _MACHINE_CPUFUNC_H_ +#define _MACHINE_CPUFUNC_H_ + +#include <sys/cdefs.h> +#include <machine/psl.h> +#define NO_EXCHANGE +#include <machine/xen-os.h> +#include <machine/evtchn.h> +#include <machine/xenvar.h> +struct thread; +struct region_descriptor; + +__BEGIN_DECLS +#define readb(va) (*(volatile u_int8_t *) (va)) +#define readw(va) (*(volatile u_int16_t *) (va)) +#define readl(va) (*(volatile u_int32_t *) (va)) + +#define writeb(va, d) (*(volatile u_int8_t *) (va) = (d)) +#define writew(va, d) (*(volatile u_int16_t *) (va) = (d)) +#define writel(va, d) (*(volatile u_int32_t *) (va) = (d)) + +static __inline u_int +read_eflags(void) +{ + u_int ef; + __asm __volatile("pushfl; popl %0" : "=r" (ef)); + return (ef); +} + +static __inline void +write_eflags(u_int ef) +{ + __asm __volatile("pushl %0; popfl" : : "r" (ef)); +} +#ifdef __GNUC__ + +static __inline void +breakpoint(void) +{ + __asm __volatile("int $3"); +} + +static __inline u_int +bsfl(u_int mask) +{ + u_int result; + + __asm __volatile("bsfl %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} + +static __inline u_int +bsrl(u_int mask) +{ + u_int result; + + __asm __volatile("bsrl %1,%0" : "=r" (result) : "rm" (mask)); + return (result); +} +static __inline void +disable_intr(void) +{ + __cli(); +} +static __inline void +do_cpuid(u_int ax, u_int *p) +{ + __asm __volatile("cpuid" + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); +} + +static __inline void +enable_intr(void) +{ + __sti(); +} + + +#define HAVE_INLINE_FFS + +static __inline int +ffs(int mask) +{ + /* + * Note that gcc-2's builtin ffs would be used if we didn't declare + * this inline or turn off the builtin. The builtin is faster but + * broken in gcc-2.4.5 and slower but working in gcc-2.5 and later + * versions. + */ + return (mask == 0 ? mask : (int)bsfl((u_int)mask) + 1); +} + +#define HAVE_INLINE_FLS + +static __inline int +fls(int mask) +{ + return (mask == 0 ? 
mask : (int)bsrl((u_int)mask) + 1); +} + +static __inline void +halt(void) +{ + __asm __volatile("hlt"); +} + +#if __GNUC__ < 2 + +#define inb(port) inbv(port) +#define outb(port, data) outbv(port, data) + +#else /* __GNUC >= 2 */ + +/* + * The following complications are to get around gcc not having a + * constraint letter for the range 0..255. We still put "d" in the + * constraint because "i" isn't a valid constraint when the port + * isn't constant. This only matters for -O0 because otherwise + * the non-working version gets optimized away. + * + * Use an expression-statement instead of a conditional expression + * because gcc-2.6.0 would promote the operands of the conditional + * and produce poor code for "if ((inb(var) & const1) == const2)". + * + * The unnecessary test `(port) < 0x10000' is to generate a warning if + * the `port' has type u_short or smaller. Such types are pessimal. + * This actually only works for signed types. The range check is + * careful to avoid generating warnings. + */ +#define inb(port) __extension__ ({ \ + u_char _data; \ + if (__builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ + && (port) < 0x10000) \ + _data = inbc(port); \ + else \ + _data = inbv(port); \ + _data; }) + +#define outb(port, data) ( \ + __builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ + && (port) < 0x10000 \ + ? outbc(port, data) : outbv(port, data)) + +static __inline u_char +inbc(u_int port) +{ + u_char data; + + __asm __volatile("inb %1,%0" : "=a" (data) : "id" ((u_short)(port))); + return (data); +} + +static __inline void +outbc(u_int port, u_char data) +{ + __asm __volatile("outb %0,%1" : : "a" (data), "id" ((u_short)(port))); +} + +#endif /* __GNUC <= 2 */ + +static __inline u_char +inbv(u_int port) +{ + u_char data; + /* + * We use %%dx and not %1 here because i/o is done at %dx and not at + * %edx, while gcc generates inferior code (movw instead of movl) + * if we tell it to load (u_short) port. 
+ */ + __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); + return (data); +} + +static __inline u_int +inl(u_int port) +{ + u_int data; + + __asm __volatile("inl %%dx,%0" : "=a" (data) : "d" (port)); + return (data); +} + +static __inline void +insb(u_int port, void *addr, size_t cnt) +{ + __asm __volatile("cld; rep; insb" + : "+D" (addr), "+c" (cnt) + : "d" (port) + : "memory"); +} + +static __inline void +insw(u_int port, void *addr, size_t cnt) +{ + __asm __volatile("cld; rep; insw" + : "+D" (addr), "+c" (cnt) + : "d" (port) + : "memory"); +} + +static __inline void +insl(u_int port, void *addr, size_t cnt) +{ + __asm __volatile("cld; rep; insl" + : "+D" (addr), "+c" (cnt) + : "d" (port) + : "memory"); +} + +static __inline void +invd(void) +{ + __asm __volatile("invd"); +} + +static __inline u_short +inw(u_int port) +{ + u_short data; + + __asm __volatile("inw %%dx,%0" : "=a" (data) : "d" (port)); + return (data); +} + +static __inline void +outbv(u_int port, u_char data) +{ + u_char al; + /* + * Use an unnecessary assignment to help gcc's register allocator. + * This make a large difference for gcc-1.40 and a tiny difference + * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for + * best results. gcc-2.6.0 can't handle this. + */ + al = data; + __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); +} + +static __inline void +outl(u_int port, u_int data) +{ + /* + * outl() and outw() aren't used much so we haven't looked at + * possible micro-optimizations such as the unnecessary + * assignment for them. 
+ */ + __asm __volatile("outl %0,%%dx" : : "a" (data), "d" (port)); +} + +static __inline void +outsb(u_int port, const void *addr, size_t cnt) +{ + __asm __volatile("cld; rep; outsb" + : "+S" (addr), "+c" (cnt) + : "d" (port)); +} + +static __inline void +outsw(u_int port, const void *addr, size_t cnt) +{ + __asm __volatile("cld; rep; outsw" + : "+S" (addr), "+c" (cnt) + : "d" (port)); +} + +static __inline void +outsl(u_int port, const void *addr, size_t cnt) +{ + __asm __volatile("cld; rep; outsl" + : "+S" (addr), "+c" (cnt) + : "d" (port)); +} + +static __inline void +outw(u_int port, u_short data) +{ + __asm __volatile("outw %0,%%dx" : : "a" (data), "d" (port)); +} + +static __inline void +ia32_pause(void) +{ + __asm __volatile("pause"); +} + +static __inline u_int64_t +rdmsr(u_int msr) +{ + u_int64_t rv; + + __asm __volatile("rdmsr" : "=A" (rv) : "c" (msr)); + return (rv); +} + +static __inline u_int64_t +rdpmc(u_int pmc) +{ + u_int64_t rv; + + __asm __volatile("rdpmc" : "=A" (rv) : "c" (pmc)); + return (rv); +} + +static __inline u_int64_t +rdtsc(void) +{ + u_int64_t rv; + + __asm __volatile("rdtsc" : "=A" (rv)); + return (rv); +} + +static __inline void +wbinvd(void) +{ + __asm __volatile("wbinvd"); +} + +static __inline void +wrmsr(u_int msr, u_int64_t newval) +{ + __asm __volatile("wrmsr" : : "A" (newval), "c" (msr)); +} + +static __inline u_int +rfs(void) +{ + u_int sel; + __asm __volatile("movl %%fs,%0" : "=rm" (sel)); + return (sel); +} + +static __inline u_int +rgs(void) +{ + u_int sel; + __asm __volatile("movl %%gs,%0" : "=rm" (sel)); + return (sel); +} + +static __inline void +load_fs(u_int sel) +{ + __asm __volatile("movl %0,%%fs" : : "rm" (sel)); +} + +static __inline void +load_gs(u_int sel) +{ + __asm __volatile("movl %0,%%gs" : : "rm" (sel)); +} + +/* void lidt(struct region_descriptor *addr); */ +static __inline void +lidt(struct region_descriptor *addr) +{ + __asm __volatile("lidt (%0)" : : "r" (addr)); +} + +static __inline u_int +rdr0(void) 
+{ + u_int data; + __asm __volatile("movl %%dr0,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr0(u_int dr0) +{ + __asm __volatile("movl %0,%%dr0" : : "r" (dr0)); +} + +static __inline u_int +rdr1(void) +{ + u_int data; + __asm __volatile("movl %%dr1,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr1(u_int dr1) +{ + __asm __volatile("movl %0,%%dr1" : : "r" (dr1)); +} + +static __inline u_int +rdr2(void) +{ + u_int data; + __asm __volatile("movl %%dr2,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr2(u_int dr2) +{ + __asm __volatile("movl %0,%%dr2" : : "r" (dr2)); +} + +static __inline u_int +rdr3(void) +{ + u_int data; + __asm __volatile("movl %%dr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr3(u_int dr3) +{ + __asm __volatile("movl %0,%%dr3" : : "r" (dr3)); +} + +static __inline u_int +rdr4(void) +{ + u_int data; + __asm __volatile("movl %%dr4,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr4(u_int dr4) +{ + __asm __volatile("movl %0,%%dr4" : : "r" (dr4)); +} + +static __inline u_int +rdr5(void) +{ + u_int data; + __asm __volatile("movl %%dr5,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr5(u_int dr5) +{ + __asm __volatile("movl %0,%%dr5" : : "r" (dr5)); +} + +static __inline u_int +rdr6(void) +{ + u_int data; + __asm __volatile("movl %%dr6,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr6(u_int dr6) +{ + __asm __volatile("movl %0,%%dr6" : : "r" (dr6)); +} + +static __inline u_int +rdr7(void) +{ + u_int data; + __asm __volatile("movl %%dr7,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr7(u_int dr7) +{ + __asm __volatile("movl %0,%%dr7" : : "r" (dr7)); +} + +static __inline register_t +intr_disable(void) +{ + register_t eflags; + + __save_and_cli(eflags); + return (eflags); +} + +static __inline void +intr_restore(register_t eflags) +{ + __restore_flags(eflags); +} + 
+#else /* !__GNUC__ */ + +int breakpoint(void); +u_int bsfl(u_int mask); +u_int bsrl(u_int mask); +void cpu_invlpg(u_int addr); +void cpu_invlpg_range(u_int start, u_int end); +void disable_intr(void); +void do_cpuid(u_int ax, u_int *p); +void enable_intr(void); +void halt(void); +u_char inb(u_int port); +u_int inl(u_int port); +void insb(u_int port, void *addr, size_t cnt); +void insl(u_int port, void *addr, size_t cnt); +void insw(u_int port, void *addr, size_t cnt); +void invd(void); +void invlpg(u_int addr); +void invlpg_range(u_int start, u_int end); +void invltlb(void); +u_short inw(u_int port); +void load_cr3(u_int cr3); +void load_cr4(u_int cr4); +void load_fs(u_int sel); +void load_gs(u_int sel); +struct region_descriptor; +void lidt(struct region_descriptor *addr); +void ltr(u_short sel); +void outb(u_int port, u_char data); +void outl(u_int port, u_int data); +void outsb(u_int port, void *addr, size_t cnt); +void outsl(u_int port, void *addr, size_t cnt); +void outsw(u_int port, void *addr, size_t cnt); +void outw(u_int port, u_short data); +void ia32_pause(void); +u_int rcr2(void); +u_int rcr3(void); +u_int rcr4(void); +u_int rfs(void); +u_int rgs(void); +u_int64_t rdmsr(u_int msr); +u_int64_t rdpmc(u_int pmc); +u_int64_t rdtsc(void); +u_int read_eflags(void); +void wbinvd(void); +void write_eflags(u_int ef); +void wrmsr(u_int msr, u_int64_t newval); +u_int rdr0(void); +void load_dr0(u_int dr0); +u_int rdr1(void); +void load_dr1(u_int dr1); +u_int rdr2(void); +void load_dr2(u_int dr2); +u_int rdr3(void); +void load_dr3(u_int dr3); +u_int rdr4(void); +void load_dr4(u_int dr4); +u_int rdr5(void); +void load_dr5(u_int dr5); +u_int rdr6(void); +void load_dr6(u_int dr6); +u_int rdr7(void); +void load_dr7(u_int dr7); +register_t intr_disable(void); +void intr_restore(register_t ef); + +#endif /* __GNUC__ */ + +void reset_dbregs(void); + +__END_DECLS + +#endif /* !_MACHINE_CPUFUNC_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/ctrl_if.h 
b/freebsd-5.3-xen-sparse/i386-xen/include/ctrl_if.h new file mode 100644 index 0000000000..1ccd49d448 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/ctrl_if.h @@ -0,0 +1,120 @@ +/****************************************************************************** + * ctrl_if.h + * + * Management functions for special interface to the domain controller. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __I386_XENO__CTRL_IF_H__ +#define __I386_XENO__CTRL_IF_H__ + +#include <sys/taskqueue.h> +#include <machine/hypervisor.h> + + +typedef control_msg_t ctrl_msg_t; + +/* + * Callback function type. Called for asynchronous processing of received + * request messages, and responses to previously-transmitted request messages. + * The parameters are (@msg, @id). + * @msg: Original request/response message (not a copy). The message can be + * modified in-place by the handler (e.g., a response callback can + * turn a request message into a response message in place). The message + * is no longer accessible after the callback handler returns -- if the + * message is required to persist for longer then it must be copied. + * @id: (Response callbacks only) The 'id' that was specified when the + * original request message was queued for transmission. + */ +typedef void (*ctrl_msg_handler_t)(ctrl_msg_t *, unsigned long); + +/* + * Send @msg to the domain controller. Execute @hnd when a response is + * received, passing the response message and the specified @id. This + * operation will not block: it will return -EAGAIN if there is no space. + * Notes: + * 1. The @msg is copied if it is transmitted and so can be freed after this + * function returns. + * 2. If @hnd is NULL then no callback is executed. + */ +int ctrl_if_send_message_noblock( + ctrl_msg_t *msg, + ctrl_msg_handler_t hnd, + unsigned long id); + +/* + * Send @msg to the domain controller. Execute @hnd when a response is + * received, passing the response message and the specified @id. 
This + * operation will block until the message is sent, or a signal is received + * for the calling process (unless @wait_state is TASK_UNINTERRUPTIBLE). + * Notes: + * 1. The @msg is copied if it is transmitted and so can be freed after this + * function returns. + * 2. If @hnd is NULL then no callback is executed. + */ +int ctrl_if_send_message_block( + ctrl_msg_t *msg, + ctrl_msg_handler_t hnd, + unsigned long id, + long wait_state); + +/* + * Request a callback when there is /possibly/ space to immediately send a + * message to the domain controller. This function returns 0 if there is + * already space to trasnmit a message --- in this case the callback task /may/ + * still be executed. If this function returns 1 then the callback /will/ be + * executed when space becomes available. + */ +int ctrl_if_enqueue_space_callback(struct task *task); + +/* + * Send a response (@msg) to a message from the domain controller. This will + * never block. + * Notes: + * 1. The @msg is copied and so can be freed after this function returns. + * 2. The @msg may be the original request message, modified in-place. + */ +void ctrl_if_send_response(ctrl_msg_t *msg); + +/* + * Register a receiver for typed messages from the domain controller. The + * handler (@hnd) is called for every received message of specified @type. + * Returns TRUE (non-zero) if the handler was successfully registered. + * If CALLBACK_IN_BLOCKING CONTEXT is specified in @flags then callbacks will + * occur in a context in which it is safe to yield (i.e., process context). + */ +#define CALLBACK_IN_BLOCKING_CONTEXT 1 +int ctrl_if_register_receiver( + uint8_t type, + ctrl_msg_handler_t hnd, + unsigned int flags); + +/* + * Unregister a receiver for typed messages from the domain controller. The + * handler (@hnd) will not be executed after this function returns. + */ +void ctrl_if_unregister_receiver(uint8_t type, ctrl_msg_handler_t hnd); + +/* Suspend/resume notifications. 
*/ +void ctrl_if_suspend(void); +void ctrl_if_resume(void); + + +/* + * Returns TRUE if there are no outstanding message requests at the domain + * controller. This can be used to ensure that messages have really flushed + * through when it is not possible to use the response-callback interface. + * WARNING: If other subsystems are using the control interface then this + * function might never return TRUE! + */ +int ctrl_if_transmitter_empty(void); /* !! DANGEROUS FUNCTION !! */ + +/* + * Manually discard response messages from the domain controller. + * WARNING: This is usually done automatically -- this function should only + * be called when normal interrupt mechanisms are disabled! + */ +void ctrl_if_discard_responses(void); /* !! DANGEROUS FUNCTION !! */ + +#endif /* __ASM_XEN__CONTROL_IF_H__ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/evtchn.h b/freebsd-5.3-xen-sparse/i386-xen/include/evtchn.h new file mode 100644 index 0000000000..3e962e3014 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/evtchn.h @@ -0,0 +1,92 @@ +/****************************************************************************** + * evtchn.h + * + * Communication via Xen event channels. + * Also definitions for the device that demuxes notifications to userspace. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __ASM_EVTCHN_H__ +#define __ASM_EVTCHN_H__ + +#include <machine/hypervisor.h> +#include <machine/synch_bitops.h> +#include <machine/hypervisor-ifs.h> + +/* + * LOW-LEVEL DEFINITIONS + */ + +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +/* Entry point for notifications into Linux subsystems. */ +void evtchn_do_upcall(struct intrframe *frame); + +/* Entry point for notifications into the userland character device. 
*/ +void evtchn_device_upcall(int port); + +static inline void +mask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_set_bit(port, &s->evtchn_mask[0]); +} + +static inline void +unmask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + synch_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of 'hw_resend_irq'. Just like + * a real IO-APIC we 'lose the interrupt edge' if the channel is masked. + */ + if ( synch_test_bit (port, &s->evtchn_pending[0]) && + !synch_test_and_set_bit(port>>5, &s->evtchn_pending_sel) ) + { + s->vcpu_data[0].evtchn_upcall_pending = 1; + if ( !s->vcpu_data[0].evtchn_upcall_mask ) + force_evtchn_callback(); + } +} + +static inline void +clear_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void +notify_via_evtchn(int port) +{ + evtchn_op_t op; + op.cmd = EVTCHNOP_send; + op.u.send.local_port = port; + (void)HYPERVISOR_event_channel_op(&op); +} + +/* + * CHARACTER-DEVICE DEFINITIONS + */ + +#define PORT_NORMAL 0x0000 +#define PORT_EXCEPTION 0x8000 +#define PORTIDX_MASK 0x7fff + +/* /dev/xen/evtchn resides at device number major=10, minor=200 */ +#define EVTCHN_MINOR 200 + +/* /dev/xen/evtchn ioctls: */ +/* EVTCHN_RESET: Clear and reinit the event buffer. Clear error condition. */ +#define EVTCHN_RESET _IO('E', 1) +/* EVTCHN_BIND: Bind to the specified event-channel port. */ +#define EVTCHN_BIND _IO('E', 2) +/* EVTCHN_UNBIND: Unbind from the specified event-channel port. */ +#define EVTCHN_UNBIND _IO('E', 3) + +#endif /* __ASM_EVTCHN_H__ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/frame.h b/freebsd-5.3-xen-sparse/i386-xen/include/frame.h new file mode 100644 index 0000000000..a6572d85a9 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/frame.h @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)frame.h 5.2 (Berkeley) 1/18/91 + * $FreeBSD: src/sys/i386/include/frame.h,v 1.23 2003/07/22 08:11:15 peter Exp $ + */ + +#ifndef _MACHINE_FRAME_H_ +#define _MACHINE_FRAME_H_ 1 + +/* + * System stack frames. + */ + +/* + * Exception/Trap Stack Frame + */ + +struct trapframe { + int tf_fs; + int tf_es; + int tf_ds; + int tf_edi; + int tf_esi; + int tf_ebp; + int tf_isp; + int tf_ebx; + int tf_edx; + int tf_ecx; + int tf_eax; + int tf_trapno; + int tf_cr2; + /* below portion defined in 386 hardware */ + int tf_err; + int tf_eip; + int tf_cs; + int tf_eflags; + /* below only when crossing rings (e.g. user to kernel) */ + int tf_esp; + int tf_ss; +}; + +/* Interrupt stack frame */ + +struct intrframe { + int if_fs; + int if_es; + int if_ds; + int if_edi; + int if_esi; + int if_ebp; + int :32; + int if_ebx; + int if_edx; + int if_ecx; + int if_eax; + int :32; /* for compat with trap frame - trapno */ + int if_vec; /* cr2 in trap frame */ + int :32; /* for compat with trap frame - err */ + /* below portion defined in 386 hardware */ + int if_eip; + int if_cs; + int if_eflags; + /* below only when crossing rings (e.g. user to kernel) */ + int if_esp; + int if_ss; +}; + +/* frame of clock (same as interrupt frame) */ + +struct clockframe { + int cf_fs; + int cf_es; + int cf_ds; + int cf_edi; + int cf_esi; + int cf_ebp; + int :32; + int cf_ebx; + int cf_edx; + int cf_ecx; + int cf_eax; + int :32; /* for compat with trap frame - trapno */ + int cf_vec; /* cr2 in trap frame */ + int :32; /* for compat with trap frame - err */ + /* below portion defined in 386 hardware */ + int cf_eip; + int cf_cs; + int cf_eflags; + /* below only when crossing rings (e.g. 
user to kernel) */ + int cf_esp; + int cf_ss; +}; + +#define INTR_TO_TRAPFRAME(frame) ((struct trapframe *)&(frame)->if_fs) + +#endif /* _MACHINE_FRAME_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/hypervisor-ifs.h b/freebsd-5.3-xen-sparse/i386-xen/include/hypervisor-ifs.h new file mode 100644 index 0000000000..4f75d27a9a --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/hypervisor-ifs.h @@ -0,0 +1,36 @@ +#ifndef _HYPERVISOR_IFS_H_ +#define _HYPERVISOR_IFS_H_ + +#define s8 int8_t +#define s16 int16_t +#define s32 int32_t +#define s64 int64_t + +#define u8 uint8_t +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +#include <machine/xen-public/xen.h> +#include <machine/xen-public/io/domain_controller.h> +#include <machine/xen-public/io/netif.h> +#include <machine/xen-public/io/blkif.h> +#include <machine/xen-public/dom0_ops.h> +#include <machine/xen-public/event_channel.h> +#include <machine/xen-public/sched_ctl.h> +#include <machine/xen-public/physdev.h> +#undef blkif_sector_t /* XXX pre-processor didn't do the */ +#define blkif_sector_t uint64_t /* right thing */ + +#undef s8 +#undef s16 +#undef s32 +#undef s64 + +#undef u8 +#undef u16 +#undef u32 +#undef u64 + + +#endif diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/hypervisor.h b/freebsd-5.3-xen-sparse/i386-xen/include/hypervisor.h new file mode 100644 index 0000000000..95ee85f352 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/hypervisor.h @@ -0,0 +1,355 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002, K A Fraser + */ + +#ifndef __HYPERVISOR_H__ +#define __HYPERVISOR_H__ + + +#include <machine/hypervisor-ifs.h> +#include <machine/frame.h> +#include "opt_xen.h" + +extern start_info_t *xen_start_info; + +/* arch/xen/mm/hypervisor.c */ +/* + * NB. ptr values should be PHYSICAL, not MACHINE. 
'vals' should be already + * be MACHINE addresses. + */ + + +void MULTICALL_flush_page_update_queue(void); + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS +/* Allocate a contiguous empty region of low memory. Return virtual start. */ +unsigned long allocate_empty_lowmem_region(unsigned long pages); +/* Deallocate a contiguous region of low memory. Return it to the allocator. */ +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages); +#endif + +typedef struct { unsigned long pte_low, pte_high; } pte_t; + +/* + * Assembler stubs for hyper-calls. + */ + +static inline int HYPERVISOR_set_trap_table(trap_info_t *table) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_trap_table), + "b" (table) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_mmu_update(mmu_update_t *req, + int count, + int *success_count) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_mmu_update), + "b" (req), "c" (count), "d" (success_count) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_gdt), + "b" (frame_list), "c" (entries) : "memory" ); + + + return ret; +} + +static inline int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_stack_switch), + "b" (ss), "c" (esp) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_callbacks), + "b" (event_selector), "c" (event_address), + "d" (failsafe_selector), "S" (failsafe_address) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_fpu_taskswitch(void) 
+{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_fpu_taskswitch) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_yield(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_yield) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_block(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_block) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_shutdown(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_reboot(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift)) + : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_suspend(unsigned long srec) +{ + int ret; + /* NB. On suspend, control software expects a suspend record in %esi. 
*/ + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), + "S" (srec) : "memory" ); + + return ret; +} + +static inline long HYPERVISOR_set_timer_op(uint64_t timeout) +{ + int ret; + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_timer_op), + "b" (timeout_hi), "c" (timeout_lo) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op) +{ + int ret; + dom0_op->interface_version = DOM0_INTERFACE_VERSION; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_dom0_op), + "b" (dom0_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_debugreg), + "b" (reg), "c" (value) : "memory" ); + + return ret; +} + +static inline unsigned long HYPERVISOR_get_debugreg(int reg) +{ + unsigned long ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_get_debugreg), + "b" (reg) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_descriptor( + unsigned long pa, unsigned long word1, unsigned long word2) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_descriptor), + "b" (pa), "c" (word1), "d" (word2) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_set_fast_trap(int idx) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_fast_trap), + "b" (idx) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_dom_mem_op(unsigned int op, + unsigned long *pages, + unsigned long nr_pages) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_dom_mem_op), + "b" (op), "c" (pages), "d" 
(nr_pages) : "memory" ); + return ret; +} + +static inline int HYPERVISOR_multicall(void *call_list, int nr_calls) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_multicall), + "b" (call_list), "c" (nr_calls) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_va_mapping( + unsigned long page_nr, pte_t new_val, unsigned long flags) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping), + "b" (page_nr), "c" ((new_val).pte_low), "d" (flags): + "memory" ); + /* XXX */ +#if 0 + if ( unlikely(ret < 0) ) + panic("Failed update VA mapping: %08lx, %08lx, %08lx", + page_nr, (new_val).pte_low, flags); +#endif + return ret; +} + +static inline int HYPERVISOR_event_channel_op(void *op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_event_channel_op), + "b" (op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_xen_version(int cmd) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_xen_version), + "b" (cmd) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_console_io(int cmd, int count, char *str) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_console_io), + "b" (cmd), "c" (count), "d" (str) : "memory" ); + + return ret; +} + +static __inline int HYPERVISOR_console_write(char *str, int count) +{ + return HYPERVISOR_console_io(CONSOLEIO_write, count, str); +} + +static inline int HYPERVISOR_physdev_op(void *physdev_op) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_physdev_op), + "b" (physdev_op) : "memory" ); + + return ret; +} + +static inline int HYPERVISOR_update_va_mapping_otherdomain( + unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping_otherdomain), + "b" 
(page_nr), "c" ((new_val).pte_low), "d" (flags), "S" (domid) : + "memory" ); + + return ret; +} + +static inline int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_vm_assist), + "b" (cmd), "c" (type) : "memory" ); + + return ret; +} + +#endif /* __HYPERVISOR_H__ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/md_var.h b/freebsd-5.3-xen-sparse/i386-xen/include/md_var.h new file mode 100644 index 0000000000..5822a1e3d1 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/md_var.h @@ -0,0 +1,108 @@ +/*- + * Copyright (c) 1995 Bruce D. Evans. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/i386/include/md_var.h,v 1.66 2003/11/03 22:37:28 jhb Exp $ + */ + +#ifndef _MACHINE_MD_VAR_H_ +#define _MACHINE_MD_VAR_H_ + +/* + * Miscellaneous machine-dependent declarations. + */ + +extern void (*bcopy_vector)(const void *from, void *to, size_t len); +extern void (*bzero_vector)(void *buf, size_t len); +extern int (*copyin_vector)(const void *udaddr, void *kaddr, size_t len); +extern int (*copyout_vector)(const void *kaddr, void *udaddr, size_t len); + +extern long Maxmem; +extern u_int atdevbase; /* offset in virtual memory of ISA io mem */ +extern u_int basemem; /* PA of original top of base memory */ +extern int busdma_swi_pending; +extern u_int cpu_exthigh; +extern u_int cpu_feature; +extern u_int cpu_fxsr; +extern u_int cpu_high; +extern u_int cpu_id; +extern u_int cpu_procinfo; +extern char cpu_vendor[]; +extern u_int cyrix_did; +extern uint16_t *elan_mmcr; +extern char kstack[]; +#ifdef PC98 +extern int need_pre_dma_flush; +extern int need_post_dma_flush; +#endif +extern char sigcode[]; +extern int szsigcode; +#ifdef COMPAT_FREEBSD4 +extern int szfreebsd4_sigcode; +#endif +#ifdef COMPAT_43 +extern int szosigcode; +#endif + +typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); +struct thread; +struct reg; +struct fpreg; +struct dbreg; + +void bcopyb(const void *from, void *to, size_t len); +void busdma_swi(void); +void cpu_setregs(void); +void cpu_switch_load_gs(void) 
__asm(__STRING(cpu_switch_load_gs)); +void doreti_iret(void) __asm(__STRING(doreti_iret)); +void doreti_iret_fault(void) __asm(__STRING(doreti_iret_fault)); +void doreti_popl_ds(void) __asm(__STRING(doreti_popl_ds)); +void doreti_popl_ds_fault(void) __asm(__STRING(doreti_popl_ds_fault)); +void doreti_popl_es(void) __asm(__STRING(doreti_popl_es)); +void doreti_popl_es_fault(void) __asm(__STRING(doreti_popl_es_fault)); +void doreti_popl_fs(void) __asm(__STRING(doreti_popl_fs)); +void doreti_popl_fs_fault(void) __asm(__STRING(doreti_popl_fs_fault)); +void scrit(void) __asm(__STRING(scrit)); +void ecrit(void) __asm(__STRING(ecrit)); +void critical_region_fixup(void) __asm(__STRING(critical_region_fixup)); +void enable_sse(void); +void fillw(int /*u_short*/ pat, void *base, size_t cnt); +void i486_bzero(void *buf, size_t len); +void i586_bcopy(const void *from, void *to, size_t len); +void i586_bzero(void *buf, size_t len); +int i586_copyin(const void *udaddr, void *kaddr, size_t len); +int i586_copyout(const void *kaddr, void *udaddr, size_t len); +void i686_pagezero(void *addr); +void sse2_pagezero(void *addr); +void init_AMD_Elan_sc520(void); +int is_physical_memory(vm_offset_t addr); +int isa_nmi(int cd); +vm_paddr_t kvtop(void *addr); +void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int selec); +int user_dbreg_trap(void); + +#endif /* !_MACHINE_MD_VAR_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/multicall.h b/freebsd-5.3-xen-sparse/i386-xen/include/multicall.h new file mode 100644 index 0000000000..30de865ee2 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/multicall.h @@ -0,0 +1,98 @@ +/****************************************************************************** + * multicall.h + */ + +#ifndef __MULTICALL_H__ +#define __MULTICALL_H__ + +#include <machine/hypervisor.h> +#define MAX_MULTICALL_ENTS 8 +extern multicall_entry_t multicall_list[]; +extern int nr_multicall_ents; + +static inline void 
execute_multicall_list(void) +{ + if ( unlikely(nr_multicall_ents == 0) ) return; + (void)HYPERVISOR_multicall(multicall_list, nr_multicall_ents); + nr_multicall_ents = 0; +} + + +static inline void handle_edge(void) +{ + if (unlikely(nr_multicall_ents == MAX_MULTICALL_ENTS)) + execute_multicall_list(); +} + +static inline void queue_multicall0(unsigned long op) +{ + int i = nr_multicall_ents; + multicall_list[i].op = op; + nr_multicall_ents = i+1; + handle_edge(); +} + +static inline void queue_multicall1(unsigned long op, unsigned long arg1) +{ + int i = nr_multicall_ents; + multicall_list[i].op = op; + multicall_list[i].args[0] = arg1; + nr_multicall_ents = i+1; + handle_edge(); +} + +static inline void queue_multicall2( + unsigned long op, unsigned long arg1, unsigned long arg2) +{ + int i = nr_multicall_ents; + multicall_list[i].op = op; + multicall_list[i].args[0] = arg1; + multicall_list[i].args[1] = arg2; + nr_multicall_ents = i+1; + handle_edge(); +} + +static inline void queue_multicall3( + unsigned long op, unsigned long arg1, unsigned long arg2, + unsigned long arg3) +{ + int i = nr_multicall_ents; + multicall_list[i].op = op; + multicall_list[i].args[0] = arg1; + multicall_list[i].args[1] = arg2; + multicall_list[i].args[2] = arg3; + nr_multicall_ents = i+1; + handle_edge(); +} + +static inline void queue_multicall4( + unsigned long op, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4) +{ + int i = nr_multicall_ents; + multicall_list[i].op = op; + multicall_list[i].args[0] = arg1; + multicall_list[i].args[1] = arg2; + multicall_list[i].args[2] = arg3; + multicall_list[i].args[3] = arg4; + nr_multicall_ents = i+1; + handle_edge(); +} + +static inline void queue_multicall5( + unsigned long op, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5) +{ + int i = nr_multicall_ents; + multicall_list[i].op = op; + multicall_list[i].args[0] = arg1; + multicall_list[i].args[1] = 
arg2; + multicall_list[i].args[2] = arg3; + multicall_list[i].args[3] = arg4; + multicall_list[i].args[4] = arg5; + nr_multicall_ents = i+1; + handle_edge(); +} + + +#endif /* __MULTICALL_H__ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/param.h b/freebsd-5.3-xen-sparse/i386-xen/include/param.h new file mode 100644 index 0000000000..a45fdd67c3 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/param.h @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)param.h 5.8 (Berkeley) 6/28/91 + * $FreeBSD: src/sys/i386/include/param.h,v 1.69 2003/06/14 23:23:53 alc Exp $ + */ + +/* + * Machine dependent constants for Intel 386. + */ + +/* + * Round p (pointer or byte index) up to a correctly-aligned value + * for all data types (int, long, ...). The result is unsigned int + * and must be cast to any desired pointer type. + */ +#ifndef _ALIGNBYTES +#define _ALIGNBYTES (sizeof(int) - 1) +#endif +#ifndef _ALIGN +#define _ALIGN(p) (((unsigned)(p) + _ALIGNBYTES) & ~_ALIGNBYTES) +#endif + +#ifndef _MACHINE +#define _MACHINE i386-xen +#endif +#ifndef _MACHINE_ARCH +#define _MACHINE_ARCH i386-xen +#endif + +#ifndef _NO_NAMESPACE_POLLUTION + +#ifndef _MACHINE_PARAM_H_ +#define _MACHINE_PARAM_H_ + +#ifndef MACHINE +#define MACHINE "i386" +#endif +#ifndef MACHINE_ARCH +#define MACHINE_ARCH "i386" +#endif +#define MID_MACHINE MID_I386 + +#ifdef SMP +#define MAXCPU 16 +#else +#define MAXCPU 1 +#endif /* SMP */ + +#define ALIGNBYTES _ALIGNBYTES +#define ALIGN(p) _ALIGN(p) + +#define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */ +#define PAGE_SIZE (1<<PAGE_SHIFT) /* bytes/page */ +#define PAGE_MASK (PAGE_SIZE-1) +#define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t))) + +#ifdef PAE +#define NPGPTD 4 +#define PDRSHIFT 21 /* LOG2(NBPDR) */ +#else +#define NPGPTD 1 +#define PDRSHIFT 22 /* LOG2(NBPDR) */ +#endif + +#define NBPTD (NPGPTD<<PAGE_SHIFT) +#define NPDEPTD (NBPTD/(sizeof (pd_entry_t))) +#define NPDEPG 
(PAGE_SIZE/(sizeof (pd_entry_t))) +#define NBPDR (1<<PDRSHIFT) /* bytes/page dir */ +#define PDRMASK (NBPDR-1) + +#define IOPAGES 2 /* pages of i/o permission bitmap */ + +#ifndef KSTACK_PAGES +#define KSTACK_PAGES 2 /* Includes pcb! */ +#endif +#define KSTACK_GUARD_PAGES 1 /* pages of kstack guard; 0 disables */ +#define UAREA_PAGES 1 /* holds struct user WITHOUT PCB (see def.) */ + +/* + * Ceiling on amount of swblock kva space, can be changed via + * the kern.maxswzone /boot/loader.conf variable. + */ +#ifndef VM_SWZONE_SIZE_MAX +#define VM_SWZONE_SIZE_MAX (32 * 1024 * 1024) +#endif + +/* + * Ceiling on size of buffer cache (really only effects write queueing, + * the VM page cache is not effected), can be changed via + * the kern.maxbcache /boot/loader.conf variable. + */ +#ifndef VM_BCACHE_SIZE_MAX +#define VM_BCACHE_SIZE_MAX (200 * 1024 * 1024) +#endif + +/* + * Mach derived conversion macros + */ +#define trunc_page(x) ((x) & ~PAGE_MASK) +#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK) +#define trunc_4mpage(x) ((x) & ~PDRMASK) +#define round_4mpage(x) ((((x)) + PDRMASK) & ~PDRMASK) + +#define atop(x) ((x) >> PAGE_SHIFT) +#define ptoa(x) ((x) << PAGE_SHIFT) + +#define i386_btop(x) ((x) >> PAGE_SHIFT) +#define i386_ptob(x) ((x) << PAGE_SHIFT) + +#define pgtok(x) ((x) * (PAGE_SIZE / 1024)) + +#endif /* !_MACHINE_PARAM_H_ */ +#endif /* !_NO_NAMESPACE_POLLUTION */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/pcb.h b/freebsd-5.3-xen-sparse/i386-xen/include/pcb.h new file mode 100644 index 0000000000..ff68761540 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/pcb.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)pcb.h 5.10 (Berkeley) 5/12/91 + * $FreeBSD: src/sys/i386/include/pcb.h,v 1.50 2003/09/30 08:11:36 jeff Exp $ + */ + +#ifndef _I386_PCB_H_ +#define _I386_PCB_H_ + +/* + * Intel 386 process control block + */ +#include <machine/npx.h> + +struct pcb { + int pcb_cr3; + int pcb_edi; + int pcb_esi; + int pcb_ebp; + int pcb_esp; + int pcb_eax; + int pcb_ebx; + int pcb_ecx; + int pcb_edx; + int pcb_eip; + + int pcb_dr0; + int pcb_dr1; + int pcb_dr2; + int pcb_dr3; + int pcb_dr6; + int pcb_dr7; + + union savefpu pcb_save; + u_int pcb_flags; +#define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ +#define PCB_DBREGS 0x02 /* process using debug registers */ +#define PCB_NPXTRAP 0x04 /* npx trap pending */ +#define PCB_NPXINITDONE 0x08 /* fpu state is initialized */ +#define PCB_VM86CALL 0x10 /* in vm86 call */ + + caddr_t pcb_onfault; /* copyin/out fault recovery */ + int pcb_cs; + int pcb_ds; + int pcb_ss; + int pcb_es; + int pcb_gs; + int pcb_fs; + struct pcb_ext *pcb_ext; /* optional pcb extension */ + int pcb_psl; /* process status long */ + void (*pcb_switchout)(void); /* Special switchout function. */ + u_long __pcb_spare[2]; /* adjust to avoid core dump size changes */ +}; + +#ifdef _KERNEL +struct trapframe; + +void makectx(struct trapframe *, struct pcb *); + +void savectx(struct pcb *); +#endif + +#endif /* _I386_PCB_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/pcpu.h b/freebsd-5.3-xen-sparse/i386-xen/include/pcpu.h new file mode 100644 index 0000000000..80a675cd4a --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/pcpu.h @@ -0,0 +1,173 @@ +/*- + * Copyright (c) Peter Wemm <peter@netplex.com.au> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/i386/include/pcpu.h,v 1.41 2003/11/20 23:23:22 peter Exp $ + */ + +#ifndef _MACHINE_PCPU_H_ +#define _MACHINE_PCPU_H_ + +#ifdef _KERNEL + +#include <machine/segments.h> +#include <machine/tss.h> + +/* + * The SMP parts are setup in pmap.c and locore.s for the BSP, and + * mp_machdep.c sets up the data for the AP's to "see" when they awake. 
+ * The reason for doing it via a struct is so that an array of pointers + * to each CPU's data can be set up for things like "check curproc on all + * other processors" + */ +#define PCPU_MD_FIELDS \ + struct pcpu *pc_prvspace; /* Self-reference */ \ + struct pmap *pc_curpmap; \ + struct i386tss pc_common_tss; \ + struct segment_descriptor pc_common_tssd; \ + struct segment_descriptor *pc_tss_gdt; \ + int pc_currentldt; \ + u_int pc_acpi_id; \ + u_int pc_apic_id; \ + u_int pc_faultaddr; \ + u_int pc_trap_nesting; \ + u_int pc_pdir + +#if defined(lint) + +extern struct pcpu *pcpup; + +#define PCPU_GET(member) (pcpup->pc_ ## member) +#define PCPU_PTR(member) (&pcpup->pc_ ## member) +#define PCPU_SET(member,value) (pcpup->pc_ ## member = (value)) + +#elif defined(__GNUC__) + +/* + * Evaluates to the byte offset of the per-cpu variable name. + */ +#define __pcpu_offset(name) \ + __offsetof(struct pcpu, name) + +/* + * Evaluates to the type of the per-cpu variable name. + */ +#define __pcpu_type(name) \ + __typeof(((struct pcpu *)0)->name) + +/* + * Evaluates to the address of the per-cpu variable name. + */ +#define __PCPU_PTR(name) __extension__ ({ \ + __pcpu_type(name) *__p; \ + \ + __asm __volatile("movl %%fs:%1,%0; addl %2,%0" \ + : "=r" (__p) \ + : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \ + "i" (__pcpu_offset(name))); \ + \ + __p; \ +}) + +/* + * Evaluates to the value of the per-cpu variable name. 
+ */ +#define __PCPU_GET(name) __extension__ ({ \ + __pcpu_type(name) __result; \ + \ + if (sizeof(__result) == 1) { \ + u_char __b; \ + __asm __volatile("movb %%fs:%1,%0" \ + : "=r" (__b) \ + : "m" (*(u_char *)(__pcpu_offset(name)))); \ + __result = *(__pcpu_type(name) *)(void *)&__b; \ + } else if (sizeof(__result) == 2) { \ + u_short __w; \ + __asm __volatile("movw %%fs:%1,%0" \ + : "=r" (__w) \ + : "m" (*(u_short *)(__pcpu_offset(name)))); \ + __result = *(__pcpu_type(name) *)(void *)&__w; \ + } else if (sizeof(__result) == 4) { \ + u_int __i; \ + __asm __volatile("movl %%fs:%1,%0" \ + : "=r" (__i) \ + : "m" (*(u_int *)(__pcpu_offset(name)))); \ + __result = *(__pcpu_type(name) *)(void *)&__i; \ + } else { \ + __result = *__PCPU_PTR(name); \ + } \ + \ + __result; \ +}) + +/* + * Sets the value of the per-cpu variable name to value val. + */ +#define __PCPU_SET(name, val) { \ + __pcpu_type(name) __val = (val); \ + \ + if (sizeof(__val) == 1) { \ + u_char __b; \ + __b = *(u_char *)&__val; \ + __asm __volatile("movb %1,%%fs:%0" \ + : "=m" (*(u_char *)(__pcpu_offset(name))) \ + : "r" (__b)); \ + } else if (sizeof(__val) == 2) { \ + u_short __w; \ + __w = *(u_short *)&__val; \ + __asm __volatile("movw %1,%%fs:%0" \ + : "=m" (*(u_short *)(__pcpu_offset(name))) \ + : "r" (__w)); \ + } else if (sizeof(__val) == 4) { \ + u_int __i; \ + __i = *(u_int *)&__val; \ + __asm __volatile("movl %1,%%fs:%0" \ + : "=m" (*(u_int *)(__pcpu_offset(name))) \ + : "r" (__i)); \ + } else { \ + *__PCPU_PTR(name) = __val; \ + } \ +} + +#define PCPU_GET(member) __PCPU_GET(pc_ ## member) +#define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) +#define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) + +static __inline struct thread * +__curthread(void) +{ + struct thread *td; + + __asm __volatile("movl %%fs:0,%0" : "=r" (td)); + return (td); +} +#define curthread (__curthread()) + +#else +#error gcc or lint is required to use this file +#endif + +#endif /* _KERNEL */ + +#endif /* ! 
_MACHINE_PCPU_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/pmap.h b/freebsd-5.3-xen-sparse/i386-xen/include/pmap.h new file mode 100644 index 0000000000..9e838b9bd4 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/pmap.h @@ -0,0 +1,355 @@ +/* + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * $FreeBSD: src/sys/i386/include/pmap.h,v 1.103 2003/11/08 03:01:26 alc Exp $ + */ + +#ifndef _MACHINE_PMAP_H_ +#define _MACHINE_PMAP_H_ + +/* + * Page-directory and page-table entires follow this format, with a few + * of the fields not present here and there, depending on a lot of things. 
+ */ + /* ---- Intel Nomenclature ---- */ +#define PG_V 0x001 /* P Valid */ +#define PG_RW 0x002 /* R/W Read/Write */ +#define PG_U 0x004 /* U/S User/Supervisor */ +#define PG_NC_PWT 0x008 /* PWT Write through */ +#define PG_NC_PCD 0x010 /* PCD Cache disable */ +#define PG_A 0x020 /* A Accessed */ +#define PG_M 0x040 /* D Dirty */ +#define PG_PS 0x080 /* PS Page size (0=4k,1=4M) */ +#define PG_G 0x100 /* G Global */ +#define PG_AVAIL1 0x200 /* / Available for system */ +#define PG_AVAIL2 0x400 /* < programmers use */ +#define PG_AVAIL3 0x800 /* \ */ + + +/* Our various interpretations of the above */ +#define PG_W PG_AVAIL1 /* "Wired" pseudoflag */ +#define PG_MANAGED PG_AVAIL2 +#define PG_FRAME (~((vm_paddr_t)PAGE_MASK)) +#define PG_PROT (PG_RW|PG_U) /* all protection bits . */ +#define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ + +#define PG_KERNEL (PG_V | PG_RW | PG_M | PG_A) +#define PG_KERNEL_NC (PG_KERNEL | PG_N) +#define PG_KERNEL_RO (PG_VALID | PG_M | PG_A) + +/* + * Page Protection Exception bits + */ + +#define PGEX_P 0x01 /* Protection violation vs. not present */ +#define PGEX_W 0x02 /* during a Write cycle */ +#define PGEX_U 0x04 /* access from User mode (UPL) */ +#define XEN_PAGES 16 + +/* + * Size of Kernel address space. This is the number of page table pages + * (4MB each) to use for the kernel. 256 pages == 1 Gigabyte. + * This **MUST** be a multiple of 4 (eg: 252, 256, 260, etc). + */ + +#ifndef KVA_PAGES +#ifdef PAE +#define KVA_PAGES 512 +#else +#define KVA_PAGES 256 +#endif +#endif + +/* + * Pte related macros + */ +#define VADDR(pdi, pti) ((vm_offset_t)(((pdi)<<PDRSHIFT)|((pti)<<PAGE_SHIFT))) + +#ifndef NKPT +#ifdef PAE +#define NKPT 120 /* actual number of kernel page tables */ +#else +#define NKPT 30 /* actual number of kernel page tables */ +#endif +#endif + +/* + * XEN NOTE: Xen consumes 64MB of memory, so subtract that from the number + * of page available to the kernel virutal address space. 
+ */ +#ifndef NKPDE +#ifdef SMP +#define NKPDE (KVA_PAGES - 1 - XEN_PAGES) /* number of page tables/pde's */ +#else +#define NKPDE (KVA_PAGES - XEN_PAGES) /* number of page tables/pde's */ +#endif +#endif + +/* + * The *PTDI values control the layout of virtual memory + * + * XXX This works for now, but I am not real happy with it, I'll fix it + * right after I fix locore.s and the magic 28K hole + * + * SMP_PRIVPAGES: The per-cpu address space is 0xff80000 -> 0xffbfffff + */ + +/* + * XEN NOTE: We need to shift down the start of KVA by 64MB to account for + * Xen using the upper 64MB. + * + * The layout of VA for XenoBSD is: + * | USER | PTDPTDI | KVA | XEN | + * | 0x00000000 | 0xbfc00000 | 0xc0000000 | 0xfc000000 - 0xffffffff| + * + * Normally it is just: + * | USER | PTDPTDI | KVA | + * | 0x00000000 | 0xbfc00000 | 0xc0000000 - 0xffffffff | + */ + +#ifdef SMP +#define MPPTDI (NPDEPTD-1) /* per cpu ptd entry */ +#define KPTDI (MPPTDI-NKPDE-XEN_PAGES /* start of kernel virtual pde's */ +#else +#define KPTDI (NPDEPTD-NKPDE-XEN_PAGES) /* start of kernel virtual pde's */ +#endif /* SMP */ + +#define PTDPTDI (KPTDI-NPGPTD) /* ptd entry that points to ptd! */ + +/* + * XXX doesn't really belong here I guess... + */ +#define ISA_HOLE_START 0xa0000 +#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) + +#ifndef LOCORE + +#include <sys/queue.h> +#include <sys/_lock.h> +#include <sys/_mutex.h> + + +typedef uint32_t pd_entry_t; +typedef uint32_t pt_entry_t; + +#define PTESHIFT (2) +#define PDESHIFT (2) + + +/* + * Address of current and alternate address space page table maps + * and directories. + */ +#ifdef _KERNEL +extern pt_entry_t PTmap[]; +extern pd_entry_t PTD[]; +extern pd_entry_t PTDpde[]; + +extern pd_entry_t *IdlePTD; /* physical address of "Idle" state directory */ + +#include <machine/xen-os.h> +#include <machine/xenvar.h> +#include <machine/xenpmap.h> + + +/* + * virtual address to page table entry and + * to physical address. 
Likewise for alternate address space. + * Note: these work recursively, thus vtopte of a pte will give + * the corresponding pde that in turn maps it. + */ +#define vtopte(va) (PTmap + i386_btop(va)) + +/* + * Given a virtual address, return the machine address of its PTE + * + */ +#define vtoptema(va) pmap_kextract_ma((vm_offset_t) vtopte(va)) + +/* + * Routine: pmap_kextract/pmap_kextract_ma + * Function: + * Extract the physical/machine page address associated + * kernel virtual address. + */ + +static __inline vm_paddr_t +pmap_kextract_ma(vm_offset_t va) +{ + vm_paddr_t ma; + if ((ma = PTD[va >> PDRSHIFT]) & PG_PS) { + ma = (ma & ~(NBPDR - 1)) | (va & (NBPDR - 1)); + } else { + ma = (*vtopte(va) & PG_FRAME) | (va & PAGE_MASK); + } + return ma; +} + +static __inline vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + return xpmap_mtop(pmap_kextract_ma(va)); +} + +#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) +#define vtomach(va) pmap_kextract_ma(((vm_offset_t) (va))) + +static __inline pt_entry_t +pte_load_clear(pt_entry_t *ptep) +{ + pt_entry_t r; + + r = PT_GET(ptep); + PT_CLEAR_VA(ptep, TRUE); + return (r); +} +static __inline pt_entry_t +pte_load_store(pt_entry_t *ptep, pt_entry_t v) +{ + pt_entry_t r; + r = PT_GET(ptep); + PT_SET_VA_MA(ptep, v, TRUE); + return (r); +} + +#define pte_store(ptep, pte) PT_SET_VA_MA(ptep, pte, TRUE); +#define pte_clear(pte) PT_CLEAR_VA(pte, TRUE); + + +#endif /* _KERNEL */ + +/* + * Pmap stuff + */ +struct pv_entry; + +struct md_page { + int pv_list_count; + TAILQ_HEAD(,pv_entry) pv_list; +}; + +struct pmap { + struct mtx pm_mtx; + pd_entry_t *pm_pdir; /* KVA of page directory */ + TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ + u_int pm_active; /* active on cpus */ + struct pmap_statistics pm_stats; /* pmap statistics */ + LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ +}; + + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap 
(&kernel_pmap_store) + +#define PMAP_LOCK(pmap)mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ +mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap)mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap)mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap)mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap)(&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap)mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap)mtx_unlock(&(pmap)->pm_mtx) + +#endif + +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_table. + */ +typedef struct pv_entry { + pmap_t pv_pmap; /* pmap where mapping lies */ + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_list; + TAILQ_ENTRY(pv_entry) pv_plist; +} *pv_entry_t; + +#ifdef _KERNEL + +#define NPPROVMTRR 8 +#define PPRO_VMTRRphysBase0 0x200 +#define PPRO_VMTRRphysMask0 0x201 +struct ppro_vmtrr { + u_int64_t base, mask; +}; +extern struct ppro_vmtrr PPro_vmtrr[NPPROVMTRR]; + +extern caddr_t CADDR1; +extern pt_entry_t *CMAP1; +extern vm_paddr_t avail_end; +extern vm_paddr_t phys_avail[]; +extern int pseflag; +extern int pgeflag; +extern char *ptvmmap; /* poor name! 
*/ +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; + +#define pmap_page_is_mapped(m)(!TAILQ_EMPTY(&(m)->md.pv_list)) + +void pmap_bootstrap(vm_paddr_t, vm_paddr_t); +void pmap_kenter(vm_offset_t va, vm_paddr_t pa); +void pmap_kenter_ma(vm_offset_t va, vm_paddr_t pa); +void *pmap_kenter_temporary(vm_paddr_t pa, int i); +void pmap_kremove(vm_offset_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void pmap_unmapdev(vm_offset_t, vm_size_t); +pt_entry_t *pmap_pte(pmap_t, vm_offset_t) __pure2; +void pmap_set_pg(void); +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); + +void pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len); +void pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len); + + +#endif /* _KERNEL */ + +#endif /* !LOCORE */ + +#endif /* !_MACHINE_PMAP_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/segments.h b/freebsd-5.3-xen-sparse/i386-xen/include/segments.h new file mode 100644 index 0000000000..85cc20c1f5 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/segments.h @@ -0,0 +1,260 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)segments.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD: src/sys/i386/include/segments.h,v 1.36 2003/11/03 21:12:04 jhb Exp $ + */ + +#ifndef _MACHINE_SEGMENTS_H_ +#define _MACHINE_SEGMENTS_H_ + +/* + * 386 Segmentation Data Structures and definitions + * William F. 
Jolitz (william@ernie.berkeley.edu) 6/20/1989 + */ + +/* + * Selectors + */ + +#define ISPL(s) ((s)&3) /* what is the priority level of a selector */ +#define SEL_KPL 1 /* kernel priority level */ +#define SEL_UPL 3 /* user priority level */ +#define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ +#define SEL_LDT 4 /* local descriptor table */ +#define IDXSEL(s) (((s)>>3) & 0x1fff) /* index of selector */ +#define LSEL(s,r) (((s)<<3) | SEL_LDT | r) /* a local selector */ +#define GSEL(s,r) (((s)<<3) | r) /* a global selector */ + +/* + * Memory and System segment descriptors + */ +struct segment_descriptor { + unsigned sd_lolimit:16 ; /* segment extent (lsb) */ + unsigned sd_lobase:24 __packed; /* segment base address (lsb) */ + unsigned sd_type:5 ; /* segment type */ + unsigned sd_dpl:2 ; /* segment descriptor priority level */ + unsigned sd_p:1 ; /* segment descriptor present */ + unsigned sd_hilimit:4 ; /* segment extent (msb) */ + unsigned sd_xx:2 ; /* unused */ + unsigned sd_def32:1 ; /* default 32 vs 16 bit size */ + unsigned sd_gran:1 ; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8 ; /* segment base address (msb) */ +} ; + +/* + * Gate descriptors (e.g. 
indirect descriptors) + */ +struct gate_descriptor { + unsigned gd_looffset:16 ; /* gate offset (lsb) */ + unsigned gd_selector:16 ; /* gate segment selector */ + unsigned gd_stkcpy:5 ; /* number of stack wds to cpy */ + unsigned gd_xx:3 ; /* unused */ + unsigned gd_type:5 ; /* segment type */ + unsigned gd_dpl:2 ; /* segment descriptor priority level */ + unsigned gd_p:1 ; /* segment descriptor present */ + unsigned gd_hioffset:16 ; /* gate offset (msb) */ +} ; + +/* + * Generic descriptor + */ +union descriptor { + struct segment_descriptor sd; + struct gate_descriptor gd; +}; + + /* system segments and gate types */ +#define SDT_SYSNULL 0 /* system null */ +#define SDT_SYS286TSS 1 /* system 286 TSS available */ +#define SDT_SYSLDT 2 /* system local descriptor table */ +#define SDT_SYS286BSY 3 /* system 286 TSS busy */ +#define SDT_SYS286CGT 4 /* system 286 call gate */ +#define SDT_SYSTASKGT 5 /* system task gate */ +#define SDT_SYS286IGT 6 /* system 286 interrupt gate */ +#define SDT_SYS286TGT 7 /* system 286 trap gate */ +#define SDT_SYSNULL2 8 /* system null again */ +#define SDT_SYS386TSS 9 /* system 386 TSS available */ +#define SDT_SYSNULL3 10 /* system null again */ +#define SDT_SYS386BSY 11 /* system 386 TSS busy */ +#define SDT_SYS386CGT 12 /* system 386 call gate */ +#define SDT_SYSNULL4 13 /* system null again */ +#define SDT_SYS386IGT 14 /* system 386 interrupt gate */ +#define SDT_SYS386TGT 15 /* system 386 trap gate */ + + /* memory segment types */ +#define SDT_MEMRO 16 /* memory read only */ +#define SDT_MEMROA 17 /* memory read only accessed */ +#define SDT_MEMRW 18 /* memory read write */ +#define SDT_MEMRWA 19 /* memory read write accessed */ +#define SDT_MEMROD 20 /* memory read only expand dwn limit */ +#define SDT_MEMRODA 21 /* memory read only expand dwn limit accessed */ +#define SDT_MEMRWD 22 /* memory read write expand dwn limit */ +#define SDT_MEMRWDA 23 /* memory read write expand dwn limit accessed */ +#define SDT_MEME 24 /* memory 
execute only */ +#define SDT_MEMEA 25 /* memory execute only accessed */ +#define SDT_MEMER 26 /* memory execute read */ +#define SDT_MEMERA 27 /* memory execute read accessed */ +#define SDT_MEMEC 28 /* memory execute only conforming */ +#define SDT_MEMEAC 29 /* memory execute only accessed conforming */ +#define SDT_MEMERC 30 /* memory execute read conforming */ +#define SDT_MEMERAC 31 /* memory execute read accessed conforming */ + +/* + * Software definitions are in this convenient format, + * which are translated into inconvenient segment descriptors + * when needed to be used by the 386 hardware + */ + +struct soft_segment_descriptor { + unsigned ssd_base ; /* segment base address */ + unsigned ssd_limit ; /* segment extent */ + unsigned ssd_type:5 ; /* segment type */ + unsigned ssd_dpl:2 ; /* segment descriptor priority level */ + unsigned ssd_p:1 ; /* segment descriptor present */ + unsigned ssd_xx:4 ; /* unused */ + unsigned ssd_xx1:2 ; /* unused */ + unsigned ssd_def32:1 ; /* default 32 vs 16 bit size */ + unsigned ssd_gran:1 ; /* limit granularity (byte/page units)*/ +}; + +/* + * region descriptors, used to load gdt/idt tables before segments yet exist. 
+ */ +struct region_descriptor { + unsigned rd_limit:16; /* segment extent */ + unsigned rd_base:32 __packed; /* base address */ +}; + +/* + * Segment Protection Exception code bits + */ + +#define SEGEX_EXT 0x01 /* recursive or externally induced */ +#define SEGEX_IDT 0x02 /* interrupt descriptor table */ +#define SEGEX_TI 0x04 /* local descriptor table */ + /* other bits are affected descriptor index */ +#define SEGEX_IDX(s) (((s)>>3)&0x1fff) + +/* + * Size of IDT table + */ + +#define NIDT 256 /* 32 reserved, 0x80 syscall, most are h/w */ +#define NRSVIDT 32 /* reserved entries for cpu exceptions */ + +/* + * Entries in the Interrupt Descriptor Table (IDT) + */ +#define IDT_DE 0 /* #DE: Divide Error */ +#define IDT_DB 1 /* #DB: Debug */ +#define IDT_NMI 2 /* Nonmaskable External Interrupt */ +#define IDT_BP 3 /* #BP: Breakpoint */ +#define IDT_OF 4 /* #OF: Overflow */ +#define IDT_BR 5 /* #BR: Bound Range Exceeded */ +#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ +#define IDT_NM 7 /* #NM: No Math Coprocessor */ +#define IDT_DF 8 /* #DF: Double Fault */ +#define IDT_FPUGP 9 /* Coprocessor Segment Overrun */ +#define IDT_TS 10 /* #TS: Invalid TSS */ +#define IDT_NP 11 /* #NP: Segment Not Present */ +#define IDT_SS 12 /* #SS: Stack Segment Fault */ +#define IDT_GP 13 /* #GP: General Protection Fault */ +#define IDT_PF 14 /* #PF: Page Fault */ +#define IDT_MF 16 /* #MF: FPU Floating-Point Error */ +#define IDT_AC 17 /* #AC: Alignment Check */ +#define IDT_MC 18 /* #MC: Machine Check */ +#define IDT_XF 19 /* #XF: SIMD Floating-Point Exception */ +#define IDT_IO_INTS NRSVIDT /* Base of IDT entries for I/O interrupts. 
*/ +#define IDT_SYSCALL 0x80 /* System Call Interrupt Vector */ + +/* + * Entries in the Global Descriptor Table (GDT) + */ +#define GNULL_SEL 0 /* Null Descriptor */ +#if 0 +#define GCODE_SEL 1 /* Kernel Code Descriptor */ +#define GDATA_SEL 2 /* Kernel Data Descriptor */ +#else +#define GCODE_SEL (__KERNEL_CS >> 3) /* Kernel Code Descriptor */ +#define GDATA_SEL (__KERNEL_DS >> 3) /* Kernel Data Descriptor */ +#endif +#define GPRIV_SEL 3 /* SMP Per-Processor Private Data */ +#define GPROC0_SEL 4 /* Task state process slot zero and up */ +#define GLDT_SEL 5 /* LDT - eventually one per process */ +#define GUSERLDT_SEL 6 /* User LDT */ +#define GTGATE_SEL 7 /* Process task switch gate */ +#define GBIOSLOWMEM_SEL 8 /* BIOS low memory access (must be entry 8) */ +#define GPANIC_SEL 9 /* Task state to consider panic from */ +#define GBIOSCODE32_SEL 10 /* BIOS interface (32bit Code) */ +#define GBIOSCODE16_SEL 11 /* BIOS interface (16bit Code) */ +#define GBIOSDATA_SEL 12 /* BIOS interface (Data) */ +#define GBIOSUTIL_SEL 13 /* BIOS interface (Utility) */ +#define GBIOSARGS_SEL 14 /* BIOS interface (Arguments) */ + +#define NGDT 4 + +/* + * Entries in the Local Descriptor Table (LDT) + */ +#define LSYS5CALLS_SEL 0 /* forced by intel BCS */ +#define LSYS5SIGR_SEL 1 +#define L43BSDCALLS_SEL 2 /* notyet */ +#define LUCODE_SEL 3 +#define LSOL26CALLS_SEL 4 /* Solaris >= 2.6 system call gate */ +#define LUDATA_SEL 5 +/* separate stack, es,fs,gs sels ? 
*/ +/* #define LPOSIXCALLS_SEL 5*/ /* notyet */ +#define LBSDICALLS_SEL 16 /* BSDI system call gate */ +#define NLDT (LBSDICALLS_SEL + 1) + +#ifdef _KERNEL +extern int _default_ldt; +extern union descriptor *gdt; +extern struct soft_segment_descriptor gdt_segs[]; +extern struct gate_descriptor *idt; +extern union descriptor *ldt; +extern struct region_descriptor r_gdt, r_idt; + +void lgdt(struct region_descriptor *rdp); +void lgdt_finish(void); +void sdtossd(struct segment_descriptor *sdp, + struct soft_segment_descriptor *ssdp); +void ssdtosd(struct soft_segment_descriptor *ssdp, + struct segment_descriptor *sdp); +#endif /* _KERNEL */ + +#endif /* !_MACHINE_SEGMENTS_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/synch_bitops.h b/freebsd-5.3-xen-sparse/i386-xen/include/synch_bitops.h new file mode 100644 index 0000000000..31ec3d3468 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/synch_bitops.h @@ -0,0 +1,82 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. + * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. 
+ */ + + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "=m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & + (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline__ int synch_var_test_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "btl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) ); + return oldbit; +} + +#define synch_test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? 
\ + synch_const_test_bit((nr),(addr)) : \ + synch_var_test_bit((nr),(addr))) + +#endif /* __XEN_SYNCH_BITOPS_H__ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/trap.h b/freebsd-5.3-xen-sparse/i386-xen/include/trap.h new file mode 100644 index 0000000000..c61beb90aa --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/trap.h @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)trap.h 5.4 (Berkeley) 5/9/91 + * $FreeBSD: src/sys/i386/include/trap.h,v 1.13 2001/07/12 06:32:51 peter Exp $ + */ + +#ifndef _MACHINE_TRAP_H_ +#define _MACHINE_TRAP_H_ + +/* + * Trap type values + * also known in trap.c for name strings + */ + +#define T_PRIVINFLT 1 /* privileged instruction */ +#define T_BPTFLT 3 /* breakpoint instruction */ +#define T_ARITHTRAP 6 /* arithmetic trap */ +#define T_PROTFLT 9 /* protection fault */ +#define T_TRCTRAP 10 /* debug exception (sic) */ +#define T_PAGEFLT 12 /* page fault */ +#define T_ALIGNFLT 14 /* alignment fault */ + +#define T_NESTED 16 +#define T_HYPCALLBACK 17 /* hypervisor callback */ + + +#define T_DIVIDE 18 /* integer divide fault */ +#define T_NMI 19 /* non-maskable trap */ +#define T_OFLOW 20 /* overflow trap */ +#define T_BOUND 21 /* bound instruction fault */ +#define T_DNA 22 /* device not available fault */ +#define T_DOUBLEFLT 23 /* double fault */ +#define T_FPOPFLT 24 /* fp coprocessor operand fetch fault */ +#define T_TSSFLT 25 /* invalid tss fault */ +#define T_SEGNPFLT 26 /* segment not present fault */ +#define T_STKFLT 27 /* stack fault */ +#define T_MCHK 28 /* machine check trap */ +#define T_XMMFLT 29 /* SIMD floating-point exception */ +#define T_RESERVED 30 /* reserved (unknown) */ + +/* XXX most of the following codes aren't used, but could be. 
*/ + +/* definitions for <sys/signal.h> */ +#define ILL_RESAD_FAULT T_RESADFLT +#define ILL_PRIVIN_FAULT T_PRIVINFLT +#define ILL_RESOP_FAULT T_RESOPFLT +#define ILL_ALIGN_FAULT T_ALIGNFLT +#define ILL_FPOP_FAULT T_FPOPFLT /* coprocessor operand fault */ + +/* portable macros for SIGFPE/ARITHTRAP */ +#define FPE_INTOVF 1 /* integer overflow */ +#define FPE_INTDIV 2 /* integer divide by zero */ +#define FPE_FLTDIV 3 /* floating point divide by zero */ +#define FPE_FLTOVF 4 /* floating point overflow */ +#define FPE_FLTUND 5 /* floating point underflow */ +#define FPE_FLTRES 6 /* floating point inexact result */ +#define FPE_FLTINV 7 /* invalid floating point operation */ +#define FPE_FLTSUB 8 /* subscript out of range */ + +/* old FreeBSD macros, deprecated */ +#define FPE_INTOVF_TRAP 0x1 /* integer overflow */ +#define FPE_INTDIV_TRAP 0x2 /* integer divide by zero */ +#define FPE_FLTDIV_TRAP 0x3 /* floating/decimal divide by zero */ +#define FPE_FLTOVF_TRAP 0x4 /* floating overflow */ +#define FPE_FLTUND_TRAP 0x5 /* floating underflow */ +#define FPE_FPU_NP_TRAP 0x6 /* floating point unit not present */ +#define FPE_SUBRNG_TRAP 0x7 /* subrange out of bounds */ + +/* codes for SIGBUS */ +#define BUS_PAGE_FAULT T_PAGEFLT /* page fault protection base */ +#define BUS_SEGNP_FAULT T_SEGNPFLT /* segment not present */ +#define BUS_STK_FAULT T_STKFLT /* stack segment */ +#define BUS_SEGM_FAULT T_RESERVED /* segment protection base */ + +/* Trap's coming from user mode */ +#define T_USER 0x100 + +#endif /* !_MACHINE_TRAP_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/ucontext.h b/freebsd-5.3-xen-sparse/i386-xen/include/ucontext.h new file mode 100644 index 0000000000..eda584b62e --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/ucontext.h @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 1999 Marcel Moolenaar + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: src/sys/i386/include/ucontext.h,v 1.10 2002/12/02 19:58:55 deischen Exp $ + */ + +#ifndef _MACHINE_UCONTEXT_H_ +#define _MACHINE_UCONTEXT_H_ + +typedef struct __mcontext { + /* + * The first 20 fields must match the definition of + * sigcontext. So that we can support sigcontext + * and ucontext_t at the same time. + */ + int mc_onstack; /* XXX - sigcontext compat. 
*/ + int mc_gs; /* machine state (struct trapframe) */ + int mc_fs; + int mc_es; + int mc_ds; + int mc_edi; + int mc_esi; + int mc_ebp; + int mc_isp; + int mc_ebx; + int mc_edx; + int mc_ecx; + int mc_eax; + int mc_trapno; + int mc_cr2; + int mc_err; + int mc_eip; + int mc_cs; + int mc_eflags; + int mc_esp; + int mc_ss; + + int mc_len; /* sizeof(mcontext_t) */ +#define _MC_FPFMT_NODEV 0x10000 /* device not present or configured */ +#define _MC_FPFMT_387 0x10001 +#define _MC_FPFMT_XMM 0x10002 + int mc_fpformat; +#define _MC_FPOWNED_NONE 0x20000 /* FP state not used */ +#define _MC_FPOWNED_FPU 0x20001 /* FP state came from FPU */ +#define _MC_FPOWNED_PCB 0x20002 /* FP state came from PCB */ + int mc_ownedfp; + /* + * See <machine/npx.h> for the internals of mc_fpstate[]. + */ + int mc_fpstate[128] __aligned(16); + int mc_spare2[8]; +} mcontext_t; + +#if defined(_KERNEL) && defined(COMPAT_FREEBSD4) +struct mcontext4 { + int mc_onstack; /* XXX - sigcontext compat. */ + int mc_gs; /* machine state (struct trapframe) */ + int mc_fs; + int mc_es; + int mc_ds; + int mc_edi; + int mc_esi; + int mc_ebp; + int mc_isp; + int mc_ebx; + int mc_edx; + int mc_ecx; + int mc_eax; + int mc_trapno; + int mc_err; + int mc_eip; + int mc_cs; + int mc_eflags; + int mc_esp; /* machine state */ + int mc_ss; + int mc_fpregs[28]; /* env87 + fpacc87 + u_long */ + int __spare__[17]; +}; +#endif + +#endif /* !_MACHINE_UCONTEXT_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/vmparam.h b/freebsd-5.3-xen-sparse/i386-xen/include/vmparam.h new file mode 100644 index 0000000000..7fa9af3c68 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/vmparam.h @@ -0,0 +1,141 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)vmparam.h 5.9 (Berkeley) 5/12/91 + * $FreeBSD: src/sys/i386/include/vmparam.h,v 1.37 2003/10/01 23:46:08 peter Exp $ + */ + + +#ifndef _MACHINE_VMPARAM_H_ +#define _MACHINE_VMPARAM_H_ 1 + +/* + * Machine dependent constants for 386. 
+ */ + +#define VM_PROT_READ_IS_EXEC /* if you can read -- then you can exec */ + +/* + * Virtual memory related constants, all in bytes + */ +#define MAXTSIZ (128UL*1024*1024) /* max text size */ +#ifndef DFLDSIZ +#define DFLDSIZ (128UL*1024*1024) /* initial data size limit */ +#endif +#ifndef MAXDSIZ +#define MAXDSIZ (512UL*1024*1024) /* max data size */ +#endif +#ifndef DFLSSIZ +#define DFLSSIZ (8UL*1024*1024) /* initial stack size limit */ +#endif +#ifndef MAXSSIZ +#define MAXSSIZ (64UL*1024*1024) /* max stack size */ +#endif +#ifndef SGROWSIZ +#define SGROWSIZ (128UL*1024) /* amount to grow stack */ +#endif + +#define USRTEXT (1*PAGE_SIZE) /* base of user text XXX bogus */ + +/* + * The time for a process to be blocked before being very swappable. + * This is a number of seconds which the system takes as being a non-trivial + * amount of real time. You probably shouldn't change this; + * it is used in subtle ways (fractions and multiples of it are, that is, like + * half of a ``long time'', almost a long time, etc.) + * It is related to human patience and other factors which don't really + * change over time. + */ +#define MAXSLP 20 + + +/* + * Kernel physical load address. + */ +#ifndef KERNLOAD +#define KERNLOAD (1 << PDRSHIFT) +#endif + +/* + * Virtual addresses of things. Derived from the page directory and + * page table indexes from pmap.h for precision. 
+ * Because of the page that is both a PD and PT, it looks a little + * messy at times, but hey, we'll do anything to save a page :-) + */ + +#define VM_MAX_KERNEL_ADDRESS VADDR(KPTDI+NKPDE-1, NPTEPG-1) +#define VM_MIN_KERNEL_ADDRESS VADDR(PTDPTDI, PTDPTDI) + +#define KERNBASE VADDR(KPTDI, 0) + +#define UPT_MAX_ADDRESS VADDR(PTDPTDI, PTDPTDI) +#define UPT_MIN_ADDRESS VADDR(PTDPTDI, 0) + +#define VM_MAXUSER_ADDRESS VADDR(PTDPTDI-1, 0) + +#define USRSTACK VM_MAXUSER_ADDRESS + +#define VM_MAX_ADDRESS VADDR(PTDPTDI, PTDPTDI) +#define VM_MIN_ADDRESS ((vm_offset_t)0) + +/* virtual sizes (bytes) for various kernel submaps */ +#ifndef VM_KMEM_SIZE +#define VM_KMEM_SIZE (12 * 1024 * 1024) +#endif + +/* + * How many physical pages per KVA page allocated. + * min(max(VM_KMEM_SIZE, Physical memory/VM_KMEM_SIZE_SCALE), VM_KMEM_SIZE_MAX) + * is the total KVA space allocated for kmem_map. + */ +#ifndef VM_KMEM_SIZE_SCALE +#define VM_KMEM_SIZE_SCALE (3) +#endif + +/* + * Ceiling on amount of kmem_map kva space. + */ +#ifndef VM_KMEM_SIZE_MAX +#define VM_KMEM_SIZE_MAX (320 * 1024 * 1024) +#endif + +/* initial pagein size of beginning of executable file */ +#ifndef VM_INITIAL_PAGEIN +#define VM_INITIAL_PAGEIN 16 +#endif + +#endif /* _MACHINE_VMPARAM_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/xen-os.h b/freebsd-5.3-xen-sparse/i386-xen/include/xen-os.h new file mode 100644 index 0000000000..e483fc535c --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/xen-os.h @@ -0,0 +1,293 @@ +/****************************************************************************** + * os.h + * + * random collection of macros and definition + */ + +#ifndef _OS_H_ +#define _OS_H_ + +#ifndef NULL +#define NULL (void *)0 +#endif + +/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented + a mechanism by which the user can annotate likely branch directions and + expect the blocks to be reordered appropriately. 
Define __builtin_expect + to nothing for earlier compilers. */ + +#if __GNUC__ == 2 && __GNUC_MINOR__ < 96 +#define __builtin_expect(x, expected_value) (x) +#endif + + + +/* + * These are the segment descriptors provided for us by the hypervisor. + * For now, these are hardwired -- guest OSes cannot update the GDT + * or LDT. + * + * It shouldn't be hard to support descriptor-table frobbing -- let me + * know if the BSD or XP ports require flexibility here. + */ + + +/* + * these are also defined in hypervisor-if.h but can't be pulled in as + * they are used in start of day assembly. Need to clean up the .h files + * a bit more... + */ + +#ifndef FLAT_RING1_CS +#define FLAT_RING1_CS 0x0819 +#define FLAT_RING1_DS 0x0821 +#define FLAT_RING3_CS 0x082b +#define FLAT_RING3_DS 0x0833 +#endif + +#define __KERNEL_CS FLAT_RING1_CS +#define __KERNEL_DS FLAT_RING1_DS + +/* Everything below this point is not included by assembler (.S) files. */ +#ifndef __ASSEMBLY__ +#include <sys/types.h> + +#include <machine/hypervisor-ifs.h> +void printk(const char *fmt, ...); + +/* some function prototypes */ +void trap_init(void); + + +/* + * STI/CLI equivalents. These basically set and clear the virtual + * event_enable flag in teh shared_info structure. Note that when + * the enable bit is set, there may be pending events to be handled. + * We may therefore call into do_hypervisor_callback() directly. 
+ */ +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#define __cli() \ +do { \ + HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define __sti() \ +do { \ + shared_info_t *_shared = HYPERVISOR_shared_info; \ + barrier(); \ + _shared->vcpu_data[0].evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ +} while (0) + +#define __save_flags(x) \ +do { \ + (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask; \ +} while (0) + +#define __restore_flags(x) \ +do { \ + shared_info_t *_shared = HYPERVISOR_shared_info; \ + barrier(); \ + if ( (_shared->vcpu_data[0].evtchn_upcall_mask = (x)) == 0 ) { \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define __save_and_cli(x) \ +do { \ + (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask; \ + HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define __save_and_sti(x) \ +do { \ + shared_info_t *_shared = HYPERVISOR_shared_info; \ + barrier(); \ + (x) = _shared->vcpu_data[0].evtchn_upcall_mask; \ + _shared->vcpu_data[0].evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ +} while (0) + +#ifdef SMP +/* extra macros need for the SMP case */ +#error "global_irq_* not defined" +#endif + +#define cli() __cli() +#define sti() __sti() +#define save_flags(x) __save_flags(x) +#define restore_flags(x) __restore_flags(x) +#define save_and_cli(x) __save_and_cli(x) +#define save_and_sti(x) __save_and_sti(x) + +#define local_irq_save(x) __save_and_cli(x) +#define local_irq_set(x) __save_and_sti(x) +#define local_irq_restore(x) 
__restore_flags(x) +#define local_irq_disable() __cli() +#define local_irq_enable() __sti() + +#define mtx_lock_irqsave(lock, x) {local_irq_save((x)); mtx_lock_spin((lock));} +#define mtx_unlock_irqrestore(lock, x) {mtx_unlock_spin((lock)); local_irq_restore((x)); } + +#define mb() +#define rmb() +#define smp_mb() +#define wmb() + + + +/* This is a barrier for the compiler only, NOT the processor! */ +#define barrier() __asm__ __volatile__("": : :"memory") + +#define LOCK_PREFIX "" +#define LOCK "" +#define ADDR (*(volatile long *) addr) +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. + */ +typedef struct { volatile int counter; } atomic_t; + + + +#define xen_xchg(ptr,v) \ + ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) +struct __xchg_dummy { unsigned long a[100]; }; +#define __xg(x) ((volatile struct __xchg_dummy *)(x)) +static __inline unsigned long __xchg(unsigned long x, volatile void * ptr, + int size) +{ + switch (size) { + case 1: + __asm__ __volatile__("xchgb %b0,%1" + :"=q" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 2: + __asm__ __volatile__("xchgw %w0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 4: + __asm__ __volatile__("xchgl %0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + } + return x; +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. 
+ */ +static __inline__ int test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( LOCK_PREFIX + "btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"=m" (ADDR) + :"Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int constant_test_bit(int nr, const volatile void * addr) +{ + return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0; +} + +static __inline__ int variable_test_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__( + "btl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit) + :"m" (ADDR),"Ir" (nr)); + return oldbit; +} + +#define test_bit(nr,addr) \ +(__builtin_constant_p(nr) ? \ + constant_test_bit((nr),(addr)) : \ + variable_test_bit((nr),(addr))) + + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static __inline__ void set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btsl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static __inline__ void clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( LOCK_PREFIX + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. 
Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_inc(atomic_t *v) +{ + __asm__ __volatile__( + LOCK "incl %0" + :"=m" (v->counter) + :"m" (v->counter)); +} + + +#define rdtscll(val) \ + __asm__ __volatile__("rdtsc" : "=A" (val)) + + +#endif /* !__ASSEMBLY__ */ + +#endif /* _OS_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/xen_intr.h b/freebsd-5.3-xen-sparse/i386-xen/include/xen_intr.h new file mode 100644 index 0000000000..e35eafa5d2 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/xen_intr.h @@ -0,0 +1,50 @@ +/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- */ +#ifndef _XEN_INTR_H_ +#define _XEN_INTR_H_ + +/* +* The flat IRQ space is divided into two regions: +* 1. A one-to-one mapping of real physical IRQs. This space is only used +* if we have physical device-access privilege. This region is at the +* start of the IRQ space so that existing device drivers do not need +* to be modified to translate physical IRQ numbers into our IRQ space. +* 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These +* are bound using the provided bind/unbind functions. +*/ + +#define PIRQ_BASE 0 +#define NR_PIRQS 128 + +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS) +#define NR_DYNIRQS 128 + +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) + +#define pirq_to_irq(_x) ((_x) + PIRQ_BASE) +#define irq_to_pirq(_x) ((_x) - PIRQ_BASE) + +#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE) +#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE) + +/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. */ +extern int bind_virq_to_irq(int virq); +extern void unbind_virq_from_irq(int virq); +extern int bind_evtchn_to_irq(int evtchn); +extern void unbind_evtchn_from_irq(int evtchn); + +static __inline__ int irq_cannonicalize(int irq) +{ + return (irq == 2) ? 
9 : irq; +} + +extern void disable_irq(unsigned int); +extern void disable_irq_nosync(unsigned int); +extern void enable_irq(unsigned int); + +extern void irq_suspend(void); +extern void irq_resume(void); + +extern void idle_block(void); + + +#endif /* _XEN_INTR_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/xenfunc.h b/freebsd-5.3-xen-sparse/i386-xen/include/xenfunc.h new file mode 100644 index 0000000000..93ffd7853a --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/xenfunc.h @@ -0,0 +1,85 @@ +/* $NetBSD:$ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _XEN_XENFUNC_H_ +#define _XEN_XENFUNC_H_ + +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/xenpmap.h> +#include <machine/segments.h> +#include <sys/pcpu.h> +#define BKPT __asm__("int3"); +#define XPQ_CALL_DEPTH 5 +#define XPQ_CALL_COUNT 2 +#define PG_PRIV PG_AVAIL3 +typedef struct { + unsigned long pt_ref; + unsigned long pt_eip[XPQ_CALL_COUNT][XPQ_CALL_DEPTH]; +} pteinfo_t; + +extern pteinfo_t *pteinfo_list; +#ifdef XENDEBUG_LOW +#define __PRINTK(x) printk x +#else +#define __PRINTK(x) +#endif + +char *xen_setbootenv(char *cmd_line); +int xen_boothowto(char *envp); +void load_cr3(uint32_t val); +void xen_set_ldt(vm_offset_t, uint32_t); +void xen_machphys_update(unsigned long, unsigned long); +void xen_update_descriptor(union descriptor *, union descriptor *); +void lldt(u_short sel); +/* + * Invalidate a patricular VA on all cpus + * + * N.B. Made these global for external loadable modules to reference. 
+ */ +static __inline void +invlpg(u_int addr) +{ + xpq_queue_invlpg(addr); +} + +static __inline void +invltlb(void) +{ + xpq_queue_tlb_flush(); + mcl_flush_queue(); +} + + +#endif /* _XEN_XENFUNC_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/xenpmap.h b/freebsd-5.3-xen-sparse/i386-xen/include/xenpmap.h new file mode 100644 index 0000000000..f445096228 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/xenpmap.h @@ -0,0 +1,132 @@ +/* $NetBSD:$ */ + +/* + * + * Copyright (c) 2004 Christian Limpach. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Christian Limpach. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _XEN_XENPMAP_H_ +#define _XEN_XENPMAP_H_ +#include <machine/xenvar.h> +void xpq_physbcopy(const unsigned long *, unsigned long, size_t); +void xpq_queue_invlpg(vm_offset_t); +void xpq_queue_pt_update(pt_entry_t *, pt_entry_t); +void xpq_queue_pt_switch(uint32_t); +void xpq_queue_set_ldt(vm_offset_t, uint32_t); +void xpq_queue_tlb_flush(void); +void xpq_queue_pin_table(uint32_t, int); +void xpq_queue_unpin_table(uint32_t); +void xpq_record(unsigned long, unsigned long); +void mcl_queue_pt_update(vm_offset_t, vm_offset_t); +void mcl_flush_queue(void); +void pmap_ref(pt_entry_t *pte, unsigned long ma); + + +#ifdef PMAP_DEBUG +#define PMAP_REF pmap_ref +#define PMAP_DEC_REF_PAGE pmap_dec_ref_page +#define PMAP_MARK_PRIV pmap_mark_privileged +#define PMAP_MARK_UNPRIV pmap_mark_unprivileged +#else +#define PMAP_MARK_PRIV(a) +#define PMAP_MARK_UNPRIV(a) +#define PMAP_REF(a, b) +#define PMAP_DEC_REF_PAGE(a) +#endif + +#define ALWAYS_SYNC 0 + +#define pmap_valid_entry(E) ((E) & PG_V) /* is PDE or PTE valid? */ + +#define XPQ_PIN_L1_TABLE 1 +#define XPQ_PIN_L2_TABLE 2 + +#define PT_GET(_ptp) \ + (pmap_valid_entry(*(_ptp)) ? 
xpmap_mtop(*(_ptp)) : *(_ptp)) +#define PT_SET_VA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), xpmap_ptom(_npte)); \ + xpq_queue_pt_update((pt_entry_t *)vtomach((_ptp)), \ + xpmap_ptom((_npte))); \ + if (sync || ALWAYS_SYNC) \ + mcl_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_SET_VA_MA(_ptp,_npte,sync) do { \ + PMAP_REF((_ptp), (_npte)); \ + xpq_queue_pt_update((pt_entry_t *)vtomach((_ptp)), (_npte)); \ + if (sync || ALWAYS_SYNC) \ + mcl_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR_VA(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(_ptp), 0); \ + xpq_queue_pt_update((pt_entry_t *)vtomach(_ptp), 0); \ + if (sync || ALWAYS_SYNC) \ + mcl_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_CLEAR(_ptp, sync) do { \ + PMAP_REF((pt_entry_t *)(vtopte(_ptp)), 0); \ + mcl_queue_pt_update((unsigned long)_ptp, 0); \ + if (sync || ALWAYS_SYNC) \ + mcl_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_SET_MA(_va,_ma,sync) do { \ + PMAP_REF(vtopte((unsigned long)_va), (_ma)); \ + mcl_queue_pt_update((vm_offset_t )(_va), (_ma)); \ + if (sync || ALWAYS_SYNC) \ + mcl_flush_queue(); \ +} while (/*CONSTCOND*/0) +#define PT_SET(_va,_pa,sync) do { \ + PMAP_REF((pt_entry_t *)(vtopte(_va)), xpmap_ptom(_pa)); \ + mcl_queue_pt_update((vm_offset_t)(_va), \ + xpmap_ptom((_pa))); \ + if (sync || ALWAYS_SYNC) \ + mcl_flush_queue(); \ +} while (/*CONSTCOND*/0) + + + +#define PT_UPDATES_FLUSH() do { \ + mcl_flush_queue(); \ +} while (/*CONSTCOND*/0) + + +static __inline uint32_t +xpmap_mtop(uint32_t mpa) +{ + return (((xen_machine_phys[(mpa >> PAGE_SHIFT)]) << PAGE_SHIFT) + | (mpa & ~PG_FRAME)); +} + +static __inline vm_paddr_t +xpmap_ptom(uint32_t ppa) +{ + return phystomach(ppa) | (ppa & ~PG_FRAME); +} + +#endif /* _XEN_XENPMAP_H_ */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/include/xenvar.h b/freebsd-5.3-xen-sparse/i386-xen/include/xenvar.h new file mode 100644 index 0000000000..5a3d3acb0b --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/include/xenvar.h @@ 
-0,0 +1,30 @@ +#ifndef XENVAR_H_ +#define XENVAR_H_ + +#define XBOOTUP 0x1 +#define XPMAP 0x2 +extern int xendebug_flags; +#ifndef NOXENDEBUG +#define XENPRINTF printk +#else +#define XENPRINTF(x...) +#endif +extern unsigned long *xen_phys_machine; +#define TRACE_ENTER XENPRINTF("(file=%s, line=%d) entered %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_EXIT XENPRINTF("(file=%s, line=%d) exiting %s\n", __FILE__, __LINE__, __FUNCTION__) +#define TRACE_DEBUG(argflags, _f, _a...) \ +if (xendebug_flags & argflags) XENPRINTF("(file=%s, line=%d) " _f "\n", __FILE__, __LINE__, ## _a); + +extern unsigned long *xen_machine_phys; +#define PTOM(i) (((unsigned long *)xen_phys_machine)[i]) +#define phystomach(pa) ((((unsigned long *)xen_phys_machine)[(pa >> PAGE_SHIFT)]) << PAGE_SHIFT) +void xpq_init(void); + +struct sockaddr_in; + +int xen_setnfshandle(void); +int setinaddr(struct sockaddr_in *addr, char *ipstr); + +#define RB_GDB_PAUSE RB_RESERVED1 + +#endif diff --git a/freebsd-5.3-xen-sparse/i386-xen/xen/blkfront/xb_blkfront.c b/freebsd-5.3-xen-sparse/i386-xen/xen/blkfront/xb_blkfront.c new file mode 100644 index 0000000000..66c80f3ece --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/xen/blkfront/xb_blkfront.c @@ -0,0 +1,925 @@ +/*- + * All rights reserved. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * XenoBSD block device driver + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <sys/bio.h> +#include <sys/bus.h> +#include <sys/conf.h> + +#include <machine/bus.h> +#include <sys/rman.h> +#include <machine/resource.h> +#include <machine/intr_machdep.h> +#include <machine/vmparam.h> + +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs.h> +#include <machine/xen-os.h> +#include <machine/xen_intr.h> +#include <machine/evtchn.h> + +#include <geom/geom_disk.h> +#include <machine/ctrl_if.h> +#include <machine/xenfunc.h> + +/* prototypes */ +struct xb_softc; +static void xb_startio(struct xb_softc *sc); +static void xb_vbdinit(void); +static void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp); +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id); + +struct xb_softc { + device_t xb_dev; + struct disk xb_disk; /* disk params */ + struct bio_queue_head xb_bioq; /* sort queue */ + struct resource *xb_irq; + void *xb_resp_handler; + int xb_unit; + int xb_flags; +#define XB_OPEN (1<<0) /* drive is open (can't shut down) */ +}; + +/* Control whether runtime update of vbds is enabled. 
*/ +#define ENABLE_VBD_UPDATE 1 + +#if ENABLE_VBD_UPDATE +static void vbd_update(void); +#else +static void vbd_update(void){}; +#endif + +#define BLKIF_STATE_CLOSED 0 +#define BLKIF_STATE_DISCONNECTED 1 +#define BLKIF_STATE_CONNECTED 2 + +static char *blkif_state_name[] = { + [BLKIF_STATE_CLOSED] = "closed", + [BLKIF_STATE_DISCONNECTED] = "disconnected", + [BLKIF_STATE_CONNECTED] = "connected", +}; + +static char * blkif_status_name[] = { + [BLKIF_INTERFACE_STATUS_CLOSED] = "closed", + [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", + [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected", + [BLKIF_INTERFACE_STATUS_CHANGED] = "changed", +}; + +#define WPRINTK(fmt, args...) printk("[XEN] " fmt, ##args) + +static int blkif_handle; +static unsigned int blkif_state = BLKIF_STATE_CLOSED; +static unsigned int blkif_evtchn; +static unsigned int blkif_irq; + +static int blkif_control_rsp_valid; +static blkif_response_t blkif_control_rsp; + +static unsigned long xb_rec_ring_free; +blkif_request_t xb_rec_ring[BLKIF_RING_SIZE]; /* shadow recovery ring */ + +/* XXX move to xb_vbd.c when VBD update support is added */ +#define MAX_VBDS 64 +static vdisk_t xb_diskinfo[MAX_VBDS]; +static int xb_ndisks; + +#define XBD_SECTOR_SIZE 512 /* XXX: assume for now */ +#define XBD_SECTOR_SHFT 9 + +static unsigned int xb_kick_pending; + +static struct mtx blkif_io_lock; + +static blkif_ring_t *xb_blk_ring; +static BLKIF_RING_IDX xb_resp_cons; /* Response consumer for comms ring. */ +static BLKIF_RING_IDX xb_req_prod; /* Private request producer */ + +static int xb_recovery = 0; /* "Recovery in progress" flag. Protected + * by the blkif_io_lock */ + +/* We plug the I/O ring if the driver is suspended or if the ring is full. 
*/ +#define BLKIF_RING_FULL (((xb_req_prod - xb_resp_cons) == BLKIF_RING_SIZE) || \ + (blkif_state != BLKIF_STATE_CONNECTED)) + +void blkif_completion(blkif_request_t *req); +void xb_response_intr(void *); + +/* XXX: This isn't supported in FreeBSD, so ignore it for now. */ +#define TASK_UNINTERRUPTIBLE 0 + +static inline int +GET_ID_FROM_FREELIST( void ) +{ + unsigned long free = xb_rec_ring_free; + + KASSERT(free <= BLKIF_RING_SIZE, ("free %lu > BLKIF_RING_SIZE", free)); + + xb_rec_ring_free = xb_rec_ring[free].id; + + xb_rec_ring[free].id = 0x0fffffee; /* debug */ + + return free; +} + +static inline void +ADD_ID_TO_FREELIST( unsigned long id ) +{ + xb_rec_ring[id].id = xb_rec_ring_free; + xb_rec_ring_free = id; +} + +static inline void translate_req_to_pfn(blkif_request_t *xreq, + blkif_request_t *req) +{ + int i; + + xreq->operation = req->operation; + xreq->nr_segments = req->nr_segments; + xreq->device = req->device; + /* preserve id */ + xreq->sector_number = req->sector_number; + + for ( i = 0; i < req->nr_segments; i++ ){ + xreq->frame_and_sects[i] = xpmap_mtop(req->frame_and_sects[i]); + } +} + +static inline void translate_req_to_mfn(blkif_request_t *xreq, + blkif_request_t *req) +{ + int i; + + xreq->operation = req->operation; + xreq->nr_segments = req->nr_segments; + xreq->device = req->device; + xreq->id = req->id; /* copy id (unlike above) */ + xreq->sector_number = req->sector_number; + + for ( i = 0; i < req->nr_segments; i++ ){ + xreq->frame_and_sects[i] = xpmap_ptom(req->frame_and_sects[i]); + } +} + + +static inline void flush_requests(void) +{ + xb_blk_ring->req_prod = xb_req_prod; + notify_via_evtchn(blkif_evtchn); +} + + +#if ENABLE_VBD_UPDATE +static void vbd_update() +{ + XENPRINTF(">\n"); + XENPRINTF("<\n"); +} +#endif /* ENABLE_VBD_UPDATE */ + +void +xb_response_intr(void *xsc) +{ + struct xb_softc *sc = NULL; + struct bio *bp; + blkif_response_t *bret; + BLKIF_RING_IDX i, rp; + unsigned long flags; + + if (blkif_state == 
BLKIF_STATE_CLOSED) + return; + + mtx_lock_irqsave(&blkif_io_lock, flags); + + if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || + unlikely(xb_recovery) ) { + mtx_unlock_irqrestore(&blkif_io_lock, flags); + return; + } + + rp = xb_blk_ring->resp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. */ + + /* sometimes we seem to lose i/o. stay in the interrupt handler while + * there is stuff to process: continually recheck the response producer. + */ + for ( i = xb_resp_cons; i != (rp = xb_blk_ring->resp_prod); i++ ) { + unsigned long id; + bret = &xb_blk_ring->ring[MASK_BLKIF_IDX(i)].resp; + + id = bret->id; + bp = (struct bio *)xb_rec_ring[id].id; + + blkif_completion(&xb_rec_ring[id]); + + ADD_ID_TO_FREELIST(id); /* overwrites req */ + + switch ( bret->operation ) { + case BLKIF_OP_READ: + /* had an unaligned buffer that needs to be copied */ + if (bp->bio_driver1) + bcopy(bp->bio_data, bp->bio_driver1, bp->bio_bcount); + case BLKIF_OP_WRITE: + + /* free the copy buffer */ + if (bp->bio_driver1) { + free(bp->bio_data, M_DEVBUF); + bp->bio_data = bp->bio_driver1; + bp->bio_driver1 = NULL; + } + + if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) { + XENPRINTF("Bad return from blkdev data request: %x\n", + bret->status); + bp->bio_flags |= BIO_ERROR; + } + + sc = (struct xb_softc *)bp->bio_disk->d_drv1; + + if (bp->bio_flags & BIO_ERROR) + bp->bio_error = EIO; + else + bp->bio_resid = 0; + + biodone(bp); + break; + case BLKIF_OP_PROBE: + memcpy(&blkif_control_rsp, bret, sizeof(*bret)); + blkif_control_rsp_valid = 1; + break; + default: + panic("received invalid operation"); + break; + } + } + + xb_resp_cons = i; + + if (sc && xb_kick_pending) { + xb_kick_pending = FALSE; + xb_startio(sc); + } + + mtx_unlock_irqrestore(&blkif_io_lock, flags); +} + +static int +xb_open(struct disk *dp) +{ + struct xb_softc *sc = (struct xb_softc *)dp->d_drv1; + + if (sc == NULL) { + printk("xb%d: not found", sc->xb_unit); + return (ENXIO); + } + + /* block dev not active */ 
+ if (blkif_state != BLKIF_STATE_CONNECTED) { + printk("xb%d: bad state: %dn", sc->xb_unit, blkif_state); + return(ENXIO); + } + + sc->xb_flags |= XB_OPEN; + return (0); +} + +static int +xb_close(struct disk *dp) +{ + struct xb_softc *sc = (struct xb_softc *)dp->d_drv1; + + if (sc == NULL) + return (ENXIO); + sc->xb_flags &= ~XB_OPEN; + return (0); +} + +static int +xb_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) +{ + struct xb_softc *sc = (struct xb_softc *)dp->d_drv1; + + TRACE_ENTER; + + if (sc == NULL) + return (ENXIO); + + return (ENOTTY); +} + +/* + * Dequeue buffers and place them in the shared communication ring. + * Return when no more requests can be accepted or all buffers have + * been queued. + * + * Signal XEN once the ring has been filled out. + */ +static void +xb_startio(struct xb_softc *sc) +{ + struct bio *bp; + unsigned long buffer_ma; + blkif_request_t *req; + int s, queued = 0; + unsigned long id; + unsigned int fsect, lsect; + + + if (unlikely(blkif_state != BLKIF_STATE_CONNECTED)) + return; + + s = splbio(); + + for (bp = bioq_first(&sc->xb_bioq); + bp && !BLKIF_RING_FULL; + xb_req_prod++, queued++, bp = bioq_first(&sc->xb_bioq)) { + + /* Check if the buffer is properly aligned */ + if ((vm_offset_t)bp->bio_data & PAGE_MASK) { + int align = (bp->bio_bcount < PAGE_SIZE/2) ? 
XBD_SECTOR_SIZE : + PAGE_SIZE; + caddr_t newbuf = malloc(bp->bio_bcount + align, M_DEVBUF, + M_WAITOK); + caddr_t alignbuf = (char *)roundup2((u_long)newbuf, align); + + /* save a copy of the current buffer */ + bp->bio_driver1 = bp->bio_data; + + /* Copy the data for a write */ + if (bp->bio_cmd == BIO_WRITE) + bcopy(bp->bio_data, alignbuf, bp->bio_bcount); + bp->bio_data = alignbuf; + } + + bioq_remove(&sc->xb_bioq, bp); + buffer_ma = vtomach(bp->bio_data); + fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; + lsect = fsect + (bp->bio_bcount >> XBD_SECTOR_SHFT) - 1; + + KASSERT((buffer_ma & (XBD_SECTOR_SIZE-1)) == 0, + ("XEN buffer must be sector aligned")); + KASSERT(lsect <= 7, + ("XEN disk driver data cannot cross a page boundary")); + + buffer_ma &= ~PAGE_MASK; + + /* Fill out a communications ring structure. */ + req = &xb_blk_ring->ring[MASK_BLKIF_IDX(xb_req_prod)].req; + id = GET_ID_FROM_FREELIST(); + xb_rec_ring[id].id= (unsigned long)bp; + + req->id = id; + req->operation = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ : + BLKIF_OP_WRITE; + + req->sector_number= (blkif_sector_t)bp->bio_pblkno; + req->device = xb_diskinfo[sc->xb_unit].device; + + req->nr_segments = 1; /* not doing scatter/gather since buffer + * chaining is not supported. + */ + /* + * upper bits represent the machine address of the buffer and the + * lower bits is the number of sectors to be read/written. + */ + req->frame_and_sects[0] = buffer_ma | (fsect << 3) | lsect; + + /* Keep a private copy so we can reissue requests when recovering. */ + translate_req_to_pfn( &xb_rec_ring[id], req); + + } + + if (BLKIF_RING_FULL) + xb_kick_pending = TRUE; + + if (queued != 0) + flush_requests(); + splx(s); +} + +/* + * Read/write routine for a buffer. Finds the proper unit, place it on + * the sortq and kick the controller. + */ +static void +xb_strategy(struct bio *bp) +{ + struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1; + int s; + + /* bogus disk? 
*/ + if (sc == NULL) { + bp->bio_error = EINVAL; + bp->bio_flags |= BIO_ERROR; + goto bad; + } + + s = splbio(); + /* + * Place it in the queue of disk activities for this disk + */ + bioq_disksort(&sc->xb_bioq, bp); + splx(s); + + xb_startio(sc); + return; + + bad: + /* + * Correctly set the bio to indicate a failed tranfer. + */ + bp->bio_resid = bp->bio_bcount; + biodone(bp); + return; +} + + +static int +xb_create(int unit) +{ + struct xb_softc *sc; + int error = 0; + + sc = (struct xb_softc *)malloc(sizeof(*sc), M_DEVBUF, M_WAITOK); + sc->xb_unit = unit; + + memset(&sc->xb_disk, 0, sizeof(sc->xb_disk)); + sc->xb_disk.d_unit = unit; + sc->xb_disk.d_open = xb_open; + sc->xb_disk.d_close = xb_close; + sc->xb_disk.d_ioctl = xb_ioctl; + sc->xb_disk.d_strategy = xb_strategy; + sc->xb_disk.d_name = "xbd"; + sc->xb_disk.d_drv1 = sc; + sc->xb_disk.d_sectorsize = XBD_SECTOR_SIZE; + sc->xb_disk.d_mediasize = xb_diskinfo[sc->xb_unit].capacity + << XBD_SECTOR_SHFT; +#if 0 + sc->xb_disk.d_maxsize = DFLTPHYS; +#else /* XXX: xen can't handle large single i/o requests */ + sc->xb_disk.d_maxsize = 4096; +#endif + + XENPRINTF("attaching device 0x%x unit %d capacity %llu\n", + xb_diskinfo[sc->xb_unit].device, sc->xb_unit, + sc->xb_disk.d_mediasize); + + disk_create(&sc->xb_disk, DISK_VERSION_00); + bioq_init(&sc->xb_bioq); + + return error; +} + +/* XXX move to xb_vbd.c when vbd update support is added */ +static void +xb_vbdinit(void) +{ + int i; + blkif_request_t req; + blkif_response_t rsp; + vdisk_t *buf; + + TRACE_ENTER; + + buf = (vdisk_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + + /* Probe for disk information. 
*/ + memset(&req, 0, sizeof(req)); + req.operation = BLKIF_OP_PROBE; + req.nr_segments = 1; + req.frame_and_sects[0] = vtomach(buf) | 7; + blkif_control_send(&req, &rsp); + + if ( rsp.status <= 0 ) { + printk("xb_identify: Could not identify disks (%d)\n", rsp.status); + free(buf, M_DEVBUF); + return; + } + + if ((xb_ndisks = rsp.status) > MAX_VBDS) + xb_ndisks = MAX_VBDS; + + memcpy(xb_diskinfo, buf, xb_ndisks * sizeof(vdisk_t)); + + for (i = 0; i < xb_ndisks; i++) + xb_create(i); + + free(buf, M_DEVBUF); +} + + +/***************************** COMMON CODE *******************************/ + +void +blkif_control_send(blkif_request_t *req, blkif_response_t *rsp) +{ + unsigned long flags, id; + + retry: + while ( (xb_req_prod - xb_resp_cons) == BLKIF_RING_SIZE ) { + tsleep( req, PWAIT | PCATCH, "blkif", hz); + } + + mtx_lock_irqsave(&blkif_io_lock, flags); + if ( (xb_req_prod - xb_resp_cons) == BLKIF_RING_SIZE ) + { + mtx_unlock_irqrestore(&blkif_io_lock, flags); + goto retry; + } + + xb_blk_ring->ring[MASK_BLKIF_IDX(xb_req_prod)].req = *req; + + id = GET_ID_FROM_FREELIST(); + xb_blk_ring->ring[MASK_BLKIF_IDX(xb_req_prod)].req.id = id; + xb_rec_ring[id].id = (unsigned long) req; + + translate_req_to_pfn( &xb_rec_ring[id], req ); + + xb_req_prod++; + flush_requests(); + + mtx_unlock_irqrestore(&blkif_io_lock, flags); + + while ( !blkif_control_rsp_valid ) + { + tsleep( &blkif_control_rsp_valid, PWAIT | PCATCH, "blkif", hz); + } + + memcpy(rsp, &blkif_control_rsp, sizeof(*rsp)); + blkif_control_rsp_valid = 0; +} + + +/* Send a driver status notification to the domain controller. */ +static void +send_driver_status(int ok) +{ + ctrl_msg_t cmsg = { + .type = CMSG_BLKIF_FE, + .subtype = CMSG_BLKIF_FE_DRIVER_STATUS, + .length = sizeof(blkif_fe_driver_status_t), + }; + blkif_fe_driver_status_t *msg = (void*)cmsg.msg; + + msg->status = (ok ? 
BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN); + + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} + +/* Tell the controller to bring up the interface. */ +static void +blkif_send_interface_connect(void) +{ + ctrl_msg_t cmsg = { + .type = CMSG_BLKIF_FE, + .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT, + .length = sizeof(blkif_fe_interface_connect_t), + }; + blkif_fe_interface_connect_t *msg = (void*)cmsg.msg; + + msg->handle = 0; + msg->shmem_frame = (vtomach(xb_blk_ring) >> PAGE_SHIFT); + + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} + +static void +blkif_free(void) +{ + + unsigned long flags; + + printk("[XEN] Recovering virtual block device driver\n"); + + /* Prevent new requests being issued until we fix things up. */ + mtx_lock_irqsave(&blkif_io_lock, flags); + xb_recovery = 1; + blkif_state = BLKIF_STATE_DISCONNECTED; + mtx_unlock_irqrestore(&blkif_io_lock, flags); + + /* Free resources associated with old device channel. */ + if (xb_blk_ring) { + free(xb_blk_ring, M_DEVBUF); + xb_blk_ring = NULL; + } + /* free_irq(blkif_irq, NULL);*/ + blkif_irq = 0; + + unbind_evtchn_from_irq(blkif_evtchn); + blkif_evtchn = 0; +} + +static void +blkif_close(void) +{ +} + +/* Move from CLOSED to DISCONNECTED state. */ +static void +blkif_disconnect(void) +{ + if (xb_blk_ring) free(xb_blk_ring, M_DEVBUF); + xb_blk_ring = (blkif_ring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + xb_blk_ring->req_prod = xb_blk_ring->resp_prod = 0; + xb_resp_cons = xb_req_prod = 0; + blkif_state = BLKIF_STATE_DISCONNECTED; + blkif_send_interface_connect(); +} + +static void +blkif_reset(void) +{ + printk("[XEN] Recovering virtual block device driver\n"); + blkif_free(); + blkif_disconnect(); +} + +static void +blkif_recover(void) +{ + + int i; + + /* Hmm, requests might be re-ordered when we re-issue them. + * This will need to be fixed once we have barriers */ + + /* Stage 1 : Find active and move to safety. 
*/ + for ( i = 0; i < BLKIF_RING_SIZE; i++ ) { + if ( xb_rec_ring[i].id >= KERNBASE ) { + translate_req_to_mfn( + &xb_blk_ring->ring[xb_req_prod].req, &xb_rec_ring[i]); + xb_req_prod++; + } + } + + printk("blkfront: recovered %d descriptors\n",xb_req_prod); + + /* Stage 2 : Set up shadow list. */ + for ( i = 0; i < xb_req_prod; i++ ) { + xb_rec_ring[i].id = xb_blk_ring->ring[i].req.id; + xb_blk_ring->ring[i].req.id = i; + translate_req_to_pfn(&xb_rec_ring[i], &xb_blk_ring->ring[i].req); + } + + /* Stage 3 : Set up free list. */ + for ( ; i < BLKIF_RING_SIZE; i++ ){ + xb_rec_ring[i].id = i+1; + } + xb_rec_ring_free = xb_req_prod; + xb_rec_ring[BLKIF_RING_SIZE-1].id = 0x0fffffff; + + /* xb_blk_ring->req_prod will be set when we flush_requests().*/ + wmb(); + + /* Switch off recovery mode, using a memory barrier to ensure that + * it's seen before we flush requests - we don't want to miss any + * interrupts. */ + xb_recovery = 0; + wmb(); + + /* Kicks things back into life. */ + flush_requests(); + + /* Now safe to left other peope use interface. */ + blkif_state = BLKIF_STATE_CONNECTED; +} + +static void +blkif_connect(blkif_fe_interface_status_t *status) +{ + int err = 0; + + blkif_evtchn = status->evtchn; + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); + + err = intr_add_handler("xbd", blkif_irq, + (driver_intr_t *)xb_response_intr, NULL, + INTR_TYPE_BIO | INTR_MPSAFE, NULL); + if(err){ + printk("[XEN] blkfront request_irq failed (err=%d)\n", err); + return; + } + + if ( xb_recovery ) { + blkif_recover(); + } else { + /* Probe for discs attached to the interface. */ + xb_vbdinit(); + + /* XXX: transition state after probe */ + blkif_state = BLKIF_STATE_CONNECTED; + } + + /* Kick pending requests. 
*/ +#if 0 /* XXX: figure out sortq logic */ + mtx_lock_irq(&blkif_io_lock); + kick_pending_request_queues(); + mtx_unlock_irq(&blkif_io_lock); +#endif +} + +static void +unexpected(blkif_fe_interface_status_t *status) +{ + WPRINTK(" Unexpected blkif status %s in state %s\n", + blkif_status_name[status->status], + blkif_state_name[blkif_state]); +} + +static void +blkif_status(blkif_fe_interface_status_t *status) +{ + if (status->handle != blkif_handle) { + WPRINTK(" Invalid blkif: handle=%u", status->handle); + return; + } + + switch (status->status) { + + case BLKIF_INTERFACE_STATUS_CLOSED: + switch(blkif_state){ + case BLKIF_STATE_CLOSED: + unexpected(status); + break; + case BLKIF_STATE_DISCONNECTED: + case BLKIF_STATE_CONNECTED: + unexpected(status); + blkif_close(); + break; + } + break; + + case BLKIF_INTERFACE_STATUS_DISCONNECTED: + switch(blkif_state){ + case BLKIF_STATE_CLOSED: + blkif_disconnect(); + break; + case BLKIF_STATE_DISCONNECTED: + case BLKIF_STATE_CONNECTED: + unexpected(status); + blkif_reset(); + break; + } + break; + + case BLKIF_INTERFACE_STATUS_CONNECTED: + switch(blkif_state){ + case BLKIF_STATE_CLOSED: + unexpected(status); + blkif_disconnect(); + blkif_connect(status); + break; + case BLKIF_STATE_DISCONNECTED: + blkif_connect(status); + break; + case BLKIF_STATE_CONNECTED: + unexpected(status); + blkif_connect(status); + break; + } + break; + + case BLKIF_INTERFACE_STATUS_CHANGED: + switch(blkif_state){ + case BLKIF_STATE_CLOSED: + case BLKIF_STATE_DISCONNECTED: + unexpected(status); + break; + case BLKIF_STATE_CONNECTED: + vbd_update(); + break; + } + break; + + default: + WPRINTK("Invalid blkif status: %d\n", status->status); + break; + } +} + + +static void +blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_BLKIF_FE_INTERFACE_STATUS: + if ( msg->length != sizeof(blkif_fe_interface_status_t) ) + goto parse_error; + blkif_status((blkif_fe_interface_status_t *) + &msg->msg[0]); + break; + 
default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + +static int +wait_for_blkif(void) +{ + int err = 0; + int i; + send_driver_status(1); + + /* + * We should read 'nr_interfaces' from response message and wait + * for notifications before proceeding. For now we assume that we + * will be notified of exactly one interface. + */ + for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*hz); i++ ) + { + tsleep(&blkif_state, PWAIT | PCATCH, "blkif", hz); + } + + if (blkif_state != BLKIF_STATE_CONNECTED){ + printk("[XEN] Timeout connecting block device driver!\n"); + err = -ENOSYS; + } + return err; +} + + +static void +xb_init(void *unused) +{ + int i; + + printk("[XEN] Initialising virtual block device driver\n"); + + xb_rec_ring_free = 0; + for (i = 0; i < BLKIF_RING_SIZE; i++) { + xb_rec_ring[i].id = i+1; + } + xb_rec_ring[BLKIF_RING_SIZE-1].id = 0x0fffffff; + + (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, 0); + + wait_for_blkif(); +} + +#if 0 /* XXX not yet */ +void +blkdev_suspend(void) +{ +} + +void +blkdev_resume(void) +{ + send_driver_status(1); +} +#endif + +/* XXXXX THIS IS A TEMPORARY FUNCTION UNTIL WE GET GRANT TABLES */ + +void +blkif_completion(blkif_request_t *req) +{ + int i; + + switch ( req->operation ) + { + case BLKIF_OP_READ: + for ( i = 0; i < req->nr_segments; i++ ) + { + unsigned long pfn = req->frame_and_sects[i] >> PAGE_SHIFT; + unsigned long mfn = xen_phys_machine[pfn]; + xen_machphys_update(mfn, pfn); + } + break; + } + +} +MTX_SYSINIT(ioreq, &blkif_io_lock, "BIO LOCK", MTX_SPIN); +SYSINIT(xbdev, SI_SUB_PSEUDO, SI_ORDER_ANY, xb_init, NULL) diff --git a/freebsd-5.3-xen-sparse/i386-xen/xen/char/console.c b/freebsd-5.3-xen-sparse/i386-xen/xen/char/console.c new file mode 100644 index 0000000000..7ea8e3eb4f --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/xen/char/console.c @@ -0,0 +1,536 @@ +#include <sys/cdefs.h> + + 
+#include <sys/param.h> +#include <sys/module.h> +#include <sys/systm.h> +#include <sys/consio.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/tty.h> +#include <sys/systm.h> +#include <sys/taskqueue.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <machine/stdarg.h> +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/ctrl_if.h> +#include <sys/cons.h> + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> +#endif + +static char driver_name[] = "xc"; +devclass_t xc_devclass; +static void xcstart (struct tty *); +static int xcparam (struct tty *, struct termios *); +static void xcstop (struct tty *, int); +static void xc_timeout(void *); +static void xencons_tx_flush_task_routine(void *,int ); +static void __xencons_tx_flush(void); +static void xencons_rx(ctrl_msg_t *msg,unsigned long id); +static boolean_t xcons_putc(int c); + +/* switch console so that shutdown can occur gracefully */ +static void xc_shutdown(void *arg, int howto); +static int xc_mute; + +void xcons_force_flush(void); + +static cn_probe_t xccnprobe; +static cn_init_t xccninit; +static cn_getc_t xccngetc; +static cn_putc_t xccnputc; +static cn_checkc_t xccncheckc; + +#define XC_POLLTIME (hz/10) + +CONS_DRIVER(xc, xccnprobe, xccninit, NULL, xccngetc, + xccncheckc, xccnputc, NULL); + +static int xen_console_up; +static boolean_t xc_tx_task_queued; +static boolean_t xc_start_needed; +static struct callout xc_callout; +struct mtx cn_mtx; + +#define RBUF_SIZE 1024 +#define RBUF_MASK(_i) ((_i)&(RBUF_SIZE-1)) +#define WBUF_SIZE 4096 +#define WBUF_MASK(_i) ((_i)&(WBUF_SIZE-1)) +static char wbuf[WBUF_SIZE]; +static char rbuf[RBUF_SIZE]; +static int rc, rp; +static int cnsl_evt_reg; +static unsigned int wc, wp; /* write_cons, write_prod */ +static struct task xencons_tx_flush_task = { {NULL},0,0,&xencons_tx_flush_task_routine,NULL }; + + +#define CDEV_MAJOR 12 +#define XCUNIT(x) (minor(x)) +#define ISTTYOPEN(tp) ((tp) && 
((tp)->t_state & TS_ISOPEN)) +#define CN_LOCK_INIT(x, _name) \ + mtx_init(&x, _name, _name, MTX_SPIN) +#define CN_LOCK(l, f) mtx_lock_irqsave(&(l), (f)) +#define CN_UNLOCK(l, f) mtx_unlock_irqrestore(&(l), (f)) +#define CN_LOCK_ASSERT(x) mtx_assert(&x, MA_OWNED) +#define CN_LOCK_DESTROY(x) mtx_destroy(&x) + + +static struct tty *xccons; + +struct xc_softc { + int xc_unit; + struct cdev *xc_dev; +}; + + +static d_open_t xcopen; +static d_close_t xcclose; +static d_ioctl_t xcioctl; + +static struct cdevsw xc_cdevsw = { + /* version */ D_VERSION_00, + /* maj */ CDEV_MAJOR, + /* flags */ D_TTY | D_NEEDGIANT, + /* name */ driver_name, + + /* open */ xcopen, + /* fdopen */ 0, + /* close */ xcclose, + /* read */ ttyread, + /* write */ ttywrite, + /* ioctl */ xcioctl, + /* poll */ ttypoll, + /* mmap */ 0, + /* strategy */ 0, + /* dump */ 0, + /* kqfilter */ ttykqfilter +}; + +static void +xccnprobe(struct consdev *cp) +{ + cp->cn_pri = CN_REMOTE; + cp->cn_tp = xccons; + sprintf(cp->cn_name, "%s0", driver_name); +} + + +static void +xccninit(struct consdev *cp) +{ + CN_LOCK_INIT(cn_mtx,"XCONS LOCK"); + +} +int +xccngetc(struct consdev *dev) +{ + int c; + if (xc_mute) + return 0; + do { + if ((c = xccncheckc(dev)) == -1) { + /* polling without sleeping in Xen doesn't work well. + * Sleeping gives other things like clock a chance to + * run + */ + tsleep(&cn_mtx, PWAIT | PCATCH, "console sleep", + XC_POLLTIME); + } + } while( c == -1 ); + return c; +} + +int +xccncheckc(struct consdev *dev) +{ + int ret = (xc_mute ? 
0 : -1); + int flags; + CN_LOCK(cn_mtx, flags); + if ( (rp - rc) ){ + /* we need to return only one char */ + ret = (int)rbuf[RBUF_MASK(rc)]; + rc++; + } + CN_UNLOCK(cn_mtx, flags); + return(ret); +} + +static void +xccnputc(struct consdev *dev, int c) +{ + int flags; + CN_LOCK(cn_mtx, flags); + xcons_putc(c); + CN_UNLOCK(cn_mtx, flags); +} + +static boolean_t +xcons_putc(int c) +{ + int force_flush = xc_mute || +#ifdef DDB + db_active || +#endif + panicstr; /* we're not gonna recover, so force + * flush + */ + + if ( (wp-wc) < (WBUF_SIZE-1) ){ + if ( (wbuf[WBUF_MASK(wp++)] = c) == '\n' ) { + wbuf[WBUF_MASK(wp++)] = '\r'; + if (force_flush) + xcons_force_flush(); + } + } else if (force_flush) { + xcons_force_flush(); + + } + if (cnsl_evt_reg) + __xencons_tx_flush(); + + /* inform start path that we're pretty full */ + return ((wp - wc) >= WBUF_SIZE - 100) ? TRUE : FALSE; +} + +static void +xc_identify(driver_t *driver, device_t parent) +{ + device_t child; + child = BUS_ADD_CHILD(parent, 0, driver_name, 0); + device_set_driver(child, driver); + device_set_desc(child, "Xen Console"); +} + +static int +xc_probe(device_t dev) +{ + struct xc_softc *sc = (struct xc_softc *)device_get_softc(dev); + + sc->xc_unit = device_get_unit(dev); + return (0); +} + +static int +xc_attach(device_t dev) +{ + struct xc_softc *sc = (struct xc_softc *)device_get_softc(dev); + + sc->xc_dev = make_dev(&xc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "xc%r", 0); + xccons = ttymalloc(NULL); + + sc->xc_dev->si_drv1 = (void *)sc; + sc->xc_dev->si_tty = xccons; + + xccons->t_oproc = xcstart; + xccons->t_param = xcparam; + xccons->t_stop = xcstop; + xccons->t_dev = sc->xc_dev; + + callout_init(&xc_callout, 0); + + /* Ensure that we don't attach before the event channel is able to receive + * a registration. The XenBus code delays the probe/attach order until + * this has occurred. 
+ */ + (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0); + cnsl_evt_reg = 1; + + callout_reset(&xc_callout, XC_POLLTIME, xc_timeout, xccons); + + /* register handler to flush console on shutdown */ + if ((EVENTHANDLER_REGISTER(shutdown_post_sync, xc_shutdown, + NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) + printf("xencons: shutdown event registration failed!\n"); + + return (0); +} + +/* + * return 0 for all console input, force flush all output. + */ +static void +xc_shutdown(void *arg, int howto) +{ + xc_mute = 1; + xcons_force_flush(); + +} + +static void +xencons_rx(ctrl_msg_t *msg,unsigned long id) +{ + int i, flags; + struct tty *tp = xccons; + + CN_LOCK(cn_mtx, flags); + for ( i = 0; i < msg->length; i++ ) { + if ( xen_console_up ) + (*linesw[tp->t_line]->l_rint)(msg->msg[i], tp); + else + rbuf[RBUF_MASK(rp++)] = msg->msg[i]; + } + CN_UNLOCK(cn_mtx, flags); + msg->length = 0; + ctrl_if_send_response(msg); +} + +static void +__xencons_tx_flush(void) +{ + int sz, work_done = 0; + ctrl_msg_t msg; + + while ( wc != wp ) + { + sz = wp - wc; + if ( sz > sizeof(msg.msg) ) + sz = sizeof(msg.msg); + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = sz; + memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ){ + wc += sz; + } + else if (xc_tx_task_queued) { + /* avoid the extra enqueue check if we know we're already queued */ + break; + } else if (ctrl_if_enqueue_space_callback(&xencons_tx_flush_task)) { + xc_tx_task_queued = TRUE; + break; + } + + work_done = 1; + } + + if ( work_done && xen_console_up ) + ttwakeup(xccons); +} +static void +xencons_tx_flush_task_routine(void * data, int arg) +{ + int flags; + CN_LOCK(cn_mtx, flags); + xc_tx_task_queued = FALSE; + __xencons_tx_flush(); + CN_UNLOCK(cn_mtx, flags); +} + +int +xcopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct xc_softc *sc; + 
int unit = XCUNIT(dev); + struct tty *tp; + int s, error; + + sc = (struct xc_softc *)device_get_softc( + devclass_get_device(xc_devclass, unit)); + if (sc == NULL) + return (ENXIO); + + tp = dev->si_tty; + s = spltty(); + if (!ISTTYOPEN(tp)) { + tp->t_state |= TS_CARR_ON; + ttychars(tp); + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_cflag = TTYDEF_CFLAG|CLOCAL; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + xcparam(tp, &tp->t_termios); + ttsetwater(tp); + } else if (tp->t_state & TS_XCLUDE && suser(td)) { + splx(s); + return (EBUSY); + } + splx(s); + + xen_console_up = 1; + + error = (*linesw[tp->t_line]->l_open)(dev, tp); + + return error; +} + +int +xcclose(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct tty *tp = dev->si_tty; + + if (tp == NULL) + return (0); + xen_console_up = 0; + + spltty(); + (*linesw[tp->t_line]->l_close)(tp, flag); + tty_close(tp); + spl0(); + return (0); +} + + +int +xcioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) +{ + struct tty *tp = dev->si_tty; + int error; + + error = (*linesw[tp->t_line]->l_ioctl)(tp, cmd, data, flag, td); + if (error != ENOIOCTL) + return (error); + error = ttioctl(tp, cmd, data, flag); + if (error != ENOIOCTL) + return (error); + return (ENOTTY); +} + +static inline int +__xencons_put_char(int ch) +{ + char _ch = (char)ch; + if ( (wp - wc) == WBUF_SIZE ) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; +} + + +static void +xcstart(struct tty *tp) +{ + int flags; + int s; + boolean_t cons_full = FALSE; + + s = spltty(); + CN_LOCK(cn_mtx, flags); + if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { + ttwwakeup(tp); + CN_UNLOCK(cn_mtx, flags); + return; + } + + tp->t_state |= TS_BUSY; + while (tp->t_outq.c_cc != 0 && !cons_full) + cons_full = xcons_putc(getc(&tp->t_outq)); + + /* if the console is close to full leave our state as busy */ + if (!cons_full) { + tp->t_state &= ~TS_BUSY; + ttwwakeup(tp); + } else 
{ + /* let the timeout kick us in a bit */ + xc_start_needed = TRUE; + } + CN_UNLOCK(cn_mtx, flags); + splx(s); +} + +static void +xcstop(struct tty *tp, int flag) +{ + + if (tp->t_state & TS_BUSY) { + if ((tp->t_state & TS_TTSTOP) == 0) { + tp->t_state |= TS_FLUSH; + } + } +} + +static void +xc_timeout(void *v) +{ + struct tty *tp; + int c; + + tp = (struct tty *)v; + + while ((c = xccncheckc(NULL)) != -1) { + if (tp->t_state & TS_ISOPEN) { + (*linesw[tp->t_line]->l_rint)(c, tp); + } + } + + if (xc_start_needed) { + xc_start_needed = FALSE; + xcstart(tp); + } + + callout_reset(&xc_callout, XC_POLLTIME, xc_timeout, tp); +} + +/* + * Set line parameters. + */ +int +xcparam(struct tty *tp, struct termios *t) +{ + tp->t_ispeed = t->c_ispeed; + tp->t_ospeed = t->c_ospeed; + tp->t_cflag = t->c_cflag; + return (0); +} + + +static device_method_t xc_methods[] = { + DEVMETHOD(device_identify, xc_identify), + DEVMETHOD(device_probe, xc_probe), + DEVMETHOD(device_attach, xc_attach), + {0, 0} +}; + +static driver_t xc_driver = { + driver_name, + xc_methods, + sizeof(struct xc_softc), +}; + +/*** Forcibly flush console data before dying. ***/ +void +xcons_force_flush(void) +{ + ctrl_msg_t msg; + int sz; + + /* + * We use dangerous control-interface functions that require a quiescent + * system and no interrupts. Try to ensure this with a global cli(). + */ + cli(); + + /* Spin until console data is flushed through to the domain controller. */ + while ( (wc != wp) && !ctrl_if_transmitter_empty() ) + { + /* Interrupts are disabled -- we must manually reap responses. 
*/ + ctrl_if_discard_responses(); + + if ( (sz = wp - wc) == 0 ) + continue; + if ( sz > sizeof(msg.msg) ) + sz = sizeof(msg.msg); + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = sz; + memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + wc += sz; + } +} + +DRIVER_MODULE(xc, xenbus, xc_driver, xc_devclass, 0, 0); diff --git a/freebsd-5.3-xen-sparse/i386-xen/xen/misc/evtchn_dev.c b/freebsd-5.3-xen-sparse/i386-xen/xen/misc/evtchn_dev.c new file mode 100644 index 0000000000..de379b6bf9 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/xen/misc/evtchn_dev.c @@ -0,0 +1,410 @@ +/****************************************************************************** + * evtchn.c + * + * Xenolinux driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004, K A Fraser + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/uio.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/selinfo.h> +#include <sys/poll.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/ioccom.h> + +#include <machine/cpufunc.h> +#include <machine/intr_machdep.h> +#include <machine/xen-os.h> +#include <machine/xen_intr.h> +#include <machine/bus.h> +#include <sys/rman.h> +#include <machine/resource.h> +#include <machine/synch_bitops.h> + +#include <machine/hypervisor.h> + + +typedef struct evtchn_sotfc { + + struct selinfo ev_rsel; +} evtchn_softc_t; + + +#ifdef linuxcrap +/* NB. This must be shared amongst drivers if more things go in /dev/xen */ +static devfs_handle_t xen_dev_dir; +#endif + +/* Only one process may open /dev/xen/evtchn at any time. */ +static unsigned long evtchn_dev_inuse; + +/* Notification ring, accessed via /dev/xen/evtchn. 
*/ +#define RING_SIZE 2048 /* 2048 16-bit entries */ +#define RING_MASK(_i) ((_i)&(RING_SIZE-1)) +static uint16_t *ring; +static unsigned int ring_cons, ring_prod, ring_overflow; + +/* Which ports is user-space bound to? */ +static uint32_t bound_ports[32]; + +/* Unique address for processes to sleep on */ +static void *evtchn_waddr = ˚ + +static struct mtx lock, upcall_lock; + +static d_read_t evtchn_read; +static d_write_t evtchn_write; +static d_ioctl_t evtchn_ioctl; +static d_poll_t evtchn_poll; +static d_open_t evtchn_open; +static d_close_t evtchn_close; + + +void +evtchn_device_upcall(int port) +{ + mtx_lock(&upcall_lock); + + mask_evtchn(port); + clear_evtchn(port); + + if ( ring != NULL ) { + if ( (ring_prod - ring_cons) < RING_SIZE ) { + ring[RING_MASK(ring_prod)] = (uint16_t)port; + if ( ring_cons == ring_prod++ ) { + wakeup(evtchn_waddr); + } + } + else { + ring_overflow = 1; + } + } + + mtx_unlock(&upcall_lock); +} + +static void +__evtchn_reset_buffer_ring(void) +{ + /* Initialise the ring to empty. Clear errors. */ + ring_cons = ring_prod = ring_overflow = 0; +} + +static int +evtchn_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + int rc; + unsigned int count, c, p, sst = 0, bytes1 = 0, bytes2 = 0; + count = uio->uio_resid; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + for ( ; ; ) { + if ( (c = ring_cons) != (p = ring_prod) ) + break; + + if ( ring_overflow ) { + rc = EFBIG; + goto out; + } + + if (sst != 0) { + rc = EINTR; + goto out; + } + + /* PCATCH == check for signals before and after sleeping + * PWAIT == priority of waiting on resource + */ + sst = tsleep(evtchn_waddr, PWAIT|PCATCH, "evchwt", 10); + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. 
*/ + if ( ((c ^ p) & RING_SIZE) != 0 ) { + bytes1 = (RING_SIZE - RING_MASK(c)) * sizeof(uint16_t); + bytes2 = RING_MASK(p) * sizeof(uint16_t); + } + else { + bytes1 = (p - c) * sizeof(uint16_t); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if ( bytes1 > count ) { + bytes1 = count; + bytes2 = 0; + } + else if ( (bytes1 + bytes2) > count ) { + bytes2 = count - bytes1; + } + + if ( uiomove(&ring[RING_MASK(c)], bytes1, uio) || + ((bytes2 != 0) && uiomove(&ring[0], bytes2, uio))) + /* keeping this around as its replacement is not equivalent + * copyout(&ring[0], &buf[bytes1], bytes2) + */ + { + rc = EFAULT; + goto out; + } + + ring_cons += (bytes1 + bytes2) / sizeof(uint16_t); + + rc = bytes1 + bytes2; + + out: + + return rc; +} + +static int +evtchn_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + int rc, i, count; + + count = uio->uio_resid; + + uint16_t *kbuf = (uint16_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + + + if ( kbuf == NULL ) + return ENOMEM; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + if ( uiomove(kbuf, count, uio) != 0 ) { + rc = EFAULT; + goto out; + } + + mtx_lock_spin(&lock); + for ( i = 0; i < (count/2); i++ ) + if ( test_bit(kbuf[i], &bound_ports[0]) ) + unmask_evtchn(kbuf[i]); + mtx_unlock_spin(&lock); + + rc = count; + + out: + free(kbuf, M_DEVBUF); + return rc; +} + +static int +evtchn_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg, + int mode, struct thread *td __unused) +{ + int rc = 0; + + mtx_lock_spin(&lock); + + switch ( cmd ) + { + case EVTCHN_RESET: + __evtchn_reset_buffer_ring(); + break; + case EVTCHN_BIND: + if ( !synch_test_and_set_bit((int)arg, &bound_ports[0]) ) + unmask_evtchn((int)arg); + else + rc = EINVAL; + break; + case EVTCHN_UNBIND: + if ( synch_test_and_clear_bit((int)arg, &bound_ports[0]) ) + mask_evtchn((int)arg); + else + rc = EINVAL; + break; + default: + rc 
= ENOSYS; + break; + } + + mtx_unlock_spin(&lock); + + return rc; +} + +static int +evtchn_poll(struct cdev *dev, int poll_events, struct thread *td) +{ + + evtchn_softc_t *sc; + unsigned int mask = POLLOUT | POLLWRNORM; + + sc = dev->si_drv1; + + if ( ring_cons != ring_prod ) + mask |= POLLIN | POLLRDNORM; + else if ( ring_overflow ) + mask = POLLERR; + else + selrecord(td, &sc->ev_rsel); + + + return mask; +} + + +static int +evtchn_open(struct cdev *dev, int flag, int otyp, struct thread *td) +{ + uint16_t *_ring; + + if (flag & O_NONBLOCK) + return EBUSY; + + if ( synch_test_and_set_bit(0, &evtchn_dev_inuse) ) + return EBUSY; + + if ( (_ring = (uint16_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK)) == NULL ) + return ENOMEM; + + mtx_lock_spin(&lock); + ring = _ring; + __evtchn_reset_buffer_ring(); + mtx_unlock_spin(&lock); + + + return 0; +} + +static int +evtchn_close(struct cdev *dev, int flag, int otyp, struct thread *td __unused) +{ + int i; + + mtx_lock_spin(&lock); + if (ring != NULL) { + free(ring, M_DEVBUF); + ring = NULL; + } + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + if ( synch_test_and_clear_bit(i, &bound_ports[0]) ) + mask_evtchn(i); + mtx_unlock_spin(&lock); + + evtchn_dev_inuse = 0; + + return 0; +} + + + +/* XXX wild assed guess as to a safe major number */ +#define EVTCHN_MAJOR 140 + +static struct cdevsw evtchn_devsw = { + d_version: D_VERSION_00, + d_open: evtchn_open, + d_close: evtchn_close, + d_read: evtchn_read, + d_write: evtchn_write, + d_ioctl: evtchn_ioctl, + d_poll: evtchn_poll, + d_name: "evtchn", + d_maj: EVTCHN_MAJOR, + d_flags: 0, +}; + + +/* XXX - if this device is ever supposed to support use by more than one process + * this global static will have to go away + */ +static struct cdev *evtchn_dev; + + + +static int +evtchn_init(void *dummy __unused) +{ + /* XXX I believe we don't need these leaving them here for now until we + * have some semblance of it working + */ +#if 0 + devfs_handle_t symlink_handle; + int err, pos; + char 
link_dest[64]; +#endif + mtx_init(&upcall_lock, "evtchup", NULL, MTX_DEF); + + /* (DEVFS) create '/dev/misc/evtchn'. */ + evtchn_dev = make_dev(&evtchn_devsw, 0, UID_ROOT, GID_WHEEL, 0600, "xen/evtchn"); + + mtx_init(&lock, "evch", NULL, MTX_SPIN | MTX_NOWITNESS); + + evtchn_dev->si_drv1 = malloc(sizeof(evtchn_softc_t), M_DEVBUF, M_WAITOK); + bzero(evtchn_dev->si_drv1, sizeof(evtchn_softc_t)); + + /* XXX I don't think we need any of this rubbish */ +#if 0 + if ( err != 0 ) + { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + /* (DEVFS) create directory '/dev/xen'. */ + xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); + + /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ + pos = devfs_generate_path(evtchn_miscdev.devfs_handle, + &link_dest[3], + sizeof(link_dest) - 3); + if ( pos >= 0 ) + strncpy(&link_dest[pos], "../", 3); + /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ + (void)devfs_mk_symlink(xen_dev_dir, + "evtchn", + DEVFS_FL_DEFAULT, + &link_dest[pos], + &symlink_handle, + NULL); + + /* (DEVFS) automatically destroy the symlink with its destination. */ + devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); +#endif + printk("Event-channel device installed.\n"); + + return 0; +} + + +SYSINIT(evtchn_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, evtchn_init, NULL); + + +#if 0 + +static void cleanup_module(void) +{ + destroy_dev(evtchn_dev); +; +} + +module_init(init_module); +module_exit(cleanup_module); +#endif diff --git a/freebsd-5.3-xen-sparse/i386-xen/xen/misc/npx.c b/freebsd-5.3-xen-sparse/i386-xen/xen/misc/npx.c new file mode 100644 index 0000000000..604aec78c1 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/xen/misc/npx.c @@ -0,0 +1,1109 @@ +/*- + * Copyright (c) 1990 William Jolitz. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)npx.c 7.2 (Berkeley) 5/12/91 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/i386/isa/npx.c,v 1.144 2003/11/03 21:53:38 jhb Exp $"); + +#include "opt_cpu.h" +#include "opt_debug_npx.h" +#include "opt_isa.h" +#include "opt_npx.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <machine/bus.h> +#include <sys/rman.h> +#ifdef NPX_DEBUG +#include <sys/syslog.h> +#endif +#include <sys/signalvar.h> +#include <sys/user.h> + +#include <machine/asmacros.h> +#include <machine/cputypes.h> +#include <machine/frame.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/psl.h> +#include <machine/clock.h> +#include <machine/resource.h> +#include <machine/specialreg.h> +#include <machine/segments.h> +#include <machine/ucontext.h> + +#include <machine/multicall.h> + +#include <i386/isa/icu.h> +#ifdef PC98 +#include <pc98/pc98/pc98.h> +#else +#include <i386/isa/isa.h> +#endif +#include <machine/intr_machdep.h> +#ifdef DEV_ISA +#include <isa/isavar.h> +#endif + +#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU) +#define CPU_ENABLE_SSE +#endif +#if defined(CPU_DISABLE_SSE) +#undef CPU_ENABLE_SSE +#endif + +/* + * 387 and 287 Numeric Coprocessor Extension (NPX) Driver. + */ + +/* Configuration flags. 
*/ +#define NPX_DISABLE_I586_OPTIMIZED_BCOPY (1 << 0) +#define NPX_DISABLE_I586_OPTIMIZED_BZERO (1 << 1) +#define NPX_DISABLE_I586_OPTIMIZED_COPYIO (1 << 2) + +#if defined(__GNUC__) && !defined(lint) + +#define fldcw(addr) __asm("fldcw %0" : : "m" (*(addr))) +#define fnclex() __asm("fnclex") +#define fninit() __asm("fninit") +#define fnsave(addr) __asm __volatile("fnsave %0" : "=m" (*(addr))) +#define fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr))) +#define fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr))) +#define fp_divide_by_0() __asm("fldz; fld1; fdiv %st,%st(1); fnop") +#define frstor(addr) __asm("frstor %0" : : "m" (*(addr))) +#ifdef CPU_ENABLE_SSE +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#endif +#define start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ + : : "n" (CR0_TS) : "ax") +#define stop_emulating() __asm("clts") + +#else /* not __GNUC__ */ + +void fldcw(caddr_t addr); +void fnclex(void); +void fninit(void); +void fnsave(caddr_t addr); +void fnstcw(caddr_t addr); +void fnstsw(caddr_t addr); +void fp_divide_by_0(void); +void frstor(caddr_t addr); +#ifdef CPU_ENABLE_SSE +void fxsave(caddr_t addr); +void fxrstor(caddr_t addr); +#endif +void start_emulating(void); +void stop_emulating(void); + +#endif /* __GNUC__ */ + +#ifdef CPU_ENABLE_SSE +#define GET_FPU_CW(thread) \ + (cpu_fxsr ? \ + (thread)->td_pcb->pcb_save.sv_xmm.sv_env.en_cw : \ + (thread)->td_pcb->pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(thread) \ + (cpu_fxsr ? 
\ + (thread)->td_pcb->pcb_save.sv_xmm.sv_env.en_sw : \ + (thread)->td_pcb->pcb_save.sv_87.sv_env.en_sw) +#else /* CPU_ENABLE_SSE */ +#define GET_FPU_CW(thread) \ + (thread->td_pcb->pcb_save.sv_87.sv_env.en_cw) +#define GET_FPU_SW(thread) \ + (thread->td_pcb->pcb_save.sv_87.sv_env.en_sw) +#endif /* CPU_ENABLE_SSE */ + +typedef u_char bool_t; + +static void fpusave(union savefpu *); +static void fpurstor(union savefpu *); +static int npx_attach(device_t dev); +static void npx_identify(driver_t *driver, device_t parent); +#if 0 +static void npx_intr(void *); +#endif +static int npx_probe(device_t dev); +#ifdef I586_CPU_XXX +static long timezero(const char *funcname, + void (*func)(void *buf, size_t len)); +#endif /* I586_CPU */ + +int hw_float; /* XXX currently just alias for npx_exists */ + +SYSCTL_INT(_hw,HW_FLOATINGPT, floatingpoint, + CTLFLAG_RD, &hw_float, 0, + "Floatingpoint instructions executed in hardware"); +#if 0 +static volatile u_int npx_intrs_while_probing; +#endif +static union savefpu npx_cleanstate; +static bool_t npx_cleanstate_ready; +static bool_t npx_ex16; +static bool_t npx_exists; +static bool_t npx_irq13; + +alias_for_inthand_t probetrap; +#if 0 +__asm(" \n\ + .text \n\ + .p2align 2,0x90 \n\ + .type " __XSTRING(CNAME(probetrap)) ",@function \n\ +" __XSTRING(CNAME(probetrap)) ": \n\ + ss \n\ + incl " __XSTRING(CNAME(npx_traps_while_probing)) " \n\ + fnclex \n\ + iret \n\ +"); +#endif +/* + * Identify routine. Create a connection point on our parent for probing. + */ +static void +npx_identify(driver, parent) + driver_t *driver; + device_t parent; +{ + device_t child; + + child = BUS_ADD_CHILD(parent, 0, "npx", 0); + if (child == NULL) + panic("npx_identify"); +} +#if 0 +/* + * Do minimal handling of npx interrupts to convert them to traps. 
+ */ +static void +npx_intr(dummy) + void *dummy; +{ + struct thread *td; + + npx_intrs_while_probing++; + + /* + * The BUSY# latch must be cleared in all cases so that the next + * unmasked npx exception causes an interrupt. + */ +#ifdef PC98 + outb(0xf8, 0); +#else + outb(0xf0, 0); +#endif + + /* + * fpcurthread is normally non-null here. In that case, schedule an + * AST to finish the exception handling in the correct context + * (this interrupt may occur after the thread has entered the + * kernel via a syscall or an interrupt). Otherwise, the npx + * state of the thread that caused this interrupt must have been + * pushed to the thread's pcb, and clearing of the busy latch + * above has finished the (essentially null) handling of this + * interrupt. Control will eventually return to the instruction + * that caused it and it will repeat. We will eventually (usually + * soon) win the race to handle the interrupt properly. + */ + td = PCPU_GET(fpcurthread); + if (td != NULL) { + td->td_pcb->pcb_flags |= PCB_NPXTRAP; + mtx_lock_spin(&sched_lock); + td->td_flags |= TDF_ASTPENDING; + mtx_unlock_spin(&sched_lock); + } +} +#endif + +static int +npx_probe(device_t dev) +{ + + return 1; +} + +#if 0 +/* + * Probe routine. Initialize cr0 to give correct behaviour for [f]wait + * whether the device exists or not (XXX should be elsewhere). Set flags + * to tell npxattach() what to do. Modify device struct if npx doesn't + * need to use interrupts. Return 0 if device exists. 
+ */ +static int +npx_probe(device_t dev) +{ + struct gate_descriptor save_idt_npxtrap; + struct resource *ioport_res, *irq_res; + void *irq_cookie; + int ioport_rid, irq_num, irq_rid; + u_short control; + u_short status; + + save_idt_npxtrap = idt[IDT_MF]; + setidt(IDT_MF, probetrap, SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + ioport_rid = 0; + ioport_res = bus_alloc_resource(dev, SYS_RES_IOPORT, &ioport_rid, + IO_NPX, IO_NPX, IO_NPXSIZE, RF_ACTIVE); + if (ioport_res == NULL) + panic("npx: can't get ports"); +#ifdef PC98 + if (resource_int_value("npx", 0, "irq", &irq_num) != 0) + irq_num = 8; +#else + if (resource_int_value("npx", 0, "irq", &irq_num) != 0) + irq_num = 13; +#endif + irq_rid = 0; + irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &irq_rid, irq_num, + irq_num, 1, RF_ACTIVE); + if (irq_res == NULL) + panic("npx: can't get IRQ"); + if (bus_setup_intr(dev, irq_res, INTR_TYPE_MISC | INTR_FAST, npx_intr, + NULL, &irq_cookie) != 0) + panic("npx: can't create intr"); + + /* + * Partially reset the coprocessor, if any. Some BIOS's don't reset + * it after a warm boot. + */ +#ifdef PC98 + outb(0xf8,0); +#else + outb(0xf1, 0); /* full reset on some systems, NOP on others */ + outb(0xf0, 0); /* clear BUSY# latch */ +#endif + /* + * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT + * instructions. We must set the CR0_MP bit and use the CR0_TS + * bit to control the trap, because setting the CR0_EM bit does + * not cause WAIT instructions to trap. It's important to trap + * WAIT instructions - otherwise the "wait" variants of no-wait + * control instructions would degenerate to the "no-wait" variants + * after FP context switches but work correctly otherwise. It's + * particularly important to trap WAITs when there is no NPX - + * otherwise the "wait" variants would always degenerate. + * + * Try setting CR0_NE to get correct error reporting on 486DX's. + * Setting it should fail or do nothing on lesser processors. 
+ */ + load_cr0(rcr0() | CR0_MP | CR0_NE); + /* + * But don't trap while we're probing. + */ + stop_emulating(); + /* + * Finish resetting the coprocessor, if any. If there is an error + * pending, then we may get a bogus IRQ13, but npx_intr() will handle + * it OK. Bogus halts have never been observed, but we enabled + * IRQ13 and cleared the BUSY# latch early to handle them anyway. + */ + fninit(); + + device_set_desc(dev, "math processor"); + + /* + * Don't use fwait here because it might hang. + * Don't use fnop here because it usually hangs if there is no FPU. + */ + DELAY(1000); /* wait for any IRQ13 */ +#ifdef DIAGNOSTIC + if (npx_intrs_while_probing != 0) + printf("fninit caused %u bogus npx interrupt(s)\n", + npx_intrs_while_probing); + if (npx_traps_while_probing != 0) + printf("fninit caused %u bogus npx trap(s)\n", + npx_traps_while_probing); +#endif + /* + * Check for a status of mostly zero. + */ + status = 0x5a5a; + fnstsw(&status); + if ((status & 0xb8ff) == 0) { + /* + * Good, now check for a proper control word. + */ + control = 0x5a5a; + fnstcw(&control); + if ((control & 0x1f3f) == 0x033f) { + hw_float = npx_exists = 1; + /* + * We have an npx, now divide by 0 to see if exception + * 16 works. + */ + control &= ~(1 << 2); /* enable divide by 0 trap */ + fldcw(&control); +#ifdef FPU_ERROR_BROKEN + /* + * FPU error signal doesn't work on some CPU + * accelerator board. + */ + npx_ex16 = 1; + return (0); +#endif + npx_traps_while_probing = npx_intrs_while_probing = 0; + fp_divide_by_0(); + if (npx_traps_while_probing != 0) { + /* + * Good, exception 16 works. + */ + npx_ex16 = 1; + goto no_irq13; + } + if (npx_intrs_while_probing != 0) { + /* + * Bad, we are stuck with IRQ13. + */ + npx_irq13 = 1; + idt[IDT_MF] = save_idt_npxtrap; +#ifdef SMP + if (mp_ncpus > 1) + panic("npx0 cannot use IRQ 13 on an SMP system"); +#endif + return (0); + } + /* + * Worse, even IRQ13 is broken. Use emulator. 
+ */ + } + } + /* + * Probe failed, but we want to get to npxattach to initialize the + * emulator and say that it has been installed. XXX handle devices + * that aren't really devices better. + */ +#ifdef SMP + if (mp_ncpus > 1) + panic("npx0 cannot be emulated on an SMP system"); +#endif + /* FALLTHROUGH */ +no_irq13: + idt[IDT_MF] = save_idt_npxtrap; + bus_teardown_intr(dev, irq_res, irq_cookie); + + /* + * XXX hack around brokenness of bus_teardown_intr(). If we left the + * irq active then we would get it instead of exception 16. + */ + { + struct intsrc *isrc; + + isrc = intr_lookup_source(irq_num); + isrc->is_pic->pic_disable_source(isrc); + } + + bus_release_resource(dev, SYS_RES_IRQ, irq_rid, irq_res); + bus_release_resource(dev, SYS_RES_IOPORT, ioport_rid, ioport_res); + return (0); +} +#endif + +/* + * Attach routine - announce which it is, and wire into system + */ +static int +npx_attach(device_t dev) +{ + int flags; + register_t s; + + if (resource_int_value("npx", 0, "flags", &flags) != 0) + flags = 0; + + if (flags) + device_printf(dev, "flags 0x%x ", flags); + if (npx_irq13) { + device_printf(dev, "using IRQ 13 interface\n"); + } else { + if (npx_ex16) + device_printf(dev, "INT 16 interface\n"); + else + device_printf(dev, "WARNING: no FPU!\n"); + } + npxinit(__INITIAL_NPXCW__); + + if (npx_cleanstate_ready == 0) { + s = intr_disable(); + stop_emulating(); + fpusave(&npx_cleanstate); + start_emulating(); + npx_cleanstate_ready = 1; + intr_restore(s); + } +#ifdef I586_CPU_XXX + if (cpu_class == CPUCLASS_586 && npx_ex16 && npx_exists && + timezero("i586_bzero()", i586_bzero) < + timezero("bzero()", bzero) * 4 / 5) { + if (!(flags & NPX_DISABLE_I586_OPTIMIZED_BCOPY)) + bcopy_vector = i586_bcopy; + if (!(flags & NPX_DISABLE_I586_OPTIMIZED_BZERO)) + bzero_vector = i586_bzero; + if (!(flags & NPX_DISABLE_I586_OPTIMIZED_COPYIO)) { + copyin_vector = i586_copyin; + copyout_vector = i586_copyout; + } + } +#endif + + return (0); /* XXX unused */ +} + +/* + * 
Initialize floating point unit. + */ +void +npxinit(control) + u_short control; +{ + static union savefpu dummy; + register_t savecrit; + + if (!npx_exists) + return; + /* + * fninit has the same h/w bugs as fnsave. Use the detoxified + * fnsave to throw away any junk in the fpu. npxsave() initializes + * the fpu and sets fpcurthread = NULL as important side effects. + */ + savecrit = intr_disable(); + npxsave(&dummy); + stop_emulating(); +#ifdef CPU_ENABLE_SSE + /* XXX npxsave() doesn't actually initialize the fpu in the SSE case. */ + if (cpu_fxsr) + fninit(); +#endif + fldcw(&control); + start_emulating(); + intr_restore(savecrit); +} + +/* + * Free coprocessor (if we have it). + */ +void +npxexit(td) + struct thread *td; +{ + register_t savecrit; + + savecrit = intr_disable(); + if (curthread == PCPU_GET(fpcurthread)) + npxsave(&PCPU_GET(curpcb)->pcb_save); + intr_restore(savecrit); +#ifdef NPX_DEBUG + if (npx_exists) { + u_int masked_exceptions; + + masked_exceptions = GET_FPU_CW(td) & GET_FPU_SW(td) & 0x7f; + /* + * Log exceptions that would have trapped with the old + * control word (overflow, divide by 0, and invalid operand). + */ + if (masked_exceptions & 0x0d) + log(LOG_ERR, + "pid %d (%s) exited with masked floating point exceptions 0x%02x\n", + td->td_proc->p_pid, td->td_proc->p_comm, + masked_exceptions); + } +#endif +} + +int +npxformat() +{ + + if (!npx_exists) + return (_MC_FPFMT_NODEV); +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) + return (_MC_FPFMT_XMM); +#endif + return (_MC_FPFMT_387); +} + +/* + * The following mechanism is used to ensure that the FPE_... value + * that is passed as a trapcode to the signal handler of the user + * process does not have more than one bit set. + * + * Multiple bits may be set if the user process modifies the control + * word while a status word bit is already set. 
While this is a sign + * of bad coding, we have no choise than to narrow them down to one + * bit, since we must not send a trapcode that is not exactly one of + * the FPE_ macros. + * + * The mechanism has a static table with 127 entries. Each combination + * of the 7 FPU status word exception bits directly translates to a + * position in this table, where a single FPE_... value is stored. + * This FPE_... value stored there is considered the "most important" + * of the exception bits and will be sent as the signal code. The + * precedence of the bits is based upon Intel Document "Numerical + * Applications", Chapter "Special Computational Situations". + * + * The macro to choose one of these values does these steps: 1) Throw + * away status word bits that cannot be masked. 2) Throw away the bits + * currently masked in the control word, assuming the user isn't + * interested in them anymore. 3) Reinsert status word bit 7 (stack + * fault) if it is set, which cannot be masked but must be presered. + * 4) Use the remaining bits to point into the trapcode table. + * + * The 6 maskable bits in order of their preference, as stated in the + * above referenced Intel manual: + * 1 Invalid operation (FP_X_INV) + * 1a Stack underflow + * 1b Stack overflow + * 1c Operand of unsupported format + * 1d SNaN operand. 
+ * 2 QNaN operand (not an exception, irrelavant here) + * 3 Any other invalid-operation not mentioned above or zero divide + * (FP_X_INV, FP_X_DZ) + * 4 Denormal operand (FP_X_DNML) + * 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL) + * 6 Inexact result (FP_X_IMP) + */ +static char fpetable[128] = { + 0, + FPE_FLTINV, /* 1 - INV */ + FPE_FLTUND, /* 2 - DNML */ + FPE_FLTINV, /* 3 - INV | DNML */ + FPE_FLTDIV, /* 4 - DZ */ + FPE_FLTINV, /* 5 - INV | DZ */ + FPE_FLTDIV, /* 6 - DNML | DZ */ + FPE_FLTINV, /* 7 - INV | DNML | DZ */ + FPE_FLTOVF, /* 8 - OFL */ + FPE_FLTINV, /* 9 - INV | OFL */ + FPE_FLTUND, /* A - DNML | OFL */ + FPE_FLTINV, /* B - INV | DNML | OFL */ + FPE_FLTDIV, /* C - DZ | OFL */ + FPE_FLTINV, /* D - INV | DZ | OFL */ + FPE_FLTDIV, /* E - DNML | DZ | OFL */ + FPE_FLTINV, /* F - INV | DNML | DZ | OFL */ + FPE_FLTUND, /* 10 - UFL */ + FPE_FLTINV, /* 11 - INV | UFL */ + FPE_FLTUND, /* 12 - DNML | UFL */ + FPE_FLTINV, /* 13 - INV | DNML | UFL */ + FPE_FLTDIV, /* 14 - DZ | UFL */ + FPE_FLTINV, /* 15 - INV | DZ | UFL */ + FPE_FLTDIV, /* 16 - DNML | DZ | UFL */ + FPE_FLTINV, /* 17 - INV | DNML | DZ | UFL */ + FPE_FLTOVF, /* 18 - OFL | UFL */ + FPE_FLTINV, /* 19 - INV | OFL | UFL */ + FPE_FLTUND, /* 1A - DNML | OFL | UFL */ + FPE_FLTINV, /* 1B - INV | DNML | OFL | UFL */ + FPE_FLTDIV, /* 1C - DZ | OFL | UFL */ + FPE_FLTINV, /* 1D - INV | DZ | OFL | UFL */ + FPE_FLTDIV, /* 1E - DNML | DZ | OFL | UFL */ + FPE_FLTINV, /* 1F - INV | DNML | DZ | OFL | UFL */ + FPE_FLTRES, /* 20 - IMP */ + FPE_FLTINV, /* 21 - INV | IMP */ + FPE_FLTUND, /* 22 - DNML | IMP */ + FPE_FLTINV, /* 23 - INV | DNML | IMP */ + FPE_FLTDIV, /* 24 - DZ | IMP */ + FPE_FLTINV, /* 25 - INV | DZ | IMP */ + FPE_FLTDIV, /* 26 - DNML | DZ | IMP */ + FPE_FLTINV, /* 27 - INV | DNML | DZ | IMP */ + FPE_FLTOVF, /* 28 - OFL | IMP */ + FPE_FLTINV, /* 29 - INV | OFL | IMP */ + FPE_FLTUND, /* 2A - DNML | OFL | IMP */ + FPE_FLTINV, /* 2B - INV | DNML | OFL | IMP */ + FPE_FLTDIV, /* 2C - DZ | OFL | IMP */ + 
FPE_FLTINV, /* 2D - INV | DZ | OFL | IMP */ + FPE_FLTDIV, /* 2E - DNML | DZ | OFL | IMP */ + FPE_FLTINV, /* 2F - INV | DNML | DZ | OFL | IMP */ + FPE_FLTUND, /* 30 - UFL | IMP */ + FPE_FLTINV, /* 31 - INV | UFL | IMP */ + FPE_FLTUND, /* 32 - DNML | UFL | IMP */ + FPE_FLTINV, /* 33 - INV | DNML | UFL | IMP */ + FPE_FLTDIV, /* 34 - DZ | UFL | IMP */ + FPE_FLTINV, /* 35 - INV | DZ | UFL | IMP */ + FPE_FLTDIV, /* 36 - DNML | DZ | UFL | IMP */ + FPE_FLTINV, /* 37 - INV | DNML | DZ | UFL | IMP */ + FPE_FLTOVF, /* 38 - OFL | UFL | IMP */ + FPE_FLTINV, /* 39 - INV | OFL | UFL | IMP */ + FPE_FLTUND, /* 3A - DNML | OFL | UFL | IMP */ + FPE_FLTINV, /* 3B - INV | DNML | OFL | UFL | IMP */ + FPE_FLTDIV, /* 3C - DZ | OFL | UFL | IMP */ + FPE_FLTINV, /* 3D - INV | DZ | OFL | UFL | IMP */ + FPE_FLTDIV, /* 3E - DNML | DZ | OFL | UFL | IMP */ + FPE_FLTINV, /* 3F - INV | DNML | DZ | OFL | UFL | IMP */ + FPE_FLTSUB, /* 40 - STK */ + FPE_FLTSUB, /* 41 - INV | STK */ + FPE_FLTUND, /* 42 - DNML | STK */ + FPE_FLTSUB, /* 43 - INV | DNML | STK */ + FPE_FLTDIV, /* 44 - DZ | STK */ + FPE_FLTSUB, /* 45 - INV | DZ | STK */ + FPE_FLTDIV, /* 46 - DNML | DZ | STK */ + FPE_FLTSUB, /* 47 - INV | DNML | DZ | STK */ + FPE_FLTOVF, /* 48 - OFL | STK */ + FPE_FLTSUB, /* 49 - INV | OFL | STK */ + FPE_FLTUND, /* 4A - DNML | OFL | STK */ + FPE_FLTSUB, /* 4B - INV | DNML | OFL | STK */ + FPE_FLTDIV, /* 4C - DZ | OFL | STK */ + FPE_FLTSUB, /* 4D - INV | DZ | OFL | STK */ + FPE_FLTDIV, /* 4E - DNML | DZ | OFL | STK */ + FPE_FLTSUB, /* 4F - INV | DNML | DZ | OFL | STK */ + FPE_FLTUND, /* 50 - UFL | STK */ + FPE_FLTSUB, /* 51 - INV | UFL | STK */ + FPE_FLTUND, /* 52 - DNML | UFL | STK */ + FPE_FLTSUB, /* 53 - INV | DNML | UFL | STK */ + FPE_FLTDIV, /* 54 - DZ | UFL | STK */ + FPE_FLTSUB, /* 55 - INV | DZ | UFL | STK */ + FPE_FLTDIV, /* 56 - DNML | DZ | UFL | STK */ + FPE_FLTSUB, /* 57 - INV | DNML | DZ | UFL | STK */ + FPE_FLTOVF, /* 58 - OFL | UFL | STK */ + FPE_FLTSUB, /* 59 - INV | OFL | UFL | STK */ + 
FPE_FLTUND, /* 5A - DNML | OFL | UFL | STK */ + FPE_FLTSUB, /* 5B - INV | DNML | OFL | UFL | STK */ + FPE_FLTDIV, /* 5C - DZ | OFL | UFL | STK */ + FPE_FLTSUB, /* 5D - INV | DZ | OFL | UFL | STK */ + FPE_FLTDIV, /* 5E - DNML | DZ | OFL | UFL | STK */ + FPE_FLTSUB, /* 5F - INV | DNML | DZ | OFL | UFL | STK */ + FPE_FLTRES, /* 60 - IMP | STK */ + FPE_FLTSUB, /* 61 - INV | IMP | STK */ + FPE_FLTUND, /* 62 - DNML | IMP | STK */ + FPE_FLTSUB, /* 63 - INV | DNML | IMP | STK */ + FPE_FLTDIV, /* 64 - DZ | IMP | STK */ + FPE_FLTSUB, /* 65 - INV | DZ | IMP | STK */ + FPE_FLTDIV, /* 66 - DNML | DZ | IMP | STK */ + FPE_FLTSUB, /* 67 - INV | DNML | DZ | IMP | STK */ + FPE_FLTOVF, /* 68 - OFL | IMP | STK */ + FPE_FLTSUB, /* 69 - INV | OFL | IMP | STK */ + FPE_FLTUND, /* 6A - DNML | OFL | IMP | STK */ + FPE_FLTSUB, /* 6B - INV | DNML | OFL | IMP | STK */ + FPE_FLTDIV, /* 6C - DZ | OFL | IMP | STK */ + FPE_FLTSUB, /* 6D - INV | DZ | OFL | IMP | STK */ + FPE_FLTDIV, /* 6E - DNML | DZ | OFL | IMP | STK */ + FPE_FLTSUB, /* 6F - INV | DNML | DZ | OFL | IMP | STK */ + FPE_FLTUND, /* 70 - UFL | IMP | STK */ + FPE_FLTSUB, /* 71 - INV | UFL | IMP | STK */ + FPE_FLTUND, /* 72 - DNML | UFL | IMP | STK */ + FPE_FLTSUB, /* 73 - INV | DNML | UFL | IMP | STK */ + FPE_FLTDIV, /* 74 - DZ | UFL | IMP | STK */ + FPE_FLTSUB, /* 75 - INV | DZ | UFL | IMP | STK */ + FPE_FLTDIV, /* 76 - DNML | DZ | UFL | IMP | STK */ + FPE_FLTSUB, /* 77 - INV | DNML | DZ | UFL | IMP | STK */ + FPE_FLTOVF, /* 78 - OFL | UFL | IMP | STK */ + FPE_FLTSUB, /* 79 - INV | OFL | UFL | IMP | STK */ + FPE_FLTUND, /* 7A - DNML | OFL | UFL | IMP | STK */ + FPE_FLTSUB, /* 7B - INV | DNML | OFL | UFL | IMP | STK */ + FPE_FLTDIV, /* 7C - DZ | OFL | UFL | IMP | STK */ + FPE_FLTSUB, /* 7D - INV | DZ | OFL | UFL | IMP | STK */ + FPE_FLTDIV, /* 7E - DNML | DZ | OFL | UFL | IMP | STK */ + FPE_FLTSUB, /* 7F - INV | DNML | DZ | OFL | UFL | IMP | STK */ +}; + +/* + * Preserve the FP status word, clear FP exceptions, then generate a SIGFPE. 
+ * + * Clearing exceptions is necessary mainly to avoid IRQ13 bugs. We now + * depend on longjmp() restoring a usable state. Restoring the state + * or examining it might fail if we didn't clear exceptions. + * + * The error code chosen will be one of the FPE_... macros. It will be + * sent as the second argument to old BSD-style signal handlers and as + * "siginfo_t->si_code" (second argument) to SA_SIGINFO signal handlers. + * + * XXX the FP state is not preserved across signal handlers. So signal + * handlers cannot afford to do FP unless they preserve the state or + * longjmp() out. Both preserving the state and longjmp()ing may be + * destroyed by IRQ13 bugs. Clearing FP exceptions is not an acceptable + * solution for signals other than SIGFPE. + */ +int +npxtrap() +{ + register_t savecrit; + u_short control, status; + + if (!npx_exists) { + printf("npxtrap: fpcurthread = %p, curthread = %p, npx_exists = %d\n", + PCPU_GET(fpcurthread), curthread, npx_exists); + panic("npxtrap from nowhere"); + } + savecrit = intr_disable(); + + /* + * Interrupt handling (for another interrupt) may have pushed the + * state to memory. Fetch the relevant parts of the state from + * wherever they are. + */ + if (PCPU_GET(fpcurthread) != curthread) { + control = GET_FPU_CW(curthread); + status = GET_FPU_SW(curthread); + } else { + fnstcw(&control); + fnstsw(&status); + } + + if (PCPU_GET(fpcurthread) == curthread) + fnclex(); + intr_restore(savecrit); + return (fpetable[status & ((~control & 0x3f) | 0x40)]); +} + +/* + * Implement device not available (DNA) exception + * + * It would be better to switch FP context here (if curthread != fpcurthread) + * and not necessarily for every context switch, but it is too hard to + * access foreign pcb's. 
+ */ + +static int err_count = 0; + +int +npxdna() +{ + struct pcb *pcb; + register_t s; + u_short control; + + if (!npx_exists) + return (0); + if (PCPU_GET(fpcurthread) == curthread) { + printf("npxdna: fpcurthread == curthread %d times\n", + ++err_count); + stop_emulating(); + return (1); + } + if (PCPU_GET(fpcurthread) != NULL) { + printf("npxdna: fpcurthread = %p (%d), curthread = %p (%d)\n", + PCPU_GET(fpcurthread), + PCPU_GET(fpcurthread)->td_proc->p_pid, + curthread, curthread->td_proc->p_pid); + panic("npxdna"); + } + s = intr_disable(); + stop_emulating(); + /* + * Record new context early in case frstor causes an IRQ13. + */ + PCPU_SET(fpcurthread, curthread); + pcb = PCPU_GET(curpcb); + + if ((pcb->pcb_flags & PCB_NPXINITDONE) == 0) { + /* + * This is the first time this thread has used the FPU or + * the PCB doesn't contain a clean FPU state. Explicitly + * initialize the FPU and load the default control word. + */ + fninit(); + control = __INITIAL_NPXCW__; + fldcw(&control); + pcb->pcb_flags |= PCB_NPXINITDONE; + } else { + /* + * The following frstor may cause an IRQ13 when the state + * being restored has a pending error. The error will + * appear to have been triggered by the current (npx) user + * instruction even when that instruction is a no-wait + * instruction that should not trigger an error (e.g., + * fnclex). On at least one 486 system all of the no-wait + * instructions are broken the same as frstor, so our + * treatment does not amplify the breakage. On at least + * one 386/Cyrix 387 system, fnclex works correctly while + * frstor and fnsave are broken, so our treatment breaks + * fnclex if it is the first FPU instruction after a context + * switch. + */ + fpurstor(&pcb->pcb_save); + } + intr_restore(s); + + return (1); +} + +/* + * Wrapper for fnsave instruction, partly to handle hardware bugs. When npx + * exceptions are reported via IRQ13, spurious IRQ13's may be triggered by + * no-wait npx instructions. 
See the Intel application note AP-578 for + * details. This doesn't cause any additional complications here. IRQ13's + * are inherently asynchronous unless the CPU is frozen to deliver them -- + * one that started in userland may be delivered many instructions later, + * after the process has entered the kernel. It may even be delivered after + * the fnsave here completes. A spurious IRQ13 for the fnsave is handled in + * the same way as a very-late-arriving non-spurious IRQ13 from user mode: + * it is normally ignored at first because we set fpcurthread to NULL; it is + * normally retriggered in npxdna() after return to user mode. + * + * npxsave() must be called with interrupts disabled, so that it clears + * fpcurthread atomically with saving the state. We require callers to do the + * disabling, since most callers need to disable interrupts anyway to call + * npxsave() atomically with checking fpcurthread. + * + * A previous version of npxsave() went to great lengths to excecute fnsave + * with interrupts enabled in case executing it froze the CPU. This case + * can't happen, at least for Intel CPU/NPX's. Spurious IRQ13's don't imply + * spurious freezes. + */ +void +npxsave(addr) + union savefpu *addr; +{ + + stop_emulating(); + fpusave(addr); + + start_emulating(); + PCPU_SET(fpcurthread, NULL); + queue_multicall0(__HYPERVISOR_fpu_taskswitch); +} + +/* + * This should be called with interrupts disabled and only when the owning + * FPU thread is non-null. + */ +void +npxdrop() +{ + struct thread *td; + + td = PCPU_GET(fpcurthread); + PCPU_SET(fpcurthread, NULL); + td->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; + start_emulating(); +} + +/* + * Get the state of the FPU without dropping ownership (if possible). + * It returns the FPU ownership status. 
+ */ +int +npxgetregs(td, addr) + struct thread *td; + union savefpu *addr; +{ + register_t s; + + if (!npx_exists) + return (_MC_FPOWNED_NONE); + + if ((td->td_pcb->pcb_flags & PCB_NPXINITDONE) == 0) { + if (npx_cleanstate_ready) + bcopy(&npx_cleanstate, addr, sizeof(npx_cleanstate)); + else + bzero(addr, sizeof(*addr)); + return (_MC_FPOWNED_NONE); + } + s = intr_disable(); + if (td == PCPU_GET(fpcurthread)) { + fpusave(addr); +#ifdef CPU_ENABLE_SSE + if (!cpu_fxsr) +#endif + /* + * fnsave initializes the FPU and destroys whatever + * context it contains. Make sure the FPU owner + * starts with a clean state next time. + */ + npxdrop(); + intr_restore(s); + return (_MC_FPOWNED_FPU); + } else { + intr_restore(s); + bcopy(&td->td_pcb->pcb_save, addr, sizeof(*addr)); + return (_MC_FPOWNED_PCB); + } +} + +/* + * Set the state of the FPU. + */ +void +npxsetregs(td, addr) + struct thread *td; + union savefpu *addr; +{ + register_t s; + + if (!npx_exists) + return; + + s = intr_disable(); + if (td == PCPU_GET(fpcurthread)) { + fpurstor(addr); + intr_restore(s); + } else { + intr_restore(s); + bcopy(addr, &td->td_pcb->pcb_save, sizeof(*addr)); + } + curthread->td_pcb->pcb_flags |= PCB_NPXINITDONE; +} + +static void +fpusave(addr) + union savefpu *addr; +{ + +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) + fxsave(addr); + else +#endif + fnsave(addr); +} + +static void +fpurstor(addr) + union savefpu *addr; +{ + +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) + fxrstor(addr); + else +#endif + frstor(addr); +} + +#ifdef I586_CPU_XXX +static long +timezero(funcname, func) + const char *funcname; + void (*func)(void *buf, size_t len); + +{ + void *buf; +#define BUFSIZE 1048576 + long usec; + struct timeval finish, start; + + buf = malloc(BUFSIZE, M_TEMP, M_NOWAIT); + if (buf == NULL) + return (BUFSIZE); + microtime(&start); + (*func)(buf, BUFSIZE); + microtime(&finish); + usec = 1000000 * (finish.tv_sec - start.tv_sec) + + finish.tv_usec - start.tv_usec; + if (usec <= 0) + usec = 1; + if 
(bootverbose) + printf("%s bandwidth = %u kBps\n", funcname, + (u_int32_t)(((BUFSIZE >> 10) * 1000000) / usec)); + free(buf, M_TEMP); + return (usec); +} +#endif /* I586_CPU */ + +static device_method_t npx_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, npx_identify), + DEVMETHOD(device_probe, npx_probe), + DEVMETHOD(device_attach, npx_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + { 0, 0 } +}; + +static driver_t npx_driver = { + "npx", + npx_methods, + 1, /* no softc */ +}; + +static devclass_t npx_devclass; +DRIVER_MODULE(npx, nexus, npx_driver, npx_devclass, 0, 0); + +#ifdef DEV_ISA +/* + * We prefer to attach to the root nexus so that the usual case (exception 16) + * doesn't describe the processor as being `on isa'. + */ +DRIVER_MODULE(npx, nexus, npx_driver, npx_devclass, 0, 0); + +/* + * This sucks up the legacy ISA support assignments from PNPBIOS/ACPI. 
+ */ +static struct isa_pnp_id npxisa_ids[] = { + { 0x040cd041, "Legacy ISA coprocessor support" }, /* PNP0C04 */ + { 0 } +}; + +static int +npxisa_probe(device_t dev) +{ + int result; + if ((result = ISA_PNP_PROBE(device_get_parent(dev), dev, npxisa_ids)) <= 0) { + device_quiet(dev); + } + return(result); +} + +static int +npxisa_attach(device_t dev) +{ + return (0); +} + +static device_method_t npxisa_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, npxisa_probe), + DEVMETHOD(device_attach, npxisa_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + { 0, 0 } +}; + +static driver_t npxisa_driver = { + "npxisa", + npxisa_methods, + 1, /* no softc */ +}; + +static devclass_t npxisa_devclass; + +DRIVER_MODULE(npxisa, isa, npxisa_driver, npxisa_devclass, 0, 0); +#ifndef PC98 +DRIVER_MODULE(npxisa, acpi, npxisa_driver, npxisa_devclass, 0, 0); +#endif +#endif /* DEV_ISA */ diff --git a/freebsd-5.3-xen-sparse/i386-xen/xen/netfront/xn_netfront.c b/freebsd-5.3-xen-sparse/i386-xen/xen/netfront/xn_netfront.c new file mode 100644 index 0000000000..e25f218eb3 --- /dev/null +++ b/freebsd-5.3-xen-sparse/i386-xen/xen/netfront/xn_netfront.c @@ -0,0 +1,1436 @@ +/* + * + * Copyright (c) 2004 Kip Macy + * All rights reserved. + * + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_nfsroot.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/if_arp.h> +#include <net/ethernet.h> +#include <net/if_dl.h> +#include <net/if_media.h> + +#include <net/bpf.h> + +#include <net/if_types.h> +#include <net/if_vlan_var.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/clock.h> /* for DELAY */ +#include <machine/bus_memio.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <machine/frame.h> + + +#include <sys/bus.h> +#include <sys/rman.h> + +#include <machine/intr_machdep.h> + +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs.h> +#include <machine/xen_intr.h> +#include <machine/evtchn.h> +#include <machine/ctrl_if.h> + +struct xn_softc; +static void xn_txeof(struct xn_softc *); +static void xn_rxeof(struct xn_softc *); +static void xn_alloc_rx_buffers(struct xn_softc *); + +static void xn_tick_locked(struct xn_softc *); +static void xn_tick(void *); + +static void xn_intr(void *); +static void xn_start_locked(struct ifnet *); +static void xn_start(struct ifnet *); +static int xn_ioctl(struct ifnet *, u_long, caddr_t); +static void xn_ifinit_locked(struct xn_softc *); +static void 
xn_ifinit(void *); +static void xn_stop(struct xn_softc *); +#ifdef notyet +static void xn_watchdog(struct ifnet *); +#endif +/* Xenolinux helper functions */ +static void network_connect(struct xn_softc *, netif_fe_interface_status_t *); +static void create_netdev(int handle, struct xn_softc **); +static void netif_ctrlif_rx(ctrl_msg_t *,unsigned long); + +static void xn_free_rx_ring(struct xn_softc *); + +static void xn_free_tx_ring(struct xn_softc *); + + + +/* XXX: This isn't supported in FreeBSD, so ignore it for now. */ +#define TASK_UNINTERRUPTIBLE 0 +#define INVALID_P2M_ENTRY (~0UL) + +/* + * If the backend driver is pipelining transmit requests then we can be very + * aggressive in avoiding new-packet notifications -- only need to send a + * notification if there are no outstanding unreceived responses. + * If the backend may be buffering our transmit buffers for any reason then we + * are rather more conservative. + */ +#ifdef CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER +#define TX_TEST_IDX resp_prod /* aggressive: any outstanding responses? */ +#else +#define TX_TEST_IDX req_cons /* conservative: not seen all our requests? */ +#endif + +/* + * Mbuf pointers. We need these to keep track of the virtual addresses + * of our mbuf chains since we can only convert from virtual to physical, + * not the other way around. The size must track the free index arrays. 
+ */ +struct xn_chain_data { + struct mbuf *xn_tx_chain[NETIF_TX_RING_SIZE+1]; + struct mbuf *xn_rx_chain[NETIF_RX_RING_SIZE+1]; +}; + +struct xn_softc { + struct arpcom arpcom; /* interface info */ + device_t xn_dev; + SLIST_ENTRY(xn_softc) xn_links; + struct mtx xn_mtx; + void *xn_intrhand; + struct resource *xn_res; + u_int8_t xn_ifno; /* interface number */ + struct xn_chain_data xn_cdata; /* mbufs */ + + netif_tx_interface_t *xn_tx_if; + netif_rx_interface_t *xn_rx_if; + + int xn_if_flags; + int xn_txcnt; + int xn_rxbufcnt; + struct callout xn_stat_ch; + unsigned int xn_irq; + unsigned int xn_evtchn; + + + /* What is the status of our connection to the remote backend? */ +#define BEST_CLOSED 0 +#define BEST_DISCONNECTED 1 +#define BEST_CONNECTED 2 + unsigned int xn_backend_state; + + /* Is this interface open or closed (down or up)? */ +#define UST_CLOSED 0 +#define UST_OPEN 1 + unsigned int xn_user_state; + + /* Receive-ring batched refills. */ +#define RX_MIN_TARGET 64 /* XXX: larger than linux. was causing packet + * loss at the default of 8. 
+ */ +#define RX_MAX_TARGET NETIF_RX_RING_SIZE + int xn_rx_target; /* number to allocate */ + struct mbuf *xn_rx_batch; /* head of the batch queue */ + struct mbuf *xn_rx_batchtail; + int xn_rx_batchlen; /* how many queued */ + + int xn_rx_resp_cons; + int xn_tx_resp_cons; + unsigned short xn_rx_free_idxs[NETIF_RX_RING_SIZE+1]; + unsigned short xn_tx_free_idxs[NETIF_RX_RING_SIZE+1]; +}; + +static unsigned long xn_rx_pfns[NETIF_RX_RING_SIZE]; +static multicall_entry_t xn_rx_mcl[NETIF_RX_RING_SIZE+1]; +static mmu_update_t xn_rx_mmu[NETIF_RX_RING_SIZE]; + +static SLIST_HEAD(, xn_softc) xn_dev_list = + SLIST_HEAD_INITIALIZER(xn_dev_list); + +#define XN_LOCK_INIT(_sc, _name) \ + mtx_init(&(_sc)->xn_mtx, _name, MTX_NETWORK_LOCK, MTX_DEF) +#define XN_LOCK(_sc) mtx_lock(&(_sc)->xn_mtx) +#define XN_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->xn_mtx, MA_OWNED) +#define XN_UNLOCK(_sc) mtx_unlock(&(_sc)->xn_mtx) +#define XN_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->xn_mtx) + +/* Access macros for acquiring freeing slots in xn_free_{tx,rx}_idxs[]. */ +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned short _id = (_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) +#define FREELIST_EMPTY(_list, _maxid) \ + ((_list)[0] == (_maxid+1)) + +static char *status_name[] = { + [NETIF_INTERFACE_STATUS_CLOSED] = "closed", + [NETIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", + [NETIF_INTERFACE_STATUS_CONNECTED] = "connected", + [NETIF_INTERFACE_STATUS_CHANGED] = "changed", +}; + +static char *be_state_name[] = { + [BEST_CLOSED] = "closed", + [BEST_DISCONNECTED] = "disconnected", + [BEST_CONNECTED] = "connected", +}; + +#define IPRINTK(fmt, args...) \ + printk("[XEN] " fmt, ##args) +#define WPRINTK(fmt, args...) 
\ + printk("[XEN] " fmt, ##args) + +static struct xn_softc * +find_sc_by_handle(unsigned int handle) +{ + struct xn_softc *sc; + SLIST_FOREACH(sc, &xn_dev_list, xn_links) + { + if ( sc->xn_ifno == handle ) + return sc; + } + return NULL; +} + +/** Network interface info. */ +struct netif_ctrl { + /** Number of interfaces. */ + int interface_n; + /** Number of connected interfaces. */ + int connected_n; + /** Error code. */ + int err; + int up; +}; + +static struct netif_ctrl netctrl; + +static void +netctrl_init(void) +{ + /* + * netctrl is already in bss, why are we setting it? + */ + memset(&netctrl, 0, sizeof(netctrl)); + netctrl.up = NETIF_DRIVER_STATUS_DOWN; +} + +/** Get or set a network interface error. + */ +static int +netctrl_err(int err) +{ + if ( (err < 0) && !netctrl.err ) + netctrl.err = err; + return netctrl.err; +} + +/** Test if all network interfaces are connected. + * + * @return 1 if all connected, 0 if not, negative error code otherwise + */ +static int +netctrl_connected(void) +{ + int ok; + + if (netctrl.err) + ok = netctrl.err; + else if (netctrl.up == NETIF_DRIVER_STATUS_UP) + ok = (netctrl.connected_n == netctrl.interface_n); + else + ok = 0; + + return ok; +} + +/** Count the connected network interfaces. + * + * @return connected count + */ +static int +netctrl_connected_count(void) +{ + + struct xn_softc *sc; + unsigned int connected; + + connected = 0; + + SLIST_FOREACH(sc, &xn_dev_list, xn_links) + { + if ( sc->xn_backend_state == BEST_CONNECTED ) + connected++; + } + + netctrl.connected_n = connected; + XENPRINTF("> connected_n=%d interface_n=%d\n", + netctrl.connected_n, netctrl.interface_n); + return connected; +} + +static __inline struct mbuf* +makembuf (struct mbuf *buf) +{ + struct mbuf *m = NULL; + + MGETHDR (m, M_DONTWAIT, MT_DATA); + + if (! 
m) + return 0; + + M_MOVE_PKTHDR(m, buf); + + MCLGET (m, M_DONTWAIT); + + m->m_pkthdr.len = buf->m_pkthdr.len; + m->m_len = buf->m_len; + m_copydata(buf, 0, buf->m_pkthdr.len, mtod(m,caddr_t) ); + m->m_ext.ext_args = (vm_paddr_t *)vtophys(mtod(m,caddr_t)); + + return m; +} + + + +static void +xn_free_rx_ring(struct xn_softc *sc) +{ +#if 0 + int i; + + for (i = 0; i < NETIF_RX_RING_SIZE; i++) { + if (sc->xn_cdata.xn_rx_chain[MASK_NETIF_RX_IDX(i)] != NULL) { + m_freem(sc->xn_cdata.xn_rx_chain[MASK_NETIF_RX_IDX(i)]); + sc->xn_cdata.xn_rx_chain[MASK_NETIF_RX_IDX(i)] = NULL; + } + } + + sc->xn_rx_resp_cons = 0; + sc->xn_rx_if->req_prod = 0; + sc->xn_rx_if->event = sc->xn_rx_resp_cons ; +#endif +} + +static void +xn_free_tx_ring(struct xn_softc *sc) +{ +#if 0 + int i; + + for (i = 0; i < NETIF_TX_RING_SIZE; i++) { + if (sc->xn_cdata.xn_tx_chain[MASK_NETIF_TX_IDX(i)] != NULL) { + m_freem(sc->xn_cdata.xn_tx_chain[MASK_NETIF_TX_IDX(i)]); + sc->xn_cdata.xn_tx_chain[MASK_NETIF_TX_IDX(i)] = NULL; + } + } + + return; +#endif +} + +static void +xn_alloc_rx_buffers(struct xn_softc *sc) +{ + unsigned short id; + struct mbuf *m_new, *next; + int i, batch_target; + NETIF_RING_IDX req_prod = sc->xn_rx_if->req_prod; + + if (unlikely(sc->xn_backend_state != BEST_CONNECTED) ) + return; + + /* + * Allocate skbuffs greedily, even though we batch updates to the + * receive ring. This creates a less bursty demand on the memory allocator, + * so should reduce the chance of failed allocation requests both for + * ourself and for other kernel subsystems. 
+ */ + batch_target = sc->xn_rx_target - (req_prod - sc->xn_rx_resp_cons); + for ( i = sc->xn_rx_batchlen; i < batch_target; i++, sc->xn_rx_batchlen++) { + MGETHDR(m_new, M_DONTWAIT, MT_DATA); + if (m_new == NULL) + break; + + MCLGET(m_new, M_DONTWAIT); + if (!(m_new->m_flags & M_EXT)) { + m_freem(m_new); + break; + } + m_new->m_len = m_new->m_pkthdr.len = MCLBYTES; + + /* queue the mbufs allocated */ + if (!sc->xn_rx_batch) + sc->xn_rx_batch = m_new; + + if (sc->xn_rx_batchtail) + sc->xn_rx_batchtail->m_next = m_new; + sc->xn_rx_batchtail = m_new; + } + + /* Is the batch large enough to be worthwhile? */ + if ( i < (sc->xn_rx_target/2) ) + return; + + for (i = 0, m_new = sc->xn_rx_batch; m_new; + i++, sc->xn_rx_batchlen--, m_new = next) { + + next = m_new->m_next; + m_new->m_next = NULL; + + m_new->m_ext.ext_args = (vm_paddr_t *)vtophys(m_new->m_ext.ext_buf); + + id = GET_ID_FROM_FREELIST(sc->xn_rx_free_idxs); + KASSERT(id != 0, ("alloc_rx_buffers: found free receive index of 0\n")); + sc->xn_cdata.xn_rx_chain[MASK_NETIF_RX_IDX(id)] = m_new; + + sc->xn_rx_if->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id; + + xn_rx_pfns[i] = vtomach(mtod(m_new,vm_offset_t)) >> PAGE_SHIFT; + + /* Remove this page from pseudo phys map before passing back to Xen. */ + xen_phys_machine[((unsigned long)m_new->m_ext.ext_args >> PAGE_SHIFT)] + = INVALID_P2M_ENTRY; + + xn_rx_mcl[i].op = __HYPERVISOR_update_va_mapping; + xn_rx_mcl[i].args[0] = (unsigned long)mtod(m_new,vm_offset_t) + >> PAGE_SHIFT; + xn_rx_mcl[i].args[1] = 0; + xn_rx_mcl[i].args[2] = 0; + + } + + KASSERT(i, ("no mbufs processed")); /* should have returned earlier */ + KASSERT(sc->xn_rx_batchlen == 0, ("not all mbufs processed")); + sc->xn_rx_batch = sc->xn_rx_batchtail = NULL; + + /* + * We may have allocated buffers which have entries outstanding + in the page * update queue -- make sure we flush those first! */ + PT_UPDATES_FLUSH(); + + /* After all PTEs have been zapped we blow away stale TLB entries. 
*/ + xn_rx_mcl[i-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages. */ + xn_rx_mcl[i].op = __HYPERVISOR_dom_mem_op; + xn_rx_mcl[i].args[0] = (unsigned long) MEMOP_decrease_reservation; + xn_rx_mcl[i].args[1] = (unsigned long)xn_rx_pfns; + xn_rx_mcl[i].args[2] = (unsigned long)i; + xn_rx_mcl[i].args[3] = 0; + xn_rx_mcl[i].args[4] = DOMID_SELF; + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(xn_rx_mcl, i+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). */ + if ( xn_rx_mcl[i].args[5] != i ) + panic("Unable to reduce memory reservation\n"); + + /* Above is a suitable barrier to ensure backend will see requests. */ + sc->xn_rx_if->req_prod = req_prod + i; + + /* Adjust our floating fill target if we risked running out of buffers. */ + if ( ((req_prod - sc->xn_rx_if->resp_prod) < (sc->xn_rx_target / 4)) && + ((sc->xn_rx_target *= 2) > RX_MAX_TARGET) ) + sc->xn_rx_target = RX_MAX_TARGET; +} + +static void +xn_rxeof(struct xn_softc *sc) +{ + struct ifnet *ifp; + netif_rx_response_t *rx; + NETIF_RING_IDX i, rp; + mmu_update_t *mmu = xn_rx_mmu; + multicall_entry_t *mcl = xn_rx_mcl; + struct mbuf *tail_mbuf = NULL, *head_mbuf = NULL, *m, *next; + + XN_LOCK_ASSERT(sc); + if (sc->xn_backend_state != BEST_CONNECTED) + return; + + ifp = &sc->arpcom.ac_if; + + rp = sc->xn_rx_if->resp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. */ + + for (i = sc->xn_rx_resp_cons; i != rp; i++) { + + rx = &sc->xn_rx_if->ring[MASK_NETIF_RX_IDX(i)].resp; + KASSERT(rx->id != 0, ("xn_rxeof: found free receive index of 0\n")); + + /* + * An error here is very odd. Usually indicates a backend bug, + * low-memory condition, or that we didn't have reservation headroom. + * Whatever - print an error and queue the id again straight away. 
+ */ + if (unlikely(rx->status <= 0)) { + printk("bad buffer on RX ring!(%d)\n", rx->status); + sc->xn_rx_if->ring[MASK_NETIF_RX_IDX(sc->xn_rx_if->req_prod)].req.id + = rx->id; + wmb(); + sc->xn_rx_if->req_prod++; + continue; + } + + m = (struct mbuf *) + sc->xn_cdata.xn_rx_chain[MASK_NETIF_RX_IDX(rx->id)]; + if (m->m_next) + panic("mbuf is already part of a valid mbuf chain"); + ADD_ID_TO_FREELIST(sc->xn_rx_free_idxs, rx->id); + + m->m_data += (rx->addr & PAGE_MASK); + m->m_pkthdr.len = m->m_len = rx->status; + m->m_pkthdr.rcvif = ifp; + + /* Remap the page. */ + mmu->ptr = (rx->addr & ~PAGE_MASK) | MMU_MACHPHYS_UPDATE; + mmu->val = (unsigned long)m->m_ext.ext_args >> PAGE_SHIFT; + mmu++; + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = (unsigned long)m->m_data >> PAGE_SHIFT; + mcl->args[1] = (rx->addr & ~PAGE_MASK) | PG_KERNEL; + mcl->args[2] = 0; + mcl++; + + xen_phys_machine[((unsigned long)m->m_ext.ext_args >> PAGE_SHIFT)] = + (rx->addr >> PAGE_SHIFT); + + if (unlikely(!head_mbuf)) + head_mbuf = m; + + if (tail_mbuf) + tail_mbuf->m_next = m; + tail_mbuf = m; + + sc->xn_cdata.xn_rx_chain[MASK_NETIF_RX_IDX(rx->id)] = NULL; + sc->xn_rxbufcnt++; + } + + /* Do all the remapping work, and M->P updates, in one big hypercall. */ + if (likely((mcl - xn_rx_mcl) != 0)) { + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)xn_rx_mmu; + mcl->args[1] = mmu - xn_rx_mmu; + mcl->args[2] = 0; + mcl++; + (void)HYPERVISOR_multicall(xn_rx_mcl, mcl - xn_rx_mcl); + } + + + /* + * Process all the mbufs after the remapping is complete. + * Break the mbuf chain first though. + */ + for (m = head_mbuf; m; m = next) { + next = m->m_next; + m->m_next = NULL; + + ifp->if_ipackets++; + + XN_UNLOCK(sc); + + /* Pass it up. */ + (*ifp->if_input)(ifp, m); + XN_LOCK(sc); + } + + sc->xn_rx_resp_cons = i; + + /* If we get a callback with very few responses, reduce fill target. */ + /* NB. Note exponential increase, linear decrease. 
*/ + if (((sc->xn_rx_if->req_prod - sc->xn_rx_if->resp_prod) > + ((3*sc->xn_rx_target) / 4)) && (--sc->xn_rx_target < RX_MIN_TARGET)) + sc->xn_rx_target = RX_MIN_TARGET; + + xn_alloc_rx_buffers(sc); + + sc->xn_rx_if->event = i + 1; +} + +static void +xn_txeof(struct xn_softc *sc) +{ + NETIF_RING_IDX i, prod; + unsigned short id; + struct ifnet *ifp; + struct mbuf *m; + + XN_LOCK_ASSERT(sc); + + if (sc->xn_backend_state != BEST_CONNECTED) + return; + + ifp = &sc->arpcom.ac_if; + ifp->if_timer = 0; + + do { + prod = sc->xn_tx_if->resp_prod; + + for (i = sc->xn_tx_resp_cons; i != prod; i++) { + id = sc->xn_tx_if->ring[MASK_NETIF_TX_IDX(i)].resp.id; + m = sc->xn_cdata.xn_tx_chain[MASK_NETIF_TX_IDX(id)]; + + KASSERT(m != NULL, ("mbuf not found in xn_tx_chain")); + M_ASSERTVALID(m); + + m_freem(m); + sc->xn_cdata.xn_tx_chain[MASK_NETIF_TX_IDX(id)] = NULL; + ADD_ID_TO_FREELIST(sc->xn_tx_free_idxs, id); + sc->xn_txcnt--; + } + sc->xn_tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of tx_cons. Note + * that it is essential to schedule a callback, no matter how few + * buffers are pending. Even if there is space in the transmit ring, + * higher layers may be blocked because too much data is outstanding: + * in such cases notification from Xen is likely to be the only kick + * that we'll get. + */ + sc->xn_tx_if->event = + prod + ((sc->xn_tx_if->req_prod - prod) >> 1) + 1; + + mb(); + + } while (prod != sc->xn_tx_if->resp_prod); +} + +static void +xn_intr(void *xsc) +{ + struct xn_softc *sc = xsc; + struct ifnet *ifp = &sc->arpcom.ac_if; + + XN_LOCK(sc); + + /* sometimes we seem to lose packets. stay in the interrupt handler while + * there is stuff to process: continually recheck the response producer. 
+ */ + do { + xn_txeof(sc); + + if (sc->xn_rx_resp_cons != sc->xn_rx_if->resp_prod && + sc->xn_user_state == UST_OPEN) + xn_rxeof(sc); + + if (ifp->if_flags & IFF_RUNNING && ifp->if_snd.ifq_head != NULL) + xn_start_locked(ifp); + } while (sc->xn_rx_resp_cons != sc->xn_rx_if->resp_prod && + sc->xn_user_state == UST_OPEN); + + XN_UNLOCK(sc); + return; +} + +static void +xn_tick_locked(struct xn_softc *sc) +{ + XN_LOCK_ASSERT(sc); + callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); + + /* XXX placeholder for printing debug information */ + +} + + +static void +xn_tick(void *xsc) +{ + struct xn_softc *sc; + + sc = xsc; + XN_LOCK(sc); + xn_tick_locked(sc); + XN_UNLOCK(sc); + +} +static void +xn_start_locked(struct ifnet *ifp) +{ + unsigned short id; + struct mbuf *m_head, *new_m; + struct xn_softc *sc = ifp->if_softc; + netif_tx_request_t *tx; + NETIF_RING_IDX i, start; + + if (sc->xn_backend_state != BEST_CONNECTED) + return; + + for (i = start = sc->xn_tx_if->req_prod; TRUE; i++, sc->xn_txcnt++) { + + IF_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; + + if (FREELIST_EMPTY(sc->xn_tx_free_idxs, NETIF_TX_RING_SIZE)) { + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_flags |= IFF_OACTIVE; + break; + } + + i = sc->xn_tx_if->req_prod; + + id = GET_ID_FROM_FREELIST(sc->xn_tx_free_idxs); + + /* + * Start packing the mbufs in this chain into + * the fragment pointers. Stop when we run out + * of fragments or hit the end of the mbuf chain. + */ + new_m = makembuf(m_head); + tx = &(sc->xn_tx_if->ring[MASK_NETIF_TX_IDX(i)].req); + tx->id = id; + tx->size = new_m->m_pkthdr.len; + new_m->m_next = NULL; + new_m->m_nextpkt = NULL; + + m_freem(m_head); + tx->addr = vtomach(mtod(new_m, vm_offset_t)); + + sc->xn_cdata.xn_tx_chain[MASK_NETIF_TX_IDX(id)] = new_m; + BPF_MTAP(ifp, new_m); + } + + sc->xn_tx_if->req_prod = i; + xn_txeof(sc); + + /* Only notify Xen if we really have to. 
*/ + if (sc->xn_tx_if->TX_TEST_IDX == start) + notify_via_evtchn(sc->xn_evtchn); + return; +} + +static void +xn_start(struct ifnet *ifp) +{ + struct xn_softc *sc; + sc = ifp->if_softc; + XN_LOCK(sc); + xn_start_locked(ifp); + XN_UNLOCK(sc); +} + + + +/* equivalent of network_open() in Linux */ +static void +xn_ifinit_locked(struct xn_softc *sc) +{ + struct ifnet *ifp; + + XN_LOCK_ASSERT(sc); + + ifp = &sc->arpcom.ac_if; + + if (ifp->if_flags & IFF_RUNNING) + return; + + xn_stop(sc); + + sc->xn_user_state = UST_OPEN; + + xn_alloc_rx_buffers(sc); + sc->xn_rx_if->event = sc->xn_rx_resp_cons + 1; + + ifp->if_flags |= IFF_RUNNING; + ifp->if_flags &= ~IFF_OACTIVE; + + callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); + +} + + +static void +xn_ifinit(void *xsc) +{ + struct xn_softc *sc = xsc; + + XN_LOCK(sc); + xn_ifinit_locked(sc); + XN_UNLOCK(sc); + +} + + +static int +xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct xn_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *) data; + int mask, error = 0; + switch(cmd) { + case SIOCSIFMTU: + /* XXX can we alter the MTU on a VN ?*/ +#ifdef notyet + if (ifr->ifr_mtu > XN_JUMBO_MTU) + error = EINVAL; + else +#endif + { + ifp->if_mtu = ifr->ifr_mtu; + ifp->if_flags &= ~IFF_RUNNING; + xn_ifinit(sc); + } + break; + case SIOCSIFFLAGS: + XN_LOCK(sc); + if (ifp->if_flags & IFF_UP) { + /* + * If only the state of the PROMISC flag changed, + * then just use the 'set promisc mode' command + * instead of reinitializing the entire NIC. Doing + * a full re-init means reloading the firmware and + * waiting for it to start up, which may take a + * second or two. 
+ */ +#ifdef notyet + /* No promiscuous mode with Xen */ + if (ifp->if_flags & IFF_RUNNING && + ifp->if_flags & IFF_PROMISC && + !(sc->xn_if_flags & IFF_PROMISC)) { + XN_SETBIT(sc, XN_RX_MODE, + XN_RXMODE_RX_PROMISC); + } else if (ifp->if_flags & IFF_RUNNING && + !(ifp->if_flags & IFF_PROMISC) && + sc->xn_if_flags & IFF_PROMISC) { + XN_CLRBIT(sc, XN_RX_MODE, + XN_RXMODE_RX_PROMISC); + } else +#endif + xn_ifinit_locked(sc); + } else { + if (ifp->if_flags & IFF_RUNNING) { + xn_stop(sc); + } + } + sc->xn_if_flags = ifp->if_flags; + XN_UNLOCK(sc); + error = 0; + break; + case SIOCSIFCAP: + mask = ifr->ifr_reqcap ^ ifp->if_capenable; + if (mask & IFCAP_HWCSUM) { + if (IFCAP_HWCSUM & ifp->if_capenable) + ifp->if_capenable &= ~IFCAP_HWCSUM; + else + ifp->if_capenable |= IFCAP_HWCSUM; + } + error = 0; + break; + case SIOCADDMULTI: + case SIOCDELMULTI: +#ifdef notyet + if (ifp->if_flags & IFF_RUNNING) { + XN_LOCK(sc); + xn_setmulti(sc); + XN_UNLOCK(sc); + error = 0; + } +#endif + /* FALLTHROUGH */ + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + error = EINVAL; + break; + default: + error = ether_ioctl(ifp, cmd, data); + } + + return (error); +} + +static void +xn_stop(struct xn_softc *sc) +{ + struct ifnet *ifp; + + XN_LOCK_ASSERT(sc); + + ifp = &sc->arpcom.ac_if; + + callout_stop(&sc->xn_stat_ch); + + xn_free_rx_ring(sc); + xn_free_tx_ring(sc); + + ifp->if_flags &= ~(IFF_RUNNING | IFF_OACTIVE); +} + +/* START of Xenolinux helper functions adapted to FreeBSD */ +static void +network_connect(struct xn_softc *sc, netif_fe_interface_status_t *status) +{ + struct ifnet *ifp; + int i, requeue_idx; + netif_tx_request_t *tx; + + XN_LOCK(sc); + + ifp = &sc->arpcom.ac_if; + /* first time through, setup the ifp info */ + if (ifp->if_softc == NULL) { + ifp->if_softc = sc; + if_initname(ifp, "xn", sc->xn_ifno); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX; + ifp->if_ioctl = xn_ioctl; + ifp->if_output = ether_output; + ifp->if_start = xn_start; +#ifdef notyet + ifp->if_watchdog = 
xn_watchdog; +#endif + ifp->if_init = xn_ifinit; + ifp->if_mtu = ETHERMTU; + ifp->if_snd.ifq_maxlen = NETIF_TX_RING_SIZE - 1; + +#ifdef notyet + ifp->if_hwassist = XN_CSUM_FEATURES; + ifp->if_capabilities = IFCAP_HWCSUM; + ifp->if_capenable = ifp->if_capabilities; +#endif + + ether_ifattach(ifp, sc->arpcom.ac_enaddr); + callout_init(&sc->xn_stat_ch, CALLOUT_MPSAFE); + } + + /* Recovery procedure: */ + + /* Step 1: Reinitialise variables. */ + sc->xn_rx_resp_cons = sc->xn_tx_resp_cons = 0; + sc->xn_rxbufcnt = sc->xn_txcnt = 0; + sc->xn_rx_if->event = sc->xn_tx_if->event = 1; + + /* Step 2: Rebuild the RX and TX ring contents. + * NB. We could just free the queued TX packets now but we hope + * that sending them out might do some good. We have to rebuild + * the RX ring because some of our pages are currently flipped out + * so we can't just free the RX skbs. + */ + + /* Rebuild the TX buffer freelist and the TX ring itself. + * NB. This reorders packets. We could keep more private state + * to avoid this but maybe it doesn't matter so much given the + * interface has been down. + */ + for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ ) + { + if (sc->xn_cdata.xn_tx_chain[i] != NULL) + { + struct mbuf *m = sc->xn_cdata.xn_tx_chain[i]; + + tx = &sc->xn_tx_if->ring[requeue_idx++].req; + + tx->id = i; + tx->addr = vtomach(mtod(m, vm_offset_t)); + tx->size = m->m_pkthdr.len; + sc->xn_txcnt++; + } + } + wmb(); + sc->xn_tx_if->req_prod = requeue_idx; + + /* Rebuild the RX buffer freelist and the RX ring itself. */ + for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ ) + if (sc->xn_cdata.xn_rx_chain[i] != NULL) + sc->xn_rx_if->ring[requeue_idx++].req.id = i; + wmb(); + sc->xn_rx_if->req_prod = requeue_idx; + + printk("[XEN] Netfront recovered tx=%d rxfree=%d\n", + sc->xn_tx_if->req_prod,sc->xn_rx_if->req_prod); + + + /* Step 3: All public and private state should now be sane. 
Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. + */ + sc->xn_backend_state = BEST_CONNECTED; + wmb(); + notify_via_evtchn(status->evtchn); + xn_txeof(sc); + + XN_UNLOCK(sc); +} + + +static void +vif_show(struct xn_softc *sc) +{ +#if DEBUG + if (sc) { + IPRINTK("<vif handle=%u %s(%s) evtchn=%u irq=%u tx=%p rx=%p>\n", + sc->xn_ifno, + be_state_name[sc->xn_backend_state], + sc->xn_user_state ? "open" : "closed", + sc->xn_evtchn, + sc->xn_irq, + sc->xn_tx_if, + sc->xn_rx_if); + } else { + IPRINTK("<vif NULL>\n"); + } +#endif +} + +/* Send a connect message to xend to tell it to bring up the interface. */ +static void +send_interface_connect(struct xn_softc *sc) +{ + ctrl_msg_t cmsg = { + .type = CMSG_NETIF_FE, + .subtype = CMSG_NETIF_FE_INTERFACE_CONNECT, + .length = sizeof(netif_fe_interface_connect_t), + }; + netif_fe_interface_connect_t *msg = (void*)cmsg.msg; + + vif_show(sc); + msg->handle = sc->xn_ifno; + msg->tx_shmem_frame = (vtomach(sc->xn_tx_if) >> PAGE_SHIFT); + msg->rx_shmem_frame = (vtomach(sc->xn_rx_if) >> PAGE_SHIFT); + + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} + +/* Send a driver status notification to the domain controller. */ +static int +send_driver_status(int ok) +{ + int err = 0; + ctrl_msg_t cmsg = { + .type = CMSG_NETIF_FE, + .subtype = CMSG_NETIF_FE_DRIVER_STATUS, + .length = sizeof(netif_fe_driver_status_t), + }; + netif_fe_driver_status_t *msg = (void*)cmsg.msg; + + msg->status = (ok ? NETIF_DRIVER_STATUS_UP : NETIF_DRIVER_STATUS_DOWN); + err = ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + return err; +} + +/* Stop network device and free tx/rx queues and irq. + */ +static void +vif_release(struct xn_softc *sc) +{ + /* Stop old i/f to prevent errors whilst we rebuild the state. */ + XN_LOCK(sc); + /* sc->xn_backend_state = BEST_DISCONNECTED; */ + XN_UNLOCK(sc); + + /* Free resources. 
*/ + if(sc->xn_tx_if != NULL) { + unbind_evtchn_from_irq(sc->xn_evtchn); + free(sc->xn_tx_if, M_DEVBUF); + free(sc->xn_rx_if, M_DEVBUF); + sc->xn_irq = 0; + sc->xn_evtchn = 0; + sc->xn_tx_if = NULL; + sc->xn_rx_if = NULL; + } +} + +/* Release vif resources and close it down completely. + */ +static void +vif_close(struct xn_softc *sc) +{ + vif_show(sc); + WPRINTK("Unexpected netif-CLOSED message in state %s\n", + be_state_name[sc->xn_backend_state]); + vif_release(sc); + sc->xn_backend_state = BEST_CLOSED; + /* todo: take dev down and free. */ + vif_show(sc); +} + +/* Move the vif into disconnected state. + * Allocates tx/rx pages. + * Sends connect message to xend. + */ +static void +vif_disconnect(struct xn_softc *sc) +{ + if (sc->xn_tx_if) free(sc->xn_tx_if, M_DEVBUF); + if (sc->xn_rx_if) free(sc->xn_rx_if, M_DEVBUF); + + // Before this sc->xn_tx_if and sc->xn_rx_if had better be null. + sc->xn_tx_if = (netif_tx_interface_t *)malloc(PAGE_SIZE,M_DEVBUF,M_WAITOK); + sc->xn_rx_if = (netif_rx_interface_t *)malloc(PAGE_SIZE,M_DEVBUF,M_WAITOK); + memset(sc->xn_tx_if, 0, PAGE_SIZE); + memset(sc->xn_rx_if, 0, PAGE_SIZE); + sc->xn_backend_state = BEST_DISCONNECTED; + send_interface_connect(sc); + vif_show(sc); +} + +/* Begin interface recovery. + * + * NB. Whilst we're recovering, we turn the carrier state off. We + * take measures to ensure that this device isn't used for + * anything. We also stop the queue for this device. Various + * different approaches (e.g. continuing to buffer packets) have + * been tested but don't appear to improve the overall impact on + * TCP connections. + * + * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery + * is initiated by a special "RESET" message - disconnect could + * just mean we're not allowed to use this interface any more. 
+ */ +static void +vif_reset(struct xn_softc *sc) +{ + IPRINTK("Attempting to reconnect network interface: handle=%u\n", + sc->xn_ifno); + vif_release(sc); + vif_disconnect(sc); + vif_show(sc); +} + +/* Move the vif into connected state. + * Sets the mac and event channel from the message. + * Binds the irq to the event channel. + */ +static void +vif_connect( + struct xn_softc *sc, netif_fe_interface_status_t *status) +{ + memcpy(sc->arpcom.ac_enaddr, status->mac, ETHER_ADDR_LEN); + network_connect(sc, status); + + sc->xn_evtchn = status->evtchn; + sc->xn_irq = bind_evtchn_to_irq(sc->xn_evtchn); + + (void)intr_add_handler("xn", sc->xn_irq, (driver_intr_t *)xn_intr, sc, + INTR_TYPE_NET | INTR_MPSAFE, &sc->xn_intrhand); + netctrl_connected_count(); + /* vif_wake(dev); Not needed for FreeBSD */ + vif_show(sc); +} + +/** Create a network device. + * @param handle device handle + */ +static void +create_netdev(int handle, struct xn_softc **sc) +{ + int i; + + *sc = (struct xn_softc *)malloc(sizeof(**sc), M_DEVBUF, M_WAITOK); + memset(*sc, 0, sizeof(struct xn_softc)); + + (*sc)->xn_backend_state = BEST_CLOSED; + (*sc)->xn_user_state = UST_CLOSED; + (*sc)->xn_ifno = handle; + + XN_LOCK_INIT(*sc, "xnetif"); + (*sc)->xn_rx_target = RX_MIN_TARGET; + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ + for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ ) + (*sc)->xn_tx_free_idxs[i] = (i+1); + for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ ) + (*sc)->xn_rx_free_idxs[i] = (i+1); + + SLIST_INSERT_HEAD(&xn_dev_list, *sc, xn_links); +} + +/* Get the target interface for a status message. + * Creates the interface when it makes sense. + * The returned interface may be null when there is no error. 
+ * + * @param status status message + * @param sc return parameter for interface state + * @return 0 on success, error code otherwise + */ +static int +target_vif(netif_fe_interface_status_t *status, struct xn_softc **sc) +{ + int err = 0; + + XENPRINTF("> handle=%d\n", status->handle); + if ( status->handle < 0 ) + { + err = -EINVAL; + goto exit; + } + + if ( (*sc = find_sc_by_handle(status->handle)) != NULL ) + goto exit; + + if ( status->status == NETIF_INTERFACE_STATUS_CLOSED ) + goto exit; + if ( status->status == NETIF_INTERFACE_STATUS_CHANGED ) + goto exit; + + /* It's a new interface in a good state - create it. */ + XENPRINTF("> create device...\n"); + create_netdev(status->handle, sc); + netctrl.interface_n++; + +exit: + return err; +} + +/* Handle an interface status message. */ +static void +netif_interface_status(netif_fe_interface_status_t *status) +{ + int err = 0; + struct xn_softc *sc = NULL; + + XENPRINTF("> status=%s handle=%d\n", + status_name[status->status], status->handle); + + if ( (err = target_vif(status, &sc)) != 0 ) + { + WPRINTK("Invalid netif: handle=%u\n", status->handle); + return; + } + + if ( sc == NULL ) + { + XENPRINTF("> no vif\n"); + return; + } + + vif_show(sc); + + switch ( status->status ) + { + case NETIF_INTERFACE_STATUS_CLOSED: + switch ( sc->xn_backend_state ) + { + case BEST_CLOSED: + case BEST_DISCONNECTED: + case BEST_CONNECTED: + vif_close(sc); + break; + } + break; + + case NETIF_INTERFACE_STATUS_DISCONNECTED: + switch ( sc->xn_backend_state ) + { + case BEST_CLOSED: + vif_disconnect(sc); + break; + case BEST_DISCONNECTED: + case BEST_CONNECTED: + vif_reset(sc); + break; + } + break; + + case NETIF_INTERFACE_STATUS_CONNECTED: + switch ( sc->xn_backend_state ) + { + case BEST_CLOSED: + WPRINTK("Unexpected netif status %s in state %s\n", + status_name[status->status], + be_state_name[sc->xn_backend_state]); + vif_disconnect(sc); + vif_connect(sc, status); + break; + case BEST_DISCONNECTED: + vif_connect(sc, status); 
+ break; + } + break; + + case NETIF_INTERFACE_STATUS_CHANGED: + /* + * The domain controller is notifying us that a device has been + * added or removed. + */ + break; + + default: + WPRINTK("Invalid netif status code %d\n", status->status); + break; + } + vif_show(sc); +} + +/* + * Initialize the network control interface. + */ +static void +netif_driver_status(netif_fe_driver_status_t *status) +{ + XENPRINTF("> status=%d\n", status->status); + netctrl.up = status->status; + //netctrl.interface_n = status->max_handle; + //netctrl.connected_n = 0; + netctrl_connected_count(); +} + +/* Receive handler for control messages. */ +static void +netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + + switch ( msg->subtype ) + { + case CMSG_NETIF_FE_INTERFACE_STATUS: + if ( msg->length != sizeof(netif_fe_interface_status_t) ) + goto error; + netif_interface_status((netif_fe_interface_status_t *) + &msg->msg[0]); + break; + + case CMSG_NETIF_FE_DRIVER_STATUS: + if ( msg->length != sizeof(netif_fe_driver_status_t) ) + goto error; + netif_driver_status((netif_fe_driver_status_t *) + &msg->msg[0]); + break; + + error: + default: + msg->length = 0; + break; + } + + ctrl_if_send_response(msg); +} + +#if 1 +/* Wait for all interfaces to be connected. + * + * This works OK, but we'd like to use the probing mode (see below). + */ +static int probe_interfaces(void) +{ + int err = 0, conn = 0; + int wait_i, wait_n = 100; + + for ( wait_i = 0; wait_i < wait_n; wait_i++) + { + XENPRINTF("> wait_i=%d\n", wait_i); + conn = netctrl_connected(); + if(conn) break; + tsleep(&xn_dev_list, PWAIT | PCATCH, "netif", hz); + } + + XENPRINTF("> wait finished...\n"); + if ( conn <= 0 ) + { + err = netctrl_err(-ENETDOWN); + WPRINTK("Failed to connect all virtual interfaces: err=%d\n", err); + } + + XENPRINTF("< err=%d\n", err); + + return err; +} +#else +/* Probe for interfaces until no more are found. + * + * This is the mode we'd like to use, but at the moment it panics the kernel. 
+*/ +static int +probe_interfaces(void) +{ + int err = 0; + int wait_i, wait_n = 100; + ctrl_msg_t cmsg = { + .type = CMSG_NETIF_FE, + .subtype = CMSG_NETIF_FE_INTERFACE_STATUS, + .length = sizeof(netif_fe_interface_status_t), + }; + netif_fe_interface_status_t msg = {}; + ctrl_msg_t rmsg = {}; + netif_fe_interface_status_t *reply = (void*)rmsg.msg; + int state = TASK_UNINTERRUPTIBLE; + uint32_t query = -1; + + + netctrl.interface_n = 0; + for ( wait_i = 0; wait_i < wait_n; wait_i++ ) + { + XENPRINTF("> wait_i=%d query=%d\n", wait_i, query); + msg.handle = query; + memcpy(cmsg.msg, &msg, sizeof(msg)); + XENPRINTF("> set_current_state...\n"); + set_current_state(state); + XENPRINTF("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); + XENPRINTF("> sending...\n"); + err = ctrl_if_send_message_and_get_response(&cmsg, &rmsg, state); + XENPRINTF("> err=%d\n", err); + if(err) goto exit; + XENPRINTF("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); + if((int)reply->handle < 0){ + // No more interfaces. 
+ break; + } + query = -reply->handle - 2; + XENPRINTF(">netif_interface_status ...\n"); + netif_interface_status(reply); + } + + exit: + if ( err ) + { + err = netctrl_err(-ENETDOWN); + WPRINTK("Connecting virtual network interfaces failed: err=%d\n", err); + } + + XENPRINTF("< err=%d\n", err); + return err; +} + +#endif + +static void +xn_init(void *unused) +{ + + int err = 0; + + netctrl_init(); + (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + send_driver_status(1); + err = probe_interfaces(); + + if (err) + ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx); +} + +SYSINIT(xndev, SI_SUB_PSEUDO, SI_ORDER_ANY, xn_init, NULL) diff --git a/freebsd-5.3-xen-sparse/kern/kern_fork.c b/freebsd-5.3-xen-sparse/kern/kern_fork.c new file mode 100644 index 0000000000..4b38ee45b6 --- /dev/null +++ b/freebsd-5.3-xen-sparse/kern/kern_fork.c @@ -0,0 +1,846 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/kern/kern_fork.c,v 1.234.2.4 2004/09/18 04:11:35 julian Exp $"); + +#include "opt_ktrace.h" +#include "opt_mac.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/eventhandler.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/sysctl.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/resourcevar.h> +#include <sys/sched.h> +#include <sys/syscall.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> +#include <sys/acct.h> +#include <sys/mac.h> +#include <sys/ktr.h> +#include <sys/ktrace.h> +#include <sys/unistd.h> +#include <sys/sx.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/uma.h> + +#include <sys/user.h> +#include <machine/critical.h> + +#ifndef _SYS_SYSPROTO_H_ 
+struct fork_args { + int dummy; +}; +#endif + +static int forksleep; /* Place for fork1() to sleep on. */ + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +fork(td, uap) + struct thread *td; + struct fork_args *uap; +{ + int error; + struct proc *p2; + + error = fork1(td, RFFDG | RFPROC, 0, &p2); + if (error == 0) { + td->td_retval[0] = p2->p_pid; + td->td_retval[1] = 0; + } + return (error); +} + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +vfork(td, uap) + struct thread *td; + struct vfork_args *uap; +{ + int error; + struct proc *p2; + + error = fork1(td, RFFDG | RFPROC /* | RFPPWAIT | RFMEM */, 0, &p2); + if (error == 0) { + td->td_retval[0] = p2->p_pid; + td->td_retval[1] = 0; + } + return (error); +} + +/* + * MPSAFE + */ +int +rfork(td, uap) + struct thread *td; + struct rfork_args *uap; +{ + struct proc *p2; + int error; + + /* Don't allow kernel-only flags. */ + if ((uap->flags & RFKERNELONLY) != 0) + return (EINVAL); + + error = fork1(td, uap->flags, 0, &p2); + if (error == 0) { + td->td_retval[0] = p2 ? p2->p_pid : 0; + td->td_retval[1] = 0; + } + return (error); +} + +int nprocs = 1; /* process 0 */ +int lastpid = 0; +SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, + "Last used PID"); + +/* + * Random component to lastpid generation. We mix in a random factor to make + * it a little harder to predict. We sanity check the modulus value to avoid + * doing it in critical paths. Don't let it be too small or we pointlessly + * waste randomness entropy, and don't let it be impossibly large. Using a + * modulus that is too big causes a LOT more process table scans and slows + * down fork processing as the pidchecked caching is defeated. 
+ */ +static int randompid = 0; + +static int +sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) +{ + int error, pid; + + error = sysctl_wire_old_buffer(req, sizeof(int)); + if (error != 0) + return(error); + sx_xlock(&allproc_lock); + pid = randompid; + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error == 0 && req->newptr != NULL) { + if (pid < 0 || pid > PID_MAX - 100) /* out of range */ + pid = PID_MAX - 100; + else if (pid < 2) /* NOP */ + pid = 0; + else if (pid < 100) /* Make it reasonable */ + pid = 100; + randompid = pid; + } + sx_xunlock(&allproc_lock); + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); + +int +fork1(td, flags, pages, procp) + struct thread *td; + int flags; + int pages; + struct proc **procp; +{ + struct proc *p1, *p2, *pptr; + uid_t uid; + struct proc *newproc; + int ok, trypid; + static int curfail, pidchecked = 0; + static struct timeval lastfail; + struct filedesc *fd; + struct filedesc_to_leader *fdtol; + struct thread *td2; + struct ksegrp *kg2; + struct sigacts *newsigacts; + int error; + + /* Can't copy and clear. */ + if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + return (EINVAL); + + p1 = td->td_proc; + + /* + * Here we don't create a new process, but we divorce + * certain parts of a process from itself. + */ + if ((flags & RFPROC) == 0) { + mtx_lock(&Giant); + vm_forkproc(td, NULL, NULL, flags); + mtx_unlock(&Giant); + + /* + * Close all file descriptors. + */ + if (flags & RFCFDG) { + struct filedesc *fdtmp; + FILEDESC_LOCK(td->td_proc->p_fd); + fdtmp = fdinit(td->td_proc->p_fd); + FILEDESC_UNLOCK(td->td_proc->p_fd); + fdfree(td); + p1->p_fd = fdtmp; + } + + /* + * Unshare file descriptors (from parent). 
+ */ + if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); + if (p1->p_fd->fd_refcnt > 1) { + struct filedesc *newfd; + + newfd = fdcopy(td->td_proc->p_fd); + FILEDESC_UNLOCK(p1->p_fd); + fdfree(td); + p1->p_fd = newfd; + } else + FILEDESC_UNLOCK(p1->p_fd); + } + *procp = NULL; + return (0); + } + + /* + * Note 1:1 allows for forking with one thread coming out on the + * other side with the expectation that the process is about to + * exec. + */ + if (p1->p_flag & P_HADTHREADS) { + /* + * Idle the other threads for a second. + * Since the user space is copied, it must remain stable. + * In addition, all threads (from the user perspective) + * need to either be suspended or in the kernel, + * where they will try restart in the parent and will + * be aborted in the child. + */ + PROC_LOCK(p1); + if (thread_single(SINGLE_NO_EXIT)) { + /* Abort. Someone else is single threading before us. */ + PROC_UNLOCK(p1); + return (ERESTART); + } + PROC_UNLOCK(p1); + /* + * All other activity in this process + * is now suspended at the user boundary, + * (or other safe places if we think of any). + */ + } + + /* Allocate new proc. */ + newproc = uma_zalloc(proc_zone, M_WAITOK); +#ifdef MAC + mac_init_proc(newproc); +#endif + knlist_init(&newproc->p_klist, &newproc->p_mtx); + + /* We have to lock the process tree while we look for a pid. */ + sx_slock(&proctree_lock); + + /* + * Although process entries are dynamically created, we still keep + * a global limit on the maximum number we will create. Don't allow + * a nonprivileged user to use the last ten processes; don't let root + * exceed the limit. The variable nprocs is the current number of + * processes, maxproc is the limit. + */ + sx_xlock(&allproc_lock); + uid = td->td_ucred->cr_ruid; + if ((nprocs >= maxproc - 10 && + suser_cred(td->td_ucred, SUSER_RUID) != 0) || + nprocs >= maxproc) { + error = EAGAIN; + goto fail; + } + + /* + * Increment the count of procs running with this uid. 
Don't allow + * a nonprivileged user to exceed their current limit. + */ + PROC_LOCK(p1); + ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, + (uid != 0) ? lim_cur(p1, RLIMIT_NPROC) : 0); + PROC_UNLOCK(p1); + if (!ok) { + error = EAGAIN; + goto fail; + } + + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + nprocs++; + + /* + * Find an unused process ID. We remember a range of unused IDs + * ready to use (from lastpid+1 through pidchecked-1). + * + * If RFHIGHPID is set (used during system boot), do not allocate + * low-numbered pids. + */ + trypid = lastpid + 1; + if (flags & RFHIGHPID) { + if (trypid < 10) + trypid = 10; + } else { + if (randompid) + trypid += arc4random() % randompid; + } +retry: + /* + * If the process ID prototype has wrapped around, + * restart somewhat above 0, as the low-numbered procs + * tend to include daemons that don't exit. + */ + if (trypid >= PID_MAX) { + trypid = trypid % PID_MAX; + if (trypid < 100) + trypid += 100; + pidchecked = 0; + } + if (trypid >= pidchecked) { + int doingzomb = 0; + + pidchecked = PID_MAX; + /* + * Scan the active and zombie procs to check whether this pid + * is in use. Remember the lowest pid that's greater + * than trypid, so we can avoid checking for a while. 
+ */ + p2 = LIST_FIRST(&allproc); +again: + for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { + PROC_LOCK(p2); + while (p2->p_pid == trypid || + (p2->p_pgrp != NULL && + (p2->p_pgrp->pg_id == trypid || + (p2->p_session != NULL && + p2->p_session->s_sid == trypid)))) { + trypid++; + if (trypid >= pidchecked) { + PROC_UNLOCK(p2); + goto retry; + } + } + if (p2->p_pid > trypid && pidchecked > p2->p_pid) + pidchecked = p2->p_pid; + if (p2->p_pgrp != NULL) { + if (p2->p_pgrp->pg_id > trypid && + pidchecked > p2->p_pgrp->pg_id) + pidchecked = p2->p_pgrp->pg_id; + if (p2->p_session != NULL && + p2->p_session->s_sid > trypid && + pidchecked > p2->p_session->s_sid) + pidchecked = p2->p_session->s_sid; + } + PROC_UNLOCK(p2); + } + if (!doingzomb) { + doingzomb = 1; + p2 = LIST_FIRST(&zombproc); + goto again; + } + } + sx_sunlock(&proctree_lock); + + /* + * RFHIGHPID does not mess with the lastpid counter during boot. + */ + if (flags & RFHIGHPID) + pidchecked = 0; + else + lastpid = trypid; + + p2 = newproc; + p2->p_state = PRS_NEW; /* protect against others */ + p2->p_pid = trypid; + LIST_INSERT_HEAD(&allproc, p2, p_list); + LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); + sx_xunlock(&allproc_lock); + + /* + * Malloc things while we don't hold any locks. + */ + if (flags & RFSIGSHARE) + newsigacts = NULL; + else + newsigacts = sigacts_alloc(); + + /* + * Copy filedesc. + */ + if (flags & RFCFDG) { + FILEDESC_LOCK(td->td_proc->p_fd); + fd = fdinit(td->td_proc->p_fd); + FILEDESC_UNLOCK(td->td_proc->p_fd); + fdtol = NULL; + } else if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); + fd = fdcopy(td->td_proc->p_fd); + FILEDESC_UNLOCK(p1->p_fd); + fdtol = NULL; + } else { + fd = fdshare(p1->p_fd); + if (p1->p_fdtol == NULL) + p1->p_fdtol = + filedesc_to_leader_alloc(NULL, + NULL, + p1->p_leader); + if ((flags & RFTHREAD) != 0) { + /* + * Shared file descriptor table and + * shared process leaders. 
+ */ + fdtol = p1->p_fdtol; + FILEDESC_LOCK(p1->p_fd); + fdtol->fdl_refcount++; + FILEDESC_UNLOCK(p1->p_fd); + } else { + /* + * Shared file descriptor table, and + * different process leaders + */ + fdtol = filedesc_to_leader_alloc(p1->p_fdtol, + p1->p_fd, + p2); + } + } + /* + * Make a proc table entry for the new process. + * Start by zeroing the section of proc that is zero-initialized, + * then copy the section that is copied directly from the parent. + */ + td2 = FIRST_THREAD_IN_PROC(p2); + kg2 = FIRST_KSEGRP_IN_PROC(p2); + + /* Allocate and switch to an alternate kstack if specified. */ + if (pages != 0) + vm_thread_new_altkstack(td2, pages); + + PROC_LOCK(p2); + PROC_LOCK(p1); + +#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) + + bzero(&p2->p_startzero, + (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); + bzero(&td2->td_startzero, + (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); + bzero(&kg2->kg_startzero, + (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); + + bcopy(&p1->p_startcopy, &p2->p_startcopy, + (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy)); + bcopy(&td->td_startcopy, &td2->td_startcopy, + (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); + bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy, + (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); +#undef RANGEOF + + td2->td_sigstk = td->td_sigstk; + + /* + * Duplicate sub-structures as needed. + * Increase reference counts on shared objects. + * The p_stats substruct is set in vm_forkproc. + */ + p2->p_flag = 0; + if (p1->p_flag & P_PROFIL) + startprofclock(p2); + mtx_lock_spin(&sched_lock); + p2->p_sflag = PS_INMEM; + /* + * Allow the scheduler to adjust the priority of the child and + * parent while we hold the sched_lock. 
+ */ + sched_fork(td, td2); + + mtx_unlock_spin(&sched_lock); + p2->p_ucred = crhold(td->td_ucred); + td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */ + + pargs_hold(p2->p_args); + + if (flags & RFSIGSHARE) { + p2->p_sigacts = sigacts_hold(p1->p_sigacts); + } else { + sigacts_copy(newsigacts, p1->p_sigacts); + p2->p_sigacts = newsigacts; + } + if (flags & RFLINUXTHPN) + p2->p_sigparent = SIGUSR1; + else + p2->p_sigparent = SIGCHLD; + + p2->p_textvp = p1->p_textvp; + p2->p_fd = fd; + p2->p_fdtol = fdtol; + + /* + * p_limit is copy-on-write. Bump its refcount. + */ + p2->p_limit = lim_hold(p1->p_limit); + PROC_UNLOCK(p1); + PROC_UNLOCK(p2); + + /* Bump references to the text vnode (for procfs) */ + if (p2->p_textvp) + vref(p2->p_textvp); + + /* + * Set up linkage for kernel based threading. + */ + if ((flags & RFTHREAD) != 0) { + mtx_lock(&ppeers_lock); + p2->p_peers = p1->p_peers; + p1->p_peers = p2; + p2->p_leader = p1->p_leader; + mtx_unlock(&ppeers_lock); + PROC_LOCK(p1->p_leader); + if ((p1->p_leader->p_flag & P_WEXIT) != 0) { + PROC_UNLOCK(p1->p_leader); + /* + * The task leader is exiting, so process p1 is + * going to be killed shortly. Since p1 obviously + * isn't dead yet, we know that the leader is either + * sending SIGKILL's to all the processes in this + * task or is sleeping waiting for all the peers to + * exit. We let p1 complete the fork, but we need + * to go ahead and kill the new process p2 since + * the task leader may not get a chance to send + * SIGKILL to it. We leave it on the list so that + * the task leader will wait for this new process + * to commit suicide. + */ + PROC_LOCK(p2); + psignal(p2, SIGKILL); + PROC_UNLOCK(p2); + } else + PROC_UNLOCK(p1->p_leader); + } else { + p2->p_peers = NULL; + p2->p_leader = p2; + } + + sx_xlock(&proctree_lock); + PGRP_LOCK(p1->p_pgrp); + PROC_LOCK(p2); + PROC_LOCK(p1); + + /* + * Preserve some more flags in subprocess. P_PROFIL has already + * been preserved. 
+ */ + p2->p_flag |= p1->p_flag & P_SUGID; + td2->td_pflags |= td->td_pflags & TDP_ALTSTACK; + SESS_LOCK(p1->p_session); + if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) + p2->p_flag |= P_CONTROLT; + SESS_UNLOCK(p1->p_session); + if (flags & RFPPWAIT) + p2->p_flag |= P_PPWAIT; + + p2->p_pgrp = p1->p_pgrp; + LIST_INSERT_AFTER(p1, p2, p_pglist); + PGRP_UNLOCK(p1->p_pgrp); + LIST_INIT(&p2->p_children); + + callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); + +#ifdef KTRACE + /* + * Copy traceflag and tracefile if enabled. + */ + mtx_lock(&ktrace_mtx); + KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); + if (p1->p_traceflag & KTRFAC_INHERIT) { + p2->p_traceflag = p1->p_traceflag; + if ((p2->p_tracevp = p1->p_tracevp) != NULL) { + VREF(p2->p_tracevp); + KASSERT(p1->p_tracecred != NULL, + ("ktrace vnode with no cred")); + p2->p_tracecred = crhold(p1->p_tracecred); + } + } + mtx_unlock(&ktrace_mtx); +#endif + + /* + * If PF_FORK is set, the child process inherits the + * procfs ioctl flags from its parent. + */ + if (p1->p_pfsflags & PF_FORK) { + p2->p_stops = p1->p_stops; + p2->p_pfsflags = p1->p_pfsflags; + } + + /* + * This begins the section where we must prevent the parent + * from being swapped. + */ + _PHOLD(p1); + PROC_UNLOCK(p1); + + /* + * Attach the new process to its parent. + * + * If RFNOWAIT is set, the newly created process becomes a child + * of init. This effectively disassociates the child from the + * parent. + */ + if (flags & RFNOWAIT) + pptr = initproc; + else + pptr = p1; + p2->p_pptr = pptr; + LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); + sx_xunlock(&proctree_lock); + + /* Inform accounting that we have forked. */ + p2->p_acflag = AFORK; + PROC_UNLOCK(p2); + + /* + * Finish creating the child process. It will return via a different + * execution path later. 
(ie: directly into user mode) + */ + mtx_lock(&Giant); + vm_forkproc(td, p2, td2, flags); + + if (flags == (RFFDG | RFPROC)) { + cnt.v_forks++; + cnt.v_forkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { + cnt.v_vforks++; + cnt.v_vforkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else if (p1 == &proc0) { + cnt.v_kthreads++; + cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } else { + cnt.v_rforks++; + cnt.v_rforkpages += p2->p_vmspace->vm_dsize + + p2->p_vmspace->vm_ssize; + } + mtx_unlock(&Giant); + + /* + * Both processes are set up, now check if any loadable modules want + * to adjust anything. + * What if they have an error? XXX + */ + EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); + + /* + * Set the child start time and mark the process as being complete. + */ + microuptime(&p2->p_stats->p_start); + mtx_lock_spin(&sched_lock); + p2->p_state = PRS_NORMAL; + + /* + * If RFSTOPPED not requested, make child runnable and add to + * run queue. + */ + if ((flags & RFSTOPPED) == 0) { + TD_SET_CAN_RUN(td2); + setrunqueue(td2, SRQ_BORING); + } + mtx_unlock_spin(&sched_lock); + + /* + * Now can be swapped. + */ + PROC_LOCK(p1); + _PRELE(p1); + + /* + * Tell any interested parties about the new process. + */ + KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid); + + PROC_UNLOCK(p1); + + /* + * Preserve synchronization semantics of vfork. If waiting for + * child to exec or exit, set P_PPWAIT on child, and sleep on our + * proc (in case of exit). + */ + PROC_LOCK(p2); + while (p2->p_flag & P_PPWAIT) + msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); + PROC_UNLOCK(p2); + + /* + * If other threads are waiting, let them continue now. + */ + if (p1->p_flag & P_HADTHREADS) { + PROC_LOCK(p1); + thread_single_end(); + PROC_UNLOCK(p1); + } + + /* + * Return child proc pointer to parent. 
+ */ + *procp = p2; + return (0); +fail: + sx_sunlock(&proctree_lock); + if (ppsratecheck(&lastfail, &curfail, 1)) + printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n", + uid); + sx_xunlock(&allproc_lock); +#ifdef MAC + mac_destroy_proc(newproc); +#endif + uma_zfree(proc_zone, newproc); + if (p1->p_flag & P_HADTHREADS) { + PROC_LOCK(p1); + thread_single_end(); + PROC_UNLOCK(p1); + } + tsleep(&forksleep, PUSER, "fork", hz / 2); + return (error); +} + +/* + * Handle the return of a child process from fork1(). This function + * is called from the MD fork_trampoline() entry point. + */ +void +fork_exit(callout, arg, frame) + void (*callout)(void *, struct trapframe *); + void *arg; + struct trapframe *frame; +{ + struct proc *p; + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + td = curthread; + p = td->td_proc; + td->td_oncpu = PCPU_GET(cpuid); + KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); + + sched_lock.mtx_lock = (uintptr_t)td; + mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + cpu_critical_fork_exit(); + CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)", + td, td->td_sched, p->p_pid, p->p_comm); + + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. + */ + + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + td = curthread; + mtx_unlock_spin(&sched_lock); + + /* + * cpu_set_fork_handler intercepts this function call to + * have this call a non-return function to stay in kernel mode. + * initproc has its own fork handler, but it does return. 
+ */ + KASSERT(callout != NULL, ("NULL callout in fork_exit")); + callout(arg, frame); + + /* + * Check if a kernel thread misbehaved and returned from its main + * function. + */ + PROC_LOCK(p); + if (p->p_flag & P_KTHREAD) { + PROC_UNLOCK(p); + printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", + p->p_comm, p->p_pid); + kthread_exit(0); + } + PROC_UNLOCK(p); + mtx_assert(&Giant, MA_NOTOWNED); +} + +/* + * Simplified back end of syscall(), used when returning from fork() + * directly into user mode. Giant is not held on entry, and must not + * be held on return. This function is passed in to fork_exit() as the + * first parameter and is called when returning to a new userland process. + */ +void +fork_return(td, frame) + struct thread *td; + struct trapframe *frame; +{ + + userret(td, frame, 0); +#ifdef KTRACE + if (KTRPOINT(td, KTR_SYSRET)) + ktrsysret(SYS_fork, 0, 0); +#endif + mtx_assert(&Giant, MA_NOTOWNED); +} diff --git a/freebsd-5.3-xen-sparse/mkbuildtree b/freebsd-5.3-xen-sparse/mkbuildtree new file mode 100644 index 0000000000..ce4c91d431 --- /dev/null +++ b/freebsd-5.3-xen-sparse/mkbuildtree @@ -0,0 +1,119 @@ +#!/bin/bash + +# mkbuildtree <build tree> +# +# Creates symbolic links in <build tree> for the sparse tree +# in the current directory. + +# Script to determine the relative path between two directories. +# Copyright (c) D. J. Hawkey Jr. 2002 +# Fixed for Xen project by K. Fraser in 2003. +abs_to_rel () +{ + local CWD SRCPATH + + if [ "$1" != "/" -a "${1##*[^/]}" = "/" ]; then + SRCPATH=${1%?} + else + SRCPATH=$1 + fi + if [ "$2" != "/" -a "${2##*[^/]}" = "/" ]; then + DESTPATH=${2%?} + else + DESTPATH=$2 + fi + + CWD=$PWD + [ "${1%%[^/]*}" != "/" ] && cd $1 && SRCPATH=$PWD + [ "${2%%[^/]*}" != "/" ] && cd $2 && DESTPATH=$PWD + [ "$CWD" != "$PWD" ] && cd $CWD + + BASEPATH=$SRCPATH + + [ "$SRCPATH" = "$DESTPATH" ] && DESTPATH="." 
&& return + [ "$SRCPATH" = "/" ] && DESTPATH=${DESTPATH#?} && return + + while [ "$BASEPATH/" != "${DESTPATH%${DESTPATH#$BASEPATH/}}" ]; do + BASEPATH=${BASEPATH%/*} + done + + SRCPATH=${SRCPATH#$BASEPATH} + DESTPATH=${DESTPATH#$BASEPATH} + DESTPATH=${DESTPATH#?} + while [ -n "$SRCPATH" ]; do + SRCPATH=${SRCPATH%/*} + DESTPATH="../$DESTPATH" + done + + [ -z "$BASEPATH" ] && BASEPATH="/" + [ "${DESTPATH##*[^/]}" = "/" ] && DESTPATH=${DESTPATH%?} +} + +# relative_lndir <target_dir> +# Creates a tree of symlinks in the current working directory that mirror +# real files in <target_dir>. <target_dir> should be relative to the current +# working directory. Symlinks in <target_dir> are ignored. Source-control files +# are ignored. +relative_lndir () +{ + local SYMLINK_DIR REAL_DIR pref i j + SYMLINK_DIR=$PWD + REAL_DIR=$1 + ( + cd $REAL_DIR + for i in `find . -type d | grep -v SCCS`; do + [ -d $SYMLINK_DIR/$i ] || mkdir -p $SYMLINK_DIR/$i + ( + cd $i + pref=`echo $i | sed -e 's#/[^/]*#../#g' -e 's#^\.##'` + for j in `find . -type f -o -type l -maxdepth 1`; do + ln -sf ${pref}${REAL_DIR}/$i/$j ${SYMLINK_DIR}/$i/$j + done + ) + done + ) +} + +[ "$1" == "" ] && { echo "Syntax: $0 <linux tree to xenify>"; exit 1; } + +# Get absolute path to the destination directory +pushd . 
>/dev/null +cd ${1} +AD=$PWD +popd >/dev/null + +# Get absolute path to the source directory +AS=`pwd` + +# Get name of sparse directory +SDN=$(basename $AS) + +# Get path to source, relative to destination +abs_to_rel ${AD} ${AS} +RS=$DESTPATH + +# Remove old copies of files and directories at the destination +for i in `find sys -type f -o -type l` ; do rm -f ${AD}/${i#./} ; done + +# We now work from the destination directory +cd ${AD} + +# Remove old symlinks +find sys -type l | while read f +do + case $(readlink $f) in + */$SDN/*) + rm -f $f + ;; + esac +done + +if [ -f ${AD}/BUILDING ]; then + # Create symlinks of files and directories which exist in the sparse source + (cd sys && relative_lndir ../${RS}/sys) +else + # Create symlinks of files and directories which exist in the sparse source + relative_lndir ${RS} + rm -f mkbuildtree +fi + diff --git a/freebsd-5.3-xen-sparse/xenfbsd_kernel_build b/freebsd-5.3-xen-sparse/xenfbsd_kernel_build new file mode 100644 index 0000000000..dc2c927c06 --- /dev/null +++ b/freebsd-5.3-xen-sparse/xenfbsd_kernel_build @@ -0,0 +1,7 @@ +#!/bin/csh -f +cd i386-xen/conf +config XENCONF +cd ../compile/XENCONF +make kernel-clean +ln -s ../../include/xen-public/io/ring.h +make kernel-depend; make -j4 kernel |