diff options
author | vh249@kneesaa.uk.xensource.com <vh249@kneesaa.uk.xensource.com> | 2005-07-11 09:35:19 -0500 |
---|---|---|
committer | vh249@kneesaa.uk.xensource.com <vh249@kneesaa.uk.xensource.com> | 2005-07-11 09:35:19 -0500 |
commit | a7585e4041167bd489707e7c7bb1e54718888568 (patch) | |
tree | bb2f2b8c211a034f6378f19b77c4b97bfc91799f | |
parent | 105077619922d8c782b74491afd1b406dc654fa7 (diff) | |
download | xen-a7585e4041167bd489707e7c7bb1e54718888568.tar.gz xen-a7585e4041167bd489707e7c7bb1e54718888568.tar.bz2 xen-a7585e4041167bd489707e7c7bb1e54718888568.zip |
upgrade linux sparse tree from 2.6.11 to 2.6.12
Signed-off-by: Vincent Hanquez <vincent@xensource.com>
--HG--
rename : patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch => patches/linux-2.6.12/i386-cpu-hotplug-updated-for-mm.patch
rename : patches/linux-2.6.11/net-csum.patch => patches/linux-2.6.12/net-csum.patch
rename : patches/linux-2.6.11/rcu-nohz.patch => patches/linux-2.6.12/rcu-nohz.patch
rename : patches/linux-2.6.11/smp-alts.patch => patches/linux-2.6.12/smp-alts.patch
rename : patches/linux-2.6.11/x86_64-linux.patch => patches/linux-2.6.12/x86_64-linux.patch
100 files changed, 3548 insertions, 6107 deletions
diff --git a/buildconfigs/mk.linux-2.6-xen0 b/buildconfigs/mk.linux-2.6-xen0 index 72f49267ca..b06f289078 100644 --- a/buildconfigs/mk.linux-2.6-xen0 +++ b/buildconfigs/mk.linux-2.6-xen0 @@ -2,7 +2,7 @@ OS = linux LINUX_SERIES = 2.6 -LINUX_VER = 2.6.11 +LINUX_VER = 2.6.12 EXTRAVERSION = xen0 diff --git a/buildconfigs/mk.linux-2.6-xenU b/buildconfigs/mk.linux-2.6-xenU index bd9856c39b..c98e296742 100644 --- a/buildconfigs/mk.linux-2.6-xenU +++ b/buildconfigs/mk.linux-2.6-xenU @@ -2,7 +2,7 @@ OS = linux LINUX_SERIES = 2.6 -LINUX_VER = 2.6.11 +LINUX_VER = 2.6.12 EXTRAVERSION = xenU diff --git a/linux-2.6-xen-sparse/arch/xen/Kconfig b/linux-2.6-xen-sparse/arch/xen/Kconfig index 480c4e8fd1..9ff5a27ad1 100644 --- a/linux-2.6-xen-sparse/arch/xen/Kconfig +++ b/linux-2.6-xen-sparse/arch/xen/Kconfig @@ -194,3 +194,5 @@ source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "arch/xen/Kconfig.debug" diff --git a/linux-2.6-xen-sparse/arch/xen/Kconfig.debug b/linux-2.6-xen-sparse/arch/xen/Kconfig.debug new file mode 100644 index 0000000000..663eacc9e5 --- /dev/null +++ b/linux-2.6-xen-sparse/arch/xen/Kconfig.debug @@ -0,0 +1,129 @@ +menu "Kernel hacking" + +source "lib/Kconfig.debug" + +# X86 +config EARLY_PRINTK + bool "Early printk" if EMBEDDED && DEBUG_KERNEL + default y + depends on X86 + help + Write kernel log output directly into the VGA buffer or to a serial + port. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server. You should normally N here, + unless you want to debug such a crash. + +config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL && X86 + +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL && X86 + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. 
register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". + +config DEBUG_STACK_USAGE + bool "Stack utilization instrumentation" + depends on DEBUG_KERNEL && X86 + help + Enables the display of the minimum amount of free stack which each + task has ever had available in the sysrq-T and sysrq-P debug output. + + This option will slow down process creation somewhat. + +comment "Page alloc debug is incompatible with Software Suspend on i386" + depends on DEBUG_KERNEL && SOFTWARE_SUSPEND && X86 + +config DEBUG_PAGEALLOC + bool "Page alloc debugging" + depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND && X86 + help + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + +config 4KSTACKS + bool "Use 4Kb for kernel stacks instead of 8Kb" + depends on DEBUG_KERNEL && X86 + help + If you say Y here the kernel will use a 4Kb stacksize for the + kernel stack attached to each process/thread. This facilitates + running more threads on a system and also reduces the pressure + on the VM subsystem for higher order allocations. This option + will also use IRQ stacks to compensate for the reduced stackspace. + +config X86_FIND_SMP_CONFIG + bool + depends on X86_LOCAL_APIC || X86_VOYAGER && X86 + default y + +config X86_MPPARSE + bool + depends on X86_LOCAL_APIC && !X86_VISWS && X86 + default y + +# X86_64 + +# !SMP for now because the context switch early causes GPF in segment reloading +# and the GS base checking does the wrong thing then, causing a hang. +config CHECKING + bool "Additional run-time checks" + depends on DEBUG_KERNEL && !SMP && X86_64 + help + Enables some internal consistency checks for kernel debugging. + You should normally say N. 
+ +config INIT_DEBUG + bool "Debug __init statements" + depends on DEBUG_KERNEL && X86_64 + help + Fill __init and __initdata at the end of boot. This helps debugging + illegal uses of __init and __initdata after initialization. + +config IOMMU_DEBUG + depends on GART_IOMMU && DEBUG_KERNEL && X86_64 + bool "Enable IOMMU debugging" + help + Force the IOMMU to on even when you have less than 4GB of + memory and add debugging code. On overflow always panic. And + allow to enable IOMMU leak tracing. Can be disabled at boot + time with iommu=noforce. This will also enable scatter gather + list merging. Currently not recommended for production + code. When you use it make sure you have a big enough + IOMMU/AGP aperture. Most of the options enabled by this can + be set more finegrained using the iommu= command line + options. See Documentation/x86_64/boot-options.txt for more + details. + +config IOMMU_LEAK + bool "IOMMU leak tracing" + depends on DEBUG_KERNEL && X86_64 + depends on IOMMU_DEBUG + help + Add a simple leak tracer to the IOMMU code. This is useful when you + are debugging a buggy device driver that leaks IOMMU mappings. + +#config X86_REMOTE_DEBUG +# bool "kgdb debugging stub" + +# X86 & X86_64 +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". 
+ +endmenu diff --git a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 index 4df0524b23..ba014eac18 100644 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.11.12-xen0 -# Wed Jul 6 18:26:29 2005 +# Linux kernel version: 2.6.12-xen0 +# Sat Jul 9 09:19:47 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -34,6 +34,7 @@ CONFIG_EXPERIMENTAL=y CONFIG_BROKEN=y CONFIG_BROKEN_ON_SMP=y CONFIG_LOCK_KERNEL=y +CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup @@ -45,7 +46,6 @@ CONFIG_SYSVIPC=y # CONFIG_BSD_PROCESS_ACCT is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set -CONFIG_LOG_BUF_SHIFT=14 CONFIG_HOTPLUG=y CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set @@ -53,15 +53,18 @@ CONFIG_KOBJECT_UEVENT=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SHMEM=y CONFIG_CC_ALIGN_FUNCTIONS=0 CONFIG_CC_ALIGN_LABELS=0 CONFIG_CC_ALIGN_LOOPS=0 CONFIG_CC_ALIGN_JUMPS=0 # CONFIG_TINY_SHMEM is not set +CONFIG_BASE_SMALL=0 # # Loadable module support @@ -101,6 +104,7 @@ CONFIG_MPENTIUM4=y # CONFIG_MWINCHIPC6 is not set # CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set +# CONFIG_MGEODEGX1 is not set # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set # CONFIG_X86_GENERIC is not set @@ -121,6 +125,7 @@ CONFIG_X86_USE_PPRO_CHECKSUM=y # CONFIG_SMP is not set CONFIG_PREEMPT=y CONFIG_PREEMPT_BKL=y +# CONFIG_X86_REBOOTFIXUPS is not set CONFIG_MICROCODE=y CONFIG_X86_CPUID=y @@ -154,6 +159,8 @@ CONFIG_PCI_MMCONFIG=y # CONFIG_PCI_MSI is not set CONFIG_PCI_LEGACY_PROC=y # CONFIG_PCI_NAMES is not set +# CONFIG_PCI_DEBUG is not set +CONFIG_ISA_DMA_API=y CONFIG_ISA=y # 
CONFIG_EISA is not set # CONFIG_MCA is not set @@ -165,11 +172,6 @@ CONFIG_ISA=y # CONFIG_PCCARD is not set # -# PC-card bridges -# -CONFIG_PCMCIA_PROBE=y - -# # PCI Hotplug Support # # CONFIG_HOTPLUG_PCI is not set @@ -177,8 +179,10 @@ CONFIG_PCMCIA_PROBE=y # # Kernel hacking # +# CONFIG_PRINTK_TIME is not set CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y +CONFIG_LOG_BUF_SHIFT=14 # CONFIG_SCHEDSTATS is not set # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_PREEMPT is not set @@ -202,6 +206,7 @@ CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_X86_BIOS_REBOOT=y CONFIG_PC=y +CONFIG_SECCOMP=y # # Executable file formats @@ -358,7 +363,7 @@ CONFIG_BLK_DEV_SD=y # # SCSI Transport Attributes # -# CONFIG_SCSI_SPI_ATTRS is not set +CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_FC_ATTRS is not set # CONFIG_SCSI_ISCSI_ATTRS is not set @@ -435,6 +440,7 @@ CONFIG_SCSI_QLA2XXX=y # CONFIG_SCSI_QLA2300 is not set # CONFIG_SCSI_QLA2322 is not set # CONFIG_SCSI_QLA6312 is not set +# CONFIG_SCSI_LPFC is not set # CONFIG_SCSI_SEAGATE is not set # CONFIG_SCSI_SYM53C416 is not set # CONFIG_SCSI_DC395x is not set @@ -468,6 +474,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_DM_SNAPSHOT=y CONFIG_DM_MIRROR=y # CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set # # Fusion MPT device support @@ -496,7 +503,6 @@ CONFIG_NET=y # CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set -# CONFIG_NETLINK_DEV is not set CONFIG_UNIX=y # CONFIG_NET_KEY is not set CONFIG_INET=y @@ -676,7 +682,6 @@ CONFIG_PCNET32=y # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y -# CONFIG_E100_NAPI is not set # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set CONFIG_NE2K_PCI=y @@ -709,6 +714,7 @@ CONFIG_E1000=y # CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y +# CONFIG_BNX2 is not set # # Ethernet (10000 Mbit) @@ -766,19 +772,6 @@ CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 # CONFIG_INPUT_EVBUG is not set # -# Input I/O drivers -# -# CONFIG_GAMEPORT is not set -CONFIG_SOUND_GAMEPORT=y 
-CONFIG_SERIO=y -CONFIG_SERIO_I8042=y -CONFIG_SERIO_SERPORT=y -# CONFIG_SERIO_CT82C710 is not set -# CONFIG_SERIO_PCIPS2 is not set -CONFIG_SERIO_LIBPS2=y -# CONFIG_SERIO_RAW is not set - -# # Input Device Drivers # CONFIG_INPUT_KEYBOARD=y @@ -799,6 +792,18 @@ CONFIG_MOUSE_PS2=y # CONFIG_INPUT_MISC is not set # +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=y +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PCIPS2 is not set +CONFIG_SERIO_LIBPS2=y +# CONFIG_SERIO_RAW is not set +# CONFIG_GAMEPORT is not set + +# # Character devices # CONFIG_VT=y @@ -814,6 +819,7 @@ CONFIG_HW_CONSOLE=y # # Non-8250 serial port support # +# CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 @@ -846,7 +852,6 @@ CONFIG_AGP_ATI=m CONFIG_AGP_AMD=m CONFIG_AGP_AMD64=m CONFIG_AGP_INTEL=m -CONFIG_AGP_INTEL_MCH=m CONFIG_AGP_NVIDIA=m CONFIG_AGP_SIS=m CONFIG_AGP_SWORKS=m @@ -868,6 +873,11 @@ CONFIG_DRM_SIS=m # CONFIG_HANGCHECK_TIMER is not set # +# TPM devices +# +# CONFIG_TCG_TPM is not set + +# # I2C support # # CONFIG_I2C is not set @@ -913,6 +923,8 @@ CONFIG_DUMMY_CONSOLE=y # # USB support # +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB=y # CONFIG_USB_DEBUG is not set @@ -923,14 +935,14 @@ CONFIG_USB=y # CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_OTG is not set -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB_ARCH_HAS_OHCI=y # # USB Host Controller Drivers # # CONFIG_USB_EHCI_HCD is not set CONFIG_USB_OHCI_HCD=y +# CONFIG_USB_OHCI_BIG_ENDIAN is not set +CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set @@ -967,7 +979,6 @@ CONFIG_USB_HIDINPUT=y # # CONFIG_USB_MDC800 is not set # CONFIG_USB_MICROTEK is not set -# CONFIG_USB_HPUSBSCSI is not set # # USB Multimedia devices @@ -986,6 +997,7 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_PEGASUS is not set # CONFIG_USB_RTL8150 is not set # CONFIG_USB_USBNET is not set 
+CONFIG_USB_MON=y # # USB port drivers @@ -1232,6 +1244,7 @@ CONFIG_CRYPTO_SHA1=m # CONFIG_CRYPTO_SHA256 is not set # CONFIG_CRYPTO_SHA512 is not set # CONFIG_CRYPTO_WP512 is not set +# CONFIG_CRYPTO_TGR192 is not set CONFIG_CRYPTO_DES=m # CONFIG_CRYPTO_BLOWFISH is not set # CONFIG_CRYPTO_TWOFISH is not set diff --git a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 index 2150f4381b..b1e78d5503 100644 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.11.1-xen0 -# Tue May 10 11:07:02 2005 +# Linux kernel version: 2.6.12-xen0 +# Wed Jun 29 10:01:20 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -33,6 +33,7 @@ CONFIG_EXPERIMENTAL=y # CONFIG_CLEAN_COMPILE is not set CONFIG_BROKEN=y CONFIG_BROKEN_ON_SMP=y +CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup @@ -44,22 +45,24 @@ CONFIG_SYSVIPC=y # CONFIG_BSD_PROCESS_ACCT is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set -CONFIG_LOG_BUF_SHIFT=14 # CONFIG_HOTPLUG is not set CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SHMEM=y CONFIG_CC_ALIGN_FUNCTIONS=0 CONFIG_CC_ALIGN_LABELS=0 CONFIG_CC_ALIGN_LOOPS=0 CONFIG_CC_ALIGN_JUMPS=0 # CONFIG_TINY_SHMEM is not set +CONFIG_BASE_SMALL=0 # # Loadable module support @@ -74,6 +77,7 @@ CONFIG_KMOD=y CONFIG_XENARCH="x86_64" CONFIG_X86=y CONFIG_MMU=y +CONFIG_UID16=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_X86_CMPXCHG=y @@ -93,15 +97,17 @@ CONFIG_X86_IO_APIC=y CONFIG_PCI=y CONFIG_PCI_DIRECT=y # CONFIG_PCI_MMCONFIG is not set -CONFIG_EARLY_PRINTK=y +CONFIG_ISA_DMA_API=y CONFIG_GENERIC_HARDIRQS=y 
CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_SECCOMP=y # # X86_64 processor configuration # CONFIG_X86_64=y CONFIG_64BIT=y +CONFIG_EARLY_PRINTK=y # # Processor type and features @@ -135,6 +141,9 @@ CONFIG_DUMMY_IOMMU=y # CONFIG_IA32_EMULATION=y # CONFIG_IA32_AOUT is not set +CONFIG_COMPAT=y +CONFIG_SYSVIPC_COMPAT=y + # # Executable file formats # @@ -285,7 +294,7 @@ CONFIG_BLK_DEV_SD=y # # SCSI Transport Attributes # -# CONFIG_SCSI_SPI_ATTRS is not set +CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_FC_ATTRS is not set # CONFIG_SCSI_ISCSI_ATTRS is not set @@ -352,6 +361,7 @@ CONFIG_SCSI_QLA2XXX=y # CONFIG_SCSI_QLA2300 is not set # CONFIG_SCSI_QLA2322 is not set # CONFIG_SCSI_QLA6312 is not set +# CONFIG_SCSI_LPFC is not set # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set @@ -388,7 +398,6 @@ CONFIG_NET=y # CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set -# CONFIG_NETLINK_DEV is not set CONFIG_UNIX=y # CONFIG_NET_KEY is not set CONFIG_INET=y @@ -553,7 +562,6 @@ CONFIG_PCNET32=y # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y -# CONFIG_E100_NAPI is not set # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set CONFIG_NE2K_PCI=y @@ -584,6 +592,7 @@ CONFIG_E1000=y # CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y +# CONFIG_BNX2 is not set # # Ethernet (10000 Mbit) @@ -641,19 +650,6 @@ CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 # CONFIG_INPUT_EVBUG is not set # -# Input I/O drivers -# -# CONFIG_GAMEPORT is not set -CONFIG_SOUND_GAMEPORT=y -CONFIG_SERIO=y -CONFIG_SERIO_I8042=y -CONFIG_SERIO_SERPORT=y -# CONFIG_SERIO_CT82C710 is not set -# CONFIG_SERIO_PCIPS2 is not set -CONFIG_SERIO_LIBPS2=y -# CONFIG_SERIO_RAW is not set - -# # Input Device Drivers # CONFIG_INPUT_KEYBOARD=y @@ -671,6 +667,18 @@ CONFIG_MOUSE_PS2=y # CONFIG_INPUT_MISC is not set # +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=y +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PCIPS2 is not set 
+CONFIG_SERIO_LIBPS2=y +# CONFIG_SERIO_RAW is not set +# CONFIG_GAMEPORT is not set + +# # Character devices # CONFIG_VT=y @@ -686,6 +694,7 @@ CONFIG_HW_CONSOLE=y # # Non-8250 serial port support # +# CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 @@ -702,7 +711,6 @@ CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_HW_RANDOM is not set # CONFIG_NVRAM is not set CONFIG_RTC=y -# CONFIG_GEN_RTC is not set # CONFIG_DTLK is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set @@ -713,7 +721,7 @@ CONFIG_RTC=y # CONFIG_FTAPE is not set CONFIG_AGP=m CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL_MCH=m +# CONFIG_AGP_INTEL is not set CONFIG_DRM=m CONFIG_DRM_TDFX=m # CONFIG_DRM_GAMMA is not set @@ -727,6 +735,11 @@ CONFIG_DRM_SIS=m # CONFIG_HANGCHECK_TIMER is not set # +# TPM devices +# +# CONFIG_TCG_TPM is not set + +# # I2C support # # CONFIG_I2C is not set @@ -771,13 +784,9 @@ CONFIG_DUMMY_CONSOLE=y # # USB support # -# CONFIG_USB is not set CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y - -# -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information -# +# CONFIG_USB is not set # # USB Gadget Support @@ -994,6 +1003,7 @@ CONFIG_CRYPTO_SHA1=m # CONFIG_CRYPTO_SHA256 is not set # CONFIG_CRYPTO_SHA512 is not set # CONFIG_CRYPTO_WP512 is not set +# CONFIG_CRYPTO_TGR192 is not set CONFIG_CRYPTO_DES=m # CONFIG_CRYPTO_BLOWFISH is not set # CONFIG_CRYPTO_TWOFISH is not set @@ -1019,5 +1029,14 @@ CONFIG_CRYPTO_CRC32C=m # # CONFIG_CRC_CCITT is not set CONFIG_CRC32=y -CONFIG_LIBCRC32C=y +CONFIG_LIBCRC32C=m CONFIG_ZLIB_INFLATE=y + +# +# Kernel hacking +# +# CONFIG_PRINTK_TIME is not set +# CONFIG_DEBUG_KERNEL is not set +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y diff --git a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 index c2011b63b2..10d3dac92b 100644 --- 
a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.11.12-xenU -# Wed Jul 6 22:40:19 2005 +# Linux kernel version: 2.6.12-xenU +# Sun Jul 10 17:32:04 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -29,6 +29,7 @@ CONFIG_HAVE_ARCH_DEV_ALLOC_SKB=y CONFIG_EXPERIMENTAL=y CONFIG_CLEAN_COMPILE=y CONFIG_LOCK_KERNEL=y +CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup @@ -40,23 +41,26 @@ CONFIG_SYSVIPC=y # CONFIG_BSD_PROCESS_ACCT is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set -CONFIG_LOG_BUF_SHIFT=14 CONFIG_HOTPLUG=y CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set +# CONFIG_CPUSETS is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SHMEM=y CONFIG_CC_ALIGN_FUNCTIONS=0 CONFIG_CC_ALIGN_LABELS=0 CONFIG_CC_ALIGN_LOOPS=0 CONFIG_CC_ALIGN_JUMPS=0 # CONFIG_TINY_SHMEM is not set +CONFIG_BASE_SMALL=0 # # Loadable module support @@ -97,6 +101,7 @@ CONFIG_MPENTIUM4=y # CONFIG_MWINCHIPC6 is not set # CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set +# CONFIG_MGEODEGX1 is not set # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set # CONFIG_X86_GENERIC is not set @@ -120,6 +125,7 @@ CONFIG_NR_CPUS=8 # CONFIG_SCHED_SMT is not set CONFIG_PREEMPT=y CONFIG_PREEMPT_BKL=y +# CONFIG_X86_REBOOTFIXUPS is not set CONFIG_X86_CPUID=y # @@ -132,35 +138,14 @@ CONFIG_HIGHMEM=y CONFIG_HAVE_DEC_LOCK=y # CONFIG_REGPARM is not set CONFIG_HOTPLUG_CPU=y - -# -# Kernel hacking -# -CONFIG_DEBUG_KERNEL=y -CONFIG_MAGIC_SYSRQ=y -# CONFIG_SCHEDSTATS is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_PREEMPT is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_SPINLOCK_SLEEP is not set -# 
CONFIG_DEBUG_KOBJECT is not set -# CONFIG_DEBUG_HIGHMEM is not set -CONFIG_DEBUG_BUGVERBOSE=y -# CONFIG_DEBUG_INFO is not set -# CONFIG_DEBUG_FS is not set -# CONFIG_FRAME_POINTER is not set -CONFIG_EARLY_PRINTK=y -# CONFIG_DEBUG_STACKOVERFLOW is not set -# CONFIG_KPROBES is not set -# CONFIG_DEBUG_STACK_USAGE is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_4KSTACKS is not set CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_X86_SMP=y CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y CONFIG_PC=y +CONFIG_SECCOMP=y +CONFIG_EARLY_PRINTK=y # # Executable file formats @@ -256,7 +241,6 @@ CONFIG_NET=y # CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set -# CONFIG_NETLINK_DEV is not set CONFIG_UNIX=y # CONFIG_NET_KEY is not set CONFIG_INET=y @@ -523,6 +507,7 @@ CONFIG_CRYPTO_MD5=m # CONFIG_CRYPTO_SHA256 is not set # CONFIG_CRYPTO_SHA512 is not set # CONFIG_CRYPTO_WP512 is not set +# CONFIG_CRYPTO_TGR192 is not set # CONFIG_CRYPTO_DES is not set # CONFIG_CRYPTO_BLOWFISH is not set # CONFIG_CRYPTO_TWOFISH is not set @@ -551,3 +536,27 @@ CONFIG_CRYPTO_CRC32C=m # CONFIG_CRC32 is not set CONFIG_LIBCRC32C=m CONFIG_ZLIB_INFLATE=y + +# +# Kernel hacking +# +# CONFIG_PRINTK_TIME is not set +CONFIG_DEBUG_KERNEL=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_LOG_BUF_SHIFT=14 +# CONFIG_SCHEDSTATS is not set +# CONFIG_DEBUG_SLAB is not set +# CONFIG_DEBUG_PREEMPT is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_DEBUG_KOBJECT is not set +# CONFIG_DEBUG_HIGHMEM is not set +CONFIG_DEBUG_BUGVERBOSE=y +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_FS is not set +# CONFIG_FRAME_POINTER is not set +# CONFIG_DEBUG_STACKOVERFLOW is not set +# CONFIG_KPROBES is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_4KSTACKS is not set diff --git a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 index de7cc84038..68364af0d0 100644 
--- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.11.10-xenU -# Mon May 23 15:07:58 2005 +# Linux kernel version: 2.6.12-xenU +# Thu Jul 7 11:43:14 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -29,6 +29,7 @@ CONFIG_HAVE_ARCH_DEV_ALLOC_SKB=y CONFIG_EXPERIMENTAL=y CONFIG_CLEAN_COMPILE=y CONFIG_BROKEN_ON_SMP=y +CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup @@ -42,22 +43,24 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_SYSCTL=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y -CONFIG_LOG_BUF_SHIFT=14 CONFIG_HOTPLUG=y CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y CONFIG_KALLSYMS_EXTRA_PASS=y +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SHMEM=y CONFIG_CC_ALIGN_FUNCTIONS=0 CONFIG_CC_ALIGN_LABELS=0 CONFIG_CC_ALIGN_LOOPS=0 CONFIG_CC_ALIGN_JUMPS=0 # CONFIG_TINY_SHMEM is not set +CONFIG_BASE_SMALL=0 # # Loadable module support @@ -72,6 +75,7 @@ CONFIG_KMOD=y CONFIG_XENARCH="x86_64" CONFIG_X86=y CONFIG_MMU=y +CONFIG_UID16=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_X86_CMPXCHG=y @@ -89,15 +93,17 @@ CONFIG_X86_CPUID=y # CONFIG_X86_LOCAL_APIC is not set # CONFIG_X86_IO_APIC is not set # CONFIG_PCI is not set -CONFIG_EARLY_PRINTK=y +CONFIG_ISA_DMA_API=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_SECCOMP=y # # X86_64 processor configuration # CONFIG_X86_64=y CONFIG_64BIT=y +CONFIG_EARLY_PRINTK=y # # Processor type and features @@ -130,6 +136,9 @@ CONFIG_DUMMY_IOMMU=y # CONFIG_IA32_EMULATION=y # CONFIG_IA32_AOUT is not set +CONFIG_COMPAT=y +CONFIG_SYSVIPC_COMPAT=y + # # Executable file formats # @@ -226,6 +235,7 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m +# CONFIG_DM_MULTIPATH is not set # # Networking support @@ -237,7 +247,6 @@ 
CONFIG_NET=y # CONFIG_PACKET=y CONFIG_PACKET_MMAP=y -CONFIG_NETLINK_DEV=y CONFIG_UNIX=y CONFIG_NET_KEY=m CONFIG_INET=y @@ -246,6 +255,7 @@ CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_ROUTE_FWMARK=y CONFIG_IP_ROUTE_MULTIPATH=y +# CONFIG_IP_ROUTE_MULTIPATH_CACHED is not set CONFIG_IP_ROUTE_VERBOSE=y # CONFIG_IP_PNP is not set CONFIG_NET_IPIP=m @@ -373,7 +383,7 @@ CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m # -# IPv6: Netfilter Configuration +# IPv6: Netfilter Configuration (EXPERIMENTAL) # # CONFIG_IP6_NF_QUEUE is not set CONFIG_IP6_NF_IPTABLES=m @@ -480,6 +490,7 @@ CONFIG_NET_SCH_INGRESS=m CONFIG_NET_QOS=y CONFIG_NET_ESTIMATOR=y CONFIG_NET_CLS=y +# CONFIG_NET_CLS_BASIC is not set CONFIG_NET_CLS_TCINDEX=m CONFIG_NET_CLS_ROUTE4=m CONFIG_NET_CLS_ROUTE=y @@ -490,6 +501,7 @@ CONFIG_NET_CLS_IND=y # CONFIG_CLS_U32_MARK is not set CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m +# CONFIG_NET_EMATCH is not set # CONFIG_NET_CLS_ACT is not set CONFIG_NET_CLS_POLICE=y @@ -554,6 +566,11 @@ CONFIG_ACT200L_DONGLE=m # # FIR device drivers # +# CONFIG_NSC_FIR is not set +# CONFIG_WINBOND_FIR is not set +# CONFIG_SMC_IRCC_FIR is not set +# CONFIG_ALI_FIR is not set +# CONFIG_VIA_FIR is not set CONFIG_BT=m CONFIG_BT_L2CAP=m CONFIG_BT_SCO=m @@ -577,7 +594,6 @@ CONFIG_DUMMY=m CONFIG_BONDING=m CONFIG_EQUALIZER=m CONFIG_TUN=m -CONFIG_ETHERTAP=m # # Ethernet (10 or 100Mbit) @@ -853,7 +869,7 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1 CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_SELINUX_DEVELOP=y CONFIG_SECURITY_SELINUX_AVC_STATS=y -# CONFIG_SECURITY_SELINUX_MLS is not set +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # # Cryptographic options @@ -867,6 +883,7 @@ CONFIG_CRYPTO_SHA1=y CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m +# CONFIG_CRYPTO_TGR192 is not set CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_TWOFISH=m @@ -895,3 +912,10 @@ CONFIG_CRC32=y CONFIG_LIBCRC32C=m CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=m + +# +# 
Kernel hacking +# +# CONFIG_PRINTK_TIME is not set +# CONFIG_DEBUG_KERNEL is not set +CONFIG_LOG_BUF_SHIFT=14 diff --git a/linux-2.6-xen-sparse/arch/xen/i386/Kconfig b/linux-2.6-xen-sparse/arch/xen/i386/Kconfig index dec06cdfd1..f0cd7eac8f 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/Kconfig +++ b/linux-2.6-xen-sparse/arch/xen/i386/Kconfig @@ -74,6 +74,7 @@ config M386 - "Winchip-C6" for original IDT Winchip. - "Winchip-2" for IDT Winchip 2. - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. + - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above). @@ -201,6 +202,11 @@ config MWINCHIP3D stores for this CPU, which can increase performance of some operations. +config MGEODEGX1 + bool "GeodeGX1" + help + Select this for a Geode GX1 (Cyrix MediaGX) chip. + config MCYRIXIII bool "CyrixIII/VIA-C3" help @@ -249,7 +255,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || X86_GENERIC default "4" if X86_ELAN || M486 || M386 - default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 + default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1 default "6" if MK7 || MK8 || MPENTIUMM config RWSEM_GENERIC_SPINLOCK @@ -268,7 +274,7 @@ config GENERIC_CALIBRATE_DELAY config X86_PPRO_FENCE bool - depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 + depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 default y config X86_F00F_BUG @@ -298,7 +304,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 bool - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 + depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || 
X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 default y config X86_GOOD_APIC @@ -434,7 +440,7 @@ config PREEMPT_BKL #config X86_TSC # bool -# depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ +# depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ # default y #config X86_MCE @@ -474,6 +480,24 @@ config PREEMPT_BKL # Enabling this feature will cause a message to be printed when the P4 # enters thermal throttling. +config X86_REBOOTFIXUPS + bool "Enable X86 board specific fixups for reboot" + depends on X86 + default n + ---help--- + This enables chipset and/or board specific fixups to be done + in order to get reboot to work correctly. This is only needed on + some combinations of hardware and BIOS. The symptom, for which + this config is intended, is when reboot ends with a stalled/hung + system. + + Currently, the only fixup is for the Geode GX1/CS5530A/TROM2.1. + combination. + + Say Y if you want to enable the fixup. Currently, it's safe to + enable this option even if you don't need it. + Say N otherwise. 
+ config MICROCODE tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" depends on XEN_PRIVILEGED_GUEST @@ -599,6 +623,16 @@ config HAVE_ARCH_BOOTMEM_NODE depends on NUMA default y +config HAVE_MEMORY_PRESENT + bool + depends on DISCONTIGMEM + default y + +config NEED_NODE_MEMMAP_SIZE + bool + depends on DISCONTIGMEM + default y + #config HIGHPTE # bool "Allocate 3rd-level pagetables from highmem" # depends on HIGHMEM4G || HIGHMEM64G @@ -682,14 +716,19 @@ config REGPARM config X86_LOCAL_APIC bool - depends on !SMP && X86_UP_APIC + depends on XEN_PRIVILEGED_GUEST && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)) default y config X86_IO_APIC bool - depends on !SMP && X86_UP_IOAPIC + depends on XEN_PRIVILEGED_GUEST && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))) default y +config X86_VISWS_APIC + bool + depends on X86_VISWS + default y + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" depends on SMP && HOTPLUG && EXPERIMENTAL @@ -704,20 +743,10 @@ if XEN_PHYSDEV_ACCESS menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" -config X86_VISWS_APIC - bool - depends on X86_VISWS - default y - -config X86_LOCAL_APIC - bool - depends on (X86_VISWS || SMP) && !X86_VOYAGER - default y - config X86_UP_APIC - bool "Local APIC support on uniprocessors" if !SMP - depends on !(X86_VISWS || X86_VOYAGER) - ---help--- + bool "Local APIC support on uniprocessors" + depends on !SMP && !(X86_VISWS || X86_VOYAGER) + help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU system which has a processor with a local APIC, you can say Y here to @@ -727,28 +756,18 @@ config X86_UP_APIC performance counters), and the NMI watchdog which detects hard lockups. - If you have a system with several CPUs, you do not need to say Y - here: the local APIC will be used automatically. 
- config X86_UP_IOAPIC bool "IO-APIC support on uniprocessors" - depends on !SMP && X86_UP_APIC + depends on X86_UP_APIC help An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an SMP-capable replacement for PC-style interrupt controllers. Most - SMP systems and a small number of uniprocessor systems have one. + SMP systems and many recent uniprocessor systems have one. + If you have a single-CPU system with an IO-APIC, you can say Y here to use it. If you say Y here even though your machine doesn't have an IO-APIC, then the kernel will still run with no slowdown at all. - If you have a system with several CPUs, you do not need to say Y - here: the IO-APIC will be used automatically. - -config X86_IO_APIC - bool - depends on SMP && !(X86_VISWS || X86_VOYAGER) - default y - config PCI bool "PCI support" if !X86_VISWS depends on !X86_VOYAGER @@ -809,7 +828,7 @@ config PCI_DIRECT config PCI_MMCONFIG bool - depends on PCI && (PCI_GOMMCONFIG || (PCI_GOANY && ACPI)) + depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) select ACPI_BOOT default y @@ -817,6 +836,10 @@ source "drivers/pci/pcie/Kconfig" source "drivers/pci/Kconfig" +config ISA_DMA_API + bool + default y + config ISA bool "ISA support" depends on !(X86_VOYAGER || X86_VISWS) @@ -846,18 +869,14 @@ config EISA source "drivers/eisa/Kconfig" config MCA - bool "MCA support" - depends on !(X86_VISWS || X86_VOYAGER) + bool "MCA support" if !(X86_VISWS || X86_VOYAGER) + default y if X86_VOYAGER help MicroChannel Architecture is found in some IBM PS/2 machines and laptops. It is a bus system similar to PCI or ISA. See <file:Documentation/mca.txt> (and especially the web page given there) before attempting to build an MCA bus kernel. 
-config MCA - depends on X86_VOYAGER - default y if X86_VOYAGER - source "drivers/mca/Kconfig" config SCx200 @@ -880,8 +899,6 @@ endmenu endif -source "arch/i386/Kconfig.debug" - # # Use the generic interrupt handling code in kernel/irq/: # @@ -918,4 +935,21 @@ config PC depends on X86 && !EMBEDDED default y +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + endmenu diff --git a/linux-2.6-xen-sparse/arch/xen/i386/Makefile b/linux-2.6-xen-sparse/arch/xen/i386/Makefile index 053c0984ac..313f5708af 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/Makefile +++ b/linux-2.6-xen-sparse/arch/xen/i386/Makefile @@ -14,6 +14,8 @@ # 19990713 Artur Skawina <skawina@geocities.com> # Added '-march' and '-mpreferred-stack-boundary' support # +# 20050320 Kianusch Sayah Karadji <kianusch@sk-tech.net> +# Added support for GEODE CPU XENARCH := $(subst ",,$(CONFIG_XENARCH)) @@ -56,6 +58,9 @@ cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) # AMD Elan support cflags-$(CONFIG_X86_ELAN) += -march=i486 +# Geode GX1 support +cflags-$(CONFIG_MGEODEGX1) += $(call cc-option,-march=pentium-mmx,-march=i486) + # -mregparm=3 works ok on gcc-3.0 and later # GCC_VERSION := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC)) diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile 
b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile index 273a4b9f44..fe6e9db107 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o +c-obj-$(CONFIG_X86_REBOOTFIXUPS)+= reboot_fixups.o c-obj-$(CONFIG_X86_NUMAQ) += numaq.o c-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o c-obj-$(CONFIG_MODULES) += module.o @@ -53,11 +54,11 @@ c-obj-$(CONFIG_SCx200) += scx200.o # Note: kbuild does not track this dependency due to usage of .incbin $(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so) -targets += vsyscall.lds +targets += vsyscall-note.o vsyscall.lds # The DSO images are built using a special linker script. quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -nostdlib -m32 $(SYSCFLAGS_$(@F)) \ + cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \ -Wl,-T,$(filter-out FORCE,$^) -o $@ export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH) @@ -67,7 +68,8 @@ SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags) SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags) $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE +$(obj)/vsyscall-%.so: $(src)/vsyscall.lds \ + $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) # We also create a special relocatable object that should mirror the symbol @@ -78,17 +80,20 @@ $(obj)/built-in.o: $(obj)/vsyscall-syms.o $(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o SYSCFLAGS_vsyscall-syms.o = -r -$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds $(obj)/vsyscall-sysenter.o FORCE +$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ + $(obj)/vsyscall-sysenter.o FORCE $(call if_changed,syscall) c-link := init_task.o -s-link := vsyscall-int80.o vsyscall-sysenter.o 
vsyscall-sigreturn.o vsyscall.lds.o +s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@ $(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S +$(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S + obj-y += $(c-obj-y) $(s-obj-y) obj-m += $(c-obj-m) diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/acpi/boot.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/acpi/boot.c index 86ad650024..230673abb6 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/acpi/boot.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/acpi/boot.c @@ -604,6 +604,12 @@ static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) acpi_fadt.sci_int = fadt->sci_int; #endif +#ifdef CONFIG_ACPI_BUS + /* initialize rev and apic_phys_dest_mode for x86_64 genapic */ + acpi_fadt.revision = fadt->revision; + acpi_fadt.force_apic_physical_destination_mode = fadt->force_apic_physical_destination_mode; +#endif + #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ if (fadt->revision >= FADT2_REVISION_ID) { diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c index 197225266d..dd1f64f830 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c @@ -22,6 +22,9 @@ DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]); EXPORT_PER_CPU_SYMBOL(cpu_gdt_table); +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); + static int cachesize_override __initdata = -1; static int disable_x86_fxsr __initdata = 0; static int disable_x86_serial_nr __initdata = 1; @@ -202,7 +205,7 @@ static inline int flag_is_changeable_p(u32 flag) /* Probe for the CPUID instruction */ -int 
__init have_cpuid_p(void) +static int __init have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -210,7 +213,7 @@ int __init have_cpuid_p(void) /* Do minimum CPU detection early. Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. The others are not touched to avoid unwanted side effects. */ -void __init early_cpu_detect(void) +static void __init early_cpu_detect(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -243,6 +246,10 @@ void __init early_cpu_detect(void) } early_intel_workaround(c); + +#ifdef CONFIG_X86_HT + phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; +#endif } void __init generic_identify(struct cpuinfo_x86 * c) @@ -431,25 +438,15 @@ void __init identify_cpu(struct cpuinfo_x86 *c) mcheck_init(c); #endif } -/* - * Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c - */ - -void __init dodgy_tsc(void) -{ - if (( boot_cpu_data.x86_vendor == X86_VENDOR_CYRIX ) || - ( boot_cpu_data.x86_vendor == X86_VENDOR_NSC )) - cpu_devs[X86_VENDOR_CYRIX]->c_init(&boot_cpu_data); -} #ifdef CONFIG_X86_HT void __init detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - int index_lsb, index_msb, tmp; + int index_msb, tmp; int cpu = smp_processor_id(); - if (!cpu_has(c, X86_FEATURE_HT)) + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -458,7 +455,6 @@ void __init detect_ht(struct cpuinfo_x86 *c) if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); } else if (smp_num_siblings > 1 ) { - index_lsb = 0; index_msb = 31; if (smp_num_siblings > NR_CPUS) { @@ -467,21 +463,34 @@ void __init detect_ht(struct cpuinfo_x86 *c) return; } tmp = smp_num_siblings; - while ((tmp & 1) == 0) { - tmp >>=1 ; - index_lsb++; - } - tmp = smp_num_siblings; while ((tmp & 0x80000000 ) == 0) { tmp <<=1 ; index_msb--; } - if (index_lsb != index_msb ) + if (smp_num_siblings & (smp_num_siblings - 1)) index_msb++; 
phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); printk(KERN_INFO "CPU: Physical Processor ID: %d\n", phys_proc_id[cpu]); + + smp_num_siblings = smp_num_siblings / c->x86_num_cores; + + tmp = smp_num_siblings; + index_msb = 31; + while ((tmp & 0x80000000) == 0) { + tmp <<=1 ; + index_msb--; + } + + if (smp_num_siblings & (smp_num_siblings - 1)) + index_msb++; + + cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); + + if (c->x86_num_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + cpu_core_id[cpu]); } } #endif @@ -528,7 +537,6 @@ extern int transmeta_init_cpu(void); extern int rise_init_cpu(void); extern int nexgen_init_cpu(void); extern int umc_init_cpu(void); -void early_cpu_detect(void); void __init early_cpu_init(void) { diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c index c06cf9396c..501ea3fce8 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c @@ -31,7 +31,7 @@ struct mtrr_ops *mtrr_if = &generic_mtrr_ops; unsigned int num_var_ranges; unsigned int *usage_table; -void __init set_num_var_ranges(void) +static void __init set_num_var_ranges(void) { dom0_op_t op; diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S b/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S index 064be004e7..1fa27ad04c 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S @@ -752,296 +752,6 @@ ENTRY(fixup_4gb_segment) pushl $do_fixup_4gb_segment jmp error_code -.data -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long sys_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long sys_execve - .long sys_chdir - .long 
sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long old_readdir - .long old_mmap /* 90 */ - .long sys_munmap - .long 
sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long sys_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long sys_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long sys_sigreturn - .long sys_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long sys_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long 
sys_nfsservctl - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long sys_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long sys_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long sys_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap2 - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - 
.long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive /* 280 */ - .long sys_mq_notify - .long sys_mq_getsetattr - .long sys_ni_syscall /* reserved for kexec */ - .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ - .long sys_add_key - .long sys_request_key - .long sys_keyctl +#include "syscall_table.S" syscall_table_size=(.-sys_call_table) diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c index ed58906c1e..d767aa6da2 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c @@ -99,6 +99,11 @@ EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); +EXPORT_SYMBOL(__put_user_1); +EXPORT_SYMBOL(__put_user_2); +EXPORT_SYMBOL(__put_user_4); +EXPORT_SYMBOL(__put_user_8); + EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); @@ -114,7 +119,6 @@ EXPORT_SYMBOL(dma_alloc_coherent); EXPORT_SYMBOL(dma_free_coherent); #ifdef CONFIG_PCI -EXPORT_SYMBOL(pcibios_penalize_isa_irq); EXPORT_SYMBOL(pci_mem_start); #endif @@ -146,7 +150,6 @@ EXPORT_SYMBOL(smp_call_function); /* TLB flushing */ EXPORT_SYMBOL(flush_tlb_page); 
-EXPORT_SYMBOL_GPL(flush_tlb_all); #endif #ifdef CONFIG_X86_IO_APIC @@ -168,10 +171,6 @@ EXPORT_SYMBOL(rtc_lock); EXPORT_SYMBOL_GPL(set_nmi_callback); EXPORT_SYMBOL_GPL(unset_nmi_callback); -#undef memcmp -extern int memcmp(const void *,const void *,__kernel_size_t); -EXPORT_SYMBOL(memcmp); - EXPORT_SYMBOL(register_die_notifier); #ifdef CONFIG_HAVE_DEC_LOCK EXPORT_SYMBOL(_atomic_dec_and_lock); diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/io_apic.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/io_apic.c index 882ff3fe9c..0b786dbfc7 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/io_apic.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/io_apic.c @@ -231,7 +231,7 @@ static void unmask_IO_APIC_irq (unsigned int irq) spin_unlock_irqrestore(&ioapic_lock, flags); } -void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; unsigned long flags; @@ -310,7 +310,7 @@ cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS]; static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; static int physical_balance = 0; -struct irq_cpu_info { +static struct irq_cpu_info { unsigned long * last_irq; unsigned long * irq_delta; unsigned long irq; @@ -321,7 +321,7 @@ struct irq_cpu_info { #define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) #define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1)) + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) @@ -332,7 +332,7 @@ struct irq_cpu_info { #define BALANCED_IRQ_MORE_DELTA (HZ/10) #define BALANCED_IRQ_LESS_DELTA (HZ) -long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; +static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; static unsigned long move(int curr_cpu, cpumask_t allowed_mask, unsigned long now, int direction) @@ -733,8 +733,8 @@ void fastcall 
send_IPI_self(int vector) */ #define MAX_PIRQS 8 -int pirq_entries [MAX_PIRQS]; -int pirqs_enabled; +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; int skip_ioapic_setup; static int __init ioapic_setup(char *str) @@ -1231,7 +1231,7 @@ static inline void ioapic_register_intr(int irq, int vector, unsigned long trigg #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) #endif -void __init setup_IO_APIC_irqs(void) +static void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; int apic, pin, idx, irq, first_notcon = 1, vector; @@ -1311,7 +1311,7 @@ void __init setup_IO_APIC_irqs(void) * Set up the 8259A-master output pin: */ #ifndef CONFIG_XEN -void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) +static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) { struct IO_APIC_route_entry entry; unsigned long flags; @@ -2234,7 +2234,6 @@ static inline void check_timer(void) disable_8259A_irq(0); setup_nmi(); enable_8259A_irq(0); - check_nmi_watchdog(); } return; } @@ -2257,7 +2256,6 @@ static inline void check_timer(void) add_pin_to_irq(0, 0, pin2); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - check_nmi_watchdog(); } return; } @@ -2363,7 +2361,7 @@ struct sysfs_ioapic_data { }; static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; -static int ioapic_suspend(struct sys_device *dev, u32 state) +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/irq.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/irq.c index 3565536d6c..9bdd14fc19 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/irq.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/irq.c @@ -19,6 +19,9 @@ #include <linux/cpu.h> #include <linux/delay.h> +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; +EXPORT_PER_CPU_SYMBOL(irq_stat); + #ifndef CONFIG_X86_LOCAL_APIC /* * 'what 
should we do if we get a hw irq event on an illegal vector'. @@ -244,7 +247,7 @@ skip: #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_cpu(j) - seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs); + seq_printf(p, "%10u ", per_cpu(irq_stat, j).apic_timer_irqs); seq_putc(p, '\n'); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/mpparse.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/mpparse.c index 16f2ee8c80..c025cc3d4c 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/mpparse.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/mpparse.c @@ -49,7 +49,7 @@ int mp_bus_id_to_node [MAX_MP_BUSSES]; int mp_bus_id_to_local [MAX_MP_BUSSES]; int quad_local_to_mp_bus_id [NR_CPUS/4][4]; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -int mp_current_pci_id; +static int mp_current_pci_id; /* I/O APIC entries */ struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; @@ -120,7 +120,7 @@ static int MP_valid_apicid(int apicid, int version) #endif #ifndef CONFIG_XEN -void __init MP_processor_info (struct mpc_config_processor *m) +static void __init MP_processor_info (struct mpc_config_processor *m) { int ver, apicid; physid_mask_t tmp; @@ -871,7 +871,7 @@ void __init mp_register_lapic ( #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 -struct mp_ioapic_routing { +static struct mp_ioapic_routing { int apic_id; int gsi_base; int gsi_end; @@ -989,6 +989,7 @@ void __init mp_override_legacy_irq ( return; } +int es7000_plat; void __init mp_config_acpi_legacy_irqs (void) { @@ -1003,9 +1004,9 @@ void __init mp_config_acpi_legacy_irqs (void) Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); /* - * ES7000 has no legacy identity mappings + * Older generations of ES7000 have no legacy identity mappings */ - if (es7000_plat) + if (es7000_plat == 1) return; /* diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c index 
dc51c7972a..efd0dab89b 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c @@ -25,7 +25,7 @@ struct dma_coherent_mem { }; void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int gfp) + dma_addr_t *dma_handle, unsigned int __nocast gfp) { void *ret; struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c index eba38c6579..6cc2be6450 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c @@ -37,6 +37,7 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/ptrace.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -60,7 +61,7 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -int hlt_counter; +static int hlt_counter; unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); @@ -77,7 +78,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk) * Powermanagement idle function, if any.. 
*/ void (*pm_idle)(void); -static cpumask_t cpu_idle_map; +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) { @@ -150,8 +151,8 @@ void cpu_idle (void) while (1) { while (!need_resched()) { - if (cpu_isset(cpu, cpu_idle_map)) - cpu_clear(cpu, cpu_idle_map); + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; rmb(); if (cpu_is_offline(cpu)) { @@ -162,7 +163,7 @@ void cpu_idle (void) play_dead(); } - irq_stat[cpu].idle_timestamp = jiffies; + __get_cpu_var(irq_stat).idle_timestamp = jiffies; xen_idle(); } schedule(); @@ -171,16 +172,28 @@ void cpu_idle (void) void cpu_idle_wait(void) { - int cpu; + unsigned int cpu, this_cpu = get_cpu(); cpumask_t map; - for_each_online_cpu(cpu) - cpu_set(cpu, cpu_idle_map); + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; wmb(); do { ssleep(1); - cpus_and(map, cpu_idle_map, cpu_online_map); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); } while (!cpus_empty(map)); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -314,6 +327,17 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, int err; childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; + /* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessable even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the xss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. 
+ */ + childregs = (struct pt_regs *) ((unsigned long) childregs - 8); *childregs = *regs; childregs->eax = 0; childregs->esp = esp; @@ -434,12 +458,6 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) return 1; } -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread,register) \ - HYPERVISOR_set_debugreg((register), \ - (thread->debugreg[register])) /* * switch_to(x,yn) should switch tasks from x to y. @@ -767,3 +785,9 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) return 0; } +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c index 938bcabd86..bc86051db3 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c @@ -40,6 +40,7 @@ #include <linux/efi.h> #include <linux/init.h> #include <linux/edd.h> +#include <linux/nodemask.h> #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/notifier.h> @@ -82,7 +83,6 @@ struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; unsigned long mmu_cr4_features; -EXPORT_SYMBOL_GPL(mmu_cr4_features); #ifdef CONFIG_ACPI_INTERPRETER int acpi_disabled = 0; @@ -125,8 +125,6 @@ struct edid_info edid_info; struct ist_info ist_info; struct e820map e820; -unsigned char aux_device_present; - extern void early_cpu_init(void); extern void dmi_scan_machine(void); extern void generic_apic_probe(char *); @@ -457,10 +455,10 @@ struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ unsigned long long addr; /* address for this change point */ }; -struct change_member change_point_list[2*E820MAX] __initdata; -struct change_member *change_point[2*E820MAX] __initdata; -struct 
e820entry *overlap_list[E820MAX] __initdata; -struct e820entry new_bios[E820MAX] __initdata; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; +static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) { @@ -1000,8 +998,6 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -#ifndef CONFIG_DISCONTIGMEM - /* * Free all available memory for boot time allocation. Used * as a callback function by efi_memory_walk() @@ -1075,15 +1071,16 @@ static void __init reserve_ebda_region(void) reserve_bootmem(addr, PAGE_SIZE); } +#ifndef CONFIG_DISCONTIGMEM +void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) { - unsigned long bootmap_size, start_pfn, max_low_pfn; /* * partially used pages are not usable - thus * we are rounding upwards: */ - start_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames; + min_low_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames; find_max_pfn(); @@ -1099,10 +1096,43 @@ static unsigned long __init setup_memory(void) #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); + + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned int max_dma, low; + + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + low = max_low_pfn; + + if (low < max_dma) + zones_size[ZONE_DMA] = low; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = low - max_dma; +#ifdef CONFIG_HIGHMEM + zones_size[ZONE_HIGHMEM] = highend_pfn - low; +#endif + } + free_area_init(zones_size); +} +#else +extern unsigned long setup_memory(void); +extern void zone_sizes_init(void); +#endif /* !CONFIG_DISCONTIGMEM */ + +void 
__init setup_bootmem_allocator(void) +{ + unsigned long bootmap_size; /* * Initialize the boot-time allocator (with low memory only): */ - bootmap_size = init_bootmem(start_pfn, max_low_pfn); + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); register_bootmem_low_pages(max_low_pfn); @@ -1112,7 +1142,7 @@ static unsigned long __init setup_memory(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + + reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); /* reserve EBDA region, it's a 4K region */ @@ -1159,12 +1189,25 @@ static unsigned long __init setup_memory(void) #endif phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list; +} - return max_low_pfn; +/* + * The node 0 pgdat is initialized before all of these because + * it's needed for bootmem. node>0 pgdats have their virtual + * space allocated before the pagetables are in place to access + * them, so they can't be cleared then. + * + * This should all compile down to nothing when NUMA is off. 
+ */ +void __init remapped_pgdat_init(void) +{ + int nid; + + for_each_online_node(nid) { + if (nid != 0) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + } } -#else -extern unsigned long setup_memory(void); -#endif /* !CONFIG_DISCONTIGMEM */ /* * Request address space for all standard RAM and ROM resources @@ -1443,7 +1486,6 @@ void __init setup_arch(char **cmdline_p) machine_submodel_id = SYS_DESC_TABLE.table[1]; BIOS_revision = SYS_DESC_TABLE.table[2]; } - aux_device_present = AUX_DEVICE_INFO; bootloader_type = LOADER_TYPE; #ifdef CONFIG_XEN_PHYSDEV_ACCESS @@ -1503,6 +1545,8 @@ void __init setup_arch(char **cmdline_p) smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ #endif paging_init(); + remapped_pgdat_init(); + zone_sizes_init(); #ifdef CONFIG_X86_FIND_SMP_CONFIG /* @@ -1586,11 +1630,13 @@ void __init setup_arch(char **cmdline_p) } #endif +#ifdef CONFIG_ACPI_BOOT /* * Parse the ACPI tables for possible boot-time SMP configuration. */ acpi_boot_table_init(); acpi_boot_init(); +#endif #ifdef CONFIG_X86_LOCAL_APIC if (smp_found_config) diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/signal.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/signal.c index 9e17fc80e9..32925b5e08 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/signal.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/signal.c @@ -93,7 +93,7 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, if (act) { old_sigset_t mask; - if (verify_area(VERIFY_READ, act, sizeof(*act)) || + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) return -EFAULT; @@ -105,7 +105,7 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); if (!ret && oact) { - if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) return -EFAULT; @@ -187,7 +187,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax struct _fpstate __user * buf; err |= __get_user(buf, &sc->fpstate); if (buf) { - if (verify_area(VERIFY_READ, buf, sizeof(*buf))) + if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) goto badframe; err |= restore_i387(buf); } else { @@ -213,7 +213,7 @@ asmlinkage int sys_sigreturn(unsigned long __unused) sigset_t set; int eax; - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 @@ -243,7 +243,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) sigset_t set; int eax; - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; @@ -557,6 +557,16 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, } } + /* + * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so + * that register information in the sigcontext is correct. + */ + if (unlikely(regs->eflags & TF_MASK) + && likely(current->ptrace & PT_DTRACE)) { + current->ptrace &= ~PT_DTRACE; + regs->eflags &= ~TF_MASK; + } + /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) setup_rt_frame(sig, ka, info, oldset, regs); @@ -608,8 +618,7 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. */ if (unlikely(current->thread.debugreg[7])) { - HYPERVISOR_set_debugreg(7, - current->thread.debugreg[7]); + loaddebug(&current->thread, 7); } /* Whee! Actually deliver the signal. 
*/ diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smp.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smp.c index fddadbba25..56729ce885 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smp.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smp.c @@ -197,7 +197,7 @@ void send_IPI_mask_bitmask(cpumask_t mask, int vector) local_irq_restore(flags); } -inline void send_IPI_mask_sequence(cpumask_t mask, int vector) +void send_IPI_mask_sequence(cpumask_t mask, int vector) { send_IPI_mask_bitmask(mask, vector); diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c index 494befa697..485fc6abcf 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c @@ -69,6 +69,8 @@ static int __initdata smp_b_stepping; int smp_num_siblings = 1; int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ EXPORT_SYMBOL(phys_proc_id); +int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */ +EXPORT_SYMBOL(cpu_core_id); /* bitmap of online cpus */ cpumask_t cpu_online_map; @@ -84,9 +86,6 @@ u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); -/* Set when the idlers are all forked */ -int smp_threads_ready; - #if 0 /* * Trampoline 80x86 program as an array. @@ -122,6 +121,8 @@ static unsigned long __init setup_trampoline(void) } #endif +static void map_cpu_to_logical_apicid(void); + /* * We are called very early to get the low memory for the * SMP bootup trampoline page. 
@@ -352,7 +353,7 @@ extern void calibrate_delay(void); static atomic_t init_deasserted; -void __init smp_callin(void) +static void __init smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -449,7 +450,7 @@ void __init smp_callin(void) #endif } -int cpucount; +static int cpucount; static irqreturn_t ldebug_interrupt( @@ -567,7 +568,7 @@ static inline void unmap_cpu_to_node(int cpu) u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -void map_cpu_to_logical_apicid(void) +static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); int apicid = smp_processor_id(); @@ -576,7 +577,7 @@ void map_cpu_to_logical_apicid(void) map_cpu_to_node(cpu, apicid_to_node(apicid)); } -void unmap_cpu_to_logical_apicid(int cpu) +static void unmap_cpu_to_logical_apicid(int cpu) { cpu_2_logical_apicid[cpu] = BAD_APICID; unmap_cpu_to_node(cpu); @@ -861,6 +862,9 @@ static int __init do_boot_cpu(int apicid) if (cpu_gdt_descr[0].size > PAGE_SIZE) BUG(); cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; + printk("GDT: copying %d bytes from %lx to %lx\n", + cpu_gdt_descr[0].size, cpu_gdt_descr[0].address, + cpu_gdt_descr[cpu].address); memcpy((void *)cpu_gdt_descr[cpu].address, (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size); @@ -916,6 +920,7 @@ static int __init do_boot_cpu(int apicid) ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(swapper_pg_dir); boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); + printk("boot error: %ld\n", boot_error); if (!boot_error) { /* @@ -1016,9 +1021,6 @@ static int __init do_boot_cpu(int apicid) return boot_error; } -cycles_t cacheflush_time; -unsigned long cache_decay_ticks; - static void smp_tune_scheduling (void) { unsigned long cachesize; /* kB */ @@ -1039,7 +1041,6 @@ static void smp_tune_scheduling (void) * this basically disables processor-affinity * scheduling on SMP without a TSC. 
*/ - cacheflush_time = 0; return; } else { cachesize = boot_cpu_data.x86_cache_size; @@ -1047,17 +1048,7 @@ static void smp_tune_scheduling (void) cachesize = 16; /* Pentiums, 2x8kB cache */ bandwidth = 100; } - - cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } - - cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1; - - printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", - (long)cacheflush_time/(cpu_khz/1000), - ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); - printk("task migration cache decay timeout: %ld msecs.\n", - cache_decay_ticks); } /* @@ -1071,6 +1062,8 @@ static int boot_cpu_logical_apicid; void *xquad_portio; cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_core_map); static void __init smp_boot_cpus(unsigned int max_cpus) { @@ -1102,6 +1095,9 @@ static void __init smp_boot_cpus(unsigned int max_cpus) cpus_clear(cpu_sibling_map[0]); cpu_set(0, cpu_sibling_map[0]); + cpus_clear(cpu_core_map[0]); + cpu_set(0, cpu_core_map[0]); + #ifdef CONFIG_X86_IO_APIC /* * If we couldn't find an SMP configuration at boot time, @@ -1119,6 +1115,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus) " Using dummy APIC emulation.\n"); #endif map_cpu_to_logical_apicid(); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); return; } #endif @@ -1144,6 +1142,10 @@ static void __init smp_boot_cpus(unsigned int max_cpus) printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); smpboot_clear_io_apic_irqs(); phys_cpu_present_map = physid_mask_of_physid(0); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); return; } @@ -1246,10 +1248,13 @@ static void __init smp_boot_cpus(unsigned int max_cpus) * construct cpu_sibling_map[], so that we can tell sibling CPUs * efficiently. 
*/ - for (cpu = 0; cpu < NR_CPUS; cpu++) + for (cpu = 0; cpu < NR_CPUS; cpu++) { cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct cpuinfo_x86 *c = cpu_data + cpu; int siblings = 0; int i; if (!cpu_isset(cpu, cpu_callout_map)) @@ -1259,7 +1264,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) for (i = 0; i < NR_CPUS; i++) { if (!cpu_isset(i, cpu_callout_map)) continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { + if (cpu_core_id[cpu] == cpu_core_id[i]) { siblings++; cpu_set(i, cpu_sibling_map[cpu]); } @@ -1269,15 +1274,23 @@ static void __init smp_boot_cpus(unsigned int max_cpus) cpu_set(cpu, cpu_sibling_map[cpu]); } - if (siblings != smp_num_siblings) + if (siblings != smp_num_siblings) { printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); + smp_num_siblings = siblings; + } + if (c->x86_num_cores > 1) { + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + } + } + } else { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } } -#if 0 - if (nmi_watchdog == NMI_LOCAL_APIC) - check_nmi_watchdog(); -#endif - smpboot_setup_io_apic(); #if 0 diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c index 821d6905b0..0ca8d7eb64 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c @@ -190,6 +190,35 @@ static void __get_time_values_from_xen(void) ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); }) /* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. 
+ */ +volatile unsigned long cmos_lock = 0; +EXPORT_SYMBOL(cmos_lock); + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + val = inb_p(RTC_PORT(1)); + lock_cmos_suffix(addr); + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + outb_p(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +/* * This version of gettimeofday has microsecond resolution * and better than microsecond precision on fast x86 machines with TSC. */ @@ -349,16 +378,23 @@ static int set_rtc_mmss(unsigned long nowtime) { int retval; + WARN_ON(irqs_disabled()); + /* gets recalled with irq locally disabled */ - spin_lock(&rtc_lock); + spin_lock_irq(&rtc_lock); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock(&rtc_lock); + spin_unlock_irq(&rtc_lock); return retval; } +#else +static int set_rtc_mmss(unsigned long nowtime) +{ + return 0; +} #endif /* monotonic_clock(): returns # of nanoseconds passed since time_init() @@ -503,29 +539,6 @@ static inline void do_timer_interrupt(int irq, void *dev_id, last_update_to_xen = xtime.tv_sec; } - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. 
- */ - if ((time_status & STA_UNSYNC) == 0 && - xtime.tv_sec > last_rtc_update + 660 && - (xtime.tv_nsec / 1000) - >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - (xtime.tv_nsec / 1000) - <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) { - /* horrible...FIXME */ - if (efi_enabled) { - if (efi_set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - last_rtc_update = xtime.tv_sec - 600; - } else if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ - } #endif } @@ -565,10 +578,59 @@ unsigned long get_cmos_time(void) return retval; } +static void sync_cmos_clock(unsigned long dummy); + +static struct timer_list sync_cmos_timer = + TIMER_INITIALIZER(sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timeval now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if ((time_status & STA_UNSYNC) != 0) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). 
+ */ + return; + + do_gettimeofday(&now); + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) + fail = set_rtc_mmss(now.tv_sec); + + next.tv_usec = USEC_AFTER - now.tv_usec; + if (next.tv_usec <= 0) + next.tv_usec += USEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_usec >= USEC_PER_SEC) { + next.tv_sec++; + next.tv_usec -= USEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); +} + +void notify_arch_cmos_timer(void) +{ + mod_timer(&sync_cmos_timer, jiffies + 1); +} static long clock_cmos_diff, sleep_start; -static int timer_suspend(struct sys_device *dev, u32 state) +static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* * Estimate time zone so that set_time can update the clock @@ -626,14 +688,14 @@ device_initcall(time_init_device); #ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ -void __init hpet_time_init(void) +static void __init hpet_time_init(void) { xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (hpet_enable() >= 0) { + if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c index 539c1d5b7d..d34ca827e6 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c @@ -342,8 +342,7 @@ void die(const char * str, struct pt_regs * regs, long err) if (panic_on_oops) { printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(5 * HZ); + ssleep(5); panic("Fatal exception"); } do_exit(SIGSEGV); @@ -450,6 +449,7 @@ DO_ERROR(10, SIGSEGV, 
"invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) #ifdef CONFIG_X86_MCE DO_ERROR(18, SIGBUS, "machine check", machine_check) #endif @@ -636,16 +636,15 @@ void unset_nmi_callback(void) } #ifdef CONFIG_KPROBES -fastcall int do_int3(struct pt_regs *regs, long error_code) +fastcall void do_int3(struct pt_regs *regs, long error_code) { if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) - return 1; + return; /* This is an interrupt gate, because kprobes wants interrupts disabled. Normal trap handlers don't. */ restore_interrupts(regs); do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); - return 0; } #endif @@ -702,8 +701,6 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) /* * Single-stepping through TF: make sure we ignore any events in * kernel space (but re-enable TF when returning to user mode). - * And if the event was due to a debugger (PT_DTRACE), clear the - * TF flag so that register information is correct. 
*/ if (condition & DR_STEP) { /* @@ -713,11 +710,6 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) */ if ((regs->xcs & 2) == 0) goto clear_TF_reenable; - - if (likely(tsk->ptrace & PT_DTRACE)) { - tsk->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } } /* Ok, finally something we can handle */ @@ -807,7 +799,7 @@ fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) math_error((void __user *)regs->eip); } -void simd_math_error(void __user *eip) +static void simd_math_error(void __user *eip) { struct task_struct * task; siginfo_t info; @@ -879,6 +871,51 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs, } } +fastcall void setup_x86_bogus_stack(unsigned char * stk) +{ + unsigned long *switch16_ptr, *switch32_ptr; + struct pt_regs *regs; + unsigned long stack_top, stack_bot; + unsigned short iret_frame16_off; + int cpu = smp_processor_id(); + /* reserve the space on 32bit stack for the magic switch16 pointer */ + memmove(stk, stk + 8, sizeof(struct pt_regs)); + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); + regs = (struct pt_regs *)stk; + /* now the switch32 on 16bit stack */ + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; + switch32_ptr = (unsigned long *)(stack_top - 8); + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; + /* copy iret frame on 16bit stack */ + memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); + /* fill in the switch pointers */ + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; + switch16_ptr[1] = __ESPFIX_SS; + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + + 8 - CPU_16BIT_STACK_SIZE; + switch32_ptr[1] = __KERNEL_DS; +} + +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) +{ + unsigned long *switch32_ptr; + unsigned char *stack16, *stack32; + unsigned long stack_top, stack_bot; + int len; + int cpu = smp_processor_id(); + stack_bot = (unsigned 
long)&per_cpu(cpu_16bit_stack, cpu); + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; + switch32_ptr = (unsigned long *)(stack_top - 8); + /* copy the data from 16bit stack to 32bit stack */ + len = CPU_16BIT_STACK_SIZE - 8 - sp; + stack16 = (unsigned char *)(stack_bot + sp); + stack32 = (unsigned char *) + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); + memcpy(stack32, stack16, len); + return stack32; +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -980,3 +1017,10 @@ void smp_trap_init(trap_info_t *trap_ctxt) trap_ctxt[t->vector].address = t->address; } } + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("kstack=", kstack_setup); diff --git a/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c b/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c index 1bf278733f..1cfe059f8b 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c @@ -77,7 +77,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) * force other mappings to Oops if they'll try to access * this pte without first remap it */ - pte_clear(kmap_pte-idx); + pte_clear(&init_mm, vaddr, kmap_pte-idx); __flush_tlb_one(vaddr); #endif diff --git a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c index 044568c42b..7c8b95a8f2 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c @@ -248,13 +248,10 @@ static inline int page_is_ram(unsigned long pagenr) pte_t *kmap_pte; pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); -EXPORT_SYMBOL(kmap_pte); - #define kmap_get_fixmap_pte(vaddr) \ pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) -void __init kmap_init(void) +static void __init kmap_init(void) { unsigned long kmap_vstart; @@ -265,7 +262,7 @@ void __init 
kmap_init(void) kmap_prot = PAGE_KERNEL; } -void __init permanent_kmaps_init(pgd_t *pgd_base) +static void __init permanent_kmaps_init(pgd_t *pgd_base) { pgd_t *pgd; pud_t *pud; @@ -297,7 +294,7 @@ void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) } #ifndef CONFIG_DISCONTIGMEM -void __init set_highmem_pages_init(int bad_ppro) +static void __init set_highmem_pages_init(int bad_ppro) { int pfn; for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) @@ -426,38 +423,6 @@ void zap_low_mappings (void) flush_tlb_all(); } -#ifndef CONFIG_DISCONTIGMEM -void __init zone_sizes_init(void) -{ - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int /*max_dma,*/ high, low; - - /* - * XEN: Our notion of "DMA memory" is fake when running over Xen. - * We simply put all RAM in the DMA zone so that those drivers which - * needlessly specify GFP_DMA do not get starved of RAM unnecessarily. - * Those drivers that *do* require lowmem are screwed anyway when - * running over Xen! - */ - /*max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;*/ - low = max_low_pfn; - high = highend_pfn; - - /*if (low < max_dma)*/ - zones_size[ZONE_DMA] = low; - /*else*/ { - /*zones_size[ZONE_DMA] = max_dma;*/ - /*zones_size[ZONE_NORMAL] = low - max_dma;*/ -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; -#endif - } - free_area_init(zones_size); -} -#else -extern void zone_sizes_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ - static int disable_nx __initdata = 0; u64 __supported_pte_mask = ~_PAGE_NX; @@ -560,7 +525,6 @@ void __init paging_init(void) __flush_tlb_all(); kmap_init(); - zone_sizes_init(); /* Switch to the real shared_info page, and clear the dummy page. */ set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); @@ -586,7 +550,7 @@ void __init paging_init(void) * but fortunately the switch to using exceptions got rid of all that. 
*/ -void __init test_wp_bit(void) +static void __init test_wp_bit(void) { printk("Checking if this processor honours the WP bit even in supervisor mode... "); @@ -605,20 +569,17 @@ void __init test_wp_bit(void) } } -#ifndef CONFIG_DISCONTIGMEM static void __init set_max_mapnr_init(void) { #ifdef CONFIG_HIGHMEM - max_mapnr = num_physpages = highend_pfn; + num_physpages = highend_pfn; #else - max_mapnr = num_physpages = max_low_pfn; + num_physpages = max_low_pfn; +#endif +#ifndef CONFIG_DISCONTIGMEM + max_mapnr = num_physpages; #endif } -#define __free_all_bootmem() free_all_bootmem() -#else -#define __free_all_bootmem() free_all_bootmem_node(NODE_DATA(0)) -extern void set_max_mapnr_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ static struct kcore_list kcore_mem, kcore_vmalloc; @@ -650,16 +611,16 @@ void __init mem_init(void) set_max_mapnr_init(); #ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE); + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk("vmalloc area: %lx-%lx, maxmem %lx\n", VMALLOC_START,VMALLOC_END,MAXMEM); BUG_ON(VMALLOC_START > VMALLOC_END); /* this will put all low memory onto the freelists */ - totalram_pages += __free_all_bootmem(); + totalram_pages += free_all_bootmem(); /* XEN: init and count low-mem pages outside initial allocation. 
*/ for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) { ClearPageReserved(&mem_map[pfn]); diff --git a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c index 06495cd9de..8043cc1c4d 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c @@ -342,7 +342,7 @@ void pgd_free(pgd_t *pgd) if (PTRS_PER_PMD > 1) for (i = 0; i < USER_PTRS_PER_PGD; ++i) kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); - /* in the non-PAE case, clear_page_range() clears user pgd entries */ + /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } diff --git a/linux-2.6-xen-sparse/arch/xen/i386/pci/irq.c b/linux-2.6-xen-sparse/arch/xen/i386/pci/irq.c index 7eeea04f72..3b0b096b30 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/pci/irq.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/pci/irq.c @@ -500,6 +500,9 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH6_1: case PCI_DEVICE_ID_INTEL_ICH7_0: case PCI_DEVICE_ID_INTEL_ICH7_1: + case PCI_DEVICE_ID_INTEL_ICH7_30: + case PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_ESB2_0: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; @@ -1031,66 +1034,60 @@ void pcibios_penalize_isa_irq(int irq) static int pirq_enable_irq(struct pci_dev *dev) { u8 pin; - extern int via_interrupt_line_quirk; struct pci_dev *temp_dev; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { - char *msg; - msg = ""; + char *msg = ""; + + pin--; /* interrupt pins are numbered starting from 1 */ + if (io_apic_assign_pci_irqs) { int irq; - if (pin) { - pin--; /* interrupt pins are numbered starting from 1 */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); - /* - * Busses behind bridges are typically not listed in the MP-table. 
- * In this case we have to look up the IRQ based on the parent bus, - * parent slot, and pin number. The SMP code detects such bridged - * busses itself so we should get into this branch reliably. - */ - temp_dev = dev; - while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev * bridge = dev->bus->self; - - pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); - if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); - dev = bridge; - } - dev = temp_dev; - if (irq >= 0) { + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the MP-table. + * In this case we have to look up the IRQ based on the parent bus, + * parent slot, and pin number. The SMP code detects such bridged + * busses itself so we should get into this branch reliably. + */ + temp_dev = dev; + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ + struct pci_dev * bridge = dev->bus->self; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), 'A' + pin, irq); + dev = bridge; + } + dev = temp_dev; + if (irq >= 0) { #ifdef CONFIG_PCI_MSI - if (!platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); + if (!platform_legacy_irq(irq)) + irq = IO_APIC_VECTOR(irq); #endif - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", - pci_name(dev), 'A' + pin, irq); - dev->irq = irq; - return 0; - } else - msg = " Probably buggy MP table."; - } + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + pci_name(dev), 'A' + pin, irq); + dev->irq = irq; + return 0; + } else + msg = " Probably buggy MP table."; } else if (pci_probe & PCI_BIOS_IRQ_SCAN) msg = ""; else msg 
= " Please try using pci=biosirq."; - + /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) return 0; - + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", - 'A' + pin - 1, pci_name(dev), msg); + 'A' + pin, pci_name(dev), msg); } - /* VIA bridges use interrupt line for apic/pci steering across - the V-Link */ - else if (via_interrupt_line_quirk) - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq & 15); return 0; } diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig b/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig index 6785cf8e8a..38e37b20b1 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig @@ -66,23 +66,6 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. -config HPET_TIMER - bool - default n - help - Use the IA-PC HPET (High Precision Event Timer) to manage - time in preference to the PIT and RTC, if a HPET is - present. The HPET provides a stable time base on SMP - systems, unlike the RTC, but it is more expensive to access, - as it is off-chip. You can find the HPET spec at - <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>. - - If unsure, say Y. - -config HPET_EMULATE_RTC - bool "Provide RTC interrupt" - depends on HPET_TIMER && RTC=y - config GENERIC_ISA_DMA bool default y @@ -255,7 +238,7 @@ config PREEMPT config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP - default off + default n help SMT scheduler support improves the CPU scheduler's decision making when dealing with Intel Pentium 4 chips with HyperThreading at a @@ -312,6 +295,23 @@ config NR_CPUS This is purely to save memory - each supported CPU requires memory in the static kernel configuration. 
+config HPET_TIMER + bool + default n + help + Use the IA-PC HPET (High Precision Event Timer) to manage + time in preference to the PIT and RTC, if a HPET is + present. The HPET provides a stable time base on SMP + systems, unlike the RTC, but it is more expensive to access, + as it is off-chip. You can find the HPET spec at + <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>. + + If unsure, say Y. + +config HPET_EMULATE_RTC + bool "Provide RTC interrupt" + depends on HPET_TIMER && RTC=y + config GART_IOMMU bool "IOMMU support" depends on PCI @@ -346,6 +346,24 @@ config X86_MCE machine check error logs. See ftp://ftp.x86-64.org/pub/linux/tools/mcelog +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + + endmenu # @@ -359,6 +377,11 @@ config GENERIC_IRQ_PROBE bool default y +# we have no ISA slots, but we do have ISA-style DMA. +config ISA_DMA_API + bool + default y + menu "Power management options" source kernel/power/Kconfig @@ -380,7 +403,7 @@ config PCI_DIRECT config PCI_MMCONFIG bool "Support mmconfig PCI config space access" - depends on PCI + depends on PCI && ACPI select ACPI_BOOT config UNORDERED_IO @@ -393,6 +416,8 @@ config UNORDERED_IO from i386. Requires that the driver writer used memory barriers properly. 
+#source "drivers/pci/pcie/Kconfig" + #source "drivers/pci/Kconfig" #source "drivers/pcmcia/Kconfig" @@ -444,12 +469,8 @@ endmenu #source "arch/x86_64/oprofile/Kconfig" -#source "arch/x86_64/Kconfig.debug" - # source "security/Kconfig" # source "crypto/Kconfig" -# source "lib/Kconfig" - endmenu diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/Makefile b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/Makefile index 4dd24552cc..fe88b369a1 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/Makefile +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/Makefile @@ -48,10 +48,14 @@ $(obj)/vsyscall.lds: $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): @ln -fsn $(srctree)/arch/x86_64/ia32/$(notdir $@) $@ -$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S +$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S ../../i386/kernel/vsyscall-note.S $(obj)/vsyscall-sysenter.S: $(obj)/vsyscall-sigreturn.S $(obj)/vsyscall-syscall.S: $(obj)/vsyscall-sigreturn.S +../../i386/kernel/vsyscall-note.S: + @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $(srctree)/arch/xen/i386/kernel/$(notdir $@) + make -C arch/xen/i386/kernel vsyscall-note.S + obj-y += $(c-obj-y) $(s-obj-y) clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/ia32entry.S b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/ia32entry.S index 521c881c98..388d49b8b7 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/ia32entry.S +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/ia32entry.S @@ -96,7 +96,7 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz sysenter_tracesys sysenter_do_call: cmpl $(IA32_NR_syscalls),%eax @@ -184,7 +184,7 @@ ENTRY(ia32_cstar_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl 
$(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz cstar_tracesys cstar_do_call: cmpl $IA32_NR_syscalls,%eax @@ -263,7 +263,7 @@ ENTRY(ia32_syscall) this could be a problem. */ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls),%eax @@ -617,7 +617,7 @@ ia32_sys_call_table: .quad compat_sys_mq_notify .quad compat_sys_mq_getsetattr .quad quiet_ni_syscall /* reserved for kexec */ - .quad sys32_waitid + .quad compat_sys_waitid .quad quiet_ni_syscall /* sys_altroot */ .quad sys_add_key .quad sys_request_key diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/syscall32.c b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/syscall32.c index 0f97d1cf54..ecccf66a2e 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/syscall32.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/syscall32.c @@ -9,6 +9,7 @@ #include <linux/gfp.h> #include <linux/init.h> #include <linux/stringify.h> +#include <linux/security.h> #include <asm/proto.h> #include <asm/tlbflush.h> #include <asm/ia32_unistd.h> @@ -50,51 +51,57 @@ extern int sysctl_vsyscall32; char *syscall32_page; -/* - * Map the 32bit vsyscall page on demand. - * - * RED-PEN: This knows too much about high level VM. - * - * Alternative would be to generate a vma with appropriate backing options - * and let it be handled by generic VM. 
- */ -int __map_syscall32(struct mm_struct *mm, unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pte_t *pte; - pmd_t *pmd; - int err = -ENOMEM; - - spin_lock(&mm->page_table_lock); - pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (pud) { - pmd = pmd_alloc(mm, pud, address); - if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) { - if (pte_none(*pte)) { - set_pte(pte, - mk_pte(virt_to_page(syscall32_page), - PAGE_KERNEL_VSYSCALL32)); - } - /* Flush only the local CPU. Other CPUs taking a fault - will just end up here again - This probably not needed and just paranoia. */ - __flush_tlb_one(address); - err = 0; - } - } - spin_unlock(&mm->page_table_lock); - return err; +static struct page * +syscall32_nopage(struct vm_area_struct *vma, unsigned long adr, int *type) +{ + struct page *p = virt_to_page(adr - vma->vm_start + syscall32_page); + get_page(p); + return p; } -int map_syscall32(struct mm_struct *mm, unsigned long address) +/* Prevent VMA merging */ +static void syscall32_vma_close(struct vm_area_struct *vma) { - int err; - down_read(&mm->mmap_sem); - err = __map_syscall32(mm, address); - up_read(&mm->mmap_sem); - return err; +} + +static struct vm_operations_struct syscall32_vm_ops = { + .close = syscall32_vma_close, + .nopage = syscall32_nopage, +}; + +struct linux_binprm; + +/* Setup a VMA at program startup for the vsyscall page */ +int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) +{ + int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + if (security_vm_enough_memory(npages)) { + kmem_cache_free(vm_area_cachep, vma); + return -ENOMEM; + } + + memset(vma, 0, sizeof(struct vm_area_struct)); + /* Could randomize here */ + vma->vm_start = VSYSCALL32_BASE; + vma->vm_end = VSYSCALL32_END; + /* MAYWRITE to allow gdb to COW and set breakpoints 
*/ + vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYEXEC|VM_MAYWRITE; + vma->vm_flags |= mm->def_flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_ops = &syscall32_vm_ops; + vma->vm_mm = mm; + + down_write(&mm->mmap_sem); + insert_vm_struct(mm, vma); + mm->total_vm += npages; + up_write(&mm->mmap_sem); + return 0; } static int __init init_syscall32(void) @@ -102,7 +109,6 @@ static int __init init_syscall32(void) syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); if (!syscall32_page) panic("Cannot allocate syscall32 page"); - SetPageReserved(virt_to_page(syscall32_page)); #ifdef USE_INT80 /* diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile index c340cf6b45..8b30e7c99b 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile @@ -38,6 +38,7 @@ c-obj-$(CONFIG_X86_IO_APIC) += genapic.o genapic_cluster.o genapic_flat.o c-obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o #obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o c-obj-$(CONFIG_MODULES) += module.o diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c index b965d6d1eb..83c8171d5f 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c @@ -63,8 +63,8 @@ int main(void) offsetof (struct rt_sigframe32, uc.uc_mcontext)); BLANK(); #endif - DEFINE(SIZEOF_PBE, sizeof(struct pbe)); DEFINE(pbe_address, offsetof(struct pbe, address)); DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); + DEFINE(pbe_next, offsetof(struct pbe, next)); return 0; } diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c index 7c56a9f53c..5751a578a6 100644 --- 
a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c @@ -2,6 +2,12 @@ * Handle the memory map. * The functions here do the job until bootmem takes over. * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * */ #include <linux/config.h> #include <linux/kernel.h> @@ -279,7 +285,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) int chgidx, still_changing; int overlap_entries; int new_bios_entry; - int old_nr, new_nr; + int old_nr, new_nr, chg_nr; int i; /* @@ -333,20 +339,24 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) for (i=0; i < 2*old_nr; i++) change_point[i] = &change_point_list[i]; - /* record all known change-points (starting and ending addresses) */ + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ chgidx = 0; for (i=0; i < old_nr; i++) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } } + chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ still_changing = 1; while (still_changing) { still_changing = 0; - for (i=1; i < 2*old_nr; i++) { + for (i=1; i < chg_nr; i++) { /* if <current_addr> > <last_addr>, swap */ /* or, if current=<start_addr> & last=<end_addr>, swap */ if 
((change_point[i]->addr < change_point[i-1]->addr) || @@ -369,7 +379,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < 2*old_nr; chgidx++) + for (chgidx=0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) @@ -512,22 +522,67 @@ extern unsigned long xen_override_max_pfn; void __init parse_memopt(char *p, char **from) { - /* - * mem=XXX[kKmM] limits kernel memory to XXX+1MB - * - * It would be more logical to count from 0 instead of from - * HIGH_MEMORY, but we keep that for now for i386 compatibility. - * - * No support for custom mapping like i386. The reason is - * that we need to read the e820 map anyways to handle the - * ACPI mappings in the direct map. Also on x86-64 there - * should be always a good e820 map. This is only an upper - * limit, you cannot force usage of memory not in e820. - * - * -AK - */ - end_user_pfn = memparse(p, from) + HIGH_MEMORY; + end_user_pfn = memparse(p, from); end_user_pfn >>= PAGE_SHIFT; xen_override_max_pfn = (unsigned long) end_user_pfn; } +unsigned long pci_mem_start = 0xaeedbabe; + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. 
+ */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize; + unsigned long last; + int i; + int found = 0; + + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + + if (!found) { + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + } + + /* + * Start allocating dynamic PCI memory a bit into the gap, + * aligned up to the nearest megabyte. + * + * Question: should we try to pad it up a bit (do something + * like " + (gapsize >> 3)" in there too?). We now have the + * technology. + */ + pci_mem_start = (gapstart + 0xfffff) & ~0xfffff; + + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/early_printk.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/early_printk.c index 9abb0406a6..7bc0773a2d 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/early_printk.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/early_printk.c @@ -62,7 +62,7 @@ static struct console early_vga_console = { #ifndef CONFIG_XEN /* Serial functions loosely based on a similar package from Klaus P. 
Gerlicher */ -int early_serial_base = 0x3f8; /* ttyS0 */ +static int early_serial_base = 0x3f8; /* ttyS0 */ #define XMTRDY 0x20 diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S index 8f20ff647b..8ad08b3c95 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S @@ -54,10 +54,7 @@ .code64 -#ifdef CONFIG_PREEMPT -#define preempt_stop XEN_BLOCK_EVENTS(%rsi) -#else -#define preempt_stop +#ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -157,7 +154,7 @@ ENTRY(ret_from_fork) CFI_DEFAULT_STACK call schedule_tail GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) jnz rff_trace rff_action: RESTORE_REST @@ -207,7 +204,7 @@ ENTRY(system_call) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) XEN_UNBLOCK_EVENTS(%r11) GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -220,7 +217,7 @@ ENTRY(system_call) */ .globl ret_from_sys_call ret_from_sys_call: - movl $_TIF_WORK_MASK,%edi + movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ sysret_check: GET_THREAD_INFO(%rcx) @@ -316,6 +313,7 @@ int_careful: pushq %rdi call schedule popq %rdi + cli jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ @@ -330,9 +328,8 @@ int_very_careful: leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave popq %rdi - btr $TIF_SYSCALL_TRACE,%edi - btr $TIF_SYSCALL_AUDIT,%edi - btr $TIF_SINGLESTEP,%edi + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + cli jmp int_restore_rest int_signal: @@ -344,6 +341,7 @@ int_signal: 1: movl $_TIF_NEED_RESCHED,%edi int_restore_rest: RESTORE_REST + cli jmp 
int_with_check CFI_ENDPROC @@ -511,7 +509,7 @@ retint_signal: RESTORE_REST XEN_BLOCK_EVENTS(%rsi) movl $_TIF_NEED_RESCHED,%edi - GET_THREAD_INFO(%rcx) + GET_THREAD_INFO(%rcx) jmp retint_check #ifdef CONFIG_PREEMPT @@ -524,15 +522,8 @@ retint_kernel: bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) jnc retint_restore_args bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ - jc retint_restore_args - movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) - call schedule - XEN_BLOCK_EVENTS(%rsi) -/* cli */ - GET_THREAD_INFO(%rcx) - movl $0,threadinfo_preempt_count(%rcx) + jnc retint_restore_args + call preempt_schedule_irq jmp retint_kernel /* check again */ #endif CFI_ENDPROC @@ -610,6 +601,7 @@ ENTRY(spurious_interrupt) movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) call \sym + cli .endm /* @@ -934,8 +926,6 @@ ENTRY(debug) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug - /* switch back to process stack to restore the state ptrace touched */ - movq %rax,%rsp jmp paranoid_exit CFI_ENDPROC @@ -946,38 +936,62 @@ ENTRY(nmi) pushq $-1 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_nmi + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + */ /* ebx: no swapgs flag */ #endif paranoid_exit: testl %ebx,%ebx /* swapgs needed? 
*/ jnz paranoid_restore paranoid_swapgs: -/* cli - swapgs */ +/* swapgs */ paranoid_restore: RESTORE_ALL 8 /* iretq */ paranoid_userspace: -/* cli */ GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%edx - testl $_TIF_NEED_RESCHED,%edx - jnz paranoid_resched - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx - jnz paranoid_signal - jmp paranoid_swapgs -paranoid_resched: +# movl threadinfo_flags(%rcx),%edx +# testl $_TIF_NEED_RESCHED,%edx +# jnz paranoid_resched +# testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx +# jnz paranoid_signal +# jmp paranoid_swapgs +#paranoid_resched: +#/* sti */ +# call schedule +# jmp paranoid_exit +#paranoid_signal: + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ /* sti */ - call schedule - jmp paranoid_exit -paranoid_signal: -/* sti */ - xorl %esi,%esi /* oldset */ - movq %rsp,%rdi /* &pt_regs */ +# xorl %esi,%esi /* oldset */ +# movq %rsp,%rdi /* &pt_regs */ + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume - jmp paranoid_exit +# jmp paranoid_exit + cli + jmp paranoid_userspace +paranoid_schedule: + sti + call schedule + cli + jmp paranoid_userspace CFI_ENDPROC - + ENTRY(int3) zeroentry do_int3 @@ -1000,7 +1014,6 @@ ENTRY(reserved) ENTRY(double_fault) CFI_STARTPROC paranoidentry do_double_fault - movq %rax,%rsp jmp paranoid_exit CFI_ENDPROC @@ -1014,7 +1027,6 @@ ENTRY(segment_not_present) ENTRY(stack_segment) CFI_STARTPROC paranoidentry do_stack_segment - movq %rax,%rsp jmp paranoid_exit CFI_ENDPROC diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S index d7000838d8..0bc0f62001 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S +++ 
b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S @@ -39,6 +39,8 @@ .text .code64 + .globl startup_64 +startup_64: ENTRY(_start) cld movq init_rsp(%rip),%rsp @@ -136,9 +138,8 @@ ENTRY(cpu_gdt_table) .quad 0x00affa000000ffff /* __USER_CS */ .quad 0x00cffa000000ffff /* __KERNEL32_CS */ .quad 0,0 /* TSS */ - .quad 0 /* LDT */ + .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused now */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ @@ -184,12 +185,6 @@ gdt: .endr #endif -ENTRY(gdt_table32) - .quad 0x0000000000000000 /* This one is magic */ - .quad 0x0000000000000000 /* unused */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ -gdt32_end: - /* We need valid kernel segments for data and code in long mode too * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head64.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head64.c index a4304edf6d..990f8fb190 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head64.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head64.c @@ -36,8 +36,6 @@ static void __init clear_bss(void) } #endif -extern char x86_boot_params[2048]; - #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -52,7 +50,7 @@ static void __init copy_bootdata(char *real_mode_data) int new_data; char * command_line; - memcpy(x86_boot_params, real_mode_data, 2048); + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { @@ -105,9 +103,6 @@ void __init x86_64_start_kernel(char * real_mode_data) #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif - /* default console: */ - if (!strstr(saved_command_line, "console=")) - strcat(saved_command_line, " console=tty0"); #if 0 s = strstr(saved_command_line, "earlyprintk="); if (s != 
NULL) diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/io_apic.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/io_apic.c index 647f09b32e..4cbb1aed8f 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/io_apic.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/io_apic.c @@ -37,11 +37,14 @@ #include <asm/desc.h> #include <asm/proto.h> #include <asm/mach_apic.h> +#include <asm/acpi.h> #define __apicdebuginit __init int sis_apic_bug; /* not actually supported, dummy for compile */ +static int no_timer_check; + static DEFINE_SPINLOCK(ioapic_lock); /* @@ -192,7 +195,7 @@ static void unmask_IO_APIC_irq (unsigned int irq) spin_unlock_irqrestore(&ioapic_lock, flags); } -void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; unsigned long flags; @@ -232,8 +235,8 @@ static void clear_IO_APIC (void) */ #define MAX_PIRQS 8 -int pirq_entries [MAX_PIRQS]; -int pirqs_enabled; +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; int skip_ioapic_setup; int ioapic_force; @@ -758,7 +761,7 @@ static inline void ioapic_register_intr(int irq, int vector, unsigned long trigg #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) #endif /* !CONFIG_XEN */ -void __init setup_IO_APIC_irqs(void) +static void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; int apic, pin, idx, irq, first_notcon = 1, vector; @@ -828,7 +831,7 @@ void __init setup_IO_APIC_irqs(void) * Set up the 8259A-master output pin as broadcast to all * CPUs. 
*/ -void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) +static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) { struct IO_APIC_route_entry entry; unsigned long flags; @@ -1000,6 +1003,8 @@ void __apicdebuginit print_IO_APIC(void) return; } +#if 0 + static __apicdebuginit void print_APIC_bitfield (int base) { unsigned int v; @@ -1141,6 +1146,8 @@ void __apicdebuginit print_PIC(void) v = inb(0x4d1) << 8 | inb(0x4d0); printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } +#endif /* 0 */ + #else void __init print_IO_APIC(void) { } #endif /* !CONFIG_XEN */ @@ -1661,13 +1668,12 @@ static inline void check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ unmask_IO_APIC_irq(0); - if (timer_irq_works()) { + if (!no_timer_check && timer_irq_works()) { nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); enable_8259A_irq(0); - check_nmi_watchdog(); } return; } @@ -1687,7 +1693,6 @@ static inline void check_timer(void) nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - check_nmi_watchdog(); } return; } @@ -1736,6 +1741,13 @@ static inline void check_timer(void) #define check_timer() ((void)0) #endif /* !CONFIG_XEN */ +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * * IRQ's that are handled by the PIC in the MPS IOAPIC case. 
@@ -1777,7 +1789,7 @@ struct sysfs_ioapic_data { }; static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; -static int ioapic_suspend(struct sys_device *dev, u32 state) +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; @@ -1871,78 +1883,6 @@ device_initcall(ioapic_init_sysfs); #define IO_APIC_MAX_ID 0xFE -int __init io_apic_get_unique_id (int ioapic, int apic_id) -{ -#ifndef CONFIG_XEN - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = phys_cpu_present_map; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= IO_APIC_MAX_ID) { - apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. 
- */ - if (physid_isset(apic_id, apic_id_map)) { - - for (i = 0; i < IO_APIC_MAX_ID; i++) { - if (!physid_isset(i, apic_id_map)) - break; - } - - if (i == IO_APIC_MAX_ID) - panic("Max apic_id exceeded!\n"); - - apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - physid_set(apic_id, apic_id_map); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) - panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); - } - - apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); -#endif /* !CONFIG_XEN */ - - return apic_id; -} - - int __init io_apic_get_version (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ioport.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ioport.c index ff96e22341..7d7385cf61 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ioport.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ioport.c @@ -25,8 +25,7 @@ * */ -// asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -asmlinkage long sys_iopl(unsigned int new_io_pl) +asmlinkage long sys_iopl(unsigned int new_io_pl, struct pt_regs *regs) { unsigned int old_io_pl = current->thread.io_pl; physdev_op_t op; @@ -59,5 +58,5 @@ asmlinkage long sys_iopl(unsigned int new_io_pl) */ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - return turn_on ? sys_iopl(3) : 0; + return turn_on ? 
sys_iopl(3, NULL) : 0; } diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/mpparse.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/mpparse.c index c2aff7edad..059f276767 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/mpparse.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/mpparse.c @@ -30,6 +30,7 @@ #include <asm/pgalloc.h> #include <asm/io_apic.h> #include <asm/proto.h> +#include <asm/acpi.h> /* Have we found an MP table */ int smp_found_config; @@ -46,7 +47,7 @@ unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL }; -int mp_current_pci_id = 0; +static int mp_current_pci_id = 0; /* I/O APIC entries */ struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; @@ -108,6 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { int ver; + static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) return; @@ -127,11 +129,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m) " Processor ignored.\n", NR_CPUS); return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } num_processors++; @@ -151,7 +148,19 @@ static void __init MP_processor_info (struct mpc_config_processor *m) ver = 0x10; } apic_version[m->mpc_apicid] = ver; - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + /* + * bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. 
+ */ + bios_cpu_apicid[0] = m->mpc_apicid; + x86_cpu_to_apicid[0] = m->mpc_apicid; + found_bsp = 1; + } else { + bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; + x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; + } } #else void __init MP_processor_info (struct mpc_config_processor *m) @@ -714,7 +723,7 @@ void __init mp_register_lapic ( #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 -struct mp_ioapic_routing { +static struct mp_ioapic_routing { int apic_id; int gsi_start; int gsi_end; @@ -764,7 +773,7 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; mp_ioapics[idx].mpc_apicaddr = address; - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + mp_ioapics[idx].mpc_apicid = id; mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); /* diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/process.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/process.c index 48afb51919..a6b1a843cf 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/process.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/process.c @@ -37,6 +37,7 @@ #include <linux/irq.h> #include <linux/ptrace.h> #include <linux/utsname.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -59,7 +60,7 @@ asmlinkage extern void ret_from_fork(void); unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; -atomic_t hlt_counter = ATOMIC_INIT(0); +static atomic_t hlt_counter = ATOMIC_INIT(0); unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); @@ -68,7 +69,7 @@ EXPORT_SYMBOL(boot_option_idle_override); * Powermanagement idle function, if any.. 
*/ void (*pm_idle)(void); -static cpumask_t cpu_idle_map; +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) { @@ -140,8 +141,8 @@ void cpu_idle (void) /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { - if (cpu_isset(cpu, cpu_idle_map)) - cpu_clear(cpu, cpu_idle_map); + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; rmb(); if (cpu_is_offline(cpu)) @@ -150,22 +151,35 @@ void cpu_idle (void) __IRQ_STAT(cpu,idle_timestamp) = jiffies; xen_idle(); } + schedule(); } } void cpu_idle_wait(void) { - int cpu; + unsigned int cpu, this_cpu = get_cpu(); cpumask_t map; - for_each_online_cpu(cpu) - cpu_set(cpu, cpu_idle_map); + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; wmb(); do { ssleep(1); - cpus_and(map, cpu_idle_map, cpu_online_map); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); } while (!cpus_empty(map)); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -199,11 +213,11 @@ void __show_regs(struct pt_regs * regs) printk("R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl %%es,%0" : "=r" (es)); - asm("movl %%fs,%0" : "=r" (fsindex)); - asm("movl %%gs,%0" : "=r" (gsindex)); + asm("mov %%ds,%0" : "=r" (ds)); + asm("mov %%cs,%0" : "=r" (cs)); + asm("mov %%es,%0" : "=r" (es)); + asm("mov %%fs,%0" : "=r" (fsindex)); + asm("mov %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_GS_BASE, gs); @@ -343,10 +357,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; - asm("movl %%gs,%0" : "=m" (p->thread.gsindex)); - asm("movl %%fs,%0" : 
"=m" (p->thread.fsindex)); - asm("movl %%es,%0" : "=m" (p->thread.es)); - asm("movl %%ds,%0" : "=m" (p->thread.ds)); + asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); + asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); + asm("mov %%es,%0" : "=m" (p->thread.es)); + asm("mov %%ds,%0" : "=m" (p->thread.ds)); if (unlikely(me->thread.io_bitmap_ptr != NULL)) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); @@ -651,7 +665,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) set_32bit_tls(task, FS_TLS, addr); if (doit) { load_TLS(&task->thread, cpu); - asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); + asm volatile("mov %0,%%fs" :: "r" (FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; @@ -661,7 +675,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (doit) { /* set the selector to 0 to not confuse __switch_to */ - asm volatile("movl %0,%%fs" :: "r" (0)); + asm volatile("mov %0,%%fs" :: "r" (0)); ret = HYPERVISOR_set_segment_base(SEGBASE_FS, addr); } @@ -723,3 +737,10 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) boot_option_idle_override = 1; return 1; } + +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c index 72d4cd59e4..080121e1ab 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c @@ -86,7 +86,6 @@ unsigned long xen_override_max_pfn; struct cpuinfo_x86 boot_cpu_data; unsigned long mmu_cr4_features; -EXPORT_SYMBOL_GPL(mmu_cr4_features); int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); @@ -98,9 +97,6 @@ int __initdata acpi_force = 0; int acpi_numa __initdata; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; - /* Boot loader ID as an integer, 
for the benefit of proc_dointvec */ int bootloader_type; @@ -124,8 +120,6 @@ struct sys_desc_table_struct { struct edid_info edid_info; struct e820map e820; -unsigned char aux_device_present; - extern int root_mountflags; extern char _text, _etext, _edata, _end; @@ -377,6 +371,10 @@ static __init void parse_cmdline_early (char ** cmdline_p) else if (!memcmp(from, "acpi=strict", 11)) { acpi_strict = 1; } +#ifdef CONFIG_X86_IO_APIC + else if (!memcmp(from, "acpi_skip_timer_override", 24)) + acpi_skip_timer_override = 1; +#endif #endif #if 0 @@ -603,7 +601,6 @@ static void __init print_memory_map(char *who) void __init setup_arch(char **cmdline_p) { - unsigned long low_mem_size; int i, j; physdev_op_t op; @@ -618,7 +615,6 @@ void __init setup_arch(char **cmdline_p) screen_info = SCREEN_INFO; #endif edid_info = EDID_INFO; - aux_device_present = AUX_DEVICE_INFO; saved_video_mode = SAVED_VIDEO_MODE; bootloader_type = LOADER_TYPE; @@ -821,13 +817,7 @@ void __init setup_arch(char **cmdline_p) request_resource(&ioport_resource, &standard_io_resources[i]); } - /* Will likely break when you have unassigned resources with more - than 4GB memory and bridges that don't support more than 4GB. - Doing it properly would require to use pci_alloc_consistent - in this case. 
*/ - low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; - if (low_mem_size > pci_mem_start) - pci_mem_start = low_mem_size; + e820_setup_gap(); #ifdef CONFIG_GART_IOMMU iommu_hole_init(); @@ -867,7 +857,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; - if (c->x86_cpuid_level < 0x80000004) + if (c->extended_cpuid_level < 0x80000004) return 0; v = (unsigned int *) c->x86_model_id; @@ -883,7 +873,7 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; - n = c->x86_cpuid_level; + n = c->extended_cpuid_level; if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); @@ -913,14 +903,50 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c) } } +/* + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * Assumes number of cores is a power of two. + */ +static void __init amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + int cpu = smp_processor_id(); + int node = 0; + unsigned bits; + if (c->x86_num_cores == 1) + return; + + bits = 0; + while ((1 << bits) < c->x86_num_cores) + bits++; + + /* Low order bits define the core id (index of core in socket) */ + cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); + /* Convert the APIC ID into the socket ID */ + phys_proc_id[cpu] >>= bits; + +#ifdef CONFIG_NUMA + /* When an ACPI SRAT table is available use the mappings from SRAT + instead. 
*/ + if (acpi_numa <= 0) { + node = phys_proc_id[cpu]; + if (!node_online(node)) + node = first_node(node_online_map); + cpu_to_node[cpu] = node; + } else { + node = cpu_to_node[cpu]; + } +#endif + + printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", + cpu, c->x86_num_cores, node, cpu_core_id[cpu]); +#endif +} static int __init init_amd(struct cpuinfo_x86 *c) { int r; int level; -#ifdef CONFIG_NUMA - int cpu; -#endif /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ @@ -943,26 +969,12 @@ static int __init init_amd(struct cpuinfo_x86 *c) } display_cacheinfo(c); - if (c->x86_cpuid_level >= 0x80000008) { + if (c->extended_cpuid_level >= 0x80000008) { c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; if (c->x86_num_cores & (c->x86_num_cores - 1)) c->x86_num_cores = 1; -#ifdef CONFIG_NUMA - /* On a dual core setup the lower bits of apic id - distingush the cores. Fix up the CPU<->node mappings - here based on that. - Assumes number of cores is a power of two. - When using SRAT use mapping from SRAT. 
*/ - cpu = c->x86_apicid; - if (acpi_numa <= 0 && c->x86_num_cores > 1) { - cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1); - if (!node_online(cpu_to_node[cpu])) - cpu_to_node[cpu] = first_node(node_online_map); - } - printk(KERN_INFO "CPU %d(%d) -> Node %d\n", - cpu, c->x86_num_cores, cpu_to_node[cpu]); -#endif + amd_detect_cmp(c); } return r; @@ -972,10 +984,10 @@ static void __init detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; - int index_lsb, index_msb, tmp; + int index_msb, tmp; int cpu = smp_processor_id(); - if (!cpu_has(c, X86_FEATURE_HT)) + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -984,7 +996,6 @@ static void __init detect_ht(struct cpuinfo_x86 *c) if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); } else if (smp_num_siblings > 1) { - index_lsb = 0; index_msb = 31; /* * At this point we only support two siblings per @@ -996,45 +1007,65 @@ static void __init detect_ht(struct cpuinfo_x86 *c) return; } tmp = smp_num_siblings; - while ((tmp & 1) == 0) { - tmp >>=1 ; - index_lsb++; - } - tmp = smp_num_siblings; while ((tmp & 0x80000000 ) == 0) { tmp <<=1 ; index_msb--; } - if (index_lsb != index_msb ) + if (smp_num_siblings & (smp_num_siblings - 1)) index_msb++; phys_proc_id[cpu] = phys_pkg_id(index_msb); printk(KERN_INFO "CPU: Physical Processor ID: %d\n", phys_proc_id[cpu]); + + smp_num_siblings = smp_num_siblings / c->x86_num_cores; + + tmp = smp_num_siblings; + index_msb = 31; + while ((tmp & 0x80000000) == 0) { + tmp <<=1 ; + index_msb--; + } + if (smp_num_siblings & (smp_num_siblings - 1)) + index_msb++; + + cpu_core_id[cpu] = phys_pkg_id(index_msb); + + if (c->x86_num_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + cpu_core_id[cpu]); } #endif } -static void __init sched_cmp_hack(struct cpuinfo_x86 *c) +/* + * find out the number of processor cores on the die + */ +static int __init 
intel_num_cpu_cores(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP - /* AMD dual core looks like HT but isn't really. Hide it from the - scheduler. This works around problems with the domain scheduler. - Also probably gives slightly better scheduling and disables - SMT nice which is harmful on dual core. - TBD tune the domain scheduler for dual core. */ - if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY)) - smp_num_siblings = 1; -#endif + unsigned int eax; + + if (c->cpuid_level < 4) + return 1; + + __asm__("cpuid" + : "=a" (eax) + : "0" (4), "c" (0) + : "bx", "dx"); + + if (eax & 0x1f) + return ((eax >> 26) + 1); + else + return 1; } - + static void __init init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; init_intel_cacheinfo(c); - n = c->x86_cpuid_level; + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; @@ -1043,6 +1074,9 @@ static void __init init_intel(struct cpuinfo_x86 *c) if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 >= 15) + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + c->x86_num_cores = intel_num_cpu_cores(c); } void __init get_cpu_vendor(struct cpuinfo_x86 *c) @@ -1079,8 +1113,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; c->x86_num_cores = 1; - c->x86_apicid = c == &boot_cpu_data ? 
0 : c - cpu_data; - c->x86_cpuid_level = 0; + c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); /* Get vendor name */ @@ -1108,11 +1141,14 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) } if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - c->x86_apicid = misc >> 24; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; } + +#ifdef CONFIG_SMP + phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; +#endif } /* @@ -1127,11 +1163,11 @@ void __init identify_cpu(struct cpuinfo_x86 *c) /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); - c->x86_cpuid_level = xlvl; + c->extended_cpuid_level = xlvl; if ((xlvl & 0xffff0000) == 0x80000000) { if (xlvl >= 0x80000001) { c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[5] = cpuid_ecx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); } if (xlvl >= 0x80000004) get_model_name(c); /* Default name */ @@ -1172,7 +1208,6 @@ void __init identify_cpu(struct cpuinfo_x86 *c) select_idle_routine(c); detect_ht(c); - sched_cmp_hack(c); /* * On SMP, boot_cpu_data holds the common feature set between @@ -1231,7 +1266,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, /* AMD-defined */ - "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow", @@ -1243,7 +1278,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL, + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, + "constant_tsc", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1254,17 +1290,25 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* VIA/Cyrix/Centaur-defined */ + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + /* AMD-defined (#2) */ "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static char *x86_power_flags[] = { "ts", /* temperature sensor */ "fid", /* frequency id control */ "vid", /* voltage id control */ "ttp", /* thermal trip */ + "tm", + "stc" }; @@ -1299,8 +1343,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); #ifdef CONFIG_SMP - seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]); - seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings); + if (smp_num_siblings * c->x86_num_cores > 1) { + int cpu = c - cpu_data; + seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); + seq_printf(m, "siblings\t: %d\n", + c->x86_num_cores * smp_num_siblings); + seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); + seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); + } #endif seq_printf(m, @@ -1342,12 +1392,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, " [%d]", i); } } - seq_printf(m, "\n"); - - if (c->x86_num_cores > 1) - seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); - seq_printf(m, "\n\n"); + seq_printf(m, "\n\n"); return 0; } diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c index 
03452e1bf3..07e0035617 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c @@ -16,6 +16,7 @@ #include <linux/string.h> #include <linux/bootmem.h> #include <linux/bitops.h> +#include <asm/bootsetup.h> #include <asm/pda.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -32,7 +33,7 @@ #include <asm-xen/hypervisor.h> -char x86_boot_params[2048] __initdata = {0,}; +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/signal.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/signal.c index 2282257d98..5a2c6f179d 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/signal.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/signal.c @@ -28,6 +28,7 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/proto.h> +#include <asm/ia32_unistd.h> /* #define DEBUG_SIG 1 */ @@ -83,7 +84,7 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct rt_sigframe { - char *pretcode; + char __user *pretcode; struct ucontext uc; struct siginfo info; }; @@ -121,7 +122,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned err |= __get_user(buf, &sc->fpstate); if (buf) { - if (verify_area(VERIFY_READ, buf, sizeof(*buf))) + if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) goto badframe; err |= restore_i387(buf); } else { @@ -147,7 +148,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) unsigned long eax; frame = (struct rt_sigframe __user *)(regs->rsp - 8); - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) { + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { goto badframe; } if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { @@ -160,9 +161,8 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, 
&frame->uc.uc_mcontext, &eax)) { + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) goto badframe; - } #ifdef DEBUG_SIG printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); @@ -186,7 +186,6 @@ static inline int setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) { int err = 0; - unsigned long eflags; err |= __put_user(0, &sc->gs); err |= __put_user(0, &sc->fs); @@ -210,11 +209,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(me->thread.trap_no, &sc->trapno); err |= __put_user(me->thread.error_code, &sc->err); err |= __put_user(regs->rip, &sc->rip); - eflags = regs->eflags; - if (current->ptrace & PT_PTRACED) { - eflags &= ~TF_MASK; - } - err |= __put_user(eflags, &sc->eflags); + err |= __put_user(regs->eflags, &sc->eflags); err |= __put_user(mask, &sc->oldmask); err |= __put_user(me->thread.cr2, &sc->cr2); @@ -253,28 +248,25 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (used_math()) { fp = get_stack(ka, regs, sizeof(struct _fpstate)); - frame = (void __user *)round_down((unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; + frame = (void __user *)round_down( + (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) { - goto give_sigsegv; - } + if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) + goto give_sigsegv; if (save_i387(fp) < 0) err |= -1; - } else { + } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; - } - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) { + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - } if (ka->sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, info); - if (err) { + if (err) goto give_sigsegv; } - } /* Create the ucontext. 
*/ err |= __put_user(0, &frame->uc.uc_flags); @@ -288,9 +280,8 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (sizeof(*set) == 16) { __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); - } else { - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - } + } else + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -302,9 +293,8 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; } - if (err) { + if (err) goto give_sigsegv; - } #ifdef DEBUG_SIG printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); @@ -329,14 +319,9 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->rsp = (unsigned long)frame; set_fs(USER_DS); - if (regs->eflags & TF_MASK) { - if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) { - ptrace_notify(SIGTRAP); - } else { - regs->eflags &= ~TF_MASK; - } - } - + regs->eflags &= ~TF_MASK; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); @@ -357,7 +342,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { #ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig, + printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", + current->pid, sig, regs->rip, regs->rsp, regs); #endif @@ -375,10 +361,23 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, regs->rax = -EINTR; break; } - /* fallthrough */ + /* fallthrough */ case -ERESTARTNOINTR: regs->rax = regs->orig_rax; regs->rip -= 2; + break; + } + } + + /* + * If TF is set 
due to a debugger (PT_DTRACE), clear the TF + * flag so that register information in the sigcontext is + * correct. + */ + if (unlikely(regs->eflags & TF_MASK)) { + if (likely(current->ptrace & PT_DTRACE)) { + current->ptrace &= ~PT_DTRACE; + regs->eflags &= ~TF_MASK; } } @@ -418,9 +417,8 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * kernel mode. Just return without doing anything * if so. */ - if ((regs->cs & 2) != 2) { + if ((regs->cs & 2) != 2) return 1; - } if (try_to_freeze(0)) goto no_signal; @@ -457,7 +455,9 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) regs->rip -= 2; } if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { - regs->rax = __NR_restart_syscall; + regs->rax = test_thread_flag(TIF_IA32) ? + __NR_ia32_restart_syscall : + __NR_restart_syscall; regs->rip -= 2; } } diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c index b183fafacd..1e379ed17b 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c @@ -25,7 +25,9 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mach_apic.h> +#include <asm/mmu_context.h> #include <asm/proto.h> +#include <asm/apicdef.h> /* * Smarter SMP flushing macros. 
@@ -41,7 +43,7 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff +#define FLUSH_ALL -1ULL /* * We cannot call mmdrop() because we are in interrupt context, @@ -52,7 +54,7 @@ static inline void leave_mm (unsigned long cpu) if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask); - __flush_tlb(); + load_cr3(swapper_pg_dir); } /* @@ -372,7 +374,10 @@ void smp_send_stop(void) __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); if (!nolock) spin_unlock(&call_lock); - smp_stop_cpu(); + + local_irq_disable(); + disable_local_APIC(); + local_irq_enable(); } /* @@ -409,3 +414,27 @@ asmlinkage void smp_call_function_interrupt(void) atomic_inc(&call_data->finished); } } + +int safe_smp_processor_id(void) +{ + int apicid, i; + + if (disable_apic) + return 0; + + apicid = hard_smp_processor_id(); + if (x86_cpu_to_apicid[apicid] == apicid) + return apicid; + + for (i = 0; i < NR_CPUS; ++i) { + if (x86_cpu_to_apicid[i] == apicid) + return i; + } + + /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, + * or called too early. Either way, we must be CPU 0. */ + if (x86_cpu_to_apicid[0] == BAD_APICID) + return 0; + + return 0; /* Should not happen */ +} diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c index a7e2c3e95e..f1ec0f3459 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c @@ -12,8 +12,7 @@ * Pentium Pro and Pentium-II/Xeon MP machines. * Original development of Linux SMP code supported by Caldera. * - * This code is released under the GNU General Public License version 2 or - * later. 
+ * This code is released under the GNU General Public License version 2 * * Fixes * Felix Koop : NR_CPUS used properly @@ -31,9 +30,13 @@ * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. * Maciej W. Rozycki : Bits for genuine 82489DX APICs * Andi Kleen : Changed for SMP boot into long mode. - * Rusty Russell : Hacked into shape for new "hotplug" boot process. + * Rusty Russell : Hacked into shape for new "hotplug" boot process. + * Andi Kleen : Converted to new state machine. + * Various cleanups. + * Probably mostly hotplug CPU ready now. */ + #include <linux/config.h> #include <linux/init.h> @@ -53,19 +56,35 @@ #include <asm/kdebug.h> #include <asm/tlbflush.h> #include <asm/proto.h> +#include <asm/nmi.h> + +/* Change for real CPU hotplug. Note other files need to be fixed + first too. */ +#define __cpuinit __init +#define __cpuinitdata __initdata /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(phys_proc_id); +EXPORT_SYMBOL(cpu_core_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map; +EXPORT_SYMBOL(cpu_online_map); + +/* + * Private maps to synchronize booting between AP and BP. + * Probably not needed anymore, but it makes for easier debugging. -AK + */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -static cpumask_t smp_commenced_mask; + +cpumask_t cpu_possible_map; +EXPORT_SYMBOL(cpu_possible_map); /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; @@ -74,13 +93,15 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; int smp_threads_ready; cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_core_map); /* * Trampoline 80x86 program as an array. 
*/ -extern unsigned char trampoline_data []; -extern unsigned char trampoline_end []; +extern unsigned char trampoline_data[]; +extern unsigned char trampoline_end[]; /* * Currently trivial. Write the real->protected mode @@ -88,11 +109,9 @@ extern unsigned char trampoline_end []; * has made sure it's suitably aligned. */ -static unsigned long __init setup_trampoline(void) +static unsigned long __cpuinit setup_trampoline(void) { void *tramp = __va(SMP_TRAMPOLINE_BASE); - extern volatile __u32 tramp_gdt_ptr; - tramp_gdt_ptr = __pa_symbol(&cpu_gdt_table); memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(tramp); } @@ -102,154 +121,224 @@ static unsigned long __init setup_trampoline(void) * a given CPU */ -static void __init smp_store_cpu_info(int id) +static void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; *c = boot_cpu_data; identify_cpu(c); + print_cpu_info(c); } /* - * TSC synchronization. + * New Funky TSC sync algorithm borrowed from IA64. + * Main advantage is that it doesn't reset the TSCs fully and + * in general looks more robust and it works better than my earlier + * attempts. I believe it was written by David Mosberger. Some minor + * adjustments for x86-64 by me -AK + * + * Original comment reproduced below. + * + * Synchronize TSC of the current (slave) CPU with the TSC of the + * MASTER CPU (normally the time-keeper CPU). We use a closed loop to + * eliminate the possibility of unaccounted-for errors (such as + * getting a machine check in the middle of a calibration step). The + * basic idea is for the slave to ask the master what itc value it has + * and to read its own itc before and after the master responds. Each + * iteration gives us three timestamps: * - * We first check whether all CPUs have their TSC's synchronized, - * then we print a warning if not, and always resync. 
+ * slave master + * + * t0 ---\ + * ---\ + * ---> + * tm + * /--- + * /--- + * t1 <--- + * + * + * The goal is to adjust the slave's TSC such that tm falls exactly + * half-way between t0 and t1. If we achieve this, the clocks are + * synchronized provided the interconnect between the slave and the + * master is symmetric. Even if the interconnect were asymmetric, we + * would still know that the synchronization error is smaller than the + * roundtrip latency (t0 - t1). + * + * When the interconnect is quiet and symmetric, this lets us + * synchronize the TSC to within one or two cycles. However, we can + * only *guarantee* that the synchronization is accurate to within a + * round-trip time, which is typically in the range of several hundred + * cycles (e.g., ~500 cycles). In practice, this means that the TSCs + * are usually almost perfectly synchronized, but we shouldn't assume + * that the accuracy is much better than half a micro second or so. + * + * [there are other errors like the latency of RDTSC and of the + * WRMSR. These can also account to hundreds of cycles. So it's + * probably worse. It claims 153 cycles error on a dual Opteron, + * but I suspect the numbers are actually somewhat worse -AK] */ -static atomic_t tsc_start_flag = ATOMIC_INIT(0); -static atomic_t tsc_count_start = ATOMIC_INIT(0); -static atomic_t tsc_count_stop = ATOMIC_INIT(0); -static unsigned long long tsc_values[NR_CPUS]; - -#define NR_LOOPS 5 +#define MASTER 0 +#define SLAVE (SMP_CACHE_BYTES/8) -extern unsigned int fast_gettimeoffset_quotient; +/* Intentionally don't use cpu_relax() while TSC synchronization + because we don't want to go into funky power save modi or cause + hypervisors to schedule us away. Going to sleep would likely affect + latency and low latency is the primary objective here. 
-AK */ +#define no_cpu_relax() barrier() -static void __init synchronize_tsc_bp (void) -{ - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - long one_usec; - int buggy = 0; +static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); +static volatile __cpuinitdata unsigned long go[SLAVE + 1]; +static int notscsync __cpuinitdata; - printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus()); +#undef DEBUG_TSC_SYNC - one_usec = cpu_khz; +#define NUM_ROUNDS 64 /* magic value */ +#define NUM_ITERS 5 /* likewise */ - atomic_set(&tsc_start_flag, 1); - wmb(); +/* Callback on boot CPU */ +static __cpuinit void sync_master(void *arg) +{ + unsigned long flags, i; - /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. 
- */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb(); - atomic_set(&tsc_count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc_count_start); + if (smp_processor_id() != boot_cpu_id) + return; - sync_core(); - rdtscll(tsc_values[smp_processor_id()]); - /* - * We clear the TSC in the last loop: - */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); + go[MASTER] = 0; - /* - * Wait for all APs to leave the synchronization point: - */ - while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb(); - atomic_set(&tsc_count_start, 0); - wmb(); - atomic_inc(&tsc_count_stop); + local_irq_save(flags); + { + for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { + while (!go[MASTER]) + no_cpu_relax(); + go[MASTER] = 0; + rdtscll(go[SLAVE]); + } } + local_irq_restore(flags); +} - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc_values[i]; - sum += t0; - } - } - avg = sum / num_booting_cpus(); +/* + * Return the number of cycles by which our tsc differs from the tsc + * on the master (time-keeper) CPU. A positive number indicates our + * tsc is ahead of the master, negative that it is behind. + */ +static inline long +get_delta(long *rt, long *master) +{ + unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; + unsigned long tcenter, t0, t1, tm; + int i; - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; + for (i = 0; i < NUM_ITERS; ++i) { + rdtscll(t0); + go[MASTER] = 1; + while (!(tm = go[SLAVE])) + no_cpu_relax(); + go[SLAVE] = 0; + rdtscll(t1); - delta = tsc_values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. 
- */ - if (delta > 2*one_usec) { - long realdelta; - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = delta / one_usec; - if (tsc_values[i] < avg) - realdelta = -realdelta; + if (t1 - t0 < best_t1 - best_t0) + best_t0 = t0, best_t1 = t1, best_tm = tm; + } - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", - i, realdelta); - } + *rt = best_t1 - best_t0; + *master = best_tm - best_t0; - sum += delta; - } - if (!buggy) - printk("passed.\n"); + /* average best_t0 and best_t1 without overflow: */ + tcenter = (best_t0/2 + best_t1/2); + if (best_t0 % 2 + best_t1 % 2 == 2) + ++tcenter; + return tcenter - best_tm; } -static void __init synchronize_tsc_ap (void) +static __cpuinit void sync_tsc(void) { - int i; + int i, done = 0; + long delta, adj, adjust_latency = 0; + unsigned long flags, rt, master_time_stamp, bound; +#if DEBUG_TSC_SYNC + static struct syncdebug { + long rt; /* roundtrip time */ + long master; /* master's timestamp */ + long diff; /* difference between midpoint and master's timestamp */ + long lat; /* estimate of tsc adjustment latency */ + } t[NUM_ROUNDS] __cpuinitdata; +#endif - /* - * Not every cpu is online at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: - */ - while (!atomic_read(&tsc_start_flag)) mb(); + go[MASTER] = 1; + + smp_call_function(sync_master, NULL, 1, 0); + + while (go[MASTER]) /* wait for master to be ready */ + no_cpu_relax(); - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb(); + spin_lock_irqsave(&tsc_sync_lock, flags); + { + for (i = 0; i < NUM_ROUNDS; ++i) { + delta = get_delta(&rt, &master_time_stamp); + if (delta == 0) { + done = 1; /* let's lock on to this... 
*/ + bound = rt; + } - sync_core(); - rdtscll(tsc_values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); + if (!done) { + unsigned long t; + if (i > 0) { + adjust_latency += -delta; + adj = -delta + adjust_latency/4; + } else + adj = -delta; - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + rdtscll(t); + wrmsrl(MSR_IA32_TSC, t + adj); + } +#if DEBUG_TSC_SYNC + t[i].rt = rt; + t[i].master = master_time_stamp; + t[i].diff = delta; + t[i].lat = adjust_latency/4; +#endif + } } + spin_unlock_irqrestore(&tsc_sync_lock, flags); + +#if DEBUG_TSC_SYNC + for (i = 0; i < NUM_ROUNDS; ++i) + printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", + t[i].rt, t[i].master, t[i].diff, t[i].lat); +#endif + + printk(KERN_INFO + "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " + "maxerr %lu cycles)\n", + smp_processor_id(), boot_cpu_id, delta, rt); } -#undef NR_LOOPS -static atomic_t init_deasserted; +static void __cpuinit tsc_sync_wait(void) +{ + if (notscsync || !cpu_has_tsc) + return; + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), + boot_cpu_id); + sync_tsc(); +} -void __init smp_callin(void) +static __init int notscsync_setup(char *s) +{ + notscsync = 1; + return 0; +} +__setup("notscsync", notscsync_setup); + +static atomic_t init_deasserted __cpuinitdata; + +/* + * Report back to the Boot Processor. + * Running on AP. + */ +void __cpuinit smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -260,7 +349,8 @@ void __init smp_callin(void) * our local APIC. We have to wait for the IPI or we'll * lock up on an APIC access. */ - while (!atomic_read(&init_deasserted)); + while (!atomic_read(&init_deasserted)) + cpu_relax(); /* * (This works even if the APIC is not enabled.) 
@@ -291,7 +381,7 @@ void __init smp_callin(void) */ if (cpu_isset(cpuid, cpu_callout_map)) break; - rep_nop(); + cpu_relax(); } if (!time_before(jiffies, timeout)) { @@ -309,8 +399,6 @@ void __init smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); - local_irq_enable(); - /* * Get our bogomips. */ @@ -324,26 +412,16 @@ void __init smp_callin(void) */ smp_store_cpu_info(cpuid); - local_irq_disable(); - /* * Allow the master to continue. */ cpu_set(cpuid, cpu_callin_map); - - /* - * Synchronize the TSC with the BP - */ - if (cpu_has_tsc) - synchronize_tsc_ap(); } -int cpucount; - /* - * Activate a secondary processor. + * Setup code on secondary processor (after comming out of the trampoline) */ -void __init start_secondary(void) +void __cpuinit start_secondary(void) { /* * Dont put anything before smp_callin(), SMP @@ -356,14 +434,10 @@ void __init start_secondary(void) /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); - Dprintk("cpu %d: waiting for commence\n", smp_processor_id()); - while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) - rep_nop(); - Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); - Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); + Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -371,27 +445,27 @@ void __init start_secondary(void) enable_8259A_irq(0); } - - enable_APIC_timer(); + enable_APIC_timer(); /* - * low-memory mappings have been cleared, flush them from - * the local TLBs too. + * Allow the master to continue. */ - local_flush_tlb(); - - Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); - wmb(); - + mb(); + + /* Wait for TSC sync to not schedule things before. + We still process interrupts, which could see an inconsistent + time in that window unfortunately. 
*/ + tsc_sync_wait(); + cpu_idle(); } -extern volatile unsigned long init_rsp; +extern volatile unsigned long init_rsp; extern void (*initial_code)(void); #if APIC_DEBUG -static inline void inquire_remote_apic(int apicid) +static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; @@ -428,7 +502,10 @@ static inline void inquire_remote_apic(int apicid) } #endif -static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) +/* + * Kick the secondary to wake up. + */ +static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) { unsigned long send_status = 0, accept_status = 0; int maxlvt, timeout, num_starts, j; @@ -551,33 +628,35 @@ static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_ return (send_status | accept_status); } -static void __init do_boot_cpu (int apicid) +/* + * Boot one CPU. + */ +static int __cpuinit do_boot_cpu(int cpu, int apicid) { struct task_struct *idle; unsigned long boot_error; - int timeout, cpu; + int timeout; unsigned long start_rip; - - cpu = ++cpucount; /* * We can't use kernel_thread since we must avoid to * reschedule the child. */ idle = fork_idle(cpu); - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - x86_cpu_to_apicid[cpu] = apicid; + if (IS_ERR(idle)) { + printk("failed fork for CPU %d\n", cpu); + return PTR_ERR(idle); + } cpu_pda[cpu].pcurrent = idle; start_rip = setup_trampoline(); - init_rsp = idle->thread.rsp; + init_rsp = idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; clear_ti_thread_flag(idle->thread_info, TIF_FORK); - printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, + printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, start_rip, init_rsp); /* @@ -614,7 +693,7 @@ static void __init do_boot_cpu (int apicid) /* * Starting actual IPI sequence... 
*/ - boot_error = wakeup_secondary_via_INIT(apicid, start_rip); + boot_error = wakeup_secondary_via_INIT(apicid, start_rip); if (!boot_error) { /* @@ -635,8 +714,6 @@ static void __init do_boot_cpu (int apicid) if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); - print_cpu_info(&cpu_data[cpu]); Dprintk("CPU has booted.\n"); } else { boot_error = 1; @@ -655,76 +732,131 @@ static void __init do_boot_cpu (int apicid) if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - cpucount--; + cpu_clear(cpu, cpu_present_map); + cpu_clear(cpu, cpu_possible_map); x86_cpu_to_apicid[cpu] = BAD_APICID; x86_cpu_to_log_apicid[cpu] = BAD_APICID; + return -EIO; } + + return 0; } cycles_t cacheflush_time; unsigned long cache_decay_ticks; -static void smp_tune_scheduling (void) +/* + * Construct cpu_sibling_map[], so that we can tell the sibling CPU + * on SMT systems efficiently. + */ +static __cpuinit void detect_siblings(void) { - int cachesize; /* kB */ - unsigned long bandwidth = 1000; /* MB/s */ - /* - * Rough estimation for SMP scheduling, this is the number of - * cycles it takes for a fully memory-limited process to flush - * the SMP-local cache. - * - * (For a P5 this pretty much means we will choose another idle - * CPU almost always at wakeup time (this is due to the small - * L1 cache), on PIIs it's around 50-100 usecs, depending on - * the cache size) - */ - - if (!cpu_khz) { - /* - * this basically disables processor-affinity - * scheduling on SMP without a TSC. 
- */ - cacheflush_time = 0; - return; - } else { - cachesize = boot_cpu_data.x86_cache_size; - if (cachesize == -1) { - cachesize = 16; /* Pentiums, 2x8kB cache */ - bandwidth = 100; - } + int cpu; - cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); } - cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + for_each_online_cpu (cpu) { + struct cpuinfo_x86 *c = cpu_data + cpu; + int siblings = 0; + int i; + if (smp_num_siblings > 1) { + for_each_online_cpu (i) { + if (cpu_core_id[cpu] == cpu_core_id[i]) { + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); + } + } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); + } - printk(KERN_INFO "per-CPU timeslice cutoff: %ld.%02ld usecs.\n", - (long)cacheflush_time/(cpu_khz/1000), - ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); - printk(KERN_INFO "task migration cache decay timeout: %ld msecs.\n", - (cache_decay_ticks + 1) * 1000 / HZ); + if (siblings != smp_num_siblings) { + printk(KERN_WARNING + "WARNING: %d siblings found for CPU%d, should be %d\n", + siblings, cpu, smp_num_siblings); + smp_num_siblings = siblings; + } + if (c->x86_num_cores > 1) { + for_each_online_cpu(i) { + if (phys_proc_id[cpu] == phys_proc_id[i]) + cpu_set(i, cpu_core_map[cpu]); + } + } else + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } } /* - * Cycle through the processors sending APIC IPIs to boot each. + * Cleanup possible dangling ends... */ - -static void __init smp_boot_cpus(unsigned int max_cpus) +static __cpuinit void smp_cleanup_boot(void) { - unsigned apicid, cpu, bit, kicked; + /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ + CMOS_WRITE(0, 0xf); - nmi_watchdog_default(); + /* + * Reset trampoline flag + */ + *((volatile int *) phys_to_virt(0x467)) = 0; +#ifndef CONFIG_HOTPLUG_CPU /* - * Setup boot CPU information + * Free pages reserved for SMP bootup. 
+ * When you add hotplug CPU support later remove this + * Note there is more work to be done for later CPU bootup. */ - smp_store_cpu_info(0); /* Final full version of the data */ - printk(KERN_INFO "CPU%d: ", 0); - print_cpu_info(&cpu_data[0]); - current_thread_info()->cpu = 0; - smp_tune_scheduling(); + free_page((unsigned long) __va(PAGE_SIZE)); + free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); +#endif +} + +/* + * Fall back to non SMP mode after errors. + * + * RED-PEN audit/test this more. I bet there is more state messed up here. + */ +static __cpuinit void disable_smp(void) +{ + cpu_present_map = cpumask_of_cpu(0); + cpu_possible_map = cpumask_of_cpu(0); + if (smp_found_config) + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); + else + phys_cpu_present_map = physid_mask_of_physid(0); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); +} + +/* + * Handle user cpus=... parameter. + */ +static __cpuinit void enforce_max_cpus(unsigned max_cpus) +{ + int i, k; + k = 0; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + if (++k > max_cpus) { + cpu_clear(i, cpu_possible_map); + cpu_clear(i, cpu_present_map); + } + } +} +/* + * Various sanity checks. + */ +static int __cpuinit smp_sanity_check(unsigned max_cpus) +{ if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); @@ -737,13 +869,11 @@ static void __init smp_boot_cpus(unsigned int max_cpus) */ if (!smp_found_config) { printk(KERN_NOTICE "SMP motherboard not detected.\n"); - io_apic_irqs = 0; - cpu_online_map = cpumask_of_cpu(0); - phys_cpu_present_map = physid_mask_of_physid(0); + disable_smp(); if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." 
" Using dummy APIC emulation.\n"); - goto smp_done; + return -1; } /* @@ -763,196 +893,143 @@ static void __init smp_boot_cpus(unsigned int max_cpus) printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_id); printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); - io_apic_irqs = 0; - cpu_online_map = cpumask_of_cpu(0); - phys_cpu_present_map = physid_mask_of_physid(0); - disable_apic = 1; - goto smp_done; + nr_ioapics = 0; + return -1; } - verify_local_APIC(); - /* * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - smp_found_config = 0; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); - io_apic_irqs = 0; - cpu_online_map = cpumask_of_cpu(0); - phys_cpu_present_map = physid_mask_of_physid(0); - disable_apic = 1; - goto smp_done; + nr_ioapics = 0; + return -1; } - connect_bsp_APIC(); - setup_local_APIC(); - - if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) - BUG(); - - x86_cpu_to_apicid[0] = boot_cpu_id; - - /* - * Now scan the CPU present map and fire up the other CPUs. - */ - Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); + return 0; +} - kicked = 1; - for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { - apicid = cpu_present_to_apicid(bit); - /* - * Don't even attempt to start the boot CPU! - */ - if (apicid == boot_cpu_id || (apicid == BAD_APICID)) - continue; +/* + * Prepare for SMP bootup. The MP table or ACPI has been read + * earlier. Just do some sanity checking here and enable APIC mode. + */ +void __cpuinit smp_prepare_cpus(unsigned int max_cpus) +{ + int i; - if (!physid_isset(apicid, phys_cpu_present_map)) - continue; - if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) - continue; + nmi_watchdog_default(); + current_cpu_data = boot_cpu_data; + current_thread_info()->cpu = 0; /* needed? */ - do_boot_cpu(apicid); - ++kicked; - } + enforce_max_cpus(max_cpus); /* - * Cleanup possible dangling ends... 
+ * Fill in cpu_present_mask */ - { - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - CMOS_WRITE(0, 0xf); - - *((volatile int *) phys_to_virt(0x467)) = 0; + for (i = 0; i < NR_CPUS; i++) { + int apicid = cpu_present_to_apicid(i); + if (physid_isset(apicid, phys_cpu_present_map)) { + cpu_set(i, cpu_present_map); + /* possible map would be different if we supported real + CPU hotplug. */ + cpu_set(i, cpu_possible_map); + } } - /* - * Allow the user to impress friends. - */ - - Dprintk("Before bogomips.\n"); - if (!cpucount) { - printk(KERN_INFO "Only one processor found.\n"); - } else { - unsigned long bogosum = 0; - for (cpu = 0; cpu < NR_CPUS; cpu++) - if (cpu_isset(cpu, cpu_callout_map)) - bogosum += cpu_data[cpu].loops_per_jiffy; - printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", - cpucount+1, - bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); - Dprintk("Before bogocount - setting activated=1.\n"); + if (smp_sanity_check(max_cpus) < 0) { + printk(KERN_INFO "SMP disabled\n"); + disable_smp(); + return; } + /* - * Construct cpu_sibling_map[], so that we can tell the - * sibling CPU efficiently. + * Switch from PIC to APIC mode. 
*/ - for (cpu = 0; cpu < NR_CPUS; cpu++) - cpus_clear(cpu_sibling_map[cpu]); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - int siblings = 0; - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; - - if (smp_num_siblings > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } + connect_bsp_APIC(); + setup_local_APIC(); - if (siblings != smp_num_siblings) { - printk(KERN_WARNING - "WARNING: %d siblings found for CPU%d, should be %d\n", - siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } + if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { + panic("Boot APIC ID in local APIC unexpected (%d vs %d)", + GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); + /* Or can we switch back to PIC here? */ } - Dprintk("Boot done.\n"); - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: + * Now start the IO-APICs */ if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); else nr_ioapics = 0; - setup_boot_APIC_clock(); - /* - * Synchronize the TSC with the AP + * Set up local APIC timer on boot CPU. */ - if (cpu_has_tsc && cpucount) - synchronize_tsc_bp(); - smp_done: - time_init_smp(); + setup_boot_APIC_clock(); } -/* These are wrappers to interface to the new boot process. Someone - who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ -void __init smp_prepare_cpus(unsigned int max_cpus) +/* + * Early setup to make printk work. + */ +void __init smp_prepare_boot_cpu(void) { - smp_boot_cpus(max_cpus); + int me = smp_processor_id(); + cpu_set(me, cpu_online_map); + cpu_set(me, cpu_callout_map); } -void __devinit smp_prepare_boot_cpu(void) +/* + * Entry point to boot a CPU. + * + * This is all __cpuinit, not __devinit for now because we don't support + * CPU hotplug (yet). 
+ */ +int __cpuinit __cpu_up(unsigned int cpu) { - cpu_set(smp_processor_id(), cpu_online_map); - cpu_set(smp_processor_id(), cpu_callout_map); -} + int err; + int apicid = cpu_present_to_apicid(cpu); -int __devinit __cpu_up(unsigned int cpu) -{ - /* This only works at boot for x86. See "rewrite" above. */ - if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); - return -ENOSYS; + WARN_ON(irqs_disabled()); + + Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); + + if (apicid == BAD_APICID || apicid == boot_cpu_id || + !physid_isset(apicid, phys_cpu_present_map)) { + printk("__cpu_up: bad cpu %d\n", cpu); + return -EINVAL; } - /* In case one didn't come up */ - if (!cpu_isset(cpu, cpu_callin_map)) { - local_irq_enable(); - return -EIO; + /* Boot it! */ + err = do_boot_cpu(cpu, apicid); + if (err < 0) { + Dprintk("do_boot_cpu failed %d\n", err); + return err; } - local_irq_enable(); /* Unleash the CPU! */ Dprintk("waiting for cpu %d\n", cpu); - cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) - mb(); + cpu_relax(); return 0; } -void __init smp_cpus_done(unsigned int max_cpus) +/* + * Finish the SMP boot. 
+ */ +void __cpuinit smp_cpus_done(unsigned int max_cpus) { + zap_low_mappings(); + smp_cleanup_boot(); + #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - zap_low_mappings(); -} + detect_siblings(); + time_init_gtod(); + + check_nmi_watchdog(); +} diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c index f13565ee9b..10ae319338 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c @@ -28,6 +28,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/nmi.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -119,95 +120,106 @@ int printk_address(unsigned long address) } #endif -unsigned long *in_exception_stack(int cpu, unsigned long stack) -{ - int k; +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, const char **idp) +{ + static const char ids[N_EXCEPTION_STACKS][8] = { + [DEBUG_STACK - 1] = "#DB", + [NMI_STACK - 1] = "NMI", + [DOUBLEFAULT_STACK - 1] = "#DF", + [STACKFAULT_STACK - 1] = "#SS", + [MCE_STACK - 1] = "#MC", + }; + unsigned k; + for (k = 0; k < N_EXCEPTION_STACKS; k++) { - struct tss_struct *tss = &per_cpu(init_tss, cpu); - unsigned long end = tss->ist[k] + EXCEPTION_STKSZ; + unsigned long end; - if (stack >= tss->ist[k] && stack <= end) + end = per_cpu(init_tss, cpu).ist[k]; + if (stack >= end) + continue; + if (stack >= end - EXCEPTION_STKSZ) { + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; + *idp = ids[k]; return (unsigned long *)end; + } } return NULL; -} +} /* * x86-64 can have upto three kernel stacks: * process stack * interrupt stack - * severe exception (double fault, nmi, stack fault) hardware stack - * Check and process them in order. 
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ void show_trace(unsigned long *stack) { unsigned long addr; - unsigned long *irqstack, *irqstack_end, *estack_end; - const int cpu = safe_smp_processor_id(); + const unsigned cpu = safe_smp_processor_id(); + unsigned long *irqstack_end = (unsigned long *)cpu_pda[cpu].irqstackptr; int i; + unsigned used = 0; printk("\nCall Trace:"); - i = 0; - - estack_end = in_exception_stack(cpu, (unsigned long)stack); - if (estack_end) { - while (stack < estack_end) { - addr = *stack++; - if (__kernel_text_address(addr)) { - i += printk_address(addr); - i += printk(" "); - if (i > 50) { - printk("\n"); - i = 0; - } - } + +#define HANDLE_STACK(cond) \ + do while (cond) { \ + addr = *stack++; \ + if (kernel_text_address(addr)) { \ + /* \ + * If the address is either in the text segment of the \ + * kernel, or in the region which contains vmalloc'ed \ + * memory, it *may* be the address of a calling \ + * routine; if so, print it so that someone tracing \ + * down the cause of the crash will be able to figure \ + * out the call path that was taken. 
\ + */ \ + i += printk_address(addr); \ + if (i > 50) { \ + printk("\n "); \ + i = 0; \ + } \ + else \ + i += printk(" "); \ + } \ + } while (0) + + for(i = 0; ; ) { + const char *id; + unsigned long *estack_end; + estack_end = in_exception_stack(cpu, (unsigned long)stack, + &used, &id); + + if (estack_end) { + i += printk(" <%s> ", id); + HANDLE_STACK (stack < estack_end); + i += printk(" <EOE> "); + stack = (unsigned long *) estack_end[-2]; + continue; } - i += printk(" <EOE> "); - i += 7; - stack = (unsigned long *) estack_end[-2]; - } - - irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); - irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64); - - if (stack >= irqstack && stack < irqstack_end) { - printk("<IRQ> "); - while (stack < irqstack_end) { - addr = *stack++; - /* - * If the address is either in the text segment of the - * kernel, or in the region which contains vmalloc'ed - * memory, it *may* be the address of a calling - * routine; if so, print it so that someone tracing - * down the cause of the crash will be able to figure - * out the call path that was taken. 
- */ - if (__kernel_text_address(addr)) { - i += printk_address(addr); - i += printk(" "); - if (i > 50) { - printk("\n "); - i = 0; - } + if (irqstack_end) { + unsigned long *irqstack; + irqstack = irqstack_end - + (IRQSTACKSIZE - 64) / sizeof(*irqstack); + + if (stack >= irqstack && stack < irqstack_end) { + i += printk(" <IRQ> "); + HANDLE_STACK (stack < irqstack_end); + stack = (unsigned long *) (irqstack_end[-1]); + irqstack_end = NULL; + i += printk(" <EOI> "); + continue; } - } - stack = (unsigned long *) (irqstack_end[-1]); - printk(" <EOI> "); - i += 7; - } - - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack++; - if (__kernel_text_address(addr)) { - i += printk_address(addr); - i += printk(" "); - if (i > 50) { - printk("\n "); - i = 0; - } } + break; } + + HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); +#undef HANDLE_STACK printk("\n"); } @@ -243,6 +255,7 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) if (i && ((i % 4) == 0)) printk("\n "); printk("%016lx ", *stack++); + touch_nmi_watchdog(); } show_trace((unsigned long *)rsp); } @@ -319,10 +332,12 @@ void handle_BUG(struct pt_regs *regs) printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); } +#ifdef CONFIG_BUG void out_of_line_bug(void) { BUG(); } +#endif static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; @@ -348,7 +363,6 @@ void oops_end(void) die_owner = -1; bust_spinlocks(0); spin_unlock(&die_lock); - local_irq_enable(); /* make sure back scroll still works */ if (panic_on_oops) panic("Oops"); } @@ -489,24 +503,8 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) DO_ERROR(18, SIGSEGV, "reserved", reserved) - -#define DO_ERROR_STACK(trapnr, signr, str, name) \ -asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \ -{ \ - struct pt_regs *pr = ((struct pt_regs 
*)(current->thread.rsp0))-1; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return regs; \ - if (regs->cs & 3) { \ - memcpy(pr, regs, sizeof(struct pt_regs)); \ - regs = pr; \ - } \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ - return regs; \ -} - -DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR( 8, SIGSEGV, "double fault", double_fault) asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) { @@ -585,6 +583,8 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) printk("Do you have a strange power saving mode enabled?\n"); } +/* Runs on IST stack. This code must keep interrupts off all the time. + Nested NMIs are prevented by the CPU. */ asmlinkage void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -619,15 +619,6 @@ asmlinkage void default_do_nmi(struct pt_regs *regs) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); - - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered. - */ - outb(0x8f, 0x70); - inb(0x71); /* dummy */ - outb(0x0f, 0x70); - inb(0x71); /* dummy */ } asmlinkage void do_int3(struct pt_regs * regs, long error_code) @@ -639,20 +630,34 @@ asmlinkage void do_int3(struct pt_regs * regs, long error_code) return; } +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. The actual stack switch is done in + entry.S */ +asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->rsp) + ; + /* Exception from user space */ + else if (eregs->cs & 3) + regs = ((struct pt_regs *)current->thread.rsp0) - 1; + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. 
*/ + else if (eregs->eflags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} + /* runs on IST stack. */ -asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) +asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) { - struct pt_regs *pr; unsigned long condition; struct task_struct *tsk = current; siginfo_t info; - pr = (struct pt_regs *)(current->thread.rsp0)-1; - if (regs->cs & 3) { - memcpy(pr, regs, sizeof(struct pt_regs)); - regs = pr; - } - #ifdef CONFIG_CHECKING { /* RED-PEN interaction with debugger - could destroy gs */ @@ -669,9 +674,9 @@ asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) asm("movq %%db6,%0" : "=r" (condition)); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) { - return regs; - } + SIGTRAP) == NOTIFY_STOP) + return; + conditional_sti(regs); /* Mask out spurious debug traps due to lazy DR7 setting */ @@ -684,9 +689,7 @@ asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) tsk->thread.debugreg6 = condition; /* Mask out spurious TF errors due to lazy TF clearing */ - if ((condition & DR_STEP) && - (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition, - 1, SIGTRAP) != NOTIFY_STOP)) { + if (condition & DR_STEP) { /* * The TF error should be masked out only if the current * process is not traced and if the TRAP flag has been set @@ -698,8 +701,14 @@ asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) */ if ((regs->cs & 3) == 0) goto clear_TF_reenable; - if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) - goto clear_TF; + /* + * Was the TF flag set by a debugger? If so, clear it now, + * so that register information is correct. 
+ */ + if (tsk->ptrace & PT_DTRACE) { + regs->eflags &= ~TF_MASK; + tsk->ptrace &= ~PT_DTRACE; + } } /* Ok, finally something we can handle */ @@ -715,18 +724,11 @@ asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) force_sig_info(SIGTRAP, &info, tsk); clear_dr7: asm volatile("movq %0,%%db7"::"r"(0UL)); - notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP); - return regs; + return; clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - -clear_TF: - /* RED-PEN could cause spurious errors */ - if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP) - != NOTIFY_STOP) regs->eflags &= ~TF_MASK; - return regs; } static int kernel_math_error(struct pt_regs *regs, char *str) @@ -738,14 +740,8 @@ static int kernel_math_error(struct pt_regs *regs, char *str) return 1; } notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE); -#if 0 - /* This should be a die, but warn only for now */ + /* Illegal floating point operation in the kernel */ die(str, regs, 0); -#else - printk(KERN_DEBUG "%s: %s at ", current->comm, str); - printk_address(regs->rip); - printk("\n"); -#endif return 0; } @@ -829,7 +825,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) conditional_sti(regs); if ((regs->cs & 3) == 0 && - kernel_math_error(regs, "simd math error")) + kernel_math_error(regs, "kernel simd math error")) return; /* diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c index f980cdefff..737ffeb6d0 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c @@ -9,30 +9,14 @@ * a different vsyscall implementation for Linux/IA32 and for the name. * * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located - * at virtual address -10Mbyte+1024bytes etc... There are at max 8192 + * at virtual address -10Mbyte+1024bytes etc... There are at max 4 * vsyscalls. 
One vsyscall can reserve more than 1 slot to avoid - * jumping out of line if necessary. + * jumping out of line if necessary. We cannot add more with this + * mechanism because older kernels won't return -ENOSYS. + * If we want more than four we need a vDSO. * - * Note: the concept clashes with user mode linux. If you use UML just - * set the kernel.vsyscall sysctl to 0. - */ - -/* - * TODO 2001-03-20: - * - * 1) make page fault handler detect faults on page1-page-last of the vsyscall - * virtual space, and make it increase %rip and write -ENOSYS in %rax (so - * we'll be able to upgrade to a new glibc without upgrading kernel after - * we add more vsyscalls. - * 2) Possibly we need a fixmap table for the vsyscalls too if we want - * to avoid SIGSEGV and we want to return -EFAULT from the vsyscalls as well. - * Can we segfault inside a "syscall"? We can fix this anytime and those fixes - * won't be visible for userspace. Not fixing this is a noop for correct programs, - * broken programs will segfault and there's no security risk until we choose to - * fix it. - * - * These are not urgent things that we need to address only before shipping the first - * production binary kernels. + * Note: the concept clashes with user mode linux. If you use UML and + * want per guest time just set the kernel.vsyscall64 sysctl to 0. 
*/ #include <linux/time.h> @@ -41,6 +25,7 @@ #include <linux/timer.h> #include <linux/seqlock.h> #include <linux/jiffies.h> +#include <linux/sysctl.h> #include <asm/vsyscall.h> #include <asm/pgtable.h> @@ -62,8 +47,7 @@ static force_inline void timeval_normalize(struct timeval * tv) time_t __sec; __sec = tv->tv_usec / 1000000; - if (__sec) - { + if (__sec) { tv->tv_usec %= 1000000; tv->tv_sec += __sec; } @@ -81,13 +65,14 @@ static force_inline void do_vgettimeofday(struct timeval * tv) usec = (__xtime.tv_nsec / 1000) + (__jiffies - __wall_jiffies) * (1000000 / HZ); - if (__vxtime.mode == VXTIME_TSC) { + if (__vxtime.mode != VXTIME_HPET) { sync_core(); rdtscll(t); - if (t < __vxtime.last_tsc) t = __vxtime.last_tsc; + if (t < __vxtime.last_tsc) + t = __vxtime.last_tsc; usec += ((t - __vxtime.last_tsc) * __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ + /* See comment in x86_64 do_gettimeofday. */ } else { usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - __vxtime.last) * __vxtime.quot) >> 32; @@ -101,14 +86,13 @@ static force_inline void do_vgettimeofday(struct timeval * tv) /* RED-PEN may want to readd seq locking, but then the variable should be write-once. 
*/ static force_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __sys_tz; } - static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("syscall" + asm volatile("vsysc2: syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); return ret; @@ -117,7 +101,7 @@ static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) static force_inline long time_syscall(long *t) { long secs; - asm volatile("syscall" + asm volatile("vsysc1: syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -126,7 +110,7 @@ static force_inline long time_syscall(long *t) static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { if (unlikely(!__sysctl_vsyscall)) - return gettimeofday(tv,tz); + return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); if (tz) @@ -153,9 +137,71 @@ static long __vsyscall(2) venosys_0(void) static long __vsyscall(3) venosys_1(void) { return -ENOSYS; +} + +#ifdef CONFIG_SYSCTL + +#define SYSCALL 0x050f +#define NOP2 0x9090 +/* + * NOP out syscall in vsyscall page when not needed. + */ +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + extern u16 vsysc1, vsysc2; + u16 *map1, *map2; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + if (!write) + return ret; + /* gcc has some trouble with __va(__pa()), so just do it this + way. 
*/ + map1 = ioremap(__pa_symbol(&vsysc1), 2); + if (!map1) + return -ENOMEM; + map2 = ioremap(__pa_symbol(&vsysc2), 2); + if (!map2) { + ret = -ENOMEM; + goto out; + } + if (!sysctl_vsyscall) { + *map1 = SYSCALL; + *map2 = SYSCALL; + } else { + *map1 = NOP2; + *map2 = NOP2; + } + iounmap(map2); +out: + iounmap(map1); + return ret; } +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + return -ENOSYS; +} + +static ctl_table kernel_table2[] = { + { .ctl_name = 99, .procname = "vsyscall64", + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .strategy = vsyscall_sysctl_nostrat, + .proc_handler = vsyscall_sysctl_change }, + { 0, } +}; + +static ctl_table kernel_root_table2[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + .child = kernel_table2 }, + { 0 }, +}; + +#endif + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -176,14 +222,15 @@ static void __init map_vsyscall_user(void) static int __init vsyscall_init(void) { - BUG_ON(((unsigned long) &vgettimeofday != - VSYSCALL_ADDR(__NR_vgettimeofday))); + BUG_ON(((unsigned long) &vgettimeofday != + VSYSCALL_ADDR(__NR_vgettimeofday))); BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); map_vsyscall_user(); /* establish tranlation for user address space */ - sysctl_vsyscall = 0; /* TBD */ - +#ifdef CONFIG_SYSCTL + register_sysctl_table(kernel_root_table2, 0); +#endif return 0; } diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c index b8df766c18..442cc247e9 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c @@ -105,7 +105,6 @@ EXPORT_SYMBOL(pci_free_consistent); #endif #ifdef CONFIG_PCI 
-EXPORT_SYMBOL(pcibios_penalize_isa_irq); EXPORT_SYMBOL(pci_mem_start); #endif @@ -142,35 +141,23 @@ EXPORT_SYMBOL(rtc_lock); #undef memmove #undef memchr #undef strlen -#undef strcpy #undef strncmp #undef strncpy #undef strchr -#undef strcmp -#undef strcpy -#undef strcat -#undef memcmp extern void * memset(void *,int,__kernel_size_t); extern size_t strlen(const char *); extern void * memmove(void * dest,const void *src,size_t count); -extern char * strcpy(char * dest,const char *src); -extern int strcmp(const char * cs,const char * ct); extern void *memchr(const void *s, int c, size_t n); extern void * memcpy(void *,const void *,__kernel_size_t); extern void * __memcpy(void *,const void *,__kernel_size_t); -extern char * strcat(char *, const char *); -extern int memcmp(const void * cs,const void * ct,size_t count); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strncpy); EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strcmp); -EXPORT_SYMBOL(strcat); EXPORT_SYMBOL(strncat); EXPORT_SYMBOL(memchr); EXPORT_SYMBOL(strrchr); @@ -178,7 +165,6 @@ EXPORT_SYMBOL(strnlen); EXPORT_SYMBOL(memscan); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); -EXPORT_SYMBOL(memcmp); #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM /* prototypes are wrong, these are assembly with custom calling functions */ @@ -209,8 +195,9 @@ EXPORT_SYMBOL(smp_num_siblings); extern void do_softirq_thunk(void); EXPORT_SYMBOL(do_softirq_thunk); -void out_of_line_bug(void); +#ifdef CONFIG_BUG EXPORT_SYMBOL(out_of_line_bug); +#endif EXPORT_SYMBOL(init_level4_pgt); @@ -219,7 +206,6 @@ EXPORT_SYMBOL(__supported_pte_mask); #ifdef CONFIG_SMP EXPORT_SYMBOL(flush_tlb_page); -EXPORT_SYMBOL_GPL(flush_tlb_all); #endif EXPORT_SYMBOL(cpu_khz); diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c index 8b42292232..874b3afede 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c +++ 
b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c @@ -65,21 +65,19 @@ void bust_spinlocks(int yes) static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned long error_code) { - unsigned char *instr = (unsigned char *)(regs->rip); + unsigned char *instr; int scan_more = 1; int prefetch = 0; - unsigned char *max_instr = instr + 15; + unsigned char *max_instr; /* If it was a exec fault ignore */ if (error_code & (1<<4)) return 0; - /* Code segments in LDT could have a non zero base. Don't check - when that's possible */ - if (regs->cs & (1<<2)) - return 0; + instr = (unsigned char *)convert_rip_to_linear(current, regs); + max_instr = instr + 15; - if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE) + if ((regs->cs & 3) != 0 && instr >= (unsigned char *)TASK_SIZE) return 0; while (scan_more && instr < max_instr) { @@ -238,6 +236,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, /* * Handle a fault on the vmalloc or module mapping area + * + * This assumes no large pages in there. */ static int vmalloc_fault(unsigned long address) { @@ -276,7 +276,10 @@ static int vmalloc_fault(unsigned long address) if (!pte_present(*pte_ref)) return -1; pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); __flush_tlb_all(); return 0; @@ -361,7 +364,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, * protection error (error_code & 1) == 0. 
*/ if (unlikely(address >= TASK_SIZE)) { - if (!(error_code & 5)) { + if (!(error_code & 5) && + ((address >= VMALLOC_START && address < VMALLOC_END) || + (address >= MODULES_VADDR && address < MODULES_END))) { if (vmalloc_fault(address) < 0) goto bad_area_nosemaphore; return; @@ -471,17 +476,6 @@ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: - -#ifdef CONFIG_IA32_EMULATION - /* 32bit vsyscall. map on demand. */ - if (test_thread_flag(TIF_IA32) && - address >= VSYSCALL32_BASE && address < VSYSCALL32_END) { - if (map_syscall32(mm, address) < 0) - goto out_of_memory2; - return; - } -#endif - /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { if (is_prefetch(regs, address, error_code)) @@ -563,7 +557,6 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); -out_of_memory2: if (current->pid == 1) { yield(); goto again; diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c index 0e85f182b2..0efcf264fa 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c @@ -919,9 +919,9 @@ static __init int x8664_sysctl_init(void) __initcall(x8664_sysctl_init); #endif -/* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two - different ones: one for 32bit and one for 64bit. Use the appropiate - for the target task. */ +/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only + covers the 64bit vsyscall page now. 32bit has a real VMA now and does + not need special handling anymore. 
*/ static struct vm_area_struct gate_vma = { .vm_start = VSYSCALL_START, @@ -929,22 +929,11 @@ static struct vm_area_struct gate_vma = { .vm_page_prot = PAGE_READONLY }; -static struct vm_area_struct gate32_vma = { - .vm_start = VSYSCALL32_BASE, - .vm_end = VSYSCALL32_END, - .vm_page_prot = PAGE_READONLY -}; - struct vm_area_struct *get_gate_vma(struct task_struct *tsk) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - /* lookup code assumes the pages are present. set them up - now */ - if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0) - return NULL; - return &gate32_vma; - } + if (test_tsk_thread_flag(tsk, TIF_IA32)) + return NULL; #endif return &gate_vma; } @@ -952,6 +941,8 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(task); + if (!vma) + return 0; return (addr >= vma->vm_start) && (addr < vma->vm_end); } @@ -961,6 +952,5 @@ int in_gate_area(struct task_struct *task, unsigned long addr) */ int in_gate_area_no_task(unsigned long addr) { - return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) || - ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END))); + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } diff --git a/linux-2.6-xen-sparse/drivers/Makefile b/linux-2.6-xen-sparse/drivers/Makefile index 5fab89854a..786cbad645 100644 --- a/linux-2.6-xen-sparse/drivers/Makefile +++ b/linux-2.6-xen-sparse/drivers/Makefile @@ -48,8 +48,8 @@ obj-$(CONFIG_PARIDE) += block/paride/ obj-$(CONFIG_TC) += tc/ obj-$(CONFIG_USB) += usb/ obj-$(CONFIG_USB_GADGET) += usb/gadget/ -obj-$(CONFIG_INPUT) += input/ obj-$(CONFIG_GAMEPORT) += input/gameport/ +obj-$(CONFIG_INPUT) += input/ obj-$(CONFIG_I2O) += message/ obj-$(CONFIG_I2C) += i2c/ obj-$(CONFIG_W1) += w1/ @@ -62,5 +62,6 @@ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_MMC) += mmc/ obj-$(CONFIG_INFINIBAND) += infiniband/ 
+obj-$(CONFIG_BLK_DEV_SGIIOC4) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ diff --git a/linux-2.6-xen-sparse/drivers/char/mem.c b/linux-2.6-xen-sparse/drivers/char/mem.c index 8eae836f01..96726fad53 100644 --- a/linux-2.6-xen-sparse/drivers/char/mem.c +++ b/linux-2.6-xen-sparse/drivers/char/mem.c @@ -23,6 +23,7 @@ #include <linux/devfs_fs_kernel.h> #include <linux/ptrace.h> #include <linux/device.h> +#include <linux/backing-dev.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -76,14 +77,6 @@ static inline int uncached_access(struct file *file, unsigned long addr) * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases. */ return !(efi_mem_attributes(addr) & EFI_MEMORY_WB); -#elif defined(CONFIG_PPC64) - /* On PPC64, we always do non-cacheable access to the IO hole and - * cacheable elsewhere. Cache paradox can checkstop the CPU and - * the high_memory heuristic below is wrong on machines with memory - * above the IO hole... Ah, and of course, XFree86 doesn't pass - * O_SYNC when mapping us to tap IO space. Surprised ? - */ - return !page_is_ram(addr >> PAGE_SHIFT); #else /* * Accessing memory above the top the kernel knows about or through a file pointer @@ -111,38 +104,6 @@ static inline int valid_phys_addr_range(unsigned long addr, size_t *count) } #endif -static ssize_t do_write_mem(void *p, unsigned long realp, - const char __user * buf, size_t count, loff_t *ppos) -{ - ssize_t written; - unsigned long copied; - - written = 0; -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) - /* we don't have page 0 mapped on sparc and m68k.. */ - if (realp < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-realp; - if (sz > count) sz = count; - /* Hmm. Do something? 
*/ - buf+=sz; - p+=sz; - count-=sz; - written+=sz; - } -#endif - copied = copy_from_user(p, buf, count); - if (copied) { - ssize_t ret = written + (count - copied); - - if (ret) - return ret; - return -EFAULT; - } - written += count; - *ppos += written; - return written; -} - #ifndef ARCH_HAS_DEV_MEM /* * This funcion reads the *physical* memory. The f_pos points directly to the @@ -152,15 +113,16 @@ static ssize_t read_mem(struct file * file, char __user * buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; - ssize_t read; + ssize_t read, sz; + char *ptr; if (!valid_phys_addr_range(p, &count)) return -EFAULT; read = 0; -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) +#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED /* we don't have page 0 mapped on sparc and m68k.. */ if (p < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-p; + sz = PAGE_SIZE - p; if (sz > count) sz = count; if (sz > 0) { @@ -173,9 +135,33 @@ static ssize_t read_mem(struct file * file, char __user * buf, } } #endif - if (copy_to_user(buf, __va(p), count)) - return -EFAULT; - read += count; + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + ptr = xlate_dev_mem_ptr(p); + + if (copy_to_user(buf, ptr, sz)) + return -EFAULT; + buf += sz; + p += sz; + count -= sz; + read += sz; + } + *ppos += read; return read; } @@ -184,16 +170,76 @@ static ssize_t write_mem(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; + ssize_t written, sz; + unsigned long copied; + void *ptr; if (!valid_phys_addr_range(p, &count)) return -EFAULT; - return do_write_mem(__va(p), p, buf, count, ppos); + + written = 0; + +#ifdef 
__ARCH_HAS_NO_PAGE_ZERO_MAPPED + /* we don't have page 0 mapped on sparc and m68k.. */ + if (p < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE - p; + if (sz > count) + sz = count; + /* Hmm. Do something? */ + buf += sz; + p += sz; + count -= sz; + written += sz; + } +#endif + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + ptr = xlate_dev_mem_ptr(p); + + copied = copy_from_user(ptr, buf, sz); + if (copied) { + ssize_t ret; + + ret = written + (sz - copied); + if (ret) + return ret; + return -EFAULT; + } + buf += sz; + p += sz; + count -= sz; + written += sz; + } + + *ppos += written; + return written; } #endif static int mmap_kmem(struct file * file, struct vm_area_struct * vma) { -#ifdef pgprot_noncached +#if defined(__HAVE_PHYS_MEM_ACCESS_PROT) + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + + vma->vm_page_prot = phys_mem_access_prot(file, offset, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +#elif defined(pgprot_noncached) unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; int uncached; @@ -212,6 +258,25 @@ static int mmap_kmem(struct file * file, struct vm_area_struct * vma) return 0; } +#if 0 +static int mmap_kmem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long long val; + /* + * RED-PEN: on some architectures there is more mapped memory + * than available in mem_map which pfn_valid checks + * for. Perhaps should add a new macro here. + * + * RED-PEN: vmalloc is not supported right now. 
+ */ + if (!pfn_valid(vma->vm_pgoff)) + return -EIO; + val = (u64)vma->vm_pgoff << PAGE_SHIFT; + vma->vm_pgoff = __pa(val) >> PAGE_SHIFT; + return mmap_mem(file, vma); +} +#endif + extern long vread(char *buf, char *addr, unsigned long count); extern long vwrite(char *buf, char *addr, unsigned long count); @@ -222,33 +287,55 @@ static ssize_t read_kmem(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; - ssize_t read = 0; - ssize_t virtr = 0; + ssize_t low_count, read, sz; char * kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ - + + read = 0; if (p < (unsigned long) high_memory) { - read = count; + low_count = count; if (count > (unsigned long) high_memory - p) - read = (unsigned long) high_memory - p; + low_count = (unsigned long) high_memory - p; -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) +#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && read > 0) { + if (p < PAGE_SIZE && low_count > 0) { size_t tmp = PAGE_SIZE - p; - if (tmp > read) tmp = read; + if (tmp > low_count) tmp = low_count; if (clear_user(buf, tmp)) return -EFAULT; buf += tmp; p += tmp; - read -= tmp; + read += tmp; + low_count -= tmp; count -= tmp; } #endif - if (copy_to_user(buf, (char *)p, read)) - return -EFAULT; - p += read; - buf += read; - count -= read; + while (low_count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, low_count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + kbuf = xlate_dev_kmem_ptr((char *)p); + + if (copy_to_user(buf, kbuf, sz)) + return -EFAULT; + buf += sz; + p += sz; + read += sz; + low_count -= sz; + count -= sz; + } } if (count > 0) { @@ -269,15 +356,79 @@ static ssize_t 
read_kmem(struct file *file, char __user *buf, } count -= len; buf += len; - virtr += len; + read += len; p += len; } free_page((unsigned long)kbuf); } *ppos = p; - return virtr + read; + return read; } + +static inline ssize_t +do_write_kmem(void *p, unsigned long realp, const char __user * buf, + size_t count, loff_t *ppos) +{ + ssize_t written, sz; + unsigned long copied; + + written = 0; +#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED + /* we don't have page 0 mapped on sparc and m68k.. */ + if (realp < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE - realp; + if (sz > count) + sz = count; + /* Hmm. Do something? */ + buf += sz; + p += sz; + realp += sz; + count -= sz; + written += sz; + } +#endif + + while (count > 0) { + char *ptr; + /* + * Handle first page in case it's not aligned + */ + if (-realp & (PAGE_SIZE - 1)) + sz = -realp & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + ptr = xlate_dev_kmem_ptr(p); + + copied = copy_from_user(ptr, buf, sz); + if (copied) { + ssize_t ret; + + ret = written + (sz - copied); + if (ret) + return ret; + return -EFAULT; + } + buf += sz; + p += sz; + realp += sz; + count -= sz; + written += sz; + } + + *ppos += written; + return written; +} + + /* * This function writes to the *virtual* memory as seen by the kernel. 
*/ @@ -296,7 +447,7 @@ static ssize_t write_kmem(struct file * file, const char __user * buf, if (count > (unsigned long) high_memory - p) wrote = (unsigned long) high_memory - p; - written = do_write_mem((void*)p, p, buf, wrote, ppos); + written = do_write_kmem((void*)p, p, buf, wrote, ppos); if (written != wrote) return written; wrote = written; @@ -344,7 +495,7 @@ static ssize_t read_port(struct file * file, char __user * buf, unsigned long i = *ppos; char __user *tmp = buf; - if (verify_area(VERIFY_WRITE,buf,count)) + if (!access_ok(VERIFY_WRITE, buf, count)) return -EFAULT; while (count-- > 0 && i < 65536) { if (__put_user(inb(i),tmp) < 0) @@ -362,7 +513,7 @@ static ssize_t write_port(struct file * file, const char __user * buf, unsigned long i = *ppos; const char __user * tmp = buf; - if (verify_area(VERIFY_READ,buf,count)) + if (!access_ok(VERIFY_READ,buf,count)) return -EFAULT; while (count-- > 0 && i < 65536) { char c; @@ -568,7 +719,6 @@ static int open_port(struct inode * inode, struct file * filp) return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; } -#define mmap_mem mmap_kmem #define zero_lseek null_lseek #define full_lseek null_lseek #define write_zero write_null @@ -581,7 +731,7 @@ static struct file_operations mem_fops = { .llseek = memory_lseek, .read = read_mem, .write = write_mem, - .mmap = mmap_mem, + .mmap = mmap_kmem, .open = open_mem, }; #else @@ -618,6 +768,10 @@ static struct file_operations zero_fops = { .mmap = mmap_zero, }; +static struct backing_dev_info zero_bdi = { + .capabilities = BDI_CAP_MAP_COPY, +}; + static struct file_operations full_fops = { .llseek = full_lseek, .read = read_full, @@ -664,6 +818,7 @@ static int memory_open(struct inode * inode, struct file * filp) break; #endif case 5: + filp->f_mapping->backing_dev_info = &zero_bdi; filp->f_op = &zero_fops; break; case 7: diff --git a/linux-2.6-xen-sparse/drivers/char/tty_io.c b/linux-2.6-xen-sparse/drivers/char/tty_io.c index a8d33b5288..69e42bdcb4 100644 --- a/linux-2.6-xen-sparse/drivers/char/tty_io.c +++ b/linux-2.6-xen-sparse/drivers/char/tty_io.c @@ -187,7 +187,7 @@ char *tty_name(struct tty_struct *tty, char *buf) EXPORT_SYMBOL(tty_name); -inline int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, +int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK @@ -1791,7 +1791,6 @@ retry_open: } #ifdef CONFIG_VT if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) { - extern int fg_console; extern struct tty_driver *console_driver; driver = console_driver; index = fg_console; @@ -2018,11 +2017,10 @@ static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, return 0; #ifdef CONFIG_VT if (tty->driver->type == TTY_DRIVER_TYPE_CONSOLE) { - unsigned int currcons = tty->index; int rc; acquire_console_sem(); - rc = vc_resize(currcons, tmp_ws.ws_col, tmp_ws.ws_row); + rc = vc_resize(tty->driver_data, tmp_ws.ws_col, tmp_ws.ws_row); release_console_sem(); if (rc) return -ENXIO; @@ -2634,6 +2632,7 @@ static void 
initialize_tty_struct(struct tty_struct *tty) tty->magic = TTY_MAGIC; tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); tty->pgrp = -1; + tty->overrun_time = jiffies; tty->flip.char_buf_ptr = tty->flip.char_buf; tty->flip.flag_buf_ptr = tty->flip.flag_buf; INIT_WORK(&tty->flip.work, flush_to_ldisc, tty); diff --git a/linux-2.6-xen-sparse/include/asm-generic/pgtable.h b/linux-2.6-xen-sparse/include/asm-generic/pgtable.h index 950f9466a6..e1a95778eb 100644 --- a/linux-2.6-xen-sparse/include/asm-generic/pgtable.h +++ b/linux-2.6-xen-sparse/include/asm-generic/pgtable.h @@ -16,7 +16,7 @@ #ifndef __HAVE_ARCH_SET_PTE_ATOMIC #define ptep_establish(__vma, __address, __ptep, __entry) \ do { \ - set_pte(__ptep, __entry); \ + set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ flush_tlb_page(__vma, __address); \ } while (0) #else /* __HAVE_ARCH_SET_PTE_ATOMIC */ @@ -37,7 +37,7 @@ do { \ */ #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ do { \ - set_pte(__ptep, __entry); \ + set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ flush_tlb_page(__vma, __address); \ } while (0) #endif @@ -53,20 +53,24 @@ do { \ #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_young(pte)) - return 0; - set_pte(ptep, pte_mkold(pte)); - return 1; -} +#define ptep_test_and_clear_young(__vma, __address, __ptep) \ +({ \ + pte_t __pte = *(__ptep); \ + int r = 1; \ + if (!pte_young(__pte)) \ + r = 0; \ + else \ + set_pte_at((__vma)->vm_mm, (__address), \ + (__ptep), pte_mkold(__pte)); \ + r; \ +}) #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young(__vma, __address, __ptep) \ ({ \ - int __young = ptep_test_and_clear_young(__ptep); \ + int __young; \ + __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ if (__young) \ flush_tlb_page(__vma, __address); \ __young; \ @@ -74,20 +78,24 @@ static inline int ptep_test_and_clear_young(pte_t 
*ptep) #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY -static inline int ptep_test_and_clear_dirty(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_dirty(pte)) - return 0; - set_pte(ptep, pte_mkclean(pte)); - return 1; -} +#define ptep_test_and_clear_dirty(__vma, __address, __ptep) \ +({ \ + pte_t __pte = *__ptep; \ + int r = 1; \ + if (!pte_dirty(__pte)) \ + r = 0; \ + else \ + set_pte_at((__vma)->vm_mm, (__address), (__ptep), \ + pte_mkclean(__pte)); \ + r; \ +}) #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH #define ptep_clear_flush_dirty(__vma, __address, __ptep) \ ({ \ - int __dirty = ptep_test_and_clear_dirty(__ptep); \ + int __dirty; \ + __dirty = ptep_test_and_clear_dirty(__vma, __address, __ptep); \ if (__dirty) \ flush_tlb_page(__vma, __address); \ __dirty; \ @@ -95,36 +103,29 @@ static inline int ptep_test_and_clear_dirty(pte_t *ptep) #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(pte_t *ptep) -{ - pte_t pte = *ptep; - pte_clear(ptep); - return pte; -} +#define ptep_get_and_clear(__mm, __address, __ptep) \ +({ \ + pte_t __pte = *(__ptep); \ + pte_clear((__mm), (__address), (__ptep)); \ + __pte; \ +}) #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH #define ptep_clear_flush(__vma, __address, __ptep) \ ({ \ - pte_t __pte = ptep_get_and_clear(__ptep); \ + pte_t __pte; \ + __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ flush_tlb_page(__vma, __address); \ __pte; \ }) #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(pte_t *ptep) +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = *ptep; - set_pte(ptep, pte_wrprotect(old_pte)); -} -#endif - -#ifndef __HAVE_ARCH_PTEP_MKDIRTY -static inline void ptep_mkdirty(pte_t *ptep) -{ - pte_t old_pte = *ptep; - set_pte(ptep, pte_mkdirty(old_pte)); + set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif @@ -144,4 +145,77 @@ static inline void 
ptep_mkdirty(pte_t *ptep) #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif +#ifndef __HAVE_ARCH_LAZY_MMU_PROT_UPDATE +#define lazy_mmu_prot_update(pte) do { } while (0) +#endif + +/* + * When walking page tables, get the address of the next boundary, + * or the end address of the range if that comes earlier. Although no + * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. + */ + +#define pgd_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ + (__boundary - 1 < (end) - 1)? __boundary: (end); \ +}) + +#ifndef pud_addr_end +#define pud_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ + (__boundary - 1 < (end) - 1)? __boundary: (end); \ +}) +#endif + +#ifndef pmd_addr_end +#define pmd_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ + (__boundary - 1 < (end) - 1)? __boundary: (end); \ +}) +#endif + +#ifndef __ASSEMBLY__ +/* + * When walking page tables, we usually want to skip any p?d_none entries; + * and any p?d_bad entries - reporting the error before resetting to none. + * Do the tests inline, but report and clear the bad entry in mm/memory.c. 
+ */ +void pgd_clear_bad(pgd_t *); +void pud_clear_bad(pud_t *); +void pmd_clear_bad(pmd_t *); + +static inline int pgd_none_or_clear_bad(pgd_t *pgd) +{ + if (pgd_none(*pgd)) + return 1; + if (unlikely(pgd_bad(*pgd))) { + pgd_clear_bad(pgd); + return 1; + } + return 0; +} + +static inline int pud_none_or_clear_bad(pud_t *pud) +{ + if (pud_none(*pud)) + return 1; + if (unlikely(pud_bad(*pud))) { + pud_clear_bad(pud); + return 1; + } + return 0; +} + +static inline int pmd_none_or_clear_bad(pmd_t *pmd) +{ + if (pmd_none(*pmd)) + return 1; + if (unlikely(pmd_bad(*pmd))) { + pmd_clear_bad(pmd); + return 1; + } + return 0; +} +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_GENERIC_PGTABLE_H */ diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h index 85f022109c..e2e13a9579 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h @@ -4,6 +4,8 @@ #include <asm/ldt.h> #include <asm/segment.h> +#define CPU_16BIT_STACK_SIZE 1024 + #ifndef __ASSEMBLY__ #include <linux/preempt.h> @@ -13,6 +15,8 @@ extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); + struct Xgt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h index 43b4f5780b..41ac456d12 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h @@ -11,7 +11,7 @@ #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); + dma_addr_t *dma_handle, unsigned int __nocast flag); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); 
diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/highmem.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/highmem.h index e3e4a531d2..ddf86feacf 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/highmem.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/highmem.h @@ -33,8 +33,6 @@ extern pte_t *kmap_pte; extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; -extern void kmap_init(void); - /* * Right now we initialize only a single pte table. It can be extended * easily, subsequent pte tables have to be allocated in one physical diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/io.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/io.h index 2fa9f47ccc..f3e03cd0a9 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/io.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/io.h @@ -50,6 +50,17 @@ #include <linux/vmalloc.h> #include <asm/fixmap.h> +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +#define xlate_dev_mem_ptr(p) __va(p) + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + /** * virt_to_phys - map virtual addresses to physical * @address: address to remap diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h index f46144e37f..f54bb7bdd7 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h @@ -34,10 +34,10 @@ static inline void __prepare_arch_switch(void) * are always kernel segments while inside the kernel. Must * happen before reload of cr3/ldt (i.e., not in __switch_to). 
*/ - __asm__ __volatile__ ( "movl %%fs,%0 ; movl %%gs,%1" + __asm__ __volatile__ ( "mov %%fs,%0 ; mov %%gs,%1" : "=m" (*(int *)&current->thread.fs), "=m" (*(int *)&current->thread.gs)); - __asm__ __volatile__ ( "movl %0,%%fs ; movl %0,%%gs" + __asm__ __volatile__ ( "mov %0,%%fs ; mov %0,%%gs" : : "r" (0) ); } @@ -100,7 +100,7 @@ static inline void switch_mm(struct mm_struct *prev, } #define deactivate_mm(tsk, mm) \ - asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) + asm("mov %0,%%fs ; mov %0,%%gs": :"r" (0)) #define activate_mm(prev, next) \ switch_mm((prev),(next),NULL) diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h index 404da2640b..2c30b449f1 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h @@ -2,7 +2,6 @@ #define _I386_PGALLOC_H #include <linux/config.h> -#include <asm/processor.h> #include <asm/fixmap.h> #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h index 9eddbd8012..91f1354048 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h @@ -14,6 +14,7 @@ * hook is made available. 
*/ #define set_pte(pteptr, pteval) (*(pteptr) = pteval) +#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) #ifndef CONFIG_XEN_SHADOW_MODE @@ -22,7 +23,7 @@ #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) #endif -#define ptep_get_and_clear(xp) __pte_ma(xchg(&(xp)->pte_low, 0)) +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) /* * We detect special mappings in one of two ways: diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h index f611f04781..4890d7a479 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h @@ -61,7 +61,7 @@ void paging_init(void); #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 +#define FIRST_USER_ADDRESS 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) @@ -194,15 +194,15 @@ extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are - * done without a 'verify_area(VERIFY_WRITE,..)' + * done without a 'access_ok(VERIFY_WRITE,..)' */ -#undef TEST_VERIFY_AREA +#undef TEST_ACCESS_OK /* The boot page tables (all created as a single array) */ extern unsigned long pg0[]; #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pmd_none(x) (!pmd_val(x)) /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. 
@@ -246,32 +246,26 @@ static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return p # include <asm/pgtable-2level.h> #endif -static inline int ptep_test_and_clear_dirty(pte_t *ptep) +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { if (!pte_dirty(*ptep)) return 0; return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } -static inline int ptep_test_and_clear_young(pte_t *ptep) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { if (!pte_young(*ptep)) return 0; return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } -static inline void ptep_set_wrprotect(pte_t *ptep) +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { if (pte_write(*ptep)) clear_bit(_PAGE_BIT_RW, &ptep->pte_low); } -static inline void ptep_mkdirty(pte_t *ptep) -{ - if (!pte_dirty(*ptep)) - set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); -} - /* * Macro to mark a page protection value as "uncacheable". On processors which do not support * it, this is a no-op. 
@@ -483,11 +477,14 @@ direct_remap_area_pages(vma->vm_mm,from,phys,size,prot,DOMID_IO) #define io_remap_pfn_range(vma,from,pfn,size,prot) \ direct_remap_area_pages(vma->vm_mm,from,pfn<<PAGE_SHIFT,size,prot,DOMID_IO) +#define MK_IOSPACE_PFN(space, pfn) (pfn) +#define GET_IOSPACE(pfn) 0 +#define GET_PFN(pfn) (pfn) + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define __HAVE_ARCH_PTEP_SET_WRPROTECT -#define __HAVE_ARCH_PTEP_MKDIRTY #define __HAVE_ARCH_PTE_SAME #include <asm-generic/pgtable.h> diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/processor.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/processor.h index fd54b409e2..604b6db1a2 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/processor.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/processor.h @@ -99,12 +99,12 @@ extern struct cpuinfo_x86 cpu_data[]; #endif extern int phys_proc_id[NR_CPUS]; +extern int cpu_core_id[NR_CPUS]; extern char ignore_fpu_irq; extern void identify_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern void dodgy_tsc(void); #ifdef CONFIG_X86_HT extern void detect_ht(struct cpuinfo_x86 *c); @@ -138,7 +138,7 @@ static inline void detect_ht(struct cpuinfo_x86 *c) {} * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx * resulting in stale register contents being returned. 
*/ -static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { __asm__("cpuid" : "=a" (*eax), @@ -148,6 +148,18 @@ static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) : "0" (op), "c"(0)); } +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, + int *edx) +{ + __asm__("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op), "c" (count)); +} + /* * CPUID functions returning a single datum */ @@ -501,6 +513,13 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa regs->esp = new_esp; \ } while (0) +/* + * This special macro can be used to load a debugging register + */ +#define loaddebug(thread,register) \ + HYPERVISOR_set_debugreg((register), \ + ((thread)->debugreg[register])) + /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/segment.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/segment.h index 5496d69023..95edad964c 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/segment.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/segment.h @@ -38,7 +38,7 @@ * 24 - APM BIOS support * 25 - APM BIOS support * - * 26 - unused + * 26 - ESPFIX small SS * 27 - unused * 28 - unused * 29 - unused @@ -71,6 +71,9 @@ #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/setup.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/setup.h index 03a3a64183..0a0073ae71 100644 --- 
a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/setup.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/setup.h @@ -16,7 +16,7 @@ #define MAXMEM_PFN PFN_DOWN(MAXMEM) #define MAX_NONPAE_PFN (1 << 20) -#define PARAM_SIZE 2048 +#define PARAM_SIZE 4096 #define COMMAND_LINE_SIZE 256 #define OLD_CL_MAGIC_ADDR 0x90020 diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h index fcf9959526..5d8ab26fa4 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h @@ -84,7 +84,7 @@ static inline unsigned long _get_base(char * addr) #define loadsegment(seg,value) \ asm volatile("\n" \ "1:\t" \ - "movl %0,%%" #seg "\n" \ + "mov %0,%%" #seg "\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3:\t" \ @@ -96,13 +96,13 @@ static inline unsigned long _get_base(char * addr) ".align 4\n\t" \ ".long 1b,3b\n" \ ".previous" \ - : :"m" (*(unsigned int *)&(value))) + : :"m" (value)) /* * Save a segment register away */ #define savesegment(seg, value) \ - asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value))) + asm volatile("mov %%" #seg ",%0":"=m" (value)) /* * Clear and set 'TS' bit respectively @@ -574,4 +574,6 @@ void enable_hlt(void); extern int es7000_plat; void cpu_idle_wait(void); +extern unsigned long arch_align_stack(unsigned long sp); + #endif diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/bootsetup.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/bootsetup.h index 731d8678ab..e36521ab35 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/bootsetup.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/bootsetup.h @@ -2,7 +2,8 @@ #ifndef _X86_64_BOOTSETUP_H #define _X86_64_BOOTSETUP_H 1 -extern char x86_boot_params[2048]; +#define BOOT_PARAM_SIZE 4096 +extern char x86_boot_params[BOOT_PARAM_SIZE]; /* * This is set up by the setup-routine at boot-time diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/io.h 
b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/io.h index 90466f397b..0d6c53a95c 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/io.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/io.h @@ -36,11 +36,7 @@ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ -#ifdef SLOW_IO_BY_JUMPING -#define __SLOW_DOWN_IO "\njmp 1f\n1:\tjmp 1f\n1:" -#else #define __SLOW_DOWN_IO "\noutb %%al,$0x80" -#endif #ifdef REALLY_SLOW_IO #define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO @@ -360,6 +356,17 @@ out: extern int iommu_bio_merge; #define BIO_VMERGE_BOUNDARY iommu_bio_merge +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +#define xlate_dev_mem_ptr(p) __va(p) + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + #endif /* __KERNEL__ */ #endif diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h index 8acd7990dc..f1a45ec1a9 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h @@ -145,10 +145,6 @@ static inline pgd_t __pgd(unsigned long x) #define __pgprot(x) ((pgprot_t) { (x) } ) -extern unsigned long vm_stack_flags, vm_stack_flags32; -extern unsigned long vm_data_default_flags, vm_data_default_flags32; -extern unsigned long vm_force_exec32; - #define __START_KERNEL 0xffffffff80100000UL #define __START_KERNEL_map 0xffffffff80000000UL #define __PAGE_OFFSET 0xffff880000000000UL diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h index 325d700c3b..d6dad2dcce 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h @@ -1,7 +1,6 @@ #ifndef _X86_64_PGALLOC_H #define _X86_64_PGALLOC_H -#include <asm/processor.h> #include 
<asm/fixmap.h> #include <asm/pda.h> #include <linux/threads.h> @@ -163,6 +162,8 @@ extern __inline__ void pte_free_kernel(pte_t *pte) extern void pte_free(struct page *pte); //#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +//#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) +//#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) #define __pte_free_tlb(tlb,x) pte_free((x)) #define __pmd_free_tlb(tlb,x) pmd_free((x)) diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h index 9745edc23b..25a884c08a 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h @@ -132,7 +132,7 @@ extern inline void pgd_clear (pgd_t * pgd) * each domain will have separate page tables, with their own versions of * accessed & dirty state. */ -static inline pte_t ptep_get_and_clear(pte_t *xp) +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) { pte_t pte = *xp; if (pte.pte) @@ -150,7 +150,7 @@ static inline pte_t ptep_get_and_clear(pte_t *xp) #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 +#define FIRST_USER_ADDRESS 0 #ifndef __ASSEMBLY__ #define MAXMEM 0x3fffffffffffUL @@ -262,10 +262,11 @@ static inline unsigned long pud_bad(pud_t pud) val &= ~(_PAGE_USER | _PAGE_DIRTY); return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); } +#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) #define pte_none(x) (!(x).pte) #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) @@ -339,7 +340,9 @@ extern inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; retu 
extern inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } extern inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } -static inline int ptep_test_and_clear_dirty(pte_t *ptep) +struct vm_area_struct; + +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; int ret = pte_dirty(pte); @@ -348,7 +351,7 @@ static inline int ptep_test_and_clear_dirty(pte_t *ptep) return ret; } -static inline int ptep_test_and_clear_young(pte_t *ptep) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; int ret = pte_young(pte); @@ -357,18 +360,12 @@ static inline int ptep_test_and_clear_young(pte_t *ptep) return ret; } -static inline void ptep_set_wrprotect(pte_t *ptep) +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; if (pte_write(pte)) set_pte(ptep, pte_wrprotect(pte)); } -static inline void ptep_mkdirty(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_dirty(pte)) - xen_l1_entry_update(ptep, pte_mkdirty(pte).pte); -} /* * Macro to mark a page protection value as "uncacheable". 
@@ -517,6 +514,13 @@ int __direct_remap_area_pages(struct mm_struct *mm, #define io_remap_page_range(vma, vaddr, paddr, size, prot) \ remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot) +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + +#define MK_IOSPACE_PFN(space, pfn) (pfn) +#define GET_IOSPACE(pfn) 0 +#define GET_PFN(pfn) (pfn) + #define HAVE_ARCH_UNMAPPED_AREA #define pgtable_cache_init() do { } while (0) @@ -534,7 +538,6 @@ int __direct_remap_area_pages(struct mm_struct *mm, #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define __HAVE_ARCH_PTEP_SET_WRPROTECT -#define __HAVE_ARCH_PTEP_MKDIRTY #define __HAVE_ARCH_PTE_SAME #include <asm-generic/pgtable.h> diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h index e4a683206f..5c3e70a12a 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h @@ -62,9 +62,8 @@ struct cpuinfo_x86 { int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/ __u8 x86_virt_bits, x86_phys_bits; __u8 x86_num_cores; - __u8 x86_apicid; __u32 x86_power; - __u32 x86_cpuid_level; /* Max CPUID function supported */ + __u32 extended_cpuid_level; /* Max extended CPUID function supported */ unsigned long loops_per_jiffy; } ____cacheline_aligned; @@ -92,7 +91,6 @@ extern char ignore_irq13; extern void identify_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern void dodgy_tsc(void); /* * EFLAGS bits @@ -169,9 +167,9 @@ static inline void set_in_cr4 (unsigned long mask) /* - * User space process size. 47bits. + * User space process size. 47bits minus one guard page. 
*/ -#define TASK_SIZE (0x800000000000UL) +#define TASK_SIZE (0x800000000000UL - 4096) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/ptrace.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/ptrace.h index 2af8edd82f..e1c0ea6d0b 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/ptrace.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/ptrace.h @@ -91,6 +91,11 @@ extern unsigned long profile_pc(struct pt_regs *regs); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +struct task_struct; + +extern unsigned long +convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs); + enum { EF_CF = 0x00000001, EF_PF = 0x00000004, diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/segment.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/segment.h index db5926a999..e00247dc81 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/segment.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/segment.h @@ -24,10 +24,9 @@ #define GDT_ENTRY_TLS 1 #define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 -#define GDT_ENTRY_TLS_MIN 11 -#define GDT_ENTRY_TLS_MAX 13 -/* 14 free */ +#define GDT_ENTRY_LDT 10 /* needs two entries */ +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 #define GDT_ENTRY_KERNELCS16 15 #define GDT_ENTRY_TLS_ENTRIES 3 diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/smp.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/smp.h index 82b5cc2b9f..fd0e5ea9a2 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/smp.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/smp.h @@ -31,12 +31,16 @@ extern int disable_apic; struct pt_regs; +extern cpumask_t cpu_present_mask; +extern cpumask_t cpu_possible_map; +extern cpumask_t cpu_online_map; +extern cpumask_t cpu_callout_map; + /* * Private routines/data */ extern void smp_alloc_memory(void); 
-extern cpumask_t cpu_online_map; extern volatile unsigned long smp_invalidate_needed; extern int pic_mode; extern int smp_num_siblings; @@ -44,11 +48,12 @@ extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_send_reschedule(int cpu); extern void smp_invalidate_rcv(void); /* Process an NMI */ -extern void (*mtrr_hook) (void); extern void zap_low_mappings(void); void smp_stop_cpu(void); extern cpumask_t cpu_sibling_map[NR_CPUS]; +extern cpumask_t cpu_core_map[NR_CPUS]; extern u8 phys_proc_id[NR_CPUS]; +extern u8 cpu_core_id[NR_CPUS]; #define SMP_TRAMPOLINE_BASE 0x6000 @@ -58,10 +63,6 @@ extern u8 phys_proc_id[NR_CPUS]; * compresses data structures. */ -extern cpumask_t cpu_callout_map; -extern cpumask_t cpu_callin_map; -#define cpu_possible_map cpu_callout_map - static inline int num_booting_cpus(void) { return cpus_weight(cpu_callout_map); @@ -77,7 +78,7 @@ extern __inline int hard_smp_processor_id(void) } #endif -#define safe_smp_processor_id() (disable_apic ? 0 : x86_apicid_to_cpu(hard_smp_processor_id())) +extern int safe_smp_processor_id(void); #endif /* !ASSEMBLY */ @@ -99,22 +100,6 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) return cpus_addr(cpumask)[0]; } -static inline int x86_apicid_to_cpu(u8 apicid) -{ - int i; - - for (i = 0; i < NR_CPUS; ++i) - if (x86_cpu_to_apicid[i] == apicid) - return i; - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. 
*/ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return -1; -} - static inline int cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < NR_CPUS) diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/system.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/system.h index 17a8fec17c..b0ad9c2030 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/system.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/system.h @@ -400,4 +400,6 @@ void enable_hlt(void); #define HAVE_EAT_KEY void eat_key(void); +extern unsigned long arch_align_stack(unsigned long sp); + #endif diff --git a/linux-2.6-xen-sparse/include/linux/gfp.h b/linux-2.6-xen-sparse/include/linux/gfp.h index 42fe579514..f9d3d5eb99 100644 --- a/linux-2.6-xen-sparse/include/linux/gfp.h +++ b/linux-2.6-xen-sparse/include/linux/gfp.h @@ -26,26 +26,28 @@ struct vm_area_struct; * * __GFP_NORETRY: The VM implementation must not retry indefinitely. */ -#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ -#define __GFP_HIGH 0x20 /* Should access emergency pools? */ -#define __GFP_IO 0x40 /* Can start physical IO? */ -#define __GFP_FS 0x80 /* Can call down to low-level FS? */ -#define __GFP_COLD 0x100 /* Cache-cold page required */ -#define __GFP_NOWARN 0x200 /* Suppress page allocation failure warning */ -#define __GFP_REPEAT 0x400 /* Retry the allocation. Might fail */ -#define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ -#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ -#define __GFP_NO_GROW 0x2000 /* Slab internal usage */ -#define __GFP_COMP 0x4000 /* Add compound page metadata */ -#define __GFP_ZERO 0x8000 /* Return zeroed page on success */ - -#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ +#define __GFP_WAIT 0x10u /* Can wait and reschedule? */ +#define __GFP_HIGH 0x20u /* Should access emergency pools? */ +#define __GFP_IO 0x40u /* Can start physical IO? */ +#define __GFP_FS 0x80u /* Can call down to low-level FS? 
*/ +#define __GFP_COLD 0x100u /* Cache-cold page required */ +#define __GFP_NOWARN 0x200u /* Suppress page allocation failure warning */ +#define __GFP_REPEAT 0x400u /* Retry the allocation. Might fail */ +#define __GFP_NOFAIL 0x800u /* Retry for ever. Cannot fail */ +#define __GFP_NORETRY 0x1000u /* Do not retry. Might fail */ +#define __GFP_NO_GROW 0x2000u /* Slab internal usage */ +#define __GFP_COMP 0x4000u /* Add compound page metadata */ +#define __GFP_ZERO 0x8000u /* Return zeroed page on success */ +#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */ + +#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) /* if you forget to add the bitmask here kernel will crash, period */ #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ - __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP) + __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ + __GFP_NOMEMALLOC) #define GFP_ATOMIC (__GFP_HIGH) #define GFP_NOIO (__GFP_WAIT) @@ -86,7 +88,7 @@ struct vm_area_struct; extern struct page * FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); -static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, +static inline struct page *alloc_pages_node(int nid, unsigned int __nocast gfp_mask, unsigned int order) { if (unlikely(order >= MAX_ORDER)) @@ -97,17 +99,17 @@ static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, } #ifdef CONFIG_NUMA -extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order); +extern struct page *alloc_pages_current(unsigned int __nocast gfp_mask, unsigned order); static inline struct page * -alloc_pages(unsigned int gfp_mask, unsigned int order) +alloc_pages(unsigned int __nocast gfp_mask, unsigned int order) { if (unlikely(order >= MAX_ORDER)) return NULL; return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(unsigned 
gfp_mask, +extern struct page *alloc_page_vma(unsigned __nocast gfp_mask, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ @@ -116,8 +118,8 @@ extern struct page *alloc_page_vma(unsigned gfp_mask, #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); -extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +extern unsigned long FASTCALL(__get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)); +extern unsigned long FASTCALL(get_zeroed_page(unsigned int __nocast gfp_mask)); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff --git a/linux-2.6-xen-sparse/include/linux/mm.h b/linux-2.6-xen-sparse/include/linux/mm.h index b797aaba2b..087fda00fe 100644 --- a/linux-2.6-xen-sparse/include/linux/mm.h +++ b/linux-2.6-xen-sparse/include/linux/mm.h @@ -37,10 +37,6 @@ extern int sysctl_legacy_va_layout; #include <asm/processor.h> #include <asm/atomic.h> -#ifndef MM_VM_SIZE -#define MM_VM_SIZE(mm) ((TASK_SIZE + PGDIR_SIZE - 1) & PGDIR_MASK) -#endif - #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* @@ -164,7 +160,8 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ -#define VM_FOREIGN 0x01000000 /* Has pages belonging to another VM */ +#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#define VM_FOREIGN 0x02000000 /* Has pages belonging to another VM */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -582,17 +579,19 @@ struct zap_details { pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ spinlock_t *i_mmap_lock; /* For 
unmap_mapping_range: */ - unsigned long break_addr; /* Where unmap_vmas stopped */ unsigned long truncate_count; /* Compare vm_truncate_count */ }; -void zap_page_range(struct vm_area_struct *vma, unsigned long address, +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); -void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end); +void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, + unsigned long end, unsigned long floor, unsigned long ceiling); +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, @@ -639,9 +638,9 @@ extern unsigned long do_mremap(unsigned long addr, * These functions are passed a count `nr_to_scan' and a gfpmask. They should * scan `nr_to_scan' objects, attempting to free them. * - * The callback must the number of objects which remain in the cache. + * The callback must return the number of objects which remain in the cache. * - * The callback will be passes nr_to_scan == 0 when the VM is querying the + * The callback will be passed nr_to_scan == 0 when the VM is querying the * cache size, so a fastpath for that case is appropriate. 
*/ typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); @@ -728,6 +727,7 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff); extern void exit_mmap(struct mm_struct *); +extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); @@ -843,7 +843,7 @@ static inline void vm_stat_unaccount(struct vm_area_struct *vma) } /* update per process rss and vm hiwater data */ -extern void update_mem_hiwater(void); +extern void update_mem_hiwater(struct task_struct *tsk); #ifndef CONFIG_DEBUG_PAGEALLOC static inline void @@ -861,5 +861,8 @@ int in_gate_area_no_task(unsigned long addr); #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ +/* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */ +#define OOM_DISABLE -17 + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/linux-2.6-xen-sparse/include/linux/skbuff.h b/linux-2.6-xen-sparse/include/linux/skbuff.h index dad5d9bc1b..d10a3c5dc0 100644 --- a/linux-2.6-xen-sparse/include/linux/skbuff.h +++ b/linux-2.6-xen-sparse/include/linux/skbuff.h @@ -83,12 +83,6 @@ * Any questions? No questions, good. --ANK */ -#ifdef __i386__ -#define NET_CALLER(arg) (*(((void **)&arg) - 1)) -#else -#define NET_CALLER(arg) __builtin_return_address(0) -#endif - struct net_device; #ifdef CONFIG_NETFILTER @@ -146,6 +140,20 @@ struct skb_shared_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; +/* We divide dataref into two halves. The higher 16 bits hold references + * to the payload part of skb->data. The lower 16 bits hold references to + * the entire skb->data. It is up to the users of the skb to agree on + * where the payload starts. 
+ * + * All users must obey the rule that the skb->data reference count must be + * greater than or equal to the payload reference count. + * + * Holding a reference to the payload part means that the user does not + * care about modifications to the header part of skb->data. + */ +#define SKB_DATAREF_SHIFT 16 +#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) + /** * struct sk_buff - socket buffer * @next: Next buffer in list @@ -159,14 +167,16 @@ struct skb_shared_info { * @h: Transport layer header * @nh: Network layer header * @mac: Link layer header - * @dst: FIXME: Describe this field + * @dst: destination entry + * @sp: the security path, used for xfrm * @cb: Control buffer. Free for use by every layer. Put private vars here * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @csum: Checksum - * @__unused: Dead field, may be reused + * @local_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) + * @nohdr: Payload reference only, must not modify header * @proto_csum_valid: Protocol csum validated since arriving at localhost * @proto_csum_blank: Protocol csum must be added before leaving localhost * @pkt_type: Packet class @@ -189,6 +199,8 @@ struct skb_shared_info { * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @private: Data which is private to the HIPPI implementation * @tc_index: Traffic control index + * @tc_verd: traffic control verdict + * @tc_classid: traffic control classid */ struct sk_buff { @@ -241,6 +253,7 @@ struct sk_buff { csum; unsigned char local_df, cloned:1, + nohdr:1, proto_csum_valid:1, proto_csum_blank:1, pkt_type, @@ -374,7 +387,42 @@ static inline void kfree_skb(struct sk_buff *skb) */ static inline int skb_cloned(const struct sk_buff *skb) { - return skb->cloned && atomic_read(&skb_shinfo(skb)->dataref) != 1; + return skb->cloned && + (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; +} + +/** + * 
skb_header_cloned - is the header a clone + * @skb: buffer to check + * + * Returns true if modifying the header part of the buffer requires + * the data to be copied. + */ +static inline int skb_header_cloned(const struct sk_buff *skb) +{ + int dataref; + + if (!skb->cloned) + return 0; + + dataref = atomic_read(&skb_shinfo(skb)->dataref); + dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT); + return dataref != 1; +} + +/** + * skb_header_release - release reference to header + * @skb: buffer to operate on + * + * Drop a reference to the header part of the buffer. This is done + * by acquiring a payload reference. You must not read from the header + * part of skb->data after this. + */ +static inline void skb_header_release(struct sk_buff *skb) +{ + BUG_ON(skb->nohdr); + skb->nohdr = 1; + atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); } /** @@ -925,6 +973,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB /** * __dev_alloc_skb - allocate an skbuff for sending * @length: length to allocate @@ -937,7 +986,6 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) * * %NULL is returned in there is no free memory. */ -#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB static inline struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask) { @@ -1058,6 +1106,42 @@ static inline int skb_linearize(struct sk_buff *skb, int gfp) return __skb_linearize(skb, gfp); } +/** + * skb_postpull_rcsum - update checksum for received skb after pull + * @skb: buffer to update + * @start: start of data before pull + * @len: length of data pulled + * + * After doing a pull on a received packet, you need to call this to + * update the CHECKSUM_HW checksum, or set ip_summed to CHECKSUM_NONE + * so that it can be recomputed from scratch. 
+ */ + +static inline void skb_postpull_rcsum(struct sk_buff *skb, + const void *start, int len) +{ + if (skb->ip_summed == CHECKSUM_HW) + skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); +} + +/** + * pskb_trim_rcsum - trim received skb and update checksum + * @skb: buffer to trim + * @len: new length + * + * This is exactly the same as pskb_trim except that it ensures the + * checksum of received packets are still valid after the operation. + */ + +static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (len >= skb->len) + return 0; + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + return __pskb_trim(skb, len); +} + static inline void *kmap_skb_frag(const skb_frag_t *frag) { #ifdef CONFIG_HIGHMEM @@ -1098,6 +1182,8 @@ extern unsigned int skb_checksum(const struct sk_buff *skb, int offset, int len, unsigned int csum); extern int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); +extern int skb_store_bits(const struct sk_buff *skb, int offset, + void *from, int len); extern unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum); @@ -1122,22 +1208,6 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, extern void skb_init(void); extern void skb_add_mtu(int mtu); -struct skb_iter { - /* Iteration functions set these */ - unsigned char *data; - unsigned int len; - - /* Private to iteration */ - unsigned int nextfrag; - struct sk_buff *fraglist; -}; - -/* Keep iterating until skb_iter_next returns false. 
*/ -extern void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i); -extern int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i); -/* Call this if aborting loop before !skb_iter_next */ -extern void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i); - #ifdef CONFIG_NETFILTER static inline void nf_conntrack_put(struct nf_conntrack *nfct) { diff --git a/linux-2.6-xen-sparse/mm/highmem.c b/linux-2.6-xen-sparse/mm/highmem.c index 846297fb25..b71abaf94b 100644 --- a/linux-2.6-xen-sparse/mm/highmem.c +++ b/linux-2.6-xen-sparse/mm/highmem.c @@ -30,9 +30,9 @@ static mempool_t *page_pool, *isa_page_pool; -static void *page_pool_alloc(int gfp_mask, void *data) +static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data) { - int gfp = gfp_mask | (int) (long) data; + unsigned int gfp = gfp_mask | (unsigned int) (long) data; return alloc_page(gfp); } @@ -90,7 +90,8 @@ static void flush_all_zero_pkmaps(void) * So no dangers, even with speculative execution. 
*/ page = pte_page(pkmap_page_table[i]); - pte_clear(&pkmap_page_table[i]); + pte_clear(&init_mm, (unsigned long)page_address(page), + &pkmap_page_table[i]); set_page_address(page, NULL); } @@ -138,7 +139,8 @@ start: } } vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + set_pte_at(&init_mm, vaddr, + &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); pkmap_count[last_pkmap_nr] = 1; set_page_address(page, (void *)vaddr); @@ -332,6 +334,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) continue; mempool_free(bvec->bv_page, pool); + dec_page_state(nr_bounce); } bio_endio(bio_orig, bio_orig->bi_size, err); @@ -412,6 +415,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, to->bv_page = mempool_alloc(pool, q->bounce_gfp); to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; + inc_page_state(nr_bounce); if (rw == WRITE) { char *vto, *vfrom; diff --git a/linux-2.6-xen-sparse/mm/memory.c b/linux-2.6-xen-sparse/mm/memory.c index 7a051b1b41..bfd0814d37 100644 --- a/linux-2.6-xen-sparse/mm/memory.c +++ b/linux-2.6-xen-sparse/mm/memory.c @@ -46,7 +46,6 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/rmap.h> -#include <linux/acct.h> #include <linux/module.h> #include <linux/init.h> @@ -84,116 +83,205 @@ EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmalloc_earlyreserve); /* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
*/ -static inline void clear_pmd_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long start, unsigned long end) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) { - struct page *page; - - if (pmd_none(*pmd)) - return; - if (unlikely(pmd_bad(*pmd))) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - if (!((start | end) & ~PMD_MASK)) { - /* Only clear full, aligned ranges */ - page = pmd_page(*pmd); - pmd_clear(pmd); - dec_page_state(nr_page_table_pages); - tlb->mm->nr_ptes--; - pte_free_tlb(tlb, page); - } + struct page *page = pmd_page(*pmd); + pmd_clear(pmd); + pte_free_tlb(tlb, page); + dec_page_state(nr_page_table_pages); + tlb->mm->nr_ptes--; } -static inline void clear_pud_range(struct mmu_gather *tlb, pud_t *pud, unsigned long start, unsigned long end) +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) { - unsigned long addr = start, next; - pmd_t *pmd, *__pmd; + pmd_t *pmd; + unsigned long next; + unsigned long start; - if (pud_none(*pud)) - return; - if (unlikely(pud_bad(*pud))) { - pud_ERROR(*pud); - pud_clear(pud); + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + free_pte_range(tlb, pmd); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; } + if (end - 1 > ceiling - 1) + return; - pmd = __pmd = pmd_offset(pud, start); - do { - next = (addr + PMD_SIZE) & PMD_MASK; - if (next > end || next <= addr) - next = end; - - clear_pmd_range(tlb, pmd, addr, next); - pmd++; - addr = next; - } while (addr && (addr < end)); - - if (!((start | end) & ~PUD_MASK)) { - /* Only clear full, aligned ranges */ - pud_clear(pud); - pmd_free_tlb(tlb, __pmd); - } + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd); } - -static inline 
void clear_pgd_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start, unsigned long end) +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) { - unsigned long addr = start, next; - pud_t *pud, *__pud; + pud_t *pud; + unsigned long next; + unsigned long start; - if (pgd_none(*pgd)) - return; - if (unlikely(pgd_bad(*pgd))) { - pgd_ERROR(*pgd); - pgd_clear(pgd); + start = addr; + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + free_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; } + if (end - 1 > ceiling - 1) + return; - pud = __pud = pud_offset(pgd, start); - do { - next = (addr + PUD_SIZE) & PUD_MASK; - if (next > end || next <= addr) - next = end; - - clear_pud_range(tlb, pud, addr, next); - pud++; - addr = next; - } while (addr && (addr < end)); - - if (!((start | end) & ~PGDIR_MASK)) { - /* Only clear full, aligned ranges */ - pgd_clear(pgd); - pud_free_tlb(tlb, __pud); - } + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud); } /* - * This function clears user-level page tables of a process. + * This function frees user-level page tables of a process. * * Must be called with pagetable lock held. 
*/ -void clear_page_range(struct mmu_gather *tlb, unsigned long start, unsigned long end) +void free_pgd_range(struct mmu_gather **tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) { - unsigned long addr = start, next; - pgd_t * pgd = pgd_offset(tlb->mm, start); - unsigned long i; - - for (i = pgd_index(start); i <= pgd_index(end-1); i++) { - next = (addr + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= addr) - next = end; - - clear_pgd_range(tlb, pgd, addr, next); - pgd++; - addr = next; + pgd_t *pgd; + unsigned long next; + unsigned long start; + + /* + * The next few lines have given us lots of grief... + * + * Why are we testing PMD* at this top level? Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. But what about where end is brought down + * by PMD_SIZE below? no, end can't go down to 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. 
+ */ + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + start = addr; + pgd = pgd_offset((*tlb)->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); + + if (!tlb_is_full_mm(*tlb)) + flush_tlb_pgtables((*tlb)->mm, start, end); } -pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, + unsigned long floor, unsigned long ceiling) +{ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + unsigned long addr = vma->vm_start; + + if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_hugepage_only_range(vma->vm_mm, next->vm_start, + HPAGE_SIZE)) { + vma = next; + next = vma->vm_next; + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next? 
next->vm_start: ceiling); + } + vma = next; + } +} + +pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { if (!pmd_present(*pmd)) { struct page *new; @@ -254,20 +342,7 @@ out: */ static inline void -copy_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t pte) -{ - if (pte_file(pte)) - return; - swap_duplicate(pte_to_swp_entry(pte)); - if (list_empty(&dst_mm->mmlist)) { - spin_lock(&mmlist_lock); - list_add(&dst_mm->mmlist, &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } -} - -static inline void -copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, unsigned long addr) { @@ -275,12 +350,21 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct page *page; unsigned long pfn; - /* pte contains position in swap, so copy. */ - if (!pte_present(pte)) { - copy_swap_pte(dst_mm, src_mm, pte); - set_pte(dst_pte, pte); + /* pte contains position in swap or file, so copy. */ + if (unlikely(!pte_present(pte))) { + if (!pte_file(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + /* make sure dst_mm is on swapoff's mmlist. 
*/ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + list_add(&dst_mm->mmlist, &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + } + set_pte_at(dst_mm, addr, dst_pte, pte); return; } + pfn = pte_pfn(pte); /* the pte points outside of valid memory, the * mapping is assumed to be good, meaningful @@ -292,7 +376,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = pfn_to_page(pfn); if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); + set_pte_at(dst_mm, addr, dst_pte, pte); return; } @@ -301,7 +385,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * in the parent and the child */ if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { - ptep_set_wrprotect(src_pte); + ptep_set_wrprotect(src_mm, addr, src_pte); pte = *src_pte; } @@ -313,172 +397,137 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst_mm->rss++; + inc_mm_counter(dst_mm, rss); if (PageAnon(page)) - dst_mm->anon_rss++; - set_pte(dst_pte, pte); + inc_mm_counter(dst_mm, anon_rss); + set_pte_at(dst_mm, addr, dst_pte, pte); page_dup_rmap(page); } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; - pte_t *s, *d; unsigned long vm_flags = vma->vm_flags; + int progress; - d = dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); +again: + dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); if (!dst_pte) return -ENOMEM; + src_pte = pte_offset_map_nested(src_pmd, addr); + progress = 0; spin_lock(&src_mm->page_table_lock); - s = src_pte = pte_offset_map_nested(src_pmd, addr); - for (; addr < end; addr += PAGE_SIZE, s++, d++) { - if (pte_none(*s)) + do { + /* + * We are holding two locks at this point - either of them + * could 
generate latencies in another task on another CPU. + */ + if (progress >= 32 && (need_resched() || + need_lockbreak(&src_mm->page_table_lock) || + need_lockbreak(&dst_mm->page_table_lock))) + break; + if (pte_none(*src_pte)) { + progress++; continue; - copy_one_pte(dst_mm, src_mm, d, s, vm_flags, addr); - } - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); + } + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); spin_unlock(&src_mm->page_table_lock); + + pte_unmap_nested(src_pte - 1); + pte_unmap(dst_pte - 1); cond_resched_lock(&dst_mm->page_table_lock); + if (addr != end) + goto again; return 0; } -static int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; - int err = 0; unsigned long next; - src_pmd = pmd_offset(src_pud, addr); dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; - - for (; addr < end; addr = next, src_pmd++, dst_pmd++) { - next = (addr + PMD_SIZE) & PMD_MASK; - if (next > end || next <= addr) - next = end; - if (pmd_none(*src_pmd)) - continue; - if (pmd_bad(*src_pmd)) { - pmd_ERROR(*src_pmd); - pmd_clear(src_pmd); + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(src_pmd)) continue; - } - err = copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next); - if (err) - break; - } - return err; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; } -static int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, 
pgd_t *src_pgd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; - int err = 0; unsigned long next; - src_pud = pud_offset(src_pgd, addr); dst_pud = pud_alloc(dst_mm, dst_pgd, addr); if (!dst_pud) return -ENOMEM; - - for (; addr < end; addr = next, src_pud++, dst_pud++) { - next = (addr + PUD_SIZE) & PUD_MASK; - if (next > end || next <= addr) - next = end; - if (pud_none(*src_pud)) - continue; - if (pud_bad(*src_pud)) { - pud_ERROR(*src_pud); - pud_clear(src_pud); + src_pud = pud_offset(src_pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(src_pud)) continue; - } - err = copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next); - if (err) - break; - } - return err; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; } -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *vma) { pgd_t *src_pgd, *dst_pgd; - unsigned long addr, start, end, next; - int err = 0; + unsigned long next; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst, src, vma); - - start = vma->vm_start; - src_pgd = pgd_offset(src, start); - dst_pgd = pgd_offset(dst, start); - - end = vma->vm_end; - addr = start; - while (addr && (addr < end-1)) { - next = (addr + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= addr) - next = end; - if (pgd_none(*src_pgd)) - goto next_pgd; - if (pgd_bad(*src_pgd)) { - pgd_ERROR(*src_pgd); - pgd_clear(src_pgd); - goto next_pgd; - } - err = copy_pud_range(dst, src, dst_pgd, src_pgd, - vma, addr, next); - if (err) - break; + return copy_hugetlb_page_range(dst_mm, src_mm, vma); -next_pgd: - src_pgd++; - dst_pgd++; - addr = next; - } - - return err; + dst_pgd = 
pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + return 0; } -static void zap_pte_range(struct mmu_gather *tlb, - pmd_t *pmd, unsigned long address, - unsigned long size, struct zap_details *details) +static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr, unsigned long end, + struct zap_details *details) { - unsigned long offset; - pte_t *ptep; + pte_t *pte; - if (pmd_none(*pmd)) - return; - if (unlikely(pmd_bad(*pmd))) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - ptep = pte_offset_map(pmd, address); - offset = address & ~PMD_MASK; - if (offset + size > PMD_SIZE) - size = PMD_SIZE - offset; - size &= PAGE_MASK; - if (details && !details->check_mapping && !details->nonlinear_vma) - details = NULL; - for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { - pte_t pte = *ptep; - if (pte_none(pte)) + pte = pte_offset_map(pmd, addr); + do { + pte_t ptent = *pte; + if (pte_none(ptent)) continue; - if (pte_present(pte)) { + if (pte_present(ptent)) { struct page *page = NULL; - unsigned long pfn = pte_pfn(pte); + unsigned long pfn = pte_pfn(ptent); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); if (PageReserved(page)) @@ -502,19 +551,20 @@ static void zap_pte_range(struct mmu_gather *tlb, page->index > details->last_index)) continue; } - pte = ptep_get_and_clear(ptep); - tlb_remove_tlb_entry(tlb, ptep, address+offset); + ptent = ptep_get_and_clear(tlb->mm, addr, pte); + tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - address+offset) != page->index) - set_pte(ptep, pgoff_to_pte(page->index)); - if (pte_dirty(pte)) + addr) != page->index) + set_pte_at(tlb->mm, 
addr, pte, + pgoff_to_pte(page->index)); + if (pte_dirty(ptent)) set_page_dirty(page); if (PageAnon(page)) - tlb->mm->anon_rss--; - else if (pte_young(pte)) + dec_mm_counter(tlb->mm, anon_rss); + else if (pte_young(ptent)) mark_page_accessed(page); tlb->freed++; page_remove_rmap(page); @@ -527,78 +577,64 @@ static void zap_pte_range(struct mmu_gather *tlb, */ if (unlikely(details)) continue; - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); - } - pte_unmap(ptep-1); + if (!pte_file(ptent)) + free_swap_and_cache(pte_to_swp_entry(ptent)); + pte_clear(tlb->mm, addr, pte); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); } -static void zap_pmd_range(struct mmu_gather *tlb, - pud_t *pud, unsigned long address, - unsigned long size, struct zap_details *details) +static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + struct zap_details *details) { - pmd_t * pmd; - unsigned long end; + pmd_t *pmd; + unsigned long next; - if (pud_none(*pud)) - return; - if (unlikely(pud_bad(*pud))) { - pud_ERROR(*pud); - pud_clear(pud); - return; - } - pmd = pmd_offset(pud, address); - end = address + size; - if (end > ((address + PUD_SIZE) & PUD_MASK)) - end = ((address + PUD_SIZE) & PUD_MASK); + pmd = pmd_offset(pud, addr); do { - zap_pte_range(tlb, pmd, address, end - address, details); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + zap_pte_range(tlb, pmd, addr, next, details); + } while (pmd++, addr = next, addr != end); } -static void zap_pud_range(struct mmu_gather *tlb, - pgd_t * pgd, unsigned long address, - unsigned long end, struct zap_details *details) +static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct zap_details *details) { - pud_t * pud; + pud_t *pud; + unsigned long 
next; - if (pgd_none(*pgd)) - return; - if (unlikely(pgd_bad(*pgd))) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pud = pud_offset(pgd, address); + pud = pud_offset(pgd, addr); do { - zap_pmd_range(tlb, pud, address, end - address, details); - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + zap_pmd_range(tlb, pud, addr, next, details); + } while (pud++, addr = next, addr != end); } -static void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long address, - unsigned long end, struct zap_details *details) +static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) { - unsigned long next; pgd_t *pgd; - int i; + unsigned long next; - BUG_ON(address >= end); - pgd = pgd_offset(vma->vm_mm, address); + if (details && !details->check_mapping && !details->nonlinear_vma) + details = NULL; + + BUG_ON(addr >= end); tlb_start_vma(tlb, vma); - for (i = pgd_index(address); i <= pgd_index(end-1); i++) { - next = (address + PGDIR_SIZE) & PGDIR_MASK; - if (next <= address || next > end) - next = end; - zap_pud_range(tlb, pgd, address, next, details); - address = next; - pgd++; - } + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + zap_pud_range(tlb, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } @@ -619,7 +655,7 @@ static void unmap_page_range(struct mmu_gather *tlb, * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * - * Returns the number of vma's which were covered by the unmapping. + * Returns the end address of the unmapping (restart addr if interrupted). * * Unmap all pages in the vma list. 
Called under page_table_lock. * @@ -636,7 +672,7 @@ static void unmap_page_range(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) @@ -644,12 +680,11 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, unsigned long zap_bytes = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; - int ret = 0; + unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; int fullmm = tlb_is_full_mm(*tlbp); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { - unsigned long start; unsigned long end; start = max(vma->vm_start, start_addr); @@ -662,7 +697,6 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; - ret++; while (start != end) { unsigned long block; @@ -693,7 +727,6 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, if (i_mmap_lock) { /* must reset count of rss freed */ *tlbp = tlb_gather_mmu(mm, fullmm); - details->break_addr = start; goto out; } spin_unlock(&mm->page_table_lock); @@ -707,7 +740,7 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, } } out: - return ret; + return start; /* which is now the end (or restart) address */ } /** @@ -717,7 +750,7 @@ out: * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation */ -void zap_page_range(struct vm_area_struct *vma, unsigned long address, +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = 
vma->vm_mm; @@ -727,16 +760,16 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, if (is_vm_hugetlb_page(vma)) { zap_hugepage_range(vma, address, size); - return; + return end; } lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); + end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); - acct_update_integrals(); spin_unlock(&mm->page_table_lock); + return end; } /* @@ -1005,111 +1038,78 @@ out: EXPORT_SYMBOL(get_user_pages); -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned long end; + pte_t *pte; - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + return -ENOMEM; do { - pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); + set_pte_at(mm, addr, pte, zero_pte); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); + return 0; } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, - unsigned long address, unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned long base, end; + pmd_t *pmd; + unsigned long next; - base = address & PUD_MASK; - address &= ~PUD_MASK; - end = address + size; - if (end > PUD_SIZE) - end = PUD_SIZE; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); - if (!pte) + next = 
pmd_addr_end(addr, end); + if (zeromap_pte_range(mm, pmd, addr, next, prot)) return -ENOMEM; - zeromap_pte_range(pte, base + address, end - address, prot); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); + } while (pmd++, addr = next, addr != end); return 0; } -static inline int zeromap_pud_range(struct mm_struct *mm, pud_t * pud, - unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned long base, end; - int error = 0; - - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; + pud_t *pud; + unsigned long next; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; do { - pmd_t * pmd = pmd_alloc(mm, pud, base + address); - error = -ENOMEM; - if (!pmd) - break; - error = zeromap_pmd_range(mm, pmd, base + address, - end - address, prot); - if (error) - break; - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); + next = pud_addr_end(addr, end); + if (zeromap_pmd_range(mm, pud, addr, next, prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); return 0; } -int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, pgprot_t prot) +int zeromap_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgprot_t prot) { - int i; - int error = 0; - pgd_t * pgd; - unsigned long beg = address; - unsigned long end = address + size; + pgd_t *pgd; unsigned long next; + unsigned long end = addr + size; struct mm_struct *mm = vma->vm_mm; + int err; - pgd = pgd_offset(mm, address); - flush_cache_range(vma, beg, end); - BUG_ON(address >= end); - BUG_ON(end > vma->vm_end); - + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); 
spin_lock(&mm->page_table_lock); - for (i = pgd_index(address); i <= pgd_index(end-1); i++) { - pud_t *pud = pud_alloc(mm, pgd, address); - error = -ENOMEM; - if (!pud) - break; - next = (address + PGDIR_SIZE) & PGDIR_MASK; - if (next <= beg || next > end) - next = end; - error = zeromap_pud_range(mm, pud, address, - next - address, prot); - if (error) + do { + next = pgd_addr_end(addr, end); + err = zeromap_pud_range(mm, pgd, addr, next, prot); + if (err) break; - address = next; - pgd++; - } - /* - * Why flush? zeromap_pte_range has a BUG_ON for !pte_none() - */ - flush_tlb_range(vma, beg, end); + } while (pgd++, addr = next, addr != end); spin_unlock(&mm->page_table_lock); - return error; + return err; } /* @@ -1117,95 +1117,74 @@ int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void -remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long pfn, pgprot_t prot) +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) { - unsigned long end; + pte_t *pte; - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + return -ENOMEM; do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); - address += PAGE_SIZE; + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; - pte++; - } while (address && (address < end)); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); + return 0; } -static inline int -remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, unsigned long pfn, pgprot_t prot) +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned 
long end, + unsigned long pfn, pgprot_t prot) { - unsigned long base, end; - - base = address & PUD_MASK; - address &= ~PUD_MASK; - end = address + size; - if (end > PUD_SIZE) - end = PUD_SIZE; - pfn -= (address >> PAGE_SHIFT); + pmd_t *pmd; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); - if (!pte) + next = pmd_addr_end(addr, end); + if (remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, - (address >> PAGE_SHIFT) + pfn, prot); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); + } while (pmd++, addr = next, addr != end); return 0; } -static inline int remap_pud_range(struct mm_struct *mm, pud_t * pud, - unsigned long address, unsigned long size, - unsigned long pfn, pgprot_t prot) +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) { - unsigned long base, end; - int error; - - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - pfn -= address >> PAGE_SHIFT; + pud_t *pud; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; do { - pmd_t *pmd = pmd_alloc(mm, pud, base+address); - error = -ENOMEM; - if (!pmd) - break; - error = remap_pmd_range(mm, pmd, base + address, end - address, - (address >> PAGE_SHIFT) + pfn, prot); - if (error) - break; - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); - return error; + next = pud_addr_end(addr, end); + if (remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; } /* Note: this is only 
safe if the mm semaphore is held when called. */ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { - int error = 0; pgd_t *pgd; - unsigned long beg = from; - unsigned long end = from + size; unsigned long next; + unsigned long end = addr + size; struct mm_struct *mm = vma->vm_mm; - int i; - - pfn -= from >> PAGE_SHIFT; - pgd = pgd_offset(mm, from); - flush_cache_range(vma, beg, end); - BUG_ON(from >= end); + int err; /* * Physically remapped pages are special. Tell the @@ -1217,31 +1196,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, */ vma->vm_flags |= VM_IO | VM_RESERVED; + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); spin_lock(&mm->page_table_lock); - for (i = pgd_index(beg); i <= pgd_index(end-1); i++) { - pud_t *pud = pud_alloc(mm, pgd, from); - error = -ENOMEM; - if (!pud) - break; - next = (from + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= from) - next = end; - error = remap_pud_range(mm, pud, from, end - from, - pfn + (from >> PAGE_SHIFT), prot); - if (error) + do { + next = pgd_addr_end(addr, end); + err = remap_pud_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) break; - from = next; - pgd++; - } - /* - * Why flush? 
remap_pte_range has a BUG_ON for !pte_none() - */ - flush_tlb_range(vma, beg, end); + } while (pgd++, addr = next, addr != end); spin_unlock(&mm->page_table_lock); - - return error; + return err; } - EXPORT_SYMBOL(remap_pfn_range); /* @@ -1265,11 +1234,11 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page { pte_t entry; - flush_cache_page(vma, address); entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), vma); ptep_establish(vma, address, page_table, entry); update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); } /* @@ -1317,11 +1286,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { - flush_cache_page(vma, address); + flush_cache_page(vma, address, pfn); entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), vma); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; @@ -1355,13 +1325,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, pte))) { if (PageAnon(old_page)) - mm->anon_rss--; - if (PageReserved(old_page)) { - ++mm->rss; - acct_update_integrals(); - update_mem_hiwater(); - } else + dec_mm_counter(mm, anon_rss); + if (PageReserved(old_page)) + inc_mm_counter(mm, rss); + else page_remove_rmap(old_page); + flush_cache_page(vma, address, pfn); break_cow(vma, new_page, address, page_table); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1405,7 +1374,7 @@ no_new_page: * i_mmap_lock. 
* * In order to make forward progress despite repeatedly restarting some - * large vma, note the break_addr set by unmap_vmas when it breaks out: + * large vma, note the restart_addr from unmap_vmas when it breaks out: * and restart from that address when we reach that vma again. It might * have been split or merged, shrunk or extended, but never shifted: so * restart_addr remains valid so long as it remains in the vma's range. @@ -1443,8 +1412,8 @@ again: } } - details->break_addr = end_addr; - zap_page_range(vma, start_addr, end_addr - start_addr, details); + restart_addr = zap_page_range(vma, start_addr, + end_addr - start_addr, details); /* * We cannot rely on the break test in unmap_vmas: @@ -1455,14 +1424,14 @@ again: need_break = need_resched() || need_lockbreak(details->i_mmap_lock); - if (details->break_addr >= end_addr) { + if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ vma->vm_truncate_count = details->truncate_count; if (!need_break) return 0; } else { /* Note restart_addr in vma's truncate_count field */ - vma->vm_truncate_count = details->break_addr; + vma->vm_truncate_count = restart_addr; if (!need_break) goto again; } @@ -1750,12 +1719,13 @@ static int do_swap_page(struct mm_struct * mm, spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (unlikely(!pte_same(*page_table, orig_pte))) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); ret = VM_FAULT_MINOR; - goto out; + goto out_nomap; + } + + if (unlikely(!PageUptodate(page))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; } /* The page isn't present yet, go ahead with the fault. 
*/ @@ -1764,10 +1734,7 @@ static int do_swap_page(struct mm_struct * mm, if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; - acct_update_integrals(); - update_mem_hiwater(); - + inc_mm_counter(mm, rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1776,7 +1743,7 @@ static int do_swap_page(struct mm_struct * mm, unlock_page(page); flush_icache_page(vma, page); - set_pte(page_table, pte); + set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); if (write_access) { @@ -1788,10 +1755,17 @@ static int do_swap_page(struct mm_struct * mm, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + lazy_mmu_prot_update(pte); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: return ret; +out_nomap: + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + goto out; } /* @@ -1831,9 +1805,7 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); goto out; } - mm->rss++; - acct_update_integrals(); - update_mem_hiwater(); + inc_mm_counter(mm, rss); entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)), vma); @@ -1842,11 +1814,12 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_anon_rmap(page, vma, addr); } - ptep_establish_new(vma, addr, page_table, entry); + set_pte_at(vma, addr, page_table, entry); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); + lazy_mmu_prot_update(entry); spin_unlock(&mm->page_table_lock); out: return VM_FAULT_MINOR; @@ -1949,15 +1922,13 @@ retry: /* Only go through if we didn't race with anybody else... 
*/ if (pte_none(*page_table)) { if (!PageReserved(new_page)) - ++mm->rss; - acct_update_integrals(); - update_mem_hiwater(); + inc_mm_counter(mm, rss); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_establish_new(vma, address, page_table, entry); + set_pte_at(vma, address, page_table, entry); if (anon) { lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1974,6 +1945,7 @@ retry: /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); spin_unlock(&mm->page_table_lock); out: return ret; @@ -2001,7 +1973,7 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, */ if (!vma->vm_ops || !vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); + pte_clear(mm, address, pte); return do_no_page(mm, vma, address, write_access, pte, pmd); } @@ -2068,6 +2040,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, entry = pte_mkyoung(entry); ptep_set_access_flags(vma, address, pte, entry, write_access); update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); pte_unmap(pte); spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; @@ -2117,15 +2090,12 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, return VM_FAULT_OOM; } -#ifndef __ARCH_HAS_4LEVEL_HACK +#ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * * We've already handled the fast-path in-line, and we own the * page table lock. - * - * On a two-level or three-level page table, this ends up actually being - * entirely optimized away. 
*/ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -2149,15 +2119,14 @@ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr out: return pud_offset(pgd, address); } +#endif /* __PAGETABLE_PUD_FOLDED */ +#ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * * We've already handled the fast-path in-line, and we own the * page table lock. - * - * On a two-level page table, this ends up actually being entirely - * optimized away. */ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { @@ -2173,38 +2142,24 @@ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ +#ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) { pmd_free(new); goto out; } pud_populate(mm, pud, new); - out: - return pmd_offset(pud, address); -} #else -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - pmd_t *new; - - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
- */ if (pgd_present(*pud)) { pmd_free(new); goto out; } pgd_populate(mm, pud, new); -out: +#endif /* __ARCH_HAS_4LEVEL_HACK */ + + out: return pmd_offset(pud, address); } -#endif +#endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) { @@ -2271,13 +2226,13 @@ EXPORT_SYMBOL(vmalloc_to_pfn); * update_mem_hiwater * - update per process rss and vm high water data */ -void update_mem_hiwater(void) +void update_mem_hiwater(struct task_struct *tsk) { - struct task_struct *tsk = current; - if (tsk->mm) { - if (tsk->mm->hiwater_rss < tsk->mm->rss) - tsk->mm->hiwater_rss = tsk->mm->rss; + unsigned long rss = get_mm_counter(tsk->mm, rss); + + if (tsk->mm->hiwater_rss < rss) + tsk->mm->hiwater_rss = rss; if (tsk->mm->hiwater_vm < tsk->mm->total_vm) tsk->mm->hiwater_vm = tsk->mm->total_vm; } diff --git a/linux-2.6-xen-sparse/mm/mmap.c b/linux-2.6-xen-sparse/mm/mmap.c index 848200e1b8..f2dd282348 100644 --- a/linux-2.6-xen-sparse/mm/mmap.c +++ b/linux-2.6-xen-sparse/mm/mmap.c @@ -21,7 +21,6 @@ #include <linux/hugetlb.h> #include <linux/profile.h> #include <linux/module.h> -#include <linux/acct.h> #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> @@ -30,6 +29,10 @@ #include <asm/cacheflush.h> #include <asm/tlb.h> +static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); + /* * WARNING: the debugging will use recursive algorithms so never enable this * unless you know what you are doing. 
@@ -873,7 +876,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, int error; struct rb_node ** rb_link, * rb_parent; int accountable = 1; - unsigned long charged = 0; + unsigned long charged = 0, reqprot = prot; if (file) { if (is_file_hugepages(file)) @@ -897,16 +900,16 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, prot |= PROT_EXEC; if (!len) - return addr; + return -EINVAL; /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len || len > TASK_SIZE) - return -EINVAL; + return -ENOMEM; /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) - return -EINVAL; + return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) @@ -934,9 +937,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - locked += len; + lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; } @@ -991,7 +995,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } } - error = security_file_mmap(file, prot, flags); + error = security_file_mmap(file, reqprot, prot, flags); if (error) return error; @@ -1006,8 +1010,7 @@ munmap_back: } /* Check against address space limit. */ - if ((mm->total_vm << PAGE_SHIFT) + len - > current->signal->rlim[RLIMIT_AS].rlim_cur) + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; if (accountable && (!(flags & MAP_NORESERVE) || @@ -1121,8 +1124,6 @@ out: pgoff, flags & MAP_NONBLOCK); down_write(&mm->mmap_sem); } - acct_update_integrals(); - update_mem_hiwater(); return addr; unmap_and_free_vma: @@ -1132,7 +1133,8 @@ unmap_and_free_vma: fput(file); /* Undo any partial mapping done by a device driver. 
*/ - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + charged = 0; free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: @@ -1221,19 +1223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, const unsigned long flags) { - struct vm_area_struct *vma, *prev_vma; + struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long base = mm->mmap_base, addr = addr0; - int first_time = 1; + unsigned long addr = addr0; /* requested length too big for entire address space */ if (len > TASK_SIZE) return -ENOMEM; - /* dont allow allocations above current base */ - if (mm->free_area_cache > base) - mm->free_area_cache = base; - /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); @@ -1243,48 +1240,34 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -try_again: + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache; + /* make sure it can fit in the remaining address space */ - if (mm->free_area_cache < len) - goto fail; + if (addr > len) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + + addr = mm->mmap_base-len; - /* either no address requested or cant fit in requested address hole */ - addr = (mm->free_area_cache - len) & PAGE_MASK; do { /* * Lookup failure means no vma is above this address, - * i.e. 
return with success: + * else if new region fits below vma->vm_start, + * return with success: */ - if (!(vma = find_vma_prev(mm, addr, &prev_vma))) - return addr; - - /* - * new region fits between prev_vma->vm_end and - * vma->vm_start, use it: - */ - if (addr+len <= vma->vm_start && - (!prev_vma || (addr >= prev_vma->vm_end))) + vma = find_vma(mm, addr); + if (!vma || addr+len <= vma->vm_start) /* remember the address as a hint for next time */ return (mm->free_area_cache = addr); - else - /* pull free_area_cache down to the first hole */ - if (mm->free_area_cache == vma->vm_end) - mm->free_area_cache = vma->vm_start; /* try just below the current vma->vm_start */ addr = vma->vm_start-len; - } while (len <= vma->vm_start); + } while (len < vma->vm_start); -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (first_time) { - mm->free_area_cache = base; - first_time = 0; - goto try_again; - } /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. 
This scenario @@ -1296,7 +1279,7 @@ fail: /* * Restore the topdown base: */ - mm->free_area_cache = base; + mm->free_area_cache = mm->mmap_base; return addr; } @@ -1309,43 +1292,50 @@ void arch_unmap_area_topdown(struct vm_area_struct *area) */ if (area->vm_end > area->vm_mm->free_area_cache) area->vm_mm->free_area_cache = area->vm_end; + + /* dont allow allocations above current base */ + if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base) + area->vm_mm->free_area_cache = area->vm_mm->mmap_base; } unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - if (flags & MAP_FIXED) { - unsigned long ret; + unsigned long ret; - if (addr > TASK_SIZE - len) - return -ENOMEM; - if (addr & ~PAGE_MASK) - return -EINVAL; - if (file && is_file_hugepages(file)) { - /* - * Check if the given range is hugepage aligned, and - * can be made suitable for hugepages. - */ - ret = prepare_hugepage_range(addr, len); - } else { - /* - * Ensure that a normal request is not falling in a - * reserved hugepage range. For some archs like IA-64, - * there is a separate region for hugepages. 
- */ - ret = is_hugepage_only_range(addr, len); - } - if (ret) - return -EINVAL; - return addr; - } + if (!(flags & MAP_FIXED)) { + unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - if (file && file->f_op && file->f_op->get_unmapped_area) - return file->f_op->get_unmapped_area(file, addr, len, - pgoff, flags); + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + } - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (file && is_file_hugepages(file)) { + /* + * Check if the given range is hugepage aligned, and + * can be made suitable for hugepages. + */ + ret = prepare_hugepage_range(addr, len); + } else { + /* + * Ensure that a normal request is not falling in a + * reserved hugepage range. For some archs like IA-64, + * there is a separate region for hugepages. 
+ */ + ret = is_hugepage_only_range(current->mm, addr, len); + } + if (ret) + return -EINVAL; + return addr; } EXPORT_SYMBOL(get_unmapped_area); @@ -1434,7 +1424,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un struct rlimit *rlim = current->signal->rlim; /* address space limit tests */ - if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) + if (!may_expand_vm(mm, grow)) return -ENOMEM; /* Stack limit test */ @@ -1463,8 +1453,6 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); - acct_update_integrals(); - update_mem_hiwater(); return 0; } @@ -1592,66 +1580,6 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) } #endif -/* - * Try to free as many page directory entries as we can, - * without having to work very hard at actually scanning - * the page tables themselves. - * - * Right now we try to free page tables if we have a nice - * PGDIR-aligned area that got free'd up. We could be more - * granular if we want to, but this is fast and simple, - * and covers the bad cases. - * - * "prev", if it exists, points to a vma before the one - * we just free'd - but there's no telling how much before. 
- */ -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, - unsigned long start, unsigned long end) -{ - unsigned long first = start & PGDIR_MASK; - unsigned long last = end + PGDIR_SIZE - 1; - struct mm_struct *mm = tlb->mm; - - if (last > MM_VM_SIZE(mm) || last < end) - last = MM_VM_SIZE(mm); - - if (!prev) { - prev = mm->mmap; - if (!prev) - goto no_mmaps; - if (prev->vm_end > start) { - if (last > prev->vm_start) - last = prev->vm_start; - goto no_mmaps; - } - } - for (;;) { - struct vm_area_struct *next = prev->vm_next; - - if (next) { - if (next->vm_start < start) { - prev = next; - continue; - } - if (last > next->vm_start) - last = next->vm_start; - } - if (prev->vm_end > first) - first = prev->vm_end; - break; - } -no_mmaps: - if (last < first) /* for arches with discontiguous pgd indices */ - return; - if (first < FIRST_USER_PGD_NR * PGDIR_SIZE) - first = FIRST_USER_PGD_NR * PGDIR_SIZE; - /* No point trying to free anything if we're in the same pte page */ - if ((first & PMD_MASK) < (last & PMD_MASK)) { - clear_page_range(tlb, first, last); - flush_tlb_pgtables(mm, first, last); - } -} - /* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. @@ -1677,14 +1605,13 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) * Ok - we have the memory areas we should free on the 'free' list, * so release them, and do the vma updates. 
*/ -static void unmap_vma_list(struct mm_struct *mm, - struct vm_area_struct *mpnt) +static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { do { - struct vm_area_struct *next = mpnt->vm_next; - unmap_vma(mm, mpnt); - mpnt = next; - } while (mpnt != NULL); + struct vm_area_struct *next = vma->vm_next; + unmap_vma(mm, vma); + vma = next; + } while (vma); validate_mm(mm); } @@ -1694,24 +1621,22 @@ static void unmap_vma_list(struct mm_struct *mm, * Called with the page table lock held. */ static void unmap_region(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *prev, - unsigned long start, - unsigned long end) + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end) { + struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; struct mmu_gather *tlb; unsigned long nr_accounted = 0; lru_add_drain(); + spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - - if (is_hugepage_only_range(start, end - start)) - hugetlb_free_pgtables(tlb, prev, start, end); - else - free_pgtables(tlb, prev, start, end); + free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + next? 
next->vm_start: 0); tlb_finish_mmu(tlb, start, end); + spin_unlock(&mm->page_table_lock); } /* @@ -1797,7 +1722,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) { unsigned long end; - struct vm_area_struct *mpnt, *prev, *last; + struct vm_area_struct *vma, *prev, *last; if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -1806,14 +1731,14 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return -EINVAL; /* Find the first overlapping VMA */ - mpnt = find_vma_prev(mm, start, &prev); - if (!mpnt) + vma = find_vma_prev(mm, start, &prev); + if (!vma) return 0; - /* we have start < mpnt->vm_end */ + /* we have start < vma->vm_end */ /* if it doesn't overlap, we have nothing.. */ end = start + len; - if (mpnt->vm_start >= end) + if (vma->vm_start >= end) return 0; /* @@ -1823,11 +1748,11 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) * unmapped vm_area_struct will remain in use: so lower split_vma * places tmp vma above, and higher split_vma places tmp vma below. */ - if (start > mpnt->vm_start) { - int error = split_vma(mm, mpnt, start, 0); + if (start > vma->vm_start) { + int error = split_vma(mm, vma, start, 0); if (error) return error; - prev = mpnt; + prev = vma; } /* Does it split the last one? */ @@ -1837,18 +1762,16 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (error) return error; } - mpnt = prev? prev->vm_next: mm->mmap; + vma = prev? 
prev->vm_next: mm->mmap; /* * Remove the vma's, and unmap the actual pages */ - detach_vmas_to_be_unmapped(mm, mpnt, prev, end); - spin_lock(&mm->page_table_lock); - unmap_region(mm, mpnt, prev, start, end); - spin_unlock(&mm->page_table_lock); + detach_vmas_to_be_unmapped(mm, vma, prev, end); + unmap_region(mm, vma, prev, start, end); /* Fix up all other VM information */ - unmap_vma_list(mm, mpnt); + unmap_vma_list(mm, vma); return 0; } @@ -1903,9 +1826,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) */ if (mm->def_flags & VM_LOCKED) { unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - locked += len; + lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; } @@ -1928,8 +1852,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) } /* Check against address space limits *after* clearing old maps... 
*/ - if ((mm->total_vm << PAGE_SHIFT) + len - > current->signal->rlim[RLIMIT_AS].rlim_cur) + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -1968,8 +1891,6 @@ out: mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); } - acct_update_integrals(); - update_mem_hiwater(); return addr; } @@ -1979,8 +1900,9 @@ EXPORT_SYMBOL(do_brk); void exit_mmap(struct mm_struct *mm) { struct mmu_gather *tlb; - struct vm_area_struct *vma; + struct vm_area_struct *vma = mm->mmap; unsigned long nr_accounted = 0; + unsigned long end; #ifdef arch_exit_mmap arch_exit_mmap(mm); @@ -1990,21 +1912,17 @@ void exit_mmap(struct mm_struct *mm) spin_lock(&mm->page_table_lock); - tlb = tlb_gather_mmu(mm, 1); flush_cache_mm(mm); - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, - ~0UL, &nr_accounted, NULL); + tlb = tlb_gather_mmu(mm, 1); + /* Use -1 here to ensure all VMAs in the mm are unmapped */ + end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - BUG_ON(mm->map_count); /* This is just debugging */ - clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm)); - - tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(tlb, 0, end); - vma = mm->mmap; mm->mmap = mm->mmap_cache = NULL; mm->mm_rb = RB_ROOT; - mm->rss = 0; + set_mm_counter(mm, rss, 0); mm->total_vm = 0; mm->locked_vm = 0; @@ -2019,6 +1937,8 @@ void exit_mmap(struct mm_struct *mm) remove_vm_struct(vma); vma = next; } + + BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address @@ -2106,3 +2026,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } return new_vma; } + +/* + * Return true if the calling process may expand its vm space by the passed + * number of pages + */ +int may_expand_vm(struct 
mm_struct *mm, unsigned long npages) +{ + unsigned long cur = mm->total_vm; /* pages */ + unsigned long lim; + + lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + + if (cur + npages > lim) + return 0; + return 1; +} diff --git a/linux-2.6-xen-sparse/mm/page_alloc.c b/linux-2.6-xen-sparse/mm/page_alloc.c index 4d55438fc3..cf6749cd25 100644 --- a/linux-2.6-xen-sparse/mm/page_alloc.c +++ b/linux-2.6-xen-sparse/mm/page_alloc.c @@ -31,19 +31,26 @@ #include <linux/topology.h> #include <linux/sysctl.h> #include <linux/cpu.h> +#include <linux/cpuset.h> #include <linux/nodemask.h> #include <linux/vmalloc.h> #include <asm/tlbflush.h> #include "internal.h" -/* MCD - HACK: Find somewhere to initialize this EARLY, or make this initializer cleaner */ +/* + * MCD - HACK: Find somewhere to initialize this EARLY, or make this + * initializer cleaner + */ nodemask_t node_online_map = { { [0] = 1UL } }; +EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map = NODE_MASK_ALL; +EXPORT_SYMBOL(node_possible_map); struct pglist_data *pgdat_list; unsigned long totalram_pages; unsigned long totalhigh_pages; long nr_swap_pages; + /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) @@ -191,6 +198,37 @@ static inline void rmv_page_order(struct page *page) } /* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). 
+ * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contigious at least up to MAX_ORDER + */ +static inline struct page * +__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) +{ + unsigned long buddy_idx = page_idx ^ (1 << order); + + return page + (buddy_idx - page_idx); +} + +static inline unsigned long +__find_combined_index(unsigned long page_idx, unsigned int order) +{ + return (page_idx & ~(1 << order)); +} + +/* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is free && @@ -233,50 +271,49 @@ static inline int page_is_buddy(struct page *page, int order) * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void __free_pages_bulk (struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; - struct page *coalesced; int order_size = 1 << order; if (unlikely(order)) destroy_compound_page(page, order); - page_idx = page - base; + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); BUG_ON(page_idx & (order_size - 1)); BUG_ON(bad_range(zone, page)); zone->free_pages += order_size; while (order < MAX_ORDER-1) { + unsigned long combined_idx; struct free_area *area; struct page *buddy; - int buddy_idx; - buddy_idx = (page_idx ^ (1 << order)); - buddy = base + buddy_idx; + combined_idx = __find_combined_index(page_idx, order); + buddy = __page_find_buddy(page, page_idx, order); + if (bad_range(zone, buddy)) break; if (!page_is_buddy(buddy, order)) - break; - /* Move the buddy up one level. */ + break; /* Move the buddy up one level. 
*/ list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); - page_idx &= buddy_idx; + page = page + (combined_idx - page_idx); + page_idx = combined_idx; order++; } - coalesced = base + page_idx; - set_page_order(coalesced, order); - list_add(&coalesced->lru, &zone->free_area[order].free_list); + set_page_order(page, order); + list_add(&page->lru, &zone->free_area[order].free_list); zone->free_area[order].nr_free++; } static inline void free_pages_check(const char *function, struct page *page) { - if ( page_mapped(page) || + if ( page_mapcount(page) || page->mapping != NULL || page_count(page) != 0 || (page->flags & ( @@ -309,10 +346,9 @@ free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { unsigned long flags; - struct page *base, *page = NULL; + struct page *page = NULL; int ret = 0; - base = zone->zone_mem_map; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; @@ -320,7 +356,7 @@ free_pages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, base, zone, order); + __free_pages_bulk(page, zone, order); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -405,7 +441,7 @@ void set_page_refs(struct page *page, int order) */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->mapping || page_mapcount(page) || (page->flags & ( 1 << PG_private | 1 << PG_locked | @@ -601,7 +637,7 @@ void fastcall free_cold_page(struct page *page) free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags) { int i; @@ -616,7 +652,7 @@ static inline void prep_zero_page(struct page *page, int order, int gfp_flags) * or 
two. */ static struct page * -buffered_rmqueue(struct zone *zone, int order, int gfp_flags) +buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) { unsigned long flags; struct page *page = NULL; @@ -694,7 +730,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, * This is the 'heart' of the zoned buddy allocator. */ struct page * fastcall -__alloc_pages(unsigned int gfp_mask, unsigned int order, +__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, struct zonelist *zonelist) { const int wait = gfp_mask & __GFP_WAIT; @@ -734,6 +770,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, classzone_idx, 0, 0)) continue; + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; @@ -745,6 +784,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, /* * Go through the zonelist again. Let __GFP_HIGH and allocations * coming from realtime tasks to go deeper into reserves + * + * This is the last chance, in general, before the goto nopage. + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. */ for (i = 0; (z = zones[i]) != NULL; i++) { if (!zone_watermark_ok(z, order, z->pages_min, @@ -752,18 +794,27 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, gfp_mask & __GFP_HIGH)) continue; + if (wait && !cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; } /* This allocation should allow future memory freeing. 
*/ - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { - /* go through the zonelist yet again, ignoring mins */ - for (i = 0; (z = zones[i]) != NULL; i++) { - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; + + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) { + if (!(gfp_mask & __GFP_NOMEMALLOC)) { + /* go through the zonelist yet again, ignoring mins */ + for (i = 0; (z = zones[i]) != NULL; i++) { + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } } goto nopage; } @@ -800,6 +851,9 @@ rebalance: gfp_mask & __GFP_HIGH)) continue; + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; @@ -816,6 +870,9 @@ rebalance: classzone_idx, 0, 0)) continue; + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; @@ -862,7 +919,7 @@ EXPORT_SYMBOL(__alloc_pages); /* * Common helper functions. 
*/ -fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) +fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order) { struct page * page; page = alloc_pages(gfp_mask, order); @@ -873,7 +930,7 @@ fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int orde EXPORT_SYMBOL(__get_free_pages); -fastcall unsigned long get_zeroed_page(unsigned int gfp_mask) +fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask) { struct page * page; @@ -1302,8 +1359,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli #define MAX_NODE_LOAD (num_online_nodes()) static int __initdata node_load[MAX_NUMNODES]; /** - * find_next_best_node - find the next node that should appear in a given - * node's fallback list + * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending * @used_node_mask: nodemask_t of already used nodes * @@ -1372,7 +1428,6 @@ static void __init build_zonelists(pg_data_t *pgdat) /* initialize zonelists */ for (i = 0; i < GFP_ZONETYPES; i++) { zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); zonelist->zones[0] = NULL; } @@ -1419,7 +1474,6 @@ static void __init build_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); j = 0; k = ZONE_NORMAL; @@ -1461,6 +1515,7 @@ void __init build_all_zonelists(void) for_each_online_node(i) build_zonelists(NODE_DATA(i)); printk("Built %i zonelists\n", num_online_nodes()); + cpuset_init_current_mems_allowed(); } /* @@ -1623,6 +1678,18 @@ static void __init free_area_init_core(struct pglist_data *pgdat, if (batch < 1) batch = 1; + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. 
+ * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = (1 << fls(batch + batch/2)) - 1; + for (cpu = 0; cpu < NR_CPUS; cpu++) { struct per_cpu_pages *pcp; @@ -1681,14 +1748,25 @@ static void __init free_area_init_core(struct pglist_data *pgdat, } } -void __init node_alloc_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long size; - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); - pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); + /* Skip empty nodes */ + if (!pgdat->node_spanned_pages) + return; + + /* ia64 gets its own node_mem_map, before this, without bootmem */ + if (!pgdat->node_mem_map) { + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); + } #ifndef CONFIG_DISCONTIGMEM - mem_map = contig_page_data.node_mem_map; + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) + mem_map = NODE_DATA(0)->node_mem_map; #endif } @@ -1700,8 +1778,7 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat, pgdat->node_start_pfn = node_start_pfn; calculate_zone_totalpages(pgdat, zones_size, zholes_size); - if (!pfn_to_page(node_start_pfn)) - node_alloc_mem_map(pgdat); + alloc_node_mem_map(pgdat); free_area_init_core(pgdat, zones_size, zholes_size); } @@ -1823,6 +1900,7 @@ static char *vmstat_text[] = { "allocstall", "pgrotated", + "nr_bounce", }; static void *vmstat_start(struct seq_file *m, loff_t *pos) @@ -1926,15 +2004,20 @@ static void setup_per_zone_lowmem_reserve(void) for_each_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone * zone = pgdat->node_zones + j; + struct zone *zone = pgdat->node_zones + j; unsigned long present_pages = zone->present_pages; 
zone->lowmem_reserve[j] = 0; for (idx = j-1; idx >= 0; idx--) { - struct zone * lower_zone = pgdat->node_zones + idx; + struct zone *lower_zone; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) + sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx]; + lower_zone = pgdat->node_zones + idx; + lower_zone->lowmem_reserve[j] = present_pages / + sysctl_lowmem_reserve_ratio[idx]; present_pages += lower_zone->present_pages; } } @@ -2041,7 +2124,7 @@ module_init(init_per_zone_pages_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, file, buffer, length, ppos); setup_per_zone_pages_min(); @@ -2058,7 +2141,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, file, buffer, length, ppos); setup_per_zone_lowmem_reserve(); diff --git a/linux-2.6-xen-sparse/net/core/dev.c b/linux-2.6-xen-sparse/net/core/dev.c index b5e12b06ec..8c73647ecb 100644 --- a/linux-2.6-xen-sparse/net/core/dev.c +++ b/linux-2.6-xen-sparse/net/core/dev.c @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. * * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * @@ -766,6 +766,18 @@ int dev_change_name(struct net_device *dev, char *newname) } /** + * netdev_features_change - device changes fatures + * @dev: device to cause notification + * + * Called to indicate a device has changed features. + */ +void netdev_features_change(struct net_device *dev) +{ + notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL(netdev_features_change); + +/** * netdev_state_change - device changes state * @dev: device to cause notification * @@ -1219,6 +1231,19 @@ int __skb_linearize(struct sk_buff *skb, int gfp_mask) * A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. + * + * ----------------------------------------------------------------------------------- + * I notice this method can also return errors from the queue disciplines, + * including NET_XMIT_DROP, which is a positive value. So, errors can also + * be positive. + * + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) + * + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. 
+ * --BLG */ int dev_queue_xmit(struct sk_buff *skb) @@ -1456,13 +1481,10 @@ int netif_rx(struct sk_buff *skb) struct softnet_data *queue; unsigned long flags; -#ifdef CONFIG_NETPOLL - if (skb->dev->netpoll_rx && netpoll_rx(skb)) { - kfree_skb(skb); + /* if netpoll wants it, pretend we never saw it */ + if (netpoll_rx(skb)) return NET_RX_DROP; - } -#endif - + if (!skb->stamp.tv_sec) net_timestamp(&skb->stamp); @@ -1590,6 +1612,10 @@ static __inline__ int deliver_skb(struct sk_buff *skb, #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); +struct net_bridge; +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, + unsigned char *addr); +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); static __inline__ int handle_bridge(struct sk_buff **pskb, struct packet_type **pt_prev, int *ret) @@ -1658,12 +1684,9 @@ int netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; unsigned short type; -#ifdef CONFIG_NETPOLL - if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) { - kfree_skb(skb); + /* if we've gotten here through NAPI, check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) return NET_RX_DROP; - } -#endif if (!skb->stamp.tv_sec) net_timestamp(&skb->stamp); @@ -1761,6 +1784,7 @@ static int process_backlog(struct net_device *backlog_dev, int *budget) struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; + backlog_dev->weight = weight_p; for (;;) { struct sk_buff *skb; struct net_device *dev; @@ -1821,8 +1845,10 @@ static void net_rx_action(struct softirq_action *h) dev = list_entry(queue->poll_list.next, struct net_device, poll_list); + netpoll_poll_lock(dev); if (dev->quota <= 0 || dev->poll(dev, &budget)) { + netpoll_poll_unlock(dev); local_irq_disable(); list_del(&dev->poll_list); list_add_tail(&dev->poll_list, &queue->poll_list); @@ -1831,6 +1857,7 @@ static void net_rx_action(struct 
softirq_action *h) else dev->quota = dev->weight; } else { + netpoll_poll_unlock(dev); dev_put(dev); local_irq_disable(); } @@ -2340,6 +2367,21 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) return err; } +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ + int err; + + if (!dev->set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = dev->set_mac_address(dev, sa); + if (!err) + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return err; +} /* * Perform the SIOCxIFxxx calls. @@ -2386,17 +2428,7 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) return 0; case SIOCSIFHWADDR: - if (!dev->set_mac_address) - return -EOPNOTSUPP; - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - err = dev->set_mac_address(dev, &ifr->ifr_hwaddr); - if (!err) - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEADDR, dev); - return err; + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) @@ -3112,7 +3144,7 @@ void free_netdev(struct net_device *dev) void synchronize_net(void) { might_sleep(); - synchronize_kernel(); + synchronize_rcu(); } /** @@ -3362,6 +3394,7 @@ EXPORT_SYMBOL(dev_set_allmulti); EXPORT_SYMBOL(dev_set_promiscuity); EXPORT_SYMBOL(dev_change_flags); EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(dev_set_mac_address); EXPORT_SYMBOL(free_netdev); EXPORT_SYMBOL(netdev_boot_setup_check); EXPORT_SYMBOL(netdev_set_master); @@ -3377,9 +3410,12 @@ EXPORT_SYMBOL(unregister_netdevice); EXPORT_SYMBOL(unregister_netdevice_notifier); EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); +EXPORT_SYMBOL(dev_get_flags); #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_fdb_get_hook); +EXPORT_SYMBOL(br_fdb_put_hook); #endif 
#ifdef CONFIG_KMOD diff --git a/linux-2.6-xen-sparse/net/core/skbuff.c b/linux-2.6-xen-sparse/net/core/skbuff.c index be2801e883..9e144aa414 100644 --- a/linux-2.6-xen-sparse/net/core/skbuff.c +++ b/linux-2.6-xen-sparse/net/core/skbuff.c @@ -86,8 +86,10 @@ static kmem_cache_t *skbuff_head_cache; */ void skb_over_panic(struct sk_buff *skb, int sz, void *here) { - printk(KERN_INFO "skput:over: %p:%d put:%d dev:%s", - here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); + printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%p end:%p dev:%s\n", + here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end, + skb->dev ? skb->dev->name : "<NULL>"); BUG(); } @@ -102,8 +104,10 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here) void skb_under_panic(struct sk_buff *skb, int sz, void *here) { - printk(KERN_INFO "skput:under: %p:%d put:%d dev:%s", - here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); + printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%p end:%p dev:%s\n", + here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end, + skb->dev ? skb->dev->name : "<NULL>"); BUG(); } @@ -241,7 +245,8 @@ static void skb_clone_fraglist(struct sk_buff *skb) void skb_release_data(struct sk_buff *skb) { if (!skb->cloned || - atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { + !atomic_sub_return(skb->nohdr ? 
(1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { if (skb_shinfo(skb)->nr_frags) { int i; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) @@ -275,20 +280,14 @@ void kfree_skbmem(struct sk_buff *skb) void __kfree_skb(struct sk_buff *skb) { - if (skb->list) { - printk(KERN_WARNING "Warning: kfree_skb passed an skb still " - "on a list (from %p).\n", NET_CALLER(skb)); - BUG(); - } + BUG_ON(skb->list != NULL); dst_release(skb->dst); #ifdef CONFIG_XFRM secpath_put(skb->sp); #endif - if(skb->destructor) { - if (in_irq()) - printk(KERN_WARNING "Warning: kfree_skb on " - "hard IRQ %p\n", NET_CALLER(skb)); + if (skb->destructor) { + WARN_ON(in_irq()); skb->destructor(skb); } #ifdef CONFIG_NETFILTER @@ -353,6 +352,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) C(csum); C(local_df); n->cloned = 1; + n->nohdr = 0; C(proto_csum_valid); C(proto_csum_blank); C(pkt_type); @@ -606,6 +606,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) skb->h.raw += off; skb->nh.raw += off; skb->cloned = 0; + skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; @@ -984,70 +985,94 @@ fault: return -EFAULT; } -/* Keep iterating until skb_iter_next returns false. */ -void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i) -{ - i->len = skb_headlen(skb); - i->data = (unsigned char *)skb->data; - i->nextfrag = 0; - i->fraglist = NULL; -} +/** + * skb_store_bits - store bits from kernel buffer to skb + * @skb: destination buffer + * @offset: offset in destination + * @from: source buffer + * @len: number of bytes to copy + * + * Copy the specified number of bytes from the source buffer to the + * destination skb. This function handles all the messy bits of + * traversing fragment lists and such. + */ -int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i) +int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len) { - /* Unmap previous, if not head fragment. 
*/ - if (i->nextfrag) - kunmap_skb_frag(i->data); - - if (i->fraglist) { - fraglist: - /* We're iterating through fraglist. */ - if (i->nextfrag < skb_shinfo(i->fraglist)->nr_frags) { - i->data = kmap_skb_frag(&skb_shinfo(i->fraglist) - ->frags[i->nextfrag]); - i->len = skb_shinfo(i->fraglist)->frags[i->nextfrag] - .size; - i->nextfrag++; - return 1; - } - /* Fragments with fragments? Too hard! */ - BUG_ON(skb_shinfo(i->fraglist)->frag_list); - i->fraglist = i->fraglist->next; - if (!i->fraglist) - goto end; - - i->len = skb_headlen(i->fraglist); - i->data = i->fraglist->data; - i->nextfrag = 0; - return 1; + int i, copy; + int start = skb_headlen(skb); + + if (offset > (int)skb->len - len) + goto fault; + + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + memcpy(skb->data + offset, from, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; } - if (i->nextfrag < skb_shinfo(skb)->nr_frags) { - i->data = kmap_skb_frag(&skb_shinfo(skb)->frags[i->nextfrag]); - i->len = skb_shinfo(skb)->frags[i->nextfrag].size; - i->nextfrag++; - return 1; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + int end; + + BUG_TRAP(start <= offset + len); + + end = start + frag->size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(frag); + memcpy(vaddr + frag->page_offset + offset - start, + from, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; } - i->fraglist = skb_shinfo(skb)->frag_list; - if (i->fraglist) - goto fraglist; + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; -end: - /* Bug trap for callers */ - i->data = NULL; - return 0; -} + for (; list; list = list->next) { + int end; -void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i) -{ - /* Unmap previous, if not head fragment. 
*/ - if (i->data && i->nextfrag) - kunmap_skb_frag(i->data); - /* Bug trap for callers */ - i->data = NULL; + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_store_bits(list, offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; } +EXPORT_SYMBOL(skb_store_bits); + /* Checksum skb data. */ unsigned int skb_checksum(const struct sk_buff *skb, int offset, @@ -1446,7 +1471,7 @@ static inline void skb_split_no_header(struct sk_buff *skb, if (pos < len) { /* Split frag. - * We have to variants in this case: + * We have two variants in this case: * 1. Move all the frag to the second * part, if it is possible. F.e. * this approach is mandatory for TUX, @@ -1469,6 +1494,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, /** * skb_split - Split fragmented skb to two parts at length len. 
+ * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb */ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { @@ -1518,6 +1546,3 @@ EXPORT_SYMBOL(skb_queue_tail); EXPORT_SYMBOL(skb_unlink); EXPORT_SYMBOL(skb_append); EXPORT_SYMBOL(skb_split); -EXPORT_SYMBOL(skb_iter_first); -EXPORT_SYMBOL(skb_iter_next); -EXPORT_SYMBOL(skb_iter_abort); diff --git a/patches/linux-2.6.11/agpgart.patch b/patches/linux-2.6.11/agpgart.patch deleted file mode 100644 index 87dded22d4..0000000000 --- a/patches/linux-2.6.11/agpgart.patch +++ /dev/null @@ -1,437 +0,0 @@ ---- linux-2.6.11/drivers/char/agp/agp.h 2005-03-02 07:38:07 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/agp.h 2005-03-22 11:14:02 +00:00 -@@ -272,6 +272,8 @@ - #define AGP_GENERIC_SIZES_ENTRIES 11 - extern struct aper_size_info_16 agp3_generic_sizes[]; - -+#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x))) -+#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x))) - - extern int agp_off; - extern int agp_try_unsupported_boot; ---- linux-2.6.11/drivers/char/agp/ali-agp.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/ali-agp.c 2005-03-22 11:14:56 +00:00 -@@ -150,7 +150,7 @@ - pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); - pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, - (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | -- virt_to_phys(addr)) | ALI_CACHE_FLUSH_EN )); -+ virt_to_gart(addr)) | ALI_CACHE_FLUSH_EN )); - return addr; - } - -@@ -174,7 +174,7 @@ - pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); - pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, - (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | -- virt_to_phys(addr)) | ALI_CACHE_FLUSH_EN)); -+ virt_to_gart(addr)) | ALI_CACHE_FLUSH_EN)); - agp_generic_destroy_page(addr); - } - ---- linux-2.6.11/drivers/char/agp/amd-k7-agp.c 2005-03-02 07:38:33 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/amd-k7-agp.c 2005-03-22 
11:14:56 +00:00 -@@ -43,7 +43,7 @@ - - SetPageReserved(virt_to_page(page_map->real)); - global_cache_flush(); -- page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), -+ page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), - PAGE_SIZE); - if (page_map->remapped == NULL) { - ClearPageReserved(virt_to_page(page_map->real)); -@@ -154,7 +154,7 @@ - - agp_bridge->gatt_table_real = (u32 *)page_dir.real; - agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; -- agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); -+ agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); - - /* Get the address for the gart region. - * This is a bus address even on the alpha, b/c its -@@ -167,7 +167,7 @@ - - /* Calculate the agp offset */ - for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { -- writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, -+ writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, - page_dir.remapped+GET_PAGE_DIR_OFF(addr)); - readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ - } ---- linux-2.6.11/drivers/char/agp/amd64-agp.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/amd64-agp.c 2005-03-22 11:14:56 +00:00 -@@ -218,7 +218,7 @@ - - static int amd_8151_configure(void) - { -- unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); -+ unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); - - /* Configure AGP regs in each x86-64 host bridge. 
*/ - for_each_nb() { -@@ -590,7 +590,7 @@ - { - struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - -- release_mem_region(virt_to_phys(bridge->gatt_table_real), -+ release_mem_region(virt_to_gart(bridge->gatt_table_real), - amd64_aperture_sizes[bridge->aperture_size_idx].size); - agp_remove_bridge(bridge); - agp_put_bridge(bridge); ---- linux-2.6.11/drivers/char/agp/ati-agp.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/ati-agp.c 2005-03-22 11:14:56 +00:00 -@@ -61,7 +61,7 @@ - - SetPageReserved(virt_to_page(page_map->real)); - err = map_page_into_agp(virt_to_page(page_map->real)); -- page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), -+ page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), - PAGE_SIZE); - if (page_map->remapped == NULL || err) { - ClearPageReserved(virt_to_page(page_map->real)); ---- linux-2.6.11/drivers/char/agp/backend.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/backend.c 2005-03-22 11:14:56 +00:00 -@@ -142,7 +142,7 @@ - return -ENOMEM; - } - -- bridge->scratch_page_real = virt_to_phys(addr); -+ bridge->scratch_page_real = virt_to_gart(addr); - bridge->scratch_page = - bridge->driver->mask_memory(bridge->scratch_page_real, 0); - } -@@ -186,7 +186,7 @@ - err_out: - if (bridge->driver->needs_scratch_page) - bridge->driver->agp_destroy_page( -- phys_to_virt(bridge->scratch_page_real)); -+ gart_to_virt(bridge->scratch_page_real)); - if (got_gatt) - bridge->driver->free_gatt_table(); - if (got_keylist) { -@@ -211,7 +211,7 @@ - if (bridge->driver->agp_destroy_page && - bridge->driver->needs_scratch_page) - bridge->driver->agp_destroy_page( -- phys_to_virt(bridge->scratch_page_real)); -+ gart_to_virt(bridge->scratch_page_real)); - } - - /* XXX Kludge alert: agpgart isn't ready for multiple bridges yet */ ---- linux-2.6.11/drivers/char/agp/efficeon-agp.c 2005-03-02 07:37:30 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/efficeon-agp.c 2005-03-22 11:15:17 +00:00 -@@ -219,7 
+219,7 @@ - - efficeon_private.l1_table[index] = page; - -- value = __pa(page) | pati | present | index; -+ value = virt_to_gart(page) | pati | present | index; - - pci_write_config_dword(agp_bridge->dev, - EFFICEON_ATTPAGE, value); ---- linux-2.6.11/drivers/char/agp/generic.c 2005-03-02 07:37:55 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/generic.c 2005-03-22 11:17:37 +00:00 -@@ -151,7 +151,7 @@ - } - if (curr->page_count != 0) { - for (i = 0; i < curr->page_count; i++) { -- agp_bridge->driver->agp_destroy_page(phys_to_virt(curr->memory[i])); -+ agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i])); - } - } - agp_free_key(curr->key); -@@ -204,7 +204,7 @@ - agp_free_memory(new); - return NULL; - } -- new->memory[i] = virt_to_phys(addr); -+ new->memory[i] = virt_to_gart(addr); - new->page_count++; - } - -@@ -697,8 +697,7 @@ - break; - } - -- table = (char *) __get_free_pages(GFP_KERNEL, -- page_order); -+ table = alloc_gatt_pages(page_order); - - if (table == NULL) { - i++; -@@ -729,7 +728,7 @@ - size = ((struct aper_size_info_fixed *) temp)->size; - page_order = ((struct aper_size_info_fixed *) temp)->page_order; - num_entries = ((struct aper_size_info_fixed *) temp)->num_entries; -- table = (char *) __get_free_pages(GFP_KERNEL, page_order); -+ table = alloc_gatt_pages(page_order); - } - - if (table == NULL) -@@ -744,7 +743,7 @@ - agp_gatt_table = (void *)table; - - agp_bridge->driver->cache_flush(); -- agp_bridge->gatt_table = ioremap_nocache(virt_to_phys(table), -+ agp_bridge->gatt_table = ioremap_nocache(virt_to_gart(table), - (PAGE_SIZE * (1 << page_order))); - agp_bridge->driver->cache_flush(); - -@@ -752,11 +751,11 @@ - for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) - ClearPageReserved(page); - -- free_pages((unsigned long) table, page_order); -+ free_gatt_pages(table, page_order); - - return -ENOMEM; - } -- agp_bridge->gatt_bus_addr = virt_to_phys(agp_bridge->gatt_table_real); -+ agp_bridge->gatt_bus_addr = 
virt_to_gart(agp_bridge->gatt_table_real); - - /* AK: bogus, should encode addresses > 4GB */ - for (i = 0; i < num_entries; i++) { -@@ -810,7 +809,7 @@ - for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) - ClearPageReserved(page); - -- free_pages((unsigned long) agp_bridge->gatt_table_real, page_order); -+ free_gatt_pages(agp_bridge->gatt_table_real, page_order); - - agp_gatt_table = NULL; - agp_bridge->gatt_table = NULL; ---- linux-2.6.11/drivers/char/agp/hp-agp.c 2005-03-02 07:38:19 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/hp-agp.c 2005-03-22 11:14:56 +00:00 -@@ -110,7 +110,7 @@ - hp->gart_size = HP_ZX1_GART_SIZE; - hp->gatt_entries = hp->gart_size / hp->io_page_size; - -- hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); -+ hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); - hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)]; - - if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) { -@@ -248,7 +248,7 @@ - agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS); - - if (hp->io_pdir_owner) { -- writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); -+ writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); - readl(hp->ioc_regs+HP_ZX1_PDIR_BASE); - writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG); - readl(hp->ioc_regs+HP_ZX1_TCNFG); ---- linux-2.6.11/drivers/char/agp/i460-agp.c 2005-03-02 07:38:10 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/i460-agp.c 2005-03-22 11:14:56 +00:00 -@@ -371,7 +371,7 @@ - } - memset(lp->alloced_map, 0, map_size); - -- lp->paddr = virt_to_phys(lpage); -+ lp->paddr = virt_to_gart(lpage); - lp->refcount = 0; - atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); - return 0; -@@ -382,7 +382,7 @@ - kfree(lp->alloced_map); - lp->alloced_map = NULL; - -- free_pages((unsigned long) phys_to_virt(lp->paddr), I460_IO_PAGE_SHIFT - PAGE_SHIFT); -+ free_pages((unsigned long) gart_to_virt(lp->paddr), I460_IO_PAGE_SHIFT - PAGE_SHIFT); 
- atomic_sub(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); - } - ---- linux-2.6.11/drivers/char/agp/intel-agp.c 2005-03-02 07:38:09 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/intel-agp.c 2005-03-22 11:14:56 +00:00 -@@ -285,7 +285,7 @@ - if (new == NULL) - return NULL; - -- new->memory[0] = virt_to_phys(addr); -+ new->memory[0] = virt_to_gart(addr); - if (pg_count == 4) { - /* kludge to get 4 physical pages for ARGB cursor */ - new->memory[1] = new->memory[0] + PAGE_SIZE; -@@ -328,10 +328,10 @@ - agp_free_key(curr->key); - if(curr->type == AGP_PHYS_MEMORY) { - if (curr->page_count == 4) -- i8xx_destroy_pages(phys_to_virt(curr->memory[0])); -+ i8xx_destroy_pages(gart_to_virt(curr->memory[0])); - else - agp_bridge->driver->agp_destroy_page( -- phys_to_virt(curr->memory[0])); -+ gart_to_virt(curr->memory[0])); - vfree(curr->memory); - } - kfree(curr); ---- linux-2.6.11/drivers/char/agp/intel-mch-agp.c 2005-03-02 07:37:48 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/intel-mch-agp.c 2005-03-22 11:14:56 +00:00 -@@ -51,7 +51,7 @@ - if (new == NULL) - return NULL; - -- new->memory[0] = virt_to_phys(addr); -+ new->memory[0] = virt_to_gart(addr); - new->page_count = 1; - new->num_scratch_pages = 1; - new->type = AGP_PHYS_MEMORY; -@@ -63,7 +63,7 @@ - { - agp_free_key(curr->key); - if(curr->type == AGP_PHYS_MEMORY) { -- agp_bridge->driver->agp_destroy_page(phys_to_virt(curr->memory[0])); -+ agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0])); - vfree(curr->memory); - } - kfree(curr); ---- linux-2.6.11/drivers/char/agp/sworks-agp.c 2005-03-02 07:38:37 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/sworks-agp.c 2005-03-22 11:14:56 +00:00 -@@ -51,7 +51,7 @@ - } - SetPageReserved(virt_to_page(page_map->real)); - global_cache_flush(); -- page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), -+ page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), - PAGE_SIZE); - if (page_map->remapped == NULL) { - 
ClearPageReserved(virt_to_page(page_map->real)); -@@ -162,7 +162,7 @@ - /* Create a fake scratch directory */ - for(i = 0; i < 1024; i++) { - writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); -- writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); -+ writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); - } - - retval = serverworks_create_gatt_pages(value->num_entries / 1024); -@@ -174,7 +174,7 @@ - - agp_bridge->gatt_table_real = (u32 *)page_dir.real; - agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; -- agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); -+ agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); - - /* Get the address for the gart region. - * This is a bus address even on the alpha, b/c its -@@ -187,7 +187,7 @@ - /* Calculate the agp offset */ - - for(i = 0; i < value->num_entries / 1024; i++) -- writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); -+ writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); - - return 0; - } ---- linux-2.6.11/drivers/char/agp/uninorth-agp.c 2005-03-02 07:38:09 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/uninorth-agp.c 2005-03-22 11:14:56 +00:00 -@@ -200,7 +200,7 @@ - - agp_bridge->gatt_table_real = (u32 *) table; - agp_bridge->gatt_table = (u32 *)table; -- agp_bridge->gatt_bus_addr = virt_to_phys(table); -+ agp_bridge->gatt_bus_addr = virt_to_gart(table); - - for (i = 0; i < num_entries; i++) { - agp_bridge->gatt_table[i] = ---- linux-2.6.11/include/asm-alpha/agp.h 2005-03-02 07:37:39 +00:00 -+++ linux-2.6.11-agp/include/asm-alpha/agp.h 2005-03-22 11:18:34 +00:00 -@@ -10,4 +10,14 @@ - #define flush_agp_mappings() - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-i386/agp.h 2005-03-02 07:37:31 +00:00 -+++ linux-2.6.11-agp/include/asm-i386/agp.h 2005-03-22 11:18:39 +00:00 -@@ -21,4 +21,14 @@ - worth it. Would need a page for it. */ - #define flush_agp_cache() asm volatile("wbinvd":::"memory") - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-ia64/agp.h 2005-03-02 07:38:09 +00:00 -+++ linux-2.6.11-agp/include/asm-ia64/agp.h 2005-03-22 11:18:45 +00:00 -@@ -18,4 +18,14 @@ - #define flush_agp_mappings() /* nothing */ - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif /* _ASM_IA64_AGP_H */ ---- linux-2.6.11/include/asm-ppc/agp.h 2005-03-02 07:38:08 +00:00 -+++ linux-2.6.11-agp/include/asm-ppc/agp.h 2005-03-22 11:18:52 +00:00 -@@ -10,4 +10,14 @@ - #define flush_agp_mappings() - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-sparc64/agp.h 2005-03-02 07:37:48 +00:00 -+++ linux-2.6.11-agp/include/asm-sparc64/agp.h 2005-03-22 11:18:59 +00:00 -@@ -8,4 +8,14 @@ - #define flush_agp_mappings() - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-x86_64/agp.h 2005-03-02 07:37:30 +00:00 -+++ linux-2.6.11-agp/include/asm-x86_64/agp.h 2005-03-22 11:18:22 +00:00 -@@ -19,4 +19,14 @@ - worth it. Would need a page for it. */ - #define flush_agp_cache() asm volatile("wbinvd":::"memory") - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif diff --git a/patches/linux-2.6.11/iomap.patch b/patches/linux-2.6.11/iomap.patch deleted file mode 100644 index 81b4f3f2ab..0000000000 --- a/patches/linux-2.6.11/iomap.patch +++ /dev/null @@ -1,120 +0,0 @@ -diff -ur linux-2.6.11/drivers/char/agp/frontend.c linux-2.6.11-io/drivers/char/agp/frontend.c ---- linux-2.6.11/drivers/char/agp/frontend.c 2005-03-02 07:37:49.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/agp/frontend.c 2005-03-15 17:38:30.000000000 +0000 -@@ -627,7 +627,7 @@ - DBG("client vm_ops=%p", kerninfo.vm_ops); - if (kerninfo.vm_ops) { - vma->vm_ops = kerninfo.vm_ops; -- } else if (remap_pfn_range(vma, vma->vm_start, -+ } else if (io_remap_pfn_range(vma, vma->vm_start, - (kerninfo.aper_base + offset) >> PAGE_SHIFT, - size, vma->vm_page_prot)) { - goto out_again; -@@ -643,7 +643,7 @@ - DBG("controller vm_ops=%p", kerninfo.vm_ops); - if (kerninfo.vm_ops) { - vma->vm_ops = kerninfo.vm_ops; -- } else if (remap_pfn_range(vma, vma->vm_start, -+ } else if (io_remap_pfn_range(vma, vma->vm_start, - kerninfo.aper_base >> PAGE_SHIFT, - size, vma->vm_page_prot)) { - goto out_again; -diff -ur linux-2.6.11/drivers/char/drm/drm_vm.c linux-2.6.11-io/drivers/char/drm/drm_vm.c ---- linux-2.6.11/drivers/char/drm/drm_vm.c 2005-03-02 07:38:33.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/drm/drm_vm.c 2005-03-15 17:43:26.000000000 +0000 -@@ -630,7 +630,7 @@ - vma->vm_end - vma->vm_start, - vma->vm_page_prot, 0)) - #else -- if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start, -+ if (io_remap_pfn_range(vma, vma->vm_start, - (VM_OFFSET(vma) + offset) >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) -diff -ur linux-2.6.11/drivers/char/drm/i810_dma.c linux-2.6.11-io/drivers/char/drm/i810_dma.c ---- linux-2.6.11/drivers/char/drm/i810_dma.c 2005-03-02 
07:37:55.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/drm/i810_dma.c 2005-03-15 17:53:36.000000000 +0000 -@@ -139,7 +139,7 @@ - buf_priv->currently_mapped = I810_BUF_MAPPED; - unlock_kernel(); - -- if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start, -+ if (io_remap_pfn_range(vma, vma->vm_start, - VM_OFFSET(vma) >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) return -EAGAIN; -diff -ur linux-2.6.11/drivers/char/drm/i830_dma.c linux-2.6.11-io/drivers/char/drm/i830_dma.c ---- linux-2.6.11/drivers/char/drm/i830_dma.c 2005-03-02 07:37:48.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/drm/i830_dma.c 2005-03-15 17:53:46.000000000 +0000 -@@ -157,7 +157,7 @@ - buf_priv->currently_mapped = I830_BUF_MAPPED; - unlock_kernel(); - -- if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start, -+ if (io_remap_pfn_range(vma, vma->vm_start, - VM_OFFSET(vma) >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) return -EAGAIN; -diff -ur linux-2.6.11/drivers/char/hpet.c linux-2.6.11-io/drivers/char/hpet.c ---- linux-2.6.11/drivers/char/hpet.c 2005-03-02 07:38:10.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/hpet.c 2005-03-15 17:37:22.000000000 +0000 -@@ -76,6 +76,7 @@ - struct hpets { - struct hpets *hp_next; - struct hpet __iomem *hp_hpet; -+ unsigned long hp_hpet_phys; - struct time_interpolator *hp_interpolator; - unsigned long hp_period; - unsigned long hp_delta; -@@ -265,7 +266,7 @@ - return -EINVAL; - - devp = file->private_data; -- addr = (unsigned long)devp->hd_hpet; -+ addr = devp->hd_hpets->hp_hpet_phys; - - if (addr & (PAGE_SIZE - 1)) - return -ENOSYS; -@@ -274,7 +275,7 @@ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - addr = __pa(addr); - -- if (remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, -+ if (io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - printk(KERN_ERR "remap_pfn_range failed in hpet.c\n"); - return -EAGAIN; -@@ -795,6 +796,7 @@ - - hpetp->hp_which = 
hpet_nhpet++; - hpetp->hp_hpet = hdp->hd_address; -+ hpetp->hp_hpet_phys = hdp->hd_phys_address; - - hpetp->hp_ntimer = hdp->hd_nirqs; - -diff -ur linux-2.6.11/drivers/sbus/char/flash.c linux-2.6.11-io/drivers/sbus/char/flash.c ---- linux-2.6.11/drivers/sbus/char/flash.c 2005-03-02 07:38:10.000000000 +0000 -+++ linux-2.6.11-io/drivers/sbus/char/flash.c 2005-03-15 17:20:22.000000000 +0000 -@@ -75,7 +75,7 @@ - pgprot_val(vma->vm_page_prot) |= _PAGE_E; - vma->vm_flags |= (VM_SHM | VM_LOCKED); - -- if (remap_pfn_range(vma, vma->vm_start, addr, size, vma->vm_page_prot)) -+ if (io_remap_pfn_range(vma, vma->vm_start, addr, size, vma->vm_page_prot)) - return -EAGAIN; - - return 0; -diff -ur linux-2.6.11/include/linux/mm.h linux-2.6.11-io/include/linux/mm.h ---- linux-2.6.11/include/linux/mm.h 2005-03-02 07:37:47.000000000 +0000 -+++ linux-2.6.11-io/include/linux/mm.h 2005-03-15 17:03:46.000000000 +0000 -@@ -815,6 +815,10 @@ - extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); - int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); -+/* Allow arch override for mapping of device and I/O (non-RAM) pages. */ -+#ifndef io_remap_pfn_range -+#define io_remap_pfn_range remap_pfn_range -+#endif - - #ifdef CONFIG_PROC_FS - void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); diff --git a/patches/linux-2.6.11/linux-2.6.11.12.patch b/patches/linux-2.6.11/linux-2.6.11.12.patch deleted file mode 100644 index 592ea13001..0000000000 --- a/patches/linux-2.6.11/linux-2.6.11.12.patch +++ /dev/null @@ -1,2579 +0,0 @@ -diff --git a/Documentation/SecurityBugs b/Documentation/SecurityBugs -new file mode 100644 ---- /dev/null -+++ b/Documentation/SecurityBugs -@@ -0,0 +1,38 @@ -+Linux kernel developers take security very seriously. As such, we'd -+like to know when a security bug is found so that it can be fixed and -+disclosed as quickly as possible. 
Please report security bugs to the -+Linux kernel security team. -+ -+1) Contact -+ -+The Linux kernel security team can be contacted by email at -+<security@kernel.org>. This is a private list of security officers -+who will help verify the bug report and develop and release a fix. -+It is possible that the security team will bring in extra help from -+area maintainers to understand and fix the security vulnerability. -+ -+As it is with any bug, the more information provided the easier it -+will be to diagnose and fix. Please review the procedure outlined in -+REPORTING-BUGS if you are unclear about what information is helpful. -+Any exploit code is very helpful and will not be released without -+consent from the reporter unless it has already been made public. -+ -+2) Disclosure -+ -+The goal of the Linux kernel security team is to work with the -+bug submitter to bug resolution as well as disclosure. We prefer -+to fully disclose the bug as soon as possible. It is reasonable to -+delay disclosure when the bug or the fix is not yet fully understood, -+the solution is not well-tested or for vendor coordination. However, we -+expect these delays to be short, measurable in days, not weeks or months. -+A disclosure date is negotiated by the security team working with the -+bug submitter as well as vendors. However, the kernel security team -+holds the final say when setting a disclosure date. The timeframe for -+disclosure is from immediate (esp. if it's already publically known) -+to a few weeks. As a basic default policy, we expect report date to -+disclosure date to be on the order of 7 days. -+ -+3) Non-disclosure agreements -+ -+The Linux kernel security team is not a formal body and therefore unable -+to enter any non-disclosure agreements. 
-diff --git a/MAINTAINERS b/MAINTAINERS ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -1966,6 +1966,11 @@ M: christer@weinigel.se - W: http://www.weinigel.se - S: Supported - -+SECURITY CONTACT -+P: Security Officers -+M: security@kernel.org -+S: Supported -+ - SELINUX SECURITY MODULE - P: Stephen Smalley - M: sds@epoch.ncsc.mil -diff --git a/Makefile b/Makefile ---- a/Makefile -+++ b/Makefile -@@ -1,8 +1,8 @@ - VERSION = 2 - PATCHLEVEL = 6 - SUBLEVEL = 11 --EXTRAVERSION = --NAME=Woozy Numbat -+EXTRAVERSION = .12 -+NAME=Woozy Beaver - - # *DOCUMENTATION* - # To see a list of typical targets execute "make help" -diff --git a/REPORTING-BUGS b/REPORTING-BUGS ---- a/REPORTING-BUGS -+++ b/REPORTING-BUGS -@@ -16,6 +16,10 @@ code relevant to what you were doing. If - describe how to recreate it. That is worth even more than the oops itself. - The list of maintainers is in the MAINTAINERS file in this directory. - -+ If it is a security bug, please copy the Security Contact listed -+in the MAINTAINERS file. They can help coordinate bugfix and disclosure. -+See Documentation/SecurityBugs for more infomation. -+ - If you are totally stumped as to whom to send the report, send it to - linux-kernel@vger.kernel.org. (For more information on the linux-kernel - mailing list see http://www.tux.org/lkml/). 
-diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S ---- a/arch/ia64/kernel/fsys.S -+++ b/arch/ia64/kernel/fsys.S -@@ -611,8 +611,10 @@ GLOBAL_ENTRY(fsys_bubble_down) - movl r2=ia64_ret_from_syscall - ;; - mov rp=r2 // set the real return addr -- tbit.z p8,p0=r3,TIF_SYSCALL_TRACE -+ and r3=_TIF_SYSCALL_TRACEAUDIT,r3 - ;; -+ cmp.eq p8,p0=r3,r0 -+ - (p10) br.cond.spnt.many ia64_ret_from_syscall // p10==true means out registers are more than 8 - (p8) br.call.sptk.many b6=b6 // ignore this return addr - br.cond.sptk ia64_trace_syscall -diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c ---- a/arch/ia64/kernel/signal.c -+++ b/arch/ia64/kernel/signal.c -@@ -224,7 +224,8 @@ ia64_rt_sigreturn (struct sigscratch *sc - * could be corrupted. - */ - retval = (long) &ia64_leave_kernel; -- if (test_thread_flag(TIF_SYSCALL_TRACE)) -+ if (test_thread_flag(TIF_SYSCALL_TRACE) -+ || test_thread_flag(TIF_SYSCALL_AUDIT)) - /* - * strace expects to be notified after sigreturn returns even though the - * context to which we return may not be in the middle of a syscall. 
-diff --git a/arch/ppc/oprofile/op_model_fsl_booke.c b/arch/ppc/oprofile/op_model_fsl_booke.c ---- a/arch/ppc/oprofile/op_model_fsl_booke.c -+++ b/arch/ppc/oprofile/op_model_fsl_booke.c -@@ -150,7 +150,6 @@ static void fsl_booke_handle_interrupt(s - int is_kernel; - int val; - int i; -- unsigned int cpu = smp_processor_id(); - - /* set the PMM bit (see comment below) */ - mtmsr(mfmsr() | MSR_PMM); -@@ -162,7 +161,7 @@ static void fsl_booke_handle_interrupt(s - val = ctr_read(i); - if (val < 0) { - if (oprofile_running && ctr[i].enabled) { -- oprofile_add_sample(pc, is_kernel, i, cpu); -+ oprofile_add_pc(pc, is_kernel, i); - ctr_write(i, reset_value[i]); - } else { - ctr_write(i, 0); -diff --git a/arch/ppc/platforms/4xx/ebony.h b/arch/ppc/platforms/4xx/ebony.h ---- a/arch/ppc/platforms/4xx/ebony.h -+++ b/arch/ppc/platforms/4xx/ebony.h -@@ -61,8 +61,8 @@ - */ - - /* OpenBIOS defined UART mappings, used before early_serial_setup */ --#define UART0_IO_BASE (u8 *) 0xE0000200 --#define UART1_IO_BASE (u8 *) 0xE0000300 -+#define UART0_IO_BASE 0xE0000200 -+#define UART1_IO_BASE 0xE0000300 - - /* external Epson SG-615P */ - #define BASE_BAUD 691200 -diff --git a/arch/ppc/platforms/4xx/luan.h b/arch/ppc/platforms/4xx/luan.h ---- a/arch/ppc/platforms/4xx/luan.h -+++ b/arch/ppc/platforms/4xx/luan.h -@@ -47,9 +47,9 @@ - #define RS_TABLE_SIZE 3 - - /* PIBS defined UART mappings, used before early_serial_setup */ --#define UART0_IO_BASE (u8 *) 0xa0000200 --#define UART1_IO_BASE (u8 *) 0xa0000300 --#define UART2_IO_BASE (u8 *) 0xa0000600 -+#define UART0_IO_BASE 0xa0000200 -+#define UART1_IO_BASE 0xa0000300 -+#define UART2_IO_BASE 0xa0000600 - - #define BASE_BAUD 11059200 - #define STD_UART_OP(num) \ -diff --git a/arch/ppc/platforms/4xx/ocotea.h b/arch/ppc/platforms/4xx/ocotea.h ---- a/arch/ppc/platforms/4xx/ocotea.h -+++ b/arch/ppc/platforms/4xx/ocotea.h -@@ -56,8 +56,8 @@ - #define RS_TABLE_SIZE 2 - - /* OpenBIOS defined UART mappings, used before early_serial_setup */ --#define 
UART0_IO_BASE (u8 *) 0xE0000200 --#define UART1_IO_BASE (u8 *) 0xE0000300 -+#define UART0_IO_BASE 0xE0000200 -+#define UART1_IO_BASE 0xE0000300 - - #define BASE_BAUD 11059200/16 - #define STD_UART_OP(num) \ -diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c ---- a/arch/ppc64/kernel/pSeries_iommu.c -+++ b/arch/ppc64/kernel/pSeries_iommu.c -@@ -401,6 +401,8 @@ static void iommu_bus_setup_pSeriesLP(st - struct device_node *dn, *pdn; - unsigned int *dma_window = NULL; - -+ DBG("iommu_bus_setup_pSeriesLP, bus %p, bus->self %p\n", bus, bus->self); -+ - dn = pci_bus_to_OF_node(bus); - - /* Find nearest ibm,dma-window, walking up the device tree */ -@@ -455,6 +457,56 @@ static void iommu_dev_setup_pSeries(stru - } - } - -+static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev) -+{ -+ struct device_node *pdn, *dn; -+ struct iommu_table *tbl; -+ int *dma_window = NULL; -+ -+ DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, dev->pretty_name); -+ -+ /* dev setup for LPAR is a little tricky, since the device tree might -+ * contain the dma-window properties per-device and not neccesarily -+ * for the bus. So we need to search upwards in the tree until we -+ * either hit a dma-window property, OR find a parent with a table -+ * already allocated. -+ */ -+ dn = pci_device_to_OF_node(dev); -+ -+ for (pdn = dn; pdn && !pdn->iommu_table; pdn = pdn->parent) { -+ dma_window = (unsigned int *)get_property(pdn, "ibm,dma-window", NULL); -+ if (dma_window) -+ break; -+ } -+ -+ /* Check for parent == NULL so we don't try to setup the empty EADS -+ * slots on POWER4 machines. -+ */ -+ if (dma_window == NULL || pdn->parent == NULL) { -+ /* Fall back to regular (non-LPAR) dev setup */ -+ DBG("No dma window for device, falling back to regular setup\n"); -+ iommu_dev_setup_pSeries(dev); -+ return; -+ } else { -+ DBG("Found DMA window, allocating table\n"); -+ } -+ -+ if (!pdn->iommu_table) { -+ /* iommu_table_setparms_lpar needs bussubno. 
*/ -+ pdn->bussubno = pdn->phb->bus->number; -+ -+ tbl = (struct iommu_table *)kmalloc(sizeof(struct iommu_table), -+ GFP_KERNEL); -+ -+ iommu_table_setparms_lpar(pdn->phb, pdn, tbl, dma_window); -+ -+ pdn->iommu_table = iommu_init_table(tbl); -+ } -+ -+ if (pdn != dn) -+ dn->iommu_table = pdn->iommu_table; -+} -+ - static void iommu_bus_setup_null(struct pci_bus *b) { } - static void iommu_dev_setup_null(struct pci_dev *d) { } - -@@ -479,13 +531,14 @@ void iommu_init_early_pSeries(void) - ppc_md.tce_free = tce_free_pSeriesLP; - } - ppc_md.iommu_bus_setup = iommu_bus_setup_pSeriesLP; -+ ppc_md.iommu_dev_setup = iommu_dev_setup_pSeriesLP; - } else { - ppc_md.tce_build = tce_build_pSeries; - ppc_md.tce_free = tce_free_pSeries; - ppc_md.iommu_bus_setup = iommu_bus_setup_pSeries; -+ ppc_md.iommu_dev_setup = iommu_dev_setup_pSeries; - } - -- ppc_md.iommu_dev_setup = iommu_dev_setup_pSeries; - - pci_iommu_init(); - } -diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c ---- a/arch/sparc/kernel/ptrace.c -+++ b/arch/sparc/kernel/ptrace.c -@@ -531,18 +531,6 @@ asmlinkage void do_ptrace(struct pt_regs - pt_error_return(regs, EIO); - goto out_tsk; - } -- if (addr != 1) { -- if (addr & 3) { -- pt_error_return(regs, EINVAL); -- goto out_tsk; -- } --#ifdef DEBUG_PTRACE -- printk ("Original: %08lx %08lx\n", child->thread.kregs->pc, child->thread.kregs->npc); -- printk ("Continuing with %08lx %08lx\n", addr, addr+4); --#endif -- child->thread.kregs->pc = addr; -- child->thread.kregs->npc = addr + 4; -- } - - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); -diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c ---- a/arch/sparc64/kernel/ptrace.c -+++ b/arch/sparc64/kernel/ptrace.c -@@ -514,25 +514,6 @@ asmlinkage void do_ptrace(struct pt_regs - pt_error_return(regs, EIO); - goto out_tsk; - } -- if (addr != 1) { -- unsigned long pc_mask = ~0UL; -- -- if ((child->thread_info->flags & _TIF_32BIT) != 0) -- pc_mask = 
0xffffffff; -- -- if (addr & 3) { -- pt_error_return(regs, EINVAL); -- goto out_tsk; -- } --#ifdef DEBUG_PTRACE -- printk ("Original: %016lx %016lx\n", -- child->thread_info->kregs->tpc, -- child->thread_info->kregs->tnpc); -- printk ("Continuing with %016lx %016lx\n", addr, addr+4); --#endif -- child->thread_info->kregs->tpc = (addr & pc_mask); -- child->thread_info->kregs->tnpc = ((addr + 4) & pc_mask); -- } - - if (request == PTRACE_SYSCALL) { - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); -diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c ---- a/arch/sparc64/kernel/signal32.c -+++ b/arch/sparc64/kernel/signal32.c -@@ -192,10 +192,13 @@ int copy_siginfo_to_user32(compat_siginf - err |= __put_user(from->si_uid, &to->si_uid); - break; - case __SI_FAULT >> 16: -- case __SI_POLL >> 16: - err |= __put_user(from->si_trapno, &to->si_trapno); - err |= __put_user((unsigned long)from->si_addr, &to->si_addr); - break; -+ case __SI_POLL >> 16: -+ err |= __put_user(from->si_band, &to->si_band); -+ err |= __put_user(from->si_fd, &to->si_fd); -+ break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: - err |= __put_user(from->si_pid, &to->si_pid); -diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S ---- a/arch/sparc64/kernel/systbls.S -+++ b/arch/sparc64/kernel/systbls.S -@@ -75,7 +75,7 @@ sys_call_table32: - /*260*/ .word compat_sys_sched_getaffinity, compat_sys_sched_setaffinity, sys32_timer_settime, compat_sys_timer_gettime, sys_timer_getoverrun - .word sys_timer_delete, sys32_timer_create, sys_ni_syscall, compat_sys_io_setup, sys_io_destroy - /*270*/ .word sys32_io_submit, sys_io_cancel, compat_sys_io_getevents, sys32_mq_open, sys_mq_unlink -- .word sys_mq_timedsend, sys_mq_timedreceive, compat_sys_mq_notify, compat_sys_mq_getsetattr, compat_sys_waitid -+ .word compat_sys_mq_timedsend, compat_sys_mq_timedreceive, compat_sys_mq_notify, compat_sys_mq_getsetattr, compat_sys_waitid - /*280*/ .word sys_ni_syscall, sys_add_key, sys_request_key, sys_keyctl - - #endif /* CONFIG_COMPAT */ -diff --git a/arch/um/include/sysdep-i386/syscalls.h b/arch/um/include/sysdep-i386/syscalls.h ---- a/arch/um/include/sysdep-i386/syscalls.h -+++ b/arch/um/include/sysdep-i386/syscalls.h -@@ -23,6 +23,9 @@ extern long sys_mmap2(unsigned long addr - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); - -+/* On i386 they choose a meaningless naming.*/ -+#define __NR_kexec_load __NR_sys_kexec_load -+ - #define ARCH_SYSCALLS \ - [ __NR_waitpid ] = (syscall_handler_t *) sys_waitpid, \ - [ __NR_break ] = (syscall_handler_t *) sys_ni_syscall, \ -@@ -101,15 +104,12 @@ extern long sys_mmap2(unsigned long addr - [ 223 ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_set_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_get_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_fadvise64 ] = (syscall_handler_t *) sys_fadvise64, \ - [ 251 ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_remap_file_pages ] = (syscall_handler_t *) sys_remap_file_pages, \ -- [ __NR_utimes ] = 
(syscall_handler_t *) sys_utimes, \ -- [ __NR_vserver ] = (syscall_handler_t *) sys_ni_syscall, -- -+ [ 285 ] = (syscall_handler_t *) sys_ni_syscall, -+ - /* 222 doesn't yet have a name in include/asm-i386/unistd.h */ - --#define LAST_ARCH_SYSCALL __NR_vserver -+#define LAST_ARCH_SYSCALL 285 - - /* - * Overrides for Emacs so that we follow Linus's tabbing style. -diff --git a/arch/um/include/sysdep-x86_64/syscalls.h b/arch/um/include/sysdep-x86_64/syscalls.h ---- a/arch/um/include/sysdep-x86_64/syscalls.h -+++ b/arch/um/include/sysdep-x86_64/syscalls.h -@@ -71,12 +71,7 @@ extern syscall_handler_t sys_arch_prctl; - [ __NR_iopl ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_set_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_get_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_remap_file_pages ] = (syscall_handler_t *) sys_remap_file_pages, \ - [ __NR_semtimedop ] = (syscall_handler_t *) sys_semtimedop, \ -- [ __NR_fadvise64 ] = (syscall_handler_t *) sys_fadvise64, \ -- [ 223 ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_utimes ] = (syscall_handler_t *) sys_utimes, \ -- [ __NR_vserver ] = (syscall_handler_t *) sys_ni_syscall, \ - [ 251 ] = (syscall_handler_t *) sys_ni_syscall, - - #define LAST_ARCH_SYSCALL 251 -diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c ---- a/arch/um/kernel/skas/uaccess.c -+++ b/arch/um/kernel/skas/uaccess.c -@@ -61,7 +61,8 @@ static void do_buffer_op(void *jmpbuf, v - void *arg; - int *res; - -- va_copy(args, *(va_list *)arg_ptr); -+ /* Some old gccs recognize __va_copy, but not va_copy */ -+ __va_copy(args, *(va_list *)arg_ptr); - addr = va_arg(args, unsigned long); - len = va_arg(args, int); - is_write = va_arg(args, int); -diff --git a/arch/um/kernel/sys_call_table.c b/arch/um/kernel/sys_call_table.c ---- a/arch/um/kernel/sys_call_table.c -+++ b/arch/um/kernel/sys_call_table.c -@@ -48,7 +48,6 @@ extern syscall_handler_t sys_vfork; - extern syscall_handler_t 
old_select; - extern syscall_handler_t sys_modify_ldt; - extern syscall_handler_t sys_rt_sigsuspend; --extern syscall_handler_t sys_vserver; - extern syscall_handler_t sys_mbind; - extern syscall_handler_t sys_get_mempolicy; - extern syscall_handler_t sys_set_mempolicy; -@@ -242,6 +241,7 @@ syscall_handler_t *sys_call_table[] = { - [ __NR_epoll_create ] = (syscall_handler_t *) sys_epoll_create, - [ __NR_epoll_ctl ] = (syscall_handler_t *) sys_epoll_ctl, - [ __NR_epoll_wait ] = (syscall_handler_t *) sys_epoll_wait, -+ [ __NR_remap_file_pages ] = (syscall_handler_t *) sys_remap_file_pages, - [ __NR_set_tid_address ] = (syscall_handler_t *) sys_set_tid_address, - [ __NR_timer_create ] = (syscall_handler_t *) sys_timer_create, - [ __NR_timer_settime ] = (syscall_handler_t *) sys_timer_settime, -@@ -252,12 +252,10 @@ syscall_handler_t *sys_call_table[] = { - [ __NR_clock_gettime ] = (syscall_handler_t *) sys_clock_gettime, - [ __NR_clock_getres ] = (syscall_handler_t *) sys_clock_getres, - [ __NR_clock_nanosleep ] = (syscall_handler_t *) sys_clock_nanosleep, -- [ __NR_statfs64 ] = (syscall_handler_t *) sys_statfs64, -- [ __NR_fstatfs64 ] = (syscall_handler_t *) sys_fstatfs64, - [ __NR_tgkill ] = (syscall_handler_t *) sys_tgkill, - [ __NR_utimes ] = (syscall_handler_t *) sys_utimes, -- [ __NR_fadvise64_64 ] = (syscall_handler_t *) sys_fadvise64_64, -- [ __NR_vserver ] = (syscall_handler_t *) sys_vserver, -+ [ __NR_fadvise64 ] = (syscall_handler_t *) sys_fadvise64, -+ [ __NR_vserver ] = (syscall_handler_t *) sys_ni_syscall, - [ __NR_mbind ] = (syscall_handler_t *) sys_mbind, - [ __NR_get_mempolicy ] = (syscall_handler_t *) sys_get_mempolicy, - [ __NR_set_mempolicy ] = (syscall_handler_t *) sys_set_mempolicy, -@@ -267,9 +265,8 @@ syscall_handler_t *sys_call_table[] = { - [ __NR_mq_timedreceive ] = (syscall_handler_t *) sys_mq_timedreceive, - [ __NR_mq_notify ] = (syscall_handler_t *) sys_mq_notify, - [ __NR_mq_getsetattr ] = (syscall_handler_t *) sys_mq_getsetattr, -- [ 
__NR_sys_kexec_load ] = (syscall_handler_t *) sys_ni_syscall, -+ [ __NR_kexec_load ] = (syscall_handler_t *) sys_ni_syscall, - [ __NR_waitid ] = (syscall_handler_t *) sys_waitid, -- [ 285 ] = (syscall_handler_t *) sys_ni_syscall, - [ __NR_add_key ] = (syscall_handler_t *) sys_add_key, - [ __NR_request_key ] = (syscall_handler_t *) sys_request_key, - [ __NR_keyctl ] = (syscall_handler_t *) sys_keyctl, -diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c ---- a/arch/x86_64/kernel/apic.c -+++ b/arch/x86_64/kernel/apic.c -@@ -775,9 +775,7 @@ void __init setup_boot_APIC_clock (void) - - void __init setup_secondary_APIC_clock(void) - { -- local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); -- local_irq_enable(); - } - - void __init disable_APIC_timer(void) -diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c ---- a/arch/x86_64/kernel/ptrace.c -+++ b/arch/x86_64/kernel/ptrace.c -@@ -129,13 +129,13 @@ static int putreg(struct task_struct *ch - value &= 0xffff; - return 0; - case offsetof(struct user_regs_struct,fs_base): -- if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) -- return -EIO; -+ if (value >= TASK_SIZE) -+ return -EIO; - child->thread.fs = value; - return 0; - case offsetof(struct user_regs_struct,gs_base): -- if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) -- return -EIO; -+ if (value >= TASK_SIZE) -+ return -EIO; - child->thread.gs = value; - return 0; - case offsetof(struct user_regs_struct, eflags): -@@ -149,6 +149,11 @@ static int putreg(struct task_struct *ch - return -EIO; - value &= 0xffff; - break; -+ case offsetof(struct user_regs_struct, rip): -+ /* Check if the new RIP address is canonical */ -+ if (value >= TASK_SIZE) -+ return -EIO; -+ break; - } - put_stack_long(child, regno - sizeof(struct pt_regs), value); - return 0; -@@ -247,7 +252,7 @@ asmlinkage long sys_ptrace(long request, - break; - - switch (addr) { -- case 0 ... 
sizeof(struct user_regs_struct): -+ case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - tmp = getreg(child, addr); - break; - case offsetof(struct user, u_debugreg[0]): -@@ -292,7 +297,7 @@ asmlinkage long sys_ptrace(long request, - break; - - switch (addr) { -- case 0 ... sizeof(struct user_regs_struct): -+ case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - ret = putreg(child, addr, data); - break; - /* Disallows to set a breakpoint into the vsyscall */ -diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c ---- a/arch/x86_64/kernel/smpboot.c -+++ b/arch/x86_64/kernel/smpboot.c -@@ -309,8 +309,6 @@ void __init smp_callin(void) - Dprintk("CALLIN, before setup_local_APIC().\n"); - setup_local_APIC(); - -- local_irq_enable(); -- - /* - * Get our bogomips. - */ -@@ -324,8 +322,6 @@ void __init smp_callin(void) - */ - smp_store_cpu_info(cpuid); - -- local_irq_disable(); -- - /* - * Allow the master to continue. - */ -diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c ---- a/arch/x86_64/mm/fault.c -+++ b/arch/x86_64/mm/fault.c -@@ -236,6 +236,8 @@ static noinline void pgtable_bad(unsigne - - /* - * Handle a fault on the vmalloc or module mapping area -+ * -+ * This assumes no large pages in there. - */ - static int vmalloc_fault(unsigned long address) - { -@@ -274,7 +276,10 @@ static int vmalloc_fault(unsigned long a - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); -- if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) -+ /* Don't use pte_page here, because the mappings can point -+ outside mem_map, and the NUMA hash lookup cannot handle -+ that. */ -+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - __flush_tlb_all(); - return 0; -@@ -348,7 +353,9 @@ asmlinkage void do_page_fault(struct pt_ - * protection error (error_code & 1) == 0. 
- */ - if (unlikely(address >= TASK_SIZE)) { -- if (!(error_code & 5)) { -+ if (!(error_code & 5) && -+ ((address >= VMALLOC_START && address < VMALLOC_END) || -+ (address >= MODULES_VADDR && address < MODULES_END))) { - if (vmalloc_fault(address) < 0) - goto bad_area_nosemaphore; - return; -diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c ---- a/arch/x86_64/mm/ioremap.c -+++ b/arch/x86_64/mm/ioremap.c -@@ -266,7 +266,7 @@ void iounmap(volatile void __iomem *addr - if ((p->flags >> 20) && - p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) { - /* p->size includes the guard page, but cpa doesn't like that */ -- change_page_attr(virt_to_page(__va(p->phys_addr)), -+ change_page_attr_addr((unsigned long)(__va(p->phys_addr)), - (p->size - PAGE_SIZE) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); -diff --git a/drivers/block/ioctl.c b/drivers/block/ioctl.c ---- a/drivers/block/ioctl.c -+++ b/drivers/block/ioctl.c -@@ -237,3 +237,5 @@ long compat_blkdev_ioctl(struct file *fi - } - return ret; - } -+ -+EXPORT_SYMBOL_GPL(blkdev_ioctl); -diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c ---- a/drivers/block/pktcdvd.c -+++ b/drivers/block/pktcdvd.c -@@ -2400,7 +2400,7 @@ static int pkt_ioctl(struct inode *inode - case CDROM_LAST_WRITTEN: - case CDROM_SEND_PACKET: - case SCSI_IOCTL_SEND_COMMAND: -- return ioctl_by_bdev(pd->bdev, cmd, arg); -+ return blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); - - case CDROMEJECT: - /* -@@ -2408,7 +2408,7 @@ static int pkt_ioctl(struct inode *inode - * have to unlock it or else the eject command fails. 
- */ - pkt_lock_door(pd, 0); -- return ioctl_by_bdev(pd->bdev, cmd, arg); -+ return blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); - - default: - printk("pktcdvd: Unknown ioctl for %s (%x)\n", pd->name, cmd); -diff --git a/drivers/char/drm/drm_ioctl.c b/drivers/char/drm/drm_ioctl.c ---- a/drivers/char/drm/drm_ioctl.c -+++ b/drivers/char/drm/drm_ioctl.c -@@ -326,6 +326,8 @@ int drm_setversion(DRM_IOCTL_ARGS) - - DRM_COPY_FROM_USER_IOCTL(sv, argp, sizeof(sv)); - -+ memset(&version, 0, sizeof(version)); -+ - dev->driver->version(&version); - retv.drm_di_major = DRM_IF_MAJOR; - retv.drm_di_minor = DRM_IF_MINOR; -diff --git a/drivers/char/raw.c b/drivers/char/raw.c ---- a/drivers/char/raw.c -+++ b/drivers/char/raw.c -@@ -122,7 +122,7 @@ raw_ioctl(struct inode *inode, struct fi - { - struct block_device *bdev = filp->private_data; - -- return ioctl_by_bdev(bdev, command, arg); -+ return blkdev_ioctl(bdev->bd_inode, filp, command, arg); - } - - static void bind_device(struct raw_config_request *rq) -diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c ---- a/drivers/i2c/chips/eeprom.c -+++ b/drivers/i2c/chips/eeprom.c -@@ -130,7 +130,8 @@ static ssize_t eeprom_read(struct kobjec - - /* Hide Vaio security settings to regular users (16 first bytes) */ - if (data->nature == VAIO && off < 16 && !capable(CAP_SYS_ADMIN)) { -- int in_row1 = 16 - off; -+ size_t in_row1 = 16 - off; -+ in_row1 = min(in_row1, count); - memset(buf, 0, in_row1); - if (count - in_row1 > 0) - memcpy(buf + in_row1, &data->data[16], count - in_row1); -diff --git a/drivers/i2c/chips/it87.c b/drivers/i2c/chips/it87.c ---- a/drivers/i2c/chips/it87.c -+++ b/drivers/i2c/chips/it87.c -@@ -631,7 +631,7 @@ static ssize_t show_alarms(struct device - struct it87_data *data = it87_update_device(dev); - return sprintf(buf,"%d\n", ALARMS_FROM_REG(data->alarms)); - } --static DEVICE_ATTR(alarms, S_IRUGO | S_IWUSR, show_alarms, NULL); -+static DEVICE_ATTR(alarms, S_IRUGO, show_alarms, NULL); - - 
static ssize_t - show_vrm_reg(struct device *dev, char *buf) -diff --git a/drivers/i2c/chips/via686a.c b/drivers/i2c/chips/via686a.c ---- a/drivers/i2c/chips/via686a.c -+++ b/drivers/i2c/chips/via686a.c -@@ -554,7 +554,7 @@ static ssize_t show_alarms(struct device - struct via686a_data *data = via686a_update_device(dev); - return sprintf(buf,"%d\n", ALARMS_FROM_REG(data->alarms)); - } --static DEVICE_ATTR(alarms, S_IRUGO | S_IWUSR, show_alarms, NULL); -+static DEVICE_ATTR(alarms, S_IRUGO, show_alarms, NULL); - - /* The driver. I choose to use type i2c_driver, as at is identical to both - smbus_driver and isa_driver, and clients could be of either kind */ -diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c ---- a/drivers/ide/ide-disk.c -+++ b/drivers/ide/ide-disk.c -@@ -133,6 +133,8 @@ static ide_startstop_t __ide_do_rw_disk( - if (hwif->no_lba48_dma && lba48 && dma) { - if (block + rq->nr_sectors > 1ULL << 28) - dma = 0; -+ else -+ lba48 = 0; - } - - if (!dma) { -@@ -146,7 +148,7 @@ static ide_startstop_t __ide_do_rw_disk( - /* FIXME: SELECT_MASK(drive, 0) ? 
*/ - - if (drive->select.b.lba) { -- if (drive->addressing == 1) { -+ if (lba48) { - task_ioreg_t tasklets[10]; - - pr_debug("%s: LBA=0x%012llx\n", drive->name, block); -diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h ---- a/drivers/input/serio/i8042-x86ia64io.h -+++ b/drivers/input/serio/i8042-x86ia64io.h -@@ -88,7 +88,7 @@ static struct dmi_system_id __initdata i - }; - #endif - --#ifdef CONFIG_ACPI -+#if defined(__ia64__) && defined(CONFIG_ACPI) - #include <linux/acpi.h> - #include <acpi/acpi_bus.h> - -@@ -281,7 +281,7 @@ static inline int i8042_platform_init(vo - i8042_kbd_irq = I8042_MAP_IRQ(1); - i8042_aux_irq = I8042_MAP_IRQ(12); - --#ifdef CONFIG_ACPI -+#if defined(__ia64__) && defined(CONFIG_ACPI) - if (i8042_acpi_init()) - return -1; - #endif -@@ -300,7 +300,7 @@ static inline int i8042_platform_init(vo - - static inline void i8042_platform_exit(void) - { --#ifdef CONFIG_ACPI -+#if defined(__ia64__) && defined(CONFIG_ACPI) - i8042_acpi_exit(); - #endif - } -diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc ---- a/drivers/md/raid6altivec.uc -+++ b/drivers/md/raid6altivec.uc -@@ -108,7 +108,11 @@ int raid6_have_altivec(void); - int raid6_have_altivec(void) - { - /* This assumes either all CPUs have Altivec or none does */ -+#ifdef CONFIG_PPC64 - return cur_cpu_spec->cpu_features & CPU_FTR_ALTIVEC; -+#else -+ return cur_cpu_spec[0]->cpu_features & CPU_FTR_ALTIVEC; -+#endif - } - #endif - -diff --git a/drivers/media/video/adv7170.c b/drivers/media/video/adv7170.c ---- a/drivers/media/video/adv7170.c -+++ b/drivers/media/video/adv7170.c -@@ -130,7 +130,7 @@ adv7170_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/adv7175.c b/drivers/media/video/adv7175.c ---- a/drivers/media/video/adv7175.c -+++ 
b/drivers/media/video/adv7175.c -@@ -126,7 +126,7 @@ adv7175_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/bt819.c b/drivers/media/video/bt819.c ---- a/drivers/media/video/bt819.c -+++ b/drivers/media/video/bt819.c -@@ -146,7 +146,7 @@ bt819_write_block (struct i2c_client *cl - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/bttv-cards.c b/drivers/media/video/bttv-cards.c ---- a/drivers/media/video/bttv-cards.c -+++ b/drivers/media/video/bttv-cards.c -@@ -1939,7 +1939,6 @@ struct tvcard bttv_tvcards[] = { - .no_tda9875 = 1, - .no_tda7432 = 1, - .tuner_type = TUNER_ABSENT, -- .no_video = 1, - .pll = PLL_28, - },{ - .name = "Teppro TEV-560/InterVision IV-560", -@@ -2718,8 +2717,6 @@ void __devinit bttv_init_card2(struct bt - } - btv->pll.pll_current = -1; - -- bttv_reset_audio(btv); -- - /* tuner configuration (from card list / autodetect / insmod option) */ - if (UNSET != bttv_tvcards[btv->c.type].tuner_type) - if(UNSET == btv->tuner_type) -diff --git a/drivers/media/video/saa7110.c b/drivers/media/video/saa7110.c ---- a/drivers/media/video/saa7110.c -+++ b/drivers/media/video/saa7110.c -@@ -60,8 +60,10 @@ MODULE_PARM_DESC(debug, "Debug level (0- - - #define I2C_SAA7110 0x9C /* or 0x9E */ - -+#define SAA7110_NR_REG 0x35 -+ - struct saa7110 { -- unsigned char reg[54]; -+ u8 reg[SAA7110_NR_REG]; - - int norm; - int input; -@@ -95,31 +97,28 @@ saa7110_write_block (struct i2c_client * - unsigned int len) - { - int ret = -1; -- u8 reg = *data++; -+ u8 reg = *data; /* first register to write to */ - -- len--; -+ /* Sanity check */ -+ if (reg + (len - 1) > SAA7110_NR_REG) -+ return ret; - - /* the saa7110 has an autoincrement function, use 
it if - * the adapter understands raw I2C */ - if (i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { - struct saa7110 *decoder = i2c_get_clientdata(client); - struct i2c_msg msg; -- u8 block_data[54]; - -- msg.len = 0; -- msg.buf = (char *) block_data; -+ msg.len = len; -+ msg.buf = (char *) data; - msg.addr = client->addr; -- msg.flags = client->flags; -- while (len >= 1) { -- msg.len = 0; -- block_data[msg.len++] = reg; -- while (len-- >= 1 && msg.len < 54) -- block_data[msg.len++] = -- decoder->reg[reg++] = *data++; -- ret = i2c_transfer(client->adapter, &msg, 1); -- } -+ msg.flags = 0; -+ ret = i2c_transfer(client->adapter, &msg, 1); -+ -+ /* Cache the written data */ -+ memcpy(decoder->reg + reg, data + 1, len - 1); - } else { -- while (len-- >= 1) { -+ for (++data, --len; len; len--) { - if ((ret = saa7110_write(client, reg++, - *data++)) < 0) - break; -@@ -192,7 +191,7 @@ saa7110_selmux (struct i2c_client *clien - return 0; - } - --static const unsigned char initseq[] = { -+static const unsigned char initseq[1 + SAA7110_NR_REG] = { - 0, 0x4C, 0x3C, 0x0D, 0xEF, 0xBD, 0xF2, 0x03, 0x00, - /* 0x08 */ 0xF8, 0xF8, 0x60, 0x60, 0x00, 0x86, 0x18, 0x90, - /* 0x10 */ 0x00, 0x59, 0x40, 0x46, 0x42, 0x1A, 0xFF, 0xDA, -diff --git a/drivers/media/video/saa7114.c b/drivers/media/video/saa7114.c ---- a/drivers/media/video/saa7114.c -+++ b/drivers/media/video/saa7114.c -@@ -163,7 +163,7 @@ saa7114_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/saa7185.c b/drivers/media/video/saa7185.c ---- a/drivers/media/video/saa7185.c -+++ b/drivers/media/video/saa7185.c -@@ -118,7 +118,7 @@ saa7185_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; 
-diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c ---- a/drivers/net/3c59x.c -+++ b/drivers/net/3c59x.c -@@ -1581,7 +1581,8 @@ vortex_up(struct net_device *dev) - - if (VORTEX_PCI(vp)) { - pci_set_power_state(VORTEX_PCI(vp), PCI_D0); /* Go active */ -- pci_restore_state(VORTEX_PCI(vp)); -+ if (vp->pm_state_valid) -+ pci_restore_state(VORTEX_PCI(vp)); - pci_enable_device(VORTEX_PCI(vp)); - } - -@@ -2741,6 +2742,7 @@ vortex_down(struct net_device *dev, int - outl(0, ioaddr + DownListPtr); - - if (final_down && VORTEX_PCI(vp)) { -+ vp->pm_state_valid = 1; - pci_save_state(VORTEX_PCI(vp)); - acpi_set_WOL(dev); - } -@@ -3243,9 +3245,10 @@ static void acpi_set_WOL(struct net_devi - outw(RxEnable, ioaddr + EL3_CMD); - - pci_enable_wake(VORTEX_PCI(vp), 0, 1); -+ -+ /* Change the power state to D3; RxEnable doesn't take effect. */ -+ pci_set_power_state(VORTEX_PCI(vp), PCI_D3hot); - } -- /* Change the power state to D3; RxEnable doesn't take effect. */ -- pci_set_power_state(VORTEX_PCI(vp), PCI_D3hot); - } - - -diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c ---- a/drivers/net/amd8111e.c -+++ b/drivers/net/amd8111e.c -@@ -1381,6 +1381,8 @@ static int amd8111e_open(struct net_devi - - if(amd8111e_restart(dev)){ - spin_unlock_irq(&lp->lock); -+ if (dev->irq) -+ free_irq(dev->irq, dev); - return -ENOMEM; - } - /* Start ipg timer */ -diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c ---- a/drivers/net/ppp_async.c -+++ b/drivers/net/ppp_async.c -@@ -1000,7 +1000,7 @@ static void async_lcp_peek(struct asyncp - data += 4; - dlen -= 4; - /* data[0] is code, data[1] is length */ -- while (dlen >= 2 && dlen >= data[1]) { -+ while (dlen >= 2 && dlen >= data[1] && data[1] >= 2) { - switch (data[0]) { - case LCP_MRU: - val = (data[2] << 8) + data[3]; -diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c ---- a/drivers/net/r8169.c -+++ b/drivers/net/r8169.c -@@ -1683,16 +1683,19 @@ static void rtl8169_free_rx_skb(struct r - 
rtl8169_make_unusable_by_asic(desc); - } - --static inline void rtl8169_return_to_asic(struct RxDesc *desc, int rx_buf_sz) -+static inline void rtl8169_mark_to_asic(struct RxDesc *desc, u32 rx_buf_sz) - { -- desc->opts1 |= cpu_to_le32(DescOwn + rx_buf_sz); -+ u32 eor = le32_to_cpu(desc->opts1) & RingEnd; -+ -+ desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz); - } - --static inline void rtl8169_give_to_asic(struct RxDesc *desc, dma_addr_t mapping, -- int rx_buf_sz) -+static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping, -+ u32 rx_buf_sz) - { - desc->addr = cpu_to_le64(mapping); -- desc->opts1 |= cpu_to_le32(DescOwn + rx_buf_sz); -+ wmb(); -+ rtl8169_mark_to_asic(desc, rx_buf_sz); - } - - static int rtl8169_alloc_rx_skb(struct pci_dev *pdev, struct sk_buff **sk_buff, -@@ -1712,7 +1715,7 @@ static int rtl8169_alloc_rx_skb(struct p - mapping = pci_map_single(pdev, skb->tail, rx_buf_sz, - PCI_DMA_FROMDEVICE); - -- rtl8169_give_to_asic(desc, mapping, rx_buf_sz); -+ rtl8169_map_to_asic(desc, mapping, rx_buf_sz); - - out: - return ret; -@@ -2150,7 +2153,7 @@ static inline int rtl8169_try_rx_copy(st - skb_reserve(skb, NET_IP_ALIGN); - eth_copy_and_sum(skb, sk_buff[0]->tail, pkt_size, 0); - *sk_buff = skb; -- rtl8169_return_to_asic(desc, rx_buf_sz); -+ rtl8169_mark_to_asic(desc, rx_buf_sz); - ret = 0; - } - } -diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c ---- a/drivers/net/sis900.c -+++ b/drivers/net/sis900.c -@@ -236,7 +236,7 @@ static int __devinit sis900_get_mac_addr - signature = (u16) read_eeprom(ioaddr, EEPROMSignature); - if (signature == 0xffff || signature == 0x0000) { - printk (KERN_INFO "%s: Error EERPOM read %x\n", -- net_dev->name, signature); -+ pci_name(pci_dev), signature); - return 0; - } - -@@ -268,7 +268,7 @@ static int __devinit sis630e_get_mac_add - if (!isa_bridge) - isa_bridge = pci_get_device(PCI_VENDOR_ID_SI, 0x0018, isa_bridge); - if (!isa_bridge) { -- printk("%s: Can not find ISA bridge\n", net_dev->name); 
-+ printk("%s: Can not find ISA bridge\n", pci_name(pci_dev)); - return 0; - } - pci_read_config_byte(isa_bridge, 0x48, &reg); -@@ -456,10 +456,6 @@ static int __devinit sis900_probe(struct - net_dev->tx_timeout = sis900_tx_timeout; - net_dev->watchdog_timeo = TX_TIMEOUT; - net_dev->ethtool_ops = &sis900_ethtool_ops; -- -- ret = register_netdev(net_dev); -- if (ret) -- goto err_unmap_rx; - - /* Get Mac address according to the chip revision */ - pci_read_config_byte(pci_dev, PCI_CLASS_REVISION, &revision); -@@ -476,7 +472,7 @@ static int __devinit sis900_probe(struct - - if (ret == 0) { - ret = -ENODEV; -- goto err_out_unregister; -+ goto err_unmap_rx; - } - - /* 630ET : set the mii access mode as software-mode */ -@@ -486,7 +482,7 @@ static int __devinit sis900_probe(struct - /* probe for mii transceiver */ - if (sis900_mii_probe(net_dev) == 0) { - ret = -ENODEV; -- goto err_out_unregister; -+ goto err_unmap_rx; - } - - /* save our host bridge revision */ -@@ -496,6 +492,10 @@ static int __devinit sis900_probe(struct - pci_dev_put(dev); - } - -+ ret = register_netdev(net_dev); -+ if (ret) -+ goto err_unmap_rx; -+ - /* print some information about our NIC */ - printk(KERN_INFO "%s: %s at %#lx, IRQ %d, ", net_dev->name, - card_name, ioaddr, net_dev->irq); -@@ -505,8 +505,6 @@ static int __devinit sis900_probe(struct - - return 0; - -- err_out_unregister: -- unregister_netdev(net_dev); - err_unmap_rx: - pci_free_consistent(pci_dev, RX_TOTAL_SIZE, sis_priv->rx_ring, - sis_priv->rx_ring_dma); -@@ -533,6 +531,7 @@ static int __devinit sis900_probe(struct - static int __init sis900_mii_probe(struct net_device * net_dev) - { - struct sis900_private * sis_priv = net_dev->priv; -+ const char *dev_name = pci_name(sis_priv->pci_dev); - u16 poll_bit = MII_STAT_LINK, status = 0; - unsigned long timeout = jiffies + 5 * HZ; - int phy_addr; -@@ -582,21 +581,20 @@ static int __init sis900_mii_probe(struc - mii_phy->phy_types = - (mii_status & (MII_STAT_CAN_TX_FDX | MII_STAT_CAN_TX)) ?
LAN : HOME; - printk(KERN_INFO "%s: %s transceiver found at address %d.\n", -- net_dev->name, mii_chip_table[i].name, -+ dev_name, mii_chip_table[i].name, - phy_addr); - break; - } - - if( !mii_chip_table[i].phy_id1 ) { - printk(KERN_INFO "%s: Unknown PHY transceiver found at address %d.\n", -- net_dev->name, phy_addr); -+ dev_name, phy_addr); - mii_phy->phy_types = UNKNOWN; - } - } - - if (sis_priv->mii == NULL) { -- printk(KERN_INFO "%s: No MII transceivers found!\n", -- net_dev->name); -+ printk(KERN_INFO "%s: No MII transceivers found!\n", dev_name); - return 0; - } - -@@ -621,7 +619,7 @@ static int __init sis900_mii_probe(struc - poll_bit ^= (mdio_read(net_dev, sis_priv->cur_phy, MII_STATUS) & poll_bit); - if (time_after_eq(jiffies, timeout)) { - printk(KERN_WARNING "%s: reset phy and link down now\n", -- net_dev->name); -+ dev_name); - return -ETIME; - } - } -@@ -691,7 +689,7 @@ static u16 sis900_default_phy(struct net - sis_priv->mii = default_phy; - sis_priv->cur_phy = default_phy->phy_addr; - printk(KERN_INFO "%s: Using transceiver found at address %d as default\n", -- net_dev->name,sis_priv->cur_phy); -+ pci_name(sis_priv->pci_dev), sis_priv->cur_phy); - } - - status = mdio_read(net_dev, sis_priv->cur_phy, MII_CONTROL); -diff --git a/drivers/net/tun.c b/drivers/net/tun.c ---- a/drivers/net/tun.c -+++ b/drivers/net/tun.c -@@ -229,7 +229,7 @@ static __inline__ ssize_t tun_get_user(s - size_t len = count; - - if (!(tun->flags & TUN_NO_PI)) { -- if ((len -= sizeof(pi)) > len) -+ if ((len -= sizeof(pi)) > count) - return -EINVAL; - - if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) -diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c ---- a/drivers/net/via-rhine.c -+++ b/drivers/net/via-rhine.c -@@ -1197,8 +1197,10 @@ static int rhine_open(struct net_device - dev->name, rp->pdev->irq); - - rc = alloc_ring(dev); -- if (rc) -+ if (rc) { -+ free_irq(rp->pdev->irq, dev); - return rc; -+ } - alloc_rbufs(dev); - alloc_tbufs(dev); - rhine_chip_reset(dev); 
-@@ -1899,6 +1901,9 @@ static void rhine_shutdown (struct devic - struct rhine_private *rp = netdev_priv(dev); - void __iomem *ioaddr = rp->base; - -+ if (!(rp->quirks & rqWOL)) -+ return; /* Nothing to do for non-WOL adapters */ -+ - rhine_power_init(dev); - - /* Make sure we use pattern 0, 1 and not 4, 5 */ -diff --git a/drivers/net/wan/hd6457x.c b/drivers/net/wan/hd6457x.c ---- a/drivers/net/wan/hd6457x.c -+++ b/drivers/net/wan/hd6457x.c -@@ -315,7 +315,7 @@ static inline void sca_rx(card_t *card, - #endif - stats->rx_packets++; - stats->rx_bytes += skb->len; -- skb->dev->last_rx = jiffies; -+ dev->last_rx = jiffies; - skb->protocol = hdlc_type_trans(skb, dev); - netif_rx(skb); - } -diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c ---- a/drivers/pci/hotplug/pciehp_ctrl.c -+++ b/drivers/pci/hotplug/pciehp_ctrl.c -@@ -1354,10 +1354,11 @@ static u32 remove_board(struct pci_func - dbg("PCI Bridge Hot-Remove s:b:d:f(%02x:%02x:%02x:%02x)\n", - ctrl->seg, func->bus, func->device, func->function); - bridge_slot_remove(func); -- } else -+ } else { - dbg("PCI Function Hot-Remove s:b:d:f(%02x:%02x:%02x:%02x)\n", - ctrl->seg, func->bus, func->device, func->function); - slot_remove(func); -+ } - - func = pciehp_slot_find(ctrl->slot_bus, device, 0); - } -diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c ---- a/drivers/usb/serial/visor.c -+++ b/drivers/usb/serial/visor.c -@@ -386,6 +386,7 @@ struct visor_private { - int bytes_in; - int bytes_out; - int outstanding_urbs; -+ int throttled; - }; - - /* number of outstanding urbs to prevent userspace DoS from happening */ -@@ -415,6 +416,7 @@ static int visor_open (struct usb_serial - priv->bytes_in = 0; - priv->bytes_out = 0; - priv->outstanding_urbs = 0; -+ priv->throttled = 0; - spin_unlock_irqrestore(&priv->lock, flags); - - /* -@@ -602,6 +604,7 @@ static void visor_read_bulk_callback (st - struct tty_struct *tty; - unsigned long flags; - int i; -+ int throttled; - int 
result; - - dbg("%s - port %d", __FUNCTION__, port->number); -@@ -627,18 +630,21 @@ static void visor_read_bulk_callback (st - } - spin_lock_irqsave(&priv->lock, flags); - priv->bytes_in += urb->actual_length; -+ throttled = priv->throttled; - spin_unlock_irqrestore(&priv->lock, flags); - -- /* Continue trying to always read */ -- usb_fill_bulk_urb (port->read_urb, port->serial->dev, -- usb_rcvbulkpipe(port->serial->dev, -- port->bulk_in_endpointAddress), -- port->read_urb->transfer_buffer, -- port->read_urb->transfer_buffer_length, -- visor_read_bulk_callback, port); -- result = usb_submit_urb(port->read_urb, GFP_ATOMIC); -- if (result) -- dev_err(&port->dev, "%s - failed resubmitting read urb, error %d\n", __FUNCTION__, result); -+ /* Continue trying to always read if we should */ -+ if (!throttled) { -+ usb_fill_bulk_urb (port->read_urb, port->serial->dev, -+ usb_rcvbulkpipe(port->serial->dev, -+ port->bulk_in_endpointAddress), -+ port->read_urb->transfer_buffer, -+ port->read_urb->transfer_buffer_length, -+ visor_read_bulk_callback, port); -+ result = usb_submit_urb(port->read_urb, GFP_ATOMIC); -+ if (result) -+ dev_err(&port->dev, "%s - failed resubmitting read urb, error %d\n", __FUNCTION__, result); -+ } - return; - } - -@@ -683,16 +689,26 @@ exit: - - static void visor_throttle (struct usb_serial_port *port) - { -+ struct visor_private *priv = usb_get_serial_port_data(port); -+ unsigned long flags; -+ - dbg("%s - port %d", __FUNCTION__, port->number); -- usb_kill_urb(port->read_urb); -+ spin_lock_irqsave(&priv->lock, flags); -+ priv->throttled = 1; -+ spin_unlock_irqrestore(&priv->lock, flags); - } - - - static void visor_unthrottle (struct usb_serial_port *port) - { -+ struct visor_private *priv = usb_get_serial_port_data(port); -+ unsigned long flags; - int result; - - dbg("%s - port %d", __FUNCTION__, port->number); -+ spin_lock_irqsave(&priv->lock, flags); -+ priv->throttled = 0; -+ spin_unlock_irqrestore(&priv->lock, flags); - - port->read_urb->dev = 
port->serial->dev; - result = usb_submit_urb(port->read_urb, GFP_ATOMIC); -diff --git a/drivers/video/matrox/matroxfb_accel.c b/drivers/video/matrox/matroxfb_accel.c ---- a/drivers/video/matrox/matroxfb_accel.c -+++ b/drivers/video/matrox/matroxfb_accel.c -@@ -438,13 +438,21 @@ static void matroxfb_1bpp_imageblit(WPMI - } else if (step == 1) { - /* Special case for 1..8bit widths */ - while (height--) { -- mga_writel(mmio, 0, *chardata); -+#if defined(__BIG_ENDIAN) -+ fb_writel((*chardata) << 24, mmio.vaddr); -+#else -+ fb_writel(*chardata, mmio.vaddr); -+#endif - chardata++; - } - } else if (step == 2) { - /* Special case for 9..15bit widths */ - while (height--) { -- mga_writel(mmio, 0, *(u_int16_t*)chardata); -+#if defined(__BIG_ENDIAN) -+ fb_writel((*(u_int16_t*)chardata) << 16, mmio.vaddr); -+#else -+ fb_writel(*(u_int16_t*)chardata, mmio.vaddr); -+#endif - chardata += 2; - } - } else { -@@ -454,7 +462,7 @@ static void matroxfb_1bpp_imageblit(WPMI - - for (i = 0; i < step; i += 4) { - /* Hope that there are at least three readable bytes beyond the end of bitmap */ -- mga_writel(mmio, 0, get_unaligned((u_int32_t*)(chardata + i))); -+ fb_writel(get_unaligned((u_int32_t*)(chardata + i)),mmio.vaddr); - } - chardata += step; - } -diff --git a/drivers/video/matrox/matroxfb_base.h b/drivers/video/matrox/matroxfb_base.h ---- a/drivers/video/matrox/matroxfb_base.h -+++ b/drivers/video/matrox/matroxfb_base.h -@@ -170,14 +170,14 @@ static inline void mga_memcpy_toio(vaddr - - if ((unsigned long)src & 3) { - while (len >= 4) { -- writel(get_unaligned((u32 *)src), addr); -+ fb_writel(get_unaligned((u32 *)src), addr); - addr++; - len -= 4; - src += 4; - } - } else { - while (len >= 4) { -- writel(*(u32 *)src, addr); -+ fb_writel(*(u32 *)src, addr); - addr++; - len -= 4; - src += 4; -diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c ---- a/fs/binfmt_elf.c -+++ b/fs/binfmt_elf.c -@@ -257,7 +257,7 @@ create_elf_tables(struct linux_binprm *b - } - - /* Populate argv and envp */ -- 
p = current->mm->arg_start; -+ p = current->mm->arg_end = current->mm->arg_start; - while (argc-- > 0) { - size_t len; - __put_user((elf_addr_t)p, argv++); -@@ -1008,6 +1008,7 @@ out_free_ph: - static int load_elf_library(struct file *file) - { - struct elf_phdr *elf_phdata; -+ struct elf_phdr *eppnt; - unsigned long elf_bss, bss, len; - int retval, error, i, j; - struct elfhdr elf_ex; -@@ -1031,44 +1032,47 @@ static int load_elf_library(struct file - /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */ - - error = -ENOMEM; -- elf_phdata = (struct elf_phdr *) kmalloc(j, GFP_KERNEL); -+ elf_phdata = kmalloc(j, GFP_KERNEL); - if (!elf_phdata) - goto out; - -+ eppnt = elf_phdata; - error = -ENOEXEC; -- retval = kernel_read(file, elf_ex.e_phoff, (char *) elf_phdata, j); -+ retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j); - if (retval != j) - goto out_free_ph; - - for (j = 0, i = 0; i<elf_ex.e_phnum; i++) -- if ((elf_phdata + i)->p_type == PT_LOAD) j++; -+ if ((eppnt + i)->p_type == PT_LOAD) -+ j++; - if (j != 1) - goto out_free_ph; - -- while (elf_phdata->p_type != PT_LOAD) elf_phdata++; -+ while (eppnt->p_type != PT_LOAD) -+ eppnt++; - - /* Now use mmap to map the library into memory. 
*/ - down_write(&current->mm->mmap_sem); - error = do_mmap(file, -- ELF_PAGESTART(elf_phdata->p_vaddr), -- (elf_phdata->p_filesz + -- ELF_PAGEOFFSET(elf_phdata->p_vaddr)), -+ ELF_PAGESTART(eppnt->p_vaddr), -+ (eppnt->p_filesz + -+ ELF_PAGEOFFSET(eppnt->p_vaddr)), - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, -- (elf_phdata->p_offset - -- ELF_PAGEOFFSET(elf_phdata->p_vaddr))); -+ (eppnt->p_offset - -+ ELF_PAGEOFFSET(eppnt->p_vaddr))); - up_write(&current->mm->mmap_sem); -- if (error != ELF_PAGESTART(elf_phdata->p_vaddr)) -+ if (error != ELF_PAGESTART(eppnt->p_vaddr)) - goto out_free_ph; - -- elf_bss = elf_phdata->p_vaddr + elf_phdata->p_filesz; -+ elf_bss = eppnt->p_vaddr + eppnt->p_filesz; - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_free_ph; - } - -- len = ELF_PAGESTART(elf_phdata->p_filesz + elf_phdata->p_vaddr + ELF_MIN_ALIGN - 1); -- bss = elf_phdata->p_memsz + elf_phdata->p_vaddr; -+ len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1); -+ bss = eppnt->p_memsz + eppnt->p_vaddr; - if (bss > len) { - down_write(&current->mm->mmap_sem); - do_brk(len, bss - len); -@@ -1275,7 +1279,7 @@ static void fill_prstatus(struct elf_prs - static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, - struct mm_struct *mm) - { -- int i, len; -+ unsigned int i, len; - - /* first copy the parameters from user space */ - memset(psinfo, 0, sizeof(struct elf_prpsinfo)); -diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c ---- a/fs/cramfs/inode.c -+++ b/fs/cramfs/inode.c -@@ -70,6 +70,7 @@ static struct inode *get_cramfs_inode(st - inode->i_data.a_ops = &cramfs_aops; - } else { - inode->i_size = 0; -+ inode->i_blocks = 0; - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } -diff --git a/fs/eventpoll.c b/fs/eventpoll.c ---- a/fs/eventpoll.c -+++ b/fs/eventpoll.c -@@ -619,6 +619,7 @@ eexit_1: - return error; - } - -+#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) - - /* - *
Implement the event wait interface for the eventpoll file. It is the kernel -@@ -635,7 +636,7 @@ asmlinkage long sys_epoll_wait(int epfd, - current, epfd, events, maxevents, timeout)); - - /* The maximum number of event must be greater than zero */ -- if (maxevents <= 0) -+ if (maxevents <= 0 || maxevents > MAX_EVENTS) - return -EINVAL; - - /* Verify that the area passed by the user is writeable */ -diff --git a/fs/exec.c b/fs/exec.c ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -814,7 +814,7 @@ void get_task_comm(char *buf, struct tas - { - /* buf must be at least sizeof(tsk->comm) in size */ - task_lock(tsk); -- memcpy(buf, tsk->comm, sizeof(tsk->comm)); -+ strncpy(buf, tsk->comm, sizeof(tsk->comm)); - task_unlock(tsk); - } - -diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c ---- a/fs/ext2/dir.c -+++ b/fs/ext2/dir.c -@@ -592,6 +592,7 @@ int ext2_make_empty(struct inode *inode, - goto fail; - } - kaddr = kmap_atomic(page, KM_USER0); -+ memset(kaddr, 0, chunk_size); - de = (struct ext2_dir_entry_2 *)kaddr; - de->name_len = 1; - de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1)); -diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c ---- a/fs/ext3/balloc.c -+++ b/fs/ext3/balloc.c -@@ -268,7 +268,8 @@ void ext3_discard_reservation(struct ino - - if (!rsv_is_empty(&rsv->rsv_window)) { - spin_lock(rsv_lock); -- rsv_window_remove(inode->i_sb, rsv); -+ if (!rsv_is_empty(&rsv->rsv_window)) -+ rsv_window_remove(inode->i_sb, rsv); - spin_unlock(rsv_lock); - } - } -diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c ---- a/fs/hfs/mdb.c -+++ b/fs/hfs/mdb.c -@@ -333,6 +333,8 @@ void hfs_mdb_close(struct super_block *s - * Release the resources associated with the in-core MDB. 
*/ - void hfs_mdb_put(struct super_block *sb) - { -+ if (!HFS_SB(sb)) -+ return; - /* free the B-trees */ - hfs_btree_close(HFS_SB(sb)->ext_tree); - hfs_btree_close(HFS_SB(sb)->cat_tree); -@@ -340,4 +342,7 @@ void hfs_mdb_put(struct super_block *sb) - /* free the buffers holding the primary and alternate MDBs */ - brelse(HFS_SB(sb)->mdb_bh); - brelse(HFS_SB(sb)->alt_mdb_bh); -+ -+ kfree(HFS_SB(sb)); -+ sb->s_fs_info = NULL; - } -diff --git a/fs/hfs/super.c b/fs/hfs/super.c ---- a/fs/hfs/super.c -+++ b/fs/hfs/super.c -@@ -263,7 +263,7 @@ static int hfs_fill_super(struct super_b - res = -EINVAL; - if (!parse_options((char *)data, sbi)) { - hfs_warn("hfs_fs: unable to parse mount options.\n"); -- goto bail3; -+ goto bail; - } - - sb->s_op = &hfs_super_operations; -@@ -276,7 +276,7 @@ static int hfs_fill_super(struct super_b - hfs_warn("VFS: Can't find a HFS filesystem on dev %s.\n", - hfs_mdb_name(sb)); - res = -EINVAL; -- goto bail2; -+ goto bail; - } - - /* try to get the root inode */ -@@ -306,10 +306,8 @@ bail_iput: - iput(root_inode); - bail_no_root: - hfs_warn("hfs_fs: get root inode failed.\n"); -+bail: - hfs_mdb_put(sb); --bail2: --bail3: -- kfree(sbi); - return res; - } - -diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c ---- a/fs/hfsplus/super.c -+++ b/fs/hfsplus/super.c -@@ -207,7 +207,9 @@ static void hfsplus_write_super(struct s - static void hfsplus_put_super(struct super_block *sb) - { - dprint(DBG_SUPER, "hfsplus_put_super\n"); -- if (!(sb->s_flags & MS_RDONLY)) { -+ if (!sb->s_fs_info) -+ return; -+ if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; - - vhdr->modify_date = hfsp_now2mt(); -@@ -223,6 +225,8 @@ static void hfsplus_put_super(struct sup - iput(HFSPLUS_SB(sb).alloc_file); - iput(HFSPLUS_SB(sb).hidden_dir); - brelse(HFSPLUS_SB(sb).s_vhbh); -+ kfree(sb->s_fs_info); -+ sb->s_fs_info = NULL; - } - - static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf) -diff --git 
a/fs/isofs/inode.c b/fs/isofs/inode.c ---- a/fs/isofs/inode.c -+++ b/fs/isofs/inode.c -@@ -685,6 +685,8 @@ root_found: - sbi->s_log_zone_size = isonum_723 (h_pri->logical_block_size); - sbi->s_max_size = isonum_733(h_pri->volume_space_size); - } else { -+ if (!pri) -+ goto out_freebh; - rootp = (struct iso_directory_record *) pri->root_directory_record; - sbi->s_nzones = isonum_733 (pri->volume_space_size); - sbi->s_log_zone_size = isonum_723 (pri->logical_block_size); -@@ -1395,6 +1397,9 @@ struct inode *isofs_iget(struct super_bl - struct inode *inode; - struct isofs_iget5_callback_data data; - -+ if (offset >= 1ul << sb->s_blocksize_bits) -+ return NULL; -+ - data.block = block; - data.offset = offset; - -diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c ---- a/fs/isofs/rock.c -+++ b/fs/isofs/rock.c -@@ -53,6 +53,7 @@ - if(LEN & 1) LEN++; \ - CHR = ((unsigned char *) DE) + LEN; \ - LEN = *((unsigned char *) DE) - LEN; \ -+ if (LEN<0) LEN=0; \ - if (ISOFS_SB(inode->i_sb)->s_rock_offset!=-1) \ - { \ - LEN-=ISOFS_SB(inode->i_sb)->s_rock_offset; \ -@@ -73,6 +74,10 @@ - offset1 = 0; \ - pbh = sb_bread(DEV->i_sb, block); \ - if(pbh){ \ -+ if (offset > pbh->b_size || offset + cont_size > pbh->b_size){ \ -+ brelse(pbh); \ -+ goto out; \ -+ } \ - memcpy(buffer + offset1, pbh->b_data + offset, cont_size - offset1); \ - brelse(pbh); \ - chr = (unsigned char *) buffer; \ -@@ -103,12 +108,13 @@ int get_rock_ridge_filename(struct iso_d - struct rock_ridge * rr; - int sig; - -- while (len > 1){ /* There may be one byte for padding somewhere */ -+ while (len > 2){ /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; -- if (rr->len == 0) goto out; /* Something got screwed up here */ -+ if (rr->len < 3) goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; -+ if (len < 0) goto out; /* corrupted isofs */ - - switch(sig){ - case SIG('R','R'): -@@ -122,6 +128,7 @@ int get_rock_ridge_filename(struct 
iso_d - break; - case SIG('N','M'): - if (truncate) break; -+ if (rr->len < 5) break; - /* - * If the flags are 2 or 4, this indicates '.' or '..'. - * We don't want to do anything with this, because it -@@ -186,12 +193,13 @@ parse_rock_ridge_inode_internal(struct i - struct rock_ridge * rr; - int rootflag; - -- while (len > 1){ /* There may be one byte for padding somewhere */ -+ while (len > 2){ /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; -- if (rr->len == 0) goto out; /* Something got screwed up here */ -+ if (rr->len < 3) goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; -+ if (len < 0) goto out; /* corrupted isofs */ - - switch(sig){ - #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ -@@ -462,7 +470,7 @@ static int rock_ridge_symlink_readpage(s - struct rock_ridge *rr; - - if (!ISOFS_SB(inode->i_sb)->s_rock) -- panic ("Cannot have symlink with high sierra variant of iso filesystem\n"); -+ goto error; - - block = ei->i_iget5_block; - lock_kernel(); -@@ -487,13 +495,15 @@ static int rock_ridge_symlink_readpage(s - SETUP_ROCK_RIDGE(raw_inode, chr, len); - - repeat: -- while (len > 1) { /* There may be one byte for padding somewhere */ -+ while (len > 2) { /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; -- if (rr->len == 0) -+ if (rr->len < 3) - goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; -+ if (len < 0) -+ goto out; /* corrupted isofs */ - - switch (sig) { - case SIG('R', 'R'): -@@ -543,6 +553,7 @@ static int rock_ridge_symlink_readpage(s - fail: - brelse(bh); - unlock_kernel(); -+ error: - SetPageError(page); - kunmap(page); - unlock_page(page); -diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c ---- a/fs/jbd/checkpoint.c -+++ b/fs/jbd/checkpoint.c -@@ -339,8 +339,10 @@ int log_do_checkpoint(journal_t *journal - } - } while (jh != last_jh && !retry); - -- if 
(batch_count) -+ if (batch_count) { - __flush_batch(journal, bhs, &batch_count); -+ retry = 1; -+ } - - /* - * If someone cleaned up this transaction while we slept, we're -diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c ---- a/fs/jbd/transaction.c -+++ b/fs/jbd/transaction.c -@@ -1775,10 +1775,10 @@ static int journal_unmap_buffer(journal_ - JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, - journal->j_running_transaction); -+ journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -- journal_put_journal_head(jh); - return ret; - } else { - /* There is no currently-running transaction. So the -@@ -1789,10 +1789,10 @@ static int journal_unmap_buffer(journal_ - JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, - journal->j_committing_transaction); -+ journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -- journal_put_journal_head(jh); - return ret; - } else { - /* The orphan record's transaction has -@@ -1813,10 +1813,10 @@ static int journal_unmap_buffer(journal_ - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } -+ journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -- journal_put_journal_head(jh); - return 0; - } else { - /* Good, the buffer belongs to the running transaction. -diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h ---- a/include/asm-x86_64/processor.h -+++ b/include/asm-x86_64/processor.h -@@ -160,9 +160,9 @@ static inline void clear_in_cr4 (unsigne - - - /* -- * User space process size. 47bits. -+ * User space process size. 47bits minus one guard page. 
- */ --#define TASK_SIZE (0x800000000000UL) -+#define TASK_SIZE (0x800000000000UL - 4096) - - /* This decides where the kernel will search for a free chunk of vm - * space during mmap's. -diff --git a/include/linux/err.h b/include/linux/err.h ---- a/include/linux/err.h -+++ b/include/linux/err.h -@@ -13,6 +13,8 @@ - * This should be a per-architecture thing, to allow different - * error and pointer decisions. - */ -+#define IS_ERR_VALUE(x) unlikely((x) > (unsigned long)-1000L) -+ - static inline void *ERR_PTR(long error) - { - return (void *) error; -@@ -25,7 +27,7 @@ static inline long PTR_ERR(const void *p - - static inline long IS_ERR(const void *ptr) - { -- return unlikely((unsigned long)ptr > (unsigned long)-1000L); -+ return IS_ERR_VALUE((unsigned long)ptr); - } - - #endif /* _LINUX_ERR_H */ -diff --git a/kernel/exit.c b/kernel/exit.c ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -516,8 +516,6 @@ static inline void choose_new_parent(tas - */ - BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); - p->real_parent = reaper; -- if (p->parent == p->real_parent) -- BUG(); - } - - static inline void reparent_thread(task_t *p, task_t *father, int traced) -diff --git a/kernel/signal.c b/kernel/signal.c ---- a/kernel/signal.c -+++ b/kernel/signal.c -@@ -1728,6 +1728,7 @@ do_signal_stop(int signr) - * with another processor delivering a stop signal, - * then the SIGCONT that wakes us up should clear it. 
- */ -+ read_unlock(&tasklist_lock); - return 0; - } - -diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c ---- a/lib/rwsem-spinlock.c -+++ b/lib/rwsem-spinlock.c -@@ -140,12 +140,12 @@ void fastcall __sched __down_read(struct - - rwsemtrace(sem, "Entering __down_read"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irq(&sem->wait_lock); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - goto out; - } - -@@ -160,7 +160,7 @@ void fastcall __sched __down_read(struct - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - for (;;) { -@@ -181,10 +181,12 @@ void fastcall __sched __down_read(struct - */ - int fastcall __down_read_trylock(struct rw_semaphore *sem) - { -+ unsigned long flags; - int ret = 0; -+ - rwsemtrace(sem, "Entering __down_read_trylock"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ -@@ -192,7 +194,7 @@ int fastcall __down_read_trylock(struct - ret = 1; - } - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __down_read_trylock"); - return ret; -@@ -209,12 +211,12 @@ void fastcall __sched __down_write(struc - - rwsemtrace(sem, "Entering __down_write"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irq(&sem->wait_lock); - - if (sem->activity == 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity = -1; -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - goto out; - } - -@@ -229,7 +231,7 @@ void fastcall __sched __down_write(struc - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ -- 
spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - for (;;) { -@@ -250,10 +252,12 @@ void fastcall __sched __down_write(struc - */ - int fastcall __down_write_trylock(struct rw_semaphore *sem) - { -+ unsigned long flags; - int ret = 0; -+ - rwsemtrace(sem, "Entering __down_write_trylock"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity == 0 && list_empty(&sem->wait_list)) { - /* granted */ -@@ -261,7 +265,7 @@ int fastcall __down_write_trylock(struct - ret = 1; - } - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __down_write_trylock"); - return ret; -@@ -272,14 +276,16 @@ int fastcall __down_write_trylock(struct - */ - void fastcall __up_read(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering __up_read"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __up_read"); - } -@@ -289,15 +295,17 @@ void fastcall __up_read(struct rw_semaph - */ - void fastcall __up_write(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering __up_write"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __up_write"); - } -@@ -308,15 +316,17 @@ void fastcall __up_write(struct rw_semap - */ - void fastcall __downgrade_write(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering __downgrade_write"); - -- spin_lock(&sem->wait_lock); -+ 
spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __downgrade_write"); - } -diff --git a/lib/rwsem.c b/lib/rwsem.c ---- a/lib/rwsem.c -+++ b/lib/rwsem.c -@@ -150,7 +150,7 @@ rwsem_down_failed_common(struct rw_semap - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - /* set up my own style of waitqueue */ -- spin_lock(&sem->wait_lock); -+ spin_lock_irq(&sem->wait_lock); - waiter->task = tsk; - get_task_struct(tsk); - -@@ -163,7 +163,7 @@ rwsem_down_failed_common(struct rw_semap - if (!(count & RWSEM_ACTIVE_MASK)) - sem = __rwsem_do_wake(sem, 0); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - for (;;) { -@@ -219,15 +219,17 @@ rwsem_down_write_failed(struct rw_semaph - */ - struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering rwsem_wake"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving rwsem_wake"); - -@@ -241,15 +243,17 @@ struct rw_semaphore fastcall *rwsem_wake - */ - struct rw_semaphore fastcall *rwsem_downgrade_wake(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering rwsem_downgrade_wake"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving rwsem_downgrade_wake"); - return sem; -diff --git a/mm/mmap.c b/mm/mmap.c 
---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -1315,37 +1315,40 @@ unsigned long - get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) - { -- if (flags & MAP_FIXED) { -- unsigned long ret; -+ unsigned long ret; - -- if (addr > TASK_SIZE - len) -- return -ENOMEM; -- if (addr & ~PAGE_MASK) -- return -EINVAL; -- if (file && is_file_hugepages(file)) { -- /* -- * Check if the given range is hugepage aligned, and -- * can be made suitable for hugepages. -- */ -- ret = prepare_hugepage_range(addr, len); -- } else { -- /* -- * Ensure that a normal request is not falling in a -- * reserved hugepage range. For some archs like IA-64, -- * there is a separate region for hugepages. -- */ -- ret = is_hugepage_only_range(addr, len); -- } -- if (ret) -- return -EINVAL; -- return addr; -- } -+ if (!(flags & MAP_FIXED)) { -+ unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - -- if (file && file->f_op && file->f_op->get_unmapped_area) -- return file->f_op->get_unmapped_area(file, addr, len, -- pgoff, flags); -+ get_area = current->mm->get_unmapped_area; -+ if (file && file->f_op && file->f_op->get_unmapped_area) -+ get_area = file->f_op->get_unmapped_area; -+ addr = get_area(file, addr, len, pgoff, flags); -+ if (IS_ERR_VALUE(addr)) -+ return addr; -+ } - -- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); -+ if (addr > TASK_SIZE - len) -+ return -ENOMEM; -+ if (addr & ~PAGE_MASK) -+ return -EINVAL; -+ if (file && is_file_hugepages(file)) { -+ /* -+ * Check if the given range is hugepage aligned, and -+ * can be made suitable for hugepages. -+ */ -+ ret = prepare_hugepage_range(addr, len); -+ } else { -+ /* -+ * Ensure that a normal request is not falling in a -+ * reserved hugepage range. For some archs like IA-64, -+ * there is a separate region for hugepages. 
-+ */ -+ ret = is_hugepage_only_range(addr, len); -+ } -+ if (ret) -+ return -EINVAL; -+ return addr; - } - - EXPORT_SYMBOL(get_unmapped_area); -diff --git a/mm/rmap.c b/mm/rmap.c ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -641,7 +641,7 @@ static void try_to_unmap_cluster(unsigne - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; -- pte_t *pte; -+ pte_t *pte, *original_pte; - pte_t pteval; - struct page *page; - unsigned long address; -@@ -673,7 +673,7 @@ static void try_to_unmap_cluster(unsigne - if (!pmd_present(*pmd)) - goto out_unlock; - -- for (pte = pte_offset_map(pmd, address); -+ for (original_pte = pte = pte_offset_map(pmd, address); - address < end; pte++, address += PAGE_SIZE) { - - if (!pte_present(*pte)) -@@ -710,7 +710,7 @@ static void try_to_unmap_cluster(unsigne - (*mapcount)--; - } - -- pte_unmap(pte); -+ pte_unmap(original_pte); - - out_unlock: - spin_unlock(&mm->page_table_lock); -diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c ---- a/net/bluetooth/af_bluetooth.c -+++ b/net/bluetooth/af_bluetooth.c -@@ -64,7 +64,7 @@ static kmem_cache_t *bt_sock_cache; - - int bt_sock_register(int proto, struct net_proto_family *ops) - { -- if (proto >= BT_MAX_PROTO) -+ if (proto < 0 || proto >= BT_MAX_PROTO) - return -EINVAL; - - if (bt_proto[proto]) -@@ -77,7 +77,7 @@ EXPORT_SYMBOL(bt_sock_register); - - int bt_sock_unregister(int proto) - { -- if (proto >= BT_MAX_PROTO) -+ if (proto < 0 || proto >= BT_MAX_PROTO) - return -EINVAL; - - if (!bt_proto[proto]) -@@ -92,7 +92,7 @@ static int bt_sock_create(struct socket - { - int err = 0; - -- if (proto >= BT_MAX_PROTO) -+ if (proto < 0 || proto >= BT_MAX_PROTO) - return -EINVAL; - - #if defined(CONFIG_KMOD) -diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c ---- a/net/bridge/br_input.c -+++ b/net/bridge/br_input.c -@@ -54,6 +54,9 @@ int br_handle_frame_finish(struct sk_buf - struct net_bridge_fdb_entry *dst; - int passedup = 0; - -+ /* insert into forwarding database after filtering to avoid 
spoofing */ -+ br_fdb_insert(p->br, p, eth_hdr(skb)->h_source, 0); -+ - if (br->dev->flags & IFF_PROMISC) { - struct sk_buff *skb2; - -@@ -108,8 +111,7 @@ int br_handle_frame(struct net_bridge_po - if (eth_hdr(skb)->h_source[0] & 1) - goto err; - -- if (p->state == BR_STATE_LEARNING || -- p->state == BR_STATE_FORWARDING) -+ if (p->state == BR_STATE_LEARNING) - br_fdb_insert(p->br, p, eth_hdr(skb)->h_source, 0); - - if (p->br->stp_enabled && -diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c ---- a/net/bridge/br_stp_bpdu.c -+++ b/net/bridge/br_stp_bpdu.c -@@ -140,6 +140,9 @@ int br_stp_handle_bpdu(struct sk_buff *s - struct net_bridge *br = p->br; - unsigned char *buf; - -+ /* insert into forwarding database after filtering to avoid spoofing */ -+ br_fdb_insert(p->br, p, eth_hdr(skb)->h_source, 0); -+ - /* need at least the 802 and STP headers */ - if (!pskb_may_pull(skb, sizeof(header)+1) || - memcmp(skb->data, header, sizeof(header))) -diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c ---- a/net/bridge/netfilter/ebtables.c -+++ b/net/bridge/netfilter/ebtables.c -@@ -179,9 +179,10 @@ unsigned int ebt_do_table (unsigned int - struct ebt_chainstack *cs; - struct ebt_entries *chaininfo; - char *base; -- struct ebt_table_info *private = table->private; -+ struct ebt_table_info *private; - - read_lock_bh(&table->lock); -+ private = table->private; - cb_base = COUNTER_BASE(private->counters, private->nentries, - smp_processor_id()); - if (private->chainstack) -diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c ---- a/net/ipv4/fib_hash.c -+++ b/net/ipv4/fib_hash.c -@@ -919,13 +919,23 @@ out: - return fa; - } - -+static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) -+{ -+ struct fib_alias *fa = fib_get_first(seq); -+ -+ if (fa) -+ while (pos && (fa = fib_get_next(seq))) -+ --pos; -+ return pos ? 
NULL : fa; -+} -+ - static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - { - void *v = NULL; - - read_lock(&fib_hash_lock); - if (ip_fib_main_table) -- v = *pos ? fib_get_next(seq) : SEQ_START_TOKEN; -+ v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - return v; - } - -diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c ---- a/net/ipv4/netfilter/ip_queue.c -+++ b/net/ipv4/netfilter/ip_queue.c -@@ -3,6 +3,7 @@ - * communicating with userspace via netlink. - * - * (C) 2000-2002 James Morris <jmorris@intercode.com.au> -+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as -@@ -14,6 +15,7 @@ - * Zander). - * 2000-08-01: Added Nick Williams' MAC support. - * 2002-06-25: Code cleanup. -+ * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte) - * - */ - #include <linux/module.h> -@@ -66,7 +68,15 @@ static DECLARE_MUTEX(ipqnl_sem); - static void - ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) - { -+ /* TCP input path (and probably other bits) assume to be called -+ * from softirq context, not from syscall, like ipq_issue_verdict is -+ * called. TCP input path deadlocks with locks taken from timer -+ * softirq, e.g. 
We therefore emulate this by local_bh_disable() */ -+ -+ local_bh_disable(); - nf_reinject(entry->skb, entry->info, verdict); -+ local_bh_enable(); -+ - kfree(entry); - } - -diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c ---- a/net/ipv4/tcp_input.c -+++ b/net/ipv4/tcp_input.c -@@ -1653,7 +1653,10 @@ static void DBGUNDO(struct sock *sk, str - static void tcp_undo_cwr(struct tcp_sock *tp, int undo) - { - if (tp->prior_ssthresh) { -- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); -+ if (tcp_is_bic(tp)) -+ tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); -+ else -+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); - - if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { - tp->snd_ssthresh = tp->prior_ssthresh; -diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c ---- a/net/ipv4/tcp_timer.c -+++ b/net/ipv4/tcp_timer.c -@@ -38,6 +38,7 @@ static void tcp_keepalive_timer (unsigne - - #ifdef TCP_DEBUG - const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; -+EXPORT_SYMBOL(tcp_timer_bug_msg); - #endif - - /* -diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c ---- a/net/ipv4/xfrm4_output.c -+++ b/net/ipv4/xfrm4_output.c -@@ -103,17 +103,17 @@ int xfrm4_output(struct sk_buff *skb) - goto error_nolock; - } - -- spin_lock_bh(&x->lock); -- err = xfrm_state_check(x, skb); -- if (err) -- goto error; -- - if (x->props.mode) { - err = xfrm4_tunnel_check_size(skb); - if (err) -- goto error; -+ goto error_nolock; - } - -+ spin_lock_bh(&x->lock); -+ err = xfrm_state_check(x, skb); -+ if (err) -+ goto error; -+ - xfrm4_encap(skb); - - err = x->type->output(skb); -diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c ---- a/net/ipv6/xfrm6_output.c -+++ b/net/ipv6/xfrm6_output.c -@@ -103,17 +103,17 @@ int xfrm6_output(struct sk_buff *skb) - goto error_nolock; - } - -- spin_lock_bh(&x->lock); -- err = xfrm_state_check(x, skb); -- if (err) -- goto error; -- - if (x->props.mode) { - err = 
xfrm6_tunnel_check_size(skb); - if (err) -- goto error; -+ goto error_nolock; - } - -+ spin_lock_bh(&x->lock); -+ err = xfrm_state_check(x, skb); -+ if (err) -+ goto error; -+ - xfrm6_encap(skb); - - err = x->type->output(skb); -diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c ---- a/net/netrom/nr_in.c -+++ b/net/netrom/nr_in.c -@@ -74,7 +74,6 @@ static int nr_queue_rx_frame(struct sock - static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, - int frametype) - { -- bh_lock_sock(sk); - switch (frametype) { - case NR_CONNACK: { - nr_cb *nr = nr_sk(sk); -@@ -103,8 +102,6 @@ static int nr_state1_machine(struct sock - default: - break; - } -- bh_unlock_sock(sk); -- - return 0; - } - -@@ -116,7 +113,6 @@ static int nr_state1_machine(struct sock - static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, - int frametype) - { -- bh_lock_sock(sk); - switch (frametype) { - case NR_CONNACK | NR_CHOKE_FLAG: - nr_disconnect(sk, ECONNRESET); -@@ -132,8 +128,6 @@ static int nr_state2_machine(struct sock - default: - break; - } -- bh_unlock_sock(sk); -- - return 0; - } - -@@ -154,7 +148,6 @@ static int nr_state3_machine(struct sock - nr = skb->data[18]; - ns = skb->data[17]; - -- bh_lock_sock(sk); - switch (frametype) { - case NR_CONNREQ: - nr_write_internal(sk, NR_CONNACK); -@@ -265,8 +258,6 @@ static int nr_state3_machine(struct sock - default: - break; - } -- bh_unlock_sock(sk); -- - return queued; - } - -diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c ---- a/net/rose/rose_route.c -+++ b/net/rose/rose_route.c -@@ -727,7 +727,8 @@ int rose_rt_ioctl(unsigned int cmd, void - } - if (rose_route.mask > 10) /* Mask can't be more than 10 digits */ - return -EINVAL; -- -+ if (rose_route.ndigis > 8) /* No more than 8 digipeats */ -+ return -EINVAL; - err = rose_add_node(&rose_route, dev); - dev_put(dev); - return err; -diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c ---- a/net/sched/sch_netem.c -+++ b/net/sched/sch_netem.c -@@ -184,10 
+184,15 @@ static int netem_enqueue(struct sk_buff - /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); -- -- pr_debug("netem_enqueue: dup %p\n", skb2); -- if (skb2) -- delay_skb(sch, skb2); -+ if (skb2) { -+ struct Qdisc *rootq = sch->dev->qdisc; -+ u32 dupsave = q->duplicate; -+ -+ /* prevent duplicating a dup... */ -+ q->duplicate = 0; -+ rootq->enqueue(skb2, rootq); -+ q->duplicate = dupsave; -+ } - } - - /* If doing simple delay then gap == 0 so all packets -diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c ---- a/net/xfrm/xfrm_state.c -+++ b/net/xfrm/xfrm_state.c -@@ -609,7 +609,7 @@ static struct xfrm_state *__xfrm_find_ac - - for (i = 0; i < XFRM_DST_HSIZE; i++) { - list_for_each_entry(x, xfrm_state_bydst+i, bydst) { -- if (x->km.seq == seq) { -+ if (x->km.seq == seq && x->km.state == XFRM_STATE_ACQ) { - xfrm_state_hold(x); - return x; - } -diff --git a/security/keys/key.c b/security/keys/key.c ---- a/security/keys/key.c -+++ b/security/keys/key.c -@@ -57,9 +57,10 @@ struct key_user *key_user_lookup(uid_t u - { - struct key_user *candidate = NULL, *user; - struct rb_node *parent = NULL; -- struct rb_node **p = &key_user_tree.rb_node; -+ struct rb_node **p; - - try_again: -+ p = &key_user_tree.rb_node; - spin_lock(&key_user_lock); - - /* search the tree for a user record with a matching UID */ -diff --git a/sound/core/timer.c b/sound/core/timer.c ---- a/sound/core/timer.c -+++ b/sound/core/timer.c -@@ -1117,7 +1117,8 @@ static void snd_timer_user_append_to_tqu - if (tu->qused >= tu->queue_size) { - tu->overrun++; - } else { -- memcpy(&tu->queue[tu->qtail++], tread, sizeof(*tread)); -+ memcpy(&tu->tqueue[tu->qtail++], tread, sizeof(*tread)); -+ tu->qtail %= tu->queue_size; - tu->qused++; - } - } -@@ -1140,6 +1141,8 @@ static void snd_timer_user_ccallback(snd - spin_lock(&tu->qlock); - snd_timer_user_append_to_tqueue(tu, &r1); - spin_unlock(&tu->qlock); -+ 
kill_fasync(&tu->fasync, SIGIO, POLL_IN); -+ wake_up(&tu->qchange_sleep); - } - - static void snd_timer_user_tinterrupt(snd_timer_instance_t *timeri, -diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c ---- a/sound/pci/ac97/ac97_codec.c -+++ b/sound/pci/ac97/ac97_codec.c -@@ -1185,7 +1185,7 @@ snd_kcontrol_t *snd_ac97_cnew(const snd_ - /* - * create mute switch(es) for normal stereo controls - */ --static int snd_ac97_cmute_new(snd_card_t *card, char *name, int reg, ac97_t *ac97) -+static int snd_ac97_cmute_new_stereo(snd_card_t *card, char *name, int reg, int check_stereo, ac97_t *ac97) - { - snd_kcontrol_t *kctl; - int err; -@@ -1196,7 +1196,7 @@ static int snd_ac97_cmute_new(snd_card_t - - mute_mask = 0x8000; - val = snd_ac97_read(ac97, reg); -- if (ac97->flags & AC97_STEREO_MUTES) { -+ if (check_stereo || (ac97->flags & AC97_STEREO_MUTES)) { - /* check whether both mute bits work */ - val1 = val | 0x8080; - snd_ac97_write(ac97, reg, val1); -@@ -1254,7 +1254,7 @@ static int snd_ac97_cvol_new(snd_card_t - /* - * create a mute-switch and a volume for normal stereo/mono controls - */ --static int snd_ac97_cmix_new(snd_card_t *card, const char *pfx, int reg, ac97_t *ac97) -+static int snd_ac97_cmix_new_stereo(snd_card_t *card, const char *pfx, int reg, int check_stereo, ac97_t *ac97) - { - int err; - char name[44]; -@@ -1265,7 +1265,7 @@ static int snd_ac97_cmix_new(snd_card_t - - if (snd_ac97_try_bit(ac97, reg, 15)) { - sprintf(name, "%s Switch", pfx); -- if ((err = snd_ac97_cmute_new(card, name, reg, ac97)) < 0) -+ if ((err = snd_ac97_cmute_new_stereo(card, name, reg, check_stereo, ac97)) < 0) - return err; - } - check_volume_resolution(ac97, reg, &lo_max, &hi_max); -@@ -1277,6 +1277,8 @@ static int snd_ac97_cmix_new(snd_card_t - return 0; - } - -+#define snd_ac97_cmix_new(card, pfx, reg, ac97) snd_ac97_cmix_new_stereo(card, pfx, reg, 0, ac97) -+#define snd_ac97_cmute_new(card, name, reg, ac97) snd_ac97_cmute_new_stereo(card, name, reg, 0, 
ac97) - - static unsigned int snd_ac97_determine_spdif_rates(ac97_t *ac97); - -@@ -1327,7 +1329,8 @@ static int snd_ac97_mixer_build(ac97_t * - - /* build surround controls */ - if (snd_ac97_try_volume_mix(ac97, AC97_SURROUND_MASTER)) { -- if ((err = snd_ac97_cmix_new(card, "Surround Playback", AC97_SURROUND_MASTER, ac97)) < 0) -+ /* Surround Master (0x38) is with stereo mutes */ -+ if ((err = snd_ac97_cmix_new_stereo(card, "Surround Playback", AC97_SURROUND_MASTER, 1, ac97)) < 0) - return err; - } - -diff --git a/sound/usb/usbaudio.c b/sound/usb/usbaudio.c ---- a/sound/usb/usbaudio.c -+++ b/sound/usb/usbaudio.c -@@ -3276,7 +3276,7 @@ static void snd_usb_audio_disconnect(str - } - usb_chip[chip->index] = NULL; - up(&register_mutex); -- snd_card_free_in_thread(card); -+ snd_card_free(card); - } else { - up(&register_mutex); - } -diff --git a/sound/usb/usx2y/usbusx2y.c b/sound/usb/usx2y/usbusx2y.c ---- a/sound/usb/usx2y/usbusx2y.c -+++ b/sound/usb/usx2y/usbusx2y.c -@@ -1,6 +1,11 @@ - /* - * usbusy2y.c - ALSA USB US-428 Driver - * -+2005-04-14 Karsten Wiese -+ Version 0.8.7.2: -+ Call snd_card_free() instead of snd_card_free_in_thread() to prevent oops with dead keyboard symptom. -+ Tested ok with kernel 2.6.12-rc2. -+ - 2004-12-14 Karsten Wiese - Version 0.8.7.1: - snd_pcm_open for rawusb pcm-devices now returns -EBUSY if called without rawusb's hwdep device being open. -@@ -143,7 +148,7 @@ - - - MODULE_AUTHOR("Karsten Wiese <annabellesgarden@yahoo.de>"); --MODULE_DESCRIPTION("TASCAM "NAME_ALLCAPS" Version 0.8.7.1"); -+MODULE_DESCRIPTION("TASCAM "NAME_ALLCAPS" Version 0.8.7.2"); - MODULE_LICENSE("GPL"); - MODULE_SUPPORTED_DEVICE("{{TASCAM(0x1604), "NAME_ALLCAPS"(0x8001)(0x8005)(0x8007) }}"); - -@@ -430,8 +435,6 @@ static void usX2Y_usb_disconnect(struct - if (ptr) { - usX2Ydev_t* usX2Y = usX2Y((snd_card_t*)ptr); - struct list_head* p; -- if (usX2Y->chip_status == USX2Y_STAT_CHIP_HUP) // on 2.6.1 kernel snd_usbmidi_disconnect() -- return; // calls us back. 
better leave :-) . - usX2Y->chip.shutdown = 1; - usX2Y->chip_status = USX2Y_STAT_CHIP_HUP; - usX2Y_unlinkSeq(&usX2Y->AS04); -@@ -443,7 +446,7 @@ static void usX2Y_usb_disconnect(struct - } - if (usX2Y->us428ctls_sharedmem) - wake_up(&usX2Y->us428ctls_wait_queue_head); -- snd_card_free_in_thread((snd_card_t*)ptr); -+ snd_card_free((snd_card_t*)ptr); - } - } - diff --git a/patches/linux-2.6.11/udp-frag.patch b/patches/linux-2.6.11/udp-frag.patch deleted file mode 100644 index 9e8a26eb20..0000000000 --- a/patches/linux-2.6.11/udp-frag.patch +++ /dev/null @@ -1,55 +0,0 @@ -diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c ---- a/net/ipv4/udp.c -+++ b/net/ipv4/udp.c -@@ -738,7 +738,7 @@ int udp_ioctl(struct sock *sk, int cmd, - unsigned long amount; - - amount = 0; -- spin_lock_irq(&sk->sk_receive_queue.lock); -+ spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* -@@ -748,7 +748,7 @@ int udp_ioctl(struct sock *sk, int cmd, - */ - amount = skb->len - sizeof(struct udphdr); - } -- spin_unlock_irq(&sk->sk_receive_queue.lock); -+ spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); - } - -@@ -848,12 +848,12 @@ csum_copy_err: - /* Clear queue. 
*/ - if (flags&MSG_PEEK) { - int clear = 0; -- spin_lock_irq(&sk->sk_receive_queue.lock); -+ spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } -- spin_unlock_irq(&sk->sk_receive_queue.lock); -+ spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } -@@ -1334,7 +1334,7 @@ unsigned int udp_poll(struct file *file, - struct sk_buff_head *rcvq = &sk->sk_receive_queue; - struct sk_buff *skb; - -- spin_lock_irq(&rcvq->lock); -+ spin_lock_bh(&rcvq->lock); - while ((skb = skb_peek(rcvq)) != NULL) { - if (udp_checksum_complete(skb)) { - UDP_INC_STATS_BH(UDP_MIB_INERRORS); -@@ -1345,7 +1345,7 @@ unsigned int udp_poll(struct file *file, - break; - } - } -- spin_unlock_irq(&rcvq->lock); -+ spin_unlock_bh(&rcvq->lock); - - /* nothing to see, move along */ - if (skb == NULL) - diff --git a/patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch b/patches/linux-2.6.12/i386-cpu-hotplug-updated-for-mm.patch index ec39143743..abd3c2af3c 100644 --- a/patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch +++ b/patches/linux-2.6.12/i386-cpu-hotplug-updated-for-mm.patch @@ -1,65 +1,7 @@ - -From: Zwane Mwaikambo <zwane@linuxpower.ca> - -Find attached the i386 cpu hotplug patch updated for Ingo's latest round of -goodies. In order to avoid dumping cpu hotplug code into kernel/irq/* i -dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The -difference being that on cpu offline, fixup_irqs() is called before we -clear the cpu from cpu_online_map and a long delay in order to ensure that -we never have any queued external interrupts on the APICs. Due to my usual -test victims being in boxes a continent away this hasn't been tested, but -i'll cover bug reports (nudge, Nathan! ;) - -1) Add CONFIG_HOTPLUG_CPU -2) disable local APIC timer on dead cpus. -3) Disable preempt around irq balancing to prevent CPUs going down. 
-4) Print irq stats for all possible cpus. -5) Debugging check for interrupts on offline cpus. -6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. -7) play_dead() for offline cpus to spin inside. -8) Handle offline cpus set in flush_tlb_others(). -9) Grab lock earlier in smp_call_function() to prevent CPUs going down. -10) Implement __cpu_disable() and __cpu_die(). -11) Enable local interrupts in cpu_enable() after fixup_irqs() -12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. -13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. - -Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> -DESC -ppc64: fix hotplug cpu -EDESC -From: Zwane Mwaikambo <zwane@fsmlabs.com> - -I seem to have broken this when I moved the clearing of the dying cpu to -arch specific code. - -Signed-off-by: Zwane Mwaikambo <zwane@fsmlabs.com> -Signed-off-by: Andrew Morton <akpm@osdl.org> ---- - - 25-akpm/arch/i386/Kconfig | 9 ++ - 25-akpm/arch/i386/kernel/apic.c | 3 - 25-akpm/arch/i386/kernel/io_apic.c | 2 - 25-akpm/arch/i386/kernel/irq.c | 66 +++++++++++++++++---- - 25-akpm/arch/i386/kernel/msr.c | 2 - 25-akpm/arch/i386/kernel/process.c | 35 +++++++++++ - 25-akpm/arch/i386/kernel/smp.c | 25 +++++--- - 25-akpm/arch/i386/kernel/smpboot.c | 98 ++++++++++++++++++++++++++++++-- - 25-akpm/arch/i386/kernel/traps.c | 8 ++ - 25-akpm/arch/ia64/kernel/smpboot.c | 3 - 25-akpm/arch/ppc64/kernel/pSeries_smp.c | 5 + - 25-akpm/arch/s390/kernel/smp.c | 4 - - 25-akpm/include/asm-i386/cpu.h | 2 - 25-akpm/include/asm-i386/irq.h | 4 + - 25-akpm/include/asm-i386/smp.h | 3 - 25-akpm/kernel/cpu.c | 14 +--- - arch/ppc64/kernel/smp.c | 0 - 17 files changed, 242 insertions(+), 41 deletions(-) - -diff -puN arch/i386/Kconfig~i386-cpu-hotplug-updated-for-mm arch/i386/Kconfig ---- 25/arch/i386/Kconfig~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/Kconfig 2005-02-23 02:20:06.000000000 -0800 -@@ -1205,6 +1205,15 @@ config 
SCx200 +diff -Naur linux-2.6.12.orig/arch/i386/Kconfig linux-2.6.12/arch/i386/Kconfig +--- linux-2.6.12.orig/arch/i386/Kconfig 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/Kconfig 2005-07-08 12:34:10.000000000 -0400 +@@ -1226,6 +1226,15 @@ This support is also available as a module. If compiled as a module, it will be called scx200. @@ -75,9 +17,9 @@ diff -puN arch/i386/Kconfig~i386-cpu-hotplug-updated-for-mm arch/i386/Kconfig source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" -diff -puN arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/apic.c ---- 25/arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/apic.c 2005-02-23 02:20:06.000000000 -0800 +diff -Naur linux-2.6.12.orig/arch/i386/kernel/apic.c linux-2.6.12/arch/i386/kernel/apic.c +--- linux-2.6.12.orig/arch/i386/kernel/apic.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/apic.c 2005-07-08 12:34:10.000000000 -0400 @@ -26,6 +26,7 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> @@ -86,7 +28,7 @@ diff -puN arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kern #include <asm/atomic.h> #include <asm/smp.h> -@@ -1048,7 +1049,7 @@ void __init setup_secondary_APIC_clock(v +@@ -1048,7 +1049,7 @@ setup_APIC_timer(calibration_result); } @@ -95,10 +37,10 @@ diff -puN arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kern { if (using_apic_timer) { unsigned long v; -diff -puN arch/i386/kernel/io_apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/io_apic.c ---- 25/arch/i386/kernel/io_apic.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/io_apic.c 2005-02-23 02:20:06.000000000 -0800 -@@ -576,9 +576,11 @@ static int balanced_irq(void *unused) +diff -Naur linux-2.6.12.orig/arch/i386/kernel/io_apic.c linux-2.6.12/arch/i386/kernel/io_apic.c +--- 
linux-2.6.12.orig/arch/i386/kernel/io_apic.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/io_apic.c 2005-07-08 12:34:10.000000000 -0400 +@@ -576,9 +576,11 @@ try_to_freeze(PF_FREEZE); if (time_after(jiffies, prev_balance_time+balanced_irq_interval)) { @@ -110,9 +52,9 @@ diff -puN arch/i386/kernel/io_apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/k } } return 0; -diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/irq.c ---- 25/arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/irq.c 2005-02-23 02:20:06.000000000 -0800 +diff -Naur linux-2.6.12.orig/arch/i386/kernel/irq.c linux-2.6.12/arch/i386/kernel/irq.c +--- linux-2.6.12.orig/arch/i386/kernel/irq.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/irq.c 2005-07-08 12:36:06.000000000 -0400 @@ -15,6 +15,9 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -121,9 +63,9 @@ diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne +#include <linux/cpu.h> +#include <linux/delay.h> - #ifndef CONFIG_X86_LOCAL_APIC - /* -@@ -209,9 +212,8 @@ int show_interrupts(struct seq_file *p, + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; + EXPORT_PER_CPU_SYMBOL(irq_stat); +@@ -210,9 +213,8 @@ if (i == 0) { seq_printf(p, " "); @@ -135,7 +77,7 @@ diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne seq_putc(p, '\n'); } -@@ -224,9 +226,8 @@ int show_interrupts(struct seq_file *p, +@@ -225,9 +227,8 @@ #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else @@ -147,7 +89,7 @@ diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne #endif seq_printf(p, " %14s", irq_desc[i].handler->typename); seq_printf(p, " %s", action->name); -@@ -239,16 +240,13 @@ skip: +@@ -240,16 +241,13 @@ spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, 
"NMI: "); @@ -155,20 +97,20 @@ diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne - if (cpu_online(j)) - seq_printf(p, "%10u ", nmi_count(j)); + for_each_cpu(j) -+ seq_printf(p, "%10u ", nmi_count(j)); ++ seq_printf(p, "%10u ", nmi_count(j)); seq_putc(p, '\n'); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", -- irq_stat[j].apic_timer_irqs); +- per_cpu(irq_stat,j).apic_timer_irqs); + for_each_cpu(j) -+ seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs); ++ seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); seq_putc(p, '\n'); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -@@ -258,3 +256,45 @@ skip: +@@ -259,3 +257,45 @@ } return 0; } @@ -214,10 +156,10 @@ diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne +} +#endif + -diff -puN arch/i386/kernel/msr.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/msr.c ---- 25/arch/i386/kernel/msr.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/msr.c 2005-02-23 02:20:06.000000000 -0800 -@@ -260,7 +260,7 @@ static struct file_operations msr_fops = +diff -Naur linux-2.6.12.orig/arch/i386/kernel/msr.c linux-2.6.12/arch/i386/kernel/msr.c +--- linux-2.6.12.orig/arch/i386/kernel/msr.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/msr.c 2005-07-08 12:34:10.000000000 -0400 +@@ -260,7 +260,7 @@ .open = msr_open, }; @@ -226,9 +168,9 @@ diff -puN arch/i386/kernel/msr.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne { int err = 0; struct class_device *class_err; -diff -puN arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/process.c ---- 25/arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/process.c 2005-02-23 02:20:06.000000000 -0800 +diff -Naur linux-2.6.12.orig/arch/i386/kernel/process.c 
linux-2.6.12/arch/i386/kernel/process.c +--- linux-2.6.12.orig/arch/i386/kernel/process.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/process.c 2005-07-08 12:36:43.000000000 -0400 @@ -13,6 +13,7 @@ #include <stdarg.h> @@ -237,7 +179,7 @@ diff -puN arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm arch/i386/k #include <linux/errno.h> #include <linux/sched.h> #include <linux/fs.h> -@@ -55,6 +56,9 @@ +@@ -54,6 +55,9 @@ #include <linux/irq.h> #include <linux/err.h> @@ -246,8 +188,8 @@ diff -puN arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm arch/i386/k + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); - int hlt_counter; -@@ -139,6 +143,34 @@ static void poll_idle (void) + static int hlt_counter; +@@ -138,6 +142,34 @@ } } @@ -282,19 +224,19 @@ diff -puN arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm arch/i386/k /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a -@@ -162,6 +194,9 @@ void cpu_idle (void) +@@ -160,6 +192,9 @@ if (!idle) idle = default_idle; + if (cpu_is_offline(cpu)) + play_dead(); + - irq_stat[cpu].idle_timestamp = jiffies; + __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } -diff -puN arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/smpboot.c ---- 25/arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/smpboot.c 2005-02-23 02:20:06.000000000 -0800 +diff -Naur linux-2.6.12.orig/arch/i386/kernel/smpboot.c linux-2.6.12/arch/i386/kernel/smpboot.c +--- linux-2.6.12.orig/arch/i386/kernel/smpboot.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/smpboot.c 2005-07-08 12:34:10.000000000 -0400 @@ -44,6 +44,9 @@ #include <linux/smp_lock.h> #include <linux/irq.h> @@ -305,9 +247,9 @@ diff -puN arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/i386/k #include <linux/delay.h> #include 
<linux/mc146818rtc.h> -@@ -89,6 +92,9 @@ extern unsigned char trampoline_end []; - static unsigned char *trampoline_base; - static int trampoline_exec; +@@ -90,6 +93,9 @@ + + static void map_cpu_to_logical_apicid(void); +/* State of each CPU. */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -315,7 +257,7 @@ diff -puN arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/i386/k /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller -@@ -1095,6 +1101,9 @@ static void __init smp_boot_cpus(unsigne +@@ -1107,6 +1113,9 @@ who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) { @@ -325,7 +267,7 @@ diff -puN arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/i386/k smp_boot_cpus(max_cpus); } -@@ -1104,20 +1113,99 @@ void __devinit smp_prepare_boot_cpu(void +@@ -1116,20 +1125,99 @@ cpu_set(smp_processor_id(), cpu_callout_map); } @@ -430,9 +372,9 @@ diff -puN arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/i386/k local_irq_enable(); /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); -diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/smp.c ---- 25/arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/smp.c 2005-02-23 02:20:06.000000000 -0800 +diff -Naur linux-2.6.12.orig/arch/i386/kernel/smp.c linux-2.6.12/arch/i386/kernel/smp.c +--- linux-2.6.12.orig/arch/i386/kernel/smp.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/smp.c 2005-07-08 12:34:10.000000000 -0400 @@ -19,6 +19,7 @@ #include <linux/mc146818rtc.h> #include <linux/cache.h> @@ -441,7 +383,7 @@ diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne #include <asm/mtrr.h> #include <asm/tlbflush.h> -@@ -163,7 +164,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu +@@ -163,7 +164,7 @@ unsigned long flags; local_irq_save(flags); @@ -450,7 +392,7 @@ diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne /* * Wait for idle. */ -@@ -345,21 +346,21 @@ out: +@@ -345,21 +346,21 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { @@ -477,7 +419,7 @@ diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. 
-@@ -484,6 +485,7 @@ void smp_send_nmi_allbutself(void) +@@ -474,6 +475,7 @@ */ void smp_send_reschedule(int cpu) { @@ -485,7 +427,7 @@ diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } -@@ -524,10 +526,16 @@ int smp_call_function (void (*func) (voi +@@ -514,10 +516,16 @@ */ { struct call_data_struct data; @@ -504,7 +446,7 @@ diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); -@@ -539,7 +547,6 @@ int smp_call_function (void (*func) (voi +@@ -529,7 +537,6 @@ if (wait) atomic_set(&data.finished, 0); @@ -512,10 +454,10 @@ diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kerne call_data = &data; mb(); -diff -puN arch/i386/kernel/traps.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/traps.c ---- 25/arch/i386/kernel/traps.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/traps.c 2005-02-23 02:20:06.000000000 -0800 -@@ -669,6 +669,14 @@ fastcall void do_nmi(struct pt_regs * re +diff -Naur linux-2.6.12.orig/arch/i386/kernel/traps.c linux-2.6.12/arch/i386/kernel/traps.c +--- linux-2.6.12.orig/arch/i386/kernel/traps.c 2005-07-08 12:33:40.000000000 -0400 ++++ linux-2.6.12/arch/i386/kernel/traps.c 2005-07-08 12:34:10.000000000 -0400 +@@ -624,6 +624,14 @@ nmi_enter(); cpu = smp_processor_id(); @@ -530,26 +472,28 @@ diff -puN arch/i386/kernel/traps.c~i386-cpu-hotplug-updated-for-mm arch/i386/ker ++nmi_count(cpu); if (!nmi_callback(regs, cpu)) -diff -puN arch/ia64/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/ia64/kernel/smpboot.c ---- 25/arch/ia64/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/ia64/kernel/smpboot.c 2005-02-23 02:20:06.000000000 -0800 -@@ -590,9 +590,10 @@ int __cpu_disable(void) - if (cpu == 0) - return -EBUSY; +diff -Naur 
linux-2.6.12.orig/arch/ppc64/kernel/pSeries_smp.c linux-2.6.12/arch/ppc64/kernel/pSeries_smp.c +--- linux-2.6.12.orig/arch/ppc64/kernel/pSeries_smp.c 2005-07-08 12:33:42.000000000 -0400 ++++ linux-2.6.12/arch/ppc64/kernel/pSeries_smp.c 2005-07-08 12:34:10.000000000 -0400 +@@ -92,10 +92,13 @@ + int pSeries_cpu_disable(void) + { ++ int cpu = smp_processor_id(); ++ + cpu_clear(cpu, cpu_online_map); - fixup_irqs(); - local_flush_tlb_all(); -- printk ("Disabled cpu %u\n", smp_processor_id()); -+ printk("Disabled cpu %u\n", cpu); - return 0; - } + systemcfg->processorCount--; + + /*fix boot_cpuid here*/ +- if (smp_processor_id() == boot_cpuid) ++ if (cpu == boot_cpuid) + boot_cpuid = any_online_cpu(cpu_online_map); -diff -puN arch/ppc64/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/ppc64/kernel/smp.c -diff -puN arch/s390/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/s390/kernel/smp.c ---- 25/arch/s390/kernel/smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/s390/kernel/smp.c 2005-02-23 02:20:06.000000000 -0800 -@@ -679,12 +679,14 @@ __cpu_disable(void) + /* FIXME: abstract this to not be platform specific later on */ +diff -Naur linux-2.6.12.orig/arch/s390/kernel/smp.c linux-2.6.12/arch/s390/kernel/smp.c +--- linux-2.6.12.orig/arch/s390/kernel/smp.c 2005-07-08 12:33:42.000000000 -0400 ++++ linux-2.6.12/arch/s390/kernel/smp.c 2005-07-08 12:34:10.000000000 -0400 +@@ -679,12 +679,14 @@ { unsigned long flags; ec_creg_mask_parms cr_parms; @@ -565,9 +509,9 @@ diff -puN arch/s390/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/s390/kerne #ifdef CONFIG_PFAULT /* Disable pfault pseudo page faults on this cpu. 
*/ -diff -puN include/asm-i386/cpu.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/cpu.h ---- 25/include/asm-i386/cpu.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/include/asm-i386/cpu.h 2005-02-23 02:20:06.000000000 -0800 +diff -Naur linux-2.6.12.orig/include/asm-i386/cpu.h linux-2.6.12/include/asm-i386/cpu.h +--- linux-2.6.12.orig/include/asm-i386/cpu.h 2005-07-08 12:33:58.000000000 -0400 ++++ linux-2.6.12/include/asm-i386/cpu.h 2005-07-08 12:34:10.000000000 -0400 @@ -5,6 +5,7 @@ #include <linux/cpu.h> #include <linux/topology.h> @@ -576,16 +520,16 @@ diff -puN include/asm-i386/cpu.h~i386-cpu-hotplug-updated-for-mm include/asm-i38 #include <asm/node.h> -@@ -17,4 +18,5 @@ extern int arch_register_cpu(int num); +@@ -16,4 +17,5 @@ extern void arch_unregister_cpu(int); #endif +DECLARE_PER_CPU(int, cpu_state); #endif /* _ASM_I386_CPU_H_ */ -diff -puN include/asm-i386/irq.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/irq.h ---- 25/include/asm-i386/irq.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/include/asm-i386/irq.h 2005-02-23 02:20:06.000000000 -0800 -@@ -38,4 +38,8 @@ extern void release_vm86_irqs(struct tas +diff -Naur linux-2.6.12.orig/include/asm-i386/irq.h linux-2.6.12/include/asm-i386/irq.h +--- linux-2.6.12.orig/include/asm-i386/irq.h 2005-07-08 12:33:58.000000000 -0400 ++++ linux-2.6.12/include/asm-i386/irq.h 2005-07-08 12:34:10.000000000 -0400 +@@ -38,4 +38,8 @@ extern int irqbalance_disable(char *str); #endif @@ -594,10 +538,10 @@ diff -puN include/asm-i386/irq.h~i386-cpu-hotplug-updated-for-mm include/asm-i38 +#endif + #endif /* _ASM_IRQ_H */ -diff -puN include/asm-i386/smp.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/smp.h ---- 25/include/asm-i386/smp.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/include/asm-i386/smp.h 2005-02-23 02:20:06.000000000 -0800 -@@ -85,6 +85,9 @@ static __inline int logical_smp_processo +diff -Naur 
linux-2.6.12.orig/include/asm-i386/smp.h linux-2.6.12/include/asm-i386/smp.h +--- linux-2.6.12.orig/include/asm-i386/smp.h 2005-07-08 12:33:58.000000000 -0400 ++++ linux-2.6.12/include/asm-i386/smp.h 2005-07-08 12:34:10.000000000 -0400 +@@ -83,6 +83,9 @@ } #endif @@ -607,10 +551,10 @@ diff -puN include/asm-i386/smp.h~i386-cpu-hotplug-updated-for-mm include/asm-i38 #endif /* !__ASSEMBLY__ */ #define NO_PROC_ID 0xFF /* No processor magic marker */ -diff -puN kernel/cpu.c~i386-cpu-hotplug-updated-for-mm kernel/cpu.c ---- 25/kernel/cpu.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/kernel/cpu.c 2005-02-23 02:20:06.000000000 -0800 -@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused) +diff -Naur linux-2.6.12.orig/kernel/cpu.c linux-2.6.12/kernel/cpu.c +--- linux-2.6.12.orig/kernel/cpu.c 2005-07-08 12:33:26.000000000 -0400 ++++ linux-2.6.12/kernel/cpu.c 2005-07-08 12:34:10.000000000 -0400 +@@ -63,19 +63,15 @@ { int err; @@ -635,22 +579,3 @@ diff -puN kernel/cpu.c~i386-cpu-hotplug-updated-for-mm kernel/cpu.c } int cpu_down(unsigned int cpu) -diff -puN arch/ppc64/kernel/pSeries_smp.c~i386-cpu-hotplug-updated-for-mm arch/ppc64/kernel/pSeries_smp.c ---- 25/arch/ppc64/kernel/pSeries_smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:08.000000000 -0800 -+++ 25-akpm/arch/ppc64/kernel/pSeries_smp.c 2005-02-23 02:20:08.000000000 -0800 -@@ -86,10 +86,13 @@ static int query_cpu_stopped(unsigned in - - int pSeries_cpu_disable(void) - { -+ int cpu = smp_processor_id(); -+ -+ cpu_clear(cpu, cpu_online_map); - systemcfg->processorCount--; - - /*fix boot_cpuid here*/ -- if (smp_processor_id() == boot_cpuid) -+ if (cpu == boot_cpuid) - boot_cpuid = any_online_cpu(cpu_online_map); - - /* FIXME: abstract this to not be platform specific later on */ -_ diff --git a/patches/linux-2.6.11/net-csum.patch b/patches/linux-2.6.12/net-csum.patch index 115cc1ed13..37a1fbf9c1 100644 --- a/patches/linux-2.6.11/net-csum.patch +++ 
b/patches/linux-2.6.12/net-csum.patch @@ -1,14 +1,3 @@ -diff -ur linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_tcp.c ---- linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2005-05-27 11:47:48 +01:00 -+++ linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2005-05-27 11:48:07 +01:00 -@@ -803,6 +803,7 @@ - */ - /* FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING -+ && skb->ip_summed != CHECKSUM_UNNECESSARY - && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, - skb->ip_summed == CHECKSUM_HW ? skb->csum - : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { diff -ur linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_udp.c --- linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2005-05-27 11:47:48 +01:00 +++ linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2005-05-27 11:48:07 +01:00 diff --git a/patches/linux-2.6.11/rcu-nohz.patch b/patches/linux-2.6.12/rcu-nohz.patch index d7bafb3a62..d7bafb3a62 100644 --- a/patches/linux-2.6.11/rcu-nohz.patch +++ b/patches/linux-2.6.12/rcu-nohz.patch diff --git a/patches/linux-2.6.11/smp-alts.patch b/patches/linux-2.6.12/smp-alts.patch index 5d18c5e71a..5d18c5e71a 100644 --- a/patches/linux-2.6.11/smp-alts.patch +++ b/patches/linux-2.6.12/smp-alts.patch diff --git a/patches/linux-2.6.11/x86_64-linux.patch b/patches/linux-2.6.12/x86_64-linux.patch index 57d4f07a06..57d4f07a06 100644 --- a/patches/linux-2.6.11/x86_64-linux.patch +++ b/patches/linux-2.6.12/x86_64-linux.patch |