 xen/arch/x86/Makefile                        |   2
 xen/arch/x86/numa.c                          | 302
 xen/arch/x86/setup.c                         |  34
 xen/arch/x86/smpboot.c                       |   4
 xen/arch/x86/srat.c                          | 325
 xen/drivers/acpi/Makefile                    |   1
 xen/drivers/acpi/numa.c                      | 216
 xen/include/asm-x86/acpi.h                   |   3
 xen/include/asm-x86/config.h                 |   5
 xen/include/asm-x86/mach-generic/mach_apic.h |   6
 xen/include/asm-x86/numa.h                   |  65
 xen/include/asm-x86/numnodes.h               |  26
 xen/include/asm-x86/topology.h               |  40
 xen/include/xen/config.h                     |   2
 xen/include/xen/nodemask.h                   | 342
 xen/include/xen/numa.h                       |  35
 xen/include/xen/topology.h                   |  27
 17 files changed, 1428 insertions(+), 7 deletions(-)
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 31f2793fb9..89cc508d02 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -28,12 +28,14 @@ obj-y += microcode.o
obj-y += mm.o
obj-y += mpparse.o
obj-y += nmi.o
+obj-y += numa.o
obj-y += physdev.o
obj-y += rwlock.o
obj-y += setup.o
obj-y += shutdown.o
obj-y += smp.o
obj-y += smpboot.o
+obj-y += srat.o
obj-y += string.o
obj-y += sysctl.o
obj-y += time.o
diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
new file mode 100644
index 0000000000..86bff703f3
--- /dev/null
+++ b/xen/arch/x86/numa.c
@@ -0,0 +1,302 @@
+/*
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
+ */
+
+#include <xen/mm.h>
+#include <xen/string.h>
+#include <xen/init.h>
+#include <xen/ctype.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+#include <xen/keyhandler.h>
+#include <xen/time.h>
+
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+/* from proto.h */
+#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
+
+struct node_data node_data[MAX_NUMNODES];
+
+int memnode_shift;
+u8 memnodemap[NODEMAPSIZE];
+
+unsigned int cpu_to_node[NR_CPUS] __read_mostly = {
+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+nodemask_t node_online_map = { { [0] = 1UL } };
+
+int numa_off __initdata;
+
+int acpi_numa __initdata;
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns:
+ * 1 if OK
+ * 0 if memnodemap[] too small (or shift too small)
+ * -1 if node overlap or lost RAM (shift too big)
+ */
+static int __init
+populate_memnodemap(const struct node *nodes, int numnodes, int shift)
+{
+ int i;
+ int res = -1;
+ unsigned long addr, end;
+
+ if (shift >= 64)
+ return -1;
+ memset(memnodemap, 0xff, sizeof(memnodemap));
+ for (i = 0; i < numnodes; i++) {
+ addr = nodes[i].start;
+ end = nodes[i].end;
+ if (addr >= end)
+ continue;
+ if ((end >> shift) >= NODEMAPSIZE)
+ return 0;
+ do {
+ if (memnodemap[addr >> shift] != 0xff)
+ return -1;
+ memnodemap[addr >> shift] = i;
+ addr += (1UL << shift);
+ } while (addr < end);
+ res = 1;
+ }
+ return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+ int shift = 20;
+
+ while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+ shift++;
+
+ printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
+ shift);
+
+ if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+ printk(KERN_INFO
+ "Your memory is not aligned you need to rebuild your kernel "
+ "with a bigger NODEMAPSIZE shift=%d\n",
+ shift);
+ return -1;
+ }
+ return shift;
+}
+
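
An aside on the structure being built here: memnodemap[] is a flat byte array indexed by physical address >> memnode_shift, and compute_hash_shift() simply searches for the largest shift at which every node's range still lands on distinct entries. Below is a minimal standalone sketch of the same search, assuming two invented 1GB nodes; this is illustrative user-space C, not Xen code:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define NODEMAPSIZE 0xfff

    struct node { uint64_t start, end; };
    static uint8_t memnodemap[NODEMAPSIZE];

    /* Same contract as populate_memnodemap() above:
     * 1 = OK, 0 = map too small, -1 = overlap/lost RAM. */
    static int populate(const struct node *nodes, int n, int shift)
    {
        int i, res = -1;
        uint64_t addr;

        memset(memnodemap, 0xff, sizeof(memnodemap));
        for (i = 0; i < n; i++) {
            if (nodes[i].start >= nodes[i].end)
                continue;
            if ((nodes[i].end >> shift) >= NODEMAPSIZE)
                return 0;
            for (addr = nodes[i].start; addr < nodes[i].end;
                 addr += (uint64_t)1 << shift) {
                if (memnodemap[addr >> shift] != 0xff)
                    return -1;
                memnodemap[addr >> shift] = (uint8_t)i;
            }
            res = 1;
        }
        return res;
    }

    int main(void)
    {
        struct node nodes[2] = {   /* two fictitious 1GB nodes */
            { 0x00000000ULL, 0x40000000ULL },
            { 0x40000000ULL, 0x80000000ULL },
        };
        int shift = 20;

        while (populate(nodes, 2, shift + 1) >= 0)
            shift++;
        populate(nodes, 2, shift);
        /* phys_to_nid() then reduces to memnodemap[addr >> shift]. */
        printf("shift=%d, 0x50000000 -> node %d\n",
               shift, memnodemap[0x50000000ULL >> shift]);
        return 0;
    }

For these ranges the search settles on shift = 30, and address 0x50000000 resolves to node 1.
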
+/* initialize NODE_DATA given nodeid and start/end */
+void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
+{
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
+ NODE_DATA(nodeid)->node_id = nodeid;
+ NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+ NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+ node_set_online(nodeid);
+}
+
+void __init numa_init_array(void)
+{
+ int rr, i;
+ /*
+ * There are unfortunately some poorly designed mainboards around
+ * that only connect memory to a single CPU. This breaks the 1:1
+ * cpu->node mapping. To avoid this, fill in the mapping for all
+ * possible CPUs, as the number of CPUs is not known yet.
+ * We round-robin the existing nodes.
+ */
+ rr = first_node(node_online_map);
+ for (i = 0; i < NR_CPUS; i++) {
+ if (cpu_to_node[i] != NUMA_NO_NODE)
+ continue;
+ numa_set_node(i, rr);
+ rr = next_node(rr, node_online_map);
+ if (rr == MAX_NUMNODES)
+ rr = first_node(node_online_map);
+ }
+}
+
+#ifdef CONFIG_NUMA_EMU
+/* default to faking a single node as fallback for non-NUMA hardware */
+int numa_fake __initdata = 1;
+
+/* Numa emulation */
+static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+ struct node nodes[MAX_NUMNODES];
+ unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+
+ /* Kludge needed for the hash function */
+ if (hweight64(sz) > 1) {
+ unsigned long x = 1;
+ while ((x << 1) < sz)
+ x <<= 1;
+ if (x < sz/2)
+ printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
+ sz = x;
+ }
+
+ memset(&nodes,0,sizeof(nodes));
+ for (i = 0; i < numa_fake; i++) {
+ nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+ if (i == numa_fake-1)
+ sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
+ nodes[i].end = nodes[i].start + sz;
+ printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
+ i,
+ nodes[i].start, nodes[i].end,
+ (nodes[i].end - nodes[i].start) >> 20);
+ node_set_online(i);
+ }
+ memnode_shift = compute_hash_shift(nodes, numa_fake);
+ if (memnode_shift < 0) {
+ memnode_shift = 0;
+ printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+ return -1;
+ }
+ for_each_online_node(i)
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ numa_init_array();
+ return 0;
+}
+#endif
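
A worked example of the sizing kludge above: with 3GB of RAM and numa=fake=2, sz = 1.5GB has more than one bit set, so the loop rounds it down to the nearest power of two and the last node absorbs the remainder:

    sz = 3GB / 2       = 0x60000000   /* not a power of two */
    sz rounded down    = 0x40000000   /* 1GB */
    node 0: [0x00000000, 0x40000000)  /* 1GB */
    node 1: [0x40000000, 0xC0000000)  /* last node takes the 2GB remainder */

With 4GB and numa=fake=4 the per-node size is already a power of two, so every node gets exactly 1GB.
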
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+#ifdef CONFIG_ACPI_NUMA
+ if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+ end_pfn << PAGE_SHIFT))
+ return;
+#endif
+
+#ifdef CONFIG_NUMA_EMU
+ /* fake a numa node for non-numa hardware */
+ if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+ return;
+#endif
+
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+ printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
+ start_pfn << PAGE_SHIFT,
+ end_pfn << PAGE_SHIFT);
+ /* Set up a dummy node covering all memory: with shift 63, every
+ address below 2^63 hashes to memnodemap[0]. */
+ memnode_shift = 63;
+ memnodemap[0] = 0;
+ nodes_clear(node_online_map);
+ node_set_online(0);
+ for (i = 0; i < NR_CPUS; i++)
+ numa_set_node(i, 0);
+ node_to_cpumask[0] = cpumask_of_cpu(0);
+ setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+__cpuinit void numa_add_cpu(int cpu)
+{
+ set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+ cpu_to_node[cpu] = node;
+}
+
+/* [numa=off] [numa=fake=<n>] [numa=noacpi] */
+__init int numa_setup(char *opt)
+{
+ if (!strncmp(opt,"off",3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5)) {
+ numa_fake = simple_strtoul(opt+5, NULL, 0);
+ if (numa_fake >= MAX_NUMNODES)
+ numa_fake = MAX_NUMNODES;
+ }
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt,"noacpi",6))
+ acpi_numa = -1;
+#endif
+ return 1;
+}
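
Given the parser above, these are the forms the "numa=" option accepts (values illustrative):

    numa=off        # disable NUMA handling entirely
    numa=fake=4     # CONFIG_NUMA_EMU: split memory into 4 emulated nodes
    numa=noacpi     # CONFIG_ACPI_NUMA: ignore the ACPI SRAT/SLIT tables

The hook-up itself happens in setup.c later in this patch, via custom_param("numa", numa_setup).
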
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
+ * apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and the faked-node case (when running a kernel compiled
+ * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
+ * is already initialised in a round-robin manner at numa_init_array,
+ * prior to this call, and that initialisation is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++) {
+ u8 apicid = x86_cpu_to_apicid[i];
+ if (apicid == BAD_APICID)
+ continue;
+ if (apicid_to_node[apicid] == NUMA_NO_NODE)
+ continue;
+ numa_set_node(i,apicid_to_node[apicid]);
+ }
+}
+
+EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(node_to_cpumask);
+EXPORT_SYMBOL(memnode_shift);
+EXPORT_SYMBOL(memnodemap);
+EXPORT_SYMBOL(node_data);
+
+static void dump_numa(unsigned char key)
+{
+ s_time_t now = NOW();
+ int i;
+
+ printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
+ (u32)(now>>32), (u32)now);
+
+ for_each_online_node(i) {
+ unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
+ printk("idx%d -> NODE%d start->%lu size->%lu\n",
+ i, NODE_DATA(i)->node_id,
+ NODE_DATA(i)->node_start_pfn,
+ NODE_DATA(i)->node_spanned_pages);
+ /* sanity check phys_to_nid() */
+ printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa),
+ NODE_DATA(i)->node_id);
+ }
+ for_each_online_cpu(i)
+ printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]);
+}
+
+static __init int register_numa_trigger(void)
+{
+ register_keyhandler('u', dump_numa, "dump numa info");
+ return 0;
+}
+__initcall(register_numa_trigger);
+
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 2c8b638944..03da8af41c 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -16,6 +16,7 @@
#include <xen/percpu.h>
#include <xen/hypercall.h>
#include <xen/keyhandler.h>
+#include <xen/numa.h>
#include <public/version.h>
#include <asm/bitops.h>
#include <asm/smp.h>
@@ -25,10 +26,12 @@
#include <asm/desc.h>
#include <asm/shadow.h>
#include <asm/e820.h>
+#include <asm/numa.h>
#include <acm/acm_hooks.h>
extern void dmi_scan_machine(void);
extern void generic_apic_probe(void);
+extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
/*
* opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
@@ -60,6 +63,9 @@ boolean_param("watchdog", opt_watchdog);
static void parse_acpi_param(char *s);
custom_param("acpi", parse_acpi_param);
+extern int numa_setup(char *s);
+custom_param("numa", numa_setup);
+
/* **** Linux config option: propagated to domain0. */
/* acpi_skip_timer_override: Skip IRQ0 overrides. */
extern int acpi_skip_timer_override;
@@ -257,6 +263,20 @@ static void __init init_idle_domain(void)
setup_idle_pagetable();
}
+static void srat_detect_node(int cpu)
+{
+ unsigned node;
+ u8 apicid = x86_cpu_to_apicid[cpu];
+
+ node = apicid_to_node[apicid];
+ if (node == NUMA_NO_NODE)
+ node = 0;
+ numa_set_node(cpu, node);
+
+ if (acpi_numa > 0)
+ printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
+}
+
void __init __start_xen(multiboot_info_t *mbi)
{
char __cmdline[] = "", *cmdline = __cmdline;
@@ -485,6 +505,12 @@ void __init __start_xen(multiboot_info_t *mbi)
init_frametable();
+ acpi_boot_table_init();
+
+ acpi_numa_init();
+
+ numa_initmem_init(0, max_page);
+
end_boot_allocator();
/* Initialise the Xen heap, skipping RAM holes. */
@@ -536,9 +562,10 @@ void __init __start_xen(multiboot_info_t *mbi)
generic_apic_probe();
- acpi_boot_table_init();
acpi_boot_init();
+ init_cpu_to_node();
+
if ( smp_found_config )
get_smp_config();
@@ -589,6 +616,11 @@ void __init __start_xen(multiboot_info_t *mbi)
break;
if ( !cpu_online(i) )
__cpu_up(i);
+
+ /* setup cpu_to_node[] */
+ srat_detect_node(i);
+ /* setup node_to_cpumask based on cpu_to_node[] */
+ numa_add_cpu(i);
}
printk("Brought up %ld CPUs\n", (long)num_online_cpus());
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index eb2d21111c..b971069cf2 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -43,6 +43,8 @@
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/serial.h>
+#include <xen/numa.h>
+#include <asm/numa.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/desc.h>
@@ -628,7 +630,7 @@ u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICI
static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
- int apicid = logical_smp_processor_id();
+ int apicid = hard_smp_processor_id();
cpu_2_logical_apicid[cpu] = apicid;
map_cpu_to_node(cpu, apicid_to_node(apicid));
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
new file mode 100644
index 0000000000..dfa85b2539
--- /dev/null
+++ b/xen/arch/x86/srat.c
@@ -0,0 +1,325 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ *
+ * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
+ */
+
+#if 0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/proto.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+#include <xen/topology.h>
+#include <asm/e820.h>
+#endif
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/inttypes.h>
+#include <xen/nodemask.h>
+#include <xen/acpi.h>
+
+#include <asm/numa.h>
+#include <asm/page.h>
+
+static struct acpi_table_slit *acpi_slit;
+
+static nodemask_t nodes_parsed __initdata;
+static nodemask_t nodes_found __initdata;
+static struct node nodes[MAX_NUMNODES] __initdata;
+static u8 pxm2node[256] = { [0 ... 255] = 0xff };
+
+/* Too-small nodes confuse the VM badly. Usually they result
+ from BIOS bugs. */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+static int node_to_pxm(int n);
+
+int pxm_to_node(int pxm)
+{
+ if ((unsigned)pxm >= 256)
+ return -1;
+ /* Extend 0xff to (int)-1 */
+ return (signed char)pxm2node[pxm];
+}
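
The cast above is doing sign extension on purpose: unassigned pxm2node[] entries hold 0xff, and reading that byte back through signed char widens it to -1 for the caller:

    u8 raw = 0xff;
    int node = (signed char)raw;   /* -1, i.e. "no node" */
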
+
+static __init int setup_node(int pxm)
+{
+ unsigned node = pxm2node[pxm];
+ if (node == 0xff) {
+ if (nodes_weight(nodes_found) >= MAX_NUMNODES)
+ return -1;
+ node = first_unset_node(nodes_found);
+ node_set(node, nodes_found);
+ pxm2node[pxm] = node;
+ }
+ return pxm2node[pxm];
+}
+
+static __init int conflicting_nodes(u64 start, u64 end)
+{
+ int i;
+ for_each_node_mask(i, nodes_parsed) {
+ struct node *nd = &nodes[i];
+ if (nd->start == nd->end)
+ continue;
+ if (nd->end > start && nd->start < end)
+ return i;
+ if (nd->end == end && nd->start == start)
+ return i;
+ }
+ return -1;
+}
+
+static __init void cutoff_node(int i, u64 start, u64 end)
+{
+ struct node *nd = &nodes[i];
+ if (nd->start < start) {
+ nd->start = start;
+ if (nd->end < nd->start)
+ nd->start = nd->end;
+ }
+ if (nd->end > end) {
+ nd->end = end;
+ if (nd->start > nd->end)
+ nd->start = nd->end;
+ }
+}
+
+static __init void bad_srat(void)
+{
+ int i;
+ printk(KERN_ERR "SRAT: SRAT not used.\n");
+ acpi_numa = -1;
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ apicid_to_node[i] = NUMA_NO_NODE;
+}
+
+static __init inline int srat_disabled(void)
+{
+ return numa_off || acpi_numa < 0;
+}
+
+/*
+ * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
+ * up the NUMA heuristics, which want the local node to have a smaller
+ * distance than the others.
+ * Do some quick checks here and only use the SLIT if it passes.
+ */
+static __init int slit_valid(struct acpi_table_slit *slit)
+{
+ int i, j;
+ int d = slit->localities;
+ for (i = 0; i < d; i++) {
+ for (j = 0; j < d; j++) {
+ u8 val = slit->entry[d*i + j];
+ if (i == j) {
+ if (val != 10)
+ return 0;
+ } else if (val <= 10)
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+ if (!slit_valid(slit)) {
+ printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
+ return;
+ }
+ acpi_slit = slit;
+}
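
For reference, a SLIT that passes the check above for two localities (d = 2), with entries flattened row-major as entry[d*i + j]:

    /* to node:    0   1 */
    entry[] = {   10, 20,    /* from node 0 */
                  20, 10 };  /* from node 1 */

A table filled entirely with 10s, a common firmware shortcut, fails slit_valid() because every off-diagonal distance must be strictly greater than the local distance of 10.
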
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
+{
+ int pxm, node;
+ if (srat_disabled())
+ return;
+ if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+ bad_srat();
+ return;
+ }
+ if (pa->flags.enabled == 0)
+ return;
+ pxm = pa->proximity_domain;
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+ apicid_to_node[pa->apic_id] = node;
+ acpi_numa = 1;
+ printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+ pxm, pa->apic_id, node);
+}
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
+{
+ struct node *nd;
+ u64 start, end;
+ int node, pxm;
+ int i;
+
+ if (srat_disabled())
+ return;
+ if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
+ bad_srat();
+ return;
+ }
+ if (ma->flags.enabled == 0)
+ return;
+ start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
+ end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
+ pxm = ma->proximity_domain;
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+ bad_srat();
+ return;
+ }
+ /* It is fine to add this area to the nodes data; it will be used later. */
+ if (ma->flags.hot_pluggable == 1)
+ printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64"\n",
+ start, end);
+ i = conflicting_nodes(start, end);
+ if (i == node) {
+ printk(KERN_WARNING
+ "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
+ PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
+ } else if (i >= 0) {
+ printk(KERN_ERR
+ "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
+ PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
+ nodes[i].start, nodes[i].end);
+ bad_srat();
+ return;
+ }
+ nd = &nodes[node];
+ if (!node_test_and_set(node, nodes_parsed)) {
+ nd->start = start;
+ nd->end = end;
+ } else {
+ if (start < nd->start)
+ nd->start = start;
+ if (nd->end < end)
+ nd->end = end;
+ }
+ printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm,
+ nd->start, nd->end);
+}
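
A worked example of the 64-bit range assembly above: an SRAT entry describing 1GB that starts at the 4GB boundary carries base_addr_hi = 0x1, base_addr_lo = 0x0, length_hi = 0x0, length_lo = 0x40000000, giving:

    start = 0x0 | ((u64)0x1 << 32)  = 0x100000000
    end   = start + 0x40000000      = 0x140000000
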
+
+/* Sanity check to catch more bad SRATs (they are amazingly common).
+ Make sure the PXMs cover all memory. */
+static int nodes_cover_memory(void)
+{
+ int i;
+ u64 pxmram, e820ram;
+
+ pxmram = 0;
+ for_each_node_mask(i, nodes_parsed) {
+ u64 s = nodes[i].start >> PAGE_SHIFT;
+ u64 e = nodes[i].end >> PAGE_SHIFT;
+ pxmram += e - s;
+ }
+
+ e820ram = max_page;
+ /* We seem to lose some pages somewhere. Allow up to 1M pages (4GB) of slack. */
+ if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+ printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %"
+ PRIu64"MB e820 RAM. Not used.\n",
+ (pxmram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return 0;
+ }
+ return 1;
+}
+
+static void unparse_node(int node)
+{
+ int i;
+ node_clear(node, nodes_parsed);
+ for (i = 0; i < MAX_LOCAL_APIC; i++) {
+ if (apicid_to_node[i] == node)
+ apicid_to_node[i] = NUMA_NO_NODE;
+ }
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+/* Use the information discovered above to actually set up the nodes. */
+int __init acpi_scan_nodes(u64 start, u64 end)
+{
+ int i;
+
+ /* First clean up the node list */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cutoff_node(i, start, end);
+ if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+ unparse_node(i);
+ }
+
+ if (acpi_numa <= 0)
+ return -1;
+
+ if (!nodes_cover_memory()) {
+ bad_srat();
+ return -1;
+ }
+
+ memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+ if (memnode_shift < 0) {
+ printk(KERN_ERR
+ "SRAT: No NUMA node hash function found. Contact maintainer\n");
+ bad_srat();
+ return -1;
+ }
+
+ /* Finally register nodes */
+ for_each_node_mask(i, nodes_parsed)
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ for (i = 0; i < NR_CPUS; i++) {
+ if (cpu_to_node[i] == NUMA_NO_NODE)
+ continue;
+ if (!node_isset(cpu_to_node[i], nodes_parsed))
+ numa_set_node(i, NUMA_NO_NODE);
+ }
+ numa_init_array();
+ return 0;
+}
+
+static int node_to_pxm(int n)
+{
+ int i;
+ if (pxm2node[n] == n)
+ return n;
+ for (i = 0; i < 256; i++)
+ if (pxm2node[i] == n)
+ return i;
+ return 0;
+}
+
+int __node_distance(int a, int b)
+{
+ int index;
+
+ if (!acpi_slit)
+ return a == b ? 10 : 20;
+ index = acpi_slit->localities * node_to_pxm(a);
+ return acpi_slit->entry[index + node_to_pxm(b)];
+}
+
+EXPORT_SYMBOL(__node_distance);
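
Assuming the identity PXM-to-node mapping from the two-locality SLIT example earlier, __node_distance(0, 1) reads entry[2*0 + 1] = 20; with no SLIT at all, the fallback mirrors the convention slit_valid() enforces: 10 for the local node, 20 for any remote one.
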
diff --git a/xen/drivers/acpi/Makefile b/xen/drivers/acpi/Makefile
index 68dafe3a52..08844a529d 100644
--- a/xen/drivers/acpi/Makefile
+++ b/xen/drivers/acpi/Makefile
@@ -1 +1,2 @@
obj-y += tables.o
+obj-y += numa.o
diff --git a/xen/drivers/acpi/numa.c b/xen/drivers/acpi/numa.c
new file mode 100644
index 0000000000..ecf426ece4
--- /dev/null
+++ b/xen/drivers/acpi/numa.c
@@ -0,0 +1,216 @@
+/*
+ * acpi_numa.c - ACPI NUMA support
+ *
+ * Copyright (C) 2002 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ */
+#if 0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/acpi.h>
+#include <xen/numa.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acmacros.h>
+#include <asm/page.h> /* __va() */
+
+#define ACPI_NUMA 0x80000000
+#define _COMPONENT ACPI_NUMA
+ACPI_MODULE_NAME("numa")
+
+extern int __init acpi_table_parse_madt_family(enum acpi_table_id id,
+ unsigned long madt_size,
+ int entry_id,
+ acpi_madt_entry_handler handler,
+ unsigned int max_entries);
+
+void __init acpi_table_print_srat_entry(acpi_table_entry_header * header)
+{
+
+ ACPI_FUNCTION_NAME("acpi_table_print_srat_entry");
+
+ if (!header)
+ return;
+
+ switch (header->type) {
+
+ case ACPI_SRAT_PROCESSOR_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+ {
+ struct acpi_table_processor_affinity *p =
+ (struct acpi_table_processor_affinity *)header;
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n",
+ p->apic_id, p->lsapic_eid,
+ p->proximity_domain,
+ p->flags.
+ enabled ? "enabled" : "disabled"));
+ }
+#endif /* ACPI_DEBUG_OUTPUT */
+ break;
+
+ case ACPI_SRAT_MEMORY_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+ {
+ struct acpi_table_memory_affinity *p =
+ (struct acpi_table_memory_affinity *)header;
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n",
+ p->base_addr_hi, p->base_addr_lo,
+ p->length_hi, p->length_lo,
+ p->memory_type, p->proximity_domain,
+ p->flags.
+ enabled ? "enabled" : "disabled",
+ p->flags.
+ hot_pluggable ? " hot-pluggable" :
+ ""));
+ }
+#endif /* ACPI_DEBUG_OUTPUT */
+ break;
+
+ default:
+ printk(KERN_WARNING PREFIX
+ "Found unsupported SRAT entry (type = 0x%x)\n",
+ header->type);
+ break;
+ }
+}
+
+static int __init acpi_parse_slit(unsigned long phys_addr, unsigned long size)
+{
+ struct acpi_table_slit *slit;
+ u32 localities;
+
+ if (!phys_addr || !size)
+ return -EINVAL;
+
+ slit = (struct acpi_table_slit *)__va(phys_addr);
+
+ /* downcast just for %llu vs %lu for i386/ia64 */
+ localities = (u32) slit->localities;
+
+ acpi_numa_slit_init(slit);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_processor_affinity(acpi_table_entry_header * header,
+ const unsigned long end)
+{
+ struct acpi_table_processor_affinity *processor_affinity;
+
+ processor_affinity = (struct acpi_table_processor_affinity *)header;
+ if (!processor_affinity)
+ return -EINVAL;
+
+ acpi_table_print_srat_entry(header);
+
+ /* let the architecture-dependent part do it */
+ acpi_numa_processor_affinity_init(processor_affinity);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_memory_affinity(acpi_table_entry_header * header,
+ const unsigned long end)
+{
+ struct acpi_table_memory_affinity *memory_affinity;
+
+ memory_affinity = (struct acpi_table_memory_affinity *)header;
+ if (!memory_affinity)
+ return -EINVAL;
+
+ acpi_table_print_srat_entry(header);
+
+ /* let the architecture-dependent part do it */
+ acpi_numa_memory_affinity_init(memory_affinity);
+
+ return 0;
+}
+
+static int __init acpi_parse_srat(unsigned long phys_addr, unsigned long size)
+{
+ struct acpi_table_srat *srat;
+
+ if (!phys_addr || !size)
+ return -EINVAL;
+
+ srat = (struct acpi_table_srat *)__va(phys_addr);
+
+ return 0;
+}
+
+int __init
+acpi_table_parse_srat(enum acpi_srat_entry_id id,
+ acpi_madt_entry_handler handler, unsigned int max_entries)
+{
+ return acpi_table_parse_madt_family(ACPI_SRAT,
+ sizeof(struct acpi_table_srat), id,
+ handler, max_entries);
+}
+
+int __init acpi_numa_init(void)
+{
+ int result;
+
+ /* SRAT: System Resource Affinity Table */
+ result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat);
+
+ if (result > 0) {
+ result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY,
+ acpi_parse_processor_affinity,
+ NR_CPUS);
+ result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY,
+ acpi_parse_memory_affinity,
+ NR_NODE_MEMBLKS); /* IA64 specific */
+ }
+
+ /* SLIT: System Locality Information Table */
+ result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit);
+
+ acpi_numa_arch_fixup();
+ return 0;
+}
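
Note the control flow above: acpi_table_parse() reports whether a matching table was found, so the SRAT sub-table handlers run only when an SRAT is actually present (result > 0); the SLIT is attempted regardless, and acpi_numa_arch_fixup() is an empty stub on x86 (see srat.c above).
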
+
+#if 0
+int acpi_get_pxm(acpi_handle h)
+{
+ unsigned long pxm;
+ acpi_status status;
+ acpi_handle handle;
+ acpi_handle phandle = h;
+
+ do {
+ handle = phandle;
+ status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
+ if (ACPI_SUCCESS(status))
+ return (int)pxm;
+ status = acpi_get_parent(handle, &phandle);
+ } while (ACPI_SUCCESS(status));
+ return -1;
+}
+
+EXPORT_SYMBOL(acpi_get_pxm);
+#endif
diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h
index 51c4b8e293..c6dd5b2261 100644
--- a/xen/include/asm-x86/acpi.h
+++ b/xen/include/asm-x86/acpi.h
@@ -157,6 +157,8 @@ static inline void check_acpi_pci(void) { }
static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
static inline int acpi_irq_balance_set(char *str) { return 0; }
+extern int acpi_scan_nodes(u64 start, u64 end);
+extern int acpi_numa;
#ifdef CONFIG_ACPI_SLEEP
@@ -173,5 +175,6 @@ extern void acpi_reserve_bootmem(void);
#endif /*CONFIG_ACPI_SLEEP*/
extern u8 x86_acpiid_to_apicid[];
+#define MAX_LOCAL_APIC 256
#endif /*_ASM_ACPI_H*/
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index e2ef90700c..879bdbf80b 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -24,6 +24,11 @@
#define CONFIG_X86_IO_APIC 1
#define CONFIG_HPET_TIMER 1
#define CONFIG_X86_MCE_P4THERMAL 1
+#define CONFIG_ACPI_NUMA 1
+#define CONFIG_NUMA 1
+#define CONFIG_ACPI_SRAT 1
+#define CONFIG_DISCONTIGMEM 1
+#define CONFIG_NUMA_EMU 1
/* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */
#define CONFIG_X86_L1_CACHE_SHIFT 7
diff --git a/xen/include/asm-x86/mach-generic/mach_apic.h b/xen/include/asm-x86/mach-generic/mach_apic.h
index 1d3ed4dc67..1e0a6019d6 100644
--- a/xen/include/asm-x86/mach-generic/mach_apic.h
+++ b/xen/include/asm-x86/mach-generic/mach_apic.h
@@ -22,11 +22,7 @@ static inline void enable_apic_mode(void)
return;
}
-/* No sane NUMA support right now. We should parse ACPI SRAT. */
-static inline int apicid_to_node(int logical_apicid)
-{
- return 0;
-}
+#define apicid_to_node(apicid) ((int)apicid_to_node[(u8)apicid])
extern u8 bios_cpu_apicid[];
static inline int cpu_present_to_apicid(int mps_cpu)
diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
new file mode 100644
index 0000000000..a0db3cd272
--- /dev/null
+++ b/xen/include/asm-x86/numa.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X8664_NUMA_H
+#define _ASM_X8664_NUMA_H 1
+
+#include <xen/nodemask.h>
+#include <xen/topology.h>
+#include <asm/numnodes.h>
+#include <asm/smp.h>
+
+struct node {
+ u64 start,end;
+};
+
+extern int compute_hash_shift(struct node *nodes, int numnodes);
+extern int pxm_to_node(int nid);
+
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+#define VIRTUAL_BUG_ON(x)
+#define NODEMAPSIZE 0xfff
+
+extern void numa_add_cpu(int cpu);
+extern void numa_init_array(void);
+extern int numa_off;
+
+extern void numa_set_node(int cpu, int node);
+
+extern void setup_node_bootmem(int nodeid, u64 start, u64 end);
+extern unsigned char apicid_to_node[256];
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+
+static inline void clear_node_cpumask(int cpu)
+{
+ clear_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+/* Simple perfect hash to map physical addresses to node numbers */
+extern int memnode_shift;
+extern u8 memnodemap[NODEMAPSIZE];
+
+extern struct node_data node_data[];
+
+static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
+{
+ unsigned nid;
+ VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+ nid = memnodemap[addr >> memnode_shift];
+ VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
+ return nid;
+}
+
+#define NODE_DATA(nid) (&(node_data[nid]))
+
+#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
+ NODE_DATA(nid)->node_spanned_pages)
+
+
+#else
+#define init_cpu_to_node() do {} while (0)
+#define clear_node_cpumask(cpu) do {} while (0)
+#endif
+
+#define NUMA_NO_NODE 0xff
+
+#endif
diff --git a/xen/include/asm-x86/numnodes.h b/xen/include/asm-x86/numnodes.h
new file mode 100644
index 0000000000..92fadea5a8
--- /dev/null
+++ b/xen/include/asm-x86/numnodes.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_MAX_NUMNODES_H
+#define _ASM_MAX_NUMNODES_H
+
+#include <xen/config.h>
+
+#if defined(__i386__)
+#ifdef CONFIG_X86_NUMAQ
+
+/* Max 16 Nodes */
+#define NODES_SHIFT 4
+
+#elif defined(CONFIG_ACPI_SRAT)
+
+/* Max 8 Nodes */
+#define NODES_SHIFT 3
+
+#endif /* CONFIG_X86_NUMAQ */
+
+
+#endif /* __i386__ */
+
+#if defined(CONFIG_NUMA) && defined(__x86_64__)
+#define NODES_SHIFT 6
+#endif /* __x86_64__ */
+
+#endif /* _ASM_MAX_NUMNODES_H */
diff --git a/xen/include/asm-x86/topology.h b/xen/include/asm-x86/topology.h
new file mode 100644
index 0000000000..0a38cd0792
--- /dev/null
+++ b/xen/include/asm-x86/topology.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Ryan Harper <ryanh@us.ibm.com>
+ */
+
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+#include <xen/config.h>
+#include <xen/bitops.h>
+
+extern cpumask_t cpu_online_map;
+
+extern unsigned int cpu_to_node[];
+extern cpumask_t node_to_cpumask[];
+
+#define cpu_to_node(cpu) (cpu_to_node[cpu])
+#define parent_node(node) (node)
+#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node]))
+#define node_to_cpumask(node) (node_to_cpumask[node])
+
+#endif /* _ASM_X86_TOPOLOGY_H */
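
Illustrative use of the accessors defined above (assumes a CONFIG_NUMA build where the arrays have been populated):

    int node = cpu_to_node(3);                /* NUMA node of CPU 3 */
    cpumask_t peers = node_to_cpumask(node);  /* every CPU on that node */
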
diff --git a/xen/include/xen/config.h b/xen/include/xen/config.h
index e3f94d5843..f79472da77 100644
--- a/xen/include/xen/config.h
+++ b/xen/include/xen/config.h
@@ -50,5 +50,7 @@
#endif /* !__ASSEMBLY__ */
#define fastcall
+#define __cpuinitdata
+#define __cpuinit
#endif /* __XEN_CONFIG_H__ */
diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h
new file mode 100644
index 0000000000..b4a882e482
--- /dev/null
+++ b/xen/include/xen/nodemask.h
@@ -0,0 +1,342 @@
+#ifndef __LINUX_NODEMASK_H
+#define __LINUX_NODEMASK_H
+
+/*
+ * Nodemasks provide a bitmap suitable for representing the
+ * set of Nodes in a system, one bit position per Node number.
+ *
+ * See detailed comments in the file linux/bitmap.h describing the
+ * data type on which these nodemasks are based.
+ *
+ * For details of nodemask_scnprintf() and nodemask_parse(),
+ * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ *
+ * The available nodemask operations are:
+ *
+ * void node_set(node, mask) turn on bit 'node' in mask
+ * void node_clear(node, mask) turn off bit 'node' in mask
+ * void nodes_setall(mask) set all bits
+ * void nodes_clear(mask) clear all bits
+ * int node_isset(node, mask) true iff bit 'node' set in mask
+ * int node_test_and_set(node, mask) test and set bit 'node' in mask
+ *
+ * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection]
+ * void nodes_or(dst, src1, src2) dst = src1 | src2 [union]
+ * void nodes_xor(dst, src1, src2) dst = src1 ^ src2
+ * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2
+ * void nodes_complement(dst, src) dst = ~src
+ *
+ * int nodes_equal(mask1, mask2) Does mask1 == mask2?
+ * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect?
+ * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2?
+ * int nodes_empty(mask) Is mask empty (no bits set)?
+ * int nodes_full(mask) Is mask full (all bits set)?
+ * int nodes_weight(mask) Hamming weight - number of set bits
+ *
+ * void nodes_shift_right(dst, src, n) Shift right
+ * void nodes_shift_left(dst, src, n) Shift left
+ *
+ * int first_node(mask) Number lowest set bit, or MAX_NUMNODES
+ * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
+ * int first_unset_node(mask) First node not set in mask, or
+ * MAX_NUMNODES.
+ *
+ * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set
+ * NODE_MASK_ALL Initializer - all bits set
+ * NODE_MASK_NONE Initializer - no bits set
+ * unsigned long *nodes_addr(mask) Array of unsigned long's in mask
+ *
+ * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
+ * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask
+ *
+ * for_each_node_mask(node, mask) for-loop node over mask
+ *
+ * int num_online_nodes() Number of online Nodes
+ * int num_possible_nodes() Number of all possible Nodes
+ *
+ * int node_online(node) Is some node online?
+ * int node_possible(node) Is some node possible?
+ *
+ * int any_online_node(mask) First online node in mask
+ *
+ * node_set_online(node) set bit 'node' in node_online_map
+ * node_set_offline(node) clear bit 'node' in node_online_map
+ *
+ * for_each_node(node) for-loop node over node_possible_map
+ * for_each_online_node(node) for-loop node over node_online_map
+ *
+ * Subtlety:
+ * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
+ * to generate slightly worse code. So use a simple one-line #define
+ * for node_isset(), instead of wrapping an inline inside a macro, the
+ * way we do the other calls.
+ */
+
+#if 0
+#include <linux/threads.h>
+#include <asm/bug.h>
+#endif
+#include <xen/kernel.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+
+typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
+extern nodemask_t _unused_nodemask_arg_;
+
+#define node_set(node, dst) __node_set((node), &(dst))
+static inline void __node_set(int node, volatile nodemask_t *dstp)
+{
+ set_bit(node, dstp->bits);
+}
+
+#define node_clear(node, dst) __node_clear((node), &(dst))
+static inline void __node_clear(int node, volatile nodemask_t *dstp)
+{
+ clear_bit(node, dstp->bits);
+}
+
+#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
+static inline void __nodes_setall(nodemask_t *dstp, int nbits)
+{
+ bitmap_fill(dstp->bits, nbits);
+}
+
+#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
+static inline void __nodes_clear(nodemask_t *dstp, int nbits)
+{
+ bitmap_zero(dstp->bits, nbits);
+}
+
+/* No static inline type checking - see Subtlety (1) above. */
+#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)
+
+#define node_test_and_set(node, nodemask) \
+ __node_test_and_set((node), &(nodemask))
+static inline int __node_test_and_set(int node, nodemask_t *addr)
+{
+ return test_and_set_bit(node, addr->bits);
+}
+
+#define nodes_and(dst, src1, src2) \
+ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_or(dst, src1, src2) \
+ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_xor(dst, src1, src2) \
+ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_andnot(dst, src1, src2) \
+ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_complement(dst, src) \
+ __nodes_complement(&(dst), &(src), MAX_NUMNODES)
+static inline void __nodes_complement(nodemask_t *dstp,
+ const nodemask_t *srcp, int nbits)
+{
+ bitmap_complement(dstp->bits, srcp->bits, nbits);
+}
+
+#define nodes_equal(src1, src2) \
+ __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_equal(const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ return bitmap_equal(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_intersects(src1, src2) \
+ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_intersects(const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ return bitmap_intersects(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_subset(src1, src2) \
+ __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_subset(const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ return bitmap_subset(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
+static inline int __nodes_empty(const nodemask_t *srcp, int nbits)
+{
+ return bitmap_empty(srcp->bits, nbits);
+}
+
+#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_full(const nodemask_t *srcp, int nbits)
+{
+ return bitmap_full(srcp->bits, nbits);
+}
+
+#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_weight(const nodemask_t *srcp, int nbits)
+{
+ return bitmap_weight(srcp->bits, nbits);
+}
+
+#define nodes_shift_right(dst, src, n) \
+ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_right(nodemask_t *dstp,
+ const nodemask_t *srcp, int n, int nbits)
+{
+ bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
+}
+
+#define nodes_shift_left(dst, src, n) \
+ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_left(nodemask_t *dstp,
+ const nodemask_t *srcp, int n, int nbits)
+{
+ bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
+}
+
+/* FIXME: better would be to fix all architectures to never return
+ > MAX_NUMNODES, then the silly min_ts could be dropped. */
+
+#define first_node(src) __first_node(&(src))
+static inline int __first_node(const nodemask_t *srcp)
+{
+ return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
+}
+
+#define next_node(n, src) __next_node((n), &(src))
+static inline int __next_node(int n, const nodemask_t *srcp)
+{
+ return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
+}
+
+#define nodemask_of_node(node) \
+({ \
+ typeof(_unused_nodemask_arg_) m; \
+ if (sizeof(m) == sizeof(unsigned long)) { \
+ m.bits[0] = 1UL<<(node); \
+ } else { \
+ nodes_clear(m); \
+ node_set((node), m); \
+ } \
+ m; \
+})
+
+#define first_unset_node(mask) __first_unset_node(&(mask))
+static inline int __first_unset_node(const nodemask_t *maskp)
+{
+ return min_t(int,MAX_NUMNODES,
+ find_first_zero_bit(maskp->bits, MAX_NUMNODES));
+}
+
+#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)
+
+#if MAX_NUMNODES <= BITS_PER_LONG
+
+#define NODE_MASK_ALL \
+((nodemask_t) { { \
+ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \
+} })
+
+#else
+
+#define NODE_MASK_ALL \
+((nodemask_t) { { \
+ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \
+ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \
+} })
+
+#endif
+
+#define NODE_MASK_NONE \
+((nodemask_t) { { \
+ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \
+} })
+
+#define nodes_addr(src) ((src).bits)
+
+#if 0
+#define nodemask_scnprintf(buf, len, src) \
+ __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
+static inline int __nodemask_scnprintf(char *buf, int len,
+ const nodemask_t *srcp, int nbits)
+{
+ return bitmap_scnprintf(buf, len, srcp->bits, nbits);
+}
+
+#define nodemask_parse(ubuf, ulen, dst) \
+ __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES)
+static inline int __nodemask_parse(const char __user *buf, int len,
+ nodemask_t *dstp, int nbits)
+{
+ return bitmap_parse(buf, len, dstp->bits, nbits);
+}
+#endif
+
+#if MAX_NUMNODES > 1
+#define for_each_node_mask(node, mask) \
+ for ((node) = first_node(mask); \
+ (node) < MAX_NUMNODES; \
+ (node) = next_node((node), (mask)))
+#else /* MAX_NUMNODES == 1 */
+#define for_each_node_mask(node, mask) \
+ if (!nodes_empty(mask)) \
+ for ((node) = 0; (node) < 1; (node)++)
+#endif /* MAX_NUMNODES */
+
+/*
+ * The following particular system nodemasks and operations
+ * on them manage all possible and online nodes.
+ */
+
+extern nodemask_t node_online_map;
+extern nodemask_t node_possible_map;
+
+#if MAX_NUMNODES > 1
+#define num_online_nodes() nodes_weight(node_online_map)
+#define num_possible_nodes() nodes_weight(node_possible_map)
+#define node_online(node) node_isset((node), node_online_map)
+#define node_possible(node) node_isset((node), node_possible_map)
+#else
+#define num_online_nodes() 1
+#define num_possible_nodes() 1
+#define node_online(node) ((node) == 0)
+#define node_possible(node) ((node) == 0)
+#endif
+
+#define any_online_node(mask) \
+({ \
+ int node; \
+ for_each_node_mask(node, (mask)) \
+ if (node_online(node)) \
+ break; \
+ node; \
+})
+
+#define node_set_online(node) set_bit((node), node_online_map.bits)
+#define node_set_offline(node) clear_bit((node), node_online_map.bits)
+
+#define for_each_node(node) for_each_node_mask((node), node_possible_map)
+#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
+
+#endif /* __LINUX_NODEMASK_H */
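
A short usage sketch of the API above (illustrative; assumes a build with MAX_NUMNODES > 2):

    nodemask_t mask = NODE_MASK_NONE;
    int node;

    node_set(0, mask);
    node_set(2, mask);
    for_each_node_mask(node, mask)
        printk("node %d set\n", node);        /* visits 0, then 2 */
    if (!nodes_subset(mask, node_online_map))
        printk("mask includes offline nodes\n");
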
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
new file mode 100644
index 0000000000..30afceb52f
--- /dev/null
+++ b/xen/include/xen/numa.h
@@ -0,0 +1,35 @@
+#ifndef _XEN_NUMA_H
+#define _XEN_NUMA_H
+
+#include <xen/config.h>
+
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/numnodes.h>
+#endif
+
+#ifndef NODES_SHIFT
+#define NODES_SHIFT 0
+#endif
+
+#define MAX_NUMNODES (1 << NODES_SHIFT)
+#define NUMA_NO_NODE 0xff
+
+#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
+#define MAX_CHUNKS_PER_NODE 4
+#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+
+/* needed for drivers/acpi/numa.c */
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+extern unsigned int cpu_to_node[];
+#include <xen/cpumask.h>
+extern cpumask_t node_to_cpumask[];
+
+typedef struct node_data {
+ unsigned long node_start_pfn;
+ unsigned long node_spanned_pages;
+ unsigned int node_id;
+} node_data_t;
+
+#endif /* _XEN_NUMA_H */
diff --git a/xen/include/xen/topology.h b/xen/include/xen/topology.h
new file mode 100644
index 0000000000..e836bf132f
--- /dev/null
+++ b/xen/include/xen/topology.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _XEN_TOPOLOGY_H
+#define _XEN_TOPOLOGY_H
+
+#include <asm/topology.h>
+
+#endif /* _XEN_TOPOLOGY_H */