 xen/arch/x86/Makefile                        |   2
 xen/arch/x86/numa.c                          | 302
 xen/arch/x86/setup.c                         |  34
 xen/arch/x86/smpboot.c                       |   4
 xen/arch/x86/srat.c                          | 325
 xen/drivers/acpi/Makefile                    |   1
 xen/drivers/acpi/numa.c                      | 216
 xen/include/asm-x86/acpi.h                   |   3
 xen/include/asm-x86/config.h                 |   5
 xen/include/asm-x86/mach-generic/mach_apic.h |   6
 xen/include/asm-x86/numa.h                   |  65
 xen/include/asm-x86/numnodes.h               |  26
 xen/include/asm-x86/topology.h               |  40
 xen/include/xen/config.h                     |   2
 xen/include/xen/nodemask.h                   | 342
 xen/include/xen/numa.h                       |  35
 xen/include/xen/topology.h                   |  27
 17 files changed, 1428 insertions(+), 7 deletions(-)
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 31f2793fb9..89cc508d02 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -28,12 +28,14 @@ obj-y += microcode.o
obj-y += mm.o
obj-y += mpparse.o
obj-y += nmi.o
+obj-y += numa.o
obj-y += physdev.o
obj-y += rwlock.o
obj-y += setup.o
obj-y += shutdown.o
obj-y += smp.o
obj-y += smpboot.o
+obj-y += srat.o
obj-y += string.o
obj-y += sysctl.o
obj-y += time.o
diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
new file mode 100644
index 0000000000..86bff703f3
--- /dev/null
+++ b/xen/arch/x86/numa.c
@@ -0,0 +1,302 @@
+/*
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
+ */
+
+#include <xen/mm.h>
+#include <xen/string.h>
+#include <xen/init.h>
+#include <xen/ctype.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+#include <xen/keyhandler.h>
+#include <xen/time.h>
+
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+/* from proto.h */
+#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
+
+struct node_data node_data[MAX_NUMNODES];
+
+int memnode_shift;
+u8 memnodemap[NODEMAPSIZE];
+
+unsigned int cpu_to_node[NR_CPUS] __read_mostly = {
+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+nodemask_t node_online_map = { { [0] = 1UL } };
+
+int numa_off __initdata;
+
+int acpi_numa __initdata;
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns:
+ * 1 if OK
+ * 0 if memnodemap[] too small (or shift too small)
+ * -1 if node overlap or lost RAM (shift too big)
+ */
+static int __init
+populate_memnodemap(const struct node *nodes, int numnodes, int shift)
+{
+ int i;
+ int res = -1;
+ unsigned long addr, end;
+
+ if (shift >= 64)
+ return -1;
+ memset(memnodemap, 0xff, sizeof(memnodemap));
+ for (i = 0; i < numnodes; i++) {
+ addr = nodes[i].start;
+ end = nodes[i].end;
+ if (addr >= end)
+ continue;
+ if ((end >> shift) >= NODEMAPSIZE)
+ return 0;
+ do {
+ if (memnodemap[addr >> shift] != 0xff)
+ return -1;
+ memnodemap[addr >> shift] = i;
+ addr += (1UL << shift);
+ } while (addr < end);
+ res = 1;
+ }
+ return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+ int shift = 20;
+
+ while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+ shift++;
+
+ printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
+ shift);
+
+ if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+ printk(KERN_INFO
+ "Your memory is not aligned you need to rebuild your kernel "
+ "with a bigger NODEMAPSIZE shift=%d\n",
+ shift);
+ return -1;
+ }
+ return shift;
+}
+
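
An aside on the structure being built here: memnodemap[] is a flat byte array indexed by physical address >> memnode_shift, and compute_hash_shift() simply searches for the largest shift at which every node's range still lands on distinct entries. Below is a minimal standalone sketch of the same search, assuming two invented 1GB nodes; this is illustrative user-space C, not Xen code:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define NODEMAPSIZE 0xfff

    struct node { uint64_t start, end; };
    static uint8_t memnodemap[NODEMAPSIZE];

    /* Same contract as populate_memnodemap() above:
     * 1 = OK, 0 = map too small, -1 = overlap/lost RAM. */
    static int populate(const struct node *nodes, int n, int shift)
    {
        int i, res = -1;
        uint64_t addr;

        memset(memnodemap, 0xff, sizeof(memnodemap));
        for (i = 0; i < n; i++) {
            if (nodes[i].start >= nodes[i].end)
                continue;
            if ((nodes[i].end >> shift) >= NODEMAPSIZE)
                return 0;
            for (addr = nodes[i].start; addr < nodes[i].end;
                 addr += (uint64_t)1 << shift) {
                if (memnodemap[addr >> shift] != 0xff)
                    return -1;
                memnodemap[addr >> shift] = (uint8_t)i;
            }
            res = 1;
        }
        return res;
    }

    int main(void)
    {
        struct node nodes[2] = {   /* two fictitious 1GB nodes */
            { 0x00000000ULL, 0x40000000ULL },
            { 0x40000000ULL, 0x80000000ULL },
        };
        int shift = 20;

        while (populate(nodes, 2, shift + 1) >= 0)
            shift++;
        populate(nodes, 2, shift);
        /* phys_to_nid() then reduces to memnodemap[addr >> shift]. */
        printf("shift=%d, 0x50000000 -> node %d\n",
               shift, memnodemap[0x50000000ULL >> shift]);
        return 0;
    }

For these ranges the search settles on shift = 30, and address 0x50000000 resolves to node 1.
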
+/* initialize NODE_DATA given nodeid and start/end */
+void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
+{
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
+ NODE_DATA(nodeid)->node_id = nodeid;
+ NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+ NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+ node_set_online(nodeid);
+}
+
+void __init numa_init_array(void)
+{
+ int rr, i;
+ /*
+ * There are unfortunately some poorly designed mainboards around
+ * that only connect memory to a single CPU. This breaks the 1:1
+ * cpu->node mapping. To avoid this, fill in the mapping for all
+ * possible CPUs, as the number of CPUs is not known yet.
+ * We round-robin the existing nodes.
+ */
+ rr = first_node(node_online_map);
+ for (i = 0; i < NR_CPUS; i++) {
+ if (cpu_to_node[i] != NUMA_NO_NODE)
+ continue;
+ numa_set_node(i, rr);
+ rr = next_node(rr, node_online_map);
+ if (rr == MAX_NUMNODES)
+ rr = first_node(node_online_map);
+ }
+}
+
+#ifdef CONFIG_NUMA_EMU
+/* default to faking a single node as fallback for non-NUMA hardware */
+int numa_fake __initdata = 1;
+
+/* Numa emulation */
+static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+ struct node nodes[MAX_NUMNODES];
+ unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+
+ /* Kludge needed for the hash function */
+ if (hweight64(sz) > 1) {
+ unsigned long x = 1;
+ while ((x << 1) < sz)
+ x <<= 1;
+ if (x < sz/2)
+ printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
+ sz = x;
+ }
+
+ memset(&nodes,0,sizeof(nodes));
+ for (i = 0; i < numa_fake; i++) {
+ nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+ if (i == numa_fake-1)
+ sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
+ nodes[i].end = nodes[i].start + sz;
+ printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
+ i,
+ nodes[i].start, nodes[i].end,
+ (nodes[i].end - nodes[i].start) >> 20);
+ node_set_online(i);
+ }
+ memnode_shift = compute_hash_shift(nodes, numa_fake);
+ if (memnode_shift < 0) {
+ memnode_shift = 0;
+ printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+ return -1;
+ }
+ for_each_online_node(i)
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ numa_init_array();
+ return 0;
+}
+#endif
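
A worked example of the sizing kludge above: with 3GB of RAM and numa=fake=2, sz = 1.5GB has more than one bit set, so the loop rounds it down to the nearest power of two and the last node absorbs the remainder:

    sz = 3GB / 2       = 0x60000000   /* not a power of two */
    sz rounded down    = 0x40000000   /* 1GB */
    node 0: [0x00000000, 0x40000000)  /* 1GB */
    node 1: [0x40000000, 0xC0000000)  /* last node takes the 2GB remainder */

With 4GB and numa=fake=4 the per-node size is already a power of two, so every node gets exactly 1GB.
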
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+#ifdef CONFIG_ACPI_NUMA
+ if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+ end_pfn << PAGE_SHIFT))
+ return;
+#endif
+
+#ifdef CONFIG_NUMA_EMU
+ /* fake a numa node for non-numa hardware */
+ if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+ return;
+#endif
+
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+ printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
+ start_pfn << PAGE_SHIFT,
+ end_pfn << PAGE_SHIFT);
+ /* Set up a dummy node covering all memory: with shift 63, every
+ address below 2^63 hashes to memnodemap[0]. */
+ memnode_shift = 63;
+ memnodemap[0] = 0;
+ nodes_clear(node_online_map);
+ node_set_online(0);
+ for (i = 0; i < NR_CPUS; i++)
+ numa_set_node(i, 0);
+ node_to_cpumask[0] = cpumask_of_cpu(0);
+ setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+__cpuinit void numa_add_cpu(int cpu)
+{
+ set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+ cpu_to_node[cpu] = node;
+}
+
+/* [numa=off] [numa=fake=<n>] [numa=noacpi] */
+__init int numa_setup(char *opt)
+{
+ if (!strncmp(opt,"off",3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5)) {
+ numa_fake = simple_strtoul(opt+5, NULL, 0);
+ if (numa_fake >= MAX_NUMNODES)
+ numa_fake = MAX_NUMNODES;
+ }
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt,"noacpi",6))
+ acpi_numa = -1;
+#endif
+ return 1;
+}
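
Given the parser above, these are the forms the "numa=" option accepts (values illustrative):

    numa=off        # disable NUMA handling entirely
    numa=fake=4     # CONFIG_NUMA_EMU: split memory into 4 emulated nodes
    numa=noacpi     # CONFIG_ACPI_NUMA: ignore the ACPI SRAT/SLIT tables

The hook-up itself happens in setup.c later in this patch, via custom_param("numa", numa_setup).
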
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
+ * apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and the faked-node case (when running a kernel compiled
+ * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
+ * is already initialised in a round-robin manner at numa_init_array,
+ * prior to this call, and that initialisation is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++) {
+ u8 apicid = x86_cpu_to_apicid[i];
+ if (apicid == BAD_APICID)
+ continue;
+ if (apicid_to_node[apicid] == NUMA_NO_NODE)
+ continue;
+ numa_set_node(i,apicid_to_node[apicid]);
+ }
+}
+
+EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(node_to_cpumask);
+EXPORT_SYMBOL(memnode_shift);
+EXPORT_SYMBOL(memnodemap);
+EXPORT_SYMBOL(node_data);
+
+static void dump_numa(unsigned char key)
+{
+ s_time_t now = NOW();
+ int i;
+
+ printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
+ (u32)(now>>32), (u32)now);
+
+ for_each_online_node(i) {
+ unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
+ printk("idx%d -> NODE%d start->%lu size->%lu\n",
+ i, NODE_DATA(i)->node_id,
+ NODE_DATA(i)->node_start_pfn,
+ NODE_DATA(i)->node_spanned_pages);
+ /* sanity check phys_to_nid() */
+ printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa),
+ NODE_DATA(i)->node_id);
+ }
+ for_each_online_cpu(i)
+ printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]);
+}
+
+static __init int register_numa_trigger(void)
+{
+ register_keyhandler('u', dump_numa, "dump numa info");
+ return 0;
+}
+__initcall(register_numa_trigger);
+
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 2c8b638944..03da8af41c 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -16,6 +16,7 @@
#include <xen/percpu.h>
#include <xen/hypercall.h>
#include <xen/keyhandler.h>
+#include <xen/numa.h>
#include <public/version.h>
#include <asm/bitops.h>
#include <asm/smp.h>
@@ -25,10 +26,12 @@
#include <asm/desc.h>
#include <asm/shadow.h>
#include <asm/e820.h>
+#include <asm/numa.h>
#include <acm/acm_hooks.h>
extern void dmi_scan_machine(void);
extern void generic_apic_probe(void);
+extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
/*
* opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
@@ -60,6 +63,9 @@ boolean_param("watchdog", opt_watchdog);
static void parse_acpi_param(char *s);
custom_param("acpi", parse_acpi_param);
+extern int numa_setup(char *s);
+custom_param("numa", numa_setup);
+
/* **** Linux config option: propagated to domain0. */
/* acpi_skip_timer_override: Skip IRQ0 overrides. */
extern int acpi_skip_timer_override;
@@ -257,6 +263,20 @@ static void __init init_idle_domain(void)
setup_idle_pagetable();
}
+static void srat_detect_node(int cpu)
+{
+ unsigned node;
+ u8 apicid = x86_cpu_to_apicid[cpu];
+
+ node = apicid_to_node[apicid];
+ if (node == NUMA_NO_NODE)
+ node = 0;
+ numa_set_node(cpu, node);
+
+ if (acpi_numa > 0)
+ printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
+}
+
void __init __start_xen(multiboot_info_t *mbi)
{
char __cmdline[] = "", *cmdline = __cmdline;
@@ -485,6 +505,12 @@ void __init __start_xen(multiboot_info_t *mbi)
init_frametable();
+ acpi_boot_table_init();
+
+ acpi_numa_init();
+
+ numa_initmem_init(0, max_page);
+
end_boot_allocator();
/* Initialise the Xen heap, skipping RAM holes. */
@@ -536,9 +562,10 @@ void __init __start_xen(multiboot_info_t *mbi)
generic_apic_probe();
- acpi_boot_table_init();
acpi_boot_init();
+ init_cpu_to_node();
+
if ( smp_found_config )
get_smp_config();
@@ -589,6 +616,11 @@ void __init __start_xen(multiboot_info_t *mbi)
break;
if ( !cpu_online(i) )
__cpu_up(i);
+
+ /* setup cpu_to_node[] */
+ srat_detect_node(i);
+ /* setup node_to_cpumask based on cpu_to_node[] */
+ numa_add_cpu(i);
}
printk("Brought up %ld CPUs\n", (long)num_online_cpus());
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index eb2d21111c..b971069cf2 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -43,6 +43,8 @@
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/serial.h>
+#include <xen/numa.h>
+#include <asm/numa.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/desc.h>
@@ -628,7 +630,7 @@ u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICI
static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
- int apicid = logical_smp_processor_id();
+ int apicid = hard_smp_processor_id();
cpu_2_logical_apicid[cpu] = apicid;
map_cpu_to_node(cpu, apicid_to_node(apicid));
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
new file mode 100644
index 0000000000..dfa85b2539
--- /dev/null
+++ b/xen/arch/x86/srat.c
@@ -0,0 +1,325 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ *
+ * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
+ */
+
+#if 0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/proto.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+#include <xen/topology.h>
+#include <asm/e820.h>
+#endif
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/inttypes.h>
+#include <xen/nodemask.h>
+#include <xen/acpi.h>
+
+#include <asm/numa.h>
+#include <asm/page.h>
+
+static struct acpi_table_slit *acpi_slit;
+
+static nodemask_t nodes_parsed __initdata;
+static nodemask_t nodes_found __initdata;
+static struct node nodes[MAX_NUMNODES] __initdata;
+static u8 pxm2node[256] = { [0 ... 255] = 0xff };
+
+/* Too-small nodes confuse the VM badly. Usually they result
+ from BIOS bugs. */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+static int node_to_pxm(int n);
+
+int pxm_to_node(int pxm)
+{
+ if ((unsigned)pxm >= 256)
+ return -1;
+ /* Extend 0xff to (int)-1 */
+ return (signed char)pxm2node[pxm];
+}
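
The cast above is doing sign extension on purpose: unassigned pxm2node[] entries hold 0xff, and reading that byte back through signed char widens it to -1 for the caller:

    u8 raw = 0xff;
    int node = (signed char)raw;   /* -1, i.e. "no node" */
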
+
+static __init int setup_node(int pxm)
+{
+ unsigned node = pxm2node[pxm];
+ if (node == 0xff) {
+ if (nodes_weight(nodes_found) >= MAX_NUMNODES)
+ return -1;
+ node = first_unset_node(nodes_found);
+ node_set(node, nodes_found);
+ pxm2node[pxm] = node;
+ }
+ return pxm2node[pxm];
+}
+
+static __init int conflicting_nodes(u64 start, u64 end)
+{
+ int i;
+ for_each_node_mask(i, nodes_parsed) {
+ struct node *nd = &nodes[i];
+ if (nd->start == nd->end)
+ continue;
+ if (nd->end > start && nd->start < end)
+ return i;
+ if (nd->end == end && nd->start == start)
+ return i;
+ }
+ return -1;
+}
+
+static __init void cutoff_node(int i, u64 start, u64 end)
+{
+ struct node *nd = &nodes[i];
+ if (nd->start < start) {
+ nd->start = start;
+ if (nd->end < nd->start)
+ nd->start = nd->end;
+ }
+ if (nd->end > end) {
+ nd->end = end;
+ if (nd->start > nd->end)
+ nd->start = nd->end;
+ }
+}
+
+static __init void bad_srat(void)
+{
+ int i;
+ printk(KERN_ERR "SRAT: SRAT not used.\n");
+ acpi_numa = -1;
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ apicid_to_node[i] = NUMA_NO_NODE;
+}
+
+static __init inline int srat_disabled(void)
+{
+ return numa_off || acpi_numa < 0;
+}
+
+/*
+ * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
+ * up the NUMA heuristics, which want the local node to have a smaller
+ * distance than the others.
+ * Do some quick checks here and only use the SLIT if it passes.
+ */
+static __init int slit_valid(struct acpi_table_slit *slit)
+{
+ int i, j;
+ int d = slit->localities;
+ for (i = 0; i < d; i++) {
+ for (j = 0; j < d; j++) {
+ u8 val = slit->entry[d*i + j];
+ if (i == j) {
+ if (val != 10)
+ return 0;
+ } else if (val <= 10)
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+ if (!slit_valid(slit)) {
+ printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
+ return;
+ }
+ acpi_slit = slit;
+}
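
For reference, a SLIT that passes the check above for two localities (d = 2), with entries flattened row-major as entry[d*i + j]:

    /* to node:    0   1 */
    entry[] = {   10, 20,    /* from node 0 */
                  20, 10 };  /* from node 1 */

A table filled entirely with 10s, a common firmware shortcut, fails slit_valid() because every off-diagonal distance must be strictly greater than the local distance of 10.
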
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
+{
+ int pxm, node;
+ if (srat_disabled())
+ return;
+ if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+ bad_srat();
+ return;
+ }
+ if (pa->flags.enabled == 0)
+ return;
+ pxm = pa->proximity_domain;
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+ apicid_to_node[pa->apic_id] = node;
+ acpi_numa = 1;
+ printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+ pxm, pa->apic_id, node);
+}
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
+{
+ struct node *nd;
+ u64 start, end;
+ int node, pxm;
+ int i;
+
+ if (srat_disabled())
+ return;
+ if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
+ bad_srat();
+ return;
+ }
+ if (ma->flags.enabled == 0)
+ return;
+ start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
+ end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
+ pxm = ma->proximity_domain;
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+ bad_srat();
+ return;
+ }
+ /* It is fine to add this area to the nodes data; it will be used later. */
+ if (ma->flags.hot_pluggable == 1)
+ printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64"\n",
+ start, end);
+ i = conflicting_nodes(start, end);
+ if (i == node) {
+ printk(KERN_WARNING
+ "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%"
+ PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end);
+ } else if (i >= 0) {
+ printk(KERN_ERR
+ "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%"
+ PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i),
+ nodes[i].start, nodes[i].end);
+ bad_srat();
+ return;
+ }
+ nd = &nodes[node];
+ if (!node_test_and_set(node, nodes_parsed)) {
+ nd->start = start;
+ nd->end = end;
+ } else {
+ if (start < nd->start)
+ nd->start = start;
+ if (nd->end < end)
+ nd->end = end;
+ }
+ printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm,
+ nd->start, nd->end);
+}
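
A worked example of the 64-bit range assembly above: an SRAT entry describing 1GB that starts at the 4GB boundary carries base_addr_hi = 0x1, base_addr_lo = 0x0, length_hi = 0x0, length_lo = 0x40000000, giving:

    start = 0x0 | ((u64)0x1 << 32)  = 0x100000000
    end   = start + 0x40000000      = 0x140000000
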
+
+/* Sanity check to catch more bad SRATs (they are amazingly common).
+ Make sure the PXMs cover all memory. */
+static int nodes_cover_memory(void)
+{
+ int i;
+ u64 pxmram, e820ram;
+
+ pxmram = 0;
+ for_each_node_mask(i, nodes_parsed) {
+ u64 s = nodes[i].start >> PAGE_SHIFT;
+ u64 e = nodes[i].end >> PAGE_SHIFT;
+ pxmram += e - s;
+ }
+
+ e820ram = max_page;
+ /* We seem to lose some pages somewhere. Allow up to 1M pages (4GB) of slack. */
+ if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+ printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %"
+ PRIu64"MB e820 RAM. Not used.\n",
+ (pxmram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return 0;
+ }
+ return 1;
+}
+
+static void unparse_node(int node)
+{
+ int i;
+ node_clear(node, nodes_parsed);
+ for (i = 0; i < MAX_LOCAL_APIC; i++) {
+ if (apicid_to_node[i] == node)
+ apicid_to_node[i] = NUMA_NO_NODE;
+ }
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+/* Use the information discovered above to actually set up the nodes. */
+int __init acpi_scan_nodes(u64 start, u64 end)
+{
+ int i;
+
+ /* First clean up the node list */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cutoff_node(i, start, end);
+ if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+ unparse_node(i);
+ }
+
+ if (acpi_numa <= 0)
+ return -1;
+
+ if (!nodes_cover_memory()) {
+ bad_srat();
+ return -1;
+ }
+
+ memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+ if (memnode_shift < 0) {
+ printk(KERN_ERR
+ "SRAT: No NUMA node hash function found. Contact maintainer\n");
+ bad_srat();
+ return -1;
+ }
+
+ /* Finally register nodes */
+ for_each_node_mask(i, nodes_parsed)
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ for (i = 0; i < NR_CPUS; i++) {
+ if (cpu_to_node[i] == NUMA_NO_NODE)
+ continue;
+ if (!node_isset(cpu_to_node[i], nodes_parsed))
+ numa_set_node(i, NUMA_NO_NODE);
+ }
+ numa_init_array();
+ return 0;
+}
+
+static int node_to_pxm(int n)
+{
+ int i;
+ if (pxm2node[n] == n)
+ return n;
+ for (i = 0; i < 256; i++)
+ if (pxm2node[i] == n)
+ return i;
+ return 0;
+}
+
+int __node_distance(int a, int b)
+{
+ int index;
+
+ if (!acpi_slit)
+ return a == b ? 10 : 20;
+ index = acpi_slit->localities * node_to_pxm(a);
+ return acpi_slit->entry[index + node_to_pxm(b)];
+}
+
+EXPORT_SYMBOL(__node_distance);
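
Assuming the identity PXM-to-node mapping from the two-locality SLIT example earlier, __node_distance(0, 1) reads entry[2*0 + 1] = 20; with no SLIT at all, the fallback mirrors the convention slit_valid() enforces: 10 for the local node, 20 for any remote one.
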
diff --git a/xen/drivers/acpi/Makefile b/xen/drivers/acpi/Makefile
index 68dafe3a52..08844a529d 100644
--- a/xen/drivers/acpi/Makefile
+++ b/xen/drivers/acpi/Makefile
@@ -1 +1,2 @@
obj-y += tables.o
+obj-y += numa.o
diff --git a/xen/drivers/acpi/numa.c b/xen/drivers/acpi/numa.c
new file mode 100644
index 0000000000..ecf426ece4
--- /dev/null
+++ b/xen/drivers/acpi/numa.c
@@ -0,0 +1,216 @@
+/*
+ * acpi_numa.c - ACPI NUMA support
+ *
+ * Copyright (C) 2002 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ */
+#if 0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/acpi.h>
+#include <xen/numa.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acmacros.h>
+#include <asm/page.h> /* __va() */
+
+#define ACPI_NUMA 0x80000000
+#define _COMPONENT ACPI_NUMA
+ACPI_MODULE_NAME("numa")
+
+extern int __init acpi_table_parse_madt_family(enum acpi_table_id id,
+ unsigned long madt_size,
+ int entry_id,
+ acpi_madt_entry_handler handler,
+ unsigned int max_entries);
+
+void __init acpi_table_print_srat_entry(acpi_table_entry_header * header)
+{
+
+ ACPI_FUNCTION_NAME("acpi_table_print_srat_entry");
+
+ if (!header)
+ return;
+
+ switch (header->type) {
+
+ case ACPI_SRAT_PROCESSOR_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+ {
+ struct acpi_table_processor_affinity *p =
+ (struct acpi_table_processor_affinity *)header;
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n",
+ p->apic_id, p->lsapic_eid,
+ p->proximity_domain,
+ p->flags.
+ enabled ? "enabled" : "disabled"));
+ }
+#endif /* ACPI_DEBUG_OUTPUT */
+ break;
+
+ case ACPI_SRAT_MEMORY_AFFINITY:
+#ifdef ACPI_DEBUG_OUTPUT
+ {
+ struct acpi_table_memory_affinity *p =
+ (struct acpi_table_memory_affinity *)header;
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n",
+ p->base_addr_hi, p->base_addr_lo,
+ p->length_hi, p->length_lo,
+ p->memory_type, p->proximity_domain,
+ p->flags.
+ enabled ? "enabled" : "disabled",
+ p->flags.
+ hot_pluggable ? " hot-pluggable" :
+ ""));
+ }
+#endif /* ACPI_DEBUG_OUTPUT */
+ break;
+
+ default:
+ printk(KERN_WARNING PREFIX
+ "Found unsupported SRAT entry (type = 0x%x)\n",
+ header->type);
+ break;
+ }
+}
+
+static int __init acpi_parse_slit(unsigned long phys_addr, unsigned long size)
+{
+ struct acpi_table_slit *slit;
+ u32 localities;
+
+ if (!phys_addr || !size)
+ return -EINVAL;
+
+ slit = (struct acpi_table_slit *)__va(phys_addr);
+
+ /* downcast just for %llu vs %lu for i386/ia64 */
+ localities = (u32) slit->localities;
+
+ acpi_numa_slit_init(slit);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_processor_affinity(acpi_table_entry_header * header,
+ const unsigned long end)
+{
+ struct acpi_table_processor_affinity *processor_affinity;
+
+ processor_affinity = (struct acpi_table_processor_affinity *)header;
+ if (!processor_affinity)
+ return -EINVAL;
+
+ acpi_table_print_srat_entry(header);
+
+ /* let the architecture-dependent part do it */
+ acpi_numa_processor_affinity_init(processor_affinity);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_memory_affinity(acpi_table_entry_header * header,
+ const unsigned long end)
+{
+ struct acpi_table_memory_affinity *memory_affinity;
+
+ memory_affinity = (struct acpi_table_memory_affinity *)header;
+ if (!memory_affinity)
+ return -EINVAL;
+
+ acpi_table_print_srat_entry(header);
+
+ /* let the architecture-dependent part do it */
+ acpi_numa_memory_affinity_init(memory_affinity);
+
+ return 0;
+}
+
+static int __init acpi_parse_srat(unsigned long phys_addr, unsigned long size)
+{
+ struct acpi_table_srat *srat;
+
+ if (!phys_addr || !size)
+ return -EINVAL;
+
+ srat = (struct acpi_table_srat *)__va(phys_addr);
+
+ return 0;
+}
+
+int __init
+acpi_table_parse_srat(enum acpi_srat_entry_id id,
+ acpi_madt_entry_handler handler, unsigned int max_entries)
+{
+ return acpi_table_parse_madt_family(ACPI_SRAT,
+ sizeof(struct acpi_table_srat), id,
+ handler, max_entries);
+}
+
+int __init acpi_numa_init(void)
+{
+ int result;
+
+ /* SRAT: System Resource Affinity Table */
+ result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat);
+
+ if (result > 0) {
+ result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY,
+ acpi_parse_processor_affinity,
+ NR_CPUS);
+ result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY,
+ acpi_parse_memory_affinity,
+ NR_NODE_MEMBLKS); /* IA64 specific */
+ }
+
+ /* SLIT: System Locality Information Table */
+ result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit);
+
+ acpi_numa_arch_fixup();
+ return 0;
+}
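
Note the control flow above: acpi_table_parse() reports whether a matching table was found, so the SRAT sub-table handlers run only when an SRAT is actually present (result > 0); the SLIT is attempted regardless, and acpi_numa_arch_fixup() is an empty stub on x86 (see srat.c above).
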
+
+#if 0
+int acpi_get_pxm(acpi_handle h)
+{
+ unsigned long pxm;
+ acpi_status status;
+ acpi_handle handle;
+ acpi_handle phandle = h;
+
+ do {
+ handle = phandle;
+ status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
+ if (ACPI_SUCCESS(status))
+ return (int)pxm;
+ status = acpi_get_parent(handle, &phandle);
+ } while (ACPI_SUCCESS(status));
+ return -1;
+}
+
+EXPORT_SYMBOL(acpi_get_pxm);
+#endif
diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h
index 51c4b8e293..c6dd5b2261 100644
--- a/xen/include/asm-x86/acpi.h
+++ b/xen/include/asm-x86/acpi.h
@@ -157,6 +157,8 @@ static inline void check_acpi_pci(void) { }
static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
static inline int acpi_irq_balance_set(char *str) { return 0; }
+extern int acpi_scan_nodes(u64 start, u64 end);
+extern int acpi_numa;
#ifdef CONFIG_ACPI_SLEEP
@@ -173,5 +175,6 @@ extern void acpi_reserve_bootmem(void);
#endif /*CONFIG_ACPI_SLEEP*/
extern u8 x86_acpiid_to_apicid[];
+#define MAX_LOCAL_APIC 256
#endif /*_ASM_ACPI_H*/
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index e2ef90700c..879bdbf80b 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -24,6 +24,11 @@
#define CONFIG_X86_IO_APIC 1
#define CONFIG_HPET_TIMER 1
#define CONFIG_X86_MCE_P4THERMAL 1
+#define CONFIG_ACPI_NUMA 1
+#define CONFIG_NUMA 1
+#define CONFIG_ACPI_SRAT 1
+#define CONFIG_DISCONTIGMEM 1
+#define CONFIG_NUMA_EMU 1
/* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */
#define CONFIG_X86_L1_CACHE_SHIFT 7
diff --git a/xen/include/asm-x86/mach-generic/mach_apic.h b/xen/include/asm-x86/mach-generic/mach_apic.h
index 1d3ed4dc67..1e0a6019d6 100644
--- a/xen/include/asm-x86/mach-generic/mach_apic.h
+++ b/xen/include/asm-x86/mach-generic/mach_apic.h
@@ -22,11 +22,7 @@ static inline void enable_apic_mode(void)
return;
}
-/* No sane NUMA support right now. We should parse ACPI SRAT. */
-static inline int apicid_to_node(int logical_apicid)
-{
- return 0;
-}
+#define apicid_to_node(apicid) ((int)apicid_to_node[(u8)apicid])
extern u8 bios_cpu_apicid[];
static inline int cpu_present_to_apicid(int mps_cpu)
diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
new file mode 100644
index 0000000000..a0db3cd272
--- /dev/null
+++ b/xen/include/asm-x86/numa.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X8664_NUMA_H
+#define _ASM_X8664_NUMA_H 1
+
+#include <xen/nodemask.h>
+#include <xen/topology.h>
+#include <asm/numnodes.h>
+#include <asm/smp.h>
+
+struct node {
+ u64 start,end;
+};
+
+extern int compute_hash_shift(struct node *nodes, int numnodes);
+extern int pxm_to_node(int nid);
+
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+#define VIRTUAL_BUG_ON(x)
+#define NODEMAPSIZE 0xfff
+
+extern void numa_add_cpu(int cpu);
+extern void numa_init_array(void);
+extern int numa_off;
+
+extern void numa_set_node(int cpu, int node);
+
+extern void setup_node_bootmem(int nodeid, u64 start, u64 end);
+extern unsigned char apicid_to_node[256];
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+
+static inline void clear_node_cpumask(int cpu)
+{
+ clear_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+/* Simple perfect hash to map physical addresses to node numbers */
+extern int memnode_shift;
+extern u8 memnodemap[NODEMAPSIZE];
+
+extern struct node_data node_data[];
+
+static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
+{
+ unsigned nid;
+ VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+ nid = memnodemap[addr >> memnode_shift];
+ VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
+ return nid;
+}
+
+#define NODE_DATA(nid) (&(node_data[nid]))
+
+#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
+ NODE_DATA(nid)->node_spanned_pages)
+
+
+#else
+#define init_cpu_to_node() do {} while (0)
+#define clear_node_cpumask(cpu) do {} while (0)
+#endif
+
+#define NUMA_NO_NODE 0xff
+
+#endif
diff --git a/xen/include/asm-x86/numnodes.h b/xen/include/asm-x86/numnodes.h
new file mode 100644
index 0000000000..92fadea5a8
--- /dev/null
+++ b/xen/include/asm-x86/numnodes.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_MAX_NUMNODES_H
+#define _ASM_MAX_NUMNODES_H
+
+#include <xen/config.h>
+
+#if defined(__i386__)
+#ifdef CONFIG_X86_NUMAQ
+
+/* Max 16 Nodes */
+#define NODES_SHIFT 4
+
+#elif defined(CONFIG_ACPI_SRAT)
+
+/* Max 8 Nodes */
+#define NODES_SHIFT 3
+
+#endif /* CONFIG_X86_NUMAQ */
+
+
+#endif /* __i386__ */
+
+#if defined(CONFIG_NUMA) && defined(__x86_64__)
+#define NODES_SHIFT 6
+#endif /* __x86_64__ */
+
+#endif /* _ASM_MAX_NUMNODES_H */
diff --git a/xen/include/asm-x86/topology.h b/xen/include/asm-x86/topology.h
new file mode 100644
index 0000000000..0a38cd0792
--- /dev/null
+++ b/xen/include/asm-x86/topology.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Ryan Harper <ryanh@us.ibm.com>
+ */
+
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+#include <xen/config.h>
+#include <xen/bitops.h>
+
+extern cpumask_t cpu_online_map;
+
+extern unsigned int cpu_to_node[];
+extern cpumask_t node_to_cpumask[];
+
+#define cpu_to_node(cpu) (cpu_to_node[cpu])
+#define parent_node(node) (node)
+#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node]))
+#define node_to_cpumask(node) (node_to_cpumask[node])
+
+#endif /* _ASM_X86_TOPOLOGY_H */
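
Illustrative use of the accessors defined above (assumes a CONFIG_NUMA build where the arrays have been populated):

    int node = cpu_to_node(3);                /* NUMA node of CPU 3 */
    cpumask_t peers = node_to_cpumask(node);  /* every CPU on that node */
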
diff --git a/xen/include/xen/config.h b/xen/include/xen/config.h
index e3f94d5843..f79472da77 100644
--- a/xen/include/xen/config.h
+++ b/xen/include/xen/config.h
@@ -50,5 +50,7 @@
#endif /* !__ASSEMBLY__ */
#define fastcall
+#define __cpuinitdata
+#define __cpuinit
#endif /* __XEN_CONFIG_H__ */
diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h
new file mode 100644
index 0000000000..b4a882e482
--- /dev/null
+++ b/xen/include/xen/nodemask.h
@@ -0,0 +1,342 @@
+#ifndef __LINUX_NODEMASK_H
+#define __LINUX_NODEMASK_H
+
+/*
+ * Nodemasks provide a bitmap suitable for representing the
+ * set of Nodes in a system, one bit position per Node number.
+ *
+ * See detailed comments in the file linux/bitmap.h describing the
+ * data type on which these nodemasks are based.
+ *
+ * For details of nodemask_scnprintf() and nodemask_parse(),
+ * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ *
+ * The available nodemask operations are:
+ *
+ * void node_set(node, mask) turn on bit 'node' in mask
+ * void node_clear(node, mask) turn off bit 'node' in mask
+ * void nodes_setall(mask) set all bits
+ * void nodes_clear(mask) clear all bits
+ * int node_isset(node, mask) true iff bit 'node' set in mask
+ * int node_test_and_set(node, mask) test and set bit 'node' in mask
+ *
+ * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection]
+ * void nodes_or(dst, src1, src2) dst = src1 | src2 [union]
+ * void nodes_xor(dst, src1, src2) dst = src1 ^ src2
+ * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2
+ * void nodes_complement(dst, src) dst = ~src
+ *
+ * int nodes_equal(mask1, mask2) Does mask1 == mask2?
+ * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect?
+ * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2?
+ * int nodes_empty(mask) Is mask empty (no bits set)?
+ * int nodes_full(mask) Is mask full (all bits set)?
+ * int nodes_weight(mask) Hamming weight - number of set bits
+ *
+ * void nodes_shift_right(dst, src, n) Shift right
+ * void nodes_shift_left(dst, src, n) Shift left
+ *
+ * int first_node(mask) Number lowest set bit, or MAX_NUMNODES
+ * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
+ * int first_unset_node(mask) First node not set in mask, or
+ * MAX_NUMNODES.
+ *
+ * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set
+ * NODE_MASK_ALL Initializer - all bits set
+ * NODE_MASK_NONE Initializer - no bits set
+ * unsigned long *nodes_addr(mask) Array of unsigned long's in mask
+ *
+ * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
+ * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask
+ *
+ * for_each_node_mask(node, mask) for-loop node over mask
+ *
+ * int num_online_nodes() Number of online Nodes
+ * int num_possible_nodes() Number of all possible Nodes
+ *
+ * int node_online(node) Is some node online?
+ * int node_possible(node) Is some node possible?
+ *
+ * int any_online_node(mask) First online node in mask
+ *
+ * node_set_online(node) set bit 'node' in node_online_map
+ * node_set_offline(node) clear bit 'node' in node_online_map
+ *
+ * for_each_node(node) for-loop node over node_possible_map
+ * for_each_online_node(node) for-loop node over node_online_map
+ *
+ * Subtlety:
+ * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
+ * to generate slightly worse code. So use a simple one-line #define
+ * for node_isset(), instead of wrapping an inline inside a macro, the
+ * way we do the other calls.
+ */
+
+#if 0
+#include <linux/threads.h>
+#include <asm/bug.h>
+#endif
+#include <xen/kernel.h>
+#include <xen/bitmap.h>
+#include <xen/numa.h>
+
+typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
+extern nodemask_t _unused_nodemask_arg_;
+
+#define node_set(node, dst) __node_set((node), &(dst))
+static inline void __node_set(int node, volatile nodemask_t *dstp)
+{
+ set_bit(node, dstp->bits);
+}
+
+#define node_clear(node, dst) __node_clear((node), &(dst))
+static inline void __node_clear(int node, volatile nodemask_t *dstp)
+{
+ clear_bit(node, dstp->bits);
+}
+
+#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
+static inline void __nodes_setall(nodemask_t *dstp, int nbits)
+{
+ bitmap_fill(dstp->bits, nbits);
+}
+
+#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
+static inline void __nodes_clear(nodemask_t *dstp, int nbits)
+{
+ bitmap_zero(dstp->bits, nbits);
+}
+
+/* No static inline type checking - see Subtlety (1) above. */
+#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)
+
+#define node_test_and_set(node, nodemask) \
+ __node_test_and_set((node), &(nodemask))
+static inline int __node_test_and_set(int node, nodemask_t *addr)
+{
+ return test_and_set_bit(node, addr->bits);
+}
+
+#define nodes_and(dst, src1, src2) \
+ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_or(dst, src1, src2) \
+ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_xor(dst, src1, src2) \
+ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_andnot(dst, src1, src2) \
+ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
+static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_complement(dst, src) \
+ __nodes_complement(&(dst), &(src), MAX_NUMNODES)
+static inline void __nodes_complement(nodemask_t *dstp,
+ const nodemask_t *srcp, int nbits)
+{
+ bitmap_complement(dstp->bits, srcp->bits, nbits);
+}
+
+#define nodes_equal(src1, src2) \
+ __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_equal(const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ return bitmap_equal(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_intersects(src1, src2) \
+ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_intersects(const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ return bitmap_intersects(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_subset(src1, src2) \
+ __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
+static inline int __nodes_subset(const nodemask_t *src1p,
+ const nodemask_t *src2p, int nbits)
+{
+ return bitmap_subset(src1p->bits, src2p->bits, nbits);
+}
+
+#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
+static inline int __nodes_empty(const nodemask_t *srcp, int nbits)
+{
+ return bitmap_empty(srcp->bits, nbits);
+}
+
+#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_full(const nodemask_t *srcp, int nbits)
+{
+ return bitmap_full(srcp->bits, nbits);
+}
+
+#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
+static inline int __nodes_weight(const nodemask_t *srcp, int nbits)
+{
+ return bitmap_weight(srcp->bits, nbits);
+}
+
+#define nodes_shift_right(dst, src, n) \
+ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_right(nodemask_t *dstp,
+ const nodemask_t *srcp, int n, int nbits)
+{
+ bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
+}
+
+#define nodes_shift_left(dst, src, n) \
+ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
+static inline void __nodes_shift_left(nodemask_t *dstp,
+ const nodemask_t *srcp, int n, int nbits)
+{
+ bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
+}
+
+/* FIXME: better would be to fix all architectures to never return
+ > MAX_NUMNODES, then the silly min_ts could be dropped. */
+
+#define first_node(src) __first_node(&(src))
+static inline int __first_node(const nodemask_t *srcp)
+{
+ return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
+}
+
+#define next_node(n, src) __next_node((n), &(src))
+static inline int __next_node(int n, const nodemask_t *srcp)
+{
+ return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
+}
+
+#define nodemask_of_node(node) \
+({ \
+ typeof(_unused_nodemask_arg_) m; \
+ if (sizeof(m) == sizeof(unsigned long)) { \
+ m.bits[0] = 1UL<<(node); \
+ } else { \
+ nodes_clear(m); \
+ node_set((node), m); \
+ } \
+ m; \
+})
+
+#define first_unset_node(mask) __first_unset_node(&(mask))
+static inline int __first_unset_node(const nodemask_t *maskp)
+{
+ return min_t(int,MAX_NUMNODES,
+ find_first_zero_bit(maskp->bits, MAX_NUMNODES));
+}
+
+#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)
+
+#if MAX_NUMNODES <= BITS_PER_LONG
+
+#define NODE_MASK_ALL \
+((nodemask_t) { { \
+ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \
+} })
+
+#else
+
+#define NODE_MASK_ALL \
+((nodemask_t) { { \
+ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \
+ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \
+} })
+
+#endif
+
+#define NODE_MASK_NONE \
+((nodemask_t) { { \
+ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \
+} })
+
+#define nodes_addr(src) ((src).bits)
+
+#if 0
+#define nodemask_scnprintf(buf, len, src) \
+ __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
+static inline int __nodemask_scnprintf(char *buf, int len,
+ const nodemask_t *srcp, int nbits)
+{
+ return bitmap_scnprintf(buf, len, srcp->bits, nbits);
+}
+
+#define nodemask_parse(ubuf, ulen, dst) \
+ __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES)
+static inline int __nodemask_parse(const char __user *buf, int len,
+ nodemask_t *dstp, int nbits)
+{
+ return bitmap_parse(buf, len, dstp->bits, nbits);
+}
+#endif
+
+#if MAX_NUMNODES > 1
+#define for_each_node_mask(node, mask) \
+ for ((node) = first_node(mask); \
+ (node) < MAX_NUMNODES; \
+ (node) = next_node((node), (mask)))
+#else /* MAX_NUMNODES == 1 */
+#define for_each_node_mask(node, mask) \
+ if (!nodes_empty(mask)) \
+ for ((node) = 0; (node) < 1; (node)++)
+#endif /* MAX_NUMNODES */
+
+/*
+ * The following particular system nodemasks and operations
+ * on them manage all possible and online nodes.
+ */
+
+extern nodemask_t node_online_map;
+extern nodemask_t node_possible_map;
+
+#if MAX_NUMNODES > 1
+#define num_online_nodes() nodes_weight(node_online_map)
+#define num_possible_nodes() nodes_weight(node_possible_map)
+#define node_online(node) node_isset((node), node_online_map)
+#define node_possible(node) node_isset((node), node_possible_map)
+#else
+#define num_online_nodes() 1
+#define num_possible_nodes() 1
+#define node_online(node) ((node) == 0)
+#define node_possible(node) ((node) == 0)
+#endif
+
+#define any_online_node(mask) \
+({ \
+ int node; \
+ for_each_node_mask(node, (mask)) \
+ if (node_online(node)) \
+ break; \
+ node; \
+})
+
+#define node_set_online(node) set_bit((node), node_online_map.bits)
+#define node_set_offline(node) clear_bit((node), node_online_map.bits)
+
+#define for_each_node(node) for_each_node_mask((node), node_possible_map)
+#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
+
+#endif /* __LINUX_NODEMASK_H */
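
A short usage sketch of the API above (illustrative; assumes a build with MAX_NUMNODES > 2):

    nodemask_t mask = NODE_MASK_NONE;
    int node;

    node_set(0, mask);
    node_set(2, mask);
    for_each_node_mask(node, mask)
        printk("node %d set\n", node);        /* visits 0, then 2 */
    if (!nodes_subset(mask, node_online_map))
        printk("mask includes offline nodes\n");
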
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
new file mode 100644
index 0000000000..30afceb52f
--- /dev/null
+++ b/xen/include/xen/numa.h
@@ -0,0 +1,35 @@
+#ifndef _XEN_NUMA_H
+#define _XEN_NUMA_H
+
+#include <xen/config.h>
+
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/numnodes.h>
+#endif
+
+#ifndef NODES_SHIFT
+#define NODES_SHIFT 0
+#endif
+
+#define MAX_NUMNODES (1 << NODES_SHIFT)
+#define NUMA_NO_NODE 0xff
+
+#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
+#define MAX_CHUNKS_PER_NODE 4
+#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+
+/* needed for drivers/acpi/numa.c */
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+extern unsigned int cpu_to_node[];
+#include <xen/cpumask.h>
+extern cpumask_t node_to_cpumask[];
+
+typedef struct node_data {
+ unsigned long node_start_pfn;
+ unsigned long node_spanned_pages;
+ unsigned int node_id;
+} node_data_t;
+
+#endif /* _XEN_NUMA_H */
diff --git a/xen/include/xen/topology.h b/xen/include/xen/topology.h
new file mode 100644
index 0000000000..e836bf132f
--- /dev/null
+++ b/xen/include/xen/topology.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2006, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _XEN_TOPOLOGY_H
+#define _XEN_TOPOLOGY_H
+
+#include <asm/topology.h>
+
+#endif /* _XEN_TOPOLOGY_H */