author    Keir Fraser <keir.fraser@citrix.com>    2009-12-09 10:42:53 +0000
committer Keir Fraser <keir.fraser@citrix.com>    2009-12-09 10:42:53 +0000
commit    6369c4faa303cd4e8af9ff6ad73315122d9defc5 (patch)
tree      170dcce577b237ae629f6572685f65c2d3f99171 /xen/arch/x86/numa.c
parent    4d817e3923416c21a61c0df32f244893974eb2e8 (diff)
SRAT memory hotplug 2/2: Support overlapped and sparse node memory arrangement.
Currently the Xen hypervisor uses a nodes array to keep the start/end address of each node, and assumes that memory ranges of different nodes do not overlap. This is not always true, especially once the system supports memory hotplug. This patch backports the Linux kernel's memblks to support overlap among nodes. The memblks are used both for conflict checking and for calculating memnode_shift.

Also, currently if a node has no memory populated when the system boots, the node is unparsed later and the corresponding CPUs' NUMA information is removed as well. This patch keeps that CPU information.

One thing to notice: memnode_shift is currently calculated over all memory, including unpopulated ranges. This should work as long as the smallest chunk is not too small. Another option would be flags in the page_info structure, etc.

The memnodemap is changed from paddr-indexed to pdx-indexed, both to save space and because most accesses are by pfn. A mem_hotplug flag is added, set when a hotplug memory range exists.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
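[Editorial illustration, not part of the commit.] The core mechanism of the patch is: OR together all node start addresses (in pdx units), take the lowest set bit as the shift, then fill a byte table indexed by pdx >> shift, treating a doubly-claimed slot as overlap. A minimal standalone C sketch of that idea follows; struct node, extract_lsb() and populate() here are simplified stand-ins for Xen's struct node, extract_lsb_from_nodes() and populate_memnodemap() in the diff below, and the node ranges are invented.

#include <stdio.h>
#include <string.h>

#define NODEMAPSIZE  0x10000
#define NUMA_NO_NODE 0xffu

/* Simplified stand-in for Xen's struct node: bounds already in pdx units. */
struct node { unsigned long spdx, epdx; };

static unsigned char memnodemap[NODEMAPSIZE];

/* Largest usable shift: every node start must be aligned to 1 << shift,
 * so take the lowest set bit across all start addresses, as the patch does. */
static int extract_lsb(const struct node *nodes, int numnodes)
{
    unsigned long bitfield = 0;
    int i, used = 0;

    for (i = 0; i < numnodes; i++) {
        if (nodes[i].spdx >= nodes[i].epdx)
            continue;
        bitfield |= nodes[i].spdx;
        used++;
    }
    /* __builtin_ctzl (GCC/Clang) returns the index of the lowest set bit. */
    return (used <= 1 || !bitfield) ? 63 : __builtin_ctzl(bitfield);
}

/* Fill the pdx -> node table; report overlap the same way the patch does. */
static int populate(const struct node *nodes, int numnodes, int shift)
{
    unsigned long pdx;
    int i;

    memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
    for (i = 0; i < numnodes; i++)
        for (pdx = nodes[i].spdx; pdx < nodes[i].epdx; pdx += 1UL << shift) {
            if (memnodemap[pdx >> shift] != NUMA_NO_NODE)
                return -1;          /* two nodes claim one slot: overlap */
            memnodemap[pdx >> shift] = (unsigned char)i;
        }
    return 1;
}

int main(void)
{
    /* Two invented nodes, each 1 << 18 pdx units long. */
    struct node nodes[] = { { 0x00000, 0x40000 }, { 0x40000, 0x80000 } };
    int shift = extract_lsb(nodes, 2);

    printf("shift=%d populate=%d nid(0x50000)=%u\n", shift,
           populate(nodes, 2, shift), memnodemap[0x50000 >> shift]);
    return 0;
}

With these example nodes the sketch prints shift=18, and the lookup for pdx 0x50000 resolves to node 1 with one shift and one byte load, which is the point of the memnode_shift/memnodemap scheme.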
Diffstat (limited to 'xen/arch/x86/numa.c')
-rw-r--r--  xen/arch/x86/numa.c | 84
1 file changed, 56 insertions(+), 28 deletions(-)
diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
index 51fb342a15..676ff37001 100644
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -28,6 +28,7 @@ custom_param("numa", numa_setup);
 
 struct node_data node_data[MAX_NUMNODES];
 
+/* Mapping from pdx to node id */
 int memnode_shift;
 u8 memnodemap[NODEMAPSIZE];
@@ -52,54 +53,81 @@ int acpi_numa __devinitdata;
  * 0 if memnodemap[] too small (or shift too small)
  * -1 if node overlap or lost ram (shift too big)
  */
-static int __devinit
-populate_memnodemap(const struct node *nodes, int numnodes, int shift)
+static int __init populate_memnodemap(const struct node *nodes,
+                                      int numnodes, int shift, int *nodeids)
 {
-    int i;
-    int res = -1;
-    paddr_t addr, end;
+    unsigned long spdx, epdx;
+    int i, res = -1;
 
-    if (shift >= 64)
-        return -1;
-    memset(memnodemap, 0xff, sizeof(memnodemap));
+    memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
     for (i = 0; i < numnodes; i++) {
-        addr = nodes[i].start;
-        end = nodes[i].end;
-        if (addr >= end)
+        spdx = paddr_to_pdx(nodes[i].start);
+        epdx = paddr_to_pdx(nodes[i].end);
+        if (spdx >= epdx)
             continue;
-        if ((end >> shift) >= NODEMAPSIZE)
+        if ((epdx >> shift) >= NODEMAPSIZE)
             return 0;
         do {
-            if (memnodemap[addr >> shift] != 0xff)
+            if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
                 return -1;
-            memnodemap[addr >> shift] = i;
-            addr += (1ULL << shift);
-        } while (addr < end);
+
+            if (!nodeids)
+                memnodemap[spdx >> shift] = i;
+            else
+                memnodemap[spdx >> shift] = nodeids[i];
+
+            spdx += (1UL << shift);
+        } while (spdx < epdx);
         res = 1;
-    } 
+    }
     return res;
 }
 
-int __init compute_hash_shift(struct node *nodes, int numnodes)
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init extract_lsb_from_nodes(const struct node *nodes,
+                                         int numnodes)
 {
-    int shift = 20;
+    int i, nodes_used = 0;
+    unsigned long spdx, epdx;
+    unsigned long bitfield = 0, memtop = 0;
 
-    while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
-        shift++;
+    for (i = 0; i < numnodes; i++) {
+        spdx = paddr_to_pdx(nodes[i].start);
+        epdx = paddr_to_pdx(nodes[i].end);
+        if (spdx >= epdx)
+            continue;
+        bitfield |= spdx;
+        nodes_used++;
+        if (epdx > memtop)
+            memtop = epdx;
+    }
+    if (nodes_used <= 1)
+        i = 63;
+    else
+        i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+    return i;
+}
 
+int __init compute_hash_shift(struct node *nodes, int numnodes,
+                              int *nodeids)
+{
+    int shift;
+
+    shift = extract_lsb_from_nodes(nodes, numnodes);
     printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
            shift);
-    if (populate_memnodemap(nodes, numnodes, shift) != 1) {
-        printk(KERN_INFO
-               "Your memory is not aligned you need to rebuild your kernel "
-               "with a bigger NODEMAPSIZE shift=%d\n",
-               shift);
+    if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
+        printk(KERN_INFO "Your memory is not aligned you need to "
+               "rebuild your kernel with a bigger NODEMAPSIZE "
+               "shift=%d\n", shift);
         return -1;
     }
     return shift;
 }
-
 /* initialize NODE_DATA given nodeid and start/end */
 void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
 {
@@ -167,7 +195,7 @@ static int numa_emulation(u64 start_pfn, u64 end_pfn)
                (nodes[i].end - nodes[i].start) >> 20);
         node_set_online(i);
     }
-    memnode_shift = compute_hash_shift(nodes, numa_fake);
+    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
     if (memnode_shift < 0) {
         memnode_shift = 0;
         printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
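[Editorial note, not part of the commit.] For context, the lookup side consumes this table in O(1). A phys_to_nid()-style helper would reduce to roughly the following sketch, assuming the memnode_shift/memnodemap globals and the paddr_to_pdx() helper used in the diff above; the function names here are illustrative, not Xen's exact definitions.

/* Sketch of the lookup path implied by the patch: physical address ->
 * pdx -> node id.  One shift plus one byte load per query. */
static inline unsigned char pdx_to_nid(unsigned long pdx)
{
    return memnodemap[pdx >> memnode_shift];
}

static inline unsigned char paddr_to_nid(paddr_t addr)
{
    return pdx_to_nid(paddr_to_pdx(addr));
}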