about summary refs log tree commit diff stats
path: root/xen/common
diff options
context:
space:
mode:
authorKeir Fraser <keir.fraser@citrix.com>2009-05-26 11:05:04 +0100
committerKeir Fraser <keir.fraser@citrix.com>2009-05-26 11:05:04 +0100
commit6009f4ddb2cdb8555d2d5e030d351893e971b995 (patch)
tree6f146a530b5065a1688aa456280f965e1751f2c8 /xen/common
parentff811c2bc429a70798cf65913549c0ddaab70c3d (diff)
downloadxen-6009f4ddb2cdb8555d2d5e030d351893e971b995.tar.gz
xen-6009f4ddb2cdb8555d2d5e030d351893e971b995.tar.bz2
xen-6009f4ddb2cdb8555d2d5e030d351893e971b995.zip
Transcendent memory ("tmem") for Xen.
Tmem, when called from a tmem-capable (paravirtualized) guest, makes use of otherwise unutilized ("fallow") memory to create and manage pools of pages that can be accessed from the guest either as "ephemeral" pages or as "persistent" pages. In either case, the pages are not directly addressable by the guest, only copied to and fro via the tmem interface. Ephemeral pages are a nice place for a guest to put recently evicted clean pages that it might need again; these pages can be reclaimed synchronously by Xen for other guests or other uses. Persistent pages are a nice place for a guest to put "swap" pages to avoid sending them to disk. These pages retain data as long as the guest lives, but count against the guest memory allocation. Tmem pages may optionally be compressed and, in certain cases, can be shared between guests. Tmem also handles concurrency nicely and provides limited QoS settings to combat malicious DoS attempts. Save/restore and live migration support is not yet provided. Tmem is primarily targeted for an x86 64-bit hypervisor. On a 32-bit x86 hypervisor, it has limited functionality and testing due to limitations of the xen heap. Nearly all of tmem is architecture-independent; three routines remain to be ported to ia64 and it should work on that architecture too. It is also structured to be portable to non-Xen environments. Tmem defaults off (for now) and must be enabled with a "tmem" xen boot option (and does nothing unless a tmem-capable guest is running). The "tmem_compress" boot option enables compression which takes about 10x more CPU but approximately doubles the number of pages that can be stored. Tmem can be controlled via several "xm" commands and many interesting tmem statistics can be obtained. A README and internal specification will follow, but lots of useful prose about tmem, as well as Linux patches, can be found at http://oss.oracle.com/projects/tmem . Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Diffstat (limited to 'xen/common')
-rw-r--r--xen/common/Makefile5
-rw-r--r--xen/common/compat/Makefile1
-rw-r--r--xen/common/compat/tmem_xen.c26
-rw-r--r--xen/common/domain.c4
-rw-r--r--xen/common/lzo.c518
-rw-r--r--xen/common/memory.c11
-rw-r--r--xen/common/page_alloc.c43
-rw-r--r--xen/common/radix-tree.c448
-rw-r--r--xen/common/rbtree.c398
-rw-r--r--xen/common/spinlock.c12
-rw-r--r--xen/common/tmem.c2109
-rw-r--r--xen/common/tmem_xen.c334
-rw-r--r--xen/common/xmalloc_tlsf.c35
13 files changed, 3915 insertions, 29 deletions
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 3054f2e271..08b9e2b00e 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -28,6 +28,11 @@ obj-y += version.o
obj-y += vsprintf.o
obj-y += xmalloc_tlsf.o
obj-y += rcupdate.o
+obj-y += tmem.o
+obj-y += tmem_xen.o
+obj-y += radix-tree.o
+obj-y += rbtree.o
+obj-y += lzo.o
obj-$(perfc) += perfc.o
obj-$(crash_debug) += gdbstub.o
diff --git a/xen/common/compat/Makefile b/xen/common/compat/Makefile
index 9a36a3dcd3..1cf289ab3e 100644
--- a/xen/common/compat/Makefile
+++ b/xen/common/compat/Makefile
@@ -3,3 +3,4 @@ obj-y += kernel.o
obj-y += memory.o
obj-y += multicall.o
obj-y += xlat.o
+obj-y += tmem_xen.o
diff --git a/xen/common/compat/tmem_xen.c b/xen/common/compat/tmem_xen.c
new file mode 100644
index 0000000000..f6c9e0453d
--- /dev/null
+++ b/xen/common/compat/tmem_xen.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * tmem_xen.c
+ *
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/guest_access.h>
+#include <xen/hypercall.h>
+#include <compat/tmem.h>
+
+#define xen_tmem_op tmem_op
+/*CHECK_tmem_op;*/
+#undef xen_tmem_op
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 187735b18c..66694168a2 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -31,6 +31,7 @@
#include <public/vcpu.h>
#include <xsm/xsm.h>
#include <xen/trace.h>
+#include <xen/tmem.h>
/* Linux config option: propageted to domain0 */
/* xen_processor_pmbits: xen control Cx, Px, ... */
@@ -558,6 +559,9 @@ static void complete_domain_destroy(struct rcu_head *head)
grant_table_destroy(d);
+ if ( d->tmem != NULL )
+ tmem_destroy(d->tmem);
+
arch_domain_destroy(d);
rangeset_domain_destroy(d);
diff --git a/xen/common/lzo.c b/xen/common/lzo.c
new file mode 100644
index 0000000000..eeb200b281
--- /dev/null
+++ b/xen/common/lzo.c
@@ -0,0 +1,518 @@
+/*
+ * lzo.c -- LZO1X Compressor from MiniLZO
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Adapted for Xen (files combined and syntactic/header changes) by:
+ * Dan Magenheimer <dan.magenheimer@oracle.com>
+ *
+ */
+
+/*
+ * lzodefs.h -- architecture, OS and compiler specific defines
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Changed for kernel use by:
+ * Nitin Gupta <nitingupta910@gmail.com>
+ * Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#define LZO_VERSION 0x2020
+#define LZO_VERSION_STRING "2.02"
+#define LZO_VERSION_DATE "Oct 17 2005"
+
+#define M1_MAX_OFFSET 0x0400
+#define M2_MAX_OFFSET 0x0800
+#define M3_MAX_OFFSET 0x4000
+#define M4_MAX_OFFSET 0xbfff
+
+#define M1_MIN_LEN 2
+#define M1_MAX_LEN 2
+#define M2_MIN_LEN 3
+#define M2_MAX_LEN 8
+#define M3_MIN_LEN 3
+#define M3_MAX_LEN 33
+#define M4_MIN_LEN 3
+#define M4_MAX_LEN 9
+
+#define M1_MARKER 0
+#define M2_MARKER 64
+#define M3_MARKER 32
+#define M4_MARKER 16
+
+#define D_BITS 14
+#define D_MASK ((1u << D_BITS) - 1)
+#define D_HIGH ((D_MASK >> 1) + 1)
+
+#define DX2(p, s1, s2) (((((size_t)((p)[2]) << (s2)) ^ (p)[1]) \
+ << (s1)) ^ (p)[0])
+#define DX3(p, s1, s2, s3) ((DX2((p)+1, s2, s3) << (s1)) ^ (p)[0])
+
+/*
+ * LZO1X Compressor from MiniLZO
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Changed for kernel use by:
+ * Nitin Gupta <nitingupta910@gmail.com>
+ * Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#include <xen/types.h>
+#include <xen/lzo.h>
+#define get_unaligned(_p) (*(_p))
+#define put_unaligned(_val,_p) (*(_p)=_val)
+#define get_unaligned_le16(_p) (*(u16 *)(_p))
+
+static noinline size_t
+_lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
+ unsigned char *out, size_t *out_len, void *wrkmem)
+{
+ const unsigned char * const in_end = in + in_len;
+ const unsigned char * const ip_end = in + in_len - M2_MAX_LEN - 5;
+ const unsigned char ** const dict = wrkmem;
+ const unsigned char *ip = in, *ii = ip;
+ const unsigned char *end, *m, *m_pos;
+ size_t m_off, m_len, dindex;
+ unsigned char *op = out;
+
+ ip += 4;
+
+ for (;;) {
+ dindex = ((size_t)(0x21 * DX3(ip, 5, 5, 6)) >> 5) & D_MASK;
+ m_pos = dict[dindex];
+
+ if (m_pos < in)
+ goto literal;
+
+ if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET))
+ goto literal;
+
+ m_off = ip - m_pos;
+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+ goto try_match;
+
+ dindex = (dindex & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f);
+ m_pos = dict[dindex];
+
+ if (m_pos < in)
+ goto literal;
+
+ if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET))
+ goto literal;
+
+ m_off = ip - m_pos;
+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+ goto try_match;
+
+ goto literal;
+
+ try_match:
+ if (get_unaligned((const unsigned short *)m_pos)
+ == get_unaligned((const unsigned short *)ip)) {
+ if (likely(m_pos[2] == ip[2]))
+ goto match;
+ }
+
+ literal:
+ dict[dindex] = ip;
+ ++ip;
+ if (unlikely(ip >= ip_end))
+ break;
+ continue;
+
+ match:
+ dict[dindex] = ip;
+ if (ip != ii) {
+ size_t t = ip - ii;
+
+ if (t <= 3) {
+ op[-2] |= t;
+ } else if (t <= 18) {
+ *op++ = (t - 3);
+ } else {
+ size_t tt = t - 18;
+
+ *op++ = 0;
+ while (tt > 255) {
+ tt -= 255;
+ *op++ = 0;
+ }
+ *op++ = tt;
+ }
+ do {
+ *op++ = *ii++;
+ } while (--t > 0);
+ }
+
+ ip += 3;
+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++
+ || m_pos[5] != *ip++ || m_pos[6] != *ip++
+ || m_pos[7] != *ip++ || m_pos[8] != *ip++) {
+ --ip;
+ m_len = ip - ii;
+
+ if (m_off <= M2_MAX_OFFSET) {
+ m_off -= 1;
+ *op++ = (((m_len - 1) << 5)
+ | ((m_off & 7) << 2));
+ *op++ = (m_off >> 3);
+ } else if (m_off <= M3_MAX_OFFSET) {
+ m_off -= 1;
+ *op++ = (M3_MARKER | (m_len - 2));
+ goto m3_m4_offset;
+ } else {
+ m_off -= 0x4000;
+
+ *op++ = (M4_MARKER | ((m_off & 0x4000) >> 11)
+ | (m_len - 2));
+ goto m3_m4_offset;
+ }
+ } else {
+ end = in_end;
+ m = m_pos + M2_MAX_LEN + 1;
+
+ while (ip < end && *m == *ip) {
+ m++;
+ ip++;
+ }
+ m_len = ip - ii;
+
+ if (m_off <= M3_MAX_OFFSET) {
+ m_off -= 1;
+ if (m_len <= 33) {
+ *op++ = (M3_MARKER | (m_len - 2));
+ } else {
+ m_len -= 33;
+ *op++ = M3_MARKER | 0;
+ goto m3_m4_len;
+ }
+ } else {
+ m_off -= 0x4000;
+ if (m_len <= M4_MAX_LEN) {
+ *op++ = (M4_MARKER
+ | ((m_off & 0x4000) >> 11)
+ | (m_len - 2));
+ } else {
+ m_len -= M4_MAX_LEN;
+ *op++ = (M4_MARKER
+ | ((m_off & 0x4000) >> 11));
+ m3_m4_len:
+ while (m_len > 255) {
+ m_len -= 255;
+ *op++ = 0;
+ }
+
+ *op++ = (m_len);
+ }
+ }
+ m3_m4_offset:
+ *op++ = ((m_off & 63) << 2);
+ *op++ = (m_off >> 6);
+ }
+
+ ii = ip;
+ if (unlikely(ip >= ip_end))
+ break;
+ }
+
+ *out_len = op - out;
+ return in_end - ii;
+}
+
+int lzo1x_1_compress(const unsigned char *in, size_t in_len, unsigned char *out,
+ size_t *out_len, void *wrkmem)
+{
+ const unsigned char *ii;
+ unsigned char *op = out;
+ size_t t;
+
+ if (unlikely(in_len <= M2_MAX_LEN + 5)) {
+ t = in_len;
+ } else {
+ t = _lzo1x_1_do_compress(in, in_len, op, out_len, wrkmem);
+ op += *out_len;
+ }
+
+ if (t > 0) {
+ ii = in + in_len - t;
+
+ if (op == out && t <= 238) {
+ *op++ = (17 + t);
+ } else if (t <= 3) {
+ op[-2] |= t;
+ } else if (t <= 18) {
+ *op++ = (t - 3);
+ } else {
+ size_t tt = t - 18;
+
+ *op++ = 0;
+ while (tt > 255) {
+ tt -= 255;
+ *op++ = 0;
+ }
+
+ *op++ = tt;
+ }
+ do {
+ *op++ = *ii++;
+ } while (--t > 0);
+ }
+
+ *op++ = M4_MARKER | 1;
+ *op++ = 0;
+ *op++ = 0;
+
+ *out_len = op - out;
+ return LZO_E_OK;
+}
+
+/*
+ * LZO1X Decompressor from MiniLZO
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Changed for kernel use by:
+ * Nitin Gupta <nitingupta910@gmail.com>
+ * Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#define HAVE_IP(x, ip_end, ip) ((size_t)(ip_end - ip) < (x))
+#define HAVE_OP(x, op_end, op) ((size_t)(op_end - op) < (x))
+#define HAVE_LB(m_pos, out, op) (m_pos < out || m_pos >= op)
+
+#define COPY4(dst, src) \
+ put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst))
+
+int lzo1x_decompress_safe(const unsigned char *in, size_t in_len,
+ unsigned char *out, size_t *out_len)
+{
+ const unsigned char * const ip_end = in + in_len;
+ unsigned char * const op_end = out + *out_len;
+ const unsigned char *ip = in, *m_pos;
+ unsigned char *op = out;
+ size_t t;
+
+ *out_len = 0;
+
+ if (*ip > 17) {
+ t = *ip++ - 17;
+ if (t < 4)
+ goto match_next;
+ if (HAVE_OP(t, op_end, op))
+ goto output_overrun;
+ if (HAVE_IP(t + 1, ip_end, ip))
+ goto input_overrun;
+ do {
+ *op++ = *ip++;
+ } while (--t > 0);
+ goto first_literal_run;
+ }
+
+ while ((ip < ip_end)) {
+ t = *ip++;
+ if (t >= 16)
+ goto match;
+ if (t == 0) {
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ }
+ t += 15 + *ip++;
+ }
+ if (HAVE_OP(t + 3, op_end, op))
+ goto output_overrun;
+ if (HAVE_IP(t + 4, ip_end, ip))
+ goto input_overrun;
+
+ COPY4(op, ip);
+ op += 4;
+ ip += 4;
+ if (--t > 0) {
+ if (t >= 4) {
+ do {
+ COPY4(op, ip);
+ op += 4;
+ ip += 4;
+ t -= 4;
+ } while (t >= 4);
+ if (t > 0) {
+ do {
+ *op++ = *ip++;
+ } while (--t > 0);
+ }
+ } else {
+ do {
+ *op++ = *ip++;
+ } while (--t > 0);
+ }
+ }
+
+ first_literal_run:
+ t = *ip++;
+ if (t >= 16)
+ goto match;
+ m_pos = op - (1 + M2_MAX_OFFSET);
+ m_pos -= t >> 2;
+ m_pos -= *ip++ << 2;
+
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+
+ if (HAVE_OP(3, op_end, op))
+ goto output_overrun;
+ *op++ = *m_pos++;
+ *op++ = *m_pos++;
+ *op++ = *m_pos;
+
+ goto match_done;
+
+ do {
+ match:
+ if (t >= 64) {
+ m_pos = op - 1;
+ m_pos -= (t >> 2) & 7;
+ m_pos -= *ip++ << 3;
+ t = (t >> 5) - 1;
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+ if (HAVE_OP(t + 3 - 1, op_end, op))
+ goto output_overrun;
+ goto copy_match;
+ } else if (t >= 32) {
+ t &= 31;
+ if (t == 0) {
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ }
+ t += 31 + *ip++;
+ }
+ m_pos = op - 1;
+ m_pos -= get_unaligned_le16(ip) >> 2;
+ ip += 2;
+ } else if (t >= 16) {
+ m_pos = op;
+ m_pos -= (t & 8) << 11;
+
+ t &= 7;
+ if (t == 0) {
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ }
+ t += 7 + *ip++;
+ }
+ m_pos -= get_unaligned_le16(ip) >> 2;
+ ip += 2;
+ if (m_pos == op)
+ goto eof_found;
+ m_pos -= 0x4000;
+ } else {
+ m_pos = op - 1;
+ m_pos -= t >> 2;
+ m_pos -= *ip++ << 2;
+
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+ if (HAVE_OP(2, op_end, op))
+ goto output_overrun;
+
+ *op++ = *m_pos++;
+ *op++ = *m_pos;
+ goto match_done;
+ }
+
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+ if (HAVE_OP(t + 3 - 1, op_end, op))
+ goto output_overrun;
+
+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
+ COPY4(op, m_pos);
+ op += 4;
+ m_pos += 4;
+ t -= 4 - (3 - 1);
+ do {
+ COPY4(op, m_pos);
+ op += 4;
+ m_pos += 4;
+ t -= 4;
+ } while (t >= 4);
+ if (t > 0)
+ do {
+ *op++ = *m_pos++;
+ } while (--t > 0);
+ } else {
+ copy_match:
+ *op++ = *m_pos++;
+ *op++ = *m_pos++;
+ do {
+ *op++ = *m_pos++;
+ } while (--t > 0);
+ }
+ match_done:
+ t = ip[-2] & 3;
+ if (t == 0)
+ break;
+ match_next:
+ if (HAVE_OP(t, op_end, op))
+ goto output_overrun;
+ if (HAVE_IP(t + 1, ip_end, ip))
+ goto input_overrun;
+
+ *op++ = *ip++;
+ if (t > 1) {
+ *op++ = *ip++;
+ if (t > 2)
+ *op++ = *ip++;
+ }
+
+ t = *ip++;
+ } while (ip < ip_end);
+ }
+
+ *out_len = op - out;
+ return LZO_E_EOF_NOT_FOUND;
+
+ eof_found:
+ *out_len = op - out;
+ return (ip == ip_end ? LZO_E_OK :
+ (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
+ input_overrun:
+ *out_len = op - out;
+ return LZO_E_INPUT_OVERRUN;
+
+ output_overrun:
+ *out_len = op - out;
+ return LZO_E_OUTPUT_OVERRUN;
+
+ lookbehind_overrun:
+ *out_len = op - out;
+ return LZO_E_LOOKBEHIND_OVERRUN;
+}
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 55e2d8a046..0dd2b9282f 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -560,17 +560,6 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
return rc;
}
-/* Temporary placeholder. */
-int do_tmem_op(void *tmem_op)
-{
- static bool_t warned;
-
- if ( !test_and_set_bool(warned) )
- printk("tmem: not implemented\n");
-
- return -ENOSYS;
-}
-
/*
* Local variables:
* mode: C
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index ab3445b44b..bb143aedd6 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -35,6 +35,7 @@
#include <xen/perfc.h>
#include <xen/numa.h>
#include <xen/nodemask.h>
+#include <xen/tmem.h>
#include <public/sysctl.h>
#include <asm/page.h>
#include <asm/numa.h>
@@ -335,9 +336,9 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
/* Allocate 2^@order contiguous pages. */
static struct page_info *alloc_heap_pages(
unsigned int zone_lo, unsigned int zone_hi,
- unsigned int node, unsigned int order)
+ unsigned int node, unsigned int order, unsigned int memflags)
{
- unsigned int i, j, zone;
+ unsigned int i, j, zone = 0;
unsigned int num_nodes = num_online_nodes();
unsigned long request = 1UL << order;
cpumask_t extra_cpus_mask, mask;
@@ -380,6 +381,14 @@ static struct page_info *alloc_heap_pages(
node = 0;
}
+ /* Try to free memory from tmem */
+ if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
+ {
+ /* reassigning an already allocated anonymous heap page */
+ spin_unlock(&heap_lock);
+ return pg;
+ }
+
/* No suitable memory blocks. Fail the request. */
spin_unlock(&heap_lock);
return NULL;
@@ -1018,8 +1027,8 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
ASSERT(!in_irq());
- pg = alloc_heap_pages(
- MEMZONE_XEN, MEMZONE_XEN, cpu_to_node(smp_processor_id()), order);
+ pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
+ cpu_to_node(smp_processor_id()), order, memflags);
if ( unlikely(pg == NULL) )
return NULL;
@@ -1172,11 +1181,11 @@ struct page_info *alloc_domheap_pages(
return NULL;
if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
- pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order);
+ pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
if ( (pg == NULL) &&
((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
- node, order)) == NULL) )
+ node, order, memflags)) == NULL) )
return NULL;
if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
@@ -1373,6 +1382,28 @@ static void page_scrub_softirq(void)
spin_unlock(&serialise_lock);
}
+void scrub_list_splice(struct page_list_head *list)
+{
+ spin_lock(&page_scrub_lock);
+ page_list_splice(list, &page_scrub_list);
+ spin_unlock(&page_scrub_lock);
+}
+
+void scrub_list_add(struct page_info *pg)
+{
+ spin_lock(&page_scrub_lock);
+ page_list_add(pg, &page_scrub_list);
+ spin_unlock(&page_scrub_lock);
+}
+
+void scrub_one_page(struct page_info *pg)
+{
+ void *p = map_domain_page(page_to_mfn(pg));
+
+ scrub_page(p);
+ unmap_domain_page(p);
+}
+
static void page_scrub_timer_fn(void *unused)
{
page_scrub_schedule_work();
diff --git a/xen/common/radix-tree.c b/xen/common/radix-tree.c
new file mode 100644
index 0000000000..414f0cef72
--- /dev/null
+++ b/xen/common/radix-tree.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Copyright (C) 2009 adaption for Xen tmem by Dan Magenheimer, Oracle Corp.
+ * Changed:
+ * o Linux 2.6.18 source used (prior to read-copy-update addition)
+ * o constants and data structures moved out to radix-tree.h header
+ * o tagging code removed
+ * o radix_tree_insert has func parameter for dynamic data struct allocation
+ * o radix_tree_destroy added (including recursive helper function)
+ * o __init functions must be called explicitly
+ * o other include files adapted to Xen
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/radix-tree.h>
+#include <asm/cache.h>
+
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
+
+/*
+ * Return the maximum key which can be store into a
+ * radix tree with height HEIGHT.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+ return height_to_maxindex[height];
+}
+
+/*
+ * Extend a radix tree so it can store key @index.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index,
+ struct radix_tree_node *(*node_alloc)(void *), void *arg)
+{
+ struct radix_tree_node *node;
+ unsigned int height;
+
+ /* Figure out what the height should be. */
+ height = root->height + 1;
+ if (index > radix_tree_maxindex(height))
+ while (index > radix_tree_maxindex(height))
+ height++;
+
+ if (root->rnode == NULL) {
+ root->height = height;
+ goto out;
+ }
+
+ do {
+ if (!(node = node_alloc(arg)))
+ return -ENOMEM;
+
+ /* Increase the height. */
+ node->slots[0] = root->rnode;
+
+ node->count = 1;
+ root->rnode = node;
+ root->height++;
+ } while (height > root->height);
+ out:
+ return 0;
+}
+
+/**
+ * radix_tree_insert - insert into a radix tree
+ * @root: radix tree root
+ * @index: index key
+ * @item: item to insert
+ *
+ * Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
+ void *item, struct radix_tree_node *(*node_alloc)(void *), void *arg)
+{
+ struct radix_tree_node *node = NULL, *slot;
+ unsigned int height, shift;
+ int offset;
+ int error;
+
+ /* Make sure the tree is high enough. */
+ if (index > radix_tree_maxindex(root->height)) {
+ error = radix_tree_extend(root, index, node_alloc, arg);
+ if (error)
+ return error;
+ }
+
+ slot = root->rnode;
+ height = root->height;
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+ offset = 0; /* uninitialised var warning */
+ while (height > 0) {
+ if (slot == NULL) {
+ /* Have to add a child node. */
+ if (!(slot = node_alloc(arg)))
+ return -ENOMEM;
+ if (node) {
+
+ node->slots[offset] = slot;
+ node->count++;
+ } else
+ root->rnode = slot;
+ }
+
+ /* Go a level down */
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ node = slot;
+ slot = node->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ if (slot != NULL)
+ return -EEXIST;
+
+ if (node) {
+ node->count++;
+ node->slots[offset] = item;
+ } else {
+ root->rnode = item;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(radix_tree_insert);
+
+static inline void **__lookup_slot(struct radix_tree_root *root,
+ unsigned long index)
+{
+ unsigned int height, shift;
+ struct radix_tree_node **slot;
+
+ height = root->height;
+
+ if (index > radix_tree_maxindex(height))
+ return NULL;
+
+ if (height == 0 && root->rnode)
+ return (void **)&root->rnode;
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = &root->rnode;
+
+ while (height > 0) {
+ if (*slot == NULL)
+ return NULL;
+
+ slot = (struct radix_tree_node **)
+ ((*slot)->slots +
+ ((index >> shift) & RADIX_TREE_MAP_MASK));
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ return (void **)slot;
+}
+
+/**
+ * radix_tree_lookup_slot - lookup a slot in a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the slot corresponding to the position @index in the radix tree
+ * @root. This is useful for update-if-exists operations.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+ return __lookup_slot(root, index);
+}
+EXPORT_SYMBOL(radix_tree_lookup_slot);
+
+/**
+ * radix_tree_lookup - perform lookup operation on a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the item at the position @index in the radix tree @root.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+ void **slot;
+
+ slot = __lookup_slot(root, index);
+ return slot != NULL ? *slot : NULL;
+}
+EXPORT_SYMBOL(radix_tree_lookup);
+
+static unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+ unsigned int max_items, unsigned long *next_index)
+{
+ unsigned int nr_found = 0;
+ unsigned int shift, height;
+ struct radix_tree_node *slot;
+ unsigned long i;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ if (height == 0) {
+ if (root->rnode && index == 0)
+ results[nr_found++] = root->rnode;
+ goto out;
+ }
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ for ( ; height > 1; height--) {
+
+ for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
+ i < RADIX_TREE_MAP_SIZE; i++) {
+ if (slot->slots[i] != NULL)
+ break;
+ index &= ~((1UL << shift) - 1);
+ index += 1UL << shift;
+ if (index == 0)
+ goto out; /* 32-bit wraparound */
+ }
+ if (i == RADIX_TREE_MAP_SIZE)
+ goto out;
+
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = slot->slots[i];
+ }
+
+ /* Bottom level: grab some items */
+ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+ index++;
+ if (slot->slots[i]) {
+ results[nr_found++] = slot->slots[i];
+ if (nr_found == max_items)
+ goto out;
+ }
+ }
+ out:
+ *next_index = index;
+ return nr_found;
+}
+
+/**
+ * radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @max_items: place up to this many items at *results
+ *
+ * Performs an index-ascending scan of the tree for present items. Places
+ * them at *@results and returns the number of items which were placed at
+ * *@results.
+ *
+ * The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
+
+ while (ret < max_items) {
+ unsigned int nr_found;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup(root, results + ret, cur_index,
+ max_items - ret, &next_index);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup);
+
+/**
+ * radix_tree_shrink - shrink height of a radix tree to minimal
+ * @root radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root,
+ void (*node_free)(struct radix_tree_node *))
+{
+ /* try to shrink tree height */
+ while (root->height > 0 &&
+ root->rnode->count == 1 &&
+ root->rnode->slots[0]) {
+ struct radix_tree_node *to_free = root->rnode;
+
+ root->rnode = to_free->slots[0];
+ root->height--;
+ to_free->slots[0] = NULL;
+ to_free->count = 0;
+ node_free(to_free);
+ }
+}
+
+/**
+ * radix_tree_delete - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Remove the item at @index from the radix tree rooted at @root.
+ *
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index,
+ void(*node_free)(struct radix_tree_node *))
+{
+ struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
+ struct radix_tree_node *slot = NULL;
+ unsigned int height, shift;
+ int offset;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ goto out;
+
+ slot = root->rnode;
+ if (height == 0 && root->rnode) {
+ root->rnode = NULL;
+ goto out;
+ }
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ pathp->node = NULL;
+
+ do {
+ if (slot == NULL)
+ goto out;
+
+ pathp++;
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ pathp->offset = offset;
+ pathp->node = slot;
+ slot = slot->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ } while (height > 0);
+
+ if (slot == NULL)
+ goto out;
+
+ /* Now free the nodes we do not need anymore */
+ while (pathp->node) {
+ pathp->node->slots[pathp->offset] = NULL;
+ pathp->node->count--;
+
+ if (pathp->node->count) {
+ if (pathp->node == root->rnode)
+ radix_tree_shrink(root, node_free);
+ goto out;
+ }
+
+ /* Node with zero slots in use so free it */
+ node_free(pathp->node);
+
+ pathp--;
+ }
+ root->height = 0;
+ root->rnode = NULL;
+
+ out:
+ return slot;
+}
+EXPORT_SYMBOL(radix_tree_delete);
+
+static void
+radix_tree_node_destroy(struct radix_tree_node *node, unsigned int height,
+ void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *))
+{
+ int i;
+
+ if (height == 0)
+ return;
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[i]) {
+ if (height == 1) {
+ slot_free(node->slots[i]);
+ node->slots[i] = NULL;
+ continue;
+ }
+ radix_tree_node_destroy(node->slots[i], height-1,
+ slot_free, node_free);
+ node_free(node->slots[i]);
+ node->slots[i] = NULL;
+ }
+ }
+}
+
+void radix_tree_destroy(struct radix_tree_root *root,
+ void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *))
+{
+ if (root->rnode == NULL)
+ return;
+ if (root->height == 0)
+ slot_free(root->rnode);
+ else {
+ radix_tree_node_destroy(root->rnode, root->height,
+ slot_free, node_free);
+ node_free(root->rnode);
+ root->height = 0;
+ }
+ root->rnode = NULL;
+ /* caller must delete root if desired */
+}
+EXPORT_SYMBOL(radix_tree_destroy);
+
+static /*__init*/ unsigned long __maxindex(unsigned int height)
+{
+ unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+ unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+
+ if (tmp >= RADIX_TREE_INDEX_BITS)
+ index = ~0UL;
+ return index;
+}
+
+/*__init*/ void radix_tree_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+ height_to_maxindex[i] = __maxindex(i);
+}
diff --git a/xen/common/rbtree.c b/xen/common/rbtree.c
new file mode 100644
index 0000000000..67564c81b3
--- /dev/null
+++ b/xen/common/rbtree.c
@@ -0,0 +1,398 @@
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+ (C) 2002 David Woodhouse <dwmw2@infradead.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
+*/
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/rbtree.h>
+
+/*
+ * Left-rotate about @node: its right child moves up into @node's place
+ * and @node becomes that child's left child.  Parent/root links are
+ * patched accordingly; node colours are untouched.
+ */
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *right = node->rb_right;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_right = right->rb_left))
+		rb_set_parent(right->rb_left, node);
+	right->rb_left = node;
+
+	rb_set_parent(right, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_left)
+			parent->rb_left = right;
+		else
+			parent->rb_right = right;
+	}
+	else
+		root->rb_node = right;
+	rb_set_parent(node, right);
+}
+
+/* Mirror image of __rb_rotate_left: the left child moves up. */
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *left = node->rb_left;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_left = left->rb_right))
+		rb_set_parent(left->rb_right, node);
+	left->rb_right = node;
+
+	rb_set_parent(left, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_right)
+			parent->rb_right = left;
+		else
+			parent->rb_left = left;
+	}
+	else
+		root->rb_node = left;
+	rb_set_parent(node, left);
+}
+
+/*
+ * Rebalance after a new red node has been linked in with rb_link_node().
+ * Walk upward while the parent is red: if the uncle is also red, just
+ * recolour and continue from the grandparent; otherwise rotate (with a
+ * preliminary rotation for the "inner child" case) and stop.  The root
+ * is forced black at the end.
+ */
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *parent, *gparent;
+
+	while ((parent = rb_parent(node)) && rb_is_red(parent))
+	{
+		gparent = rb_parent(parent);
+
+		if (parent == gparent->rb_left)
+		{
+			{
+				register struct rb_node *uncle = gparent->rb_right;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			/* node is an inner (right) child: rotate it outward first */
+			if (parent->rb_right == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_left(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_right(gparent, root);
+		} else {
+			{
+				register struct rb_node *uncle = gparent->rb_left;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			/* mirror case: node is an inner (left) child */
+			if (parent->rb_left == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_right(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_left(gparent, root);
+		}
+	}
+
+	rb_set_black(root->rb_node);
+}
+EXPORT_SYMBOL(rb_insert_color);
+
+/*
+ * Restore red-black invariants after a black node was unlinked.
+ * @node is the child that took its place (may be NULL), @parent is
+ * its parent.  Standard sibling-based fixup: a red sibling is first
+ * rotated to expose a black one; a sibling with two black children
+ * is recoloured red and the deficit moves up; otherwise one or two
+ * rotations terminate the loop.  Each branch below is the left/right
+ * mirror of the other.
+ */
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+			     struct rb_root *root)
+{
+	struct rb_node *other;
+
+	while ((!node || rb_is_black(node)) && node != root->rb_node)
+	{
+		if (parent->rb_left == node)
+		{
+			other = parent->rb_right;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_left(parent, root);
+				other = parent->rb_right;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_right || rb_is_black(other->rb_right))
+				{
+					struct rb_node *o_left;
+					if ((o_left = other->rb_left))
+						rb_set_black(o_left);
+					rb_set_red(other);
+					__rb_rotate_right(other, root);
+					other = parent->rb_right;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				if (other->rb_right)
+					rb_set_black(other->rb_right);
+				__rb_rotate_left(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+		else
+		{
+			other = parent->rb_left;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_right(parent, root);
+				other = parent->rb_left;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_left || rb_is_black(other->rb_left))
+				{
+					register struct rb_node *o_right;
+					if ((o_right = other->rb_right))
+						rb_set_black(o_right);
+					rb_set_red(other);
+					__rb_rotate_left(other, root);
+					other = parent->rb_left;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				if (other->rb_left)
+					rb_set_black(other->rb_left);
+				__rb_rotate_right(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+	}
+	if (node)
+		rb_set_black(node);
+}
+
+/*
+ * Unlink @node from the tree.  With zero or one child the child (if
+ * any) is spliced into @node's place directly.  With two children the
+ * in-order successor is removed from its own position and then copied
+ * into @node's slot — pointers and colour included — so the fixup runs
+ * at the successor's old location.  If the node actually unlinked was
+ * black, __rb_erase_color() repairs the invariants.
+ */
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *child, *parent;
+	int color;
+
+	if (!node->rb_left)
+		child = node->rb_right;
+	else if (!node->rb_right)
+		child = node->rb_left;
+	else
+	{
+		struct rb_node *old = node, *left;
+
+		/* find the in-order successor: leftmost of the right subtree */
+		node = node->rb_right;
+		while ((left = node->rb_left) != NULL)
+			node = left;
+		child = node->rb_right;
+		parent = rb_parent(node);
+		color = rb_color(node);
+
+		if (child)
+			rb_set_parent(child, parent);
+		if (parent == old) {
+			parent->rb_right = child;
+			parent = node;
+		} else
+			parent->rb_left = child;
+
+		/* successor takes over old's links AND colour */
+		node->rb_parent_color = old->rb_parent_color;
+		node->rb_right = old->rb_right;
+		node->rb_left = old->rb_left;
+
+		if (rb_parent(old))
+		{
+			if (rb_parent(old)->rb_left == old)
+				rb_parent(old)->rb_left = node;
+			else
+				rb_parent(old)->rb_right = node;
+		} else
+			root->rb_node = node;
+
+		rb_set_parent(old->rb_left, node);
+		if (old->rb_right)
+			rb_set_parent(old->rb_right, node);
+		goto color;
+	}
+
+	parent = rb_parent(node);
+	color = rb_color(node);
+
+	if (child)
+		rb_set_parent(child, parent);
+	if (parent)
+	{
+		if (parent->rb_left == node)
+			parent->rb_left = child;
+		else
+			parent->rb_right = child;
+	}
+	else
+		root->rb_node = child;
+
+ color:
+	if (color == RB_BLACK)
+		__rb_erase_color(child, parent, root);
+}
+EXPORT_SYMBOL(rb_erase);
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(struct rb_root *root)
+{
+	struct rb_node *n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_left)
+		n = n->rb_left;
+	return n;
+}
+EXPORT_SYMBOL(rb_first);
+
+/* Return the last node (in sort order), or NULL for an empty tree. */
+struct rb_node *rb_last(struct rb_root *root)
+{
+	struct rb_node *n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_right)
+		n = n->rb_right;
+	return n;
+}
+EXPORT_SYMBOL(rb_last);
+
+/* Return the in-order successor of @node, or NULL at the end. */
+struct rb_node *rb_next(struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	/* a node whose parent pointer is itself is not in any tree
+	 * (presumably the RB_EMPTY_NODE convention — confirm in rbtree.h) */
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a right-hand child, go down and then left as far
+	   as we can. */
+	if (node->rb_right) {
+		node = node->rb_right;
+		while (node->rb_left)
+			node=node->rb_left;
+		return node;
+	}
+
+	/* No right-hand children.  Everything down and left is
+	   smaller than us, so any 'next' node must be in the general
+	   direction of our parent. Go up the tree; any time the
+	   ancestor is a right-hand child of its parent, keep going
+	   up. First time it's a left-hand child of its parent, said
+	   parent is our 'next' node. */
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
+EXPORT_SYMBOL(rb_next);
+
+/* Return the in-order predecessor of @node, or NULL at the start. */
+struct rb_node *rb_prev(struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a left-hand child, go down and then right as far
+	   as we can. */
+	if (node->rb_left) {
+		node = node->rb_left;
+		while (node->rb_right)
+			node=node->rb_right;
+		return node;
+	}
+
+	/* No left-hand children. Go up till we find an ancestor which
+	   is a right-hand child of its parent */
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
+
+	return parent;
+}
+EXPORT_SYMBOL(rb_prev);
+
+/*
+ * Substitute @new for @victim without rebalancing: the caller
+ * guarantees @new sorts identically to @victim.  Children, parent and
+ * colour are all taken over wholesale.
+ */
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *parent = rb_parent(victim);
+
+	/* Set the surrounding nodes to point to the replacement */
+	if (parent) {
+		if (victim == parent->rb_left)
+			parent->rb_left = new;
+		else
+			parent->rb_right = new;
+	} else {
+		root->rb_node = new;
+	}
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+
+	/* Copy the pointers/colour from the victim to the replacement */
+	*new = *victim;
+}
+EXPORT_SYMBOL(rb_replace_node);
diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c
index ac2aaab814..a17f0b2124 100644
--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -214,6 +214,12 @@ unsigned long _write_lock_irqsave(rwlock_t *lock)
return flags;
}
+/* Try to take @lock for writing; returns nonzero on success, 0 if busy. */
+int _write_trylock(rwlock_t *lock)
+{
+    check_lock(&lock->debug);
+    return _raw_write_trylock(&lock->raw);
+}
+
void _write_unlock(rwlock_t *lock)
{
_raw_write_unlock(&lock->raw);
@@ -236,3 +242,9 @@ int _rw_is_locked(rwlock_t *lock)
check_lock(&lock->debug);
return _raw_rw_is_locked(&lock->raw);
}
+
+/* Report whether @lock is currently held for writing (debug/assert aid). */
+int _rw_is_write_locked(rwlock_t *lock)
+{
+    check_lock(&lock->debug);
+    return _raw_rw_is_write_locked(&lock->raw);
+}
diff --git a/xen/common/tmem.c b/xen/common/tmem.c
new file mode 100644
index 0000000000..19d8bec05c
--- /dev/null
+++ b/xen/common/tmem.c
@@ -0,0 +1,2109 @@
+/******************************************************************************
+ * tmem.c
+ *
+ * Transcendent memory
+ *
+ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
+ */
+
+/* TODO list: 090129
+ - improve on reclamation policy
+ - use different tlsf pools for each client (maybe each pool)
+ - implement page accounting and minimal QoS limits
+ - test shared access more completely (need pv cluster fs)
+ - add feedback-driven compression (not for persistent pools though!)
+ - add data-structure total bytes overhead stats
+ */
+
+#ifdef __XEN__
+#include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
+#endif
+
+#include <xen/tmem.h>
+#include <xen/rbtree.h>
+#include <xen/radix-tree.h>
+#include <xen/list.h>
+
+#define EXPORT /* indicates code other modules are dependent upon */
+#define FORWARD
+
+/************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
+
+#define CLI_ID_NULL TMH_CLI_ID_NULL
+#define cli_id_str tmh_cli_id_str
+#define client_str tmh_client_str
+
+/************ DEBUG and STATISTICS (+ some compression testing) *******/
+
+#ifndef NDEBUG
+#define SENTINELS
+#define NOINLINE noinline
+#else
+#define NOINLINE
+#endif
+
+#ifdef SENTINELS
+#define DECL_SENTINEL unsigned long sentinel;
+#define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
+#define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
+#define ASSERT_SENTINEL(_x,_y) \
+ ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
+#ifdef __i386__
+#define POOL_SENTINEL 0x87658765
+#define OBJ_SENTINEL 0x12345678
+#define OBJNODE_SENTINEL 0xfedcba09
+#define PGD_SENTINEL 0x43214321
+#else
+#define POOL_SENTINEL 0x8765876587658765
+#define OBJ_SENTINEL 0x1234567812345678
+#define OBJNODE_SENTINEL 0xfedcba0987654321
+#define PGD_SENTINEL 0x4321432143214321
+#endif
+#else
+#define DECL_SENTINEL
+#define SET_SENTINEL(_x,_y) do { } while (0)
+#define ASSERT_SENTINEL(_x,_y) do { } while (0)
+#define INVERT_SENTINEL(_x,_y) do { } while (0)
+#endif
+
+/* global statistics (none need to be locked) */
+static unsigned long total_tmem_ops = 0;
+static unsigned long errored_tmem_ops = 0;
+static unsigned long total_flush_pool = 0;
+static unsigned long alloc_failed = 0, alloc_page_failed = 0;
+static unsigned long evicted_pgs = 0, evict_attempts = 0;
+static unsigned long relinq_pgs = 0, relinq_attempts = 0;
+static unsigned long max_evicts_per_relinq = 0;
+static unsigned long low_on_memory = 0;
+static int global_obj_count_max = 0;
+static int global_pgp_count_max = 0;
+static int global_page_count_max = 0;
+static int global_rtree_node_count_max = 0;
+static long global_eph_count_max = 0;
+static unsigned long failed_copies;
+
+DECL_CYC_COUNTER(succ_get);
+DECL_CYC_COUNTER(succ_put);
+DECL_CYC_COUNTER(non_succ_get);
+DECL_CYC_COUNTER(non_succ_put);
+DECL_CYC_COUNTER(flush);
+DECL_CYC_COUNTER(flush_obj);
+#ifdef COMPARE_COPY_PAGE_SSE2
+EXTERN_CYC_COUNTER(pg_copy1);
+EXTERN_CYC_COUNTER(pg_copy2);
+EXTERN_CYC_COUNTER(pg_copy3);
+EXTERN_CYC_COUNTER(pg_copy4);
+#else
+EXTERN_CYC_COUNTER(pg_copy);
+#endif
+DECL_CYC_COUNTER(compress);
+DECL_CYC_COUNTER(decompress);
+
+/************ CORE DATA STRUCTURES ************************************/
+
+#define MAX_POOLS_PER_DOMAIN 16
+#define MAX_GLOBAL_SHARED_POOLS 16
+
+struct tm_pool;
+struct client {
+ struct list_head client_list;
+ struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
+ tmh_client_t *tmh;
+ struct list_head ephemeral_page_list;
+ long eph_count, eph_count_max;
+ cli_id_t cli_id;
+ uint32_t weight;
+ uint32_t cap;
+ bool_t compress;
+ bool_t frozen;
+ unsigned long compress_poor, compress_nomem;
+ unsigned long compressed_pages;
+ uint64_t compressed_sum_size;
+};
+typedef struct client client_t;
+
+struct share_list {
+ struct list_head share_list;
+ client_t *client;
+};
+typedef struct share_list sharelist_t;
+
+#define OBJ_HASH_BUCKETS 256 /* must be power of two */
+#define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
+#define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
+
+struct tm_pool {
+ bool_t shared;
+ bool_t persistent;
+ struct list_head pool_list; /* FIXME do we need this anymore? */
+ client_t *client;
+ uint64_t uuid[2]; /* 0 for private, non-zero for shared */
+ uint32_t pool_id;
+ rwlock_t pool_rwlock;
+ struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
+ struct list_head share_list; /* valid if shared */
+ DECL_SENTINEL
+ int shared_count; /* valid if shared */
+ atomic_t pgp_count;
+ int pgp_count_max;
+ long obj_count; /* atomicity depends on pool_rwlock held for write */
+ long obj_count_max;
+ unsigned long objnode_count, objnode_count_max;
+ uint64_t sum_life_cycles;
+ uint64_t sum_evicted_cycles;
+ unsigned long puts, good_puts, no_mem_puts;
+ unsigned long dup_puts_flushed, dup_puts_replaced;
+ unsigned long gets, found_gets;
+ unsigned long flushs, flushs_found;
+ unsigned long flush_objs, flush_objs_found;
+};
+typedef struct tm_pool pool_t;
+
+#define is_persistent(_p) (_p->persistent)
+#define is_ephemeral(_p) (!(_p->persistent))
+#define is_shared(_p) (_p->shared)
+#define is_private(_p) (!(_p->shared))
+
+struct tmem_object_root {
+ DECL_SENTINEL
+ uint64_t oid;
+ struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
+ unsigned long objnode_count; /* atomicity depends on obj_spinlock */
+ long pgp_count; /* atomicity depends on obj_spinlock */
+ struct radix_tree_root tree_root; /* tree of pages within object */
+ pool_t *pool;
+ cli_id_t last_client;
+ spinlock_t obj_spinlock;
+ bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
+};
+typedef struct tmem_object_root obj_t;
+
+typedef struct radix_tree_node rtn_t;
+struct tmem_object_node {
+ obj_t *obj;
+ DECL_SENTINEL
+ rtn_t rtn;
+};
+typedef struct tmem_object_node objnode_t;
+
+struct tmem_page_descriptor {
+ struct list_head global_eph_pages;
+ struct list_head client_eph_pages;
+ obj_t *obj;
+ uint32_t index;
+ size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */
+ union {
+ pfp_t *pfp; /* page frame pointer */
+ char *cdata; /* compressed data */
+ };
+ uint64_t timestamp;
+ DECL_SENTINEL
+};
+typedef struct tmem_page_descriptor pgp_t;
+
+static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
+
+static LIST_HEAD(global_client_list);
+static LIST_HEAD(global_pool_list);
+
+static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
+static atomic_t client_weight_total = ATOMIC_INIT(0);
+static int tmem_initialized = 0;
+
+/************ CONCURRENCY ***********************************************/
+
+EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
+EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
+static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
+
+#define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
+#define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
+#define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
+#define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
+#define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
+#define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
+#define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
+#define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
+
+#define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
+#define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
+
+/* global counters (should use long_atomic_t access) */
+static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
+static atomic_t global_obj_count = ATOMIC_INIT(0);
+static atomic_t global_pgp_count = ATOMIC_INIT(0);
+static atomic_t global_page_count = ATOMIC_INIT(0);
+static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
+
+#define atomic_inc_and_max(_c) do { \
+ atomic_inc(&_c); \
+ if ( _atomic_read(_c) > _c##_max ) \
+ _c##_max = _atomic_read(_c); \
+} while (0)
+
+#define atomic_dec_and_assert(_c) do { \
+ atomic_dec(&_c); \
+ ASSERT(_atomic_read(_c) >= 0); \
+} while (0)
+
+
+/************ MEMORY ALLOCATION INTERFACE *****************************/
+
+/* Typed allocation helper: size and alignment taken from _type. */
+#define tmem_malloc(_type,_pool) \
+    _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
+
+/* Raw byte allocation with no alignment requirement. */
+#define tmem_malloc_bytes(_size,_pool) \
+    _tmem_malloc(_size, 1, _pool)
+
+/*
+ * Allocate a sub-page-sized object.  Allocations for a persistent pool
+ * are charged against that pool's own memory; everything else comes
+ * from the shared tmem heap.  Returns NULL on failure (counted in
+ * alloc_failed).
+ */
+static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
+{
+    void *v;
+
+    if ( (pool != NULL) && is_persistent(pool) )
+        v = tmh_alloc_subpage_thispool(pool,size,align);
+    else
+        v = tmh_alloc_subpage(pool, size, align);
+    if ( v == NULL )
+        alloc_failed++;
+    return v;
+}
+
+/* Release memory from _tmem_malloc; pool choice must mirror the alloc. */
+static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
+{
+    if ( pool == NULL || !is_persistent(pool) )
+        tmh_free_subpage(p,size);
+    else
+        tmh_free_subpage_thispool(pool,p,size);
+}
+
+/*
+ * Allocate one whole page frame, charged to the pool when persistent,
+ * otherwise drawn from the general heap.  Maintains global_page_count
+ * and its high-water mark; failures are counted in alloc_page_failed.
+ */
+static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
+{
+    pfp_t *pfp = NULL;
+
+    if ( pool != NULL && is_persistent(pool) )
+        pfp = tmh_alloc_page_thispool(pool);
+    else
+        pfp = tmh_alloc_page(pool,0);
+    if ( pfp == NULL )
+        alloc_page_failed++;
+    else
+        atomic_inc_and_max(global_page_count);
+    return pfp;
+}
+
+/* Return a page from tmem_page_alloc; pool choice must mirror the alloc. */
+static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
+{
+    ASSERT(pfp);
+    if ( pool == NULL || !is_persistent(pool) )
+        tmh_free_page(pfp);
+    else
+        tmh_free_page_thispool(pool,pfp);
+    atomic_dec_and_assert(global_page_count);
+}
+
+/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
+
+/* allocate a pgp_t and associate it with an object */
+/*
+ * The descriptor starts with no data page attached (pfp == NULL,
+ * size/index set to -1 as "unset" markers); global and per-pool pgp
+ * counters are bumped along with their maxima.  Returns NULL on OOM.
+ */
+static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
+{
+    pgp_t *pgp;
+    pool_t *pool;
+
+    ASSERT(obj != NULL);
+    ASSERT(obj->pool != NULL);
+    pool = obj->pool;
+    if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
+        return NULL;
+    pgp->obj = obj;
+    INIT_LIST_HEAD(&pgp->global_eph_pages);
+    INIT_LIST_HEAD(&pgp->client_eph_pages);
+    pgp->pfp = NULL;
+    pgp->size = -1;
+    pgp->index = -1;
+    pgp->timestamp = get_cycles();
+    SET_SENTINEL(pgp,PGD);
+    atomic_inc_and_max(global_pgp_count);
+    atomic_inc_and_max(pool->pgp_count);
+    return pgp;
+}
+
+/* Find the page descriptor at @index in @obj's radix tree (NULL if absent);
+ * caller must hold obj->obj_spinlock. */
+static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
+{
+    ASSERT(obj != NULL);
+    ASSERT_SPINLOCK(&obj->obj_spinlock);
+    ASSERT_SENTINEL(obj,OBJ);
+    ASSERT(obj->pool != NULL);
+    ASSERT_SENTINEL(obj->pool,POOL);
+    return radix_tree_lookup(&obj->tree_root, index);
+}
+
+/*
+ * Release the data attached to a descriptor: size == 0 means a whole
+ * page frame, nonzero means compressed data of that many bytes (the
+ * client's compression accounting is rolled back for the latter).
+ * NOTE(review): the page is freed via pgp->obj->pool while the
+ * compressed buffer uses the @pool argument — callers appear to always
+ * pass pgp->obj->pool, but confirm the asymmetry is intentional.
+ */
+static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
+{
+    if ( pgp->pfp == NULL )
+        return;
+    if ( !pgp->size )
+        tmem_page_free(pgp->obj->pool,pgp->pfp);
+    else
+    {
+        tmem_free(pgp->cdata,pgp->size,pool);
+        if ( pool != NULL )
+        {
+            pool->client->compressed_pages--;
+            pool->client->compressed_sum_size -= pgp->size;
+        }
+    }
+    pgp->pfp = NULL;
+    pgp->size = -1;
+}
+
+/*
+ * Free a descriptor already removed from all eviction lists.  When
+ * @from_delete, it must also already be out of the object's radix tree.
+ * The sentinel is inverted and fields poisoned to catch use-after-free.
+ */
+static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
+{
+    pool_t *pool = NULL;
+
+    ASSERT_SENTINEL(pgp,PGD);
+    ASSERT(pgp->obj != NULL);
+    ASSERT_SENTINEL(pgp->obj,OBJ);
+    ASSERT_SENTINEL(pgp->obj->pool,POOL);
+    ASSERT(list_empty(&pgp->global_eph_pages));
+    ASSERT(list_empty(&pgp->client_eph_pages));
+    if ( from_delete )
+        ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
+    ASSERT(pgp->obj->pool != NULL);
+    pool = pgp->obj->pool;
+    pgp_free_data(pgp, pool);
+    INVERT_SENTINEL(pgp,PGD);
+    pgp->obj = NULL;
+    pgp->index = -1;
+    pgp->size = -1;
+    atomic_dec_and_assert(global_pgp_count);
+    atomic_dec_and_assert(pool->pgp_count);
+    tmem_free(pgp,sizeof(pgp_t),pool);
+}
+
+/* remove the page from appropriate lists but not from parent object */
+/*
+ * Only ephemeral pages sit on the global/per-client eviction lists;
+ * counters are decremented only if the entry was actually linked.
+ * @no_eph_lock lets a caller that already holds eph_lists_spinlock
+ * skip retaking it.
+ */
+static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
+{
+    ASSERT(pgp != NULL);
+    ASSERT(pgp->obj != NULL);
+    ASSERT(pgp->obj->pool != NULL);
+    ASSERT(pgp->obj->pool->client != NULL);
+    if ( is_ephemeral(pgp->obj->pool) )
+    {
+        if ( !no_eph_lock )
+            tmem_spin_lock(&eph_lists_spinlock);
+        if ( !list_empty(&pgp->client_eph_pages) )
+            pgp->obj->pool->client->eph_count--;
+        ASSERT(pgp->obj->pool->client->eph_count >= 0);
+        list_del_init(&pgp->client_eph_pages);
+        if ( !list_empty(&pgp->global_eph_pages) )
+            global_eph_count--;
+        ASSERT(global_eph_count >= 0);
+        list_del_init(&pgp->global_eph_pages);
+        if ( !no_eph_lock )
+            tmem_spin_unlock(&eph_lists_spinlock);
+    }
+}
+
+/* remove page from lists (but not from parent object) and free it */
+static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
+{
+    uint64_t life;
+
+    ASSERT(pgp != NULL);
+    ASSERT(pgp->obj != NULL);
+    ASSERT(pgp->obj->pool != NULL);
+    /* account the page's lifetime before it disappears */
+    life = get_cycles() - pgp->timestamp;
+    pgp->obj->pool->sum_life_cycles += life;
+    pgp_delist(pgp, no_eph_lock);
+    pgp_free(pgp,1);
+}
+
+/* called only indirectly by radix_tree_destroy */
+/* Slot-destructor: delist, drop the object's pgp_count, and free. */
+static NOINLINE void pgp_destroy(void *v)
+{
+    pgp_t *pgp = (pgp_t *)v;
+
+    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
+    pgp_delist(pgp,0);
+    ASSERT(pgp->obj != NULL);
+    pgp->obj->pgp_count--;
+    ASSERT(pgp->obj->pgp_count >= 0);
+    pgp_free(pgp,0);
+}
+
+FORWARD static rtn_t *rtn_alloc(void *arg);
+FORWARD static void rtn_free(rtn_t *rtn);
+
+/*
+ * Insert @pgp at @index in the object's radix tree (interior nodes come
+ * from rtn_alloc, with @obj as its cookie).  Returns radix_tree_insert's
+ * status: 0 on success, at which point the object's pgp_count is bumped.
+ * Caller must hold obj->obj_spinlock.
+ */
+static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
+{
+    int ret;
+
+    ASSERT_SPINLOCK(&obj->obj_spinlock);
+    ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
+    if ( !ret )
+        obj->pgp_count++;
+    return ret;
+}
+
+/* Remove and return the descriptor at @index (NULL if absent); caller
+ * must hold obj->obj_spinlock.  Emptied tree nodes go to rtn_free. */
+static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
+{
+    pgp_t *pgp;
+
+    ASSERT(obj != NULL);
+    ASSERT_SPINLOCK(&obj->obj_spinlock);
+    ASSERT_SENTINEL(obj,OBJ);
+    ASSERT(obj->pool != NULL);
+    ASSERT_SENTINEL(obj->pool,POOL);
+    pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
+    if ( pgp != NULL )
+        obj->pgp_count--;
+    ASSERT(obj->pgp_count >= 0);
+
+    return pgp;
+}
+
+/************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
+
+/* called only indirectly from radix_tree_insert */
+/*
+ * Allocate a radix-tree interior node, embedded in an objnode_t so the
+ * owning object can be recovered via container_of in rtn_free.  Tracks
+ * per-pool, per-object and global node counts.  @arg is the obj_t
+ * passed through by pgp_add_to_obj.
+ */
+static NOINLINE rtn_t *rtn_alloc(void *arg)
+{
+    objnode_t *objnode;
+    obj_t *obj = (obj_t *)arg;
+
+    ASSERT_SENTINEL(obj,OBJ);
+    ASSERT(obj->pool != NULL);
+    ASSERT_SENTINEL(obj->pool,POOL);
+    objnode = tmem_malloc(objnode_t,obj->pool);
+    if (objnode == NULL)
+        return NULL;
+    objnode->obj = obj;
+    SET_SENTINEL(objnode,OBJNODE);
+    memset(&objnode->rtn, 0, sizeof(rtn_t));
+    if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
+        obj->pool->objnode_count_max = obj->pool->objnode_count;
+    atomic_inc_and_max(global_rtree_node_count);
+    obj->objnode_count++;
+    return &objnode->rtn;
+}
+
+/* called only indirectly from radix_tree_delete/destroy */
+/* Free an (empty) interior node, reversing rtn_alloc's accounting. */
+static void rtn_free(rtn_t *rtn)
+{
+    pool_t *pool;
+    objnode_t *objnode;
+    int i;
+
+    ASSERT(rtn != NULL);
+    /* the radix-tree code must have cleared every slot already */
+    for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+        ASSERT(rtn->slots[i] == NULL);
+    objnode = container_of(rtn,objnode_t,rtn);
+    ASSERT_SENTINEL(objnode,OBJNODE);
+    INVERT_SENTINEL(objnode,OBJNODE);
+    ASSERT(objnode->obj != NULL);
+    ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
+    ASSERT_SENTINEL(objnode->obj,OBJ);
+    pool = objnode->obj->pool;
+    ASSERT(pool != NULL);
+    ASSERT_SENTINEL(pool,POOL);
+    pool->objnode_count--;
+    objnode->obj->objnode_count--;
+    objnode->obj = NULL;
+    tmem_free(objnode,sizeof(objnode_t),pool);
+    atomic_dec_and_assert(global_rtree_node_count);
+}
+
+/************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
+
+/* searches for object==oid in pool, returns locked object if found */
+/*
+ * Walks the rb-tree bucket for @oid under the pool's read lock.  On a
+ * hit the object spinlock is trylocked; if that fails the read lock is
+ * dropped and the whole search restarts, avoiding a lock-order deadlock
+ * with writers.  Under tmh_lock_all the big lock substitutes for the
+ * spinlock and no_evict pseudo-locks the object against eviction.
+ */
+static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
+{
+    struct rb_node *node;
+    obj_t *obj;
+
+restart_find:
+    tmem_read_lock(&pool->pool_rwlock);
+    node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
+    while ( node )
+    {
+        obj = container_of(node, obj_t, rb_tree_node);
+        if ( obj->oid == oid )
+        {
+            if ( tmh_lock_all )
+                obj->no_evict = 1;
+            else
+            {
+                if ( !tmem_spin_trylock(&obj->obj_spinlock) )
+                {
+                    tmem_read_unlock(&pool->pool_rwlock);
+                    goto restart_find;
+                }
+                tmem_read_unlock(&pool->pool_rwlock);
+            }
+            return obj;
+        }
+        else if ( oid < obj->oid )
+            node = node->rb_left;
+        else
+            node = node->rb_right;
+    }
+    tmem_read_unlock(&pool->pool_rwlock);
+    return NULL;
+}
+
+/* free an object that has no more pgps in it */
+/*
+ * Tears down any remaining (leafless) radix-tree structure, poisons the
+ * object, and unlinks it from its rb-tree bucket unless @no_rebalance
+ * (set only when the whole bucket is being destroyed anyway).  Caller
+ * holds both obj_spinlock and the pool write lock.
+ */
+static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
+{
+    pool_t *pool;
+    uint64_t old_oid;
+
+    ASSERT_SPINLOCK(&obj->obj_spinlock);
+    ASSERT(obj != NULL);
+    ASSERT_SENTINEL(obj,OBJ);
+    ASSERT(obj->pgp_count == 0);
+    pool = obj->pool;
+    ASSERT(pool != NULL);
+    ASSERT_WRITELOCK(&pool->pool_rwlock);
+    if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
+        radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
+    ASSERT((long)obj->objnode_count == 0);
+    ASSERT(obj->tree_root.rnode == NULL);
+    pool->obj_count--;
+    ASSERT(pool->obj_count >= 0);
+    INVERT_SENTINEL(obj,OBJ);
+    obj->pool = NULL;
+    old_oid = obj->oid;
+    obj->oid = -1;
+    obj->last_client = CLI_ID_NULL;
+    atomic_dec_and_assert(global_obj_count);
+    /* use no_rebalance only if all objects are being destroyed anyway */
+    if ( !no_rebalance )
+        rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
+    tmem_free(obj,sizeof(obj_t),pool);
+}
+
+/*
+ * Post-order destruction of an rb subtree of objects.  The explicit
+ * radix_tree_destroy here leaves rnode NULL, so obj_free's own guarded
+ * call is a no-op; no_rebalance=1 since the whole bucket goes away.
+ */
+static NOINLINE void obj_rb_destroy_node(struct rb_node *node)
+{
+    obj_t * obj;
+
+    if ( node == NULL )
+        return;
+    obj_rb_destroy_node(node->rb_left);
+    obj_rb_destroy_node(node->rb_right);
+    obj = container_of(node, obj_t, rb_tree_node);
+    tmem_spin_lock(&obj->obj_spinlock);
+    ASSERT(obj->no_evict == 0);
+    radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
+    obj_free(obj,1);
+}
+
+/*
+ * Standard rb-tree keyed insertion by oid.  Returns 1 on success,
+ * 0 if an object with the same oid already exists (tree unchanged).
+ * Caller must hold the pool write lock.
+ */
+static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
+{
+    struct rb_node **new, *parent = NULL;
+    obj_t *this;
+
+    new = &(root->rb_node);
+    while ( *new )
+    {
+        this = container_of(*new, obj_t, rb_tree_node);
+        parent = *new;
+        if ( obj->oid < this->oid )
+            new = &((*new)->rb_left);
+        else if ( obj->oid > this->oid )
+            new = &((*new)->rb_right);
+        else
+            return 0;
+    }
+    rb_link_node(&obj->rb_tree_node, parent, new);
+    rb_insert_color(&obj->rb_tree_node, root);
+    return 1;
+}
+
+/*
+ * allocate, initialize, and insert an tmem_object_root
+ * (should be called only if find failed)
+ */
+/* Returns the new object locked (obj_spinlock held, no_evict set) so
+ * the caller can populate it, or NULL on OOM. */
+static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
+{
+    obj_t *obj;
+
+    ASSERT(pool != NULL);
+    ASSERT_WRITELOCK(&pool->pool_rwlock);
+    if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
+        return NULL;
+    pool->obj_count++;
+    if (pool->obj_count > pool->obj_count_max)
+        pool->obj_count_max = pool->obj_count;
+    atomic_inc_and_max(global_obj_count);
+    INIT_RADIX_TREE(&obj->tree_root,0);
+    spin_lock_init(&obj->obj_spinlock);
+    obj->pool = pool;
+    obj->oid = oid;
+    obj->objnode_count = 0;
+    obj->pgp_count = 0;
+    obj->last_client = CLI_ID_NULL;
+    SET_SENTINEL(obj,OBJ);
+    tmem_spin_lock(&obj->obj_spinlock);
+    obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
+    obj->no_evict = 1;
+    ASSERT_SPINLOCK(&obj->obj_spinlock);
+    return obj;
+}
+
+/* free an object after destroying any pgps in it */
+static NOINLINE void obj_destroy(obj_t *obj)
+{
+    ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
+    radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
+    obj_free(obj,0);
+}
+
+/* destroy all objects in a pool */
+static NOINLINE void obj_rb_destroy_all(pool_t *pool)
+{
+    int i;
+
+    tmem_write_lock(&pool->pool_rwlock);
+    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
+        obj_rb_destroy_node(pool->obj_rb_root[i].rb_node);
+    tmem_write_unlock(&pool->pool_rwlock);
+}
+
+/* destroys all objects in a pool that have last_client set to cli_id */
+/* rb_next() is taken before a possible obj_destroy() so iteration
+ * survives removal of the current node. */
+static void obj_free_selective(pool_t *pool, cli_id_t cli_id)
+{
+    struct rb_node *node;
+    obj_t *obj;
+    int i;
+
+    tmem_write_lock(&pool->pool_rwlock);
+    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
+    {
+        node = rb_first(&pool->obj_rb_root[i]);
+        while ( node != NULL )
+        {
+            obj = container_of(node, obj_t, rb_tree_node);
+            tmem_spin_lock(&obj->obj_spinlock);
+            node = rb_next(node);
+            if ( obj->last_client == cli_id )
+                obj_destroy(obj);
+            else
+                tmem_spin_unlock(&obj->obj_spinlock);
+        }
+    }
+    tmem_write_unlock(&pool->pool_rwlock);
+}
+
+
+/************ POOL MANIPULATION ROUTINES ******************************/
+
+/*
+ * Allocate a pool and zero its buckets, locks and statistics counters.
+ * NOTE(review): shared/persistent flags, client, pool_id, uuid and the
+ * share_list are NOT initialized here — presumably the caller fills
+ * them in; confirm against do_tmem_new_pool (not visible here).
+ */
+static pool_t * pool_alloc(void)
+{
+    pool_t *pool;
+    int i;
+
+    if ( (pool = tmem_malloc(pool_t,NULL)) == NULL )
+        return NULL;
+    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
+        pool->obj_rb_root[i] = RB_ROOT;
+    INIT_LIST_HEAD(&pool->pool_list);
+    rwlock_init(&pool->pool_rwlock);
+    pool->pgp_count_max = pool->obj_count_max = 0;
+    pool->objnode_count = pool->objnode_count_max = 0;
+    atomic_set(&pool->pgp_count,0);
+    pool->obj_count = 0;
+    pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
+    pool->dup_puts_replaced = pool->no_mem_puts = 0;
+    pool->found_gets = pool->gets = 0;
+    pool->flushs_found = pool->flushs = 0;
+    pool->flush_objs_found = pool->flush_objs = 0;
+    SET_SENTINEL(pool,POOL);
+    return pool;
+}
+
+/* Poison and release a pool; assumes its objects are already gone. */
+static NOINLINE void pool_free(pool_t *pool)
+{
+    ASSERT_SENTINEL(pool,POOL);
+    INVERT_SENTINEL(pool,POOL);
+    pool->client = NULL;
+    list_del(&pool->pool_list);
+    tmem_free(pool,sizeof(pool_t),NULL);
+}
+
+/* register new_client as a user of this shared pool and return new
+   total number of registered users */
+/* Returns -1 on allocation failure. */
+static int shared_pool_join(pool_t *pool, client_t *new_client)
+{
+    sharelist_t *sl;
+
+    ASSERT(is_shared(pool));
+    if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
+        return -1;
+    sl->client = new_client;
+    list_add_tail(&sl->share_list, &pool->share_list);
+    printk("adding new %s %d to shared pool owned by %s %d\n",
+        client_str, new_client->cli_id, client_str, pool->client->cli_id);
+    return ++pool->shared_count;
+}
+
+/* reassign "ownership" of the pool to another client that shares this pool */
+/* The first remaining sharer becomes the owner; pool_id is rewritten to
+ * the slot that client already uses for this pool. */
+static NOINLINE void shared_pool_reassign(pool_t *pool)
+{
+    sharelist_t *sl;
+    int poolid;
+    client_t *old_client = pool->client, *new_client;
+
+    ASSERT(is_shared(pool));
+    if ( list_empty(&pool->share_list) )
+    {
+        ASSERT(pool->shared_count == 0);
+        return;
+    }
+    old_client->pools[pool->pool_id] = NULL;
+    sl = list_entry(pool->share_list.next, sharelist_t, share_list);
+    ASSERT(sl->client != old_client);
+    pool->client = new_client = sl->client;
+    for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
+        if (new_client->pools[poolid] == pool)
+            break;
+    ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
+    printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
+        cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
+    pool->pool_id = poolid;
+}
+
+/* destroy all objects with last_client same as passed cli_id,
+   remove pool's cli_id from list of sharers of this pool */
+/*
+ * Returns the number of sharers still attached (>0 means the pool
+ * lives on, possibly under a new owner), 0 when this was the final
+ * sharer (its global_shared_pools slot is also cleared), or -1 if
+ * @cli_id was not found on the share list.
+ */
+static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
+{
+    sharelist_t *sl;
+    int s_poolid;
+
+    ASSERT(is_shared(pool));
+    ASSERT(pool->client != NULL);
+
+    obj_free_selective(pool,cli_id);
+    list_for_each_entry(sl,&pool->share_list, share_list)
+    {
+        if (sl->client->cli_id != cli_id)
+            continue;
+        list_del(&sl->share_list);
+        tmem_free(sl,sizeof(sharelist_t),pool);
+        --pool->shared_count;
+        if (pool->client->cli_id == cli_id)
+            shared_pool_reassign(pool);
+        if (pool->shared_count)
+            return pool->shared_count;
+        for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
+            if ( (global_shared_pools[s_poolid]) == pool )
+            {
+                global_shared_pools[s_poolid] = NULL;
+                break;
+            }
+        return 0;
+    }
+    printk("tmem: no match unsharing pool, %s=%d\n",
+        cli_id_str,pool->client->cli_id);
+    return -1;
+}
+
/* flush all data (owned by cli_id) from a pool and, optionally, free it.
 * For a shared pool with remaining sharers, only this client's membership
 * (and its objects) are removed; the pool itself survives. */
static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
{
    ASSERT(pool != NULL);
    if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
    {
        /* other sharers remain; pool stays alive under its (new) owner */
        printk("tmem: unshared shared pool %d from %s=%d\n",
           pool->pool_id, cli_id_str,pool->client->cli_id);
        return;
    }
    printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
        is_persistent(pool) ? "persistent" : "ephemeral" ,
        is_shared(pool) ? "shared" : "private");
    printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
    /* drop every object (and thus every page) in the pool */
    obj_rb_destroy_all(pool);
    if ( destroy )
    {
        pool->client->pools[pool->pool_id] = NULL;
        pool_free(pool);
    }
}
+
+/************ CLIENT MANIPULATION OPERATIONS **************************/
+
+static client_t *client_create(void)
+{
+ client_t *client = tmem_malloc(client_t,NULL);
+ cli_id_t cli_id = tmh_get_cli_id_from_current();
+
+ printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
+ if ( client == NULL )
+ {
+ printk("failed... out of memory\n");
+ return NULL;
+ }
+ memset(client,0,sizeof(client_t));
+ if ( (client->tmh = tmh_client_init()) == NULL )
+ {
+ printk("failed... can't allocate host-dependent part of client\n");
+ if ( client )
+ tmem_free(client,sizeof(client_t),NULL);
+ return NULL;
+ }
+ tmh_set_current_client(client);
+ client->cli_id = cli_id;
+#ifdef __i386__
+ client->compress = 0;
+#else
+ client->compress = tmh_compression_enabled();
+#endif
+ list_add_tail(&client->client_list, &global_client_list);
+ INIT_LIST_HEAD(&client->ephemeral_page_list);
+ client->eph_count = client->eph_count_max = 0;
+ printk("ok\n");
+ return client;
+}
+
/* Unregister a client and release its memory.  Assumes the client's pools
 * have already been flushed/freed (see client_flush). */
static void client_free(client_t *client)
{
    list_del(&client->client_list);
    tmh_client_destroy(client->tmh);
    /* NOTE(review): unconditionally clears the "current client" — presumably
     * only ever called for the current client; confirm against callers */
    tmh_set_current_client(NULL);
    tmem_free(client,sizeof(client_t),NULL);
}
+
+/* flush all data from a client and, optionally, free it */
+static void client_flush(client_t *client, bool_t destroy)
+{
+ int i;
+ pool_t *pool;
+
+ for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
+ {
+ if ( (pool = client->pools[i]) == NULL )
+ continue;
+ pool_flush(pool,client->cli_id,destroy);
+ if ( destroy )
+ client->pools[i] = NULL;
+ }
+ if ( destroy )
+ client_free(client);
+}
+
+static bool_t client_over_quota(client_t *client)
+{
+ int total = _atomic_read(client_weight_total);
+
+ ASSERT(client != NULL);
+ if ( (total == 0) || (client->weight == 0) ||
+ (client->eph_count == 0) )
+ return 0;
+ return ( ((global_eph_count*100L) / client->eph_count ) >
+ ((total*100L) / client->weight) );
+}
+
/************ MEMORY REVOCATION ROUTINES *******************************/

/* Evict one ephemeral page to reclaim memory.  Pages are taken from the
 * current client's ephemeral (LRU) list if that client is over quota,
 * otherwise from the global ephemeral list.  Returns 1 if a page was
 * evicted, 0 if nothing evictable was found.
 * Locking: holds eph_lists_spinlock throughout; per-candidate, requires
 * the object spinlock (trylock, to avoid deadlock) and — when the object
 * would become empty and must be freed — the pool rwlock for writing. */
static int tmem_evict(void)
{
    client_t *client = tmh_client_from_current();
    pgp_t *pgp = NULL, *pgp_del;
    obj_t *obj;
    pool_t *pool;
    int ret = 0;
    bool_t hold_pool_rwlock = 0;

    evict_attempts++;
    tmem_spin_lock(&eph_lists_spinlock);
    if ( (client != NULL) && client_over_quota(client) &&
         !list_empty(&client->ephemeral_page_list) )
    {
        /* over-quota client: evict from its own LRU list first */
        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
        {
            obj = pgp->obj;
            pool = obj->pool;
            /* under the global lock scheme, no per-object locking needed */
            if ( tmh_lock_all && !obj->no_evict )
                goto found;
            if ( tmem_spin_trylock(&obj->obj_spinlock) )
            {
                /* object keeps other pages: safe to remove just this one */
                if ( obj->pgp_count > 1 )
                    goto found;
                /* last page in object: also need the pool write lock so the
                 * now-empty object can be freed */
                if ( tmem_write_trylock(&pool->pool_rwlock) )
                {
                    hold_pool_rwlock = 1;
                    goto found;
                }
                tmem_spin_unlock(&obj->obj_spinlock);
            }
        }
    } else if ( list_empty(&global_ephemeral_page_list) ) {
        goto out;
    } else {
        /* same candidate-selection logic as above, over the global list */
        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
        {
            obj = pgp->obj;
            pool = obj->pool;
            if ( tmh_lock_all && !obj->no_evict )
                goto found;
            if ( tmem_spin_trylock(&obj->obj_spinlock) )
            {
                if ( obj->pgp_count > 1 )
                    goto found;
                if ( tmem_write_trylock(&pool->pool_rwlock) )
                {
                    hold_pool_rwlock = 1;
                    goto found;
                }
                tmem_spin_unlock(&obj->obj_spinlock);
            }
        }
    }

    /* no evictable page found */
    ret = 0;
    goto out;

found:
    ASSERT(pgp != NULL);
    ASSERT_SENTINEL(pgp,PGD);
    obj = pgp->obj;
    ASSERT(obj != NULL);
    ASSERT(obj->no_evict == 0);
    ASSERT(obj->pool != NULL);
    ASSERT_SENTINEL(obj,OBJ);

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    /* unhook the page from its object and free it (1 => from eviction path) */
    pgp_del = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgp_del == pgp);
    pgp_delete(pgp,1);
    if ( obj->pgp_count == 0 )
    {
        /* object now empty: free it (pool write lock was acquired above) */
        ASSERT_WRITELOCK(&pool->pool_rwlock);
        obj_free(obj,0);
    }
    else
        tmem_spin_unlock(&obj->obj_spinlock);
    if ( hold_pool_rwlock )
        tmem_write_unlock(&pool->pool_rwlock);
    evicted_pgs++;
    ret = 1;

out:
    tmem_spin_unlock(&eph_lists_spinlock);
    return ret;
}
+
/* Evict ephemeral pages until at least n pages are available on the tmem
 * free list, then release whatever is available back to the host.
 * Returns the number of pages made available (may be fewer than n when
 * eviction runs out of candidates). */
static unsigned long tmem_relinquish_npages(unsigned long n)
{
    unsigned long avail = 0;

    for ( ; ; )
    {
        avail = tmh_avail_pages();
        if ( avail >= n )
            break;
        if ( !tmem_evict() )
            break;
    }
    if ( avail )
        tmh_release_avail_pages_to_host();
    return avail;
}
+
/************ TMEM CORE OPERATIONS ************************************/

/* Compress a client page into pool-allocated storage attached to pgp.
 * Returns 1 on success, 0 when compression is not worthwhile (caller
 * should store uncompressed), -ENOMEM on allocation failure, -EFAULT on
 * a bad client frame.  Caller must hold pgp's object spinlock. */
static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn)
{
    void *dst, *p;
    size_t size;
    int ret = 0;
    DECL_LOCAL_CYC_COUNTER(compress);

    ASSERT(pgp != NULL);
    ASSERT(pgp->obj != NULL);
    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
    ASSERT(pgp->obj->pool != NULL);
    ASSERT(pgp->obj->pool->client != NULL);
#ifdef __i386__
    /* compression unsupported on 32-bit x86; everything below this return
     * is unreachable in i386 builds */
    return -ENOMEM;
#endif
    if ( pgp->pfp != NULL )
        pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */
    START_CYC_COUNTER(compress);
    ret = tmh_compress_from_client(cmfn, &dst, &size);
    if ( (ret == -EFAULT) || (ret == 0) )
        goto out;
    /* reject compression results that don't actually save sub-page space */
    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
        ret = 0;
    else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
        ret = -ENOMEM;
    else
    {
        /* keep a private copy of the compressed data and account for it */
        memcpy(p,dst,size);
        pgp->cdata = p;
        pgp->size = size;
        pgp->obj->pool->client->compressed_pages++;
        pgp->obj->pool->client->compressed_sum_size += size;
        ret = 1;
    }

out:
    END_CYC_COUNTER(compress);
    return ret;
}
+
/* Handle a "dup" put: the page at (obj,index) already exists, so replace
 * its data in place.  Returns 1 on success, -ENOSPC if the replacement
 * failed (old data is then flushed), -EFAULT on a bad client frame.
 * Caller must hold the object spinlock; all exit paths release it. */
static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
       uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
{
    pool_t *pool;
    obj_t *obj;
    client_t *client;
    pgp_t *pgpfound = NULL;
    int ret;

    /* if we can successfully manipulate pgp to change out the data, do so */
    ASSERT(pgp != NULL);
    ASSERT(pgp->pfp != NULL);
    ASSERT(pgp->size != -1);
    obj = pgp->obj;
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj != NULL);
    pool = obj->pool;
    ASSERT(pool != NULL);
    client = pool->client;
    if ( len != 0 && tmh_compression_enabled() &&
         client->compress && pgp->size != 0 )
    {
        /* try compressed replacement first; on ret==0 (not worthwhile) we
         * fall through to the uncompressed path below */
        ret = do_tmem_put_compress(pgp,cmfn);
        if ( ret == 1 )
            goto done;
        else if ( ret == 0 )
            goto copy_uncompressed;
        else if ( ret == -ENOMEM )
            goto failed_dup;
        else if ( ret == -EFAULT )
            goto bad_copy;
    }

copy_uncompressed:
    if ( pgp->pfp )
        pgp_free_data(pgp, pool);
    if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
        goto failed_dup;
    /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
    if ( ret == -EFAULT )
        goto bad_copy;
    pgp->size = 0;

done:
    /* successfully replaced data, clean up and return success */
    if ( is_shared(pool) )
        obj->last_client = client->cli_id;
    obj->no_evict = 0;
    tmem_spin_unlock(&obj->obj_spinlock);
    pool->dup_puts_replaced++;
    pool->good_puts++;
    return 1;

bad_copy:
    /* this should only happen if the client passed a bad mfn */
    failed_copies++;
/* NOTE(review): debug-time trap left enabled; a malicious guest can reach
 * this path, so the ASSERT should probably be removed for production */
ASSERT(0);
    return -EFAULT;

failed_dup:
    /* couldn't change out the data, flush the old data and return
     * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
    pgpfound = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgpfound == pgp);
    pgp_delete(pgpfound,0);
    if ( obj->pgp_count == 0 )
    {
        /* object is now empty; free it under the pool write lock */
        tmem_write_lock(&pool->pool_rwlock);
        obj_free(obj,0);
        tmem_write_unlock(&pool->pool_rwlock);
    } else {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->dup_puts_flushed++;
    return -ENOSPC;
}
+
+
+static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index,
+ tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
+ uint32_t pfn_offset, uint32_t len)
+{
+ obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
+ pgp_t *pgp = NULL, *pgpdel = NULL;
+ client_t *client = pool->client;
+ int ret = client->frozen ? -EFROZEN : -ENOMEM;
+
+ ASSERT(pool != NULL);
+ pool->puts++;
+ /* does page already exist (dup)? if so, handle specially */
+ if ( (obj = objfound = obj_find(pool,oid)) != NULL )
+ {
+ ASSERT_SPINLOCK(&objfound->obj_spinlock);
+ if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
+ return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
+ }
+
+ /* no puts allowed into a frozen pool (except dup puts) */
+ if ( client->frozen )
+ goto free;
+
+ if ( (objfound == NULL) )
+ {
+ tmem_write_lock(&pool->pool_rwlock);
+ if ( (obj = objnew = obj_new(pool,oid)) == NULL )
+ {
+ tmem_write_unlock(&pool->pool_rwlock);
+ return -ENOMEM;
+ }
+ ASSERT_SPINLOCK(&objnew->obj_spinlock);
+ tmem_write_unlock(&pool->pool_rwlock);
+ }
+
+ ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ if ( (pgp = pgp_alloc(obj)) == NULL )
+ goto free;
+
+ ret = pgp_add_to_obj(obj, index, pgp);
+ if ( ret == -ENOMEM )
+ /* warning, may result in partially built radix tree ("stump") */
+ goto free;
+ ASSERT(ret != -EEXIST);
+ pgp->index = index;
+
+ if ( len != 0 && tmh_compression_enabled() && client->compress )
+ {
+ ASSERT(pgp->pfp == NULL);
+ ret = do_tmem_put_compress(pgp,cmfn);
+ if ( ret == 1 )
+ goto insert_page;
+ if ( ret == -ENOMEM )
+ {
+ client->compress_nomem++;
+ goto delete_and_free;
+ }
+ if ( ret == 0 )
+ {
+ client->compress_poor++;
+ goto copy_uncompressed;
+ }
+ if ( ret == -EFAULT )
+ goto bad_copy;
+ }
+
+copy_uncompressed:
+ if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
+ {
+ ret == -ENOMEM;
+ goto delete_and_free;
+ }
+ /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
+ ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
+ if ( ret == -EFAULT )
+ goto bad_copy;
+ pgp->size = 0;
+
+insert_page:
+ if ( is_ephemeral(pool) )
+ {
+ tmem_spin_lock(&eph_lists_spinlock);
+ list_add_tail(&pgp->global_eph_pages,
+ &global_ephemeral_page_list);
+ if (++global_eph_count > global_eph_count_max)
+ global_eph_count_max = global_eph_count;
+ list_add_tail(&pgp->client_eph_pages,
+ &client->ephemeral_page_list);
+ if (++client->eph_count > client->eph_count_max)
+ client->eph_count_max = client->eph_count;
+ tmem_spin_unlock(&eph_lists_spinlock);
+ }
+ ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
+ if ( is_shared(pool) )
+ obj->last_client = client->cli_id;
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ pool->good_puts++;
+ return 1;
+
+delete_and_free:
+ ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
+ pgpdel = pgp_delete_from_obj(obj, pgp->index);
+ ASSERT(pgp == pgpdel);
+
+free:
+ if ( pgp )
+ pgp_delete(pgp,0);
+ if ( objfound )
+ {
+ objfound->no_evict = 0;
+ tmem_spin_unlock(&objfound->obj_spinlock);
+ }
+ if ( objnew )
+ {
+ tmem_write_lock(&pool->pool_rwlock);
+ obj_free(objnew,0);
+ tmem_write_unlock(&pool->pool_rwlock);
+ }
+ pool->no_mem_puts++;
+ return ret;
+
+bad_copy:
+ /* this should only happen if the client passed a bad mfn */
+ failed_copies++;
+ASSERT(0);
+ goto free;
+}
+
/* Retrieve a page at (pool, oid, index) into the client frame cmfn.
 * Returns 1 on success, 0 if not present, -EEMPTY if the pool holds no
 * pages at all, -EFAULT on a bad client frame.  For private ephemeral
 * pools a successful get is destructive (the page is removed); otherwise
 * the page stays and is refreshed in the LRU lists. */
static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
              uint32_t pfn_offset, uint32_t len)
{
    obj_t *obj;
    pgp_t *pgp;
    client_t *client = pool->client;
    DECL_LOCAL_CYC_COUNTER(decompress);

    if ( !_atomic_read(pool->pgp_count) )
        return -EEMPTY;

    pool->gets++;
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        return 0;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    /* shared/persistent pools keep the page; private ephemeral gets are
     * destructive, so remove the page from the object immediately */
    if (is_shared(pool) || is_persistent(pool) )
        pgp = pgp_lookup_in_obj(obj, index);
    else
        pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
        return 0;
    }
    ASSERT(pgp->size != -1);
    /* size != 0 means the data is stored compressed */
    if ( pgp->size != 0 )
    {
        START_CYC_COUNTER(decompress);
        if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT )
            goto bad_copy;
        END_CYC_COUNTER(decompress);
    }
    else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
                                 pfn_offset, len) == -EFAULT)
        goto bad_copy;
    if ( is_ephemeral(pool) )
    {
        if ( is_private(pool) )
        {
            /* destructive get: free the page, and the object if now empty */
            pgp_delete(pgp,0);
            if ( obj->pgp_count == 0 )
            {
                tmem_write_lock(&pool->pool_rwlock);
                obj_free(obj,0);
                obj = NULL;
                tmem_write_unlock(&pool->pool_rwlock);
            }
        } else {
            /* shared ephemeral: move the page to the LRU tails (recently used) */
            tmem_spin_lock(&eph_lists_spinlock);
            list_del(&pgp->global_eph_pages);
            list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
            list_del(&pgp->client_eph_pages);
            list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
            tmem_spin_unlock(&eph_lists_spinlock);
            ASSERT(obj != NULL);
            obj->last_client = tmh_get_cli_id_from_current();
        }
    }
    if ( obj != NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->found_gets++;
    return 1;

bad_copy:
    /* this should only happen if the client passed a bad mfn */
    failed_copies++;
/* NOTE(review): guest-reachable ASSERT; consider removing for production */
ASSERT(0);
    return -EFAULT;

}
+
/* Remove the page at (pool, oid, index), if present; also frees the
 * enclosing object if it becomes empty.  Returns -EFROZEN for a frozen
 * client, else 1 (a missing page is not an error). */
static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
{
    obj_t *obj;
    pgp_t *pgp;

    pool->flushs++;
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        goto out;
    pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
        goto out;
    }
    pgp_delete(pgp,0);
    if ( obj->pgp_count == 0 )
    {
        /* object is now empty; free it under the pool write lock */
        tmem_write_lock(&pool->pool_rwlock);
        obj_free(obj,0);
        tmem_write_unlock(&pool->pool_rwlock);
    } else {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->flushs_found++;

out:
    if ( pool->client->frozen )
        return -EFROZEN;
    else
        return 1;
}
+
/* Remove an entire object (all its pages) from a pool, if present.
 * Returns -EFROZEN for a frozen client, else 1 (missing object is OK). */
static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
{
    obj_t *obj;

    pool->flush_objs++;
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        goto out;
    /* obj_destroy unlinks the object from the pool's tree, hence the
     * pool write lock */
    tmem_write_lock(&pool->pool_rwlock);
    obj_destroy(obj);
    pool->flush_objs_found++;
    tmem_write_unlock(&pool->pool_rwlock);

out:
    if ( pool->client->frozen )
        return -EFROZEN;
    else
        return 1;
}
+
+static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
+{
+ client_t *client = tmh_client_from_current();
+ pool_t *pool;
+
+ if ( client->pools == NULL )
+ return 0;
+ if ( (pool = client->pools[pool_id]) == NULL )
+ return 0;
+ client->pools[pool_id] = NULL;
+ pool_flush(pool,client->cli_id,1);
+ return 1;
+}
+
+static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi)
+{
+ client_t *client = tmh_client_from_current();
+ cli_id_t cli_id = tmh_get_cli_id_from_current();
+ int persistent = flags & TMEM_POOL_PERSIST;
+ int shared = flags & TMEM_POOL_SHARED;
+ int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
+ & TMEM_POOL_PAGESIZE_MASK;
+ int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
+ & TMEM_POOL_VERSION_MASK;
+ pool_t *pool, *shpool;
+ int s_poolid, d_poolid, first_unused_s_poolid;
+
+ ASSERT(client != NULL);
+ printk("tmem: allocating %s-%s tmem pool for %s=%d...",
+ persistent ? "persistent" : "ephemeral" ,
+ shared ? "shared" : "private", cli_id_str, cli_id);
+ if ( specversion != 0 )
+ {
+ printk("failed... unsupported spec version\n");
+ return -EPERM;
+ }
+ if ( pagebits != (PAGE_SHIFT - 12) )
+ {
+ printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
+ return -EPERM;
+ }
+ if ( (pool = pool_alloc()) == NULL )
+ {
+ printk("failed... out of memory\n");
+ return -ENOMEM;
+ }
+ for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
+ if ( client->pools[d_poolid] == NULL )
+ break;
+ if ( d_poolid == MAX_POOLS_PER_DOMAIN )
+ {
+ printk("failed... no more pool slots available for this %s\n",
+ client_str);
+ goto fail;
+ }
+ pool->shared = shared;
+ pool->client = client;
+ if ( shared )
+ {
+ first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
+ for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
+ {
+ if ( (shpool = global_shared_pools[s_poolid]) != NULL )
+ {
+ if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
+ {
+ printk("(matches shared pool uuid=%"PRIx64".%"PRIu64") ",
+ uuid_hi, uuid_lo);
+ printk("pool_id=%d\n",d_poolid);
+ client->pools[d_poolid] = global_shared_pools[s_poolid];
+ shared_pool_join(global_shared_pools[s_poolid], client);
+ pool_free(pool);
+ return d_poolid;
+ }
+ }
+ else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
+ first_unused_s_poolid = s_poolid;
+ }
+ if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
+ {
+ printk("tmem: failed... no global shared pool slots available\n");
+ goto fail;
+ }
+ else
+ {
+ INIT_LIST_HEAD(&pool->share_list);
+ pool->shared_count = 0;
+ global_shared_pools[first_unused_s_poolid] = pool;
+ (void)shared_pool_join(pool,client);
+ }
+ }
+ client->pools[d_poolid] = pool;
+ list_add_tail(&pool->pool_list, &global_pool_list);
+ pool->pool_id = d_poolid;
+ pool->persistent = persistent;
+ pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
+ printk("pool_id=%d\n",d_poolid);
+ return d_poolid;
+
+fail:
+ pool_free(pool);
+ return -EPERM;
+}
+
+/************ TMEM CONTROL OPERATIONS ************************************/
+
+/* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
+static int tmemc_freeze_pools(int cli_id, int arg)
+{
+ client_t *client;
+ bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
+ bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
+ char *s;
+
+ s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
+ if ( cli_id == CLI_ID_NULL )
+ {
+ list_for_each_entry(client,&global_client_list,client_list)
+ {
+ client->frozen = freeze;
+ printk("tmem: all pools %s for all %ss\n",s,client_str);
+ }
+ }
+ else
+ {
+ if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
+ return -1;
+ client->frozen = freeze;
+ printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
+ }
+ return 0;
+}
+
+static int tmemc_flush_mem(int cli_id, uint32_t kb)
+{
+ uint32_t npages, flushed_pages, flushed_kb;
+
+ if ( cli_id != CLI_ID_NULL )
+ {
+ printk("tmem: %s-specific flush not supported yet, use --all\n",
+ client_str);
+ return -1;
+ }
+ /* convert kb to pages, rounding up if necessary */
+ npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
+ flushed_pages = tmem_relinquish_npages(npages);
+ flushed_kb = flushed_pages << (PAGE_SHIFT-10);
+ return flushed_kb;
+}
+
+/*
+ * These tmemc_list* routines output lots of stats in a format that is
+ * intended to be program-parseable, not human-readable. Further, by
+ * tying each group of stats to a line format indicator (e.g. G= for
+ * global stats) and each individual stat to a two-letter specifier
+ * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
+ * global ephemeral pool), it should allow the stats reported to be
+ * forward and backwards compatible as tmem evolves.
+ */
+#define BSIZE 1024
+
+static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
+ uint32_t len, bool_t use_long)
+{
+ char info[BSIZE];
+ int i, n = 0, sum = 0;
+ pool_t *p;
+ bool_t s;
+
+ n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d%c",
+ c->cli_id, c->weight, c->cap, c->compress,
+ c->frozen, use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Ec:%ld,Em:%ld,cp:%ld,cb:%lld,cn:%ld,cm:%ld\n",
+ c->eph_count, c->eph_count_max,
+ c->compressed_pages, (long long)c->compressed_sum_size,
+ c->compress_poor, c->compress_nomem);
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
+ {
+ if ( (p = c->pools[i]) == NULL )
+ continue;
+ s = is_shared(p);
+ n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,PT:%c%c,U0:%llx,U1:%llx%c",
+ c->cli_id, p->pool_id,
+ is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
+ s ? p->uuid[0] : 0LL, s ? p->uuid[1] : 0LL,
+ use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
+ "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
+ "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
+ _atomic_read(p->pgp_count), p->pgp_count_max,
+ p->obj_count, p->obj_count_max,
+ p->objnode_count, p->objnode_count_max,
+ p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
+ p->no_mem_puts,
+ p->found_gets, p->gets,
+ p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
+ if ( sum + n >= len )
+ return sum;
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ }
+ return sum;
+}
+
+static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
+ bool_t use_long)
+{
+ char info[BSIZE];
+ int i, n = 0, sum = 0;
+ pool_t *p;
+ sharelist_t *sl;
+
+ for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
+ {
+ if ( (p = global_shared_pools[i]) == NULL )
+ continue;
+ n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%llx,U1:%llx",
+ i, is_persistent(p) ? 'P' : 'E', is_shared(p) ? 'S' : 'P',
+ (unsigned long long)p->uuid[0], (unsigned long long)p->uuid[1]);
+ list_for_each_entry(sl,&p->share_list, share_list)
+ n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
+ n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
+ "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
+ "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
+ _atomic_read(p->pgp_count), p->pgp_count_max,
+ p->obj_count, p->obj_count_max,
+ p->objnode_count, p->objnode_count_max,
+ p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
+ p->no_mem_puts,
+ p->found_gets, p->gets,
+ p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
+ if ( sum + n >= len )
+ return sum;
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ }
+ return sum;
+}
+
+#ifdef TMEM_PERF
/* Emit the T= cycle-counter performance line into the client buffer.
 * Only compiled when TMEM_PERF is defined (stubbed to 0 otherwise). */
static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
                              bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = 0;

    n = scnprintf(info+n,BSIZE-n,"T=");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
#ifdef COMPARE_COPY_PAGE_SSE2
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
#else
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
#endif
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
    n--; /* overwrite trailing comma */
    n += scnprintf(info+n,BSIZE-n,"\n");
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}
+#else
+#define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
+#endif
+
+static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
+ bool_t use_long)
+{
+ char info[BSIZE];
+ int n = 0, sum = off;
+
+ n += scnprintf(info,BSIZE,"G="
+ "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
+ "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
+ total_tmem_ops, errored_tmem_ops, failed_copies,
+ alloc_failed, alloc_page_failed, tmh_avail_pages(),
+ low_on_memory, evicted_pgs,
+ evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
+ total_flush_pool, use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
+ global_eph_count, global_eph_count_max,
+ _atomic_read(global_obj_count), global_obj_count_max,
+ _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
+ _atomic_read(global_pgp_count), global_pgp_count_max);
+ if ( sum + n >= len )
+ return sum;
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ return sum;
+}
+
+static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len,
+ bool_t use_long)
+{
+ client_t *client;
+ int off = 0;
+
+ if ( cli_id == CLI_ID_NULL ) {
+ off = tmemc_list_global(buf,0,len,use_long);
+ off += tmemc_list_shared(buf,off,len-off,use_long);
+ list_for_each_entry(client,&global_client_list,client_list)
+ off += tmemc_list_client(client, buf, off, len-off, use_long);
+ off += tmemc_list_global_perf(buf,off,len-off,use_long);
+ }
+ else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
+ return -1;
+ else
+ off = tmemc_list_client(client, buf, 0, len, use_long);
+
+
+ return 0;
+}
+
+static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
+{
+ cli_id_t cli_id = client->cli_id;
+ uint32_t old_weight;
+
+ switch (subop)
+ {
+ case TMEMC_SET_WEIGHT:
+ old_weight = client->weight;
+ client->weight = arg1;
+ printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
+ atomic_sub(old_weight,&client_weight_total);
+ atomic_add(client->weight,&client_weight_total);
+ break;
+ case TMEMC_SET_CAP:
+ client->cap = arg1;
+ printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
+ break;
+ case TMEMC_SET_COMPRESS:
+ client->compress = arg1 ? 1 : 0;
+ printk("tmem: compression %s for %s=%d\n",
+ arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
+ break;
+ default:
+ printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
+ return -1;
+ }
+ return 0;
+}
+
+static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1)
+{
+ client_t *client;
+
+ if ( cli_id == CLI_ID_NULL )
+ list_for_each_entry(client,&global_client_list,client_list)
+ tmemc_set_var_one(client, subop, arg1);
+ else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
+ return -1;
+ else
+ tmemc_set_var_one(client, subop, arg1);
+ return 0;
+}
+
/* Dispatch a TMEM_CONTROL subop (freeze/thaw/destroy/flush/list/set-*).
 * Returns the subop's result, or -1 for an unknown subop.
 * Caller (do_tmem_op) holds the tmem rwlock for writing. */
static int do_tmem_control(uint32_t subop, uint32_t cli_id32,
   uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf)
{
    int ret;
    cli_id_t cli_id = (cli_id_t)cli_id32;

    /* NOTE(review): the privilege check is deliberately disabled below —
     * any guest can currently issue control operations */
    if (!tmh_current_is_privileged())
    {
        /* don't fail... mystery: sometimes dom0 fails here */
        /* return -EPERM; */
    }
    switch(subop)
    {
    case TMEMC_THAW:
    case TMEMC_FREEZE:
    case TMEMC_DESTROY:
        ret = tmemc_freeze_pools(cli_id,subop);
        break;
    case TMEMC_FLUSH:
        ret = tmemc_flush_mem(cli_id,arg1);
        break;
    case TMEMC_LIST:
        ret = tmemc_list(cli_id,buf,arg1,arg2);
        break;
    case TMEMC_SET_WEIGHT:
    case TMEMC_SET_CAP:
    case TMEMC_SET_COMPRESS:
        ret = tmemc_set_var(cli_id,subop,arg1);
        break;
    default:
        ret = -1;
    }
    return ret;
}
+
+/************ EXPORTed FUNCTIONS **************************************/
+
/* Main tmem hypercall entry point: copies the guest's tmem_op, acquires
 * the appropriate global lock (spinlock when tmh_lock_all, otherwise the
 * tmem rwlock — write for control/pool-creation, read for data ops),
 * creates the per-client structure on first use, and dispatches the
 * command.  Returns the command's result or a negative errno. */
EXPORT long do_tmem_op(tmem_cli_op_t uops)
{
    struct tmem_op op;
    client_t *client = tmh_client_from_current();
    pool_t *pool = NULL;
    int rc = 0;
    bool_t succ_get = 0, succ_put = 0;
    bool_t non_succ_get = 0, non_succ_put = 0;
    bool_t flush = 0, flush_obj = 0;
    bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
    static bool_t warned = 0;
    DECL_LOCAL_CYC_COUNTER(succ_get);
    DECL_LOCAL_CYC_COUNTER(succ_put);
    DECL_LOCAL_CYC_COUNTER(non_succ_get);
    DECL_LOCAL_CYC_COUNTER(non_succ_put);
    DECL_LOCAL_CYC_COUNTER(flush);
    DECL_LOCAL_CYC_COUNTER(flush_obj);

    if ( !tmem_initialized )
    {
        /* warn only once per boot about the missing "tmem" boot option */
        if ( !warned )
            printk("tmem: must specify tmem parameter on xen boot line\n");
        warned = 1;
        return -ENODEV;
    }

    total_tmem_ops++;

    if ( tmh_lock_all )
    {
        /* global-lock debug mode: one big spinlock instead of the rwlock;
         * tmh_lock_all > 1 additionally disables irqs while held */
        if ( tmh_lock_all > 1 )
            spin_lock_irq(&tmem_spinlock);
        else
            spin_lock(&tmem_spinlock);
    }

    START_CYC_COUNTER(succ_get);
    DUP_START_CYC_COUNTER(succ_put,succ_get);
    DUP_START_CYC_COUNTER(non_succ_get,succ_get);
    DUP_START_CYC_COUNTER(non_succ_put,succ_get);
    DUP_START_CYC_COUNTER(flush,succ_get);
    DUP_START_CYC_COUNTER(flush_obj,succ_get);

    if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
    {
        printk("tmem: can't get tmem struct from %s\n",client_str);
        rc = -EFAULT;
        goto out;
    }

    if ( op.cmd == TMEM_CONTROL )
    {
        /* control ops take the write lock and bypass pool lookup */
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_control(op.subop, op.cli_id, op.arg1, op.arg2, op.buf);
        goto out;
    }

    /* create per-client tmem structure dynamically on first use by client */
    if ( client == NULL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        if ( (client = client_create()) == NULL )
        {
            printk("tmem: can't create tmem structure for %s\n",client_str);
            rc = -ENOMEM;
            goto out;
        }
    }

    if ( op.cmd == TMEM_NEW_POOL )
    {
        /* pool creation mutates the client's pool table: write lock */
        if ( !tmem_write_lock_set )
        {
            tmem_write_lock(&tmem_rwlock);
            tmem_write_lock_set = 1;
        }
    }
    else
    {
        /* data ops only read global structures: read lock suffices
         * (unless a write lock was already taken above) */
        if ( !tmem_write_lock_set )
        {
            tmem_read_lock(&tmem_rwlock);
            tmem_read_lock_set = 1;
        }
        /* validate the pool id before any data operation */
        if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
             ((pool = client->pools[op.pool_id]) == NULL) )
        {
            rc = -ENODEV;
            printk("tmem: operation requested on uncreated pool\n");
            goto out;
        }
        ASSERT_SENTINEL(pool,POOL);
    }

    switch ( op.cmd )
    {
    case TMEM_NEW_POOL:
        rc = do_tmem_new_pool(op.flags,op.uuid[0],op.uuid[1]);
        break;
    case TMEM_NEW_PAGE:
        /* len==0 put: reserve the slot without copying any data */
        rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, 0);
        break;
    case TMEM_PUT_PAGE:
        rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
        if (rc == 1) succ_put = 1;
        else non_succ_put = 1;
        break;
    case TMEM_GET_PAGE:
        rc = do_tmem_get(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
        if (rc == 1) succ_get = 1;
        else non_succ_get = 1;
        break;
    case TMEM_FLUSH_PAGE:
        flush = 1;
        rc = do_tmem_flush_page(pool, op.object, op.index);
        break;
    case TMEM_FLUSH_OBJECT:
        rc = do_tmem_flush_object(pool, op.object);
        flush_obj = 1;
        break;
    case TMEM_DESTROY_POOL:
        flush = 1;
        rc = do_tmem_destroy_pool(op.pool_id);
        break;
    case TMEM_READ:
        /* sub-page read at (tmem_offset, pfn_offset, len) */
        rc = do_tmem_get(pool, op.object, op.index, op.cmfn,
                         op.tmem_offset, op.pfn_offset, op.len);
        break;
    case TMEM_WRITE:
        rc = do_tmem_put(pool, op.object, op.index, op.cmfn,
                         op.tmem_offset, op.pfn_offset, op.len);
        break;
    case TMEM_XCHG:
        /* need to hold global lock to ensure xchg is atomic */
        printk("tmem_xchg op not implemented yet\n");
        rc = 0;
        break;
    default:
        printk("tmem: op %d not implemented\n", op.cmd);
        rc = 0;
        break;
    }

out:
    if ( rc < 0 )
        errored_tmem_ops++;
    /* close out exactly one cycle counter, matching the op performed */
    if ( succ_get )
        END_CYC_COUNTER(succ_get);
    else if ( succ_put )
        END_CYC_COUNTER(succ_put);
    else if ( non_succ_get )
        END_CYC_COUNTER(non_succ_get);
    else if ( non_succ_put )
        END_CYC_COUNTER(non_succ_put);
    else if ( flush )
        END_CYC_COUNTER(flush);
    else
        END_CYC_COUNTER(flush_obj);

    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_unlock_irq(&tmem_spinlock);
        else
            spin_unlock(&tmem_spinlock);
    } else {
        if ( tmem_write_lock_set )
            write_unlock(&tmem_rwlock);
        else if ( tmem_read_lock_set )
            read_unlock(&tmem_rwlock);
        else 
            ASSERT(0);
    }

    return rc;
}
+
+/* this should be called when the host is destroying a client */
+EXPORT void tmem_destroy(void *v)
+{
+ client_t *client = (client_t *)v;
+
+ if ( tmh_lock_all )
+ spin_lock(&tmem_spinlock);
+ else
+ write_lock(&tmem_rwlock);
+
+ if ( client == NULL )
+ printk("tmem: can't destroy tmem pools for %s=%d\n",
+ cli_id_str,client->cli_id);
+ else
+ {
+ printk("tmem: flushing tmem pools for %s=%d\n",
+ cli_id_str,client->cli_id);
+ client_flush(client,1);
+ }
+
+ if ( tmh_lock_all )
+ spin_unlock(&tmem_spinlock);
+ else
+ write_unlock(&tmem_rwlock);
+}
+
/* freezing all pools guarantees that no additional memory will be consumed;
 * intended to be toggled from a debug keyhandler (alternates freeze/thaw) */
EXPORT void tmem_freeze_all(unsigned char key)
{
    static int freeze = 0;

    if ( tmh_lock_all )
        spin_lock(&tmem_spinlock);
    else
        write_lock(&tmem_rwlock);

    freeze = !freeze;
    /* NOTE(review): passes the toggled 0/1 value where tmemc_freeze_pools
     * expects a TMEMC_FREEZE/TMEMC_THAW constant — this only freezes/thaws
     * correctly if those constants happen to be 1/0; confirm their values */
    tmemc_freeze_pools(CLI_ID_NULL,freeze);

    if ( tmh_lock_all )
        spin_unlock(&tmem_spinlock);
    else
        write_unlock(&tmem_rwlock);
}
+
+#define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? */
+
+EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
+{
+ pfp_t *pfp;
+ unsigned long evicts_per_relinq = 0;
+ int max_evictions = 10;
+
+ if (!tmh_enabled())
+ return NULL;
+#ifdef __i386__
+ return NULL;
+#endif
+
+ relinq_attempts++;
+ if ( order > 0 )
+ {
+ printk("tmem_relinquish_page: failing order=%d\n", order);
+ return NULL;
+ }
+
+ if ( tmh_called_from_tmem(memflags) )
+ {
+ if ( tmh_lock_all )
+ spin_lock(&tmem_spinlock);
+ else
+ read_lock(&tmem_rwlock);
+ }
+
+ while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
+ {
+ if ( (max_evictions-- <= 0) || !tmem_evict())
+ break;
+ evicts_per_relinq++;
+ }
+ if ( evicts_per_relinq > max_evicts_per_relinq )
+ max_evicts_per_relinq = evicts_per_relinq;
+ tmh_scrub_page(pfp, memflags);
+ if ( pfp != NULL )
+ relinq_pgs++;
+
+ if ( tmh_called_from_tmem(memflags) )
+ {
+ if ( tmh_lock_all )
+ spin_unlock(&tmem_spinlock);
+ else
+ read_unlock(&tmem_rwlock);
+ }
+
+ return pfp;
+}
+
+/* called at hypervisor startup */
+EXPORT void init_tmem(void)
+{
+ if ( !tmh_enabled() )
+ return;
+
+ radix_tree_init();
+ if ( tmh_init() )
+ {
+ printk("tmem: initialized comp=%d global-lock=%d\n",
+ tmh_compression_enabled(), tmh_lock_all);
+ tmem_initialized = 1;
+ }
+ else
+ printk("tmem: initialization FAILED\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/common/tmem_xen.c b/xen/common/tmem_xen.c
new file mode 100644
index 0000000000..6a0b14f456
--- /dev/null
+++ b/xen/common/tmem_xen.c
@@ -0,0 +1,334 @@
+/******************************************************************************
+ * tmem-xen.c
+ *
+ * Xen-specific Transcendent memory
+ *
+ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
+ */
+
+#include <xen/tmem.h>
+#include <xen/tmem_xen.h>
+#include <xen/lzo.h> /* compression code */
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+
+#define EXPORT /* indicates code other modules are dependent upon */
+
+/* master enable -- tmem does nothing unless booted with "tmem" */
+EXPORT int opt_tmem = 0;
+boolean_param("tmem", opt_tmem);
+
+/* "tmem_compress": LZO-compress stored pages (more capacity, more CPU) */
+EXPORT int opt_tmem_compress = 0;
+boolean_param("tmem_compress", opt_tmem_compress);
+
+/* "tmem_lock": lock-strategy override (see tmh_lock_all users) */
+EXPORT int opt_tmem_lock = 0;
+integer_param("tmem_lock", opt_tmem_lock);
+
+#ifdef COMPARE_COPY_PAGE_SSE2
+DECL_CYC_COUNTER(pg_copy1);
+DECL_CYC_COUNTER(pg_copy2);
+DECL_CYC_COUNTER(pg_copy3);
+DECL_CYC_COUNTER(pg_copy4);
+#else
+DECL_CYC_COUNTER(pg_copy);
+#endif
+
+/* these are a concurrency bottleneck, could be percpu and dynamically
+ * allocated iff opt_tmem_compress */
+#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
+#define LZO_DSTMEM_PAGES 2
+static DEFINE_PER_CPU(unsigned char *, workmem);
+static DEFINE_PER_CPU(unsigned char *, dstmem);
+
+#ifdef COMPARE_COPY_PAGE_SSE2
+#include <asm/flushtlb.h> /* REMOVE ME AFTER TEST */
+#include <asm/page.h> /* REMOVE ME AFTER TEST */
+#endif
+/*
+ * Copy one page, instrumenting the copy with cycle counters.  When
+ * COMPARE_COPY_PAGE_SSE2 is defined this becomes a benchmark comparing
+ * SSE2 copies against memcpy with cold and hot caches (test-only code,
+ * as the REMOVE ME comments above indicate).
+ */
+void tmh_copy_page(char *to, char*from)
+{
+#ifdef COMPARE_COPY_PAGE_SSE2
+    DECL_LOCAL_CYC_COUNTER(pg_copy1);
+    DECL_LOCAL_CYC_COUNTER(pg_copy2);
+    DECL_LOCAL_CYC_COUNTER(pg_copy3);
+    DECL_LOCAL_CYC_COUNTER(pg_copy4);
+    *to = *from;  /* don't measure TLB misses */
+    /* flush both pages from cache so the first timed copy is cold */
+    flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
+    flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
+    START_CYC_COUNTER(pg_copy1);
+    copy_page_sse2(to, from);  /* cold cache */
+    END_CYC_COUNTER(pg_copy1);
+    START_CYC_COUNTER(pg_copy2);
+    copy_page_sse2(to, from);  /* hot cache */
+    END_CYC_COUNTER(pg_copy2);
+    flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
+    flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
+    START_CYC_COUNTER(pg_copy3);
+    memcpy(to, from, PAGE_SIZE);  /* cold cache */
+    END_CYC_COUNTER(pg_copy3);
+    START_CYC_COUNTER(pg_copy4);
+    memcpy(to, from, PAGE_SIZE);  /* hot cache */
+    END_CYC_COUNTER(pg_copy4);
+#else
+    /* normal path: plain memcpy, timed */
+    DECL_LOCAL_CYC_COUNTER(pg_copy);
+    START_CYC_COUNTER(pg_copy);
+    memcpy(to, from, PAGE_SIZE);
+    END_CYC_COUNTER(pg_copy);
+#endif
+}
+
+#ifdef __ia64__
+/* not yet ported to ia64 -- must never be called there */
+static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
+{
+    ASSERT(0);
+    /* fix: this function returns void*, but the original fell off the end
+     * with no return statement -- undefined behavior if ever reached
+     * (ASSERT compiles away in non-debug builds) */
+    return NULL;
+}
+#define paging_mark_dirty(_x,_y) do {} while(0)
+#else
+/*
+ * Map a client (guest) frame so tmem can copy to/from it.
+ *
+ * cmfn:     client-provided gfn (passed as an opaque pointer-sized value)
+ * pcli_mfn: if non-NULL, receives the machine frame number
+ *
+ * Returns the mapped virtual address (caller must unmap_domain_page it),
+ * or NULL if the gfn does not reference ordinary read-write RAM.
+ */
+static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
+{
+    unsigned long cli_mfn;
+    p2m_type_t t;
+
+    /* a 32-on-64 PV guest supplies a 32-bit value; mask off any garbage
+     * in the upper half of the register */
+    if ( is_pv_32on64_vcpu(current) )
+        cmfn.p = (void *)((unsigned long)cmfn.p & 0xffffffffUL);
+    cli_mfn = mfn_x(gfn_to_mfn(current->domain,(unsigned long)cmfn.p,&t));
+    /* only plain read-write guest RAM is acceptable */
+    if ( t != p2m_ram_rw )
+        return NULL;
+    if ( pcli_mfn != NULL )
+        *pcli_mfn = cli_mfn;
+    return map_domain_page(cli_mfn);
+}
+#endif
+
+/*
+ * Copy (or zero) a tmem page from a client page.
+ *
+ * Semantics by argument combination:
+ *   - tmem_offset == pfn_offset == len == 0: zero-fill the tmem page
+ *     (no client mapping is needed or made)
+ *   - len == PAGE_SIZE, both offsets 0: whole-page copy
+ *   - otherwise: partial copy, provided both ranges fit in a page
+ *
+ * Returns 1 on success, -EFAULT if the client frame cannot be mapped.
+ */
+EXPORT int tmh_copy_from_client(pfp_t *pfp,
+    tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
+    uint32_t pfn_offset, uint32_t len)
+{
+    unsigned long tmem_mfn;
+    void *tmem_va, *cli_va = NULL;
+
+    ASSERT(pfp != NULL);
+    /* the zero-fill case needs no client mapping */
+    if ( tmem_offset || pfn_offset || len )
+        if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL )
+            return -EFAULT;
+    tmem_mfn = page_to_mfn(pfp);
+    tmem_va = map_domain_page(tmem_mfn);
+    mb();
+    if ( !len && !tmem_offset && !pfn_offset )
+        memset(tmem_va, 0, PAGE_SIZE);
+    else if ( len == PAGE_SIZE && !tmem_offset && !pfn_offset )
+        tmh_copy_page(tmem_va, cli_va);
+    else if ( (tmem_offset+len <= PAGE_SIZE) &&
+              (pfn_offset+len <= PAGE_SIZE) )
+        memcpy((char *)tmem_va+tmem_offset,(char *)cli_va+pfn_offset,len);
+    /* fix: in the zero-fill path cli_va was never mapped; the original
+     * unconditionally passed the NULL pointer to unmap_domain_page() */
+    if ( cli_va != NULL )
+        unmap_domain_page(cli_va);
+    unmap_domain_page(tmem_va);
+    return 1;
+}
+
+/*
+ * LZO-compress a client page into this CPU's per-cpu dstmem buffer.
+ *
+ * On success returns 1 and sets *out_va/*out_len to the compressed data
+ * (valid only until the next compression on this CPU).  Returns 0 if the
+ * per-cpu buffers were never allocated, -EFAULT if the client frame
+ * cannot be mapped.
+ */
+EXPORT int tmh_compress_from_client(tmem_cli_mfn_t cmfn,
+    void **out_va, size_t *out_len)
+{
+    void *cli_va;
+    int ret = 0;
+    unsigned char *dmem = this_cpu(dstmem);
+    unsigned char *wmem = this_cpu(workmem);
+
+    /* fix: check the buffers BEFORE mapping the client page -- the
+     * original mapped first and then leaked the domain-page mapping on
+     * this early return */
+    if ( dmem == NULL || wmem == NULL )
+        return 0;  /* no buffer, so can't compress */
+    if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL )
+        return -EFAULT;
+    mb();
+    ret = lzo1x_1_compress(cli_va, PAGE_SIZE, dmem, out_len, wmem);
+    /* lzo1x_1_compress cannot fail given a correctly-sized workmem */
+    ASSERT(ret == LZO_E_OK);
+    *out_va = dmem;
+    unmap_domain_page(cli_va);
+    return 1;
+}
+
+/*
+ * Copy a tmem page back into a client page (whole-page or partial,
+ * mirroring tmh_copy_from_client).  Marks the client frame dirty for
+ * paging/log-dirty purposes.
+ *
+ * Returns 1 on success, -EFAULT if the client frame cannot be mapped.
+ */
+EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
+    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
+{
+    unsigned long tmem_mfn, cli_mfn;
+    void *tmem_va, *cli_va;
+
+    ASSERT(pfp != NULL);
+    if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL )
+        return -EFAULT;
+    tmem_mfn = page_to_mfn(pfp);
+    tmem_va = map_domain_page(tmem_mfn);
+    if ( len == PAGE_SIZE && !tmem_offset && !pfn_offset )
+        tmh_copy_page(cli_va, tmem_va);
+    else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) )
+        memcpy((char *)cli_va+pfn_offset,(char *)tmem_va+tmem_offset,len);
+    /* NOTE(review): out-of-range offset/len silently copies nothing yet
+     * still returns 1 -- confirm callers validate lengths beforehand */
+    unmap_domain_page(tmem_va);
+    unmap_domain_page(cli_va);
+    /* the guest page was modified; keep log-dirty bookkeeping correct */
+    paging_mark_dirty(current->domain,cli_mfn);
+    mb();
+    return 1;
+}
+
+/*
+ * Decompress a previously-compressed tmem page directly into a client
+ * page.  'tmem_va'/'size' describe the compressed data held by tmem.
+ *
+ * Returns 1 on success, -EFAULT if the client frame cannot be mapped.
+ */
+EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, size_t size)
+{
+    unsigned long cli_mfn;
+    void *cli_va;
+    size_t out_len = PAGE_SIZE;
+    int ret;
+
+    if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL )
+        return -EFAULT;
+    ret = lzo1x_decompress_safe(tmem_va, size, cli_va, &out_len);
+    /* the data was compressed by tmem itself, so decompression should
+     * always succeed and yield exactly one page.  NOTE(review): ASSERT is
+     * a no-op in release builds -- confirm corruption here is impossible */
+    ASSERT(ret == LZO_E_OK);
+    ASSERT(out_len == PAGE_SIZE);
+    unmap_domain_page(cli_va);
+    /* guest page modified; update log-dirty state */
+    paging_mark_dirty(current->domain,cli_mfn);
+    mb();
+    return 1;
+}
+
+/****************** XEN-SPECIFIC MEMORY ALLOCATION ********************/
+
+EXPORT struct xmem_pool *tmh_mempool = 0;
+EXPORT unsigned int tmh_mempool_maxalloc = 0;
+
+EXPORT DEFINE_SPINLOCK(tmh_page_list_lock);
+EXPORT PAGE_LIST_HEAD(tmh_page_list);
+EXPORT unsigned long tmh_page_list_pages = 0;
+
+/* free anything on tmh_page_list to Xen's scrub list */
+EXPORT void tmh_release_avail_pages_to_host(void)
+{
+    spin_lock(&tmh_page_list_lock);
+    if ( !page_list_empty(&tmh_page_list) )
+    {
+        /* hand the whole list to the scrubber in one splice, then reset
+         * our list head to empty */
+        scrub_list_splice(&tmh_page_list);
+        INIT_PAGE_LIST_HEAD(&tmh_page_list);
+    }
+    /* NOTE(review): tmh_page_list_pages is not reset here -- confirm the
+     * counter is maintained elsewhere */
+    spin_unlock(&tmh_page_list_lock);
+}
+
+/* Scrub a page unless it is being recycled within tmem itself (MEMF_tmem),
+ * in which case its contents will be overwritten anyway.  NULL is a no-op. */
+EXPORT void tmh_scrub_page(struct page_info *pi, unsigned int memflags)
+{
+    if ( pi == NULL )
+        return;
+    if ( !(memflags & MEMF_tmem) )
+        scrub_one_page(pi);
+}
+
+#ifndef __i386__
+/* xmem_pool get_mem callback for the global tmem pool: grab one xen-heap
+ * page and return its direct-map address, or NULL on failure */
+static noinline void *tmh_mempool_page_get(unsigned long size)
+{
+    struct page_info *pi;
+
+    /* the pool is created with grow_size == PAGE_SIZE, so only
+     * page-sized requests can arrive */
+    ASSERT(size == PAGE_SIZE);
+    if ( (pi = tmh_alloc_page(NULL,0)) == NULL )
+        return NULL;
+    ASSERT(IS_VALID_PAGE(pi));
+    return page_to_virt(pi);
+}
+
+/* xmem_pool put_mem callback: return a page obtained via
+ * tmh_mempool_page_get */
+static void tmh_mempool_page_put(void *page_va)
+{
+    ASSERT(IS_PAGE_ALIGNED(page_va));
+    tmh_free_page(virt_to_page(page_va));
+}
+
+/* Create the global tmem xmem_pool (page-at-a-time growth) and record the
+ * largest allocation it can satisfy.  Returns nonzero on success. */
+static int tmh_mempool_init(void)
+{
+    tmh_mempool = xmem_pool_create("tmem", tmh_mempool_page_get,
+        tmh_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
+    if ( tmh_mempool )
+        tmh_mempool_maxalloc = xmem_pool_maxalloc(tmh_mempool);
+    return tmh_mempool != NULL;
+}
+
+/* persistent pools are per-domain */
+
+/* get_mem callback for a domain's persistent pool: the page is allocated
+ * from (and accounted to) the current domain's allocation */
+static void *tmh_persistent_pool_page_get(unsigned long size)
+{
+    struct page_info *pi;
+    struct domain *d = current->domain;
+
+    ASSERT(size == PAGE_SIZE);
+    if ( (pi = _tmh_alloc_page_thispool(d)) == NULL )
+        return NULL;
+    ASSERT(IS_VALID_PAGE(pi));
+    /* NOTE(review): pages are mapped here with map_domain_page() but the
+     * put side uses virt_to_page() -- this relies on map_domain_page
+     * returning the direct-map address (true on x86-64); confirm */
+    return map_domain_page(page_to_mfn(pi));
+}
+
+/* put_mem callback: return a persistent-pool page to its domain */
+static void tmh_persistent_pool_page_put(void *page_va)
+{
+    struct page_info *pi;
+
+    ASSERT(IS_PAGE_ALIGNED(page_va));
+    pi = virt_to_page(page_va);
+    ASSERT(IS_VALID_PAGE(pi));
+    _tmh_free_page_thispool(pi);
+}
+#endif
+
+/****************** XEN-SPECIFIC CLIENT HANDLING ********************/
+
+/*
+ * Allocate and initialize the Xen-side client structure for the current
+ * domain, including (on 64-bit) its per-domain persistent xmem pool.
+ * Returns NULL on allocation failure.
+ */
+EXPORT tmh_client_t *tmh_client_init(void)
+{
+    tmh_client_t *tmh;
+    char name[5];
+    domid_t domid = current->domain->domain_id;
+    int i, shift;
+
+    if ( (tmh = xmalloc(tmh_client_t)) == NULL )
+        return NULL;
+    /* encode the domid as four hex digits for the pool's name.  fix: the
+     * original stored raw nibble values (0..15) rather than printable
+     * characters, producing an unreadable name in pool listings */
+    for ( i = 0, shift = 12; i < 4; shift -= 4, i++ )
+        name[i] = "0123456789abcdef"[(domid >> shift) & 0xf];
+    name[4] = '\0';
+#ifndef __i386__
+    /* persistent pools are unavailable on 32-bit x86 (xen heap limits) */
+    tmh->persistent_pool = xmem_pool_create(name, tmh_persistent_pool_page_get,
+        tmh_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
+    if ( tmh->persistent_pool == NULL )
+    {
+        xfree(tmh);
+        return NULL;
+    }
+#endif
+    tmh->domain = current->domain;
+    return tmh;
+}
+
+/* Tear down a client created by tmh_client_init (pool first, then the
+ * structure itself; on 32-bit x86 there is no persistent pool). */
+EXPORT void tmh_client_destroy(tmh_client_t *tmh)
+{
+#ifndef __i386__
+    xmem_pool_destroy(tmh->persistent_pool);
+#endif
+    xfree(tmh);
+}
+
+/****************** XEN-SPECIFIC HOST INITIALIZATION ********************/
+
+/*
+ * Xen-specific tmem initialization: create the global mempool and, per
+ * CPU, the compression work/destination buffers.  Buffer allocation
+ * failures are tolerated (those CPUs simply cannot compress).
+ * Returns 1 on success, 0 if the mempool could not be created.
+ */
+EXPORT int tmh_init(void)
+{
+#ifndef __i386__
+    int dstmem_order, workmem_order;
+    /* fix: this is a counter of CPUs whose buffers failed to allocate and
+     * is printed as such below -- it was declared bool_t, which both
+     * misstates the intent and can wrap if bool_t is a char type */
+    unsigned int bad_alloc = 0;
+    struct page_info *pi;
+    unsigned char *p1, *p2;
+    int cpu;
+
+    if ( !tmh_mempool_init() )
+        return 0;
+
+    dstmem_order = get_order_from_pages(LZO_DSTMEM_PAGES);
+    workmem_order = get_order_from_bytes(LZO1X_1_MEM_COMPRESS);
+    for_each_cpu ( cpu )
+    {
+        pi = alloc_domheap_pages(0,dstmem_order,0);
+        per_cpu(dstmem, cpu) = p1 = ((pi == NULL) ? NULL : page_to_virt(pi));
+        pi = alloc_domheap_pages(0,workmem_order,0);
+        per_cpu(workmem, cpu) = p2 = ((pi == NULL) ? NULL : page_to_virt(pi));
+        if ( (p1 == NULL) || (p2 == NULL) )
+            bad_alloc++;
+    }
+    if ( bad_alloc )
+        printk("tmem: can't allocate compression buffers for %d cpus\n",
+               bad_alloc);
+#endif
+    return 1;
+}
diff --git a/xen/common/xmalloc_tlsf.c b/xen/common/xmalloc_tlsf.c
index 7a476e8fb7..3f85389e23 100644
--- a/xen/common/xmalloc_tlsf.c
+++ b/xen/common/xmalloc_tlsf.c
@@ -292,7 +292,6 @@ struct xmem_pool *xmem_pool_create(
unsigned long grow_size)
{
struct xmem_pool *pool;
- void *region;
int pool_bytes, pool_order;
BUG_ON(max_size && (max_size < init_size));
@@ -319,11 +318,9 @@ struct xmem_pool *xmem_pool_create(
pool->get_mem = get_mem;
pool->put_mem = put_mem;
strlcpy(pool->name, name, sizeof(pool->name));
- region = get_mem(init_size);
- if ( region == NULL )
- goto out_region;
- ADD_REGION(region, init_size, pool);
- pool->init_region = region;
+
+ /* always obtain init_region lazily now to ensure it is get_mem'd
+ * in the same "context" as all other regions */
spin_lock_init(&pool->lock);
@@ -332,10 +329,6 @@ struct xmem_pool *xmem_pool_create(
spin_unlock(&pool_list_lock);
return pool;
-
- out_region:
- free_xenheap_pages(pool, pool_order);
- return NULL;
}
unsigned long xmem_pool_get_used_size(struct xmem_pool *pool)
@@ -354,13 +347,15 @@ unsigned long xmem_pool_get_total_size(struct xmem_pool *pool)
void xmem_pool_destroy(struct xmem_pool *pool)
{
+ int pool_bytes, pool_order;
+
if ( pool == NULL )
return;
/* User is destroying without ever allocating from this pool */
if ( xmem_pool_get_used_size(pool) == BHDR_OVERHEAD )
{
- pool->put_mem(pool->init_region);
+ ASSERT(!pool->init_region);
pool->used_size -= BHDR_OVERHEAD;
}
@@ -373,7 +368,10 @@ void xmem_pool_destroy(struct xmem_pool *pool)
spin_lock(&pool_list_lock);
list_del_init(&pool->list);
spin_unlock(&pool_list_lock);
- pool->put_mem(pool);
+
+ pool_bytes = ROUNDUP_SIZE(sizeof(*pool));
+ pool_order = get_order_from_bytes(pool_bytes);
+ free_xenheap_pages(pool,pool_order);
}
void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool)
@@ -382,6 +380,14 @@ void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool)
int fl, sl;
unsigned long tmp_size;
+ if ( pool->init_region == NULL )
+ {
+ if ( (region = pool->get_mem(pool->init_size)) == NULL )
+ goto out;
+ ADD_REGION(region, pool->init_size, pool);
+ pool->init_region = region;
+ }
+
size = (size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : ROUNDUP_SIZE(size);
/* Rounding up the requested size and calculating fl and sl */
@@ -496,6 +502,11 @@ void xmem_pool_free(void *ptr, struct xmem_pool *pool)
spin_unlock(&pool->lock);
}
+/* Largest single allocation a pool can satisfy: one grow_size region
+ * minus the two block-header overheads it must carry. */
+int xmem_pool_maxalloc(struct xmem_pool *pool)
+{
+    return pool->grow_size - (2 * BHDR_OVERHEAD);
+}
+
/*
* Glue for xmalloc().
*/