-rw-r--r--  .hgignore                                     |    1
-rw-r--r--  tools/libxc/Makefile                          |    1
-rw-r--r--  tools/libxc/xc_tmem.c                         |   83
-rw-r--r--  tools/libxc/xenctrl.h                         |   11
-rw-r--r--  tools/misc/Makefile                           |    4
-rw-r--r--  tools/misc/xen-tmem-list-parse.c              |  288
-rw-r--r--  tools/python/xen/lowlevel/xc/xc.c             |   57
-rw-r--r--  tools/python/xen/xend/XendAPI.py              |   74
-rw-r--r--  tools/python/xen/xend/XendConstants.py        |   26
-rw-r--r--  tools/python/xen/xend/XendNode.py             |   64
-rw-r--r--  tools/python/xen/xend/balloon.py              |   17
-rw-r--r--  tools/python/xen/xend/server/XMLRPCServer.py  |    6
-rw-r--r--  tools/python/xen/xm/main.py                   |  224
-rw-r--r--  xen/arch/ia64/xen/mm.c                        |    7
-rw-r--r--  xen/arch/x86/mm.c                             |   36
-rw-r--r--  xen/arch/x86/setup.c                          |    3
-rw-r--r--  xen/common/Makefile                           |    5
-rw-r--r--  xen/common/compat/Makefile                    |    1
-rw-r--r--  xen/common/compat/tmem_xen.c                  |   26
-rw-r--r--  xen/common/domain.c                           |    4
-rw-r--r--  xen/common/lzo.c                              |  518
-rw-r--r--  xen/common/memory.c                           |   11
-rw-r--r--  xen/common/page_alloc.c                       |   43
-rw-r--r--  xen/common/radix-tree.c                       |  448
-rw-r--r--  xen/common/rbtree.c                           |  398
-rw-r--r--  xen/common/spinlock.c                         |   12
-rw-r--r--  xen/common/tmem.c                             | 2109
-rw-r--r--  xen/common/tmem_xen.c                         |  334
-rw-r--r--  xen/common/xmalloc_tlsf.c                     |   35
-rw-r--r--  xen/include/Makefile                          |    1
-rw-r--r--  xen/include/asm-ia64/mm.h                     |    2
-rw-r--r--  xen/include/asm-x86/mm.h                      |    2
-rw-r--r--  xen/include/asm-x86/spinlock.h                |   21
-rw-r--r--  xen/include/public/tmem.h                     |  112
-rw-r--r--  xen/include/public/xen.h                      |    1
-rw-r--r--  xen/include/xen/config.h                      |    1
-rw-r--r--  xen/include/xen/hash.h                        |   58
-rw-r--r--  xen/include/xen/hypercall.h                   |    5
-rw-r--r--  xen/include/xen/lib.h                         |    2
-rw-r--r--  xen/include/xen/lzo.h                         |   44
-rw-r--r--  xen/include/xen/mm.h                          |   32
-rw-r--r--  xen/include/xen/radix-tree.h                  |   78
-rw-r--r--  xen/include/xen/rbtree.h                      |   82
-rw-r--r--  xen/include/xen/sched.h                       |    3
-rw-r--r--  xen/include/xen/spinlock.h                    |    4
-rw-r--r--  xen/include/xen/tmem.h                        |   16
-rw-r--r--  xen/include/xen/tmem_xen.h                    |  356
-rw-r--r--  xen/include/xen/xmalloc.h                     |    8
-rw-r--r--  xen/include/xlat.lst                          |    3
49 files changed, 5638 insertions, 39 deletions
diff --git a/.hgignore b/.hgignore
index fb16719e24..1b798d15f4 100644
--- a/.hgignore
+++ b/.hgignore
@@ -181,6 +181,7 @@
^tools/misc/xc_shadow$
^tools/misc/xen_cpuperf$
^tools/misc/xen-detect$
+^tools/misc/xen-tmem-list-parse$
^tools/misc/xenperf$
^tools/misc/xenpm$
^tools/pygrub/build/.*$
diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
index acd7067e57..e984df9088 100644
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -21,6 +21,7 @@ CTRL_SRCS-y += xc_tbuf.c
CTRL_SRCS-y += xc_pm.c
CTRL_SRCS-y += xc_cpu_hotplug.c
CTRL_SRCS-y += xc_resume.c
+CTRL_SRCS-y += xc_tmem.c
CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c
CTRL_SRCS-$(CONFIG_SunOS) += xc_solaris.c
diff --git a/tools/libxc/xc_tmem.c b/tools/libxc/xc_tmem.c
new file mode 100644
index 0000000000..ba618ef98c
--- /dev/null
+++ b/tools/libxc/xc_tmem.c
@@ -0,0 +1,83 @@
+/******************************************************************************
+ * xc_tmem.c
+ *
+ * Copyright (C) 2008 Oracle Corp.
+ */
+
+#include "xc_private.h"
+#include <xen/tmem.h>
+
+static int do_tmem_op(int xc, tmem_op_t *op)
+{
+ int ret;
+ DECLARE_HYPERCALL;
+
+ hypercall.op = __HYPERVISOR_tmem_op;
+ hypercall.arg[0] = (unsigned long)op;
+ if (lock_pages(op, sizeof(*op)) != 0)
+ {
+ PERROR("Could not lock memory for Xen hypercall");
+ return -EFAULT;
+ }
+ if ((ret = do_xen_hypercall(xc, &hypercall)) < 0)
+ {
+ if ( errno == EACCES )
+ DPRINTF("tmem operation failed -- need to"
+ " rebuild the user-space tool set?\n");
+ }
+ unlock_pages(op, sizeof(*op));
+
+ return ret;
+}
+
+int xc_tmem_control(int xc,
+ int32_t pool_id,
+ uint32_t subop,
+ uint32_t cli_id,
+ uint32_t arg1,
+ uint32_t arg2,
+ void *buf)
+{
+ tmem_op_t op;
+ int rc;
+
+ op.cmd = TMEM_CONTROL;
+ op.pool_id = pool_id;
+ op.subop = subop;
+ op.cli_id = cli_id;
+ op.arg1 = arg1;
+ op.arg2 = arg2;
+ op.buf.p = buf;
+
+ if (subop == TMEMC_LIST) {
+ if ((arg1 != 0) && (lock_pages(buf, arg1) != 0))
+ {
+ PERROR("Could not lock memory for Xen hypercall");
+ return -ENOMEM;
+ }
+ }
+
+#ifdef VALGRIND
+ if (arg1 != 0)
+ memset(buf, 0, arg1);
+#endif
+
+ rc = do_tmem_op(xc, &op);
+
+ if (subop == TMEMC_LIST) {
+ if (arg1 != 0)
+ unlock_pages(buf, arg1);
+ }
+
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index c9b1866b60..8a54d5775d 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -1267,4 +1267,15 @@ int xc_get_vcpu_migration_delay(int xc_handle, uint32_t *value);
int xc_get_cpuidle_max_cstate(int xc_handle, uint32_t *value);
int xc_set_cpuidle_max_cstate(int xc_handle, uint32_t value);
+/**
+ * tmem operations
+ */
+int xc_tmem_control(int xc,
+ int32_t pool_id,
+ uint32_t subop,
+ uint32_t cli_id,
+ uint32_t arg1,
+ uint32_t arg2,
+ void *buf);
+
#endif /* XENCTRL_H */
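For orientation, a minimal usage sketch of the libxc entry point declared above (hypothetical, not part of this changeset). It assumes the int-valued xc_interface_open()/xc_interface_close() handles of this libxc generation, and takes TMEMC_LIST plus the "-1 == all" convention and the 32 KiB buffer cap from the Python binding further down; its raw output is what tools/misc/xen-tmem-list-parse reformats.

    /* Hypothetical caller sketch -- not part of the patch. */
    #include <stdio.h>
    #include <string.h>
    #include <xenctrl.h>
    #include <xen/tmem.h>

    int main(void)
    {
        char buf[32768];
        int xc = xc_interface_open();        /* int handle in this libxc era */

        if ( xc < 0 )
            return 1;
        memset(buf, 0, sizeof(buf));
        /* pool_id == -1 and cli_id == -1 mean "all"; arg1 is the buffer size,
         * arg2 carries the use_long flag (0 = terse), as XendNode.tmem_list does. */
        if ( xc_tmem_control(xc, -1, TMEMC_LIST, -1, sizeof(buf), 0, buf) >= 0 )
            printf("%s", buf);
        xc_interface_close(xc);
        return 0;
    }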
diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index c309a3f106..b6a735bfdd 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -10,7 +10,7 @@ CFLAGS += $(INCLUDES)
HDRS = $(wildcard *.h)
-TARGETS-y := xenperf xenpm
+TARGETS-y := xenperf xenpm xen-tmem-list-parse
TARGETS-$(CONFIG_X86) += xen-detect
TARGETS := $(TARGETS-y)
@@ -22,7 +22,7 @@ INSTALL_BIN-y := xencons
INSTALL_BIN-$(CONFIG_X86) += xen-detect
INSTALL_BIN := $(INSTALL_BIN-y)
-INSTALL_SBIN-y := xm xen-bugtool xen-python-path xend xenperf xsview xenpm
+INSTALL_SBIN-y := xm xen-bugtool xen-python-path xend xenperf xsview xenpm xen-tmem-list-parse
INSTALL_SBIN := $(INSTALL_SBIN-y)
DEFAULT_PYTHON_PATH := $(shell $(XEN_ROOT)/tools/python/get-path)
diff --git a/tools/misc/xen-tmem-list-parse.c b/tools/misc/xen-tmem-list-parse.c
new file mode 100644
index 0000000000..383daee158
--- /dev/null
+++ b/tools/misc/xen-tmem-list-parse.c
@@ -0,0 +1,288 @@
+/*
+ * Parse output from tmem-list and reformat it to be human-readable
+ *
+ * NOTE: NEVER delete a parse call as this file documents backwards
+ * compatibility for older versions of tmem-list and we don't want to
+ * accidentally reuse an old tag
+ *
+ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+
+#define BUFSIZE 4096
+#define PAGE_SIZE 4096
+
+unsigned long long parse(char *s,char *match)
+{
+ char *s1 = strstr(s,match);
+ unsigned long long ret;
+
+ if ( s1 == NULL )
+ return 0LL;
+ s1 += 2;
+ if ( *s1++ != ':' )
+ return 0LL;
+ sscanf(s1,"%llu",&ret);
+ return ret;
+}
+
+unsigned long long parse2(char *s,char *match1, char *match2)
+{
+ char match[3];
+ match[0] = *match1;
+ match[1] = *match2;
+ match[2] = '\0';
+ return parse(s,match);
+}
+
+void parse_string(char *s,char *match, char *buf, int len)
+{
+ char *s1 = strstr(s,match);
+ int i;
+
+ if ( s1 == NULL )
+ return;
+ s1 += 2;
+ if ( *s1++ != ':' )
+ return;
+ for ( i = 0; i < len; i++ )
+ *buf++ = *s1++;
+}
+
+void parse_sharers(char *s, char *match, char *buf, int len)
+{
+ char *s1 = strstr(s,match);
+ char *b = buf;
+
+ if ( s1 == NULL )
+ return;
+ while ( s1 )
+ {
+ s1 += 2;
+ if (*s1++ != ':')
+ return;
+ while (*s1 >= '0' && *s1 <= '9')
+ *b++ = *s1++;
+ *b++ = ',';
+ s1 = strstr(s1,match);
+ }
+ if ( b != buf )
+ *--b = '\0';
+}
+
+void parse_global(char *s)
+{
+ unsigned long long total_ops = parse(s,"Tt");
+ unsigned long long errored_ops = parse(s,"Te");
+ unsigned long long failed_copies = parse(s,"Cf");
+ unsigned long long alloc_failed = parse(s,"Af");
+ unsigned long long alloc_page_failed = parse(s,"Pf");
+ unsigned long long avail_pages = parse(s,"Ta");
+ unsigned long long low_on_memory = parse(s,"Lm");
+ unsigned long long evicted_pgs = parse(s,"Et");
+ unsigned long long evict_attempts = parse(s,"Ea");
+ unsigned long long relinq_pgs = parse(s,"Rt");
+ unsigned long long relinq_attempts = parse(s,"Ra");
+ unsigned long long max_evicts_per_relinq = parse(s,"Rx");
+ unsigned long long total_flush_pool = parse(s,"Fp");
+ unsigned long long global_eph_count = parse(s,"Ec");
+ unsigned long long global_eph_max = parse(s,"Em");
+ unsigned long long obj_count = parse(s,"Oc");
+ unsigned long long obj_max = parse(s,"Om");
+ unsigned long long rtree_node_count = parse(s,"Nc");
+ unsigned long long rtree_node_max = parse(s,"Nm");
+ unsigned long long pgp_count = parse(s,"Pc");
+ unsigned long long pgp_max = parse(s,"Pm");
+
+ printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
+ total_ops, errored_ops, avail_pages);
+ printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
+ "nodes=%llu (max=%llu)\n",
+ obj_count, obj_max, pgp_count, pgp_max,
+ rtree_node_count, rtree_node_max);
+ printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
+ "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
+ "max_evicts_per_relinq=%llu, flush_pools=%llu, "
+ "eph_count=%llu, eph_max=%llu\n",
+ failed_copies, alloc_failed, alloc_page_failed, low_on_memory,
+ evicted_pgs, evict_attempts, relinq_pgs, relinq_attempts,
+ max_evicts_per_relinq, total_flush_pool,
+ global_eph_count, global_eph_max);
+}
+
+#define PARSE_CYC_COUNTER(s,x,prefix) unsigned long long \
+ x##_count = parse2(s,prefix,"n"), \
+ x##_sum_cycles = parse2(s,prefix,"t"), \
+ x##_max_cycles = parse2(s,prefix,"x"), \
+ x##_min_cycles = parse2(s,prefix,"m")
+#define PRINTF_CYC_COUNTER(x,text) \
+ if (x##_count) printf(text" avg=%llu, max=%llu, " \
+ "min=%llu, samples=%llu\n", \
+ x##_sum_cycles ? (x##_sum_cycles/x##_count) : 0, \
+ x##_max_cycles, x##_min_cycles, x##_count)
+
+void parse_time_stats(char *s)
+{
+ PARSE_CYC_COUNTER(s,succ_get,"G");
+ PARSE_CYC_COUNTER(s,succ_put,"P");
+ PARSE_CYC_COUNTER(s,non_succ_get,"g");
+ PARSE_CYC_COUNTER(s,non_succ_put,"p");
+ PARSE_CYC_COUNTER(s,flush,"F");
+ PARSE_CYC_COUNTER(s,flush_obj,"O");
+ PARSE_CYC_COUNTER(s,pg_copy,"C");
+ PARSE_CYC_COUNTER(s,compress,"c");
+ PARSE_CYC_COUNTER(s,decompress,"d");
+
+ PRINTF_CYC_COUNTER(succ_get,"succ get cycles:");
+ PRINTF_CYC_COUNTER(succ_put,"succ put cycles:");
+ PRINTF_CYC_COUNTER(non_succ_get,"failed get cycles:");
+ PRINTF_CYC_COUNTER(non_succ_put,"failed put cycles:");
+ PRINTF_CYC_COUNTER(flush,"flush cycles:");
+ PRINTF_CYC_COUNTER(flush_obj,"flush_obj cycles:");
+ PRINTF_CYC_COUNTER(pg_copy,"page copy cycles:");
+ PRINTF_CYC_COUNTER(compress,"compression cycles:");
+ PRINTF_CYC_COUNTER(decompress,"decompression cycles:");
+}
+
+void parse_client(char *s)
+{
+ unsigned long cli_id = parse(s,"CI");
+ unsigned long weight = parse(s,"ww");
+ unsigned long cap = parse(s,"ca");
+ unsigned long compress = parse(s,"co");
+ unsigned long frozen = parse(s,"fr");
+ unsigned long long eph_count = parse(s,"Ec");
+ unsigned long long max_eph_count = parse(s,"Em");
+ unsigned long long compressed_pages = parse(s,"cp");
+ unsigned long long compressed_sum_size = parse(s,"cb");
+ unsigned long long compress_poor = parse(s,"cn");
+ unsigned long long compress_nomem = parse(s,"cm");
+
+ printf("domid%lu: weight=%lu,cap=%lu,compress=%d,frozen=%d,"
+ "eph_count=%llu,max_eph=%llu,"
+ "compression ratio=%lu%% (samples=%llu,poor=%llu,nomem=%llu)\n",
+ cli_id, weight, cap, compress?1:0, frozen?1:0,
+ eph_count, max_eph_count,
+ compressed_pages ? (long)((compressed_sum_size*100LL) /
+ (compressed_pages*PAGE_SIZE)) : 0,
+ compressed_pages, compress_poor, compress_nomem);
+
+}
+
+void parse_pool(char *s)
+{
+ char pool_type[3];
+ unsigned long cli_id = parse(s,"CI");
+ unsigned long pool_id = parse(s,"PI");
+ unsigned long long pgp_count = parse(s,"Pc");
+ unsigned long long max_pgp_count = parse(s,"Pm");
+ unsigned long long obj_count = parse(s,"Oc");
+ unsigned long long max_obj_count = parse(s,"Om");
+ unsigned long long objnode_count = parse(s,"Nc");
+ unsigned long long max_objnode_count = parse(s,"Nm");
+ unsigned long long good_puts = parse(s,"ps");
+ unsigned long long puts = parse(s,"pt");
+ unsigned long long no_mem_puts = parse(s,"px");
+ unsigned long long dup_puts_flushed = parse(s,"pd");
+ unsigned long long dup_puts_replaced = parse(s,"pr");
+ unsigned long long found_gets = parse(s,"gs");
+ unsigned long long gets = parse(s,"gt");
+ unsigned long long flushs_found = parse(s,"fs");
+ unsigned long long flushs = parse(s,"ft");
+ unsigned long long flush_objs_found = parse(s,"os");
+ unsigned long long flush_objs = parse(s,"ot");
+
+ parse_string(s,"PT",pool_type,2);
+ printf("domid%lu,id%lu[%s]:pgp=%llu(max=%llu) obj=%llu(%llu) "
+ "objnode=%llu(%llu) puts=%llu/%llu/%llu(dup=%llu/%llu) "
+ "gets=%llu/%llu(%llu%%) "
+ "flush=%llu/%llu flobj=%llu/%llu\n",
+ cli_id, pool_id, pool_type,
+ pgp_count, max_pgp_count, obj_count, max_obj_count,
+ objnode_count, max_objnode_count,
+ good_puts, puts, no_mem_puts,
+ dup_puts_flushed, dup_puts_replaced,
+ found_gets, gets,
+ gets ? (found_gets*100LL)/gets : 0,
+ flushs_found, flushs, flush_objs_found, flush_objs);
+
+}
+
+void parse_shared_pool(char *s)
+{
+ char pool_type[3];
+ char buf[BUFSIZE];
+ unsigned long pool_id = parse(s,"PI");
+ unsigned long long uid0 = parse(s,"U0");
+ unsigned long long uid1 = parse(s,"U1");
+ unsigned long long pgp_count = parse(s,"Pc");
+ unsigned long long max_pgp_count = parse(s,"Pm");
+ unsigned long long obj_count = parse(s,"Oc");
+ unsigned long long max_obj_count = parse(s,"Om");
+ unsigned long long objnode_count = parse(s,"Nc");
+ unsigned long long max_objnode_count = parse(s,"Nm");
+ unsigned long long good_puts = parse(s,"ps");
+ unsigned long long puts = parse(s,"pt");
+ unsigned long long no_mem_puts = parse(s,"px");
+ unsigned long long dup_puts_flushed = parse(s,"pd");
+ unsigned long long dup_puts_replaced = parse(s,"pr");
+ unsigned long long found_gets = parse(s,"gs");
+ unsigned long long gets = parse(s,"gt");
+ unsigned long long flushs_found = parse(s,"fs");
+ unsigned long long flushs = parse(s,"ft");
+ unsigned long long flush_objs_found = parse(s,"os");
+ unsigned long long flush_objs = parse(s,"ot");
+
+ parse_string(s,"PT",pool_type,2);
+ parse_sharers(s,"SC",buf,BUFSIZE);
+ printf("poolid=%lu[%s] uuid=%llu.%llu, shared-by:%s: "
+ "pgp=%llu(max=%llu) obj=%llu(%llu) "
+ "objnode=%llu(%llu) puts=%llu/%llu/%llu(dup=%llu/%llu) "
+ "gets=%llu/%llu(%llu%%) "
+ "flush=%llu/%llu flobj=%llu/%llu\n",
+ pool_id, pool_type, uid0, uid1, buf,
+ pgp_count, max_pgp_count, obj_count, max_obj_count,
+ objnode_count, max_objnode_count,
+ good_puts, puts, no_mem_puts,
+ dup_puts_flushed, dup_puts_replaced,
+ found_gets, gets,
+ gets ? (found_gets*100LL)/gets : 0,
+ flushs_found, flushs, flush_objs_found, flush_objs);
+}
+
+int main(int ac, char **av)
+{
+ char *p, c;
+ char buf[BUFSIZE];
+
+ while ( (p = fgets(buf,BUFSIZE,stdin)) != NULL )
+ {
+ c = *p++;
+ if ( *p++ != '=' )
+ continue;
+ switch ( c )
+ {
+ case 'G':
+ parse_global(p);
+ break;
+ case 'T':
+ parse_time_stats(p);
+ break;
+ case 'C':
+ parse_client(p);
+ break;
+ case 'P':
+ parse_pool(p);
+ break;
+ case 'S':
+ parse_shared_pool(p);
+ break;
+ default:
+ continue;
+ }
+ }
+ return 0;
+}
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 75a19d3d6b..5a0bf1807c 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -19,6 +19,7 @@
#include "xenctrl.h"
#include <xen/elfnote.h>
+#include <xen/tmem.h>
#include "xc_dom.h"
#include <xen/hvm/hvm_info_table.h>
#include <xen/hvm/params.h>
@@ -1506,6 +1507,50 @@ static PyObject *dom_op(XcObject *self, PyObject *args,
return zero;
}
+static PyObject *pyxc_tmem_control(XcObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ int32_t pool_id;
+ uint32_t subop;
+ uint32_t cli_id;
+ uint32_t arg1;
+ uint32_t arg2;
+ char *buf;
+ char _buffer[32768], *buffer = _buffer;
+ int rc;
+
+ static char *kwd_list[] = { "pool_id", "subop", "cli_id", "arg1", "arg2", "buf", NULL };
+
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiiis", kwd_list,
+ &pool_id, &subop, &cli_id, &arg1, &arg2, &buf) )
+ return NULL;
+
+ if ( (subop == TMEMC_LIST) && (arg1 > 32768) )
+ arg1 = 32768;
+
+ if ( (rc = xc_tmem_control(self->xc_handle, pool_id, subop, cli_id, arg1, arg2, buffer)) < 0 )
+ return Py_BuildValue("i", rc);
+
+ switch (subop) {
+ case TMEMC_LIST:
+ return Py_BuildValue("s", buffer);
+ case TMEMC_FLUSH:
+ return Py_BuildValue("i", rc);
+ case TMEMC_THAW:
+ case TMEMC_FREEZE:
+ case TMEMC_DESTROY:
+ case TMEMC_SET_WEIGHT:
+ case TMEMC_SET_CAP:
+ case TMEMC_SET_COMPRESS:
+ default:
+ break;
+ }
+
+ Py_INCREF(zero);
+ return zero;
+}
+
static PyMethodDef pyxc_methods[] = {
{ "handle",
(PyCFunction)pyxc_handle,
@@ -1965,6 +2010,18 @@ static PyMethodDef pyxc_methods[] = {
" dom [int]: Identifier of domain.\n" },
#endif
+ { "tmem_control",
+ (PyCFunction)pyxc_tmem_control,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Do various control on a tmem pool.\n"
+ " pool_id [int]: Identifier of the tmem pool (-1 == all).\n"
+ " subop [int]: Supplementary Operation.\n"
+ " cli_id [int]: Client identifier (-1 == all).\n"
+ " arg1 [int]: Argument.\n"
+ " arg2 [int]: Argument.\n"
+ " buf [str]: Buffer.\n\n"
+ "Returns: [int] 0 or [str] tmem info on success; exception on error.\n" },
+
{ NULL, NULL, 0, NULL }
};
diff --git a/tools/python/xen/xend/XendAPI.py b/tools/python/xen/xend/XendAPI.py
index 6dda3a9c79..126db6421c 100644
--- a/tools/python/xen/xend/XendAPI.py
+++ b/tools/python/xen/xend/XendAPI.py
@@ -925,7 +925,15 @@ class XendAPI(object):
('dmesg', 'String'),
('dmesg_clear', 'String'),
('get_log', 'String'),
- ('send_debug_keys', None)]
+ ('send_debug_keys', None),
+ ('tmem_thaw', None),
+ ('tmem_freeze', None),
+ ('tmem_flush', None),
+ ('tmem_destroy', None),
+ ('tmem_list', None),
+ ('tmem_set_weight', None),
+ ('tmem_set_cap', None),
+ ('tmem_set_compress', None)]
host_funcs = [('get_by_name_label', None),
('list_methods', None)]
@@ -1061,6 +1069,70 @@ class XendAPI(object):
'PSCSIs': XendPSCSI.get_all()}
return xen_api_success(record)
+ def host_tmem_thaw(self, _, host_ref, cli_id):
+ node = XendNode.instance()
+ try:
+ node.tmem_thaw(cli_id)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success_void()
+
+ def host_tmem_freeze(self, _, host_ref, cli_id):
+ node = XendNode.instance()
+ try:
+ node.tmem_freeze(cli_id)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success_void()
+
+ def host_tmem_flush(self, _, host_ref, cli_id, pages):
+ node = XendNode.instance()
+ try:
+ node.tmem_flush(cli_id, pages)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success_void()
+
+ def host_tmem_destroy(self, _, host_ref, cli_id):
+ node = XendNode.instance()
+ try:
+ node.tmem_destroy(cli_id)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success_void()
+
+ def host_tmem_list(self, _, host_ref, cli_id, use_long):
+ node = XendNode.instance()
+ try:
+ info = node.tmem_list(cli_id, use_long)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success(info)
+
+ def host_tmem_set_weight(self, _, host_ref, cli_id, value):
+ node = XendNode.instance()
+ try:
+ node.tmem_set_weight(cli_id, value)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success_void()
+
+ def host_tmem_set_cap(self, _, host_ref, cli_id, value):
+ node = XendNode.instance()
+ try:
+ node.tmem_set_cap(cli_id, value)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success_void()
+
+ def host_tmem_set_compress(self, _, host_ref, cli_id, value):
+ node = XendNode.instance()
+ try:
+ node.tmem_set_compress(cli_id, value)
+ except Exception, e:
+ return xen_api_error(e)
+ return xen_api_success_void()
+
# class methods
def host_get_all(self, session):
return xen_api_success((XendNode.instance().uuid,))
diff --git a/tools/python/xen/xend/XendConstants.py b/tools/python/xen/xend/XendConstants.py
index b0a7c66abb..c25ba2935a 100644
--- a/tools/python/xen/xend/XendConstants.py
+++ b/tools/python/xen/xend/XendConstants.py
@@ -141,3 +141,29 @@ XS_VMROOT = "/vm/"
NR_PCI_DEV = 32
AUTO_PHP_SLOT = NR_PCI_DEV
AUTO_PHP_SLOT_STR = "%02x" % NR_PCI_DEV
+
+#
+# tmem
+#
+
+TMEM_CONTROL = 0
+TMEM_NEW_POOL = 1
+TMEM_DESTROY_POOL = 2
+TMEM_NEW_PAGE = 3
+TMEM_PUT_PAGE = 4
+TMEM_GET_PAGE = 5
+TMEM_FLUSH_PAGE = 6
+TMEM_FLUSH_OBJECT = 7
+TMEM_READ = 8
+TMEM_WRITE = 9
+TMEM_XCHG = 10
+
+TMEMC_THAW = 0
+TMEMC_FREEZE = 1
+TMEMC_FLUSH = 2
+TMEMC_DESTROY = 3
+TMEMC_LIST = 4
+TMEMC_SET_WEIGHT = 5
+TMEMC_SET_CAP = 6
+TMEMC_SET_COMPRESS = 7
+
diff --git a/tools/python/xen/xend/XendNode.py b/tools/python/xen/xend/XendNode.py
index d1c4055ba4..34682b90aa 100644
--- a/tools/python/xen/xend/XendNode.py
+++ b/tools/python/xen/xend/XendNode.py
@@ -26,6 +26,7 @@ from xen.util import pci as PciUtil
from xen.util import vscsi_util
from xen.xend import XendAPIStore
from xen.xend import osdep
+from xen.xend.XendConstants import *
import uuid, arch
from XendPBD import XendPBD
@@ -940,6 +941,69 @@ class XendNode:
def info_dict(self):
return dict(self.info())
+ # tmem
+ def tmem_list(self, cli_id, use_long):
+ pool_id = -1
+ subop = TMEMC_LIST
+ arg1 = 32768
+ arg2 = use_long
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+ def tmem_thaw(self, cli_id):
+ pool_id = -1
+ subop = TMEMC_THAW
+ arg1 = 0
+ arg2 = 0
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+ def tmem_freeze(self, cli_id):
+ pool_id = -1
+ subop = TMEMC_FREEZE
+ arg1 = 0
+ arg2 = 0
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+ def tmem_flush(self, cli_id, pages):
+ pool_id = -1
+ subop = TMEMC_FLUSH
+ arg1 = pages
+ arg2 = 0
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+ def tmem_destroy(self, cli_id):
+ pool_id = -1
+ subop = TMEMC_DESTROY
+ arg1 = 0
+ arg2 = 0
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+ def tmem_set_weight(self, cli_id, arg1):
+ pool_id = -1
+ subop = TMEMC_SET_WEIGHT
+ arg2 = 0
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+ def tmem_set_cap(self, cli_id, arg1):
+ pool_id = -1
+ subop = TMEMC_SET_CAP
+ arg2 = 0
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+ def tmem_set_compress(self, cli_id, arg1):
+ pool_id = -1
+ subop = TMEMC_SET_COMPRESS
+ arg2 = 0
+ buf = ''
+ return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+
+
def instance():
global inst
try:
diff --git a/tools/python/xen/xend/balloon.py b/tools/python/xen/xend/balloon.py
index b31398c745..42c8ea0aa7 100644
--- a/tools/python/xen/xend/balloon.py
+++ b/tools/python/xen/xend/balloon.py
@@ -26,6 +26,7 @@ import XendOptions
from XendLogging import log
from XendError import VmError
import osdep
+from xen.xend.XendConstants import *
RETRY_LIMIT = 20
RETRY_LIMIT_INCR = 5
@@ -109,6 +110,9 @@ def free(need_mem, dominfo):
last_free = None
rlimit = RETRY_LIMIT
+ # stop tmem from absorbing any more memory (must THAW when done!)
+ xc.tmem_control(0,TMEMC_FREEZE,-1, 0, 0, "")
+
# If unreasonable memory size is required, we give up waiting
# for ballooning or scrubbing, as if had retried.
physinfo = xc.physinfo()
@@ -122,6 +126,17 @@ def free(need_mem, dominfo):
if need_mem >= max_free_mem:
retries = rlimit
+ freeable_mem = free_mem + scrub_mem
+ if freeable_mem < need_mem and need_mem < max_free_mem:
+ # flush memory from tmem to scrub_mem and reobtain physinfo
+ need_tmem_kb = need_mem - freeable_mem
+ tmem_kb = xc.tmem_control(0,TMEMC_FLUSH,-1, need_tmem_kb, 0, "")
+ log.debug("Balloon: tmem relinquished %d KiB of %d KiB requested.",
+ tmem_kb, need_tmem_kb)
+ physinfo = xc.physinfo()
+ free_mem = physinfo['free_memory']
+ scrub_mem = physinfo['scrub_memory']
+
# Check whether current machine is a numa system and the new
# created hvm has all its vcpus in the same node, if all the
# conditions above are fit. We will wait until all the pages
@@ -216,4 +231,6 @@ def free(need_mem, dominfo):
' be shrunk any further'))
finally:
+ # allow tmem to accept pages again
+ xc.tmem_control(0,TMEMC_THAW,-1, 0, 0, "")
del xc
diff --git a/tools/python/xen/xend/server/XMLRPCServer.py b/tools/python/xen/xend/server/XMLRPCServer.py
index fb9bdfee34..93c6caef1b 100644
--- a/tools/python/xen/xend/server/XMLRPCServer.py
+++ b/tools/python/xen/xend/server/XMLRPCServer.py
@@ -198,7 +198,11 @@ class XMLRPCServer:
self.server.register_function(fn, "xend.domain.%s" % name[7:])
# Functions in XendNode and XendDmesg
- for type, lst, n in [(XendNode, ['info', 'pciinfo', 'send_debug_keys'],
+ for type, lst, n in [(XendNode,
+ ['info', 'pciinfo', 'send_debug_keys',
+ 'tmem_list', 'tmem_freeze', 'tmem_thaw',
+ 'tmem_flush', 'tmem_destroy', 'tmem_set_weight',
+ 'tmem_set_cap', 'tmem_set_compress'],
'node'),
(XendDmesg, ['info', 'clear'], 'node.dmesg')]:
inst = type.instance()
diff --git a/tools/python/xen/xm/main.py b/tools/python/xen/xm/main.py
index b7897b248a..e55fab42e5 100644
--- a/tools/python/xen/xm/main.py
+++ b/tools/python/xen/xm/main.py
@@ -199,6 +199,15 @@ SUBCOMMAND_HELP = {
'scsi-list' : ('<Domain> [--long]',
'List all SCSI devices currently attached.'),
+ # tmem
+ 'tmem-list' : ('[-l|--long] [<Domain>|-a|--all]', 'List tmem pools.'),
+ 'tmem-thaw' : ('[<Domain>|-a|--all]', 'Thaw tmem pools.'),
+ 'tmem-freeze' : ('[<Domain>|-a|--all]', 'Freeze tmem pools.'),
+ 'tmem-destroy' : ('[<Domain>|-a|--all]', 'Destroy tmem pools.'),
+ 'tmem-set' : ('[<Domain>|-a|--all] [weight=<weight>] [cap=<cap>] '
+ '[compress=<compress>]',
+ 'Change tmem settings.'),
+
# security
'addlabel' : ('<label> {dom <ConfigFile>|res <resource>|mgt <managed domain>}\n'
@@ -283,6 +292,21 @@ SUBCOMMAND_OPTIONS = {
'info': (
('-c', '--config', 'List Xend configuration parameters'),
),
+ 'tmem-list': (
+ ('-l', '--long', 'List tmem stats.'),
+ ),
+ 'tmem-thaw': (
+ ('-a', '--all', 'Thaw all tmem.'),
+ ),
+ 'tmem-freeze': (
+ ('-a', '--all', 'Freeze all tmem.'),
+ ),
+ 'tmem-destroy': (
+ ('-a', '--all', 'Destroy all tmem.'),
+ ),
+ 'tmem-set': (
+ ('-a', '--all', 'Operate on all tmem.'),
+ ),
}
common_commands = [
@@ -397,9 +421,17 @@ acm_commands = [
"getpolicy",
]
+tmem_commands = [
+ "tmem-list",
+ "tmem-thaw",
+ "tmem-freeze",
+ "tmem-destroy",
+ "tmem-set",
+ ]
+
all_commands = (domain_commands + host_commands + scheduler_commands +
device_commands + vnet_commands + acm_commands +
- ['shell', 'event-monitor'])
+ tmem_commands + ['shell', 'event-monitor'])
##
@@ -2837,7 +2869,188 @@ def xm_network_show(args):
print format2 % r
-
+def xm_tmem_list(args):
+ try:
+ (options, params) = getopt.gnu_getopt(args, 'la', ['long','all'])
+ except getopt.GetoptError, opterr:
+ err(opterr)
+ usage('tmem-list')
+
+ use_long = False
+ for (k, v) in options:
+ if k in ['-l', '--long']:
+ use_long = True
+
+ all = False
+ for (k, v) in options:
+ if k in ['-a', '--all']:
+ all = True
+
+ if not all and len(params) == 0:
+ err('You must specify -a or --all or a domain id.')
+ usage('tmem-list')
+
+ if all:
+ domid = -1
+ else:
+ try:
+ domid = int(params[0])
+ params = params[1:]
+ except:
+ err('Unrecognized domain id: %s' % params[0])
+ usage('tmem-list')
+
+ if serverType == SERVER_XEN_API:
+ print server.xenapi.host.tmem_list(domid,use_long)
+ else:
+ print server.xend.node.tmem_list(domid,use_long)
+
+def parse_tmem_args(args, name):
+ try:
+ (options, params) = getopt.gnu_getopt(args, 'a', ['all'])
+ except getopt.GetoptError, opterr:
+ err(opterr)
+ usage(name)
+
+ all = False
+ for (k, v) in options:
+ if k in ['-a', '--all']:
+ all = True
+
+ if not all and len(params) == 0:
+ err('You must specify -a or --all or a domain id.')
+ usage(name)
+
+ if all:
+ domid = -1
+ else:
+ try:
+ domid = int(params[0])
+ params = params[1:]
+ except:
+ err('Unrecognized domain id: %s' % params[0])
+ usage(name)
+
+ return domid, params
+
+def xm_tmem_destroy(args):
+ (domid, _) = parse_tmem_args(args, 'tmem-destroy')
+ if serverType == SERVER_XEN_API:
+ server.xenapi.host.tmem_destroy(domid)
+ else:
+ server.xend.node.tmem_destroy(domid)
+
+def xm_tmem_thaw(args):
+ (domid, _) = parse_tmem_args(args, 'tmem-thaw')
+ if serverType == SERVER_XEN_API:
+ server.xenapi.host.tmem_thaw(domid)
+ else:
+ server.xend.node.tmem_thaw(domid)
+
+def xm_tmem_freeze(args):
+ (domid, _) = parse_tmem_args(args, 'tmem-freeze')
+ if serverType == SERVER_XEN_API:
+ server.xenapi.host.tmem_freeze(domid)
+ else:
+ server.xend.node.tmem_freeze(domid)
+
+def xm_tmem_flush(args):
+ try:
+ (options, params) = getopt.gnu_getopt(args, 'a', ['all'])
+ except getopt.GetoptError, opterr:
+ err(opterr)
+ usage('tmem-flush')
+
+ all = False
+ for (k, v) in options:
+ if k in ['-a', '--all']:
+ all = True
+
+ if not all and len(params) == 0:
+ err('You must specify -a or --all or a domain id.')
+ usage('tmem-flush')
+
+ if all:
+ domid = -1
+ else:
+ try:
+ domid = int(params[0])
+ params = params[1:]
+ except:
+ err('Unrecognized domain id: %s' % params[0])
+ usage('tmem-flush')
+
+ pages = -1
+ for (k, v) in options:
+ if k in ['-p', '--pages']:
+ pages = v
+
+ if serverType == SERVER_XEN_API:
+ server.xenapi.host.tmem_flush(domid,pages)
+ else:
+ server.xend.node.tmem_flush(domid,pages)
+
+def xm_tmem_set(args):
+ try:
+ (options, params) = getopt.gnu_getopt(args, 'a', ['all'])
+ except getopt.GetoptError, opterr:
+ err(opterr)
+ usage('tmem-set')
+
+ all = False
+ for (k, v) in options:
+ if k in ['-a', '--all']:
+ all = True
+
+ if not all and len(params) == 0:
+ err('You must specify -a or --all or a domain id.')
+ usage('tmem-set')
+
+ if all:
+ domid = -1
+ else:
+ try:
+ domid = int(params[0])
+ params = params[1:]
+ except:
+ err('Unrecognized domain id: %s' % params[0])
+ usage('tmem-set')
+
+ weight = None
+ cap = None
+ compress = None
+ for item in params:
+ if item.startswith('weight='):
+ try:
+ weight = int(item[7:])
+ except:
+ err('weight should be an integer')
+ usage('tmem-set')
+ if item.startswith('cap='):
+ cap = int(item[4:])
+ if item.startswith('compress='):
+ compress = int(item[9:])
+
+ if weight is None and cap is None and compress is None:
+ err('Unrecognized tmem configuration option: %s' % item)
+ usage('tmem-set')
+
+ if serverType == SERVER_XEN_API:
+ if weight is not None:
+ server.xenapi.host.tmem_set_weight(domid, weight)
+ if cap is not None:
+ server.xenapi.host.tmem_set_cap(domid, cap)
+ if compress is not None:
+ server.xenapi.host.tmem_set_compress(domid, compress)
+ else:
+ if weight is not None:
+ server.xend.node.tmem_set_weight(domid, weight)
+ if cap is not None:
+ server.xend.node.tmem_set_cap(domid, cap)
+ if compress is not None:
+ server.xend.node.tmem_set_compress(domid, compress)
+
+
commands = {
"shell": xm_shell,
"event-monitor": xm_event_monitor,
@@ -2912,6 +3125,13 @@ commands = {
"scsi-attach": xm_scsi_attach,
"scsi-detach": xm_scsi_detach,
"scsi-list": xm_scsi_list,
+ # tmem
+ "tmem-thaw": xm_tmem_thaw,
+ "tmem-freeze": xm_tmem_freeze,
+ "tmem-flush": xm_tmem_flush,
+ "tmem-destroy": xm_tmem_destroy,
+ "tmem-list": xm_tmem_list,
+ "tmem-set": xm_tmem_set,
}
## The commands supported by a separate argument parser in xend.xm.
diff --git a/xen/arch/ia64/xen/mm.c b/xen/arch/ia64/xen/mm.c
index c98272a0b3..20071061f2 100644
--- a/xen/arch/ia64/xen/mm.c
+++ b/xen/arch/ia64/xen/mm.c
@@ -2870,6 +2870,13 @@ steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
return -1;
}
+int
+donate_page(struct domain *d, struct page_info *page, unsigned int memflags)
+{
+ /* needs to be implemented for transcendent memory (tmem) */
+ ASSERT(0);
+}
+
static void
__guest_physmap_add_page(struct domain *d, unsigned long gpfn,
unsigned long mfn)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 113b8dd4de..1f4199d55f 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3539,6 +3539,42 @@ int replace_grant_host_mapping(
return rc;
}
+int donate_page(
+ struct domain *d, struct page_info *page, unsigned int memflags)
+{
+ spin_lock(&d->page_alloc_lock);
+
+ if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
+ goto fail;
+
+ if ( d->is_dying )
+ goto fail;
+
+ if ( page->count_info & ~(PGC_allocated | 1) )
+ goto fail;
+
+ if ( !(memflags & MEMF_no_refcount) )
+ {
+ if ( d->tot_pages >= d->max_pages )
+ goto fail;
+ d->tot_pages++;
+ }
+
+ page->count_info = PGC_allocated | 1;
+ page_set_owner(page, d);
+ page_list_add_tail(page,&d->page_list);
+
+ spin_unlock(&d->page_alloc_lock);
+ return 0;
+
+ fail:
+ spin_unlock(&d->page_alloc_lock);
+ MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
+ (void *)page_to_mfn(page), d, d->domain_id,
+ page_get_owner(page), page->count_info, page->u.inuse.type_info);
+ return -1;
+}
+
int steal_page(
struct domain *d, struct page_info *page, unsigned int memflags)
{
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index d87d0827aa..026996ec09 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -110,6 +110,7 @@ extern void early_time_init(void);
extern void early_cpu_init(void);
extern void vesa_init(void);
extern void vesa_mtrr_init(void);
+extern void init_tmem(void);
DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
#ifdef CONFIG_COMPAT
@@ -1063,6 +1064,8 @@ void __init __start_xen(unsigned long mbi_p)
init_trace_bufs();
+ init_tmem();
+
console_endboot();
/* Hide UART from DOM0 if we're using it */
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 3054f2e271..08b9e2b00e 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -28,6 +28,11 @@ obj-y += version.o
obj-y += vsprintf.o
obj-y += xmalloc_tlsf.o
obj-y += rcupdate.o
+obj-y += tmem.o
+obj-y += tmem_xen.o
+obj-y += radix-tree.o
+obj-y += rbtree.o
+obj-y += lzo.o
obj-$(perfc) += perfc.o
obj-$(crash_debug) += gdbstub.o
diff --git a/xen/common/compat/Makefile b/xen/common/compat/Makefile
index 9a36a3dcd3..1cf289ab3e 100644
--- a/xen/common/compat/Makefile
+++ b/xen/common/compat/Makefile
@@ -3,3 +3,4 @@ obj-y += kernel.o
obj-y += memory.o
obj-y += multicall.o
obj-y += xlat.o
+obj-y += tmem_xen.o
diff --git a/xen/common/compat/tmem_xen.c b/xen/common/compat/tmem_xen.c
new file mode 100644
index 0000000000..f6c9e0453d
--- /dev/null
+++ b/xen/common/compat/tmem_xen.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * tmem_xen.c
+ *
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/guest_access.h>
+#include <xen/hypercall.h>
+#include <compat/tmem.h>
+
+#define xen_tmem_op tmem_op
+/*CHECK_tmem_op;*/
+#undef xen_tmem_op
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 187735b18c..66694168a2 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -31,6 +31,7 @@
#include <public/vcpu.h>
#include <xsm/xsm.h>
#include <xen/trace.h>
+#include <xen/tmem.h>
/* Linux config option: propageted to domain0 */
/* xen_processor_pmbits: xen control Cx, Px, ... */
@@ -558,6 +559,9 @@ static void complete_domain_destroy(struct rcu_head *head)
grant_table_destroy(d);
+ if ( d->tmem != NULL )
+ tmem_destroy(d->tmem);
+
arch_domain_destroy(d);
rangeset_domain_destroy(d);
diff --git a/xen/common/lzo.c b/xen/common/lzo.c
new file mode 100644
index 0000000000..eeb200b281
--- /dev/null
+++ b/xen/common/lzo.c
@@ -0,0 +1,518 @@
+/*
+ * lzo.c -- LZO1X Compressor from MiniLZO
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Adapted for Xen (files combined and syntactic/header changes) by:
+ * Dan Magenheimer <dan.magenheimer@oracle.com>
+ *
+ */
+
+/*
+ * lzodefs.h -- architecture, OS and compiler specific defines
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Changed for kernel use by:
+ * Nitin Gupta <nitingupta910@gmail.com>
+ * Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#define LZO_VERSION 0x2020
+#define LZO_VERSION_STRING "2.02"
+#define LZO_VERSION_DATE "Oct 17 2005"
+
+#define M1_MAX_OFFSET 0x0400
+#define M2_MAX_OFFSET 0x0800
+#define M3_MAX_OFFSET 0x4000
+#define M4_MAX_OFFSET 0xbfff
+
+#define M1_MIN_LEN 2
+#define M1_MAX_LEN 2
+#define M2_MIN_LEN 3
+#define M2_MAX_LEN 8
+#define M3_MIN_LEN 3
+#define M3_MAX_LEN 33
+#define M4_MIN_LEN 3
+#define M4_MAX_LEN 9
+
+#define M1_MARKER 0
+#define M2_MARKER 64
+#define M3_MARKER 32
+#define M4_MARKER 16
+
+#define D_BITS 14
+#define D_MASK ((1u << D_BITS) - 1)
+#define D_HIGH ((D_MASK >> 1) + 1)
+
+#define DX2(p, s1, s2) (((((size_t)((p)[2]) << (s2)) ^ (p)[1]) \
+ << (s1)) ^ (p)[0])
+#define DX3(p, s1, s2, s3) ((DX2((p)+1, s2, s3) << (s1)) ^ (p)[0])
+
+/*
+ * LZO1X Compressor from MiniLZO
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Changed for kernel use by:
+ * Nitin Gupta <nitingupta910@gmail.com>
+ * Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#include <xen/types.h>
+#include <xen/lzo.h>
+#define get_unaligned(_p) (*(_p))
+#define put_unaligned(_val,_p) (*(_p)=_val)
+#define get_unaligned_le16(_p) (*(u16 *)(_p))
+
+static noinline size_t
+_lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
+ unsigned char *out, size_t *out_len, void *wrkmem)
+{
+ const unsigned char * const in_end = in + in_len;
+ const unsigned char * const ip_end = in + in_len - M2_MAX_LEN - 5;
+ const unsigned char ** const dict = wrkmem;
+ const unsigned char *ip = in, *ii = ip;
+ const unsigned char *end, *m, *m_pos;
+ size_t m_off, m_len, dindex;
+ unsigned char *op = out;
+
+ ip += 4;
+
+ for (;;) {
+ dindex = ((size_t)(0x21 * DX3(ip, 5, 5, 6)) >> 5) & D_MASK;
+ m_pos = dict[dindex];
+
+ if (m_pos < in)
+ goto literal;
+
+ if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET))
+ goto literal;
+
+ m_off = ip - m_pos;
+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+ goto try_match;
+
+ dindex = (dindex & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f);
+ m_pos = dict[dindex];
+
+ if (m_pos < in)
+ goto literal;
+
+ if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET))
+ goto literal;
+
+ m_off = ip - m_pos;
+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+ goto try_match;
+
+ goto literal;
+
+ try_match:
+ if (get_unaligned((const unsigned short *)m_pos)
+ == get_unaligned((const unsigned short *)ip)) {
+ if (likely(m_pos[2] == ip[2]))
+ goto match;
+ }
+
+ literal:
+ dict[dindex] = ip;
+ ++ip;
+ if (unlikely(ip >= ip_end))
+ break;
+ continue;
+
+ match:
+ dict[dindex] = ip;
+ if (ip != ii) {
+ size_t t = ip - ii;
+
+ if (t <= 3) {
+ op[-2] |= t;
+ } else if (t <= 18) {
+ *op++ = (t - 3);
+ } else {
+ size_t tt = t - 18;
+
+ *op++ = 0;
+ while (tt > 255) {
+ tt -= 255;
+ *op++ = 0;
+ }
+ *op++ = tt;
+ }
+ do {
+ *op++ = *ii++;
+ } while (--t > 0);
+ }
+
+ ip += 3;
+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++
+ || m_pos[5] != *ip++ || m_pos[6] != *ip++
+ || m_pos[7] != *ip++ || m_pos[8] != *ip++) {
+ --ip;
+ m_len = ip - ii;
+
+ if (m_off <= M2_MAX_OFFSET) {
+ m_off -= 1;
+ *op++ = (((m_len - 1) << 5)
+ | ((m_off & 7) << 2));
+ *op++ = (m_off >> 3);
+ } else if (m_off <= M3_MAX_OFFSET) {
+ m_off -= 1;
+ *op++ = (M3_MARKER | (m_len - 2));
+ goto m3_m4_offset;
+ } else {
+ m_off -= 0x4000;
+
+ *op++ = (M4_MARKER | ((m_off & 0x4000) >> 11)
+ | (m_len - 2));
+ goto m3_m4_offset;
+ }
+ } else {
+ end = in_end;
+ m = m_pos + M2_MAX_LEN + 1;
+
+ while (ip < end && *m == *ip) {
+ m++;
+ ip++;
+ }
+ m_len = ip - ii;
+
+ if (m_off <= M3_MAX_OFFSET) {
+ m_off -= 1;
+ if (m_len <= 33) {
+ *op++ = (M3_MARKER | (m_len - 2));
+ } else {
+ m_len -= 33;
+ *op++ = M3_MARKER | 0;
+ goto m3_m4_len;
+ }
+ } else {
+ m_off -= 0x4000;
+ if (m_len <= M4_MAX_LEN) {
+ *op++ = (M4_MARKER
+ | ((m_off & 0x4000) >> 11)
+ | (m_len - 2));
+ } else {
+ m_len -= M4_MAX_LEN;
+ *op++ = (M4_MARKER
+ | ((m_off & 0x4000) >> 11));
+ m3_m4_len:
+ while (m_len > 255) {
+ m_len -= 255;
+ *op++ = 0;
+ }
+
+ *op++ = (m_len);
+ }
+ }
+ m3_m4_offset:
+ *op++ = ((m_off & 63) << 2);
+ *op++ = (m_off >> 6);
+ }
+
+ ii = ip;
+ if (unlikely(ip >= ip_end))
+ break;
+ }
+
+ *out_len = op - out;
+ return in_end - ii;
+}
+
+int lzo1x_1_compress(const unsigned char *in, size_t in_len, unsigned char *out,
+ size_t *out_len, void *wrkmem)
+{
+ const unsigned char *ii;
+ unsigned char *op = out;
+ size_t t;
+
+ if (unlikely(in_len <= M2_MAX_LEN + 5)) {
+ t = in_len;
+ } else {
+ t = _lzo1x_1_do_compress(in, in_len, op, out_len, wrkmem);
+ op += *out_len;
+ }
+
+ if (t > 0) {
+ ii = in + in_len - t;
+
+ if (op == out && t <= 238) {
+ *op++ = (17 + t);
+ } else if (t <= 3) {
+ op[-2] |= t;
+ } else if (t <= 18) {
+ *op++ = (t - 3);
+ } else {
+ size_t tt = t - 18;
+
+ *op++ = 0;
+ while (tt > 255) {
+ tt -= 255;
+ *op++ = 0;
+ }
+
+ *op++ = tt;
+ }
+ do {
+ *op++ = *ii++;
+ } while (--t > 0);
+ }
+
+ *op++ = M4_MARKER | 1;
+ *op++ = 0;
+ *op++ = 0;
+
+ *out_len = op - out;
+ return LZO_E_OK;
+}
+
+/*
+ * LZO1X Decompressor from MiniLZO
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Changed for kernel use by:
+ * Nitin Gupta <nitingupta910@gmail.com>
+ * Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#define HAVE_IP(x, ip_end, ip) ((size_t)(ip_end - ip) < (x))
+#define HAVE_OP(x, op_end, op) ((size_t)(op_end - op) < (x))
+#define HAVE_LB(m_pos, out, op) (m_pos < out || m_pos >= op)
+
+#define COPY4(dst, src) \
+ put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst))
+
+int lzo1x_decompress_safe(const unsigned char *in, size_t in_len,
+ unsigned char *out, size_t *out_len)
+{
+ const unsigned char * const ip_end = in + in_len;
+ unsigned char * const op_end = out + *out_len;
+ const unsigned char *ip = in, *m_pos;
+ unsigned char *op = out;
+ size_t t;
+
+ *out_len = 0;
+
+ if (*ip > 17) {
+ t = *ip++ - 17;
+ if (t < 4)
+ goto match_next;
+ if (HAVE_OP(t, op_end, op))
+ goto output_overrun;
+ if (HAVE_IP(t + 1, ip_end, ip))
+ goto input_overrun;
+ do {
+ *op++ = *ip++;
+ } while (--t > 0);
+ goto first_literal_run;
+ }
+
+ while ((ip < ip_end)) {
+ t = *ip++;
+ if (t >= 16)
+ goto match;
+ if (t == 0) {
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ }
+ t += 15 + *ip++;
+ }
+ if (HAVE_OP(t + 3, op_end, op))
+ goto output_overrun;
+ if (HAVE_IP(t + 4, ip_end, ip))
+ goto input_overrun;
+
+ COPY4(op, ip);
+ op += 4;
+ ip += 4;
+ if (--t > 0) {
+ if (t >= 4) {
+ do {
+ COPY4(op, ip);
+ op += 4;
+ ip += 4;
+ t -= 4;
+ } while (t >= 4);
+ if (t > 0) {
+ do {
+ *op++ = *ip++;
+ } while (--t > 0);
+ }
+ } else {
+ do {
+ *op++ = *ip++;
+ } while (--t > 0);
+ }
+ }
+
+ first_literal_run:
+ t = *ip++;
+ if (t >= 16)
+ goto match;
+ m_pos = op - (1 + M2_MAX_OFFSET);
+ m_pos -= t >> 2;
+ m_pos -= *ip++ << 2;
+
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+
+ if (HAVE_OP(3, op_end, op))
+ goto output_overrun;
+ *op++ = *m_pos++;
+ *op++ = *m_pos++;
+ *op++ = *m_pos;
+
+ goto match_done;
+
+ do {
+ match:
+ if (t >= 64) {
+ m_pos = op - 1;
+ m_pos -= (t >> 2) & 7;
+ m_pos -= *ip++ << 3;
+ t = (t >> 5) - 1;
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+ if (HAVE_OP(t + 3 - 1, op_end, op))
+ goto output_overrun;
+ goto copy_match;
+ } else if (t >= 32) {
+ t &= 31;
+ if (t == 0) {
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ }
+ t += 31 + *ip++;
+ }
+ m_pos = op - 1;
+ m_pos -= get_unaligned_le16(ip) >> 2;
+ ip += 2;
+ } else if (t >= 16) {
+ m_pos = op;
+ m_pos -= (t & 8) << 11;
+
+ t &= 7;
+ if (t == 0) {
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ if (HAVE_IP(1, ip_end, ip))
+ goto input_overrun;
+ }
+ t += 7 + *ip++;
+ }
+ m_pos -= get_unaligned_le16(ip) >> 2;
+ ip += 2;
+ if (m_pos == op)
+ goto eof_found;
+ m_pos -= 0x4000;
+ } else {
+ m_pos = op - 1;
+ m_pos -= t >> 2;
+ m_pos -= *ip++ << 2;
+
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+ if (HAVE_OP(2, op_end, op))
+ goto output_overrun;
+
+ *op++ = *m_pos++;
+ *op++ = *m_pos;
+ goto match_done;
+ }
+
+ if (HAVE_LB(m_pos, out, op))
+ goto lookbehind_overrun;
+ if (HAVE_OP(t + 3 - 1, op_end, op))
+ goto output_overrun;
+
+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
+ COPY4(op, m_pos);
+ op += 4;
+ m_pos += 4;
+ t -= 4 - (3 - 1);
+ do {
+ COPY4(op, m_pos);
+ op += 4;
+ m_pos += 4;
+ t -= 4;
+ } while (t >= 4);
+ if (t > 0)
+ do {
+ *op++ = *m_pos++;
+ } while (--t > 0);
+ } else {
+ copy_match:
+ *op++ = *m_pos++;
+ *op++ = *m_pos++;
+ do {
+ *op++ = *m_pos++;
+ } while (--t > 0);
+ }
+ match_done:
+ t = ip[-2] & 3;
+ if (t == 0)
+ break;
+ match_next:
+ if (HAVE_OP(t, op_end, op))
+ goto output_overrun;
+ if (HAVE_IP(t + 1, ip_end, ip))
+ goto input_overrun;
+
+ *op++ = *ip++;
+ if (t > 1) {
+ *op++ = *ip++;
+ if (t > 2)
+ *op++ = *ip++;
+ }
+
+ t = *ip++;
+ } while (ip < ip_end);
+ }
+
+ *out_len = op - out;
+ return LZO_E_EOF_NOT_FOUND;
+
+ eof_found:
+ *out_len = op - out;
+ return (ip == ip_end ? LZO_E_OK :
+ (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
+ input_overrun:
+ *out_len = op - out;
+ return LZO_E_INPUT_OVERRUN;
+
+ output_overrun:
+ *out_len = op - out;
+ return LZO_E_OUTPUT_OVERRUN;
+
+ lookbehind_overrun:
+ *out_len = op - out;
+ return LZO_E_LOOKBEHIND_OVERRUN;
+}
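A hypothetical round-trip sketch of the compressor/decompressor above (not part of the patch): the function signatures and LZO_E_OK come from this file, while LZO1X_MEM_COMPRESS for the workspace size and the len + len/16 + 64 + 3 worst-case output bound are the usual MiniLZO definitions assumed to be carried over into the new xen/lzo.h.

    /* Hypothetical sketch -- not part of the patch: compress a buffer with
     * lzo1x_1_compress(), decompress it with lzo1x_decompress_safe(), and
     * check that the original data comes back unchanged. */
    #include <xen/types.h>
    #include <xen/lzo.h>
    #include <xen/string.h>
    #include <xen/xmalloc.h>

    static int lzo_roundtrip(const unsigned char *src, size_t len)
    {
        size_t cmax = len + len / 16 + 64 + 3;            /* usual LZO bound */
        size_t clen = 0, dlen = len;
        void *wrk = xmalloc_bytes(LZO1X_MEM_COMPRESS);    /* assumed macro name */
        unsigned char *c = xmalloc_bytes(cmax);
        unsigned char *d = xmalloc_bytes(len);
        int ok = -1;

        if ( wrk && c && d &&
             lzo1x_1_compress(src, len, c, &clen, wrk) == LZO_E_OK &&
             lzo1x_decompress_safe(c, clen, d, &dlen) == LZO_E_OK &&
             dlen == len && !memcmp(src, d, len) )
            ok = 0;

        if ( d ) xfree(d);
        if ( c ) xfree(c);
        if ( wrk ) xfree(wrk);
        return ok;
    }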
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 55e2d8a046..0dd2b9282f 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -560,17 +560,6 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
return rc;
}
-/* Temporary placeholder. */
-int do_tmem_op(void *tmem_op)
-{
- static bool_t warned;
-
- if ( !test_and_set_bool(warned) )
- printk("tmem: not implemented\n");
-
- return -ENOSYS;
-}
-
/*
* Local variables:
* mode: C
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index ab3445b44b..bb143aedd6 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -35,6 +35,7 @@
#include <xen/perfc.h>
#include <xen/numa.h>
#include <xen/nodemask.h>
+#include <xen/tmem.h>
#include <public/sysctl.h>
#include <asm/page.h>
#include <asm/numa.h>
@@ -335,9 +336,9 @@ static unsigned long init_node_heap(int node, unsigned long mfn,
/* Allocate 2^@order contiguous pages. */
static struct page_info *alloc_heap_pages(
unsigned int zone_lo, unsigned int zone_hi,
- unsigned int node, unsigned int order)
+ unsigned int node, unsigned int order, unsigned int memflags)
{
- unsigned int i, j, zone;
+ unsigned int i, j, zone = 0;
unsigned int num_nodes = num_online_nodes();
unsigned long request = 1UL << order;
cpumask_t extra_cpus_mask, mask;
@@ -380,6 +381,14 @@ static struct page_info *alloc_heap_pages(
node = 0;
}
+ /* Try to free memory from tmem */
+ if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
+ {
+ /* reassigning an already allocated anonymous heap page */
+ spin_unlock(&heap_lock);
+ return pg;
+ }
+
/* No suitable memory blocks. Fail the request. */
spin_unlock(&heap_lock);
return NULL;
@@ -1018,8 +1027,8 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
ASSERT(!in_irq());
- pg = alloc_heap_pages(
- MEMZONE_XEN, MEMZONE_XEN, cpu_to_node(smp_processor_id()), order);
+ pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
+ cpu_to_node(smp_processor_id()), order, memflags);
if ( unlikely(pg == NULL) )
return NULL;
@@ -1172,11 +1181,11 @@ struct page_info *alloc_domheap_pages(
return NULL;
if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
- pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order);
+ pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
if ( (pg == NULL) &&
((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
- node, order)) == NULL) )
+ node, order, memflags)) == NULL) )
return NULL;
if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
@@ -1373,6 +1382,28 @@ static void page_scrub_softirq(void)
spin_unlock(&serialise_lock);
}
+void scrub_list_splice(struct page_list_head *list)
+{
+ spin_lock(&page_scrub_lock);
+ page_list_splice(list, &page_scrub_list);
+ spin_unlock(&page_scrub_lock);
+}
+
+void scrub_list_add(struct page_info *pg)
+{
+ spin_lock(&page_scrub_lock);
+ page_list_add(pg, &page_scrub_list);
+ spin_unlock(&page_scrub_lock);
+}
+
+void scrub_one_page(struct page_info *pg)
+{
+ void *p = map_domain_page(page_to_mfn(pg));
+
+ scrub_page(p);
+ unmap_domain_page(p);
+}
+
static void page_scrub_timer_fn(void *unused)
{
page_scrub_schedule_work();
diff --git a/xen/common/radix-tree.c b/xen/common/radix-tree.c
new file mode 100644
index 0000000000..414f0cef72
--- /dev/null
+++ b/xen/common/radix-tree.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Copyright (C) 2009 adaption for Xen tmem by Dan Magenheimer, Oracle Corp.
+ * Changed:
+ * o Linux 2.6.18 source used (prior to read-copy-update addition)
+ * o constants and data structures moved out to radix-tree.h header
+ * o tagging code removed
+ * o radix_tree_insert has func parameter for dynamic data struct allocation
+ * o radix_tree_destroy added (including recursive helper function)
+ * o __init functions must be called explicitly
+ * o other include files adapted to Xen
+ */
+
+#include <xen/config.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/radix-tree.h>
+#include <asm/cache.h>
+
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
+
+/*
+ * Return the maximum key which can be stored in a
+ * radix tree with height HEIGHT.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+ return height_to_maxindex[height];
+}
+
+/*
+ * Extend a radix tree so it can store key @index.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index,
+ struct radix_tree_node *(*node_alloc)(void *), void *arg)
+{
+ struct radix_tree_node *node;
+ unsigned int height;
+
+ /* Figure out what the height should be. */
+ height = root->height + 1;
+ if (index > radix_tree_maxindex(height))
+ while (index > radix_tree_maxindex(height))
+ height++;
+
+ if (root->rnode == NULL) {
+ root->height = height;
+ goto out;
+ }
+
+ do {
+ if (!(node = node_alloc(arg)))
+ return -ENOMEM;
+
+ /* Increase the height. */
+ node->slots[0] = root->rnode;
+
+ node->count = 1;
+ root->rnode = node;
+ root->height++;
+ } while (height > root->height);
+ out:
+ return 0;
+}
+
+/**
+ * radix_tree_insert - insert into a radix tree
+ * @root: radix tree root
+ * @index: index key
+ * @item: item to insert
+ *
+ * Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
+ void *item, struct radix_tree_node *(*node_alloc)(void *), void *arg)
+{
+ struct radix_tree_node *node = NULL, *slot;
+ unsigned int height, shift;
+ int offset;
+ int error;
+
+ /* Make sure the tree is high enough. */
+ if (index > radix_tree_maxindex(root->height)) {
+ error = radix_tree_extend(root, index, node_alloc, arg);
+ if (error)
+ return error;
+ }
+
+ slot = root->rnode;
+ height = root->height;
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+ offset = 0; /* uninitialised var warning */
+ while (height > 0) {
+ if (slot == NULL) {
+ /* Have to add a child node. */
+ if (!(slot = node_alloc(arg)))
+ return -ENOMEM;
+ if (node) {
+
+ node->slots[offset] = slot;
+ node->count++;
+ } else
+ root->rnode = slot;
+ }
+
+ /* Go a level down */
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ node = slot;
+ slot = node->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ if (slot != NULL)
+ return -EEXIST;
+
+ if (node) {
+ node->count++;
+ node->slots[offset] = item;
+ } else {
+ root->rnode = item;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(radix_tree_insert);
+
+static inline void **__lookup_slot(struct radix_tree_root *root,
+ unsigned long index)
+{
+ unsigned int height, shift;
+ struct radix_tree_node **slot;
+
+ height = root->height;
+
+ if (index > radix_tree_maxindex(height))
+ return NULL;
+
+ if (height == 0 && root->rnode)
+ return (void **)&root->rnode;
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = &root->rnode;
+
+ while (height > 0) {
+ if (*slot == NULL)
+ return NULL;
+
+ slot = (struct radix_tree_node **)
+ ((*slot)->slots +
+ ((index >> shift) & RADIX_TREE_MAP_MASK));
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ return (void **)slot;
+}
+
+/**
+ * radix_tree_lookup_slot - lookup a slot in a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the slot corresponding to the position @index in the radix tree
+ * @root. This is useful for update-if-exists operations.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+ return __lookup_slot(root, index);
+}
+EXPORT_SYMBOL(radix_tree_lookup_slot);
+
+/**
+ * radix_tree_lookup - perform lookup operation on a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the item at the position @index in the radix tree @root.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+ void **slot;
+
+ slot = __lookup_slot(root, index);
+ return slot != NULL ? *slot : NULL;
+}
+EXPORT_SYMBOL(radix_tree_lookup);
+
+static unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+ unsigned int max_items, unsigned long *next_index)
+{
+ unsigned int nr_found = 0;
+ unsigned int shift, height;
+ struct radix_tree_node *slot;
+ unsigned long i;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ if (height == 0) {
+ if (root->rnode && index == 0)
+ results[nr_found++] = root->rnode;
+ goto out;
+ }
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ for ( ; height > 1; height--) {
+
+ for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
+ i < RADIX_TREE_MAP_SIZE; i++) {
+ if (slot->slots[i] != NULL)
+ break;
+ index &= ~((1UL << shift) - 1);
+ index += 1UL << shift;
+ if (index == 0)
+ goto out; /* 32-bit wraparound */
+ }
+ if (i == RADIX_TREE_MAP_SIZE)
+ goto out;
+
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = slot->slots[i];
+ }
+
+ /* Bottom level: grab some items */
+ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+ index++;
+ if (slot->slots[i]) {
+ results[nr_found++] = slot->slots[i];
+ if (nr_found == max_items)
+ goto out;
+ }
+ }
+ out:
+ *next_index = index;
+ return nr_found;
+}
+
+/**
+ * radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @max_items: place up to this many items at *results
+ *
+ * Performs an index-ascending scan of the tree for present items. Places
+ * them at *@results and returns the number of items which were placed at
+ * *@results.
+ *
+ * The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
+
+ while (ret < max_items) {
+ unsigned int nr_found;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup(root, results + ret, cur_index,
+ max_items - ret, &next_index);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup);
+
+/**
+ * radix_tree_shrink - shrink height of a radix tree to minimal
+ * @root radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root,
+ void (*node_free)(struct radix_tree_node *))
+{
+ /* try to shrink tree height */
+ while (root->height > 0 &&
+ root->rnode->count == 1 &&
+ root->rnode->slots[0]) {
+ struct radix_tree_node *to_free = root->rnode;
+
+ root->rnode = to_free->slots[0];
+ root->height--;
+ to_free->slots[0] = NULL;
+ to_free->count = 0;
+ node_free(to_free);
+ }
+}
+
+/**
+ * radix_tree_delete - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
+ * @node_free: callback used to free interior nodes that become empty
+ *
+ * Remove the item at @index from the radix tree rooted at @root.
+ *
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index,
+ void(*node_free)(struct radix_tree_node *))
+{
+ struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
+ struct radix_tree_node *slot = NULL;
+ unsigned int height, shift;
+ int offset;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ goto out;
+
+ slot = root->rnode;
+ if (height == 0 && root->rnode) {
+ root->rnode = NULL;
+ goto out;
+ }
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ pathp->node = NULL;
+
+ do {
+ if (slot == NULL)
+ goto out;
+
+ pathp++;
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ pathp->offset = offset;
+ pathp->node = slot;
+ slot = slot->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ } while (height > 0);
+
+ if (slot == NULL)
+ goto out;
+
+ /* Now free the nodes we do not need anymore */
+ while (pathp->node) {
+ pathp->node->slots[pathp->offset] = NULL;
+ pathp->node->count--;
+
+ if (pathp->node->count) {
+ if (pathp->node == root->rnode)
+ radix_tree_shrink(root, node_free);
+ goto out;
+ }
+
+ /* Node with zero slots in use so free it */
+ node_free(pathp->node);
+
+ pathp--;
+ }
+ root->height = 0;
+ root->rnode = NULL;
+
+ out:
+ return slot;
+}
+EXPORT_SYMBOL(radix_tree_delete);
+
+static void
+radix_tree_node_destroy(struct radix_tree_node *node, unsigned int height,
+ void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *))
+{
+ int i;
+
+ if (height == 0)
+ return;
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[i]) {
+ if (height == 1) {
+ slot_free(node->slots[i]);
+ node->slots[i] = NULL;
+ continue;
+ }
+ radix_tree_node_destroy(node->slots[i], height-1,
+ slot_free, node_free);
+ node_free(node->slots[i]);
+ node->slots[i] = NULL;
+ }
+ }
+}
+
+void radix_tree_destroy(struct radix_tree_root *root,
+ void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *))
+{
+ if (root->rnode == NULL)
+ return;
+ if (root->height == 0)
+ slot_free(root->rnode);
+ else {
+ radix_tree_node_destroy(root->rnode, root->height,
+ slot_free, node_free);
+ node_free(root->rnode);
+ root->height = 0;
+ }
+ root->rnode = NULL;
+ /* caller must delete root if desired */
+}
+EXPORT_SYMBOL(radix_tree_destroy);
+
+static /*__init*/ unsigned long __maxindex(unsigned int height)
+{
+ unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+ unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+
+ if (tmp >= RADIX_TREE_INDEX_BITS)
+ index = ~0UL;
+ return index;
+}
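+
+/*
+ * Worked example (assuming RADIX_TREE_MAP_SHIFT is 6, i.e. 64 slots per
+ * node; the real value is set in radix-tree.h):
+ *   __maxindex(0) == 0      (only a single direct item at index 0)
+ *   __maxindex(1) == 63     ((1UL << 6) - 1)
+ *   __maxindex(2) == 4095   ((1UL << 12) - 1)
+ * so each additional level of height multiplies the covered index space
+ * by RADIX_TREE_MAP_SIZE.
+ */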
+
+/*__init*/ void radix_tree_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+ height_to_maxindex[i] = __maxindex(i);
+}
diff --git a/xen/common/rbtree.c b/xen/common/rbtree.c
new file mode 100644
index 0000000000..67564c81b3
--- /dev/null
+++ b/xen/common/rbtree.c
@@ -0,0 +1,398 @@
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+ (C) 2002 David Woodhouse <dwmw2@infradead.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
+*/
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/rbtree.h>
+
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *right = node->rb_right;
+ struct rb_node *parent = rb_parent(node);
+
+ if ((node->rb_right = right->rb_left))
+ rb_set_parent(right->rb_left, node);
+ right->rb_left = node;
+
+ rb_set_parent(right, parent);
+
+ if (parent)
+ {
+ if (node == parent->rb_left)
+ parent->rb_left = right;
+ else
+ parent->rb_right = right;
+ }
+ else
+ root->rb_node = right;
+ rb_set_parent(node, right);
+}
+
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *left = node->rb_left;
+ struct rb_node *parent = rb_parent(node);
+
+ if ((node->rb_left = left->rb_right))
+ rb_set_parent(left->rb_right, node);
+ left->rb_right = node;
+
+ rb_set_parent(left, parent);
+
+ if (parent)
+ {
+ if (node == parent->rb_right)
+ parent->rb_right = left;
+ else
+ parent->rb_left = left;
+ }
+ else
+ root->rb_node = left;
+ rb_set_parent(node, left);
+}
+
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *parent, *gparent;
+
+ while ((parent = rb_parent(node)) && rb_is_red(parent))
+ {
+ gparent = rb_parent(parent);
+
+ if (parent == gparent->rb_left)
+ {
+ {
+ register struct rb_node *uncle = gparent->rb_right;
+ if (uncle && rb_is_red(uncle))
+ {
+ rb_set_black(uncle);
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ node = gparent;
+ continue;
+ }
+ }
+
+ if (parent->rb_right == node)
+ {
+ register struct rb_node *tmp;
+ __rb_rotate_left(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ __rb_rotate_right(gparent, root);
+ } else {
+ {
+ register struct rb_node *uncle = gparent->rb_left;
+ if (uncle && rb_is_red(uncle))
+ {
+ rb_set_black(uncle);
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ node = gparent;
+ continue;
+ }
+ }
+
+ if (parent->rb_left == node)
+ {
+ register struct rb_node *tmp;
+ __rb_rotate_right(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ __rb_rotate_left(gparent, root);
+ }
+ }
+
+ rb_set_black(root->rb_node);
+}
+EXPORT_SYMBOL(rb_insert_color);
+
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+ struct rb_root *root)
+{
+ struct rb_node *other;
+
+ while ((!node || rb_is_black(node)) && node != root->rb_node)
+ {
+ if (parent->rb_left == node)
+ {
+ other = parent->rb_right;
+ if (rb_is_red(other))
+ {
+ rb_set_black(other);
+ rb_set_red(parent);
+ __rb_rotate_left(parent, root);
+ other = parent->rb_right;
+ }
+ if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+ (!other->rb_right || rb_is_black(other->rb_right)))
+ {
+ rb_set_red(other);
+ node = parent;
+ parent = rb_parent(node);
+ }
+ else
+ {
+ if (!other->rb_right || rb_is_black(other->rb_right))
+ {
+ struct rb_node *o_left;
+ if ((o_left = other->rb_left))
+ rb_set_black(o_left);
+ rb_set_red(other);
+ __rb_rotate_right(other, root);
+ other = parent->rb_right;
+ }
+ rb_set_color(other, rb_color(parent));
+ rb_set_black(parent);
+ if (other->rb_right)
+ rb_set_black(other->rb_right);
+ __rb_rotate_left(parent, root);
+ node = root->rb_node;
+ break;
+ }
+ }
+ else
+ {
+ other = parent->rb_left;
+ if (rb_is_red(other))
+ {
+ rb_set_black(other);
+ rb_set_red(parent);
+ __rb_rotate_right(parent, root);
+ other = parent->rb_left;
+ }
+ if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+ (!other->rb_right || rb_is_black(other->rb_right)))
+ {
+ rb_set_red(other);
+ node = parent;
+ parent = rb_parent(node);
+ }
+ else
+ {
+ if (!other->rb_left || rb_is_black(other->rb_left))
+ {
+ register struct rb_node *o_right;
+ if ((o_right = other->rb_right))
+ rb_set_black(o_right);
+ rb_set_red(other);
+ __rb_rotate_left(other, root);
+ other = parent->rb_left;
+ }
+ rb_set_color(other, rb_color(parent));
+ rb_set_black(parent);
+ if (other->rb_left)
+ rb_set_black(other->rb_left);
+ __rb_rotate_right(parent, root);
+ node = root->rb_node;
+ break;
+ }
+ }
+ }
+ if (node)
+ rb_set_black(node);
+}
+
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *child, *parent;
+ int color;
+
+ if (!node->rb_left)
+ child = node->rb_right;
+ else if (!node->rb_right)
+ child = node->rb_left;
+ else
+ {
+ struct rb_node *old = node, *left;
+
+ node = node->rb_right;
+ while ((left = node->rb_left) != NULL)
+ node = left;
+ child = node->rb_right;
+ parent = rb_parent(node);
+ color = rb_color(node);
+
+ if (child)
+ rb_set_parent(child, parent);
+ if (parent == old) {
+ parent->rb_right = child;
+ parent = node;
+ } else
+ parent->rb_left = child;
+
+ node->rb_parent_color = old->rb_parent_color;
+ node->rb_right = old->rb_right;
+ node->rb_left = old->rb_left;
+
+ if (rb_parent(old))
+ {
+ if (rb_parent(old)->rb_left == old)
+ rb_parent(old)->rb_left = node;
+ else
+ rb_parent(old)->rb_right = node;
+ } else
+ root->rb_node = node;
+
+ rb_set_parent(old->rb_left, node);
+ if (old->rb_right)
+ rb_set_parent(old->rb_right, node);
+ goto color;
+ }
+
+ parent = rb_parent(node);
+ color = rb_color(node);
+
+ if (child)
+ rb_set_parent(child, parent);
+ if (parent)
+ {
+ if (parent->rb_left == node)
+ parent->rb_left = child;
+ else
+ parent->rb_right = child;
+ }
+ else
+ root->rb_node = child;
+
+ color:
+ if (color == RB_BLACK)
+ __rb_erase_color(child, parent, root);
+}
+EXPORT_SYMBOL(rb_erase);
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(struct rb_root *root)
+{
+ struct rb_node *n;
+
+ n = root->rb_node;
+ if (!n)
+ return NULL;
+ while (n->rb_left)
+ n = n->rb_left;
+ return n;
+}
+EXPORT_SYMBOL(rb_first);
+
+struct rb_node *rb_last(struct rb_root *root)
+{
+ struct rb_node *n;
+
+ n = root->rb_node;
+ if (!n)
+ return NULL;
+ while (n->rb_right)
+ n = n->rb_right;
+ return n;
+}
+EXPORT_SYMBOL(rb_last);
+
+struct rb_node *rb_next(struct rb_node *node)
+{
+ struct rb_node *parent;
+
+ if (rb_parent(node) == node)
+ return NULL;
+
+ /* If we have a right-hand child, go down and then left as far
+ as we can. */
+ if (node->rb_right) {
+ node = node->rb_right;
+ while (node->rb_left)
+ node=node->rb_left;
+ return node;
+ }
+
+ /* No right-hand children. Everything down and left is
+ smaller than us, so any 'next' node must be in the general
+ direction of our parent. Go up the tree; any time the
+ ancestor is a right-hand child of its parent, keep going
+ up. First time it's a left-hand child of its parent, said
+ parent is our 'next' node. */
+ while ((parent = rb_parent(node)) && node == parent->rb_right)
+ node = parent;
+
+ return parent;
+}
+EXPORT_SYMBOL(rb_next);
+
+struct rb_node *rb_prev(struct rb_node *node)
+{
+ struct rb_node *parent;
+
+ if (rb_parent(node) == node)
+ return NULL;
+
+ /* If we have a left-hand child, go down and then right as far
+ as we can. */
+ if (node->rb_left) {
+ node = node->rb_left;
+ while (node->rb_right)
+ node=node->rb_right;
+ return node;
+ }
+
+ /* No left-hand children. Go up till we find an ancestor which
+ is a right-hand child of its parent */
+ while ((parent = rb_parent(node)) && node == parent->rb_left)
+ node = parent;
+
+ return parent;
+}
+EXPORT_SYMBOL(rb_prev);
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root)
+{
+ struct rb_node *parent = rb_parent(victim);
+
+ /* Set the surrounding nodes to point to the replacement */
+ if (parent) {
+ if (victim == parent->rb_left)
+ parent->rb_left = new;
+ else
+ parent->rb_right = new;
+ } else {
+ root->rb_node = new;
+ }
+ if (victim->rb_left)
+ rb_set_parent(victim->rb_left, new);
+ if (victim->rb_right)
+ rb_set_parent(victim->rb_right, new);
+
+ /* Copy the pointers/colour from the victim to the replacement */
+ *new = *victim;
+}
+EXPORT_SYMBOL(rb_replace_node);
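+
+/*
+ * Illustrative sketch (not part of this file): in-order traversal of an
+ * rbtree whose nodes are embedded in a hypothetical struct item.
+ *
+ *   struct item { struct rb_node rb; unsigned long key; };
+ *   struct rb_node *n;
+ *
+ *   for (n = rb_first(&root); n != NULL; n = rb_next(n)) {
+ *       struct item *it = container_of(n, struct item, rb);
+ *       ... use it->key ...
+ *   }
+ *
+ * tmem.c below uses this pattern in obj_free_selective(), advancing the
+ * cursor with rb_next() before the current node may be erased.
+ */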
diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c
index ac2aaab814..a17f0b2124 100644
--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -214,6 +214,12 @@ unsigned long _write_lock_irqsave(rwlock_t *lock)
return flags;
}
+int _write_trylock(rwlock_t *lock)
+{
+ check_lock(&lock->debug);
+ return _raw_write_trylock(&lock->raw);
+}
+
void _write_unlock(rwlock_t *lock)
{
_raw_write_unlock(&lock->raw);
@@ -236,3 +242,9 @@ int _rw_is_locked(rwlock_t *lock)
check_lock(&lock->debug);
return _raw_rw_is_locked(&lock->raw);
}
+
+int _rw_is_write_locked(rwlock_t *lock)
+{
+ check_lock(&lock->debug);
+ return _raw_rw_is_write_locked(&lock->raw);
+}
diff --git a/xen/common/tmem.c b/xen/common/tmem.c
new file mode 100644
index 0000000000..19d8bec05c
--- /dev/null
+++ b/xen/common/tmem.c
@@ -0,0 +1,2109 @@
+/******************************************************************************
+ * tmem.c
+ *
+ * Transcendent memory
+ *
+ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
+ */
+
+/* TODO list: 090129
+ - improve on reclamation policy
+ - use different tlsf pools for each client (maybe each pool)
+ - implement page accounting and minimal QoS limits
+ - test shared access more completely (need pv cluster fs)
+ - add feedback-driven compression (not for persistent pools though!)
+ - add data-structure total bytes overhead stats
+ */
+
+#ifdef __XEN__
+#include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
+#endif
+
+#include <xen/tmem.h>
+#include <xen/rbtree.h>
+#include <xen/radix-tree.h>
+#include <xen/list.h>
+
+#define EXPORT /* indicates code other modules are dependent upon */
+#define FORWARD
+
+/************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
+
+#define CLI_ID_NULL TMH_CLI_ID_NULL
+#define cli_id_str tmh_cli_id_str
+#define client_str tmh_client_str
+
+/************ DEBUG and STATISTICS (+ some compression testing) *******/
+
+#ifndef NDEBUG
+#define SENTINELS
+#define NOINLINE noinline
+#else
+#define NOINLINE
+#endif
+
+#ifdef SENTINELS
+#define DECL_SENTINEL unsigned long sentinel;
+#define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
+#define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
+#define ASSERT_SENTINEL(_x,_y) \
+ ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
+#ifdef __i386__
+#define POOL_SENTINEL 0x87658765
+#define OBJ_SENTINEL 0x12345678
+#define OBJNODE_SENTINEL 0xfedcba09
+#define PGD_SENTINEL 0x43214321
+#else
+#define POOL_SENTINEL 0x8765876587658765
+#define OBJ_SENTINEL 0x1234567812345678
+#define OBJNODE_SENTINEL 0xfedcba0987654321
+#define PGD_SENTINEL 0x4321432143214321
+#endif
+#else
+#define DECL_SENTINEL
+#define SET_SENTINEL(_x,_y) do { } while (0)
+#define ASSERT_SENTINEL(_x,_y) do { } while (0)
+#define INVERT_SENTINEL(_x,_y) do { } while (0)
+#endif
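+
+/* Hedged usage note (not in the original source): for a structure FOO
+ * whose definition includes DECL_SENTINEL, the intended lifecycle is
+ *   SET_SENTINEL(p,FOO)     at allocation,
+ *   ASSERT_SENTINEL(p,FOO)  on each subsequent use, and
+ *   INVERT_SENTINEL(p,FOO)  at free,
+ * so a use-after-free or double-free trips an ASSERT in debug builds,
+ * while all of these macros compile away when NDEBUG is defined.
+ */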
+
+/* global statistics (none need to be locked) */
+static unsigned long total_tmem_ops = 0;
+static unsigned long errored_tmem_ops = 0;
+static unsigned long total_flush_pool = 0;
+static unsigned long alloc_failed = 0, alloc_page_failed = 0;
+static unsigned long evicted_pgs = 0, evict_attempts = 0;
+static unsigned long relinq_pgs = 0, relinq_attempts = 0;
+static unsigned long max_evicts_per_relinq = 0;
+static unsigned long low_on_memory = 0;
+static int global_obj_count_max = 0;
+static int global_pgp_count_max = 0;
+static int global_page_count_max = 0;
+static int global_rtree_node_count_max = 0;
+static long global_eph_count_max = 0;
+static unsigned long failed_copies;
+
+DECL_CYC_COUNTER(succ_get);
+DECL_CYC_COUNTER(succ_put);
+DECL_CYC_COUNTER(non_succ_get);
+DECL_CYC_COUNTER(non_succ_put);
+DECL_CYC_COUNTER(flush);
+DECL_CYC_COUNTER(flush_obj);
+#ifdef COMPARE_COPY_PAGE_SSE2
+EXTERN_CYC_COUNTER(pg_copy1);
+EXTERN_CYC_COUNTER(pg_copy2);
+EXTERN_CYC_COUNTER(pg_copy3);
+EXTERN_CYC_COUNTER(pg_copy4);
+#else
+EXTERN_CYC_COUNTER(pg_copy);
+#endif
+DECL_CYC_COUNTER(compress);
+DECL_CYC_COUNTER(decompress);
+
+/************ CORE DATA STRUCTURES ************************************/
+
+#define MAX_POOLS_PER_DOMAIN 16
+#define MAX_GLOBAL_SHARED_POOLS 16
+
+struct tm_pool;
+struct client {
+ struct list_head client_list;
+ struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
+ tmh_client_t *tmh;
+ struct list_head ephemeral_page_list;
+ long eph_count, eph_count_max;
+ cli_id_t cli_id;
+ uint32_t weight;
+ uint32_t cap;
+ bool_t compress;
+ bool_t frozen;
+ unsigned long compress_poor, compress_nomem;
+ unsigned long compressed_pages;
+ uint64_t compressed_sum_size;
+};
+typedef struct client client_t;
+
+struct share_list {
+ struct list_head share_list;
+ client_t *client;
+};
+typedef struct share_list sharelist_t;
+
+#define OBJ_HASH_BUCKETS 256 /* must be power of two */
+#define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
+#define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
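+
+/* For example (tmh_hash itself lives in the host-dependent layer): with
+ * 256 buckets the mask is 0xff, so an object id lands in bucket
+ * tmh_hash(oid, BITS_PER_LONG) & 0xff.
+ */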
+
+struct tm_pool {
+ bool_t shared;
+ bool_t persistent;
+ struct list_head pool_list; /* FIXME do we need this anymore? */
+ client_t *client;
+ uint64_t uuid[2]; /* 0 for private, non-zero for shared */
+ uint32_t pool_id;
+ rwlock_t pool_rwlock;
+ struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
+ struct list_head share_list; /* valid if shared */
+ DECL_SENTINEL
+ int shared_count; /* valid if shared */
+ atomic_t pgp_count;
+ int pgp_count_max;
+ long obj_count; /* atomicity depends on pool_rwlock held for write */
+ long obj_count_max;
+ unsigned long objnode_count, objnode_count_max;
+ uint64_t sum_life_cycles;
+ uint64_t sum_evicted_cycles;
+ unsigned long puts, good_puts, no_mem_puts;
+ unsigned long dup_puts_flushed, dup_puts_replaced;
+ unsigned long gets, found_gets;
+ unsigned long flushs, flushs_found;
+ unsigned long flush_objs, flush_objs_found;
+};
+typedef struct tm_pool pool_t;
+
+#define is_persistent(_p) (_p->persistent)
+#define is_ephemeral(_p) (!(_p->persistent))
+#define is_shared(_p) (_p->shared)
+#define is_private(_p) (!(_p->shared))
+
+struct tmem_object_root {
+ DECL_SENTINEL
+ uint64_t oid;
+ struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
+ unsigned long objnode_count; /* atomicity depends on obj_spinlock */
+ long pgp_count; /* atomicity depends on obj_spinlock */
+ struct radix_tree_root tree_root; /* tree of pages within object */
+ pool_t *pool;
+ cli_id_t last_client;
+ spinlock_t obj_spinlock;
+ bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
+};
+typedef struct tmem_object_root obj_t;
+
+typedef struct radix_tree_node rtn_t;
+struct tmem_object_node {
+ obj_t *obj;
+ DECL_SENTINEL
+ rtn_t rtn;
+};
+typedef struct tmem_object_node objnode_t;
+
+struct tmem_page_descriptor {
+ struct list_head global_eph_pages;
+ struct list_head client_eph_pages;
+ obj_t *obj;
+ uint32_t index;
+ size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */
+ union {
+ pfp_t *pfp; /* page frame pointer */
+ char *cdata; /* compressed data */
+ };
+ uint64_t timestamp;
+ DECL_SENTINEL
+};
+typedef struct tmem_page_descriptor pgp_t;
+
+static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
+
+static LIST_HEAD(global_client_list);
+static LIST_HEAD(global_pool_list);
+
+static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
+static atomic_t client_weight_total = ATOMIC_INIT(0);
+static int tmem_initialized = 0;
+
+/************ CONCURRENCY ***********************************************/
+
+EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
+EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
+static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
+
+#define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
+#define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
+#define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
+#define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
+#define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
+#define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
+#define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
+#define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
+
+#define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
+#define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
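+
+/* Hedged summary of the two locking modes sketched above: when
+ * tmh_lock_all is set, the single global tmem_spinlock is meant to
+ * serialise every tmem operation, the fine-grained tmem_spin_*,
+ * tmem_read_* and tmem_write_* wrappers become no-ops, and the trylock
+ * variants always report success; when tmh_lock_all is clear, tmem_rwlock
+ * plus the per-pool rwlocks and per-object spinlocks are used instead.
+ */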
+
+/* global counters (should use long_atomic_t access) */
+static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
+static atomic_t global_obj_count = ATOMIC_INIT(0);
+static atomic_t global_pgp_count = ATOMIC_INIT(0);
+static atomic_t global_page_count = ATOMIC_INIT(0);
+static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
+
+#define atomic_inc_and_max(_c) do { \
+ atomic_inc(&_c); \
+ if ( _atomic_read(_c) > _c##_max ) \
+ _c##_max = _atomic_read(_c); \
+} while (0)
+
+#define atomic_dec_and_assert(_c) do { \
+ atomic_dec(&_c); \
+ ASSERT(_atomic_read(_c) >= 0); \
+} while (0)
+
+
+/************ MEMORY ALLOCATION INTERFACE *****************************/
+
+#define tmem_malloc(_type,_pool) \
+ _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
+
+#define tmem_malloc_bytes(_size,_pool) \
+ _tmem_malloc(_size, 1, _pool)
+
+static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
+{
+ void *v;
+
+ if ( (pool != NULL) && is_persistent(pool) )
+ v = tmh_alloc_subpage_thispool(pool,size,align);
+ else
+ v = tmh_alloc_subpage(pool, size, align);
+ if ( v == NULL )
+ alloc_failed++;
+ return v;
+}
+
+static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
+{
+ if ( pool == NULL || !is_persistent(pool) )
+ tmh_free_subpage(p,size);
+ else
+ tmh_free_subpage_thispool(pool,p,size);
+}
+
+static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
+{
+ pfp_t *pfp = NULL;
+
+ if ( pool != NULL && is_persistent(pool) )
+ pfp = tmh_alloc_page_thispool(pool);
+ else
+ pfp = tmh_alloc_page(pool,0);
+ if ( pfp == NULL )
+ alloc_page_failed++;
+ else
+ atomic_inc_and_max(global_page_count);
+ return pfp;
+}
+
+static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
+{
+ ASSERT(pfp);
+ if ( pool == NULL || !is_persistent(pool) )
+ tmh_free_page(pfp);
+ else
+ tmh_free_page_thispool(pool,pfp);
+ atomic_dec_and_assert(global_page_count);
+}
+
+/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
+
+/* allocate a pgp_t and associate it with an object */
+static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
+{
+ pgp_t *pgp;
+ pool_t *pool;
+
+ ASSERT(obj != NULL);
+ ASSERT(obj->pool != NULL);
+ pool = obj->pool;
+ if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
+ return NULL;
+ pgp->obj = obj;
+ INIT_LIST_HEAD(&pgp->global_eph_pages);
+ INIT_LIST_HEAD(&pgp->client_eph_pages);
+ pgp->pfp = NULL;
+ pgp->size = -1;
+ pgp->index = -1;
+ pgp->timestamp = get_cycles();
+ SET_SENTINEL(pgp,PGD);
+ atomic_inc_and_max(global_pgp_count);
+ atomic_inc_and_max(pool->pgp_count);
+ return pgp;
+}
+
+static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
+{
+ ASSERT(obj != NULL);
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ ASSERT_SENTINEL(obj,OBJ);
+ ASSERT(obj->pool != NULL);
+ ASSERT_SENTINEL(obj->pool,POOL);
+ return radix_tree_lookup(&obj->tree_root, index);
+}
+
+static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
+{
+ if ( pgp->pfp == NULL )
+ return;
+ if ( !pgp->size )
+ tmem_page_free(pgp->obj->pool,pgp->pfp);
+ else
+ {
+ tmem_free(pgp->cdata,pgp->size,pool);
+ if ( pool != NULL )
+ {
+ pool->client->compressed_pages--;
+ pool->client->compressed_sum_size -= pgp->size;
+ }
+ }
+ pgp->pfp = NULL;
+ pgp->size = -1;
+}
+
+static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
+{
+ pool_t *pool = NULL;
+
+ ASSERT_SENTINEL(pgp,PGD);
+ ASSERT(pgp->obj != NULL);
+ ASSERT_SENTINEL(pgp->obj,OBJ);
+ ASSERT_SENTINEL(pgp->obj->pool,POOL);
+ ASSERT(list_empty(&pgp->global_eph_pages));
+ ASSERT(list_empty(&pgp->client_eph_pages));
+ if ( from_delete )
+ ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
+ ASSERT(pgp->obj->pool != NULL);
+ pool = pgp->obj->pool;
+ pgp_free_data(pgp, pool);
+ INVERT_SENTINEL(pgp,PGD);
+ pgp->obj = NULL;
+ pgp->index = -1;
+ pgp->size = -1;
+ atomic_dec_and_assert(global_pgp_count);
+ atomic_dec_and_assert(pool->pgp_count);
+ tmem_free(pgp,sizeof(pgp_t),pool);
+}
+
+/* remove the page from appropriate lists but not from parent object */
+static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
+{
+ ASSERT(pgp != NULL);
+ ASSERT(pgp->obj != NULL);
+ ASSERT(pgp->obj->pool != NULL);
+ ASSERT(pgp->obj->pool->client != NULL);
+ if ( is_ephemeral(pgp->obj->pool) )
+ {
+ if ( !no_eph_lock )
+ tmem_spin_lock(&eph_lists_spinlock);
+ if ( !list_empty(&pgp->client_eph_pages) )
+ pgp->obj->pool->client->eph_count--;
+ ASSERT(pgp->obj->pool->client->eph_count >= 0);
+ list_del_init(&pgp->client_eph_pages);
+ if ( !list_empty(&pgp->global_eph_pages) )
+ global_eph_count--;
+ ASSERT(global_eph_count >= 0);
+ list_del_init(&pgp->global_eph_pages);
+ if ( !no_eph_lock )
+ tmem_spin_unlock(&eph_lists_spinlock);
+ }
+}
+
+/* remove page from lists (but not from parent object) and free it */
+static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
+{
+ uint64_t life;
+
+ ASSERT(pgp != NULL);
+ ASSERT(pgp->obj != NULL);
+ ASSERT(pgp->obj->pool != NULL);
+ life = get_cycles() - pgp->timestamp;
+ pgp->obj->pool->sum_life_cycles += life;
+ pgp_delist(pgp, no_eph_lock);
+ pgp_free(pgp,1);
+}
+
+/* called only indirectly by radix_tree_destroy */
+static NOINLINE void pgp_destroy(void *v)
+{
+ pgp_t *pgp = (pgp_t *)v;
+
+ ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
+ pgp_delist(pgp,0);
+ ASSERT(pgp->obj != NULL);
+ pgp->obj->pgp_count--;
+ ASSERT(pgp->obj->pgp_count >= 0);
+ pgp_free(pgp,0);
+}
+
+FORWARD static rtn_t *rtn_alloc(void *arg);
+FORWARD static void rtn_free(rtn_t *rtn);
+
+static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
+{
+ int ret;
+
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
+ if ( !ret )
+ obj->pgp_count++;
+ return ret;
+}
+
+static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
+{
+ pgp_t *pgp;
+
+ ASSERT(obj != NULL);
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ ASSERT_SENTINEL(obj,OBJ);
+ ASSERT(obj->pool != NULL);
+ ASSERT_SENTINEL(obj->pool,POOL);
+ pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
+ if ( pgp != NULL )
+ obj->pgp_count--;
+ ASSERT(obj->pgp_count >= 0);
+
+ return pgp;
+}
+
+/************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
+
+/* called only indirectly from radix_tree_insert */
+static NOINLINE rtn_t *rtn_alloc(void *arg)
+{
+ objnode_t *objnode;
+ obj_t *obj = (obj_t *)arg;
+
+ ASSERT_SENTINEL(obj,OBJ);
+ ASSERT(obj->pool != NULL);
+ ASSERT_SENTINEL(obj->pool,POOL);
+ objnode = tmem_malloc(objnode_t,obj->pool);
+ if (objnode == NULL)
+ return NULL;
+ objnode->obj = obj;
+ SET_SENTINEL(objnode,OBJNODE);
+ memset(&objnode->rtn, 0, sizeof(rtn_t));
+ if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
+ obj->pool->objnode_count_max = obj->pool->objnode_count;
+ atomic_inc_and_max(global_rtree_node_count);
+ obj->objnode_count++;
+ return &objnode->rtn;
+}
+
+/* called only indirectly from radix_tree_delete/destroy */
+static void rtn_free(rtn_t *rtn)
+{
+ pool_t *pool;
+ objnode_t *objnode;
+ int i;
+
+ ASSERT(rtn != NULL);
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
+ ASSERT(rtn->slots[i] == NULL);
+ objnode = container_of(rtn,objnode_t,rtn);
+ ASSERT_SENTINEL(objnode,OBJNODE);
+ INVERT_SENTINEL(objnode,OBJNODE);
+ ASSERT(objnode->obj != NULL);
+ ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
+ ASSERT_SENTINEL(objnode->obj,OBJ);
+ pool = objnode->obj->pool;
+ ASSERT(pool != NULL);
+ ASSERT_SENTINEL(pool,POOL);
+ pool->objnode_count--;
+ objnode->obj->objnode_count--;
+ objnode->obj = NULL;
+ tmem_free(objnode,sizeof(objnode_t),pool);
+ atomic_dec_and_assert(global_rtree_node_count);
+}
+
+/************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
+
+/* searches for object==oid in pool, returns locked object if found */
+static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
+{
+ struct rb_node *node;
+ obj_t *obj;
+
+restart_find:
+ tmem_read_lock(&pool->pool_rwlock);
+ node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
+ while ( node )
+ {
+ obj = container_of(node, obj_t, rb_tree_node);
+ if ( obj->oid == oid )
+ {
+ if ( tmh_lock_all )
+ obj->no_evict = 1;
+ else
+ {
+ if ( !tmem_spin_trylock(&obj->obj_spinlock) )
+ {
+ tmem_read_unlock(&pool->pool_rwlock);
+ goto restart_find;
+ }
+ tmem_read_unlock(&pool->pool_rwlock);
+ }
+ return obj;
+ }
+ else if ( oid < obj->oid )
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ tmem_read_unlock(&pool->pool_rwlock);
+ return NULL;
+}
+
+/* free an object that has no more pgps in it */
+static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
+{
+ pool_t *pool;
+ uint64_t old_oid;
+
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ ASSERT(obj != NULL);
+ ASSERT_SENTINEL(obj,OBJ);
+ ASSERT(obj->pgp_count == 0);
+ pool = obj->pool;
+ ASSERT(pool != NULL);
+ ASSERT_WRITELOCK(&pool->pool_rwlock);
+ if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
+ radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
+ ASSERT((long)obj->objnode_count == 0);
+ ASSERT(obj->tree_root.rnode == NULL);
+ pool->obj_count--;
+ ASSERT(pool->obj_count >= 0);
+ INVERT_SENTINEL(obj,OBJ);
+ obj->pool = NULL;
+ old_oid = obj->oid;
+ obj->oid = -1;
+ obj->last_client = CLI_ID_NULL;
+ atomic_dec_and_assert(global_obj_count);
+ /* use no_rebalance only if all objects are being destroyed anyway */
+ if ( !no_rebalance )
+ rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
+ tmem_free(obj,sizeof(obj_t),pool);
+}
+
+static NOINLINE void obj_rb_destroy_node(struct rb_node *node)
+{
+ obj_t * obj;
+
+ if ( node == NULL )
+ return;
+ obj_rb_destroy_node(node->rb_left);
+ obj_rb_destroy_node(node->rb_right);
+ obj = container_of(node, obj_t, rb_tree_node);
+ tmem_spin_lock(&obj->obj_spinlock);
+ ASSERT(obj->no_evict == 0);
+ radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
+ obj_free(obj,1);
+}
+
+static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
+{
+ struct rb_node **new, *parent = NULL;
+ obj_t *this;
+
+ new = &(root->rb_node);
+ while ( *new )
+ {
+ this = container_of(*new, obj_t, rb_tree_node);
+ parent = *new;
+ if ( obj->oid < this->oid )
+ new = &((*new)->rb_left);
+ else if ( obj->oid > this->oid )
+ new = &((*new)->rb_right);
+ else
+ return 0;
+ }
+ rb_link_node(&obj->rb_tree_node, parent, new);
+ rb_insert_color(&obj->rb_tree_node, root);
+ return 1;
+}
+
+/*
+ * allocate, initialize, and insert a tmem_object_root
+ * (should be called only if find failed)
+ */
+static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
+{
+ obj_t *obj;
+
+ ASSERT(pool != NULL);
+ ASSERT_WRITELOCK(&pool->pool_rwlock);
+ if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
+ return NULL;
+ pool->obj_count++;
+ if (pool->obj_count > pool->obj_count_max)
+ pool->obj_count_max = pool->obj_count;
+ atomic_inc_and_max(global_obj_count);
+ INIT_RADIX_TREE(&obj->tree_root,0);
+ spin_lock_init(&obj->obj_spinlock);
+ obj->pool = pool;
+ obj->oid = oid;
+ obj->objnode_count = 0;
+ obj->pgp_count = 0;
+ obj->last_client = CLI_ID_NULL;
+ SET_SENTINEL(obj,OBJ);
+ tmem_spin_lock(&obj->obj_spinlock);
+ obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
+ obj->no_evict = 1;
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ return obj;
+}
+
+/* free an object after destroying any pgps in it */
+static NOINLINE void obj_destroy(obj_t *obj)
+{
+ ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
+ radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
+ obj_free(obj,0);
+}
+
+/* destroy all objects in a pool */
+static NOINLINE void obj_rb_destroy_all(pool_t *pool)
+{
+ int i;
+
+ tmem_write_lock(&pool->pool_rwlock);
+ for (i = 0; i < OBJ_HASH_BUCKETS; i++)
+ obj_rb_destroy_node(pool->obj_rb_root[i].rb_node);
+ tmem_write_unlock(&pool->pool_rwlock);
+}
+
+/* destroys all objects in a pool that have last_client set to cli_id */
+static void obj_free_selective(pool_t *pool, cli_id_t cli_id)
+{
+ struct rb_node *node;
+ obj_t *obj;
+ int i;
+
+ tmem_write_lock(&pool->pool_rwlock);
+ for (i = 0; i < OBJ_HASH_BUCKETS; i++)
+ {
+ node = rb_first(&pool->obj_rb_root[i]);
+ while ( node != NULL )
+ {
+ obj = container_of(node, obj_t, rb_tree_node);
+ tmem_spin_lock(&obj->obj_spinlock);
+ node = rb_next(node);
+ if ( obj->last_client == cli_id )
+ obj_destroy(obj);
+ else
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ }
+ tmem_write_unlock(&pool->pool_rwlock);
+}
+
+
+/************ POOL MANIPULATION ROUTINES ******************************/
+
+static pool_t * pool_alloc(void)
+{
+ pool_t *pool;
+ int i;
+
+ if ( (pool = tmem_malloc(pool_t,NULL)) == NULL )
+ return NULL;
+ for (i = 0; i < OBJ_HASH_BUCKETS; i++)
+ pool->obj_rb_root[i] = RB_ROOT;
+ INIT_LIST_HEAD(&pool->pool_list);
+ rwlock_init(&pool->pool_rwlock);
+ pool->pgp_count_max = pool->obj_count_max = 0;
+ pool->objnode_count = pool->objnode_count_max = 0;
+ atomic_set(&pool->pgp_count,0);
+ pool->obj_count = 0;
+ pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
+ pool->dup_puts_replaced = pool->no_mem_puts = 0;
+ pool->found_gets = pool->gets = 0;
+ pool->flushs_found = pool->flushs = 0;
+ pool->flush_objs_found = pool->flush_objs = 0;
+ SET_SENTINEL(pool,POOL);
+ return pool;
+}
+
+static NOINLINE void pool_free(pool_t *pool)
+{
+ ASSERT_SENTINEL(pool,POOL);
+ INVERT_SENTINEL(pool,POOL);
+ pool->client = NULL;
+ list_del(&pool->pool_list);
+ tmem_free(pool,sizeof(pool_t),NULL);
+}
+
+/* register new_client as a user of this shared pool and return new
+ total number of registered users */
+static int shared_pool_join(pool_t *pool, client_t *new_client)
+{
+ sharelist_t *sl;
+
+ ASSERT(is_shared(pool));
+ if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
+ return -1;
+ sl->client = new_client;
+ list_add_tail(&sl->share_list, &pool->share_list);
+ printk("adding new %s %d to shared pool owned by %s %d\n",
+ client_str, new_client->cli_id, client_str, pool->client->cli_id);
+ return ++pool->shared_count;
+}
+
+/* reassign "ownership" of the pool to another client that shares this pool */
+static NOINLINE void shared_pool_reassign(pool_t *pool)
+{
+ sharelist_t *sl;
+ int poolid;
+ client_t *old_client = pool->client, *new_client;
+
+ ASSERT(is_shared(pool));
+ if ( list_empty(&pool->share_list) )
+ {
+ ASSERT(pool->shared_count == 0);
+ return;
+ }
+ old_client->pools[pool->pool_id] = NULL;
+ sl = list_entry(pool->share_list.next, sharelist_t, share_list);
+ ASSERT(sl->client != old_client);
+ pool->client = new_client = sl->client;
+ for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
+ if (new_client->pools[poolid] == pool)
+ break;
+ ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
+ printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
+ cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
+ pool->pool_id = poolid;
+}
+
+/* destroy all objects whose last_client matches the passed cli_id, and
+ remove cli_id from this pool's list of sharers */
+static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
+{
+ sharelist_t *sl;
+ int s_poolid;
+
+ ASSERT(is_shared(pool));
+ ASSERT(pool->client != NULL);
+
+ obj_free_selective(pool,cli_id);
+ list_for_each_entry(sl,&pool->share_list, share_list)
+ {
+ if (sl->client->cli_id != cli_id)
+ continue;
+ list_del(&sl->share_list);
+ tmem_free(sl,sizeof(sharelist_t),pool);
+ --pool->shared_count;
+ if (pool->client->cli_id == cli_id)
+ shared_pool_reassign(pool);
+ if (pool->shared_count)
+ return pool->shared_count;
+ for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
+ if ( (global_shared_pools[s_poolid]) == pool )
+ {
+ global_shared_pools[s_poolid] = NULL;
+ break;
+ }
+ return 0;
+ }
+ printk("tmem: no match unsharing pool, %s=%d\n",
+ cli_id_str,pool->client->cli_id);
+ return -1;
+}
+
+/* flush all data (owned by cli_id) from a pool and, optionally, free it */
+static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
+{
+ ASSERT(pool != NULL);
+ if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
+ {
+ printk("tmem: unshared shared pool %d from %s=%d\n",
+ pool->pool_id, cli_id_str,pool->client->cli_id);
+ return;
+ }
+ printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
+ is_persistent(pool) ? "persistent" : "ephemeral" ,
+ is_shared(pool) ? "shared" : "private");
+ printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
+ obj_rb_destroy_all(pool);
+ if ( destroy )
+ {
+ pool->client->pools[pool->pool_id] = NULL;
+ pool_free(pool);
+ }
+}
+
+/************ CLIENT MANIPULATION OPERATIONS **************************/
+
+static client_t *client_create(void)
+{
+ client_t *client = tmem_malloc(client_t,NULL);
+ cli_id_t cli_id = tmh_get_cli_id_from_current();
+
+ printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
+ if ( client == NULL )
+ {
+ printk("failed... out of memory\n");
+ return NULL;
+ }
+ memset(client,0,sizeof(client_t));
+ if ( (client->tmh = tmh_client_init()) == NULL )
+ {
+ printk("failed... can't allocate host-dependent part of client\n");
+ if ( client )
+ tmem_free(client,sizeof(client_t),NULL);
+ return NULL;
+ }
+ tmh_set_current_client(client);
+ client->cli_id = cli_id;
+#ifdef __i386__
+ client->compress = 0;
+#else
+ client->compress = tmh_compression_enabled();
+#endif
+ list_add_tail(&client->client_list, &global_client_list);
+ INIT_LIST_HEAD(&client->ephemeral_page_list);
+ client->eph_count = client->eph_count_max = 0;
+ printk("ok\n");
+ return client;
+}
+
+static void client_free(client_t *client)
+{
+ list_del(&client->client_list);
+ tmh_client_destroy(client->tmh);
+ tmh_set_current_client(NULL);
+ tmem_free(client,sizeof(client_t),NULL);
+}
+
+/* flush all data from a client and, optionally, free it */
+static void client_flush(client_t *client, bool_t destroy)
+{
+ int i;
+ pool_t *pool;
+
+ for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
+ {
+ if ( (pool = client->pools[i]) == NULL )
+ continue;
+ pool_flush(pool,client->cli_id,destroy);
+ if ( destroy )
+ client->pools[i] = NULL;
+ }
+ if ( destroy )
+ client_free(client);
+}
+
+static bool_t client_over_quota(client_t *client)
+{
+ int total = _atomic_read(client_weight_total);
+
+ ASSERT(client != NULL);
+ if ( (total == 0) || (client->weight == 0) ||
+ (client->eph_count == 0) )
+ return 0;
+ return ( ((global_eph_count*100L) / client->eph_count ) >
+ ((total*100L) / client->weight) );
+}
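+
+/* Worked example of the comparison above (illustrative numbers only):
+ * with client->weight == 1, total == 4, client->eph_count == 100 and
+ * global_eph_count == 500, the left side is 500*100/100 == 500 and the
+ * right side is 4*100/1 == 400, so client_over_quota() returns true.
+ */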
+
+/************ MEMORY REVOCATION ROUTINES *******************************/
+
+static int tmem_evict(void)
+{
+ client_t *client = tmh_client_from_current();
+ pgp_t *pgp = NULL, *pgp_del;
+ obj_t *obj;
+ pool_t *pool;
+ int ret = 0;
+ bool_t hold_pool_rwlock = 0;
+
+ evict_attempts++;
+ tmem_spin_lock(&eph_lists_spinlock);
+ if ( (client != NULL) && client_over_quota(client) &&
+ !list_empty(&client->ephemeral_page_list) )
+ {
+ list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
+ {
+ obj = pgp->obj;
+ pool = obj->pool;
+ if ( tmh_lock_all && !obj->no_evict )
+ goto found;
+ if ( tmem_spin_trylock(&obj->obj_spinlock) )
+ {
+ if ( obj->pgp_count > 1 )
+ goto found;
+ if ( tmem_write_trylock(&pool->pool_rwlock) )
+ {
+ hold_pool_rwlock = 1;
+ goto found;
+ }
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ }
+ } else if ( list_empty(&global_ephemeral_page_list) ) {
+ goto out;
+ } else {
+ list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
+ {
+ obj = pgp->obj;
+ pool = obj->pool;
+ if ( tmh_lock_all && !obj->no_evict )
+ goto found;
+ if ( tmem_spin_trylock(&obj->obj_spinlock) )
+ {
+ if ( obj->pgp_count > 1 )
+ goto found;
+ if ( tmem_write_trylock(&pool->pool_rwlock) )
+ {
+ hold_pool_rwlock = 1;
+ goto found;
+ }
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ }
+ }
+
+ ret = 0;
+ goto out;
+
+found:
+ ASSERT(pgp != NULL);
+ ASSERT_SENTINEL(pgp,PGD);
+ obj = pgp->obj;
+ ASSERT(obj != NULL);
+ ASSERT(obj->no_evict == 0);
+ ASSERT(obj->pool != NULL);
+ ASSERT_SENTINEL(obj,OBJ);
+
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ pgp_del = pgp_delete_from_obj(obj, pgp->index);
+ ASSERT(pgp_del == pgp);
+ pgp_delete(pgp,1);
+ if ( obj->pgp_count == 0 )
+ {
+ ASSERT_WRITELOCK(&pool->pool_rwlock);
+ obj_free(obj,0);
+ }
+ else
+ tmem_spin_unlock(&obj->obj_spinlock);
+ if ( hold_pool_rwlock )
+ tmem_write_unlock(&pool->pool_rwlock);
+ evicted_pgs++;
+ ret = 1;
+
+out:
+ tmem_spin_unlock(&eph_lists_spinlock);
+ return ret;
+}
+
+static unsigned long tmem_relinquish_npages(unsigned long n)
+{
+ unsigned long avail_pages = 0;
+
+ while ( (avail_pages = tmh_avail_pages()) < n )
+ {
+ if ( !tmem_evict() )
+ break;
+ }
+ if ( avail_pages )
+ tmh_release_avail_pages_to_host();
+ return avail_pages;
+}
+
+/************ TMEM CORE OPERATIONS ************************************/
+
+static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn)
+{
+ void *dst, *p;
+ size_t size;
+ int ret = 0;
+ DECL_LOCAL_CYC_COUNTER(compress);
+
+ ASSERT(pgp != NULL);
+ ASSERT(pgp->obj != NULL);
+ ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
+ ASSERT(pgp->obj->pool != NULL);
+ ASSERT(pgp->obj->pool->client != NULL);
+#ifdef __i386__
+ return -ENOMEM;
+#endif
+ if ( pgp->pfp != NULL )
+ pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */
+ START_CYC_COUNTER(compress);
+ ret = tmh_compress_from_client(cmfn, &dst, &size);
+ if ( (ret == -EFAULT) || (ret == 0) )
+ goto out;
+ else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
+ ret = 0;
+ else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
+ ret = -ENOMEM;
+ else
+ {
+ memcpy(p,dst,size);
+ pgp->cdata = p;
+ pgp->size = size;
+ pgp->obj->pool->client->compressed_pages++;
+ pgp->obj->pool->client->compressed_sum_size += size;
+ ret = 1;
+ }
+
+out:
+ END_CYC_COUNTER(compress);
+ return ret;
+}
+
+static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
+ uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
+{
+ pool_t *pool;
+ obj_t *obj;
+ client_t *client;
+ pgp_t *pgpfound = NULL;
+ int ret;
+
+ /* if we can successfully manipulate pgp to change out the data, do so */
+ ASSERT(pgp != NULL);
+ ASSERT(pgp->pfp != NULL);
+ ASSERT(pgp->size != -1);
+ obj = pgp->obj;
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ ASSERT(obj != NULL);
+ pool = obj->pool;
+ ASSERT(pool != NULL);
+ client = pool->client;
+ if ( len != 0 && tmh_compression_enabled() &&
+ client->compress && pgp->size != 0 )
+ {
+ ret = do_tmem_put_compress(pgp,cmfn);
+ if ( ret == 1 )
+ goto done;
+ else if ( ret == 0 )
+ goto copy_uncompressed;
+ else if ( ret == -ENOMEM )
+ goto failed_dup;
+ else if ( ret == -EFAULT )
+ goto bad_copy;
+ }
+
+copy_uncompressed:
+ if ( pgp->pfp )
+ pgp_free_data(pgp, pool);
+ if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
+ goto failed_dup;
+ /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
+ ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
+ if ( ret == -EFAULT )
+ goto bad_copy;
+ pgp->size = 0;
+
+done:
+ /* successfully replaced data, clean up and return success */
+ if ( is_shared(pool) )
+ obj->last_client = client->cli_id;
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ pool->dup_puts_replaced++;
+ pool->good_puts++;
+ return 1;
+
+bad_copy:
+ /* this should only happen if the client passed a bad mfn */
+ failed_copies++;
+ASSERT(0);
+ return -EFAULT;
+
+failed_dup:
+ /* couldn't change out the data, flush the old data and return
+ * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
+ pgpfound = pgp_delete_from_obj(obj, pgp->index);
+ ASSERT(pgpfound == pgp);
+ pgp_delete(pgpfound,0);
+ if ( obj->pgp_count == 0 )
+ {
+ tmem_write_lock(&pool->pool_rwlock);
+ obj_free(obj,0);
+ tmem_write_unlock(&pool->pool_rwlock);
+ } else {
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ pool->dup_puts_flushed++;
+ return -ENOSPC;
+}
+
+
+static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index,
+ tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
+ uint32_t pfn_offset, uint32_t len)
+{
+ obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
+ pgp_t *pgp = NULL, *pgpdel = NULL;
+ client_t *client = pool->client;
+ int ret = client->frozen ? -EFROZEN : -ENOMEM;
+
+ ASSERT(pool != NULL);
+ pool->puts++;
+ /* does page already exist (dup)? if so, handle specially */
+ if ( (obj = objfound = obj_find(pool,oid)) != NULL )
+ {
+ ASSERT_SPINLOCK(&objfound->obj_spinlock);
+ if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
+ return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
+ }
+
+ /* no puts allowed into a frozen pool (except dup puts) */
+ if ( client->frozen )
+ goto free;
+
+ if ( (objfound == NULL) )
+ {
+ tmem_write_lock(&pool->pool_rwlock);
+ if ( (obj = objnew = obj_new(pool,oid)) == NULL )
+ {
+ tmem_write_unlock(&pool->pool_rwlock);
+ return -ENOMEM;
+ }
+ ASSERT_SPINLOCK(&objnew->obj_spinlock);
+ tmem_write_unlock(&pool->pool_rwlock);
+ }
+
+ ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ if ( (pgp = pgp_alloc(obj)) == NULL )
+ goto free;
+
+ ret = pgp_add_to_obj(obj, index, pgp);
+ if ( ret == -ENOMEM )
+ /* warning, may result in partially built radix tree ("stump") */
+ goto free;
+ ASSERT(ret != -EEXIST);
+ pgp->index = index;
+
+ if ( len != 0 && tmh_compression_enabled() && client->compress )
+ {
+ ASSERT(pgp->pfp == NULL);
+ ret = do_tmem_put_compress(pgp,cmfn);
+ if ( ret == 1 )
+ goto insert_page;
+ if ( ret == -ENOMEM )
+ {
+ client->compress_nomem++;
+ goto delete_and_free;
+ }
+ if ( ret == 0 )
+ {
+ client->compress_poor++;
+ goto copy_uncompressed;
+ }
+ if ( ret == -EFAULT )
+ goto bad_copy;
+ }
+
+copy_uncompressed:
+ if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
+ {
+ ret = -ENOMEM;
+ goto delete_and_free;
+ }
+ /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
+ ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
+ if ( ret == -EFAULT )
+ goto bad_copy;
+ pgp->size = 0;
+
+insert_page:
+ if ( is_ephemeral(pool) )
+ {
+ tmem_spin_lock(&eph_lists_spinlock);
+ list_add_tail(&pgp->global_eph_pages,
+ &global_ephemeral_page_list);
+ if (++global_eph_count > global_eph_count_max)
+ global_eph_count_max = global_eph_count;
+ list_add_tail(&pgp->client_eph_pages,
+ &client->ephemeral_page_list);
+ if (++client->eph_count > client->eph_count_max)
+ client->eph_count_max = client->eph_count;
+ tmem_spin_unlock(&eph_lists_spinlock);
+ }
+ ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
+ if ( is_shared(pool) )
+ obj->last_client = client->cli_id;
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ pool->good_puts++;
+ return 1;
+
+delete_and_free:
+ ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
+ pgpdel = pgp_delete_from_obj(obj, pgp->index);
+ ASSERT(pgp == pgpdel);
+
+free:
+ if ( pgp )
+ pgp_delete(pgp,0);
+ if ( objfound )
+ {
+ objfound->no_evict = 0;
+ tmem_spin_unlock(&objfound->obj_spinlock);
+ }
+ if ( objnew )
+ {
+ tmem_write_lock(&pool->pool_rwlock);
+ obj_free(objnew,0);
+ tmem_write_unlock(&pool->pool_rwlock);
+ }
+ pool->no_mem_puts++;
+ return ret;
+
+bad_copy:
+ /* this should only happen if the client passed a bad mfn */
+ failed_copies++;
+ASSERT(0);
+ goto free;
+}
+
+static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
+ tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
+ uint32_t pfn_offset, uint32_t len)
+{
+ obj_t *obj;
+ pgp_t *pgp;
+ client_t *client = pool->client;
+ DECL_LOCAL_CYC_COUNTER(decompress);
+
+ if ( !_atomic_read(pool->pgp_count) )
+ return -EEMPTY;
+
+ pool->gets++;
+ obj = obj_find(pool,oid);
+ if ( obj == NULL )
+ return 0;
+
+ ASSERT_SPINLOCK(&obj->obj_spinlock);
+ if (is_shared(pool) || is_persistent(pool) )
+ pgp = pgp_lookup_in_obj(obj, index);
+ else
+ pgp = pgp_delete_from_obj(obj, index);
+ if ( pgp == NULL )
+ {
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ return 0;
+ }
+ ASSERT(pgp->size != -1);
+ if ( pgp->size != 0 )
+ {
+ START_CYC_COUNTER(decompress);
+ if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT )
+ goto bad_copy;
+ END_CYC_COUNTER(decompress);
+ }
+ else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
+ pfn_offset, len) == -EFAULT)
+ goto bad_copy;
+ if ( is_ephemeral(pool) )
+ {
+ if ( is_private(pool) )
+ {
+ pgp_delete(pgp,0);
+ if ( obj->pgp_count == 0 )
+ {
+ tmem_write_lock(&pool->pool_rwlock);
+ obj_free(obj,0);
+ obj = NULL;
+ tmem_write_unlock(&pool->pool_rwlock);
+ }
+ } else {
+ tmem_spin_lock(&eph_lists_spinlock);
+ list_del(&pgp->global_eph_pages);
+ list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
+ list_del(&pgp->client_eph_pages);
+ list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+ tmem_spin_unlock(&eph_lists_spinlock);
+ ASSERT(obj != NULL);
+ obj->last_client = tmh_get_cli_id_from_current();
+ }
+ }
+ if ( obj != NULL )
+ {
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ pool->found_gets++;
+ return 1;
+
+bad_copy:
+ /* this should only happen if the client passed a bad mfn */
+ failed_copies++;
+ASSERT(0);
+ return -EFAULT;
+
+}
+
+static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
+{
+ obj_t *obj;
+ pgp_t *pgp;
+
+ pool->flushs++;
+ obj = obj_find(pool,oid);
+ if ( obj == NULL )
+ goto out;
+ pgp = pgp_delete_from_obj(obj, index);
+ if ( pgp == NULL )
+ {
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ goto out;
+ }
+ pgp_delete(pgp,0);
+ if ( obj->pgp_count == 0 )
+ {
+ tmem_write_lock(&pool->pool_rwlock);
+ obj_free(obj,0);
+ tmem_write_unlock(&pool->pool_rwlock);
+ } else {
+ obj->no_evict = 0;
+ tmem_spin_unlock(&obj->obj_spinlock);
+ }
+ pool->flushs_found++;
+
+out:
+ if ( pool->client->frozen )
+ return -EFROZEN;
+ else
+ return 1;
+}
+
+static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
+{
+ obj_t *obj;
+
+ pool->flush_objs++;
+ obj = obj_find(pool,oid);
+ if ( obj == NULL )
+ goto out;
+ tmem_write_lock(&pool->pool_rwlock);
+ obj_destroy(obj);
+ pool->flush_objs_found++;
+ tmem_write_unlock(&pool->pool_rwlock);
+
+out:
+ if ( pool->client->frozen )
+ return -EFROZEN;
+ else
+ return 1;
+}
+
+static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
+{
+ client_t *client = tmh_client_from_current();
+ pool_t *pool;
+
+ if ( client->pools == NULL )
+ return 0;
+ if ( (pool = client->pools[pool_id]) == NULL )
+ return 0;
+ client->pools[pool_id] = NULL;
+ pool_flush(pool,client->cli_id,1);
+ return 1;
+}
+
+static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi)
+{
+ client_t *client = tmh_client_from_current();
+ cli_id_t cli_id = tmh_get_cli_id_from_current();
+ int persistent = flags & TMEM_POOL_PERSIST;
+ int shared = flags & TMEM_POOL_SHARED;
+ int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
+ & TMEM_POOL_PAGESIZE_MASK;
+ int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
+ & TMEM_POOL_VERSION_MASK;
+ pool_t *pool, *shpool;
+ int s_poolid, d_poolid, first_unused_s_poolid;
+
+ ASSERT(client != NULL);
+ printk("tmem: allocating %s-%s tmem pool for %s=%d...",
+ persistent ? "persistent" : "ephemeral" ,
+ shared ? "shared" : "private", cli_id_str, cli_id);
+ if ( specversion != 0 )
+ {
+ printk("failed... unsupported spec version\n");
+ return -EPERM;
+ }
+ if ( pagebits != (PAGE_SHIFT - 12) )
+ {
+ printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
+ return -EPERM;
+ }
+ if ( (pool = pool_alloc()) == NULL )
+ {
+ printk("failed... out of memory\n");
+ return -ENOMEM;
+ }
+ for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
+ if ( client->pools[d_poolid] == NULL )
+ break;
+ if ( d_poolid == MAX_POOLS_PER_DOMAIN )
+ {
+ printk("failed... no more pool slots available for this %s\n",
+ client_str);
+ goto fail;
+ }
+ pool->shared = shared;
+ pool->client = client;
+ if ( shared )
+ {
+ first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
+ for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
+ {
+ if ( (shpool = global_shared_pools[s_poolid]) != NULL )
+ {
+ if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
+ {
+ printk("(matches shared pool uuid=%"PRIx64".%"PRIu64") ",
+ uuid_hi, uuid_lo);
+ printk("pool_id=%d\n",d_poolid);
+ client->pools[d_poolid] = global_shared_pools[s_poolid];
+ shared_pool_join(global_shared_pools[s_poolid], client);
+ pool_free(pool);
+ return d_poolid;
+ }
+ }
+ else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
+ first_unused_s_poolid = s_poolid;
+ }
+ if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
+ {
+ printk("tmem: failed... no global shared pool slots available\n");
+ goto fail;
+ }
+ else
+ {
+ INIT_LIST_HEAD(&pool->share_list);
+ pool->shared_count = 0;
+ global_shared_pools[first_unused_s_poolid] = pool;
+ (void)shared_pool_join(pool,client);
+ }
+ }
+ client->pools[d_poolid] = pool;
+ list_add_tail(&pool->pool_list, &global_pool_list);
+ pool->pool_id = d_poolid;
+ pool->persistent = persistent;
+ pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
+ printk("pool_id=%d\n",d_poolid);
+ return d_poolid;
+
+fail:
+ pool_free(pool);
+ return -EPERM;
+}
+
+/************ TMEM CONTROL OPERATIONS ************************************/
+
+/* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
+static int tmemc_freeze_pools(int cli_id, int arg)
+{
+ client_t *client;
+ bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
+ bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
+ char *s;
+
+ s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
+ if ( cli_id == CLI_ID_NULL )
+ {
+ list_for_each_entry(client,&global_client_list,client_list)
+ {
+ client->frozen = freeze;
+ printk("tmem: all pools %s for all %ss\n",s,client_str);
+ }
+ }
+ else
+ {
+ if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
+ return -1;
+ client->frozen = freeze;
+ printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
+ }
+ return 0;
+}
+
+static int tmemc_flush_mem(int cli_id, uint32_t kb)
+{
+ uint32_t npages, flushed_pages, flushed_kb;
+
+ if ( cli_id != CLI_ID_NULL )
+ {
+ printk("tmem: %s-specific flush not supported yet, use --all\n",
+ client_str);
+ return -1;
+ }
+ /* convert kb to pages, rounding up if necessary */
+ npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
+ flushed_pages = tmem_relinquish_npages(npages);
+ flushed_kb = flushed_pages << (PAGE_SHIFT-10);
+ return flushed_kb;
+}
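+
+/* Worked example of the rounding above (assuming PAGE_SHIFT == 12, i.e.
+ * 4kB pages, so PAGE_SHIFT-10 == 2): kb == 5 gives
+ * npages == (5 + 3) >> 2 == 2, and if both pages are actually
+ * relinquished the call reports flushed_kb == 2 << 2 == 8.
+ */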
+
+/*
+ * These tmemc_list* routines output lots of stats in a format that is
+ * intended to be program-parseable, not human-readable. Further, by
+ * tying each group of stats to a line format indicator (e.g. G= for
+ * global stats) and each individual stat to a two-letter specifier
+ * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
+ * global ephemeral pool), it should allow the stats reported to be
+ * forward and backwards compatible as tmem evolves.
+ */
+#define BSIZE 1024
+
+static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
+ uint32_t len, bool_t use_long)
+{
+ char info[BSIZE];
+ int i, n = 0, sum = 0;
+ pool_t *p;
+ bool_t s;
+
+ n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d%c",
+ c->cli_id, c->weight, c->cap, c->compress,
+ c->frozen, use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Ec:%ld,Em:%ld,cp:%ld,cb:%lld,cn:%ld,cm:%ld\n",
+ c->eph_count, c->eph_count_max,
+ c->compressed_pages, (long long)c->compressed_sum_size,
+ c->compress_poor, c->compress_nomem);
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
+ {
+ if ( (p = c->pools[i]) == NULL )
+ continue;
+ s = is_shared(p);
+ n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,PT:%c%c,U0:%llx,U1:%llx%c",
+ c->cli_id, p->pool_id,
+ is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
+ s ? p->uuid[0] : 0LL, s ? p->uuid[1] : 0LL,
+ use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
+ "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
+ "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
+ _atomic_read(p->pgp_count), p->pgp_count_max,
+ p->obj_count, p->obj_count_max,
+ p->objnode_count, p->objnode_count_max,
+ p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
+ p->no_mem_puts,
+ p->found_gets, p->gets,
+ p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
+ if ( sum + n >= len )
+ return sum;
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ }
+ return sum;
+}
+
+static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
+ bool_t use_long)
+{
+ char info[BSIZE];
+ int i, n = 0, sum = 0;
+ pool_t *p;
+ sharelist_t *sl;
+
+ for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
+ {
+ if ( (p = global_shared_pools[i]) == NULL )
+ continue;
+ n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%llx,U1:%llx",
+ i, is_persistent(p) ? 'P' : 'E', is_shared(p) ? 'S' : 'P',
+ (unsigned long long)p->uuid[0], (unsigned long long)p->uuid[1]);
+ list_for_each_entry(sl,&p->share_list, share_list)
+ n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
+ n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
+ "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
+ "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
+ _atomic_read(p->pgp_count), p->pgp_count_max,
+ p->obj_count, p->obj_count_max,
+ p->objnode_count, p->objnode_count_max,
+ p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
+ p->no_mem_puts,
+ p->found_gets, p->gets,
+ p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
+ if ( sum + n >= len )
+ return sum;
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ }
+ return sum;
+}
+
+#ifdef TMEM_PERF
+static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
+ bool_t use_long)
+{
+ char info[BSIZE];
+ int n = 0, sum = 0;
+
+ n = scnprintf(info+n,BSIZE-n,"T=");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
+#ifdef COMPARE_COPY_PAGE_SSE2
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
+#else
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
+#endif
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
+ n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
+ n--; /* overwrite trailing comma */
+ n += scnprintf(info+n,BSIZE-n,"\n");
+ if ( sum + n >= len )
+ return sum;
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ return sum;
+}
+#else
+#define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
+#endif
+
+static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
+ bool_t use_long)
+{
+ char info[BSIZE];
+ int n = 0, sum = off;
+
+ n += scnprintf(info,BSIZE,"G="
+ "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
+ "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
+ total_tmem_ops, errored_tmem_ops, failed_copies,
+ alloc_failed, alloc_page_failed, tmh_avail_pages(),
+ low_on_memory, evicted_pgs,
+ evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
+ total_flush_pool, use_long ? ',' : '\n');
+ if (use_long)
+ n += scnprintf(info+n,BSIZE-n,
+ "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
+ global_eph_count, global_eph_count_max,
+ _atomic_read(global_obj_count), global_obj_count_max,
+ _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
+ _atomic_read(global_pgp_count), global_pgp_count_max);
+ if ( sum + n >= len )
+ return sum;
+ tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
+ sum += n;
+ return sum;
+}
+
+static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len,
+ bool_t use_long)
+{
+ client_t *client;
+ int off = 0;
+
+ if ( cli_id == CLI_ID_NULL ) {
+ off = tmemc_list_global(buf,0,len,use_long);
+ off += tmemc_list_shared(buf,off,len-off,use_long);
+ list_for_each_entry(client,&global_client_list,client_list)
+ off += tmemc_list_client(client, buf, off, len-off, use_long);
+ off += tmemc_list_global_perf(buf,off,len-off,use_long);
+ }
+ else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
+ return -1;
+ else
+ off = tmemc_list_client(client, buf, 0, len, use_long);
+
+ return 0;
+}
+
+static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
+{
+ cli_id_t cli_id = client->cli_id;
+ uint32_t old_weight;
+
+ switch (subop)
+ {
+ case TMEMC_SET_WEIGHT:
+ old_weight = client->weight;
+ client->weight = arg1;
+ printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
+ atomic_sub(old_weight,&client_weight_total);
+ atomic_add(client->weight,&client_weight_total);
+ break;
+ case TMEMC_SET_CAP:
+ client->cap = arg1;
+ printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
+ break;
+ case TMEMC_SET_COMPRESS:
+ client->compress = arg1 ? 1 : 0;
+ printk("tmem: compression %s for %s=%d\n",
+ arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
+ break;
+ default:
+ printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
+ return -1;
+ }
+ return 0;
+}
+
+static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1)
+{
+ client_t *client;
+
+ if ( cli_id == CLI_ID_NULL )
+ list_for_each_entry(client,&global_client_list,client_list)
+ tmemc_set_var_one(client, subop, arg1);
+ else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
+ return -1;
+ else
+ tmemc_set_var_one(client, subop, arg1);
+ return 0;
+}
+
+static int do_tmem_control(uint32_t subop, uint32_t cli_id32,
+ uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf)
+{
+ int ret;
+ cli_id_t cli_id = (cli_id_t)cli_id32;
+
+ if (!tmh_current_is_privileged())
+ {
+ /* don't fail... mystery: sometimes dom0 fails here */
+ /* return -EPERM; */
+ }
+ switch(subop)
+ {
+ case TMEMC_THAW:
+ case TMEMC_FREEZE:
+ case TMEMC_DESTROY:
+ ret = tmemc_freeze_pools(cli_id,subop);
+ break;
+ case TMEMC_FLUSH:
+ ret = tmemc_flush_mem(cli_id,arg1);
+ break;
+ case TMEMC_LIST:
+ ret = tmemc_list(cli_id,buf,arg1,arg2);
+ break;
+ case TMEMC_SET_WEIGHT:
+ case TMEMC_SET_CAP:
+ case TMEMC_SET_COMPRESS:
+ ret = tmemc_set_var(cli_id,subop,arg1);
+ break;
+ default:
+ ret = -1;
+ }
+ return ret;
+}
+
+/************ EXPORTed FUNCTIONS **************************************/
+
+EXPORT long do_tmem_op(tmem_cli_op_t uops)
+{
+ struct tmem_op op;
+ client_t *client = tmh_client_from_current();
+ pool_t *pool = NULL;
+ int rc = 0;
+ bool_t succ_get = 0, succ_put = 0;
+ bool_t non_succ_get = 0, non_succ_put = 0;
+ bool_t flush = 0, flush_obj = 0;
+ bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
+ static bool_t warned = 0;
+ DECL_LOCAL_CYC_COUNTER(succ_get);
+ DECL_LOCAL_CYC_COUNTER(succ_put);
+ DECL_LOCAL_CYC_COUNTER(non_succ_get);
+ DECL_LOCAL_CYC_COUNTER(non_succ_put);
+ DECL_LOCAL_CYC_COUNTER(flush);
+ DECL_LOCAL_CYC_COUNTER(flush_obj);
+
+ if ( !tmem_initialized )
+ {
+ if ( !warned )
+ printk("tmem: must specify tmem parameter on xen boot line\n");
+ warned = 1;
+ return -ENODEV;
+ }
+
+ total_tmem_ops++;
+
+ if ( tmh_lock_all )
+ {
+ if ( tmh_lock_all > 1 )
+ spin_lock_irq(&tmem_spinlock);
+ else
+ spin_lock(&tmem_spinlock);
+ }
+
+ START_CYC_COUNTER(succ_get);
+ DUP_START_CYC_COUNTER(succ_put,succ_get);
+ DUP_START_CYC_COUNTER(non_succ_get,succ_get);
+ DUP_START_CYC_COUNTER(non_succ_put,succ_get);
+ DUP_START_CYC_COUNTER(flush,succ_get);
+ DUP_START_CYC_COUNTER(flush_obj,succ_get);
+
+ if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
+ {
+ printk("tmem: can't get tmem struct from %s\n",client_str);
+ rc = -EFAULT;
+ goto out;
+ }
+
+ if ( op.cmd == TMEM_CONTROL )
+ {
+ tmem_write_lock(&tmem_rwlock);
+ tmem_write_lock_set = 1;
+ rc = do_tmem_control(op.subop, op.cli_id, op.arg1, op.arg2, op.buf);
+ goto out;
+ }
+
+ /* create per-client tmem structure dynamically on first use by client */
+ if ( client == NULL )
+ {
+ tmem_write_lock(&tmem_rwlock);
+ tmem_write_lock_set = 1;
+ if ( (client = client_create()) == NULL )
+ {
+ printk("tmem: can't create tmem structure for %s\n",client_str);
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if ( op.cmd == TMEM_NEW_POOL )
+ {
+ if ( !tmem_write_lock_set )
+ {
+ tmem_write_lock(&tmem_rwlock);
+ tmem_write_lock_set = 1;
+ }
+ }
+ else
+ {
+ if ( !tmem_write_lock_set )
+ {
+ tmem_read_lock(&tmem_rwlock);
+ tmem_read_lock_set = 1;
+ }
+ if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
+ ((pool = client->pools[op.pool_id]) == NULL) )
+ {
+ rc = -ENODEV;
+ printk("tmem: operation requested on uncreated pool\n");
+ goto out;
+ }
+ ASSERT_SENTINEL(pool,POOL);
+ }
+
+ switch ( op.cmd )
+ {
+ case TMEM_NEW_POOL:
+ rc = do_tmem_new_pool(op.flags,op.uuid[0],op.uuid[1]);
+ break;
+ case TMEM_NEW_PAGE:
+ rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, 0);
+ break;
+ case TMEM_PUT_PAGE:
+ rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
+ if (rc == 1) succ_put = 1;
+ else non_succ_put = 1;
+ break;
+ case TMEM_GET_PAGE:
+ rc = do_tmem_get(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
+ if (rc == 1) succ_get = 1;
+ else non_succ_get = 1;
+ break;
+ case TMEM_FLUSH_PAGE:
+ flush = 1;
+ rc = do_tmem_flush_page(pool, op.object, op.index);
+ break;
+ case TMEM_FLUSH_OBJECT:
+ rc = do_tmem_flush_object(pool, op.object);
+ flush_obj = 1;
+ break;
+ case TMEM_DESTROY_POOL:
+ flush = 1;
+ rc = do_tmem_destroy_pool(op.pool_id);
+ break;
+ case TMEM_READ:
+ rc = do_tmem_get(pool, op.object, op.index, op.cmfn,
+ op.tmem_offset, op.pfn_offset, op.len);
+ break;
+ case TMEM_WRITE:
+ rc = do_tmem_put(pool, op.object, op.index, op.cmfn,
+ op.tmem_offset, op.pfn_offset, op.len);
+ break;
+ case TMEM_XCHG:
+ /* need to hold global lock to ensure xchg is atomic */
+ printk("tmem_xchg op not implemented yet\n");
+ rc = 0;
+ break;
+ default:
+ printk("tmem: op %d not implemented\n", op.cmd);
+ rc = 0;
+ break;
+ }
+
+out:
+ if ( rc < 0 )
+ errored_tmem_ops++;
+ if ( succ_get )
+ END_CYC_COUNTER(succ_get);
+ else if ( succ_put )
+ END_CYC_COUNTER(succ_put);
+ else if ( non_succ_get )
+ END_CYC_COUNTER(non_succ_get);
+ else if ( non_succ_put )
+ END_CYC_COUNTER(non_succ_put);
+ else if ( flush )
+ END_CYC_COUNTER(flush);
+ else
+ END_CYC_COUNTER(flush_obj);
+
+ if ( tmh_lock_all )
+ {
+ if ( tmh_lock_all > 1 )
+ spin_unlock_irq(&tmem_spinlock);
+ else
+ spin_unlock(&tmem_spinlock);
+ } else {
+ if ( tmem_write_lock_set )
+ write_unlock(&tmem_rwlock);
+ else if ( tmem_read_lock_set )
+ read_unlock(&tmem_rwlock);
+ else
+ ASSERT(0);
+ }
+
+ return rc;
+}
+
+/* this should be called when the host is destroying a client */
+EXPORT void tmem_destroy(void *v)
+{
+ client_t *client = (client_t *)v;
+
+ if ( tmh_lock_all )
+ spin_lock(&tmem_spinlock);
+ else
+ write_lock(&tmem_rwlock);
+
+ if ( client == NULL )
+ printk("tmem: can't destroy tmem pools for %s=%d\n",
+ cli_id_str,client->cli_id);
+ else
+ {
+ printk("tmem: flushing tmem pools for %s=%d\n",
+ cli_id_str,client->cli_id);
+ client_flush(client,1);
+ }
+
+ if ( tmh_lock_all )
+ spin_unlock(&tmem_spinlock);
+ else
+ write_unlock(&tmem_rwlock);
+}
+
+/* freezing all pools guarantees that no additional memory will be consumed */
+EXPORT void tmem_freeze_all(unsigned char key)
+{
+ static int freeze = 0;
+
+ if ( tmh_lock_all )
+ spin_lock(&tmem_spinlock);
+ else
+ write_lock(&tmem_rwlock);
+
+ freeze = !freeze;
+ tmemc_freeze_pools(CLI_ID_NULL,freeze);
+
+ if ( tmh_lock_all )
+ spin_unlock(&tmem_spinlock);
+ else
+ write_unlock(&tmem_rwlock);
+}
+
+#define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? */
+
+EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
+{
+ pfp_t *pfp;
+ unsigned long evicts_per_relinq = 0;
+ int max_evictions = MAX_EVICTS;
+
+ if (!tmh_enabled())
+ return NULL;
+#ifdef __i386__
+ return NULL;
+#endif
+
+ relinq_attempts++;
+ if ( order > 0 )
+ {
+ printk("tmem_relinquish_page: failing order=%d\n", order);
+ return NULL;
+ }
+
+ if ( tmh_called_from_tmem(memflags) )
+ {
+ if ( tmh_lock_all )
+ spin_lock(&tmem_spinlock);
+ else
+ read_lock(&tmem_rwlock);
+ }
+
+ while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
+ {
+ if ( (max_evictions-- <= 0) || !tmem_evict())
+ break;
+ evicts_per_relinq++;
+ }
+ if ( evicts_per_relinq > max_evicts_per_relinq )
+ max_evicts_per_relinq = evicts_per_relinq;
+ tmh_scrub_page(pfp, memflags);
+ if ( pfp != NULL )
+ relinq_pgs++;
+
+ if ( tmh_called_from_tmem(memflags) )
+ {
+ if ( tmh_lock_all )
+ spin_unlock(&tmem_spinlock);
+ else
+ read_unlock(&tmem_rwlock);
+ }
+
+ return pfp;
+}
+
+/* called at hypervisor startup */
+EXPORT void init_tmem(void)
+{
+ if ( !tmh_enabled() )
+ return;
+
+ radix_tree_init();
+ if ( tmh_init() )
+ {
+ printk("tmem: initialized comp=%d global-lock=%d\n",
+ tmh_compression_enabled(), tmh_lock_all);
+ tmem_initialized = 1;
+ }
+ else
+ printk("tmem: initialization FAILED\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/common/tmem_xen.c b/xen/common/tmem_xen.c
new file mode 100644
index 0000000000..6a0b14f456
--- /dev/null
+++ b/xen/common/tmem_xen.c
@@ -0,0 +1,334 @@
+/******************************************************************************
+ * tmem_xen.c
+ *
+ * Xen-specific Transcendent memory
+ *
+ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
+ */
+
+#include <xen/tmem.h>
+#include <xen/tmem_xen.h>
+#include <xen/lzo.h> /* compression code */
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+
+#define EXPORT /* indicates code other modules are dependent upon */
+
+EXPORT int opt_tmem = 0;
+boolean_param("tmem", opt_tmem);
+
+EXPORT int opt_tmem_compress = 0;
+boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_lock = 0;
+integer_param("tmem_lock", opt_tmem_lock);
+
+#ifdef COMPARE_COPY_PAGE_SSE2
+DECL_CYC_COUNTER(pg_copy1);
+DECL_CYC_COUNTER(pg_copy2);
+DECL_CYC_COUNTER(pg_copy3);
+DECL_CYC_COUNTER(pg_copy4);
+#else
+DECL_CYC_COUNTER(pg_copy);
+#endif
+
+/* these per-cpu compression buffers are allocated unconditionally at init;
+ * they could instead be allocated dynamically, and only iff opt_tmem_compress */
+#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
+#define LZO_DSTMEM_PAGES 2
+static DEFINE_PER_CPU(unsigned char *, workmem);
+static DEFINE_PER_CPU(unsigned char *, dstmem);
+
+#ifdef COMPARE_COPY_PAGE_SSE2
+#include <asm/flushtlb.h> /* REMOVE ME AFTER TEST */
+#include <asm/page.h> /* REMOVE ME AFTER TEST */
+#endif
+void tmh_copy_page(char *to, char *from)
+{
+#ifdef COMPARE_COPY_PAGE_SSE2
+ DECL_LOCAL_CYC_COUNTER(pg_copy1);
+ DECL_LOCAL_CYC_COUNTER(pg_copy2);
+ DECL_LOCAL_CYC_COUNTER(pg_copy3);
+ DECL_LOCAL_CYC_COUNTER(pg_copy4);
+ *to = *from; /* don't measure TLB misses */
+ flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
+ flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
+ START_CYC_COUNTER(pg_copy1);
+ copy_page_sse2(to, from); /* cold cache */
+ END_CYC_COUNTER(pg_copy1);
+ START_CYC_COUNTER(pg_copy2);
+ copy_page_sse2(to, from); /* hot cache */
+ END_CYC_COUNTER(pg_copy2);
+ flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
+ flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
+ START_CYC_COUNTER(pg_copy3);
+ memcpy(to, from, PAGE_SIZE); /* cold cache */
+ END_CYC_COUNTER(pg_copy3);
+ START_CYC_COUNTER(pg_copy4);
+ memcpy(to, from, PAGE_SIZE); /* hot cache */
+ END_CYC_COUNTER(pg_copy4);
+#else
+ DECL_LOCAL_CYC_COUNTER(pg_copy);
+ START_CYC_COUNTER(pg_copy);
+ memcpy(to, from, PAGE_SIZE);
+ END_CYC_COUNTER(pg_copy);
+#endif
+}
+
+#ifdef __ia64__
+static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
+{
+ ASSERT(0);
+ return NULL;
+}
+#define paging_mark_dirty(_x,_y) do {} while(0)
+#else
+static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
+{
+ unsigned long cli_mfn;
+ p2m_type_t t;
+
+ if (is_pv_32on64_vcpu(current))
+ cmfn.p = (void *)((unsigned long)cmfn.p & 0xffffffffUL);
+ cli_mfn = mfn_x(gfn_to_mfn(current->domain,(unsigned long)cmfn.p,&t));
+ if (t != p2m_ram_rw)
+ return NULL;
+ if (pcli_mfn != NULL)
+ *pcli_mfn = cli_mfn;
+ return map_domain_page(cli_mfn);
+}
+#endif
+
+EXPORT int tmh_copy_from_client(pfp_t *pfp,
+ tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
+ uint32_t pfn_offset, uint32_t len)
+{
+ unsigned long tmem_mfn;
+ void *tmem_va, *cli_va = NULL;
+
+ ASSERT(pfp != NULL);
+ if ( tmem_offset || pfn_offset || len )
+ if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
+ return -EFAULT;
+ tmem_mfn = page_to_mfn(pfp);
+ tmem_va = map_domain_page(tmem_mfn);
+ mb();
+ if (!len && !tmem_offset && !pfn_offset)
+ memset(tmem_va, 0, PAGE_SIZE);
+ else if (len == PAGE_SIZE && !tmem_offset && !pfn_offset)
+ tmh_copy_page(tmem_va, cli_va);
+ else if ( (tmem_offset+len <= PAGE_SIZE) &&
+ (pfn_offset+len <= PAGE_SIZE) )
+ memcpy((char *)tmem_va+tmem_offset,(char *)cli_va+pfn_offset,len);
+ unmap_domain_page(cli_va);
+ unmap_domain_page(tmem_va);
+ return 1;
+}
+
+EXPORT int tmh_compress_from_client(tmem_cli_mfn_t cmfn,
+ void **out_va, size_t *out_len)
+{
+ void *cli_va;
+ int ret = 0;
+ unsigned char *dmem = this_cpu(dstmem);
+ unsigned char *wmem = this_cpu(workmem);
+
+ if ( dmem == NULL || wmem == NULL )
+ return 0; /* no buffer, so can't compress */
+ if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
+ return -EFAULT;
+ mb();
+ ret = lzo1x_1_compress(cli_va, PAGE_SIZE, dmem, out_len, wmem);
+ ASSERT(ret == LZO_E_OK);
+ *out_va = dmem;
+ unmap_domain_page(cli_va);
+ return 1;
+}
+
+EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
+ uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
+{
+ unsigned long tmem_mfn, cli_mfn;
+ void *tmem_va, *cli_va;
+
+ ASSERT(pfp != NULL);
+ if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+ return -EFAULT;
+ tmem_mfn = page_to_mfn(pfp);
+ tmem_va = map_domain_page(tmem_mfn);
+ if (len == PAGE_SIZE && !tmem_offset && !pfn_offset)
+ tmh_copy_page(cli_va, tmem_va);
+ else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) )
+ memcpy((char *)cli_va+pfn_offset,(char *)tmem_va+tmem_offset,len);
+ unmap_domain_page(tmem_va);
+ unmap_domain_page(cli_va);
+ paging_mark_dirty(current->domain,cli_mfn);
+ mb();
+ return 1;
+}
+
+EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, size_t size)
+{
+ unsigned long cli_mfn;
+ void *cli_va;
+ size_t out_len = PAGE_SIZE;
+ int ret;
+
+ if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+ return -EFAULT;
+ ret = lzo1x_decompress_safe(tmem_va, size, cli_va, &out_len);
+ ASSERT(ret == LZO_E_OK);
+ ASSERT(out_len == PAGE_SIZE);
+ unmap_domain_page(cli_va);
+ paging_mark_dirty(current->domain,cli_mfn);
+ mb();
+ return 1;
+}
+
+/****************** XEN-SPECIFIC MEMORY ALLOCATION ********************/
+
+EXPORT struct xmem_pool *tmh_mempool = 0;
+EXPORT unsigned int tmh_mempool_maxalloc = 0;
+
+EXPORT DEFINE_SPINLOCK(tmh_page_list_lock);
+EXPORT PAGE_LIST_HEAD(tmh_page_list);
+EXPORT unsigned long tmh_page_list_pages = 0;
+
+/* free anything on tmh_page_list to Xen's scrub list */
+EXPORT void tmh_release_avail_pages_to_host(void)
+{
+ spin_lock(&tmh_page_list_lock);
+ if ( !page_list_empty(&tmh_page_list) )
+ {
+ scrub_list_splice(&tmh_page_list);
+ INIT_PAGE_LIST_HEAD(&tmh_page_list);
+ }
+ spin_unlock(&tmh_page_list_lock);
+}
+
+EXPORT void tmh_scrub_page(struct page_info *pi, unsigned int memflags)
+{
+ if ( pi == NULL )
+ return;
+ if ( !(memflags & MEMF_tmem) )
+ scrub_one_page(pi);
+}
+
+#ifndef __i386__
+static noinline void *tmh_mempool_page_get(unsigned long size)
+{
+ struct page_info *pi;
+
+ ASSERT(size == PAGE_SIZE);
+ if ( (pi = tmh_alloc_page(NULL,0)) == NULL )
+ return NULL;
+ ASSERT(IS_VALID_PAGE(pi));
+ return page_to_virt(pi);
+}
+
+static void tmh_mempool_page_put(void *page_va)
+{
+ ASSERT(IS_PAGE_ALIGNED(page_va));
+ tmh_free_page(virt_to_page(page_va));
+}
+
+static int tmh_mempool_init(void)
+{
+ tmh_mempool = xmem_pool_create("tmem", tmh_mempool_page_get,
+ tmh_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
+ if ( tmh_mempool )
+ tmh_mempool_maxalloc = xmem_pool_maxalloc(tmh_mempool);
+ return tmh_mempool != NULL;
+}
+
+/* persistent pools are per-domain */
+
+static void *tmh_persistent_pool_page_get(unsigned long size)
+{
+ struct page_info *pi;
+ struct domain *d = current->domain;
+
+ ASSERT(size == PAGE_SIZE);
+ if ( (pi = _tmh_alloc_page_thispool(d)) == NULL )
+ return NULL;
+ ASSERT(IS_VALID_PAGE(pi));
+ return map_domain_page(page_to_mfn(pi));
+}
+
+static void tmh_persistent_pool_page_put(void *page_va)
+{
+ struct page_info *pi;
+
+ ASSERT(IS_PAGE_ALIGNED(page_va));
+ pi = virt_to_page(page_va);
+ ASSERT(IS_VALID_PAGE(pi));
+ _tmh_free_page_thispool(pi);
+}
+#endif
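
For illustration (not part of the patch): both pool flavours above only differ in where their backing pages come from, the global mempool drawing from the tmem page list or the heap while the persistent pool charges pages to the owning domain. A minimal Xen-internal sketch of the same create/alloc/free pattern, backed by plain xenheap pages; the callback and pool names are illustrative:

#include <xen/lib.h>
#include <xen/mm.h>
#include <xen/xmalloc.h>

/* page-granularity callbacks handed to the pool, as in tmh_mempool_init() */
static void *demo_page_get(unsigned long size)
{
    ASSERT(size == PAGE_SIZE);
    return alloc_xenheap_page();
}

static void demo_page_put(void *va)
{
    free_xenheap_page(va);
}

static void demo_pool_use(void)
{
    struct xmem_pool *pool;
    void *obj;

    pool = xmem_pool_create("demo", demo_page_get, demo_page_put,
                            PAGE_SIZE, 0, PAGE_SIZE);
    if ( pool == NULL )
        return;
    /* requests above xmem_pool_maxalloc(pool) are expected to fail */
    obj = xmem_pool_alloc(64, pool);
    if ( obj != NULL )
        xmem_pool_free(obj, pool);
    xmem_pool_destroy(pool);
}
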
+
+/****************** XEN-SPECIFIC CLIENT HANDLING ********************/
+
+EXPORT tmh_client_t *tmh_client_init(void)
+{
+ tmh_client_t *tmh;
+ char name[5];
+ domid_t domid = current->domain->domain_id;
+ int i, shift;
+
+ if ( (tmh = xmalloc(tmh_client_t)) == NULL )
+ return NULL;
+ for (i = 0, shift = 12; i < 4; shift -= 4, i++)
+ name[i] = "0123456789abcdef"[(domid >> shift) & 0xf];
+ name[4] = '\0';
+#ifndef __i386__
+ tmh->persistent_pool = xmem_pool_create(name, tmh_persistent_pool_page_get,
+ tmh_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
+ if ( tmh->persistent_pool == NULL )
+ {
+ xfree(tmh);
+ return NULL;
+ }
+#endif
+ tmh->domain = current->domain;
+ return tmh;
+}
+
+EXPORT void tmh_client_destroy(tmh_client_t *tmh)
+{
+#ifndef __i386__
+ xmem_pool_destroy(tmh->persistent_pool);
+#endif
+ xfree(tmh);
+}
+
+/****************** XEN-SPECIFIC HOST INITIALIZATION ********************/
+
+EXPORT int tmh_init(void)
+{
+#ifndef __i386__
+ int dstmem_order, workmem_order;
+ int bad_alloc = 0;
+ struct page_info *pi;
+ unsigned char *p1, *p2;
+ int cpu;
+
+ if ( !tmh_mempool_init() )
+ return 0;
+
+ dstmem_order = get_order_from_pages(LZO_DSTMEM_PAGES);
+ workmem_order = get_order_from_bytes(LZO1X_1_MEM_COMPRESS);
+ for_each_cpu ( cpu )
+ {
+ pi = alloc_domheap_pages(0,dstmem_order,0);
+ per_cpu(dstmem, cpu) = p1 = ((pi == NULL) ? NULL : page_to_virt(pi));
+ pi = alloc_domheap_pages(0,workmem_order,0);
+ per_cpu(workmem, cpu) = p2 = ((pi == NULL) ? NULL : page_to_virt(pi));
+ if ( (p1 == NULL) || (p2 == NULL) )
+ bad_alloc++;
+ }
+ if ( bad_alloc )
+ printk("tmem: can't allocate compression buffers for %d cpus\n",
+ bad_alloc);
+#endif
+ return 1;
+}
diff --git a/xen/common/xmalloc_tlsf.c b/xen/common/xmalloc_tlsf.c
index 7a476e8fb7..3f85389e23 100644
--- a/xen/common/xmalloc_tlsf.c
+++ b/xen/common/xmalloc_tlsf.c
@@ -292,7 +292,6 @@ struct xmem_pool *xmem_pool_create(
unsigned long grow_size)
{
struct xmem_pool *pool;
- void *region;
int pool_bytes, pool_order;
BUG_ON(max_size && (max_size < init_size));
@@ -319,11 +318,9 @@ struct xmem_pool *xmem_pool_create(
pool->get_mem = get_mem;
pool->put_mem = put_mem;
strlcpy(pool->name, name, sizeof(pool->name));
- region = get_mem(init_size);
- if ( region == NULL )
- goto out_region;
- ADD_REGION(region, init_size, pool);
- pool->init_region = region;
+
+ /* always obtain init_region lazily now to ensure it is get_mem'd
+ * in the same "context" as all other regions */
spin_lock_init(&pool->lock);
@@ -332,10 +329,6 @@ struct xmem_pool *xmem_pool_create(
spin_unlock(&pool_list_lock);
return pool;
-
- out_region:
- free_xenheap_pages(pool, pool_order);
- return NULL;
}
unsigned long xmem_pool_get_used_size(struct xmem_pool *pool)
@@ -354,13 +347,15 @@ unsigned long xmem_pool_get_total_size(struct xmem_pool *pool)
void xmem_pool_destroy(struct xmem_pool *pool)
{
+ int pool_bytes, pool_order;
+
if ( pool == NULL )
return;
/* User is destroying without ever allocating from this pool */
if ( xmem_pool_get_used_size(pool) == BHDR_OVERHEAD )
{
- pool->put_mem(pool->init_region);
+ ASSERT(!pool->init_region);
pool->used_size -= BHDR_OVERHEAD;
}
@@ -373,7 +368,10 @@ void xmem_pool_destroy(struct xmem_pool *pool)
spin_lock(&pool_list_lock);
list_del_init(&pool->list);
spin_unlock(&pool_list_lock);
- pool->put_mem(pool);
+
+ pool_bytes = ROUNDUP_SIZE(sizeof(*pool));
+ pool_order = get_order_from_bytes(pool_bytes);
+ free_xenheap_pages(pool,pool_order);
}
void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool)
@@ -382,6 +380,14 @@ void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool)
int fl, sl;
unsigned long tmp_size;
+ if ( pool->init_region == NULL )
+ {
+ if ( (region = pool->get_mem(pool->init_size)) == NULL )
+ goto out;
+ ADD_REGION(region, pool->init_size, pool);
+ pool->init_region = region;
+ }
+
size = (size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : ROUNDUP_SIZE(size);
/* Rounding up the requested size and calculating fl and sl */
@@ -496,6 +502,11 @@ void xmem_pool_free(void *ptr, struct xmem_pool *pool)
spin_unlock(&pool->lock);
}
+int xmem_pool_maxalloc(struct xmem_pool *pool)
+{
+ return pool->grow_size - (2 * BHDR_OVERHEAD);
+}
+
/*
* Glue for xmalloc().
*/
diff --git a/xen/include/Makefile b/xen/include/Makefile
index 8427371596..15acff963b 100644
--- a/xen/include/Makefile
+++ b/xen/include/Makefile
@@ -14,6 +14,7 @@ headers-y := \
compat/physdev.h \
compat/platform.h \
compat/sched.h \
+ compat/tmem.h \
compat/trace.h \
compat/vcpu.h \
compat/version.h \
diff --git a/xen/include/asm-ia64/mm.h b/xen/include/asm-ia64/mm.h
index bb3dc8ae5b..e8803ba9b1 100644
--- a/xen/include/asm-ia64/mm.h
+++ b/xen/include/asm-ia64/mm.h
@@ -590,6 +590,8 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg);
int steal_page(
struct domain *d, struct page_info *page, unsigned int memflags);
+int donate_page(
+ struct domain *d, struct page_info *page, unsigned int memflags);
#define domain_clamp_alloc_bitsize(d, b) (b)
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 6772b40505..4900aa9b13 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -487,6 +487,8 @@ int compat_subarch_memory_op(int op, XEN_GUEST_HANDLE(void));
int steal_page(
struct domain *d, struct page_info *page, unsigned int memflags);
+int donate_page(
+ struct domain *d, struct page_info *page, unsigned int memflags);
int map_ldt_shadow_page(unsigned int);
diff --git a/xen/include/asm-x86/spinlock.h b/xen/include/asm-x86/spinlock.h
index f1a5feb03c..35ed9759e7 100644
--- a/xen/include/asm-x86/spinlock.h
+++ b/xen/include/asm-x86/spinlock.h
@@ -32,10 +32,10 @@ static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
}
typedef struct {
- volatile unsigned int lock;
+ volatile int lock;
} raw_rwlock_t;
-#define RW_LOCK_BIAS 0x01000000
+#define RW_LOCK_BIAS 0x01000000
#define _RAW_RW_LOCK_UNLOCKED /*(raw_rwlock_t)*/ { RW_LOCK_BIAS }
static always_inline void _raw_read_lock(raw_rwlock_t *rw)
@@ -66,6 +66,22 @@ static always_inline void _raw_write_lock(raw_rwlock_t *rw)
: "=m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory" );
}
+static always_inline int _raw_write_trylock(raw_rwlock_t *rw)
+{
+ int rc;
+
+ asm volatile (
+ " lock; subl %2,%0 \n"
+ " jz 1f \n"
+ " lock; addl %2,%0 \n"
+ " dec %1 \n"
+ "1:"
+ : "=m" (rw->lock), "=r" (rc) : "i" (RW_LOCK_BIAS), "1" (1)
+ : "memory" );
+
+ return rc;
+}
+
static always_inline void _raw_read_unlock(raw_rwlock_t *rw)
{
asm volatile (
@@ -81,5 +97,6 @@ static always_inline void _raw_write_unlock(raw_rwlock_t *rw)
}
#define _raw_rw_is_locked(x) ((x)->lock < RW_LOCK_BIAS)
+#define _raw_rw_is_write_locked(x) ((x)->lock <= 0)
#endif /* __ASM_SPINLOCK_H */
diff --git a/xen/include/public/tmem.h b/xen/include/public/tmem.h
new file mode 100644
index 0000000000..b8d608f591
--- /dev/null
+++ b/xen/include/public/tmem.h
@@ -0,0 +1,112 @@
+/******************************************************************************
+ * tmem.h
+ *
+ * Guest OS interface to Xen Transcendent Memory.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_TMEM_H__
+#define __XEN_PUBLIC_TMEM_H__
+
+#include "xen.h"
+
+/* Commands to HYPERVISOR_tmem_op() */
+#define TMEM_CONTROL 0
+#define TMEM_NEW_POOL 1
+#define TMEM_DESTROY_POOL 2
+#define TMEM_NEW_PAGE 3
+#define TMEM_PUT_PAGE 4
+#define TMEM_GET_PAGE 5
+#define TMEM_FLUSH_PAGE 6
+#define TMEM_FLUSH_OBJECT 7
+#define TMEM_READ 8
+#define TMEM_WRITE 9
+#define TMEM_XCHG 10
+
+/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
+#define TMEMC_THAW 0
+#define TMEMC_FREEZE 1
+#define TMEMC_FLUSH 2
+#define TMEMC_DESTROY 3
+#define TMEMC_LIST 4
+#define TMEMC_SET_WEIGHT 5
+#define TMEMC_SET_CAP 6
+#define TMEMC_SET_COMPRESS 7
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST 1
+#define TMEM_POOL_SHARED 2
+#define TMEM_POOL_PAGESIZE_SHIFT 4
+#define TMEM_POOL_PAGESIZE_MASK 0xf
+#define TMEM_POOL_VERSION_SHIFT 24
+#define TMEM_POOL_VERSION_MASK 0xff
+
+/* Special errno values */
+#define EFROZEN 1000
+#define EEMPTY 1001
+
+
+#ifndef __ASSEMBLY__
+typedef XEN_GUEST_HANDLE(void) tmem_cli_mfn_t;
+typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t;
+struct tmem_op {
+ uint32_t cmd;
+ int32_t pool_id; /* private > 0; shared < 0; 0 is invalid */
+ union {
+ struct { /* for cmd == TMEM_NEW_POOL */
+ uint64_t uuid[2];
+ uint32_t flags;
+ };
+ struct { /* for cmd == TMEM_CONTROL */
+ uint32_t subop;
+ uint32_t cli_id;
+ uint32_t arg1;
+ uint32_t arg2;
+ tmem_cli_va_t buf;
+ };
+ struct {
+ uint64_t object;
+ uint32_t index;
+ uint32_t tmem_offset;
+ uint32_t pfn_offset;
+ uint32_t len;
+ tmem_cli_mfn_t cmfn; /* client machine page frame */
+ };
+ };
+};
+typedef struct tmem_op tmem_op_t;
+DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
+typedef XEN_GUEST_HANDLE_64(tmem_op_t) tmem_cli_op_t;
+
+#endif
+
+#endif /* __XEN_PUBLIC_TMEM_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
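
For illustration (not part of the patch): a guest-side caller fills in struct tmem_op before issuing the hypercall. The sketch below builds a persistent private pool request and an all-clients TMEMC_LIST control request; the encodings chosen for the pagesize and version fields are assumptions, and the actual hypercall invocation is omitted. For TMEM_NEW_POOL the new pool id is returned by the hypercall itself.

#include <string.h>
#include "tmem.h"                       /* the public header above */

/* sketch only: request a private, persistent pool (UUID unused here;
 * the pagesize/version encodings below are assumptions, not from the header) */
static void build_new_pool_op(struct tmem_op *op)
{
    memset(op, 0, sizeof(*op));
    op->cmd = TMEM_NEW_POOL;
    op->flags = TMEM_POOL_PERSIST
                | (0 << TMEM_POOL_PAGESIZE_SHIFT)  /* assumed: 0 == PAGE_SIZE */
                | (0 << TMEM_POOL_VERSION_SHIFT);  /* assumed: version 0 */
}

/* sketch only: ask for up to 'len' bytes of stats in 'buf';
 * a non-zero arg2 selects the longer per-pool output */
static void build_list_op(struct tmem_op *op, tmem_cli_va_t buf, uint32_t len)
{
    memset(op, 0, sizeof(*op));
    op->cmd = TMEM_CONTROL;
    op->pool_id = -1;                   /* not a per-pool operation */
    op->subop = TMEMC_LIST;
    op->cli_id = (uint32_t)-1;          /* assumed to mean "all clients" */
    op->arg1 = len;
    op->arg2 = 1;
    op->buf = buf;
}
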
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index 524118b6d7..72aa667430 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -91,6 +91,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
#define __HYPERVISOR_sysctl 35
#define __HYPERVISOR_domctl 36
#define __HYPERVISOR_kexec_op 37
+#define __HYPERVISOR_tmem_op 38
/* Architecture-specific hypercall definitions. */
#define __HYPERVISOR_arch_0 48
diff --git a/xen/include/xen/config.h b/xen/include/xen/config.h
index 9b26d882bb..7872f13e8d 100644
--- a/xen/include/xen/config.h
+++ b/xen/include/xen/config.h
@@ -11,7 +11,6 @@
#define EXPORT_SYMBOL(var)
#define EXPORT_SYMBOL_GPL(var)
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x))
/*
* The following log levels are as follows:
diff --git a/xen/include/xen/hash.h b/xen/include/xen/hash.h
new file mode 100644
index 0000000000..0658c8b619
--- /dev/null
+++ b/xen/include/xen/hash.h
@@ -0,0 +1,58 @@
+#ifndef _XEN_HASH_H
+#define _XEN_HASH_H
+/* Fast hashing routine for a long.
+ (C) 2002 William Lee Irwin III, IBM */
+
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+#if BITS_PER_LONG == 32
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e370001UL
+#elif BITS_PER_LONG == 64
+/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
+#else
+#error Define GOLDEN_RATIO_PRIME for your wordsize.
+#endif
+
+static inline unsigned long hash_long(unsigned long val, unsigned int bits)
+{
+ unsigned long hash = val;
+
+#if BITS_PER_LONG == 64
+ /* Sigh, gcc can't optimise this alone like it does for 32 bits. */
+ unsigned long n = hash;
+ n <<= 18;
+ hash -= n;
+ n <<= 33;
+ hash -= n;
+ n <<= 3;
+ hash += n;
+ n <<= 3;
+ hash -= n;
+ n <<= 4;
+ hash += n;
+ n <<= 2;
+ hash += n;
+#else
+ /* On some cpus multiply is faster, on others gcc will do shifts */
+ hash *= GOLDEN_RATIO_PRIME;
+#endif
+
+ /* High bits are more random, so use them. */
+ return hash >> (BITS_PER_LONG - bits);
+}
+
+static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
+{
+ return hash_long((unsigned long)ptr, bits);
+}
+#endif /* _XEN_HASH_H */
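
For illustration (not part of the patch): the intended use is to pick a bucket index from an identifier, keeping only the top bits of the multiplied hash. The bucket count here is an arbitrary assumption:

#include <xen/hash.h>

#define DEMO_HASH_BITS    8                      /* 256 buckets, illustrative */
#define DEMO_HASH_BUCKETS (1UL << DEMO_HASH_BITS)

/* map an object id onto a bucket index in [0, DEMO_HASH_BUCKETS) */
static inline unsigned int demo_bucket(unsigned long oid)
{
    return hash_long(oid, DEMO_HASH_BITS);
}
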
diff --git a/xen/include/xen/hypercall.h b/xen/include/xen/hypercall.h
index 43758b9de8..f2e0150fa1 100644
--- a/xen/include/xen/hypercall.h
+++ b/xen/include/xen/hypercall.h
@@ -13,6 +13,7 @@
#include <public/sysctl.h>
#include <public/platform.h>
#include <public/event_channel.h>
+#include <public/tmem.h>
#include <asm/hypercall.h>
#include <xsm/xsm.h>
@@ -116,6 +117,10 @@ extern long
do_xsm_op(
XEN_GUEST_HANDLE(xsm_op_t) u_xsm_op);
+extern long
+do_tmem_op(
+ XEN_GUEST_HANDLE(tmem_op_t) uops);
+
#ifdef CONFIG_COMPAT
extern int
diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
index ab76120da6..6b5033a36e 100644
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -45,6 +45,8 @@ do { \
#define DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x))
+
#define reserve_bootmem(_p,_l) ((void)0)
struct domain;
diff --git a/xen/include/xen/lzo.h b/xen/include/xen/lzo.h
new file mode 100644
index 0000000000..cbf135f984
--- /dev/null
+++ b/xen/include/xen/lzo.h
@@ -0,0 +1,44 @@
+#ifndef __LZO_H__
+#define __LZO_H__
+/*
+ * LZO Public Kernel Interface
+ * A mini subset of the LZO real-time data compression library
+ *
+ * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
+ *
+ * The full LZO package can be found at:
+ * http://www.oberhumer.com/opensource/lzo/
+ *
+ * Changed for kernel use by:
+ * Nitin Gupta <nitingupta910@gmail.com>
+ * Richard Purdie <rpurdie@openedhand.com>
+ */
+
+#define LZO1X_MEM_COMPRESS (16384 * sizeof(unsigned char *))
+#define LZO1X_1_MEM_COMPRESS LZO1X_MEM_COMPRESS
+
+#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)
+
+/* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */
+int lzo1x_1_compress(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+/* safe decompression with overrun testing */
+int lzo1x_decompress_safe(const unsigned char *src, size_t src_len,
+ unsigned char *dst, size_t *dst_len);
+
+/*
+ * Return values (< 0 = Error)
+ */
+#define LZO_E_OK 0
+#define LZO_E_ERROR (-1)
+#define LZO_E_OUT_OF_MEMORY (-2)
+#define LZO_E_NOT_COMPRESSIBLE (-3)
+#define LZO_E_INPUT_OVERRUN (-4)
+#define LZO_E_OUTPUT_OVERRUN (-5)
+#define LZO_E_LOOKBEHIND_OVERRUN (-6)
+#define LZO_E_EOF_NOT_FOUND (-7)
+#define LZO_E_INPUT_NOT_CONSUMED (-8)
+#define LZO_E_NOT_YET_IMPLEMENTED (-9)
+
+#endif
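
For illustration (not part of the patch): a compress/decompress round trip through the interface above, as tmem uses it on page-sized buffers. This is a Xen-internal sketch with minimal error handling; the buffers are static only to keep them off the hypervisor stack, and the includes are indicative:

#include <xen/lzo.h>
#include <xen/mm.h>        /* PAGE_SIZE */
#include <xen/string.h>    /* memcmp */

/* round-trip one page-sized buffer through LZO (illustrative sketch) */
static int lzo_roundtrip(const unsigned char *src /* PAGE_SIZE bytes */)
{
    static unsigned char wrkmem[LZO1X_1_MEM_COMPRESS];
    static unsigned char dst[lzo1x_worst_compress(PAGE_SIZE)];
    static unsigned char out[PAGE_SIZE];
    size_t dst_len = sizeof(dst), out_len = sizeof(out);

    if ( lzo1x_1_compress(src, PAGE_SIZE, dst, &dst_len, wrkmem) != LZO_E_OK )
        return -1;
    if ( lzo1x_decompress_safe(dst, dst_len, out, &out_len) != LZO_E_OK )
        return -1;
    return (out_len == PAGE_SIZE && !memcmp(src, out, PAGE_SIZE)) ? 0 : -1;
}
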
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 82340f3ae4..50c47b00e2 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -77,6 +77,8 @@ int assign_pages(
#define MEMF_no_refcount (1U<<_MEMF_no_refcount)
#define _MEMF_populate_on_demand 1
#define MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
+#define _MEMF_tmem 2
+#define MEMF_tmem (1U<<_MEMF_tmem)
#define _MEMF_node 8
#define MEMF_node(n) ((((n)+1)&0xff)<<_MEMF_node)
#define _MEMF_bits 24
@@ -222,6 +224,32 @@ page_list_remove_head(struct page_list_head *head)
return page;
}
+static inline void
+page_list_splice(struct page_list_head *list, struct page_list_head *head)
+{
+ struct page_info *first, *last, *at;
+
+ if ( page_list_empty(list) )
+ return;
+
+ if ( page_list_empty(head) )
+ {
+ head->next = list->next;
+ head->tail = list->tail;
+ return;
+ }
+
+ first = list->next;
+ last = list->tail;
+ at = head->next;
+
+ first->list.prev = page_to_mfn(head->next);
+ head->next = first;
+
+ last->list.next = page_to_mfn(at);
+ at->list.prev = page_to_mfn(last);
+}
+
#define page_list_for_each(pos, head) \
for ( pos = (head)->next; pos; pos = page_list_next(pos, head) )
#define page_list_for_each_safe(pos, tmp, head) \
@@ -258,6 +286,7 @@ page_list_remove_head(struct page_list_head *head)
list_for_each_entry_safe(pos, tmp, head, list)
# define page_list_for_each_safe_reverse(pos, tmp, head) \
list_for_each_entry_safe_reverse(pos, tmp, head, list)
+# define page_list_splice(list, hd) list_splice(list, hd)
#endif
/* Automatic page scrubbing for dead domains. */
@@ -272,6 +301,9 @@ extern struct page_list_head page_scrub_list;
if ( !page_list_empty(&page_scrub_list) ) \
cpumask_raise_softirq(cpu_online_map, PAGE_SCRUB_SOFTIRQ); \
} while ( 0 )
+void scrub_list_splice(struct page_list_head *);
+void scrub_list_add(struct page_info *);
+void scrub_one_page(struct page_info *);
unsigned long avail_scrub_pages(void);
int guest_remove_page(struct domain *d, unsigned long gmfn);
diff --git a/xen/include/xen/radix-tree.h b/xen/include/xen/radix-tree.h
new file mode 100644
index 0000000000..d4bb4e8992
--- /dev/null
+++ b/xen/include/xen/radix-tree.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Adapted for Xen by Dan Magenheimer, Oracle Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _XEN_RADIX_TREE_H
+#define _XEN_RADIX_TREE_H
+
+/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
+struct radix_tree_root {
+ unsigned int height;
+ struct radix_tree_node *rnode;
+};
+
+#define RADIX_TREE_MAP_SHIFT 6
+
+#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS \
+ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+struct radix_tree_node {
+ unsigned int count;
+ void *slots[RADIX_TREE_MAP_SIZE];
+};
+
+struct radix_tree_path {
+ struct radix_tree_node *node;
+ int offset;
+};
+
+#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
+
+
+#define RADIX_TREE_INIT(mask) { \
+ .height = 0, \
+ .rnode = NULL, \
+}
+
+#define RADIX_TREE(name, mask) \
+ struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+#define INIT_RADIX_TREE(root, mask) \
+do { \
+ (root)->height = 0; \
+ (root)->rnode = NULL; \
+} while (0)
+
+int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
+ void *item, struct radix_tree_node *(*node_alloc)(void *), void *arg);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void radix_tree_destroy(struct radix_tree_root *root,
+ void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *));
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index,
+ void(*node_free)(struct radix_tree_node *));
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items);
+void radix_tree_init(void);
+
+#endif /* _XEN_RADIX_TREE_H */
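
For illustration (not part of the patch): a Xen-internal sketch of the calling convention, with caller-supplied node allocation. The allocator callbacks are illustrative, and a zero-on-success return from radix_tree_insert() is assumed:

#include <xen/radix-tree.h>
#include <xen/xmalloc.h>
#include <xen/string.h>
#include <xen/lib.h>

static struct radix_tree_node *demo_node_alloc(void *arg)
{
    struct radix_tree_node *n = xmalloc(struct radix_tree_node);

    if ( n != NULL )
        memset(n, 0, sizeof(*n));      /* interior nodes start out empty */
    return n;
}

static void demo_node_free(struct radix_tree_node *n)
{
    xfree(n);
}

static int demo_radix_use(void)
{
    struct radix_tree_root root;
    static int value = 42;

    INIT_RADIX_TREE(&root, 0);
    if ( radix_tree_insert(&root, 123UL, &value, demo_node_alloc, NULL) != 0 )
        return -1;                     /* assumed: non-zero means failure */
    ASSERT(radix_tree_lookup(&root, 123UL) == &value);
    radix_tree_delete(&root, 123UL, demo_node_free);
    return 0;
}
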
diff --git a/xen/include/xen/rbtree.h b/xen/include/xen/rbtree.h
new file mode 100644
index 0000000000..b16dc5036f
--- /dev/null
+++ b/xen/include/xen/rbtree.h
@@ -0,0 +1,82 @@
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli <andrea@suse.de>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RBTREE_H__
+#define __RBTREE_H__
+
+struct rb_node
+{
+ unsigned long rb_parent_color;
+#define RB_RED 0
+#define RB_BLACK 1
+ struct rb_node *rb_right;
+ struct rb_node *rb_left;
+};
+
+struct rb_root
+{
+ struct rb_node *rb_node;
+};
+
+#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3))
+#define rb_color(r) ((r)->rb_parent_color & 1)
+#define rb_is_red(r) (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0)
+#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+ rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
+}
+static inline void rb_set_color(struct rb_node *rb, int color)
+{
+ rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
+}
+
+#define RB_ROOT (struct rb_root) { NULL, }
+#define rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
+#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(struct rb_node *);
+extern struct rb_node *rb_prev(struct rb_node *);
+extern struct rb_node *rb_first(struct rb_root *);
+extern struct rb_node *rb_last(struct rb_root *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
+ struct rb_node ** rb_link)
+{
+ node->rb_parent_color = (unsigned long )parent;
+ node->rb_left = node->rb_right = NULL;
+
+ *rb_link = node;
+}
+
+#endif /* __RBTREE_H__ */
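
For illustration (not part of the patch): the header provides only the primitives, so insertion follows the usual walk-to-a-leaf, link, then rebalance idiom. A sketch for nodes keyed by an unsigned long; duplicate keys are not handled:

#include <xen/rbtree.h>
#include <xen/kernel.h>    /* container_of(), used by rb_entry() */

struct demo_rb_node {
    struct rb_node rb;
    unsigned long key;
};

static void demo_rb_insert(struct rb_root *root, struct demo_rb_node *node)
{
    struct rb_node **link = &root->rb_node, *parent = NULL;

    while ( *link != NULL )
    {
        struct demo_rb_node *here =
            rb_entry(*link, struct demo_rb_node, rb);

        parent = *link;
        link = (node->key < here->key) ? &(*link)->rb_left
                                       : &(*link)->rb_right;
    }
    rb_link_node(&node->rb, parent, link);   /* attach as a leaf... */
    rb_insert_color(&node->rb, root);        /* ...then recolor/rebalance */
}
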
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 46731a5e98..7ab04a3343 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -269,6 +269,9 @@ struct domain
/* VRAM dirty support. */
struct sh_dirty_vram *dirty_vram;
+
+ /* transcendent memory, auto-allocated on first tmem op by each domain */
+ void *tmem;
};
struct domain_setup_info
diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h
index 7a5a5ab5d7..a952f0700d 100644
--- a/xen/include/xen/spinlock.h
+++ b/xen/include/xen/spinlock.h
@@ -67,12 +67,14 @@ void _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags);
void _write_lock(rwlock_t *lock);
void _write_lock_irq(rwlock_t *lock);
unsigned long _write_lock_irqsave(rwlock_t *lock);
+int _write_trylock(rwlock_t *lock);
void _write_unlock(rwlock_t *lock);
void _write_unlock_irq(rwlock_t *lock);
void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags);
int _rw_is_locked(rwlock_t *lock);
+int _rw_is_write_locked(rwlock_t *lock);
#define spin_lock(l) _spin_lock(l)
#define spin_lock_irq(l) _spin_lock_irq(l)
@@ -110,11 +112,13 @@ int _rw_is_locked(rwlock_t *lock);
#define write_lock(l) _write_lock(l)
#define write_lock_irq(l) _write_lock_irq(l)
#define write_lock_irqsave(l, f) ((f) = _write_lock_irqsave(l))
+#define write_trylock(l) _write_trylock(l)
#define write_unlock(l) _write_unlock(l)
#define write_unlock_irq(l) _write_unlock_irq(l)
#define write_unlock_irqrestore(l, f) _write_unlock_irqrestore(l, f)
#define rw_is_locked(l) _rw_is_locked(l)
+#define rw_is_write_locked(l) _rw_is_write_locked(l)
#endif /* __SPINLOCK_H__ */
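
For illustration (not part of the patch): the new write_trylock()/rw_is_write_locked() wrappers support an opportunistic-writer pattern along these lines:

#include <xen/lib.h>
#include <xen/spinlock.h>

/* try to take the writer side; fall back to a read-only pass if another
 * writer already holds the lock (sketch only) */
static void demo_rw_update(rwlock_t *lock)
{
    if ( write_trylock(lock) )
    {
        ASSERT(rw_is_write_locked(lock));
        /* ... exclusive update ... */
        write_unlock(lock);
    }
    else
    {
        read_lock(lock);
        /* ... shared, read-only work ... */
        read_unlock(lock);
    }
}
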
diff --git a/xen/include/xen/tmem.h b/xen/include/xen/tmem.h
new file mode 100644
index 0000000000..ff009b6cf0
--- /dev/null
+++ b/xen/include/xen/tmem.h
@@ -0,0 +1,16 @@
+/******************************************************************************
+ * tmem.h
+ *
+ * Transcendent memory
+ *
+ * Copyright (c) 2008, Dan Magenheimer, Oracle Corp.
+ */
+
+#ifndef __XEN_TMEM_H__
+#define __XEN_TMEM_H__
+
+extern void init_tmem(void);
+extern void tmem_destroy(void *);
+extern void *tmem_relinquish_pages(unsigned int, unsigned int);
+
+#endif /* __XEN_TMEM_H__ */
diff --git a/xen/include/xen/tmem_xen.h b/xen/include/xen/tmem_xen.h
new file mode 100644
index 0000000000..8d653c27df
--- /dev/null
+++ b/xen/include/xen/tmem_xen.h
@@ -0,0 +1,356 @@
+/******************************************************************************
+ * tmem_xen.h
+ *
+ * Xen-specific Transcendent memory
+ *
+ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
+ */
+
+#ifndef __XEN_TMEM_XEN_H__
+#define __XEN_TMEM_XEN_H__
+
+#include <xen/config.h>
+#include <xen/mm.h> /* heap alloc/free */
+#include <xen/xmalloc.h> /* xmalloc/xfree */
+#include <xen/sched.h> /* struct domain */
+#include <xen/guest_access.h> /* copy_from_guest */
+#include <xen/hash.h> /* hash_long */
+#include <public/tmem.h>
+
+struct tmem_host_dependent_client {
+ struct domain *domain;
+ struct xmem_pool *persistent_pool;
+};
+typedef struct tmem_host_dependent_client tmh_client_t;
+
+#define IS_PAGE_ALIGNED(addr) \
+ ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr)
+#define IS_VALID_PAGE(_pi) ( mfn_valid(page_to_mfn(_pi)) )
+
+extern struct xmem_pool *tmh_mempool;
+extern unsigned int tmh_mempool_maxalloc;
+extern struct page_list_head tmh_page_list;
+extern spinlock_t tmh_page_list_lock;
+extern unsigned long tmh_page_list_pages;
+
+extern spinlock_t tmem_lock;
+extern spinlock_t tmem_spinlock;
+extern rwlock_t tmem_rwlock;
+
+extern void tmh_copy_page(char *to, char *from);
+extern int tmh_init(void);
+extern tmh_client_t *tmh_client_init(void);
+extern void tmh_client_destroy(tmh_client_t *);
+#define tmh_hash hash_long
+
+extern void tmh_release_avail_pages_to_host(void);
+extern void tmh_scrub_page(struct page_info *pi, unsigned int memflags);
+
+extern int opt_tmem_compress;
+static inline int tmh_compression_enabled(void)
+{
+ return opt_tmem_compress;
+}
+
+extern int opt_tmem;
+static inline int tmh_enabled(void)
+{
+ return opt_tmem;
+}
+
+extern int opt_tmem_lock;
+
+extern int opt_tmem_flush_dups;
+
+/*
+ * Memory free page list management
+ */
+
+static inline struct page_info *tmh_page_list_get(void)
+{
+ struct page_info *pi;
+
+ spin_lock(&tmh_page_list_lock);
+ if ( (pi = page_list_remove_head(&tmh_page_list)) != NULL )
+ tmh_page_list_pages--;
+ spin_unlock(&tmh_page_list_lock);
+ ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
+ return pi;
+}
+
+static inline void tmh_page_list_put(struct page_info *pi)
+{
+ ASSERT(IS_VALID_PAGE(pi));
+ spin_lock(&tmh_page_list_lock);
+ page_list_add(pi, &tmh_page_list);
+ tmh_page_list_pages++;
+ spin_unlock(&tmh_page_list_lock);
+}
+
+static inline unsigned long tmh_avail_pages(void)
+{
+ return tmh_page_list_pages;
+}
+
+/*
+ * Memory allocation for persistent (per-domain) data
+ */
+
+static inline bool_t domain_fully_allocated(struct domain *d)
+{
+ return ( d->tot_pages >= d->max_pages );
+}
+#define tmh_client_memory_fully_allocated(_pool) \
+ domain_fully_allocated(_pool->client->tmh->domain)
+
+static inline void *_tmh_alloc_subpage_thispool(struct xmem_pool *cmem_mempool,
+ size_t size, size_t align)
+{
+#if 0
+ if ( d->tot_pages >= d->max_pages )
+ return NULL;
+#endif
+#ifdef __i386__
+ return _xmalloc(size,align);
+#else
+ ASSERT( size < tmh_mempool_maxalloc );
+ if ( cmem_mempool == NULL )
+ return NULL;
+ return xmem_pool_alloc(size, cmem_mempool);
+#endif
+}
+#define tmh_alloc_subpage_thispool(_pool, _s, _a) \
+ _tmh_alloc_subpage_thispool(_pool->client->tmh->persistent_pool, \
+ _s, _a)
+
+static inline void _tmh_free_subpage_thispool(struct xmem_pool *cmem_mempool,
+ void *ptr, size_t size)
+{
+#ifdef __i386__
+ xfree(ptr);
+#else
+ ASSERT( size < tmh_mempool_maxalloc );
+ ASSERT( cmem_mempool != NULL );
+ xmem_pool_free(ptr,cmem_mempool);
+#endif
+}
+#define tmh_free_subpage_thispool(_pool, _p, _s) \
+ _tmh_free_subpage_thispool(_pool->client->tmh->persistent_pool, _p, _s)
+
+static inline struct page_info *_tmh_alloc_page_thispool(struct domain *d)
+{
+ struct page_info *pi;
+
+ /* note that this tot_pages check is not protected by d->page_alloc_lock,
+ * so it may race and periodically fail in donate_page or alloc_domheap_pages.
+ * That's OK: neither is a problem, though it is chatty if log_lvl is set */
+ if ( d->tot_pages >= d->max_pages )
+ return NULL;
+
+ if ( tmh_page_list_pages )
+ {
+ if ( (pi = tmh_page_list_get()) != NULL )
+ {
+ if ( donate_page(d,pi,0) == 0 )
+ goto out;
+ else
+ tmh_page_list_put(pi);
+ }
+ }
+
+ pi = alloc_domheap_pages(d,0,MEMF_tmem);
+
+out:
+ ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
+ return pi;
+}
+#define tmh_alloc_page_thispool(_pool) \
+ _tmh_alloc_page_thispool(_pool->client->tmh->domain)
+
+static inline void _tmh_free_page_thispool(struct page_info *pi)
+{
+ struct domain *d = page_get_owner(pi);
+
+ ASSERT(IS_VALID_PAGE(pi));
+ if ( (d == NULL) || steal_page(d,pi,0) == 0 )
+ tmh_page_list_put(pi);
+ else
+ {
+ scrub_one_page(pi);
+ ASSERT((pi->count_info & ~(PGC_allocated | 1)) == 0);
+ free_domheap_pages(pi,0);
+ }
+}
+#define tmh_free_page_thispool(_pool,_pg) \
+ _tmh_free_page_thispool(_pg)
+
+/*
+ * Memory allocation for ephemeral (non-persistent) data
+ */
+
+static inline void *tmh_alloc_subpage(void *pool, size_t size,
+ size_t align)
+{
+#ifdef __i386__
+ ASSERT( size < PAGE_SIZE );
+ return _xmalloc(size, align);
+#else
+ ASSERT( size < tmh_mempool_maxalloc );
+ ASSERT( tmh_mempool != NULL );
+ return xmem_pool_alloc(size, tmh_mempool);
+#endif
+}
+
+static inline void tmh_free_subpage(void *ptr, size_t size)
+{
+#ifdef __i386__
+ ASSERT( size < PAGE_SIZE );
+ xfree(ptr);
+#else
+ ASSERT( size < tmh_mempool_maxalloc );
+ xmem_pool_free(ptr,tmh_mempool);
+#endif
+}
+
+static inline struct page_info *tmh_alloc_page(void *pool, int no_heap)
+{
+ struct page_info *pi = tmh_page_list_get();
+
+ if ( pi == NULL && !no_heap )
+ pi = alloc_domheap_pages(0,0,MEMF_tmem);
+ ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
+ return pi;
+}
+
+static inline void tmh_free_page(struct page_info *pi)
+{
+ ASSERT(IS_VALID_PAGE(pi));
+ tmh_page_list_put(pi);
+}
+
+static inline unsigned int tmem_subpage_maxsize(void)
+{
+ return tmh_mempool_maxalloc;
+}
+
+#define tmh_lock_all opt_tmem_lock
+#define tmh_flush_dups opt_tmem_flush_dups
+#define tmh_called_from_tmem(_memflags) (_memflags & MEMF_tmem)
+
+/* "Client" (==domain) abstraction */
+
+struct client;
+typedef domid_t cli_id_t;
+typedef struct domain tmh_cli_ptr_t;
+typedef struct page_info pfp_t;
+
+/* this appears to be unreliable when a domain is being shut down */
+static inline struct client *tmh_client_from_cli_id(cli_id_t cli_id)
+{
+ struct domain *d = get_domain_by_id(cli_id);
+ if (d == NULL)
+ return NULL;
+ return (struct client *)(d->tmem);
+}
+
+static inline struct client *tmh_client_from_current(void)
+{
+ return (struct client *)(current->domain->tmem);
+}
+
+static inline cli_id_t tmh_get_cli_id_from_current(void)
+{
+ return current->domain->domain_id;
+}
+
+static inline tmh_cli_ptr_t *tmh_get_cli_ptr_from_current(void)
+{
+ return current->domain;
+}
+
+static inline void tmh_set_current_client(struct client *client)
+{
+ current->domain->tmem = client;
+}
+
+static inline bool_t tmh_current_is_privileged(void)
+{
+ return IS_PRIV(current->domain);
+}
+
+/* these typedefs are in the public/tmem.h interface
+typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
+typedef XEN_GUEST_HANDLE(char) cli_va_t;
+typedef XEN_GUEST_HANDLE(tmem_op_t) cli_tmemop_t;
+*/
+
+static inline int tmh_get_tmemop_from_client(tmem_op_t *op, tmem_cli_op_t uops)
+{
+ return __copy_from_guest(op, uops, 1);
+}
+
+static inline void tmh_copy_to_client_buf_offset(tmem_cli_va_t clibuf, int off,
+ char *tmembuf, int len)
+{
+ copy_to_guest_offset(clibuf,off,tmembuf,len);
+}
+
+#define TMH_CLI_ID_NULL ((cli_id_t)((domid_t)-1L))
+
+#define tmh_cli_id_str "domid"
+#define tmh_client_str "domain"
+
+extern int tmh_decompress_to_client(tmem_cli_mfn_t,void*,size_t);
+
+extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *);
+
+extern int tmh_copy_from_client(pfp_t *pfp,
+ tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
+ uint32_t pfn_offset, uint32_t len);
+
+extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
+ uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len);
+
+
+#define TMEM_PERF
+#ifdef TMEM_PERF
+#define DECL_CYC_COUNTER(x) \
+ uint64_t x##_sum_cycles = 0, x##_count = 0; \
+ uint32_t x##_min_cycles = 0x7fffffff, x##_max_cycles = 0;
+#define EXTERN_CYC_COUNTER(x) \
+ extern uint64_t x##_sum_cycles, x##_count; \
+ extern uint32_t x##_min_cycles, x##_max_cycles;
+#define DECL_LOCAL_CYC_COUNTER(x) \
+ int64_t x##_start = 0
+#define START_CYC_COUNTER(x) x##_start = get_cycles()
+#define DUP_START_CYC_COUNTER(x,y) x##_start = y##_start
+/* following might race, but since it's advisory only, don't care */
+#define END_CYC_COUNTER(x) \
+ do { \
+ x##_start = get_cycles() - x##_start; \
+ if (x##_start > 0 && x##_start < 1000000000) { \
+ x##_sum_cycles += x##_start; x##_count++; \
+ if ((uint32_t)x##_start < x##_min_cycles) x##_min_cycles = x##_start; \
+ if ((uint32_t)x##_start > x##_max_cycles) x##_max_cycles = x##_start; \
+ } \
+ } while (0)
+#define RESET_CYC_COUNTER(x) { x##_sum_cycles = 0, x##_count = 0; \
+ x##_min_cycles = 0x7fffffff, x##_max_cycles = 0; }
+#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) \
+ scnprintf(buf,size, \
+ tag"n:%"PRIu64","tag"t:%"PRIu64","tag"x:%"PRId32","tag"m:%"PRId32",", \
+ x##_count,x##_sum_cycles,x##_max_cycles,x##_min_cycles)
+#else
+#define DECL_CYC_COUNTER(x)
+#define EXTERN_CYC_COUNTER(x) \
+ extern uint64_t x##_sum_cycles, x##_count; \
+ extern uint32_t x##_min_cycles, x##_max_cycles;
+#define DECL_LOCAL_CYC_COUNTER(x) do { } while (0)
+#define START_CYC_COUNTER(x) do { } while (0)
+#define DUP_START_CYC_COUNTER(x,y) do { } while (0)
+#define END_CYC_COUNTER(x) do { } while (0)
+#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) (0)
+#define RESET_CYC_COUNTER(x) do { } while (0)
+#endif
+
+#endif /* __XEN_TMEM_XEN_H__ */
diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h
index e41cc36e74..a5188e8629 100644
--- a/xen/include/xen/xmalloc.h
+++ b/xen/include/xen/xmalloc.h
@@ -76,7 +76,13 @@ void xmem_pool_destroy(struct xmem_pool *pool);
void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool);
/**
- * xmem_pool_free - free memory from given pool
+ * xmem_pool_maxalloc - largest size that can be allocated from the given
+ * pool; larger xmem_pool_alloc() requests will fail
+ * @mem_pool: pool
+ */
+int xmem_pool_maxalloc(struct xmem_pool *pool);
+
+/**
+ * xmem_pool_free - free memory from given pool
* @ptr: address of memory to be freed
* @mem_pool: pool to free from
*/
diff --git a/xen/include/xlat.lst b/xen/include/xlat.lst
index f2e4597648..0dfd7c75e1 100644
--- a/xen/include/xlat.lst
+++ b/xen/include/xlat.lst
@@ -74,3 +74,6 @@
? processor_px platform.h
! psd_package platform.h
! processor_performance platform.h
+# ? tmem_op_t tmem.h
+# ? tmem_cli_mfn_t tmem.h
+# ? tmem_cli_va_t tmem.h